diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 00000000..e69de29b diff --git a/404.html b/404.html new file mode 100644 index 00000000..f6eae60a --- /dev/null +++ b/404.html @@ -0,0 +1,184 @@ + + + + + + + + Unofficial LoongArch Intrinsics Guide + + + + + + + + + + + + +
+ + +
+ +
+
+
    +
  • +
  • +
  • +
+
+
+
+
+ + +

404

+ +

Page not found

+ + +
+
+ +
+
+ +
+ +
+ +
+ + + + + +
+ + + + + + + + diff --git a/css/fonts/Roboto-Slab-Bold.woff b/css/fonts/Roboto-Slab-Bold.woff new file mode 100644 index 00000000..6cb60000 Binary files /dev/null and b/css/fonts/Roboto-Slab-Bold.woff differ diff --git a/css/fonts/Roboto-Slab-Bold.woff2 b/css/fonts/Roboto-Slab-Bold.woff2 new file mode 100644 index 00000000..7059e231 Binary files /dev/null and b/css/fonts/Roboto-Slab-Bold.woff2 differ diff --git a/css/fonts/Roboto-Slab-Regular.woff b/css/fonts/Roboto-Slab-Regular.woff new file mode 100644 index 00000000..f815f63f Binary files /dev/null and b/css/fonts/Roboto-Slab-Regular.woff differ diff --git a/css/fonts/Roboto-Slab-Regular.woff2 b/css/fonts/Roboto-Slab-Regular.woff2 new file mode 100644 index 00000000..f2c76e5b Binary files /dev/null and b/css/fonts/Roboto-Slab-Regular.woff2 differ diff --git a/css/fonts/fontawesome-webfont.eot b/css/fonts/fontawesome-webfont.eot new file mode 100644 index 00000000..e9f60ca9 Binary files /dev/null and b/css/fonts/fontawesome-webfont.eot differ diff --git a/css/fonts/fontawesome-webfont.svg b/css/fonts/fontawesome-webfont.svg new file mode 100644 index 00000000..855c845e --- /dev/null +++ b/css/fonts/fontawesome-webfont.svg @@ -0,0 +1,2671 @@ + + + + +Created by FontForge 20120731 at Mon Oct 24 17:37:40 2016 + By ,,, +Copyright Dave Gandy 2016. All rights reserved. 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/css/fonts/fontawesome-webfont.ttf b/css/fonts/fontawesome-webfont.ttf new file mode 100644 index 00000000..35acda2f Binary files /dev/null and b/css/fonts/fontawesome-webfont.ttf differ diff --git a/css/fonts/fontawesome-webfont.woff b/css/fonts/fontawesome-webfont.woff new file mode 100644 index 00000000..400014a4 Binary files /dev/null and b/css/fonts/fontawesome-webfont.woff differ diff --git a/css/fonts/fontawesome-webfont.woff2 b/css/fonts/fontawesome-webfont.woff2 new file mode 100644 index 00000000..4d13fc60 Binary files /dev/null and 
b/css/fonts/fontawesome-webfont.woff2 differ diff --git a/css/fonts/lato-bold-italic.woff b/css/fonts/lato-bold-italic.woff new file mode 100644 index 00000000..88ad05b9 Binary files /dev/null and b/css/fonts/lato-bold-italic.woff differ diff --git a/css/fonts/lato-bold-italic.woff2 b/css/fonts/lato-bold-italic.woff2 new file mode 100644 index 00000000..c4e3d804 Binary files /dev/null and b/css/fonts/lato-bold-italic.woff2 differ diff --git a/css/fonts/lato-bold.woff b/css/fonts/lato-bold.woff new file mode 100644 index 00000000..c6dff51f Binary files /dev/null and b/css/fonts/lato-bold.woff differ diff --git a/css/fonts/lato-bold.woff2 b/css/fonts/lato-bold.woff2 new file mode 100644 index 00000000..bb195043 Binary files /dev/null and b/css/fonts/lato-bold.woff2 differ diff --git a/css/fonts/lato-normal-italic.woff b/css/fonts/lato-normal-italic.woff new file mode 100644 index 00000000..76114bc0 Binary files /dev/null and b/css/fonts/lato-normal-italic.woff differ diff --git a/css/fonts/lato-normal-italic.woff2 b/css/fonts/lato-normal-italic.woff2 new file mode 100644 index 00000000..3404f37e Binary files /dev/null and b/css/fonts/lato-normal-italic.woff2 differ diff --git a/css/fonts/lato-normal.woff b/css/fonts/lato-normal.woff new file mode 100644 index 00000000..ae1307ff Binary files /dev/null and b/css/fonts/lato-normal.woff differ diff --git a/css/fonts/lato-normal.woff2 b/css/fonts/lato-normal.woff2 new file mode 100644 index 00000000..3bf98433 Binary files /dev/null and b/css/fonts/lato-normal.woff2 differ diff --git a/css/theme.css b/css/theme.css new file mode 100644 index 00000000..ad773009 --- /dev/null +++ b/css/theme.css @@ -0,0 +1,13 @@ +/* + * This file is copied from the upstream ReadTheDocs Sphinx + * theme. To aid upgradability this file should *not* be edited. + * modifications we need should be included in theme_extra.css. 
+ * + * https://github.com/readthedocs/sphinx_rtd_theme + */ + + /* sphinx_rtd_theme version 1.2.0 | MIT license */ +html{box-sizing:border-box}*,:after,:before{box-sizing:inherit}article,aside,details,figcaption,figure,footer,header,hgroup,nav,section{display:block}audio,canvas,video{display:inline-block;*display:inline;*zoom:1}[hidden],audio:not([controls]){display:none}*{-webkit-box-sizing:border-box;-moz-box-sizing:border-box;box-sizing:border-box}html{font-size:100%;-webkit-text-size-adjust:100%;-ms-text-size-adjust:100%}body{margin:0}a:active,a:hover{outline:0}abbr[title]{border-bottom:1px dotted}b,strong{font-weight:700}blockquote{margin:0}dfn{font-style:italic}ins{background:#ff9;text-decoration:none}ins,mark{color:#000}mark{background:#ff0;font-style:italic;font-weight:700}.rst-content code,.rst-content tt,code,kbd,pre,samp{font-family:monospace,serif;_font-family:courier new,monospace;font-size:1em}pre{white-space:pre}q{quotes:none}q:after,q:before{content:"";content:none}small{font-size:85%}sub,sup{font-size:75%;line-height:0;position:relative;vertical-align:baseline}sup{top:-.5em}sub{bottom:-.25em}dl,ol,ul{margin:0;padding:0;list-style:none;list-style-image:none}li{list-style:none}dd{margin:0}img{border:0;-ms-interpolation-mode:bicubic;vertical-align:middle;max-width:100%}svg:not(:root){overflow:hidden}figure,form{margin:0}label{cursor:pointer}button,input,select,textarea{font-size:100%;margin:0;vertical-align:baseline;*vertical-align:middle}button,input{line-height:normal}button,input[type=button],input[type=reset],input[type=submit]{cursor:pointer;-webkit-appearance:button;*overflow:visible}button[disabled],input[disabled]{cursor:default}input[type=search]{-webkit-appearance:textfield;-moz-box-sizing:content-box;-webkit-box-sizing:content-box;box-sizing:content-box}textarea{resize:vertical}table{border-collapse:collapse;border-spacing:0}td{vertical-align:top}.chromeframe{margin:.2em 0;background:#ccc;color:#000;padding:.2em 
0}.ir{display:block;border:0;text-indent:-999em;overflow:hidden;background-color:transparent;background-repeat:no-repeat;text-align:left;direction:ltr;*line-height:0}.ir br{display:none}.hidden{display:none!important;visibility:hidden}.visuallyhidden{border:0;clip:rect(0 0 0 0);height:1px;margin:-1px;overflow:hidden;padding:0;position:absolute;width:1px}.visuallyhidden.focusable:active,.visuallyhidden.focusable:focus{clip:auto;height:auto;margin:0;overflow:visible;position:static;width:auto}.invisible{visibility:hidden}.relative{position:relative}big,small{font-size:100%}@media print{body,html,section{background:none!important}*{box-shadow:none!important;text-shadow:none!important;filter:none!important;-ms-filter:none!important}a,a:visited{text-decoration:underline}.ir a:after,a[href^="#"]:after,a[href^="javascript:"]:after{content:""}blockquote,pre{page-break-inside:avoid}thead{display:table-header-group}img,tr{page-break-inside:avoid}img{max-width:100%!important}@page{margin:.5cm}.rst-content .toctree-wrapper>p.caption,h2,h3,p{orphans:3;widows:3}.rst-content .toctree-wrapper>p.caption,h2,h3{page-break-after:avoid}}.btn,.fa:before,.icon:before,.rst-content .admonition,.rst-content .admonition-title:before,.rst-content .admonition-todo,.rst-content .attention,.rst-content .caution,.rst-content .code-block-caption .headerlink:before,.rst-content .danger,.rst-content .eqno .headerlink:before,.rst-content .error,.rst-content .hint,.rst-content .important,.rst-content .note,.rst-content .seealso,.rst-content .tip,.rst-content .warning,.rst-content code.download span:first-child:before,.rst-content dl dt .headerlink:before,.rst-content h1 .headerlink:before,.rst-content h2 .headerlink:before,.rst-content h3 .headerlink:before,.rst-content h4 .headerlink:before,.rst-content h5 .headerlink:before,.rst-content h6 .headerlink:before,.rst-content p.caption .headerlink:before,.rst-content p .headerlink:before,.rst-content table>caption .headerlink:before,.rst-content 
tt.download span:first-child:before,.wy-alert,.wy-dropdown .caret:before,.wy-inline-validate.wy-inline-validate-danger .wy-input-context:before,.wy-inline-validate.wy-inline-validate-info .wy-input-context:before,.wy-inline-validate.wy-inline-validate-success .wy-input-context:before,.wy-inline-validate.wy-inline-validate-warning .wy-input-context:before,.wy-menu-vertical li.current>a button.toctree-expand:before,.wy-menu-vertical li.on a button.toctree-expand:before,.wy-menu-vertical li button.toctree-expand:before,input[type=color],input[type=date],input[type=datetime-local],input[type=datetime],input[type=email],input[type=month],input[type=number],input[type=password],input[type=search],input[type=tel],input[type=text],input[type=time],input[type=url],input[type=week],select,textarea{-webkit-font-smoothing:antialiased}.clearfix{*zoom:1}.clearfix:after,.clearfix:before{display:table;content:""}.clearfix:after{clear:both}/*! + * Font Awesome 4.7.0 by @davegandy - http://fontawesome.io - @fontawesome + * License - http://fontawesome.io/license (Font: SIL OFL 1.1, CSS: MIT License) + */@font-face{font-family:FontAwesome;src:url(fonts/fontawesome-webfont.eot?674f50d287a8c48dc19ba404d20fe713);src:url(fonts/fontawesome-webfont.eot?674f50d287a8c48dc19ba404d20fe713?#iefix&v=4.7.0) format("embedded-opentype"),url(fonts/fontawesome-webfont.woff2?af7ae505a9eed503f8b8e6982036873e) format("woff2"),url(fonts/fontawesome-webfont.woff?fee66e712a8a08eef5805a46892932ad) format("woff"),url(fonts/fontawesome-webfont.ttf?b06871f281fee6b241d60582ae9369b9) format("truetype"),url(fonts/fontawesome-webfont.svg?912ec66d7572ff821749319396470bde#fontawesomeregular) format("svg");font-weight:400;font-style:normal}.fa,.icon,.rst-content .admonition-title,.rst-content .code-block-caption .headerlink,.rst-content .eqno .headerlink,.rst-content code.download span:first-child,.rst-content dl dt .headerlink,.rst-content h1 .headerlink,.rst-content h2 .headerlink,.rst-content h3 
.headerlink,.rst-content h4 .headerlink,.rst-content h5 .headerlink,.rst-content h6 .headerlink,.rst-content p.caption .headerlink,.rst-content p .headerlink,.rst-content table>caption .headerlink,.rst-content tt.download span:first-child,.wy-menu-vertical li.current>a button.toctree-expand,.wy-menu-vertical li.on a button.toctree-expand,.wy-menu-vertical li button.toctree-expand{display:inline-block;font:normal normal normal 14px/1 FontAwesome;font-size:inherit;text-rendering:auto;-webkit-font-smoothing:antialiased;-moz-osx-font-smoothing:grayscale}.fa-lg{font-size:1.33333em;line-height:.75em;vertical-align:-15%}.fa-2x{font-size:2em}.fa-3x{font-size:3em}.fa-4x{font-size:4em}.fa-5x{font-size:5em}.fa-fw{width:1.28571em;text-align:center}.fa-ul{padding-left:0;margin-left:2.14286em;list-style-type:none}.fa-ul>li{position:relative}.fa-li{position:absolute;left:-2.14286em;width:2.14286em;top:.14286em;text-align:center}.fa-li.fa-lg{left:-1.85714em}.fa-border{padding:.2em .25em .15em;border:.08em solid #eee;border-radius:.1em}.fa-pull-left{float:left}.fa-pull-right{float:right}.fa-pull-left.icon,.fa.fa-pull-left,.rst-content .code-block-caption .fa-pull-left.headerlink,.rst-content .eqno .fa-pull-left.headerlink,.rst-content .fa-pull-left.admonition-title,.rst-content code.download span.fa-pull-left:first-child,.rst-content dl dt .fa-pull-left.headerlink,.rst-content h1 .fa-pull-left.headerlink,.rst-content h2 .fa-pull-left.headerlink,.rst-content h3 .fa-pull-left.headerlink,.rst-content h4 .fa-pull-left.headerlink,.rst-content h5 .fa-pull-left.headerlink,.rst-content h6 .fa-pull-left.headerlink,.rst-content p .fa-pull-left.headerlink,.rst-content table>caption .fa-pull-left.headerlink,.rst-content tt.download span.fa-pull-left:first-child,.wy-menu-vertical li.current>a button.fa-pull-left.toctree-expand,.wy-menu-vertical li.on a button.fa-pull-left.toctree-expand,.wy-menu-vertical li 
button.fa-pull-left.toctree-expand{margin-right:.3em}.fa-pull-right.icon,.fa.fa-pull-right,.rst-content .code-block-caption .fa-pull-right.headerlink,.rst-content .eqno .fa-pull-right.headerlink,.rst-content .fa-pull-right.admonition-title,.rst-content code.download span.fa-pull-right:first-child,.rst-content dl dt .fa-pull-right.headerlink,.rst-content h1 .fa-pull-right.headerlink,.rst-content h2 .fa-pull-right.headerlink,.rst-content h3 .fa-pull-right.headerlink,.rst-content h4 .fa-pull-right.headerlink,.rst-content h5 .fa-pull-right.headerlink,.rst-content h6 .fa-pull-right.headerlink,.rst-content p .fa-pull-right.headerlink,.rst-content table>caption .fa-pull-right.headerlink,.rst-content tt.download span.fa-pull-right:first-child,.wy-menu-vertical li.current>a button.fa-pull-right.toctree-expand,.wy-menu-vertical li.on a button.fa-pull-right.toctree-expand,.wy-menu-vertical li button.fa-pull-right.toctree-expand{margin-left:.3em}.pull-right{float:right}.pull-left{float:left}.fa.pull-left,.pull-left.icon,.rst-content .code-block-caption .pull-left.headerlink,.rst-content .eqno .pull-left.headerlink,.rst-content .pull-left.admonition-title,.rst-content code.download span.pull-left:first-child,.rst-content dl dt .pull-left.headerlink,.rst-content h1 .pull-left.headerlink,.rst-content h2 .pull-left.headerlink,.rst-content h3 .pull-left.headerlink,.rst-content h4 .pull-left.headerlink,.rst-content h5 .pull-left.headerlink,.rst-content h6 .pull-left.headerlink,.rst-content p .pull-left.headerlink,.rst-content table>caption .pull-left.headerlink,.rst-content tt.download span.pull-left:first-child,.wy-menu-vertical li.current>a button.pull-left.toctree-expand,.wy-menu-vertical li.on a button.pull-left.toctree-expand,.wy-menu-vertical li button.pull-left.toctree-expand{margin-right:.3em}.fa.pull-right,.pull-right.icon,.rst-content .code-block-caption .pull-right.headerlink,.rst-content .eqno .pull-right.headerlink,.rst-content .pull-right.admonition-title,.rst-content 
code.download span.pull-right:first-child,.rst-content dl dt .pull-right.headerlink,.rst-content h1 .pull-right.headerlink,.rst-content h2 .pull-right.headerlink,.rst-content h3 .pull-right.headerlink,.rst-content h4 .pull-right.headerlink,.rst-content h5 .pull-right.headerlink,.rst-content h6 .pull-right.headerlink,.rst-content p .pull-right.headerlink,.rst-content table>caption .pull-right.headerlink,.rst-content tt.download span.pull-right:first-child,.wy-menu-vertical li.current>a button.pull-right.toctree-expand,.wy-menu-vertical li.on a button.pull-right.toctree-expand,.wy-menu-vertical li button.pull-right.toctree-expand{margin-left:.3em}.fa-spin{-webkit-animation:fa-spin 2s linear infinite;animation:fa-spin 2s linear infinite}.fa-pulse{-webkit-animation:fa-spin 1s steps(8) infinite;animation:fa-spin 1s steps(8) infinite}@-webkit-keyframes fa-spin{0%{-webkit-transform:rotate(0deg);transform:rotate(0deg)}to{-webkit-transform:rotate(359deg);transform:rotate(359deg)}}@keyframes fa-spin{0%{-webkit-transform:rotate(0deg);transform:rotate(0deg)}to{-webkit-transform:rotate(359deg);transform:rotate(359deg)}}.fa-rotate-90{-ms-filter:"progid:DXImageTransform.Microsoft.BasicImage(rotation=1)";-webkit-transform:rotate(90deg);-ms-transform:rotate(90deg);transform:rotate(90deg)}.fa-rotate-180{-ms-filter:"progid:DXImageTransform.Microsoft.BasicImage(rotation=2)";-webkit-transform:rotate(180deg);-ms-transform:rotate(180deg);transform:rotate(180deg)}.fa-rotate-270{-ms-filter:"progid:DXImageTransform.Microsoft.BasicImage(rotation=3)";-webkit-transform:rotate(270deg);-ms-transform:rotate(270deg);transform:rotate(270deg)}.fa-flip-horizontal{-ms-filter:"progid:DXImageTransform.Microsoft.BasicImage(rotation=0, mirror=1)";-webkit-transform:scaleX(-1);-ms-transform:scaleX(-1);transform:scaleX(-1)}.fa-flip-vertical{-ms-filter:"progid:DXImageTransform.Microsoft.BasicImage(rotation=2, mirror=1)";-webkit-transform:scaleY(-1);-ms-transform:scaleY(-1);transform:scaleY(-1)}:root 
.fa-flip-horizontal,:root .fa-flip-vertical,:root .fa-rotate-90,:root .fa-rotate-180,:root .fa-rotate-270{filter:none}.fa-stack{position:relative;display:inline-block;width:2em;height:2em;line-height:2em;vertical-align:middle}.fa-stack-1x,.fa-stack-2x{position:absolute;left:0;width:100%;text-align:center}.fa-stack-1x{line-height:inherit}.fa-stack-2x{font-size:2em}.fa-inverse{color:#fff}.fa-glass:before{content:""}.fa-music:before{content:""}.fa-search:before,.icon-search:before{content:""}.fa-envelope-o:before{content:""}.fa-heart:before{content:""}.fa-star:before{content:""}.fa-star-o:before{content:""}.fa-user:before{content:""}.fa-film:before{content:""}.fa-th-large:before{content:""}.fa-th:before{content:""}.fa-th-list:before{content:""}.fa-check:before{content:""}.fa-close:before,.fa-remove:before,.fa-times:before{content:""}.fa-search-plus:before{content:""}.fa-search-minus:before{content:""}.fa-power-off:before{content:""}.fa-signal:before{content:""}.fa-cog:before,.fa-gear:before{content:""}.fa-trash-o:before{content:""}.fa-home:before,.icon-home:before{content:""}.fa-file-o:before{content:""}.fa-clock-o:before{content:""}.fa-road:before{content:""}.fa-download:before,.rst-content code.download span:first-child:before,.rst-content tt.download 
span:first-child:before{content:""}.fa-arrow-circle-o-down:before{content:""}.fa-arrow-circle-o-up:before{content:""}.fa-inbox:before{content:""}.fa-play-circle-o:before{content:""}.fa-repeat:before,.fa-rotate-right:before{content:""}.fa-refresh:before{content:""}.fa-list-alt:before{content:""}.fa-lock:before{content:""}.fa-flag:before{content:""}.fa-headphones:before{content:""}.fa-volume-off:before{content:""}.fa-volume-down:before{content:""}.fa-volume-up:before{content:""}.fa-qrcode:before{content:""}.fa-barcode:before{content:""}.fa-tag:before{content:""}.fa-tags:before{content:""}.fa-book:before,.icon-book:before{content:""}.fa-bookmark:before{content:""}.fa-print:before{content:""}.fa-camera:before{content:""}.fa-font:before{content:""}.fa-bold:before{content:""}.fa-italic:before{content:""}.fa-text-height:before{content:""}.fa-text-width:before{content:""}.fa-align-left:before{content:""}.fa-align-center:before{content:""}.fa-align-right:before{content:""}.fa-align-justify:before{content:""}.fa-list:before{content:""}.fa-dedent:before,.fa-outdent:before{content:""}.fa-indent:before{content:""}.fa-video-camera:before{content:""}.fa-image:before,.fa-photo:before,.fa-picture-o:before{content:""}.fa-pencil:before{content:""}.fa-map-marker:before{content:""}.fa-adjust:before{content:""}.fa-tint:before{content:""}.fa-edit:before,.fa-pencil-square-o:before{content:""}.fa-share-square-o:before{content:""}.fa-check-square-o:before{content:""}.fa-arrows:before{content:""}.fa-step-backward:before{content:""}.fa-fast-backward:before{content:""}.fa-backward:before{content:""}.fa-play:before{content:""}.fa-pause:before{content:""}.fa-stop:before{content:""}.fa-forward:before{content:""}.fa-fast-forward:before{content:""}.fa-step-forward:before{content:""}.fa-eject:before{content:""}.fa-chevron-left:before{content:""}.fa-chevron-right:before{content:""}.fa-plus-circle:before{content:""}.fa-minus-circle:before{content
:""}.fa-times-circle:before,.wy-inline-validate.wy-inline-validate-danger .wy-input-context:before{content:""}.fa-check-circle:before,.wy-inline-validate.wy-inline-validate-success .wy-input-context:before{content:""}.fa-question-circle:before{content:""}.fa-info-circle:before{content:""}.fa-crosshairs:before{content:""}.fa-times-circle-o:before{content:""}.fa-check-circle-o:before{content:""}.fa-ban:before{content:""}.fa-arrow-left:before{content:""}.fa-arrow-right:before{content:""}.fa-arrow-up:before{content:""}.fa-arrow-down:before{content:""}.fa-mail-forward:before,.fa-share:before{content:""}.fa-expand:before{content:""}.fa-compress:before{content:""}.fa-plus:before{content:""}.fa-minus:before{content:""}.fa-asterisk:before{content:""}.fa-exclamation-circle:before,.rst-content .admonition-title:before,.wy-inline-validate.wy-inline-validate-info .wy-input-context:before,.wy-inline-validate.wy-inline-validate-warning .wy-input-context:before{content:""}.fa-gift:before{content:""}.fa-leaf:before{content:""}.fa-fire:before,.icon-fire:before{content:""}.fa-eye:before{content:""}.fa-eye-slash:before{content:""}.fa-exclamation-triangle:before,.fa-warning:before{content:""}.fa-plane:before{content:""}.fa-calendar:before{content:""}.fa-random:before{content:""}.fa-comment:before{content:""}.fa-magnet:before{content:""}.fa-chevron-up:before{content:""}.fa-chevron-down:before{content:""}.fa-retweet:before{content:""}.fa-shopping-cart:before{content:""}.fa-folder:before{content:""}.fa-folder-open:before{content:""}.fa-arrows-v:before{content:""}.fa-arrows-h:before{content:""}.fa-bar-chart-o:before,.fa-bar-chart:before{content:""}.fa-twitter-square:before{content:""}.fa-facebook-square:before{content:""}.fa-camera-retro:before{content:""}.fa-key:before{content:""}.fa-cogs:before,.fa-gears:before{content:""}.fa-comments:before{content:""}.fa-thumbs-o-up:before{content:""}.fa-thumbs-o-down:before{content:""}.fa-star-half:before
{content:""}.fa-heart-o:before{content:""}.fa-sign-out:before{content:""}.fa-linkedin-square:before{content:""}.fa-thumb-tack:before{content:""}.fa-external-link:before{content:""}.fa-sign-in:before{content:""}.fa-trophy:before{content:""}.fa-github-square:before{content:""}.fa-upload:before{content:""}.fa-lemon-o:before{content:""}.fa-phone:before{content:""}.fa-square-o:before{content:""}.fa-bookmark-o:before{content:""}.fa-phone-square:before{content:""}.fa-twitter:before{content:""}.fa-facebook-f:before,.fa-facebook:before{content:""}.fa-github:before,.icon-github:before{content:""}.fa-unlock:before{content:""}.fa-credit-card:before{content:""}.fa-feed:before,.fa-rss:before{content:""}.fa-hdd-o:before{content:""}.fa-bullhorn:before{content:""}.fa-bell:before{content:""}.fa-certificate:before{content:""}.fa-hand-o-right:before{content:""}.fa-hand-o-left:before{content:""}.fa-hand-o-up:before{content:""}.fa-hand-o-down:before{content:""}.fa-arrow-circle-left:before,.icon-circle-arrow-left:before{content:""}.fa-arrow-circle-right:before,.icon-circle-arrow-right:before{content:""}.fa-arrow-circle-up:before{content:""}.fa-arrow-circle-down:before{content:""}.fa-globe:before{content:""}.fa-wrench:before{content:""}.fa-tasks:before{content:""}.fa-filter:before{content:""}.fa-briefcase:before{content:""}.fa-arrows-alt:before{content:""}.fa-group:before,.fa-users:before{content:""}.fa-chain:before,.fa-link:before,.icon-link:before{content:""}.fa-cloud:before{content:""}.fa-flask:before{content:""}.fa-cut:before,.fa-scissors:before{content:""}.fa-copy:before,.fa-files-o:before{content:""}.fa-paperclip:before{content:""}.fa-floppy-o:before,.fa-save:before{content:""}.fa-square:before{content:""}.fa-bars:before,.fa-navicon:before,.fa-reorder:before{content:""}.fa-list-ul:before{content:""}.fa-list-ol:before{content:""}.fa-strikethrough:before{content:""}.fa-underline:before{content:""}.fa-table:before{content:""}.fa-magi
c:before{content:""}.fa-truck:before{content:""}.fa-pinterest:before{content:""}.fa-pinterest-square:before{content:""}.fa-google-plus-square:before{content:""}.fa-google-plus:before{content:""}.fa-money:before{content:""}.fa-caret-down:before,.icon-caret-down:before,.wy-dropdown .caret:before{content:""}.fa-caret-up:before{content:""}.fa-caret-left:before{content:""}.fa-caret-right:before{content:""}.fa-columns:before{content:""}.fa-sort:before,.fa-unsorted:before{content:""}.fa-sort-desc:before,.fa-sort-down:before{content:""}.fa-sort-asc:before,.fa-sort-up:before{content:""}.fa-envelope:before{content:""}.fa-linkedin:before{content:""}.fa-rotate-left:before,.fa-undo:before{content:""}.fa-gavel:before,.fa-legal:before{content:""}.fa-dashboard:before,.fa-tachometer:before{content:""}.fa-comment-o:before{content:""}.fa-comments-o:before{content:""}.fa-bolt:before,.fa-flash:before{content:""}.fa-sitemap:before{content:""}.fa-umbrella:before{content:""}.fa-clipboard:before,.fa-paste:before{content:""}.fa-lightbulb-o:before{content:""}.fa-exchange:before{content:""}.fa-cloud-download:before{content:""}.fa-cloud-upload:before{content:""}.fa-user-md:before{content:""}.fa-stethoscope:before{content:""}.fa-suitcase:before{content:""}.fa-bell-o:before{content:""}.fa-coffee:before{content:""}.fa-cutlery:before{content:""}.fa-file-text-o:before{content:""}.fa-building-o:before{content:""}.fa-hospital-o:before{content:""}.fa-ambulance:before{content:""}.fa-medkit:before{content:""}.fa-fighter-jet:before{content:""}.fa-beer:before{content:""}.fa-h-square:before{content:""}.fa-plus-square:before{content:""}.fa-angle-double-left:before{content:""}.fa-angle-double-right:before{content:""}.fa-angle-double-up:before{content:""}.fa-angle-double-down:before{content:""}.fa-angle-left:before{content:""}.fa-angle-right:before{content:""}.fa-angle-up:before{content:""}.fa-angle-down:before{content:""}.fa-desktop:before{content:""}.fa-l
aptop:before{content:""}.fa-tablet:before{content:""}.fa-mobile-phone:before,.fa-mobile:before{content:""}.fa-circle-o:before{content:""}.fa-quote-left:before{content:""}.fa-quote-right:before{content:""}.fa-spinner:before{content:""}.fa-circle:before{content:""}.fa-mail-reply:before,.fa-reply:before{content:""}.fa-github-alt:before{content:""}.fa-folder-o:before{content:""}.fa-folder-open-o:before{content:""}.fa-smile-o:before{content:""}.fa-frown-o:before{content:""}.fa-meh-o:before{content:""}.fa-gamepad:before{content:""}.fa-keyboard-o:before{content:""}.fa-flag-o:before{content:""}.fa-flag-checkered:before{content:""}.fa-terminal:before{content:""}.fa-code:before{content:""}.fa-mail-reply-all:before,.fa-reply-all:before{content:""}.fa-star-half-empty:before,.fa-star-half-full:before,.fa-star-half-o:before{content:""}.fa-location-arrow:before{content:""}.fa-crop:before{content:""}.fa-code-fork:before{content:""}.fa-chain-broken:before,.fa-unlink:before{content:""}.fa-question:before{content:""}.fa-info:before{content:""}.fa-exclamation:before{content:""}.fa-superscript:before{content:""}.fa-subscript:before{content:""}.fa-eraser:before{content:""}.fa-puzzle-piece:before{content:""}.fa-microphone:before{content:""}.fa-microphone-slash:before{content:""}.fa-shield:before{content:""}.fa-calendar-o:before{content:""}.fa-fire-extinguisher:before{content:""}.fa-rocket:before{content:""}.fa-maxcdn:before{content:""}.fa-chevron-circle-left:before{content:""}.fa-chevron-circle-right:before{content:""}.fa-chevron-circle-up:before{content:""}.fa-chevron-circle-down:before{content:""}.fa-html5:before{content:""}.fa-css3:before{content:""}.fa-anchor:before{content:""}.fa-unlock-alt:before{content:""}.fa-bullseye:before{content:""}.fa-ellipsis-h:before{content:""}.fa-ellipsis-v:before{content:""}.fa-rss-square:before{content:""}.fa-play-circle:before{content:""}.fa-ticket:before{content:""}.fa-minus-square:before{content:
""}.fa-minus-square-o:before,.wy-menu-vertical li.current>a button.toctree-expand:before,.wy-menu-vertical li.on a button.toctree-expand:before{content:""}.fa-level-up:before{content:""}.fa-level-down:before{content:""}.fa-check-square:before{content:""}.fa-pencil-square:before{content:""}.fa-external-link-square:before{content:""}.fa-share-square:before{content:""}.fa-compass:before{content:""}.fa-caret-square-o-down:before,.fa-toggle-down:before{content:""}.fa-caret-square-o-up:before,.fa-toggle-up:before{content:""}.fa-caret-square-o-right:before,.fa-toggle-right:before{content:""}.fa-eur:before,.fa-euro:before{content:""}.fa-gbp:before{content:""}.fa-dollar:before,.fa-usd:before{content:""}.fa-inr:before,.fa-rupee:before{content:""}.fa-cny:before,.fa-jpy:before,.fa-rmb:before,.fa-yen:before{content:""}.fa-rouble:before,.fa-rub:before,.fa-ruble:before{content:""}.fa-krw:before,.fa-won:before{content:""}.fa-bitcoin:before,.fa-btc:before{content:""}.fa-file:before{content:""}.fa-file-text:before{content:""}.fa-sort-alpha-asc:before{content:""}.fa-sort-alpha-desc:before{content:""}.fa-sort-amount-asc:before{content:""}.fa-sort-amount-desc:before{content:""}.fa-sort-numeric-asc:before{content:""}.fa-sort-numeric-desc:before{content:""}.fa-thumbs-up:before{content:""}.fa-thumbs-down:before{content:""}.fa-youtube-square:before{content:""}.fa-youtube:before{content:""}.fa-xing:before{content:""}.fa-xing-square:before{content:""}.fa-youtube-play:before{content:""}.fa-dropbox:before{content:""}.fa-stack-overflow:before{content:""}.fa-instagram:before{content:""}.fa-flickr:before{content:""}.fa-adn:before{content:""}.fa-bitbucket:before,.icon-bitbucket:before{content:""}.fa-bitbucket-square:before{content:""}.fa-tumblr:before{content:""}.fa-tumblr-square:before{content:""}.fa-long-arrow-down:before{content:""}.fa-long-arrow-up:before{content:""}.fa-long-arrow-left:before{content:""}.fa-long-arrow-right:before{content:""}.fa-a
pple:before{content:""}.fa-windows:before{content:""}.fa-android:before{content:""}.fa-linux:before{content:""}.fa-dribbble:before{content:""}.fa-skype:before{content:""}.fa-foursquare:before{content:""}.fa-trello:before{content:""}.fa-female:before{content:""}.fa-male:before{content:""}.fa-gittip:before,.fa-gratipay:before{content:""}.fa-sun-o:before{content:""}.fa-moon-o:before{content:""}.fa-archive:before{content:""}.fa-bug:before{content:""}.fa-vk:before{content:""}.fa-weibo:before{content:""}.fa-renren:before{content:""}.fa-pagelines:before{content:""}.fa-stack-exchange:before{content:""}.fa-arrow-circle-o-right:before{content:""}.fa-arrow-circle-o-left:before{content:""}.fa-caret-square-o-left:before,.fa-toggle-left:before{content:""}.fa-dot-circle-o:before{content:""}.fa-wheelchair:before{content:""}.fa-vimeo-square:before{content:""}.fa-try:before,.fa-turkish-lira:before{content:""}.fa-plus-square-o:before,.wy-menu-vertical li button.toctree-expand:before{content:""}.fa-space-shuttle:before{content:""}.fa-slack:before{content:""}.fa-envelope-square:before{content:""}.fa-wordpress:before{content:""}.fa-openid:before{content:""}.fa-bank:before,.fa-institution:before,.fa-university:before{content:""}.fa-graduation-cap:before,.fa-mortar-board:before{content:""}.fa-yahoo:before{content:""}.fa-google:before{content:""}.fa-reddit:before{content:""}.fa-reddit-square:before{content:""}.fa-stumbleupon-circle:before{content:""}.fa-stumbleupon:before{content:""}.fa-delicious:before{content:""}.fa-digg:before{content:""}.fa-pied-piper-pp:before{content:""}.fa-pied-piper-alt:before{content:""}.fa-drupal:before{content:""}.fa-joomla:before{content:""}.fa-language:before{content:""}.fa-fax:before{content:""}.fa-building:before{content:""}.fa-child:before{content:""}.fa-paw:before{content:""}.fa-spoon:before{content:""}.fa-cube:before{content:""}.fa-cubes:before{content:""}.fa-behance:before{content:""}.fa-behance-squa
re:before{content:""}.fa-steam:before{content:""}.fa-steam-square:before{content:""}.fa-recycle:before{content:""}.fa-automobile:before,.fa-car:before{content:""}.fa-cab:before,.fa-taxi:before{content:""}.fa-tree:before{content:""}.fa-spotify:before{content:""}.fa-deviantart:before{content:""}.fa-soundcloud:before{content:""}.fa-database:before{content:""}.fa-file-pdf-o:before{content:""}.fa-file-word-o:before{content:""}.fa-file-excel-o:before{content:""}.fa-file-powerpoint-o:before{content:""}.fa-file-image-o:before,.fa-file-photo-o:before,.fa-file-picture-o:before{content:""}.fa-file-archive-o:before,.fa-file-zip-o:before{content:""}.fa-file-audio-o:before,.fa-file-sound-o:before{content:""}.fa-file-movie-o:before,.fa-file-video-o:before{content:""}.fa-file-code-o:before{content:""}.fa-vine:before{content:""}.fa-codepen:before{content:""}.fa-jsfiddle:before{content:""}.fa-life-bouy:before,.fa-life-buoy:before,.fa-life-ring:before,.fa-life-saver:before,.fa-support:before{content:""}.fa-circle-o-notch:before{content:""}.fa-ra:before,.fa-rebel:before,.fa-resistance:before{content:""}.fa-empire:before,.fa-ge:before{content:""}.fa-git-square:before{content:""}.fa-git:before{content:""}.fa-hacker-news:before,.fa-y-combinator-square:before,.fa-yc-square:before{content:""}.fa-tencent-weibo:before{content:""}.fa-qq:before{content:""}.fa-wechat:before,.fa-weixin:before{content:""}.fa-paper-plane:before,.fa-send:before{content:""}.fa-paper-plane-o:before,.fa-send-o:before{content:""}.fa-history:before{content:""}.fa-circle-thin:before{content:""}.fa-header:before{content:""}.fa-paragraph:before{content:""}.fa-sliders:before{content:""}.fa-share-alt:before{content:""}.fa-share-alt-square:before{content:""}.fa-bomb:before{content:""}.fa-futbol-o:before,.fa-soccer-ball-o:before{content:""}.fa-tty:before{content:""}.fa-binoculars:before{content:""}.fa-plug:before{content:""}.fa-slideshare:before{content:""}.fa-twitch:before{conten
t:""}.fa-yelp:before{content:""}.fa-newspaper-o:before{content:""}.fa-wifi:before{content:""}.fa-calculator:before{content:""}.fa-paypal:before{content:""}.fa-google-wallet:before{content:""}.fa-cc-visa:before{content:""}.fa-cc-mastercard:before{content:""}.fa-cc-discover:before{content:""}.fa-cc-amex:before{content:""}.fa-cc-paypal:before{content:""}.fa-cc-stripe:before{content:""}.fa-bell-slash:before{content:""}.fa-bell-slash-o:before{content:""}.fa-trash:before{content:""}.fa-copyright:before{content:""}.fa-at:before{content:""}.fa-eyedropper:before{content:""}.fa-paint-brush:before{content:""}.fa-birthday-cake:before{content:""}.fa-area-chart:before{content:""}.fa-pie-chart:before{content:""}.fa-line-chart:before{content:""}.fa-lastfm:before{content:""}.fa-lastfm-square:before{content:""}.fa-toggle-off:before{content:""}.fa-toggle-on:before{content:""}.fa-bicycle:before{content:""}.fa-bus:before{content:""}.fa-ioxhost:before{content:""}.fa-angellist:before{content:""}.fa-cc:before{content:""}.fa-ils:before,.fa-shekel:before,.fa-sheqel:before{content:""}.fa-meanpath:before{content:""}.fa-buysellads:before{content:""}.fa-connectdevelop:before{content:""}.fa-dashcube:before{content:""}.fa-forumbee:before{content:""}.fa-leanpub:before{content:""}.fa-sellsy:before{content:""}.fa-shirtsinbulk:before{content:""}.fa-simplybuilt:before{content:""}.fa-skyatlas:before{content:""}.fa-cart-plus:before{content:""}.fa-cart-arrow-down:before{content:""}.fa-diamond:before{content:""}.fa-ship:before{content:""}.fa-user-secret:before{content:""}.fa-motorcycle:before{content:""}.fa-street-view:before{content:""}.fa-heartbeat:before{content:""}.fa-venus:before{content:""}.fa-mars:before{content:""}.fa-mercury:before{content:""}.fa-intersex:before,.fa-transgender:before{content:""}.fa-transgender-alt:before{content:""}.fa-venus-double:before{content:""}.fa-mars-double:before{content:""}.fa-venus-mars:before{content:""}.fa-m
ars-stroke:before{content:""}.fa-mars-stroke-v:before{content:""}.fa-mars-stroke-h:before{content:""}.fa-neuter:before{content:""}.fa-genderless:before{content:""}.fa-facebook-official:before{content:""}.fa-pinterest-p:before{content:""}.fa-whatsapp:before{content:""}.fa-server:before{content:""}.fa-user-plus:before{content:""}.fa-user-times:before{content:""}.fa-bed:before,.fa-hotel:before{content:""}.fa-viacoin:before{content:""}.fa-train:before{content:""}.fa-subway:before{content:""}.fa-medium:before{content:""}.fa-y-combinator:before,.fa-yc:before{content:""}.fa-optin-monster:before{content:""}.fa-opencart:before{content:""}.fa-expeditedssl:before{content:""}.fa-battery-4:before,.fa-battery-full:before,.fa-battery:before{content:""}.fa-battery-3:before,.fa-battery-three-quarters:before{content:""}.fa-battery-2:before,.fa-battery-half:before{content:""}.fa-battery-1:before,.fa-battery-quarter:before{content:""}.fa-battery-0:before,.fa-battery-empty:before{content:""}.fa-mouse-pointer:before{content:""}.fa-i-cursor:before{content:""}.fa-object-group:before{content:""}.fa-object-ungroup:before{content:""}.fa-sticky-note:before{content:""}.fa-sticky-note-o:before{content:""}.fa-cc-jcb:before{content:""}.fa-cc-diners-club:before{content:""}.fa-clone:before{content:""}.fa-balance-scale:before{content:""}.fa-hourglass-o:before{content:""}.fa-hourglass-1:before,.fa-hourglass-start:before{content:""}.fa-hourglass-2:before,.fa-hourglass-half:before{content:""}.fa-hourglass-3:before,.fa-hourglass-end:before{content:""}.fa-hourglass:before{content:""}.fa-hand-grab-o:before,.fa-hand-rock-o:before{content:""}.fa-hand-paper-o:before,.fa-hand-stop-o:before{content:""}.fa-hand-scissors-o:before{content:""}.fa-hand-lizard-o:before{content:""}.fa-hand-spock-o:before{content:""}.fa-hand-pointer-o:before{content:""}.fa-hand-peace-o:before{content:""}.fa-trademark:before{content:""}.fa-registered:before{content:""}.fa-creative-commons
:before{content:""}.fa-gg:before{content:""}.fa-gg-circle:before{content:""}.fa-tripadvisor:before{content:""}.fa-odnoklassniki:before{content:""}.fa-odnoklassniki-square:before{content:""}.fa-get-pocket:before{content:""}.fa-wikipedia-w:before{content:""}.fa-safari:before{content:""}.fa-chrome:before{content:""}.fa-firefox:before{content:""}.fa-opera:before{content:""}.fa-internet-explorer:before{content:""}.fa-television:before,.fa-tv:before{content:""}.fa-contao:before{content:""}.fa-500px:before{content:""}.fa-amazon:before{content:""}.fa-calendar-plus-o:before{content:""}.fa-calendar-minus-o:before{content:""}.fa-calendar-times-o:before{content:""}.fa-calendar-check-o:before{content:""}.fa-industry:before{content:""}.fa-map-pin:before{content:""}.fa-map-signs:before{content:""}.fa-map-o:before{content:""}.fa-map:before{content:""}.fa-commenting:before{content:""}.fa-commenting-o:before{content:""}.fa-houzz:before{content:""}.fa-vimeo:before{content:""}.fa-black-tie:before{content:""}.fa-fonticons:before{content:""}.fa-reddit-alien:before{content:""}.fa-edge:before{content:""}.fa-credit-card-alt:before{content:""}.fa-codiepie:before{content:""}.fa-modx:before{content:""}.fa-fort-awesome:before{content:""}.fa-usb:before{content:""}.fa-product-hunt:before{content:""}.fa-mixcloud:before{content:""}.fa-scribd:before{content:""}.fa-pause-circle:before{content:""}.fa-pause-circle-o:before{content:""}.fa-stop-circle:before{content:""}.fa-stop-circle-o:before{content:""}.fa-shopping-bag:before{content:""}.fa-shopping-basket:before{content:""}.fa-hashtag:before{content:""}.fa-bluetooth:before{content:""}.fa-bluetooth-b:before{content:""}.fa-percent:before{content:""}.fa-gitlab:before,.icon-gitlab:before{content:""}.fa-wpbeginner:before{content:""}.fa-wpforms:before{content:""}.fa-envira:before{content:""}.fa-universal-access:before{content:""}.fa-wheelchair-alt:before{content:""}.fa-question-circle-o:before{conten
t:""}.fa-blind:before{content:""}.fa-audio-description:before{content:""}.fa-volume-control-phone:before{content:""}.fa-braille:before{content:""}.fa-assistive-listening-systems:before{content:""}.fa-american-sign-language-interpreting:before,.fa-asl-interpreting:before{content:""}.fa-deaf:before,.fa-deafness:before,.fa-hard-of-hearing:before{content:""}.fa-glide:before{content:""}.fa-glide-g:before{content:""}.fa-sign-language:before,.fa-signing:before{content:""}.fa-low-vision:before{content:""}.fa-viadeo:before{content:""}.fa-viadeo-square:before{content:""}.fa-snapchat:before{content:""}.fa-snapchat-ghost:before{content:""}.fa-snapchat-square:before{content:""}.fa-pied-piper:before{content:""}.fa-first-order:before{content:""}.fa-yoast:before{content:""}.fa-themeisle:before{content:""}.fa-google-plus-circle:before,.fa-google-plus-official:before{content:""}.fa-fa:before,.fa-font-awesome:before{content:""}.fa-handshake-o:before{content:""}.fa-envelope-open:before{content:""}.fa-envelope-open-o:before{content:""}.fa-linode:before{content:""}.fa-address-book:before{content:""}.fa-address-book-o:before{content:""}.fa-address-card:before,.fa-vcard:before{content:""}.fa-address-card-o:before,.fa-vcard-o:before{content:""}.fa-user-circle:before{content:""}.fa-user-circle-o:before{content:""}.fa-user-o:before{content:""}.fa-id-badge:before{content:""}.fa-drivers-license:before,.fa-id-card:before{content:""}.fa-drivers-license-o:before,.fa-id-card-o:before{content:""}.fa-quora:before{content:""}.fa-free-code-camp:before{content:""}.fa-telegram:before{content:""}.fa-thermometer-4:before,.fa-thermometer-full:before,.fa-thermometer:before{content:""}.fa-thermometer-3:before,.fa-thermometer-three-quarters:before{content:""}.fa-thermometer-2:before,.fa-thermometer-half:before{content:""}.fa-thermometer-1:before,.fa-thermometer-quarter:before{content:""}.fa-thermometer-0:before,.fa-thermometer-empty:before{content:""}.fa-shower:befo
re{content:""}.fa-bath:before,.fa-bathtub:before,.fa-s15:before{content:""}.fa-podcast:before{content:""}.fa-window-maximize:before{content:""}.fa-window-minimize:before{content:""}.fa-window-restore:before{content:""}.fa-times-rectangle:before,.fa-window-close:before{content:""}.fa-times-rectangle-o:before,.fa-window-close-o:before{content:""}.fa-bandcamp:before{content:""}.fa-grav:before{content:""}.fa-etsy:before{content:""}.fa-imdb:before{content:""}.fa-ravelry:before{content:""}.fa-eercast:before{content:""}.fa-microchip:before{content:""}.fa-snowflake-o:before{content:""}.fa-superpowers:before{content:""}.fa-wpexplorer:before{content:""}.fa-meetup:before{content:""}.sr-only{position:absolute;width:1px;height:1px;padding:0;margin:-1px;overflow:hidden;clip:rect(0,0,0,0);border:0}.sr-only-focusable:active,.sr-only-focusable:focus{position:static;width:auto;height:auto;margin:0;overflow:visible;clip:auto}.fa,.icon,.rst-content .admonition-title,.rst-content .code-block-caption .headerlink,.rst-content .eqno .headerlink,.rst-content code.download span:first-child,.rst-content dl dt .headerlink,.rst-content h1 .headerlink,.rst-content h2 .headerlink,.rst-content h3 .headerlink,.rst-content h4 .headerlink,.rst-content h5 .headerlink,.rst-content h6 .headerlink,.rst-content p.caption .headerlink,.rst-content p .headerlink,.rst-content table>caption .headerlink,.rst-content tt.download span:first-child,.wy-dropdown .caret,.wy-inline-validate.wy-inline-validate-danger .wy-input-context,.wy-inline-validate.wy-inline-validate-info .wy-input-context,.wy-inline-validate.wy-inline-validate-success .wy-input-context,.wy-inline-validate.wy-inline-validate-warning .wy-input-context,.wy-menu-vertical li.current>a button.toctree-expand,.wy-menu-vertical li.on a button.toctree-expand,.wy-menu-vertical li button.toctree-expand{font-family:inherit}.fa:before,.icon:before,.rst-content .admonition-title:before,.rst-content .code-block-caption 
.headerlink:before,.rst-content .eqno .headerlink:before,.rst-content code.download span:first-child:before,.rst-content dl dt .headerlink:before,.rst-content h1 .headerlink:before,.rst-content h2 .headerlink:before,.rst-content h3 .headerlink:before,.rst-content h4 .headerlink:before,.rst-content h5 .headerlink:before,.rst-content h6 .headerlink:before,.rst-content p.caption .headerlink:before,.rst-content p .headerlink:before,.rst-content table>caption .headerlink:before,.rst-content tt.download span:first-child:before,.wy-dropdown .caret:before,.wy-inline-validate.wy-inline-validate-danger .wy-input-context:before,.wy-inline-validate.wy-inline-validate-info .wy-input-context:before,.wy-inline-validate.wy-inline-validate-success .wy-input-context:before,.wy-inline-validate.wy-inline-validate-warning .wy-input-context:before,.wy-menu-vertical li.current>a button.toctree-expand:before,.wy-menu-vertical li.on a button.toctree-expand:before,.wy-menu-vertical li button.toctree-expand:before{font-family:FontAwesome;display:inline-block;font-style:normal;font-weight:400;line-height:1;text-decoration:inherit}.rst-content .code-block-caption a .headerlink,.rst-content .eqno a .headerlink,.rst-content a .admonition-title,.rst-content code.download a span:first-child,.rst-content dl dt a .headerlink,.rst-content h1 a .headerlink,.rst-content h2 a .headerlink,.rst-content h3 a .headerlink,.rst-content h4 a .headerlink,.rst-content h5 a .headerlink,.rst-content h6 a .headerlink,.rst-content p.caption a .headerlink,.rst-content p a .headerlink,.rst-content table>caption a .headerlink,.rst-content tt.download a span:first-child,.wy-menu-vertical li.current>a button.toctree-expand,.wy-menu-vertical li.on a button.toctree-expand,.wy-menu-vertical li a button.toctree-expand,a .fa,a .icon,a .rst-content .admonition-title,a .rst-content .code-block-caption .headerlink,a .rst-content .eqno .headerlink,a .rst-content code.download span:first-child,a .rst-content dl dt .headerlink,a 
.rst-content h1 .headerlink,a .rst-content h2 .headerlink,a .rst-content h3 .headerlink,a .rst-content h4 .headerlink,a .rst-content h5 .headerlink,a .rst-content h6 .headerlink,a .rst-content p.caption .headerlink,a .rst-content p .headerlink,a .rst-content table>caption .headerlink,a .rst-content tt.download span:first-child,a .wy-menu-vertical li button.toctree-expand{display:inline-block;text-decoration:inherit}.btn .fa,.btn .icon,.btn .rst-content .admonition-title,.btn .rst-content .code-block-caption .headerlink,.btn .rst-content .eqno .headerlink,.btn .rst-content code.download span:first-child,.btn .rst-content dl dt .headerlink,.btn .rst-content h1 .headerlink,.btn .rst-content h2 .headerlink,.btn .rst-content h3 .headerlink,.btn .rst-content h4 .headerlink,.btn .rst-content h5 .headerlink,.btn .rst-content h6 .headerlink,.btn .rst-content p .headerlink,.btn .rst-content table>caption .headerlink,.btn .rst-content tt.download span:first-child,.btn .wy-menu-vertical li.current>a button.toctree-expand,.btn .wy-menu-vertical li.on a button.toctree-expand,.btn .wy-menu-vertical li button.toctree-expand,.nav .fa,.nav .icon,.nav .rst-content .admonition-title,.nav .rst-content .code-block-caption .headerlink,.nav .rst-content .eqno .headerlink,.nav .rst-content code.download span:first-child,.nav .rst-content dl dt .headerlink,.nav .rst-content h1 .headerlink,.nav .rst-content h2 .headerlink,.nav .rst-content h3 .headerlink,.nav .rst-content h4 .headerlink,.nav .rst-content h5 .headerlink,.nav .rst-content h6 .headerlink,.nav .rst-content p .headerlink,.nav .rst-content table>caption .headerlink,.nav .rst-content tt.download span:first-child,.nav .wy-menu-vertical li.current>a button.toctree-expand,.nav .wy-menu-vertical li.on a button.toctree-expand,.nav .wy-menu-vertical li button.toctree-expand,.rst-content .btn .admonition-title,.rst-content .code-block-caption .btn .headerlink,.rst-content .code-block-caption .nav .headerlink,.rst-content .eqno .btn 
.headerlink,.rst-content .eqno .nav .headerlink,.rst-content .nav .admonition-title,.rst-content code.download .btn span:first-child,.rst-content code.download .nav span:first-child,.rst-content dl dt .btn .headerlink,.rst-content dl dt .nav .headerlink,.rst-content h1 .btn .headerlink,.rst-content h1 .nav .headerlink,.rst-content h2 .btn .headerlink,.rst-content h2 .nav .headerlink,.rst-content h3 .btn .headerlink,.rst-content h3 .nav .headerlink,.rst-content h4 .btn .headerlink,.rst-content h4 .nav .headerlink,.rst-content h5 .btn .headerlink,.rst-content h5 .nav .headerlink,.rst-content h6 .btn .headerlink,.rst-content h6 .nav .headerlink,.rst-content p .btn .headerlink,.rst-content p .nav .headerlink,.rst-content table>caption .btn .headerlink,.rst-content table>caption .nav .headerlink,.rst-content tt.download .btn span:first-child,.rst-content tt.download .nav span:first-child,.wy-menu-vertical li .btn button.toctree-expand,.wy-menu-vertical li.current>a .btn button.toctree-expand,.wy-menu-vertical li.current>a .nav button.toctree-expand,.wy-menu-vertical li .nav button.toctree-expand,.wy-menu-vertical li.on a .btn button.toctree-expand,.wy-menu-vertical li.on a .nav button.toctree-expand{display:inline}.btn .fa-large.icon,.btn .fa.fa-large,.btn .rst-content .code-block-caption .fa-large.headerlink,.btn .rst-content .eqno .fa-large.headerlink,.btn .rst-content .fa-large.admonition-title,.btn .rst-content code.download span.fa-large:first-child,.btn .rst-content dl dt .fa-large.headerlink,.btn .rst-content h1 .fa-large.headerlink,.btn .rst-content h2 .fa-large.headerlink,.btn .rst-content h3 .fa-large.headerlink,.btn .rst-content h4 .fa-large.headerlink,.btn .rst-content h5 .fa-large.headerlink,.btn .rst-content h6 .fa-large.headerlink,.btn .rst-content p .fa-large.headerlink,.btn .rst-content table>caption .fa-large.headerlink,.btn .rst-content tt.download span.fa-large:first-child,.btn .wy-menu-vertical li button.fa-large.toctree-expand,.nav 
.fa-large.icon,.nav .fa.fa-large,.nav .rst-content .code-block-caption .fa-large.headerlink,.nav .rst-content .eqno .fa-large.headerlink,.nav .rst-content .fa-large.admonition-title,.nav .rst-content code.download span.fa-large:first-child,.nav .rst-content dl dt .fa-large.headerlink,.nav .rst-content h1 .fa-large.headerlink,.nav .rst-content h2 .fa-large.headerlink,.nav .rst-content h3 .fa-large.headerlink,.nav .rst-content h4 .fa-large.headerlink,.nav .rst-content h5 .fa-large.headerlink,.nav .rst-content h6 .fa-large.headerlink,.nav .rst-content p .fa-large.headerlink,.nav .rst-content table>caption .fa-large.headerlink,.nav .rst-content tt.download span.fa-large:first-child,.nav .wy-menu-vertical li button.fa-large.toctree-expand,.rst-content .btn .fa-large.admonition-title,.rst-content .code-block-caption .btn .fa-large.headerlink,.rst-content .code-block-caption .nav .fa-large.headerlink,.rst-content .eqno .btn .fa-large.headerlink,.rst-content .eqno .nav .fa-large.headerlink,.rst-content .nav .fa-large.admonition-title,.rst-content code.download .btn span.fa-large:first-child,.rst-content code.download .nav span.fa-large:first-child,.rst-content dl dt .btn .fa-large.headerlink,.rst-content dl dt .nav .fa-large.headerlink,.rst-content h1 .btn .fa-large.headerlink,.rst-content h1 .nav .fa-large.headerlink,.rst-content h2 .btn .fa-large.headerlink,.rst-content h2 .nav .fa-large.headerlink,.rst-content h3 .btn .fa-large.headerlink,.rst-content h3 .nav .fa-large.headerlink,.rst-content h4 .btn .fa-large.headerlink,.rst-content h4 .nav .fa-large.headerlink,.rst-content h5 .btn .fa-large.headerlink,.rst-content h5 .nav .fa-large.headerlink,.rst-content h6 .btn .fa-large.headerlink,.rst-content h6 .nav .fa-large.headerlink,.rst-content p .btn .fa-large.headerlink,.rst-content p .nav .fa-large.headerlink,.rst-content table>caption .btn .fa-large.headerlink,.rst-content table>caption .nav .fa-large.headerlink,.rst-content tt.download .btn 
span.fa-large:first-child,.rst-content tt.download .nav span.fa-large:first-child,.wy-menu-vertical li .btn button.fa-large.toctree-expand,.wy-menu-vertical li .nav button.fa-large.toctree-expand{line-height:.9em}.btn .fa-spin.icon,.btn .fa.fa-spin,.btn .rst-content .code-block-caption .fa-spin.headerlink,.btn .rst-content .eqno .fa-spin.headerlink,.btn .rst-content .fa-spin.admonition-title,.btn .rst-content code.download span.fa-spin:first-child,.btn .rst-content dl dt .fa-spin.headerlink,.btn .rst-content h1 .fa-spin.headerlink,.btn .rst-content h2 .fa-spin.headerlink,.btn .rst-content h3 .fa-spin.headerlink,.btn .rst-content h4 .fa-spin.headerlink,.btn .rst-content h5 .fa-spin.headerlink,.btn .rst-content h6 .fa-spin.headerlink,.btn .rst-content p .fa-spin.headerlink,.btn .rst-content table>caption .fa-spin.headerlink,.btn .rst-content tt.download span.fa-spin:first-child,.btn .wy-menu-vertical li button.fa-spin.toctree-expand,.nav .fa-spin.icon,.nav .fa.fa-spin,.nav .rst-content .code-block-caption .fa-spin.headerlink,.nav .rst-content .eqno .fa-spin.headerlink,.nav .rst-content .fa-spin.admonition-title,.nav .rst-content code.download span.fa-spin:first-child,.nav .rst-content dl dt .fa-spin.headerlink,.nav .rst-content h1 .fa-spin.headerlink,.nav .rst-content h2 .fa-spin.headerlink,.nav .rst-content h3 .fa-spin.headerlink,.nav .rst-content h4 .fa-spin.headerlink,.nav .rst-content h5 .fa-spin.headerlink,.nav .rst-content h6 .fa-spin.headerlink,.nav .rst-content p .fa-spin.headerlink,.nav .rst-content table>caption .fa-spin.headerlink,.nav .rst-content tt.download span.fa-spin:first-child,.nav .wy-menu-vertical li button.fa-spin.toctree-expand,.rst-content .btn .fa-spin.admonition-title,.rst-content .code-block-caption .btn .fa-spin.headerlink,.rst-content .code-block-caption .nav .fa-spin.headerlink,.rst-content .eqno .btn .fa-spin.headerlink,.rst-content .eqno .nav .fa-spin.headerlink,.rst-content .nav .fa-spin.admonition-title,.rst-content code.download 
.btn span.fa-spin:first-child,.rst-content code.download .nav span.fa-spin:first-child,.rst-content dl dt .btn .fa-spin.headerlink,.rst-content dl dt .nav .fa-spin.headerlink,.rst-content h1 .btn .fa-spin.headerlink,.rst-content h1 .nav .fa-spin.headerlink,.rst-content h2 .btn .fa-spin.headerlink,.rst-content h2 .nav .fa-spin.headerlink,.rst-content h3 .btn .fa-spin.headerlink,.rst-content h3 .nav .fa-spin.headerlink,.rst-content h4 .btn .fa-spin.headerlink,.rst-content h4 .nav .fa-spin.headerlink,.rst-content h5 .btn .fa-spin.headerlink,.rst-content h5 .nav .fa-spin.headerlink,.rst-content h6 .btn .fa-spin.headerlink,.rst-content h6 .nav .fa-spin.headerlink,.rst-content p .btn .fa-spin.headerlink,.rst-content p .nav .fa-spin.headerlink,.rst-content table>caption .btn .fa-spin.headerlink,.rst-content table>caption .nav .fa-spin.headerlink,.rst-content tt.download .btn span.fa-spin:first-child,.rst-content tt.download .nav span.fa-spin:first-child,.wy-menu-vertical li .btn button.fa-spin.toctree-expand,.wy-menu-vertical li .nav button.fa-spin.toctree-expand{display:inline-block}.btn.fa:before,.btn.icon:before,.rst-content .btn.admonition-title:before,.rst-content .code-block-caption .btn.headerlink:before,.rst-content .eqno .btn.headerlink:before,.rst-content code.download span.btn:first-child:before,.rst-content dl dt .btn.headerlink:before,.rst-content h1 .btn.headerlink:before,.rst-content h2 .btn.headerlink:before,.rst-content h3 .btn.headerlink:before,.rst-content h4 .btn.headerlink:before,.rst-content h5 .btn.headerlink:before,.rst-content h6 .btn.headerlink:before,.rst-content p .btn.headerlink:before,.rst-content table>caption .btn.headerlink:before,.rst-content tt.download span.btn:first-child:before,.wy-menu-vertical li button.btn.toctree-expand:before{opacity:.5;-webkit-transition:opacity .05s ease-in;-moz-transition:opacity .05s ease-in;transition:opacity .05s ease-in}.btn.fa:hover:before,.btn.icon:hover:before,.rst-content 
.btn.admonition-title:hover:before,.rst-content .code-block-caption .btn.headerlink:hover:before,.rst-content .eqno .btn.headerlink:hover:before,.rst-content code.download span.btn:first-child:hover:before,.rst-content dl dt .btn.headerlink:hover:before,.rst-content h1 .btn.headerlink:hover:before,.rst-content h2 .btn.headerlink:hover:before,.rst-content h3 .btn.headerlink:hover:before,.rst-content h4 .btn.headerlink:hover:before,.rst-content h5 .btn.headerlink:hover:before,.rst-content h6 .btn.headerlink:hover:before,.rst-content p .btn.headerlink:hover:before,.rst-content table>caption .btn.headerlink:hover:before,.rst-content tt.download span.btn:first-child:hover:before,.wy-menu-vertical li button.btn.toctree-expand:hover:before{opacity:1}.btn-mini .fa:before,.btn-mini .icon:before,.btn-mini .rst-content .admonition-title:before,.btn-mini .rst-content .code-block-caption .headerlink:before,.btn-mini .rst-content .eqno .headerlink:before,.btn-mini .rst-content code.download span:first-child:before,.btn-mini .rst-content dl dt .headerlink:before,.btn-mini .rst-content h1 .headerlink:before,.btn-mini .rst-content h2 .headerlink:before,.btn-mini .rst-content h3 .headerlink:before,.btn-mini .rst-content h4 .headerlink:before,.btn-mini .rst-content h5 .headerlink:before,.btn-mini .rst-content h6 .headerlink:before,.btn-mini .rst-content p .headerlink:before,.btn-mini .rst-content table>caption .headerlink:before,.btn-mini .rst-content tt.download span:first-child:before,.btn-mini .wy-menu-vertical li button.toctree-expand:before,.rst-content .btn-mini .admonition-title:before,.rst-content .code-block-caption .btn-mini .headerlink:before,.rst-content .eqno .btn-mini .headerlink:before,.rst-content code.download .btn-mini span:first-child:before,.rst-content dl dt .btn-mini .headerlink:before,.rst-content h1 .btn-mini .headerlink:before,.rst-content h2 .btn-mini .headerlink:before,.rst-content h3 .btn-mini .headerlink:before,.rst-content h4 .btn-mini 
.headerlink:before,.rst-content h5 .btn-mini .headerlink:before,.rst-content h6 .btn-mini .headerlink:before,.rst-content p .btn-mini .headerlink:before,.rst-content table>caption .btn-mini .headerlink:before,.rst-content tt.download .btn-mini span:first-child:before,.wy-menu-vertical li .btn-mini button.toctree-expand:before{font-size:14px;vertical-align:-15%}.rst-content .admonition,.rst-content .admonition-todo,.rst-content .attention,.rst-content .caution,.rst-content .danger,.rst-content .error,.rst-content .hint,.rst-content .important,.rst-content .note,.rst-content .seealso,.rst-content .tip,.rst-content .warning,.wy-alert{padding:12px;line-height:24px;margin-bottom:24px;background:#e7f2fa}.rst-content .admonition-title,.wy-alert-title{font-weight:700;display:block;color:#fff;background:#6ab0de;padding:6px 12px;margin:-12px -12px 12px}.rst-content .danger,.rst-content .error,.rst-content .wy-alert-danger.admonition,.rst-content .wy-alert-danger.admonition-todo,.rst-content .wy-alert-danger.attention,.rst-content .wy-alert-danger.caution,.rst-content .wy-alert-danger.hint,.rst-content .wy-alert-danger.important,.rst-content .wy-alert-danger.note,.rst-content .wy-alert-danger.seealso,.rst-content .wy-alert-danger.tip,.rst-content .wy-alert-danger.warning,.wy-alert.wy-alert-danger{background:#fdf3f2}.rst-content .danger .admonition-title,.rst-content .danger .wy-alert-title,.rst-content .error .admonition-title,.rst-content .error .wy-alert-title,.rst-content .wy-alert-danger.admonition-todo .admonition-title,.rst-content .wy-alert-danger.admonition-todo .wy-alert-title,.rst-content .wy-alert-danger.admonition .admonition-title,.rst-content .wy-alert-danger.admonition .wy-alert-title,.rst-content .wy-alert-danger.attention .admonition-title,.rst-content .wy-alert-danger.attention .wy-alert-title,.rst-content .wy-alert-danger.caution .admonition-title,.rst-content .wy-alert-danger.caution .wy-alert-title,.rst-content .wy-alert-danger.hint 
.admonition-title,.rst-content .wy-alert-danger.hint .wy-alert-title,.rst-content .wy-alert-danger.important .admonition-title,.rst-content .wy-alert-danger.important .wy-alert-title,.rst-content .wy-alert-danger.note .admonition-title,.rst-content .wy-alert-danger.note .wy-alert-title,.rst-content .wy-alert-danger.seealso .admonition-title,.rst-content .wy-alert-danger.seealso .wy-alert-title,.rst-content .wy-alert-danger.tip .admonition-title,.rst-content .wy-alert-danger.tip .wy-alert-title,.rst-content .wy-alert-danger.warning .admonition-title,.rst-content .wy-alert-danger.warning .wy-alert-title,.rst-content .wy-alert.wy-alert-danger .admonition-title,.wy-alert.wy-alert-danger .rst-content .admonition-title,.wy-alert.wy-alert-danger .wy-alert-title{background:#f29f97}.rst-content .admonition-todo,.rst-content .attention,.rst-content .caution,.rst-content .warning,.rst-content .wy-alert-warning.admonition,.rst-content .wy-alert-warning.danger,.rst-content .wy-alert-warning.error,.rst-content .wy-alert-warning.hint,.rst-content .wy-alert-warning.important,.rst-content .wy-alert-warning.note,.rst-content .wy-alert-warning.seealso,.rst-content .wy-alert-warning.tip,.wy-alert.wy-alert-warning{background:#ffedcc}.rst-content .admonition-todo .admonition-title,.rst-content .admonition-todo .wy-alert-title,.rst-content .attention .admonition-title,.rst-content .attention .wy-alert-title,.rst-content .caution .admonition-title,.rst-content .caution .wy-alert-title,.rst-content .warning .admonition-title,.rst-content .warning .wy-alert-title,.rst-content .wy-alert-warning.admonition .admonition-title,.rst-content .wy-alert-warning.admonition .wy-alert-title,.rst-content .wy-alert-warning.danger .admonition-title,.rst-content .wy-alert-warning.danger .wy-alert-title,.rst-content .wy-alert-warning.error .admonition-title,.rst-content .wy-alert-warning.error .wy-alert-title,.rst-content .wy-alert-warning.hint .admonition-title,.rst-content .wy-alert-warning.hint 
.wy-alert-title,
.rst-content .wy-alert-warning.important .admonition-title,
.rst-content .wy-alert-warning.important .wy-alert-title,
.rst-content .wy-alert-warning.note .admonition-title,
.rst-content .wy-alert-warning.note .wy-alert-title,
.rst-content .wy-alert-warning.seealso .admonition-title,
.rst-content .wy-alert-warning.seealso .wy-alert-title,
.rst-content .wy-alert-warning.tip .admonition-title,
.rst-content .wy-alert-warning.tip .wy-alert-title,
.rst-content .wy-alert.wy-alert-warning .admonition-title,
.wy-alert.wy-alert-warning .rst-content .admonition-title,
.wy-alert.wy-alert-warning .wy-alert-title {
  background: #f0b37e;
}

/* Info variant: body background for note/seealso and anything tagged .wy-alert-info. */
.rst-content .note,
.rst-content .seealso,
.rst-content .wy-alert-info.admonition,
.rst-content .wy-alert-info.admonition-todo,
.rst-content .wy-alert-info.attention,
.rst-content .wy-alert-info.caution,
.rst-content .wy-alert-info.danger,
.rst-content .wy-alert-info.error,
.rst-content .wy-alert-info.hint,
.rst-content .wy-alert-info.important,
.rst-content .wy-alert-info.tip,
.rst-content .wy-alert-info.warning,
.wy-alert.wy-alert-info {
  background: #e7f2fa;
}

/* Info variant: title bar background. */
.rst-content .note .admonition-title,
.rst-content .note .wy-alert-title,
.rst-content .seealso .admonition-title,
.rst-content .seealso .wy-alert-title,
.rst-content .wy-alert-info.admonition-todo .admonition-title,
.rst-content .wy-alert-info.admonition-todo .wy-alert-title,
.rst-content .wy-alert-info.admonition .admonition-title,
.rst-content .wy-alert-info.admonition .wy-alert-title,
.rst-content .wy-alert-info.attention .admonition-title,
.rst-content .wy-alert-info.attention .wy-alert-title,
.rst-content .wy-alert-info.caution .admonition-title,
.rst-content .wy-alert-info.caution .wy-alert-title,
.rst-content .wy-alert-info.danger .admonition-title,
.rst-content .wy-alert-info.danger .wy-alert-title,
.rst-content .wy-alert-info.error .admonition-title,
.rst-content .wy-alert-info.error .wy-alert-title,
.rst-content .wy-alert-info.hint .admonition-title,
.rst-content .wy-alert-info.hint .wy-alert-title,
.rst-content .wy-alert-info.important .admonition-title,
.rst-content .wy-alert-info.important .wy-alert-title,
.rst-content .wy-alert-info.tip .admonition-title,
.rst-content .wy-alert-info.tip .wy-alert-title,
.rst-content .wy-alert-info.warning .admonition-title,
.rst-content .wy-alert-info.warning .wy-alert-title,
.rst-content .wy-alert.wy-alert-info .admonition-title,
.wy-alert.wy-alert-info .rst-content .admonition-title,
.wy-alert.wy-alert-info .wy-alert-title {
  background: #6ab0de;
}

/* Success variant: body background for hint/important/tip and anything tagged .wy-alert-success. */
.rst-content .hint,
.rst-content .important,
.rst-content .tip,
.rst-content .wy-alert-success.admonition,
.rst-content .wy-alert-success.admonition-todo,
.rst-content .wy-alert-success.attention,
.rst-content .wy-alert-success.caution,
.rst-content .wy-alert-success.danger,
.rst-content .wy-alert-success.error,
.rst-content .wy-alert-success.note,
.rst-content .wy-alert-success.seealso,
.rst-content .wy-alert-success.warning,
.wy-alert.wy-alert-success {
  background: #dbfaf4;
}

/* Success variant: title bar background. */
.rst-content .hint .admonition-title,
.rst-content .hint .wy-alert-title,
.rst-content .important .admonition-title,
.rst-content .important .wy-alert-title,
.rst-content .tip .admonition-title,
.rst-content .tip .wy-alert-title,
.rst-content .wy-alert-success.admonition-todo .admonition-title,
.rst-content .wy-alert-success.admonition-todo .wy-alert-title,
.rst-content .wy-alert-success.admonition .admonition-title,
.rst-content .wy-alert-success.admonition .wy-alert-title,
.rst-content .wy-alert-success.attention .admonition-title,
.rst-content .wy-alert-success.attention .wy-alert-title,
.rst-content .wy-alert-success.caution .admonition-title,
.rst-content .wy-alert-success.caution .wy-alert-title,
.rst-content .wy-alert-success.danger .admonition-title,
.rst-content .wy-alert-success.danger .wy-alert-title,
.rst-content .wy-alert-success.error .admonition-title,
.rst-content .wy-alert-success.error .wy-alert-title,
.rst-content .wy-alert-success.note .admonition-title,
.rst-content .wy-alert-success.note .wy-alert-title,
.rst-content .wy-alert-success.seealso .admonition-title,
.rst-content .wy-alert-success.seealso .wy-alert-title,
.rst-content .wy-alert-success.warning .admonition-title,
.rst-content .wy-alert-success.warning .wy-alert-title,
.rst-content .wy-alert.wy-alert-success .admonition-title,
.wy-alert.wy-alert-success .rst-content .admonition-title,
.wy-alert.wy-alert-success .wy-alert-title {
  background: #1abc9c;
}

/* Neutral variant: body background. */
.rst-content .wy-alert-neutral.admonition,
.rst-content .wy-alert-neutral.admonition-todo,
.rst-content .wy-alert-neutral.attention,
.rst-content .wy-alert-neutral.caution,
.rst-content .wy-alert-neutral.danger,
.rst-content .wy-alert-neutral.error,
.rst-content .wy-alert-neutral.hint,
.rst-content .wy-alert-neutral.important,
.rst-content .wy-alert-neutral.note,
.rst-content .wy-alert-neutral.seealso,
.rst-content .wy-alert-neutral.tip,
.rst-content .wy-alert-neutral.warning,
.wy-alert.wy-alert-neutral {
  background: #f3f6f6;
}

/* Neutral variant: title bar gets dark text on a grey background. */
.rst-content .wy-alert-neutral.admonition-todo .admonition-title,
.rst-content .wy-alert-neutral.admonition-todo .wy-alert-title,
.rst-content .wy-alert-neutral.admonition .admonition-title,
.rst-content .wy-alert-neutral.admonition .wy-alert-title,
.rst-content .wy-alert-neutral.attention .admonition-title,
.rst-content .wy-alert-neutral.attention .wy-alert-title,
.rst-content .wy-alert-neutral.caution .admonition-title,
.rst-content .wy-alert-neutral.caution .wy-alert-title,
.rst-content .wy-alert-neutral.danger .admonition-title,
.rst-content .wy-alert-neutral.danger .wy-alert-title,
.rst-content .wy-alert-neutral.error .admonition-title,
.rst-content .wy-alert-neutral.error .wy-alert-title,
.rst-content .wy-alert-neutral.hint .admonition-title,
.rst-content .wy-alert-neutral.hint .wy-alert-title,
.rst-content .wy-alert-neutral.important .admonition-title,
.rst-content .wy-alert-neutral.important .wy-alert-title,
.rst-content .wy-alert-neutral.note .admonition-title,
.rst-content .wy-alert-neutral.note .wy-alert-title,
.rst-content .wy-alert-neutral.seealso .admonition-title,
.rst-content .wy-alert-neutral.seealso .wy-alert-title,
.rst-content .wy-alert-neutral.tip .admonition-title,
.rst-content .wy-alert-neutral.tip .wy-alert-title,
.rst-content .wy-alert-neutral.warning .admonition-title,
.rst-content .wy-alert-neutral.warning .wy-alert-title,
.rst-content .wy-alert.wy-alert-neutral .admonition-title,
.wy-alert.wy-alert-neutral .rst-content .admonition-title,
.wy-alert.wy-alert-neutral .wy-alert-title {
  color: #404040;
  background: #e1e4e5;
}

/* Neutral variant: link colour inside the alert. */
.rst-content .wy-alert-neutral.admonition-todo a,
.rst-content .wy-alert-neutral.admonition a,
.rst-content .wy-alert-neutral.attention a,
.rst-content .wy-alert-neutral.caution a,
.rst-content .wy-alert-neutral.danger a,
.rst-content .wy-alert-neutral.error a,
.rst-content .wy-alert-neutral.hint a,
.rst-content .wy-alert-neutral.important a,
.rst-content .wy-alert-neutral.note a,
.rst-content .wy-alert-neutral.seealso a,
.rst-content .wy-alert-neutral.tip a,
.rst-content .wy-alert-neutral.warning a,
.wy-alert.wy-alert-neutral a {
  color: #2980b9;
}

/* The last paragraph of any alert carries no bottom margin. */
.rst-content .admonition-todo p:last-child,
.rst-content .admonition p:last-child,
.rst-content .attention p:last-child,
.rst-content .caution p:last-child,
.rst-content .danger p:last-child,
.rst-content .error p:last-child,
.rst-content .hint p:last-child,
.rst-content .important p:last-child,
.rst-content .note p:last-child,
.rst-content .seealso p:last-child,
.rst-content .tip p:last-child,
.rst-content .warning p:last-child,
.wy-alert p:last-child {
  margin-bottom: 0;
}

/* Tray container: pinned to the bottom-left corner of the viewport. */
.wy-tray-container {
  position: fixed;
  bottom: 0;
  left: 0;
  z-index: 600;
}

/* Tray items start fully transparent with zero height; all properties transition over .3s. */
.wy-tray-container li {
  display: block;
  width: 300px;
  background: transparent;
  color: #fff;
  text-align: center;
  box-shadow: 0 5px 5px 0 rgba(0,0,0,.1);
  padding: 0 24px;
  min-width: 20%;
  opacity: 0;
  height: 0;
  line-height: 56px;
  overflow: hidden;
  -webkit-transition: all .3s ease-in;
  -moz-transition: all .3s ease-in;
  transition: all .3s ease-in;
}

/* Per-status tray item backgrounds. */
.wy-tray-container li.wy-tray-item-success {
  background: #27ae60;
}

.wy-tray-container li.wy-tray-item-info {
  background: #2980b9;
}

.wy-tray-container li.wy-tray-item-warning {
  background: #e67e22;
}

.wy-tray-container
li.wy-tray-item-danger {
  background: #e74c3c;
}

/* The .on class shows a tray item at full opacity and 56px height. */
.wy-tray-container li.on {
  opacity: 1;
  height: 56px;
}

/* At 768px wide and below, the tray moves to the top edge and spans the full width. */
@media screen and (max-width: 768px) {
  .wy-tray-container {
    bottom: auto;
    top: 0;
    width: 100%;
  }

  .wy-tray-container li {
    width: 100%;
  }
}

/* Native <button> reset. The *-prefixed declarations look like legacy-IE
   star hacks; they are kept exactly as written. */
button {
  font-size: 100%;
  margin: 0;
  vertical-align: baseline;
  *vertical-align: middle;
  cursor: pointer;
  line-height: normal;
  -webkit-appearance: button;
  *overflow: visible;
}

button::-moz-focus-inner,
input::-moz-focus-inner {
  border: 0;
  padding: 0;
}

button[disabled] {
  cursor: default;
}

/* Base button: green, white text, inset highlight and shadow, non-selectable label. */
.btn {
  display: inline-block;
  border-radius: 2px;
  line-height: normal;
  white-space: nowrap;
  text-align: center;
  cursor: pointer;
  font-size: 100%;
  padding: 6px 12px 8px;
  color: #fff;
  border: 1px solid rgba(0,0,0,.1);
  background-color: #27ae60;
  text-decoration: none;
  font-weight: 400;
  font-family: Lato, proxima-nova, Helvetica Neue, Arial, sans-serif;
  box-shadow: inset 0 1px 2px -1px hsla(0,0%,100%,.5), inset 0 -2px 0 0 rgba(0,0,0,.1);
  /* No outline declaration on the base rule: "outline-none" is not a CSS
     property, so browsers discarded it. The focus outline is reset in .btn:focus. */
  vertical-align: middle;
  *display: inline;
  zoom: 1;
  -webkit-user-drag: none;
  -webkit-user-select: none;
  -moz-user-select: none;
  -ms-user-select: none;
  user-select: none;
  -webkit-transition: all .1s linear;
  -moz-transition: all .1s linear;
  transition: all .1s linear;
}

/* Note: .btn-hover is a class selector, not the :hover pseudo-class. */
.btn-hover {
  background: #2e8ece;
  color: #fff;
}

.btn:hover {
  background: #2cc36b;
  color: #fff;
}

.btn:focus {
  background: #2cc36b;
  outline: 0;
}

/* Pressed state: shadow changes and the top/bottom padding values are swapped. */
.btn:active {
  box-shadow: inset 0 -1px 0 0 rgba(0,0,0,.05), inset 0 2px 0 0 rgba(0,0,0,.1);
  padding: 8px 12px 6px;
}

.btn:visited {
  color: #fff;
}

/* Disabled buttons: 40% opacity, no shadow, not-allowed cursor. The two
   filter declarations are legacy IE syntax and are reproduced verbatim. */
.btn-disabled,
.btn-disabled:active,
.btn-disabled:focus,
.btn-disabled:hover,
.btn:disabled {
  background-image: none;
  filter: progid:DXImageTransform.Microsoft.gradient(enabled = false);
  filter: alpha(opacity=40);
  opacity: .4;
  cursor: not-allowed;
  box-shadow: none;
}

.btn::-moz-focus-inner {
  padding: 0;
  border: 0;
}

.btn-small {
  font-size: 80%;
}

/* Colour variants. Most use !important to override the base .btn background. */
.btn-info {
  background-color: #2980b9 !important;
}

.btn-info:hover {
  background-color: #2e8ece !important;
}

.btn-neutral {
  background-color: #f3f6f6 !important;
  color: #404040 !important;
}

.btn-neutral:hover {
  background-color: #e5ebeb !important;
  color: #404040;
}

.btn-neutral:visited {
  color: #404040 !important;
}

.btn-success {
  background-color: #27ae60 !important;
}

.btn-success:hover {
  background-color: #295 !important;
}

.btn-danger {
  background-color: #e74c3c !important;
}

.btn-danger:hover {
  background-color: #ea6153 !important;
}

.btn-warning {
  background-color: #e67e22 !important;
}

.btn-warning:hover {
  background-color: #e98b39 !important;
}

.btn-invert {
  background-color: #222;
}

.btn-invert:hover {
  background-color: #2f2f2f !important;
}

/* Link-styled button: transparent background and border, link colours for text. */
.btn-link {
  background-color: transparent !important;
  color: #2980b9;
  box-shadow: none;
  border-color: transparent !important;
}

.btn-link:active,
.btn-link:hover {
  background-color: transparent !important;
  color: #409ad5 !important;
  box-shadow: none;
}

.btn-link:visited {
  color: #9b59b6;
}

.wy-btn-group .btn,
.wy-control .btn {
  vertical-align: middle;
}

/* Button group with a clearfix (table pseudo-elements plus clear: both). */
.wy-btn-group {
  margin-bottom: 24px;
  *zoom: 1;
}

.wy-btn-group:after,
.wy-btn-group:before {
  display: table;
  content: "";
}

.wy-btn-group:after {
  clear: both;
}

/* Dropdown: the menu is hidden until an ancestor has .wy-dropdown-active. */
.wy-dropdown {
  position: relative;
  display: inline-block;
}

.wy-dropdown-active .wy-dropdown-menu {
  display: block;
}

.wy-dropdown-menu {
  position: absolute;
  left: 0;
  display: none;
  float: left;
  top: 100%;
  min-width: 100%;
  background: #fcfcfc;
  z-index: 100;
  border: 1px solid #cfd7dd;
  box-shadow: 0 2px 2px 0 rgba(0,0,0,.1);
  padding: 12px;
}

.wy-dropdown-menu > dd > a {
  display: block;
  clear: both;
  color: #404040;
  white-space: nowrap;
  font-size: 90%;
  padding: 0 12px;
  cursor: pointer;
}

.wy-dropdown-menu > dd > a:hover {
  background: #2980b9;
  color: #fff;
}

.wy-dropdown-menu > dd.divider {
  border-top: 1px solid #cfd7dd;
  margin: 6px 0;
}

.wy-dropdown-menu > dd.search {
  padding-bottom: 12px;
}

.wy-dropdown-menu > dd.search
input[type=search] {
  width: 100%;
}

/* Call-to-action dropdown row: same background at rest and on hover. */
.wy-dropdown-menu > dd.call-to-action {
  background: #e3e3e3;
  text-transform: uppercase;
  font-weight: 500;
  font-size: 80%;
}

.wy-dropdown-menu > dd.call-to-action:hover {
  background: #e3e3e3;
}

.wy-dropdown-menu > dd.call-to-action .btn {
  color: #fff;
}

/* Placement variants: open upward, bubble style, right-aligned. */
.wy-dropdown.wy-dropdown-up .wy-dropdown-menu {
  bottom: 100%;
  top: auto;
  left: auto;
  right: 0;
}

.wy-dropdown.wy-dropdown-bubble .wy-dropdown-menu {
  background: #fcfcfc;
  margin-top: 2px;
}

.wy-dropdown.wy-dropdown-bubble .wy-dropdown-menu a {
  padding: 6px 12px;
}

.wy-dropdown.wy-dropdown-bubble .wy-dropdown-menu a:hover {
  background: #2980b9;
  color: #fff;
}

.wy-dropdown.wy-dropdown-left .wy-dropdown-menu {
  right: 0;
  left: auto;
  text-align: right;
}

/* Arrow built from borders: only the bottom border is coloured, the side borders are transparent. */
.wy-dropdown-arrow:before {
  content: " ";
  border-bottom: 5px solid #f5f5f5;
  border-left: 5px solid transparent;
  border-right: 5px solid transparent;
  position: absolute;
  display: block;
  top: -4px;
  left: 50%;
  margin-left: -3px;
}

.wy-dropdown-arrow.wy-dropdown-arrow-left:before {
  left: 11px;
}

/* Stacked and aligned form layouts. */
.wy-form-stacked select {
  display: block;
}

.wy-form-aligned .wy-help-inline,
.wy-form-aligned input,
.wy-form-aligned label,
.wy-form-aligned select,
.wy-form-aligned textarea {
  display: inline-block;
  *display: inline;
  *zoom: 1;
  vertical-align: middle;
}

.wy-form-aligned .wy-control-group > label {
  display: inline-block;
  vertical-align: middle;
  width: 10em;
  margin: 6px 12px 0 0;
  float: left;
}

.wy-form-aligned .wy-control {
  float: left;
}

.wy-form-aligned .wy-control label {
  display: block;
}

.wy-form-aligned .wy-control select {
  margin-top: 6px;
}

/* fieldset / legend / label resets. */
fieldset {
  margin: 0;
}

fieldset,
legend {
  border: 0;
  padding: 0;
}

legend {
  width: 100%;
  white-space: normal;
  margin-bottom: 24px;
  font-size: 150%;
  *margin-left: -7px;
}

label,
legend {
  display: block;
}

label {
  margin: 0 0 .3125em;
  color: #333;
  font-size: 90%;
}

input,
select,
textarea {
  font-size: 100%;
  margin: 0;
  vertical-align: baseline;
  *vertical-align: middle;
}

/* Control group: centred, capped at 1200px wide, with a clearfix. */
.wy-control-group {
  margin-bottom: 24px;
  max-width: 1200px;
  margin-left: auto;
  margin-right: auto;
  *zoom: 1;
}

.wy-control-group:after,
.wy-control-group:before {
  display: table;
  content: "";
}

.wy-control-group:after {
  clear: both;
}

/* Required groups append a red " *" after the label. */
.wy-control-group.wy-control-group-required > label:after {
  content: " *";
  color: #e74c3c;
}

.wy-control-group .wy-form-full,
.wy-control-group .wy-form-halves,
.wy-control-group .wy-form-thirds {
  padding-bottom: 12px;
}

/* Selector list for text-like inputs and selects inside full/halves/thirds
   columns; the list continues beyond this span. */
.wy-control-group .wy-form-full input[type=color],
.wy-control-group .wy-form-full input[type=date],
.wy-control-group .wy-form-full input[type=datetime-local],
.wy-control-group .wy-form-full input[type=datetime],
.wy-control-group .wy-form-full input[type=email],
.wy-control-group .wy-form-full input[type=month],
.wy-control-group .wy-form-full input[type=number],
.wy-control-group .wy-form-full input[type=password],
.wy-control-group .wy-form-full input[type=search],
.wy-control-group .wy-form-full input[type=tel],
.wy-control-group .wy-form-full input[type=text],
.wy-control-group .wy-form-full input[type=time],
.wy-control-group .wy-form-full input[type=url],
.wy-control-group .wy-form-full input[type=week],
.wy-control-group .wy-form-full select,
.wy-control-group .wy-form-halves input[type=color],
.wy-control-group .wy-form-halves input[type=date],
.wy-control-group .wy-form-halves input[type=datetime-local],
.wy-control-group .wy-form-halves input[type=datetime],
.wy-control-group .wy-form-halves input[type=email],
.wy-control-group .wy-form-halves input[type=month],
.wy-control-group .wy-form-halves input[type=number],
.wy-control-group .wy-form-halves input[type=password],
.wy-control-group .wy-form-halves input[type=search],
.wy-control-group .wy-form-halves input[type=tel],
.wy-control-group .wy-form-halves input[type=text],
.wy-control-group .wy-form-halves input[type=time],
.wy-control-group .wy-form-halves input[type=url],
.wy-control-group
.wy-form-halves input[type=week],
.wy-control-group .wy-form-halves select,
.wy-control-group .wy-form-thirds input[type=color],
.wy-control-group .wy-form-thirds input[type=date],
.wy-control-group .wy-form-thirds input[type=datetime-local],
.wy-control-group .wy-form-thirds input[type=datetime],
.wy-control-group .wy-form-thirds input[type=email],
.wy-control-group .wy-form-thirds input[type=month],
.wy-control-group .wy-form-thirds input[type=number],
.wy-control-group .wy-form-thirds input[type=password],
.wy-control-group .wy-form-thirds input[type=search],
.wy-control-group .wy-form-thirds input[type=tel],
.wy-control-group .wy-form-thirds input[type=text],
.wy-control-group .wy-form-thirds input[type=time],
.wy-control-group .wy-form-thirds input[type=url],
.wy-control-group .wy-form-thirds input[type=week],
.wy-control-group .wy-form-thirds select {
  width: 100%;
}

/* Full-width column. */
.wy-control-group .wy-form-full {
  float: left;
  display: block;
  width: 100%;
  margin-right: 0;
}

.wy-control-group .wy-form-full:last-child {
  margin-right: 0;
}

/* Two-column grid: every second column drops its gutter, odd columns clear the float. */
.wy-control-group .wy-form-halves {
  float: left;
  display: block;
  margin-right: 2.35765%;
  width: 48.82117%;
}

.wy-control-group .wy-form-halves:last-child,
.wy-control-group .wy-form-halves:nth-of-type(2n) {
  margin-right: 0;
}

.wy-control-group .wy-form-halves:nth-of-type(odd) {
  clear: left;
}

/* Three-column grid: every third column drops its gutter, columns 1, 4, 7... clear the float. */
.wy-control-group .wy-form-thirds {
  float: left;
  display: block;
  margin-right: 2.35765%;
  width: 31.76157%;
}

.wy-control-group .wy-form-thirds:last-child,
.wy-control-group .wy-form-thirds:nth-of-type(3n) {
  margin-right: 0;
}

.wy-control-group .wy-form-thirds:nth-of-type(3n+1) {
  clear: left;
}

/* Controls that hold no input element. */
.wy-control-group.wy-control-group-no-input .wy-control,
.wy-control-no-input {
  margin: 6px 0 0;
  font-size: 90%;
}

.wy-control-no-input {
  display: inline-block;
}

/* Selector list for text-like inputs inside a .fluid-input group; the list
   continues beyond this span. */
.wy-control-group.fluid-input input[type=color],
.wy-control-group.fluid-input input[type=date],
.wy-control-group.fluid-input input[type=datetime-local],
.wy-control-group.fluid-input input[type=datetime],
.wy-control-group.fluid-input input[type=email],
.wy-control-group.fluid-input
input[type=month],.wy-control-group.fluid-input input[type=number],.wy-control-group.fluid-input input[type=password],.wy-control-group.fluid-input input[type=search],.wy-control-group.fluid-input input[type=tel],.wy-control-group.fluid-input input[type=text],.wy-control-group.fluid-input input[type=time],.wy-control-group.fluid-input input[type=url],.wy-control-group.fluid-input input[type=week]{width:100%}.wy-form-message-inline{padding-left:.3em;color:#666;font-size:90%}.wy-form-message{display:block;color:#999;font-size:70%;margin-top:.3125em;font-style:italic}.wy-form-message p{font-size:inherit;font-style:italic;margin-bottom:6px}.wy-form-message p:last-child{margin-bottom:0}input{line-height:normal}input[type=button],input[type=reset],input[type=submit]{-webkit-appearance:button;cursor:pointer;font-family:Lato,proxima-nova,Helvetica Neue,Arial,sans-serif;*overflow:visible}input[type=color],input[type=date],input[type=datetime-local],input[type=datetime],input[type=email],input[type=month],input[type=number],input[type=password],input[type=search],input[type=tel],input[type=text],input[type=time],input[type=url],input[type=week]{-webkit-appearance:none;padding:6px;display:inline-block;border:1px solid #ccc;font-size:80%;font-family:Lato,proxima-nova,Helvetica Neue,Arial,sans-serif;box-shadow:inset 0 1px 3px #ddd;border-radius:0;-webkit-transition:border .3s linear;-moz-transition:border .3s linear;transition:border .3s linear}input[type=datetime-local]{padding:.34375em 
.625em}input[disabled]{cursor:default}input[type=checkbox],input[type=radio]{padding:0;margin-right:.3125em;*height:13px;*width:13px}input[type=checkbox],input[type=radio],input[type=search]{-webkit-box-sizing:border-box;-moz-box-sizing:border-box;box-sizing:border-box}input[type=search]::-webkit-search-cancel-button,input[type=search]::-webkit-search-decoration{-webkit-appearance:none}input[type=color]:focus,input[type=date]:focus,input[type=datetime-local]:focus,input[type=datetime]:focus,input[type=email]:focus,input[type=month]:focus,input[type=number]:focus,input[type=password]:focus,input[type=search]:focus,input[type=tel]:focus,input[type=text]:focus,input[type=time]:focus,input[type=url]:focus,input[type=week]:focus{outline:0;outline:thin dotted\9;border-color:#333}input.no-focus:focus{border-color:#ccc!important}input[type=checkbox]:focus,input[type=file]:focus,input[type=radio]:focus{outline:thin dotted #333;outline:1px auto #129fea}input[type=color][disabled],input[type=date][disabled],input[type=datetime-local][disabled],input[type=datetime][disabled],input[type=email][disabled],input[type=month][disabled],input[type=number][disabled],input[type=password][disabled],input[type=search][disabled],input[type=tel][disabled],input[type=text][disabled],input[type=time][disabled],input[type=url][disabled],input[type=week][disabled]{cursor:not-allowed;background-color:#fafafa}input:focus:invalid,select:focus:invalid,textarea:focus:invalid{color:#e74c3c;border:1px solid #e74c3c}input:focus:invalid:focus,select:focus:invalid:focus,textarea:focus:invalid:focus{border-color:#e74c3c}input[type=checkbox]:focus:invalid:focus,input[type=file]:focus:invalid:focus,input[type=radio]:focus:invalid:focus{outline-color:#e74c3c}input.wy-input-large{padding:12px;font-size:100%}textarea{overflow:auto;vertical-align:top;width:100%;font-family:Lato,proxima-nova,Helvetica Neue,Arial,sans-serif}select,textarea{padding:.5em .625em;display:inline-block;border:1px solid 
#ccc;font-size:80%;box-shadow:inset 0 1px 3px #ddd;-webkit-transition:border .3s linear;-moz-transition:border .3s linear;transition:border .3s linear}select{border:1px solid #ccc;background-color:#fff}select[multiple]{height:auto}/* The outline is removed on focus, so show focus with the same border colour the text inputs use. */select:focus,textarea:focus{outline:0;border-color:#333}input[readonly],select[disabled],select[readonly],textarea[disabled],textarea[readonly]{cursor:not-allowed;background-color:#fafafa}input[type=checkbox][disabled],input[type=radio][disabled]{cursor:not-allowed}.wy-checkbox,.wy-radio{margin:6px 0;color:#404040;display:block}.wy-checkbox input,.wy-radio input{vertical-align:baseline}.wy-form-message-inline{display:inline-block;*display:inline;*zoom:1;vertical-align:middle}.wy-input-prefix,.wy-input-suffix{white-space:nowrap;padding:6px}.wy-input-prefix .wy-input-context,.wy-input-suffix .wy-input-context{line-height:27px;padding:0 8px;display:inline-block;font-size:80%;background-color:#f3f6f6;border:1px solid #ccc;color:#999}.wy-input-suffix .wy-input-context{border-left:0}.wy-input-prefix .wy-input-context{border-right:0}.wy-switch{position:relative;display:block;height:24px;margin-top:12px;cursor:pointer}.wy-switch:before{left:0;top:0;width:36px;height:12px;background:#ccc}.wy-switch:after,.wy-switch:before{position:absolute;content:"";display:block;border-radius:4px;-webkit-transition:all .2s ease-in-out;-moz-transition:all .2s ease-in-out;transition:all .2s ease-in-out}.wy-switch:after{width:18px;height:18px;background:#999;left:-3px;top:-3px}.wy-switch span{position:absolute;left:48px;display:block;font-size:12px;color:#ccc;line-height:1}.wy-switch.active:before{background:#1e8449}.wy-switch.active:after{left:24px;background:#27ae60}.wy-switch.disabled{cursor:not-allowed;opacity:.8}.wy-control-group.wy-control-group-error .wy-form-message,.wy-control-group.wy-control-group-error>label{color:#e74c3c}.wy-control-group.wy-control-group-error input[type=color],.wy-control-group.wy-control-group-error 
input[type=date],.wy-control-group.wy-control-group-error input[type=datetime-local],.wy-control-group.wy-control-group-error input[type=datetime],.wy-control-group.wy-control-group-error input[type=email],.wy-control-group.wy-control-group-error input[type=month],.wy-control-group.wy-control-group-error input[type=number],.wy-control-group.wy-control-group-error input[type=password],.wy-control-group.wy-control-group-error input[type=search],.wy-control-group.wy-control-group-error input[type=tel],.wy-control-group.wy-control-group-error input[type=text],.wy-control-group.wy-control-group-error input[type=time],.wy-control-group.wy-control-group-error input[type=url],.wy-control-group.wy-control-group-error input[type=week],.wy-control-group.wy-control-group-error textarea{border:1px solid #e74c3c}.wy-inline-validate{white-space:nowrap}.wy-inline-validate .wy-input-context{padding:.5em .625em;display:inline-block;font-size:80%}.wy-inline-validate.wy-inline-validate-success .wy-input-context{color:#27ae60}.wy-inline-validate.wy-inline-validate-danger .wy-input-context{color:#e74c3c}.wy-inline-validate.wy-inline-validate-warning .wy-input-context{color:#e67e22}.wy-inline-validate.wy-inline-validate-info .wy-input-context{color:#2980b9}.rotate-90{-webkit-transform:rotate(90deg);-moz-transform:rotate(90deg);-ms-transform:rotate(90deg);-o-transform:rotate(90deg);transform:rotate(90deg)}.rotate-180{-webkit-transform:rotate(180deg);-moz-transform:rotate(180deg);-ms-transform:rotate(180deg);-o-transform:rotate(180deg);transform:rotate(180deg)}.rotate-270{-webkit-transform:rotate(270deg);-moz-transform:rotate(270deg);-ms-transform:rotate(270deg);-o-transform:rotate(270deg);transform:rotate(270deg)}.mirror{-webkit-transform:scaleX(-1);-moz-transform:scaleX(-1);-ms-transform:scaleX(-1);-o-transform:scaleX(-1);transform:scaleX(-1)}.mirror.rotate-90{-webkit-transform:scaleX(-1) rotate(90deg);-moz-transform:scaleX(-1) rotate(90deg);-ms-transform:scaleX(-1) 
rotate(90deg);-o-transform:scaleX(-1) rotate(90deg);transform:scaleX(-1) rotate(90deg)}.mirror.rotate-180{-webkit-transform:scaleX(-1) rotate(180deg);-moz-transform:scaleX(-1) rotate(180deg);-ms-transform:scaleX(-1) rotate(180deg);-o-transform:scaleX(-1) rotate(180deg);transform:scaleX(-1) rotate(180deg)}.mirror.rotate-270{-webkit-transform:scaleX(-1) rotate(270deg);-moz-transform:scaleX(-1) rotate(270deg);-ms-transform:scaleX(-1) rotate(270deg);-o-transform:scaleX(-1) rotate(270deg);transform:scaleX(-1) rotate(270deg)}@media only screen and (max-width:480px){.wy-form button[type=submit]{margin:.7em 0 0}.wy-form input[type=color],.wy-form input[type=date],.wy-form input[type=datetime-local],.wy-form input[type=datetime],.wy-form input[type=email],.wy-form input[type=month],.wy-form input[type=number],.wy-form input[type=password],.wy-form input[type=search],.wy-form input[type=tel],.wy-form input[type=text],.wy-form input[type=time],.wy-form input[type=url],.wy-form input[type=week],.wy-form label{margin-bottom:.3em;display:block}.wy-form input[type=color],.wy-form input[type=date],.wy-form input[type=datetime-local],.wy-form input[type=datetime],.wy-form input[type=email],.wy-form input[type=month],.wy-form input[type=number],.wy-form input[type=password],.wy-form input[type=search],.wy-form input[type=tel],.wy-form input[type=time],.wy-form input[type=url],.wy-form input[type=week]{margin-bottom:0}.wy-form-aligned .wy-control-group label{margin-bottom:.3em;text-align:left;display:block;width:100%}.wy-form-aligned .wy-control{margin:1.5em 0 0}.wy-form-message,.wy-form-message-inline,.wy-form .wy-help-inline{display:block;font-size:80%;padding:6px 0}}@media screen and (max-width:768px){.tablet-hide{display:none}}@media screen and (max-width:480px){.mobile-hide{display:none}}.float-left{float:left}.float-right{float:right}.full-width{width:100%}.rst-content table.docutils,.rst-content 
table.field-list,.wy-table{border-collapse:collapse;border-spacing:0;empty-cells:show;margin-bottom:24px}.rst-content table.docutils caption,.rst-content table.field-list caption,.wy-table caption{color:#000;font:italic 85%/1 arial,sans-serif;padding:1em 0;text-align:center}.rst-content table.docutils td,.rst-content table.docutils th,.rst-content table.field-list td,.rst-content table.field-list th,.wy-table td,.wy-table th{font-size:90%;margin:0;overflow:visible;padding:8px 16px}.rst-content table.docutils td:first-child,.rst-content table.docutils th:first-child,.rst-content table.field-list td:first-child,.rst-content table.field-list th:first-child,.wy-table td:first-child,.wy-table th:first-child{border-left-width:0}.rst-content table.docutils thead,.rst-content table.field-list thead,.wy-table thead{color:#000;text-align:left;vertical-align:bottom;white-space:nowrap}.rst-content table.docutils thead th,.rst-content table.field-list thead th,.wy-table thead th{font-weight:700;border-bottom:2px solid #e1e4e5}.rst-content table.docutils td,.rst-content table.field-list td,.wy-table td{background-color:transparent;vertical-align:middle}.rst-content table.docutils td p,.rst-content table.field-list td p,.wy-table td p{line-height:18px}.rst-content table.docutils td p:last-child,.rst-content table.field-list td p:last-child,.wy-table td p:last-child{margin-bottom:0}.rst-content table.docutils .wy-table-cell-min,.rst-content table.field-list .wy-table-cell-min,.wy-table .wy-table-cell-min{width:1%;padding-right:0}.rst-content table.docutils .wy-table-cell-min input[type=checkbox],.rst-content table.field-list .wy-table-cell-min input[type=checkbox],.wy-table .wy-table-cell-min input[type=checkbox]{margin:0}.wy-table-secondary{color:grey;font-size:90%}.wy-table-tertiary{color:grey;font-size:80%}.rst-content table.docutils:not(.field-list) tr:nth-child(2n-1) td,.wy-table-backed,.wy-table-odd td,.wy-table-striped tr:nth-child(2n-1) 
td{background-color:#f3f6f6}.rst-content table.docutils,.wy-table-bordered-all{border:1px solid #e1e4e5}.rst-content table.docutils td,.wy-table-bordered-all td{border-bottom:1px solid #e1e4e5;border-left:1px solid #e1e4e5}.rst-content table.docutils tbody>tr:last-child td,.wy-table-bordered-all tbody>tr:last-child td{border-bottom-width:0}.wy-table-bordered{border:1px solid #e1e4e5}.wy-table-bordered-rows td{border-bottom:1px solid #e1e4e5}.wy-table-bordered-rows tbody>tr:last-child td{border-bottom-width:0}.wy-table-horizontal td,.wy-table-horizontal th{border-width:0 0 1px;border-bottom:1px solid #e1e4e5}.wy-table-horizontal tbody>tr:last-child td{border-bottom-width:0}.wy-table-responsive{margin-bottom:24px;max-width:100%;overflow:auto}.wy-table-responsive table{margin-bottom:0!important}.wy-table-responsive table td,.wy-table-responsive table th{white-space:nowrap}a{color:#2980b9;text-decoration:none;cursor:pointer}a:hover{color:#3091d1}a:visited{color:#9b59b6}html{height:100%}body,html{overflow-x:hidden}body{font-family:Lato,proxima-nova,Helvetica Neue,Arial,sans-serif;font-weight:400;color:#404040;min-height:100%;background:#edf0f2}.wy-text-left{text-align:left}.wy-text-center{text-align:center}.wy-text-right{text-align:right}.wy-text-large{font-size:120%}.wy-text-normal{font-size:100%}.wy-text-small,small{font-size:80%}.wy-text-strike{text-decoration:line-through}.wy-text-warning{color:#e67e22!important}a.wy-text-warning:hover{color:#eb9950!important}.wy-text-info{color:#2980b9!important}a.wy-text-info:hover{color:#409ad5!important}.wy-text-success{color:#27ae60!important}a.wy-text-success:hover{color:#36d278!important}.wy-text-danger{color:#e74c3c!important}a.wy-text-danger:hover{color:#ed7669!important}.wy-text-neutral{color:#404040!important}a.wy-text-neutral:hover{color:#595959!important}.rst-content .toctree-wrapper>p.caption,h1,h2,h3,h4,h5,h6,legend{margin-top:0;font-weight:700;font-family:Roboto 
Slab,ff-tisa-web-pro,Georgia,Arial,sans-serif}p{line-height:24px;font-size:16px;margin:0 0 24px}h1{font-size:175%}.rst-content .toctree-wrapper>p.caption,h2{font-size:150%}h3{font-size:125%}h4{font-size:115%}h5{font-size:110%}h6{font-size:100%}hr{display:block;height:1px;border:0;border-top:1px solid #e1e4e5;margin:24px 0;padding:0}.rst-content code,.rst-content tt,code{white-space:nowrap;max-width:100%;background:#fff;border:1px solid #e1e4e5;font-size:75%;padding:0 5px;font-family:SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace;color:#e74c3c;overflow-x:auto}.rst-content tt.code-large,code.code-large{font-size:90%}.rst-content .section ul,.rst-content .toctree-wrapper ul,.rst-content section ul,.wy-plain-list-disc,article ul{list-style:disc;line-height:24px;margin-bottom:24px}.rst-content .section ul li,.rst-content .toctree-wrapper ul li,.rst-content section ul li,.wy-plain-list-disc li,article ul li{list-style:disc;margin-left:24px}.rst-content .section ul li p:last-child,.rst-content .section ul li ul,.rst-content .toctree-wrapper ul li p:last-child,.rst-content .toctree-wrapper ul li ul,.rst-content section ul li p:last-child,.rst-content section ul li ul,.wy-plain-list-disc li p:last-child,.wy-plain-list-disc li ul,article ul li p:last-child,article ul li ul{margin-bottom:0}.rst-content .section ul li li,.rst-content .toctree-wrapper ul li li,.rst-content section ul li li,.wy-plain-list-disc li li,article ul li li{list-style:circle}.rst-content .section ul li li li,.rst-content .toctree-wrapper ul li li li,.rst-content section ul li li li,.wy-plain-list-disc li li li,article ul li li li{list-style:square}.rst-content .section ul li ol li,.rst-content .toctree-wrapper ul li ol li,.rst-content section ul li ol li,.wy-plain-list-disc li ol li,article ul li ol li{list-style:decimal}.rst-content .section ol,.rst-content .section ol.arabic,.rst-content .toctree-wrapper ol,.rst-content .toctree-wrapper ol.arabic,.rst-content 
section ol,.rst-content section ol.arabic,.wy-plain-list-decimal,article ol{list-style:decimal;line-height:24px;margin-bottom:24px}.rst-content .section ol.arabic li,.rst-content .section ol li,.rst-content .toctree-wrapper ol.arabic li,.rst-content .toctree-wrapper ol li,.rst-content section ol.arabic li,.rst-content section ol li,.wy-plain-list-decimal li,article ol li{list-style:decimal;margin-left:24px}.rst-content .section ol.arabic li ul,.rst-content .section ol li p:last-child,.rst-content .section ol li ul,.rst-content .toctree-wrapper ol.arabic li ul,.rst-content .toctree-wrapper ol li p:last-child,.rst-content .toctree-wrapper ol li ul,.rst-content section ol.arabic li ul,.rst-content section ol li p:last-child,.rst-content section ol li ul,.wy-plain-list-decimal li p:last-child,.wy-plain-list-decimal li ul,article ol li p:last-child,article ol li ul{margin-bottom:0}.rst-content .section ol.arabic li ul li,.rst-content .section ol li ul li,.rst-content .toctree-wrapper ol.arabic li ul li,.rst-content .toctree-wrapper ol li ul li,.rst-content section ol.arabic li ul li,.rst-content section ol li ul li,.wy-plain-list-decimal li ul li,article ol li ul li{list-style:disc}.wy-breadcrumbs{*zoom:1}.wy-breadcrumbs:after,.wy-breadcrumbs:before{display:table;content:""}.wy-breadcrumbs:after{clear:both}.wy-breadcrumbs>li{display:inline-block;padding-top:5px}.wy-breadcrumbs>li.wy-breadcrumbs-aside{float:right}.rst-content .wy-breadcrumbs>li code,.rst-content .wy-breadcrumbs>li tt,.wy-breadcrumbs>li .rst-content tt,.wy-breadcrumbs>li code{all:inherit;color:inherit}.breadcrumb-item:before{content:"/";color:#bbb;font-size:13px;padding:0 6px 0 3px}.wy-breadcrumbs-extra{margin-bottom:0;color:#b3b3b3;font-size:80%;display:inline-block}@media screen and (max-width:480px){.wy-breadcrumbs-extra,.wy-breadcrumbs li.wy-breadcrumbs-aside{display:none}}@media print{.wy-breadcrumbs 
li.wy-breadcrumbs-aside{display:none}}html{font-size:16px}.wy-affix{position:fixed;top:1.618em}.wy-menu a:hover{text-decoration:none}.wy-menu-horiz{*zoom:1}.wy-menu-horiz:after,.wy-menu-horiz:before{display:table;content:""}.wy-menu-horiz:after{clear:both}.wy-menu-horiz li,.wy-menu-horiz ul{display:inline-block}.wy-menu-horiz li:hover{background:hsla(0,0%,100%,.1)}.wy-menu-horiz li.divide-left{border-left:1px solid #404040}.wy-menu-horiz li.divide-right{border-right:1px solid #404040}.wy-menu-horiz a{height:32px;display:inline-block;line-height:32px;padding:0 16px}.wy-menu-vertical{width:300px}.wy-menu-vertical header,.wy-menu-vertical p.caption{color:#55a5d9;height:32px;line-height:32px;padding:0 1.618em;margin:12px 0 0;display:block;font-weight:700;text-transform:uppercase;font-size:85%;white-space:nowrap}.wy-menu-vertical ul{margin-bottom:0}.wy-menu-vertical li.divide-top{border-top:1px solid #404040}.wy-menu-vertical li.divide-bottom{border-bottom:1px solid #404040}.wy-menu-vertical li.current{background:#e3e3e3}.wy-menu-vertical li.current a{color:grey;border-right:1px solid #c9c9c9;padding:.4045em 2.427em}.wy-menu-vertical li.current a:hover{background:#d6d6d6}.rst-content .wy-menu-vertical li tt,.wy-menu-vertical li .rst-content tt,.wy-menu-vertical li code{border:none;background:inherit;color:inherit;padding-left:0;padding-right:0}.wy-menu-vertical li button.toctree-expand{display:block;float:left;margin-left:-1.2em;line-height:18px;color:#4d4d4d;border:none;background:none;padding:0}.wy-menu-vertical li.current>a,.wy-menu-vertical li.on a{color:#404040;font-weight:700;position:relative;background:#fcfcfc;border:none;padding:.4045em 1.618em}.wy-menu-vertical li.current>a:hover,.wy-menu-vertical li.on a:hover{background:#fcfcfc}.wy-menu-vertical li.current>a:hover button.toctree-expand,.wy-menu-vertical li.on a:hover button.toctree-expand{color:grey}.wy-menu-vertical li.current>a button.toctree-expand,.wy-menu-vertical li.on a 
button.toctree-expand{display:block;line-height:18px;color:#333}.wy-menu-vertical li.toctree-l1.current>a{border-bottom:1px solid #c9c9c9;border-top:1px solid #c9c9c9}.wy-menu-vertical .toctree-l1.current .toctree-l2>ul,.wy-menu-vertical .toctree-l2.current .toctree-l3>ul,.wy-menu-vertical .toctree-l3.current .toctree-l4>ul,.wy-menu-vertical .toctree-l4.current .toctree-l5>ul,.wy-menu-vertical .toctree-l5.current .toctree-l6>ul,.wy-menu-vertical .toctree-l6.current .toctree-l7>ul,.wy-menu-vertical .toctree-l7.current .toctree-l8>ul,.wy-menu-vertical .toctree-l8.current .toctree-l9>ul,.wy-menu-vertical .toctree-l9.current .toctree-l10>ul,.wy-menu-vertical .toctree-l10.current .toctree-l11>ul{display:none}.wy-menu-vertical .toctree-l1.current .current.toctree-l2>ul,.wy-menu-vertical .toctree-l2.current .current.toctree-l3>ul,.wy-menu-vertical .toctree-l3.current .current.toctree-l4>ul,.wy-menu-vertical .toctree-l4.current .current.toctree-l5>ul,.wy-menu-vertical .toctree-l5.current .current.toctree-l6>ul,.wy-menu-vertical .toctree-l6.current .current.toctree-l7>ul,.wy-menu-vertical .toctree-l7.current .current.toctree-l8>ul,.wy-menu-vertical .toctree-l8.current .current.toctree-l9>ul,.wy-menu-vertical .toctree-l9.current .current.toctree-l10>ul,.wy-menu-vertical .toctree-l10.current .current.toctree-l11>ul{display:block}.wy-menu-vertical li.toctree-l3,.wy-menu-vertical li.toctree-l4{font-size:.9em}.wy-menu-vertical li.toctree-l2 a,.wy-menu-vertical li.toctree-l3 a,.wy-menu-vertical li.toctree-l4 a,.wy-menu-vertical li.toctree-l5 a,.wy-menu-vertical li.toctree-l6 a,.wy-menu-vertical li.toctree-l7 a,.wy-menu-vertical li.toctree-l8 a,.wy-menu-vertical li.toctree-l9 a,.wy-menu-vertical li.toctree-l10 a{color:#404040}.wy-menu-vertical li.toctree-l2 a:hover button.toctree-expand,.wy-menu-vertical li.toctree-l3 a:hover button.toctree-expand,.wy-menu-vertical li.toctree-l4 a:hover button.toctree-expand,.wy-menu-vertical li.toctree-l5 a:hover 
button.toctree-expand,.wy-menu-vertical li.toctree-l6 a:hover button.toctree-expand,.wy-menu-vertical li.toctree-l7 a:hover button.toctree-expand,.wy-menu-vertical li.toctree-l8 a:hover button.toctree-expand,.wy-menu-vertical li.toctree-l9 a:hover button.toctree-expand,.wy-menu-vertical li.toctree-l10 a:hover button.toctree-expand{color:grey}.wy-menu-vertical li.toctree-l2.current li.toctree-l3>a,.wy-menu-vertical li.toctree-l3.current li.toctree-l4>a,.wy-menu-vertical li.toctree-l4.current li.toctree-l5>a,.wy-menu-vertical li.toctree-l5.current li.toctree-l6>a,.wy-menu-vertical li.toctree-l6.current li.toctree-l7>a,.wy-menu-vertical li.toctree-l7.current li.toctree-l8>a,.wy-menu-vertical li.toctree-l8.current li.toctree-l9>a,.wy-menu-vertical li.toctree-l9.current li.toctree-l10>a,.wy-menu-vertical li.toctree-l10.current li.toctree-l11>a{display:block}.wy-menu-vertical li.toctree-l2.current>a{padding:.4045em 2.427em}.wy-menu-vertical li.toctree-l2.current li.toctree-l3>a{padding:.4045em 1.618em .4045em 4.045em}.wy-menu-vertical li.toctree-l3.current>a{padding:.4045em 4.045em}.wy-menu-vertical li.toctree-l3.current li.toctree-l4>a{padding:.4045em 1.618em .4045em 5.663em}.wy-menu-vertical li.toctree-l4.current>a{padding:.4045em 5.663em}.wy-menu-vertical li.toctree-l4.current li.toctree-l5>a{padding:.4045em 1.618em .4045em 7.281em}.wy-menu-vertical li.toctree-l5.current>a{padding:.4045em 7.281em}.wy-menu-vertical li.toctree-l5.current li.toctree-l6>a{padding:.4045em 1.618em .4045em 8.899em}.wy-menu-vertical li.toctree-l6.current>a{padding:.4045em 8.899em}.wy-menu-vertical li.toctree-l6.current li.toctree-l7>a{padding:.4045em 1.618em .4045em 10.517em}.wy-menu-vertical li.toctree-l7.current>a{padding:.4045em 10.517em}.wy-menu-vertical li.toctree-l7.current li.toctree-l8>a{padding:.4045em 1.618em .4045em 12.135em}.wy-menu-vertical li.toctree-l8.current>a{padding:.4045em 12.135em}.wy-menu-vertical li.toctree-l8.current li.toctree-l9>a{padding:.4045em 1.618em .4045em 
13.753em}.wy-menu-vertical li.toctree-l9.current>a{padding:.4045em 13.753em}.wy-menu-vertical li.toctree-l9.current li.toctree-l10>a{padding:.4045em 1.618em .4045em 15.371em}.wy-menu-vertical li.toctree-l10.current>a{padding:.4045em 15.371em}.wy-menu-vertical li.toctree-l10.current li.toctree-l11>a{padding:.4045em 1.618em .4045em 16.989em}.wy-menu-vertical li.toctree-l2.current>a,.wy-menu-vertical li.toctree-l2.current li.toctree-l3>a{background:#c9c9c9}.wy-menu-vertical li.toctree-l2 button.toctree-expand{color:#a3a3a3}.wy-menu-vertical li.toctree-l3.current>a,.wy-menu-vertical li.toctree-l3.current li.toctree-l4>a{background:#bdbdbd}.wy-menu-vertical li.toctree-l3 button.toctree-expand{color:#969696}.wy-menu-vertical li.current ul{display:block}.wy-menu-vertical li ul{margin-bottom:0;display:none}.wy-menu-vertical li ul li a{margin-bottom:0;color:#d9d9d9;font-weight:400}.wy-menu-vertical a{line-height:18px;padding:.4045em 1.618em;display:block;position:relative;font-size:90%;color:#d9d9d9}.wy-menu-vertical a:hover{background-color:#4e4a4a;cursor:pointer}.wy-menu-vertical a:hover button.toctree-expand{color:#d9d9d9}.wy-menu-vertical a:active{background-color:#2980b9;cursor:pointer;color:#fff}.wy-menu-vertical a:active button.toctree-expand{color:#fff}.wy-side-nav-search{display:block;width:300px;padding:.809em;margin-bottom:.809em;z-index:200;background-color:#2980b9;text-align:center;color:#fcfcfc}.wy-side-nav-search input[type=text]{width:100%;border-radius:50px;padding:6px 12px;border-color:#2472a4}.wy-side-nav-search img{display:block;margin:auto auto .809em;height:45px;width:45px;background-color:#2980b9;padding:5px;border-radius:100%}.wy-side-nav-search .wy-dropdown>a,.wy-side-nav-search>a{color:#fcfcfc;font-size:100%;font-weight:700;display:inline-block;padding:4px 6px;margin-bottom:.809em;max-width:100%}.wy-side-nav-search .wy-dropdown>a:hover,.wy-side-nav-search>a:hover{background:hsla(0,0%,100%,.1)}.wy-side-nav-search .wy-dropdown>a 
img.logo,.wy-side-nav-search>a img.logo{display:block;margin:0 auto;height:auto;width:auto;border-radius:0;max-width:100%;background:transparent}.wy-side-nav-search .wy-dropdown>a.icon img.logo,.wy-side-nav-search>a.icon img.logo{margin-top:.85em}.wy-side-nav-search>div.version{margin-top:-.4045em;margin-bottom:.809em;font-weight:400;color:hsla(0,0%,100%,.3)}.wy-nav .wy-menu-vertical header{color:#2980b9}.wy-nav .wy-menu-vertical a{color:#b3b3b3}.wy-nav .wy-menu-vertical a:hover{background-color:#2980b9;color:#fff}/* Menu wrappers are transparent by default (opacity:0); the .move-center modifier sets opacity:1. */[data-menu-wrap]{-webkit-transition:all .2s ease-in;-moz-transition:all .2s ease-in;transition:all .2s ease-in;position:absolute;width:100%;opacity:0}[data-menu-wrap].move-center{left:0;right:auto;opacity:1}[data-menu-wrap].move-left{right:auto;left:-100%;opacity:0}[data-menu-wrap].move-right{right:-100%;left:auto;opacity:0}.wy-body-for-nav{background:#fcfcfc}.wy-grid-for-nav{position:absolute;width:100%;height:100%}.wy-nav-side{position:fixed;top:0;bottom:0;left:0;padding-bottom:2em;width:300px;overflow-x:hidden;overflow-y:hidden;min-height:100%;color:#9b9b9b;background:#343131;z-index:200}.wy-side-scroll{width:320px;position:relative;overflow-x:hidden;overflow-y:scroll;height:100%}.wy-nav-top{display:none;background:#2980b9;color:#fff;padding:.4045em .809em;position:relative;line-height:50px;text-align:center;font-size:100%;*zoom:1}.wy-nav-top:after,.wy-nav-top:before{display:table;content:""}.wy-nav-top:after{clear:both}.wy-nav-top a{color:#fff;font-weight:700}.wy-nav-top img{margin-right:12px;height:45px;width:45px;background-color:#2980b9;padding:5px;border-radius:100%}.wy-nav-top i{font-size:30px;float:left;cursor:pointer;padding-top:inherit}.wy-nav-content-wrap{margin-left:300px;background:#fcfcfc;min-height:100%}.wy-nav-content{padding:1.618em 
3.236em;height:100%;max-width:800px;margin:auto}.wy-body-mask{position:fixed;width:100%;height:100%;background:rgba(0,0,0,.2);display:none;z-index:499}.wy-body-mask.on{display:block}footer{color:grey}footer p{margin-bottom:12px}.rst-content footer span.commit tt,footer span.commit .rst-content tt,footer span.commit code{padding:0;font-family:SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace;font-size:1em;background:none;border:none;color:grey}.rst-footer-buttons{*zoom:1}.rst-footer-buttons:after,.rst-footer-buttons:before{width:100%;display:table;content:""}.rst-footer-buttons:after{clear:both}.rst-breadcrumbs-buttons{margin-top:12px;*zoom:1}.rst-breadcrumbs-buttons:after,.rst-breadcrumbs-buttons:before{display:table;content:""}.rst-breadcrumbs-buttons:after{clear:both}#search-results .search li{margin-bottom:24px;border-bottom:1px solid #e1e4e5;padding-bottom:24px}#search-results .search li:first-child{border-top:1px solid #e1e4e5;padding-top:24px}#search-results .search li a{font-size:120%;margin-bottom:12px;display:inline-block}#search-results .context{color:grey;font-size:90%}.genindextable li>ul{margin-left:24px}@media screen and (max-width:768px){.wy-body-for-nav{background:#fcfcfc}.wy-nav-top{display:block}.wy-nav-side{left:-300px}.wy-nav-side.shift{width:85%;left:0}.wy-menu.wy-menu-vertical,.wy-side-nav-search,.wy-side-scroll{width:auto}.wy-nav-content-wrap{margin-left:0}.wy-nav-content-wrap .wy-nav-content{padding:1.618em}.wy-nav-content-wrap.shift{position:fixed;min-width:100%;left:85%;top:0;height:100%;overflow:hidden}}@media screen and (min-width:1100px){.wy-nav-content-wrap{background:rgba(0,0,0,.05)}.wy-nav-content{margin:0;background:#fcfcfc}}@media print{.rst-versions,.wy-nav-side,footer{display:none}.wy-nav-content-wrap{margin-left:0}}.rst-versions{position:fixed;bottom:0;left:0;width:300px;color:#fcfcfc;background:#1f1d1d;font-family:Lato,proxima-nova,Helvetica Neue,Arial,sans-serif;z-index:400}.rst-versions 
a{color:#2980b9;text-decoration:none}.rst-versions .rst-badge-small{display:none}.rst-versions .rst-current-version{padding:12px;background-color:#272525;display:block;text-align:right;font-size:90%;cursor:pointer;color:#27ae60;*zoom:1}.rst-versions .rst-current-version:after,.rst-versions .rst-current-version:before{display:table;content:""}.rst-versions .rst-current-version:after{clear:both}.rst-content .code-block-caption .rst-versions .rst-current-version .headerlink,.rst-content .eqno .rst-versions .rst-current-version .headerlink,.rst-content .rst-versions .rst-current-version .admonition-title,.rst-content code.download .rst-versions .rst-current-version span:first-child,.rst-content dl dt .rst-versions .rst-current-version .headerlink,.rst-content h1 .rst-versions .rst-current-version .headerlink,.rst-content h2 .rst-versions .rst-current-version .headerlink,.rst-content h3 .rst-versions .rst-current-version .headerlink,.rst-content h4 .rst-versions .rst-current-version .headerlink,.rst-content h5 .rst-versions .rst-current-version .headerlink,.rst-content h6 .rst-versions .rst-current-version .headerlink,.rst-content p .rst-versions .rst-current-version .headerlink,.rst-content table>caption .rst-versions .rst-current-version .headerlink,.rst-content tt.download .rst-versions .rst-current-version span:first-child,.rst-versions .rst-current-version .fa,.rst-versions .rst-current-version .icon,.rst-versions .rst-current-version .rst-content .admonition-title,.rst-versions .rst-current-version .rst-content .code-block-caption .headerlink,.rst-versions .rst-current-version .rst-content .eqno .headerlink,.rst-versions .rst-current-version .rst-content code.download span:first-child,.rst-versions .rst-current-version .rst-content dl dt .headerlink,.rst-versions .rst-current-version .rst-content h1 .headerlink,.rst-versions .rst-current-version .rst-content h2 .headerlink,.rst-versions .rst-current-version .rst-content h3 .headerlink,.rst-versions 
.rst-current-version .rst-content h4 .headerlink,.rst-versions .rst-current-version .rst-content h5 .headerlink,.rst-versions .rst-current-version .rst-content h6 .headerlink,.rst-versions .rst-current-version .rst-content p .headerlink,.rst-versions .rst-current-version .rst-content table>caption .headerlink,.rst-versions .rst-current-version .rst-content tt.download span:first-child,.rst-versions .rst-current-version .wy-menu-vertical li button.toctree-expand,.wy-menu-vertical li .rst-versions .rst-current-version button.toctree-expand{color:#fcfcfc}.rst-versions .rst-current-version .fa-book,.rst-versions .rst-current-version .icon-book{float:left}.rst-versions .rst-current-version.rst-out-of-date{background-color:#e74c3c;color:#fff}.rst-versions .rst-current-version.rst-active-old-version{background-color:#f1c40f;color:#000}.rst-versions.shift-up{height:auto;max-height:100%;overflow-y:scroll}.rst-versions.shift-up .rst-other-versions{display:block}.rst-versions .rst-other-versions{font-size:90%;padding:12px;color:grey;display:none}.rst-versions .rst-other-versions hr{display:block;height:1px;border:0;margin:20px 0;padding:0;border-top:1px solid #413d3d}.rst-versions .rst-other-versions dd{display:inline-block;margin:0}.rst-versions .rst-other-versions dd a{display:inline-block;padding:6px;color:#fcfcfc}.rst-versions.rst-badge{width:auto;bottom:20px;right:20px;left:auto;border:none;max-width:300px;max-height:90%}.rst-versions.rst-badge .fa-book,.rst-versions.rst-badge .icon-book{float:none;line-height:30px}.rst-versions.rst-badge.shift-up .rst-current-version{text-align:right}.rst-versions.rst-badge.shift-up .rst-current-version .fa-book,.rst-versions.rst-badge.shift-up .rst-current-version .icon-book{float:left}.rst-versions.rst-badge>.rst-current-version{width:auto;height:30px;line-height:30px;padding:0 6px;display:block;text-align:center}@media screen and (max-width:768px){.rst-versions{width:85%;display:none}.rst-versions.shift{display:block}}.rst-content 
.toctree-wrapper>p.caption,.rst-content h1,.rst-content h2,.rst-content h3,.rst-content h4,.rst-content h5,.rst-content h6{margin-bottom:24px}.rst-content img{max-width:100%;height:auto}.rst-content div.figure,.rst-content figure{margin-bottom:24px}.rst-content div.figure .caption-text,.rst-content figure .caption-text{font-style:italic}.rst-content div.figure p:last-child.caption,.rst-content figure p:last-child.caption{margin-bottom:0}.rst-content div.figure.align-center,.rst-content figure.align-center{text-align:center}.rst-content .section>a>img,.rst-content .section>img,.rst-content section>a>img,.rst-content section>img{margin-bottom:24px}.rst-content abbr[title]{text-decoration:none}.rst-content.style-external-links a.reference.external:after{font-family:FontAwesome;content:"\f08e";color:#b3b3b3;vertical-align:super;font-size:60%;margin:0 .2em}.rst-content blockquote{margin-left:24px;line-height:24px;margin-bottom:24px}.rst-content pre.literal-block{white-space:pre;margin:0;padding:12px;font-family:SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace;display:block;overflow:auto}.rst-content div[class^=highlight],.rst-content pre.literal-block{border:1px solid #e1e4e5;overflow-x:auto;margin:1px 0 24px}.rst-content div[class^=highlight] div[class^=highlight],.rst-content pre.literal-block div[class^=highlight]{padding:0;border:none;margin:0}.rst-content div[class^=highlight] td.code{width:100%}.rst-content .linenodiv pre{border-right:1px solid #e6e9ea;margin:0;padding:12px;font-family:SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace;user-select:none;pointer-events:none}.rst-content div[class^=highlight] pre{white-space:pre;margin:0;padding:12px;display:block;overflow:auto}.rst-content div[class^=highlight] pre .hll{display:block;margin:0 -12px;padding:0 12px}.rst-content .linenodiv pre,.rst-content div[class^=highlight] pre,.rst-content 
pre.literal-block{font-family:SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace;font-size:12px;line-height:1.4}.rst-content div.highlight .gp,.rst-content div.highlight span.linenos{user-select:none;pointer-events:none}.rst-content div.highlight span.linenos{display:inline-block;padding-left:0;padding-right:12px;margin-right:12px;border-right:1px solid #e6e9ea}.rst-content .code-block-caption{font-style:italic;font-size:85%;line-height:1;padding:1em 0;text-align:center}@media print{.rst-content .codeblock,.rst-content div[class^=highlight],.rst-content div[class^=highlight] pre{white-space:pre-wrap}}.rst-content .admonition,.rst-content .admonition-todo,.rst-content .attention,.rst-content .caution,.rst-content .danger,.rst-content .error,.rst-content .hint,.rst-content .important,.rst-content .note,.rst-content .seealso,.rst-content .tip,.rst-content .warning{clear:both}.rst-content .admonition-todo .last,.rst-content .admonition-todo>:last-child,.rst-content .admonition .last,.rst-content .admonition>:last-child,.rst-content .attention .last,.rst-content .attention>:last-child,.rst-content .caution .last,.rst-content .caution>:last-child,.rst-content .danger .last,.rst-content .danger>:last-child,.rst-content .error .last,.rst-content .error>:last-child,.rst-content .hint .last,.rst-content .hint>:last-child,.rst-content .important .last,.rst-content .important>:last-child,.rst-content .note .last,.rst-content .note>:last-child,.rst-content .seealso .last,.rst-content .seealso>:last-child,.rst-content .tip .last,.rst-content .tip>:last-child,.rst-content .warning .last,.rst-content .warning>:last-child{margin-bottom:0}.rst-content .admonition-title:before{margin-right:4px}.rst-content .admonition table{border-color:rgba(0,0,0,.1)}.rst-content .admonition table td,.rst-content .admonition table th{background:transparent!important;border-color:rgba(0,0,0,.1)!important}.rst-content .section ol.loweralpha,.rst-content .section 
ol.loweralpha>li,
.rst-content .toctree-wrapper ol.loweralpha,
.rst-content .toctree-wrapper ol.loweralpha>li,
.rst-content section ol.loweralpha,
.rst-content section ol.loweralpha>li {
  list-style: lower-alpha;
}

/* ---- Ordered / unordered lists ----------------------------------------- */

.rst-content .section ol.upperalpha,
.rst-content .section ol.upperalpha>li,
.rst-content .toctree-wrapper ol.upperalpha,
.rst-content .toctree-wrapper ol.upperalpha>li,
.rst-content section ol.upperalpha,
.rst-content section ol.upperalpha>li {
  list-style: upper-alpha;
}

/* Every direct child of a list item gets 12px of vertical breathing room. */
.rst-content .section ol li>*,
.rst-content .section ul li>*,
.rst-content .toctree-wrapper ol li>*,
.rst-content .toctree-wrapper ul li>*,
.rst-content section ol li>*,
.rst-content section ul li>* {
  margin-top: 12px;
  margin-bottom: 12px;
}

/* The first child of a list item has its top margin removed. */
.rst-content .section ol li>:first-child,
.rst-content .section ul li>:first-child,
.rst-content .toctree-wrapper ol li>:first-child,
.rst-content .toctree-wrapper ul li>:first-child,
.rst-content section ol li>:first-child,
.rst-content section ul li>:first-child {
  margin-top: 0;
}

.rst-content .section ol li>p,
.rst-content .section ol li>p:last-child,
.rst-content .section ul li>p,
.rst-content .section ul li>p:last-child,
.rst-content .toctree-wrapper ol li>p,
.rst-content .toctree-wrapper ol li>p:last-child,
.rst-content .toctree-wrapper ul li>p,
.rst-content .toctree-wrapper ul li>p:last-child,
.rst-content section ol li>p,
.rst-content section ol li>p:last-child,
.rst-content section ul li>p,
.rst-content section ul li>p:last-child {
  margin-bottom: 12px;
}

/* A paragraph that is the only child of its list item has no bottom margin. */
.rst-content .section ol li>p:only-child,
.rst-content .section ol li>p:only-child:last-child,
.rst-content .section ul li>p:only-child,
.rst-content .section ul li>p:only-child:last-child,
.rst-content .toctree-wrapper ol li>p:only-child,
.rst-content .toctree-wrapper ol li>p:only-child:last-child,
.rst-content .toctree-wrapper ul li>p:only-child,
.rst-content .toctree-wrapper ul li>p:only-child:last-child,
.rst-content section ol li>p:only-child,
.rst-content section ol li>p:only-child:last-child,
.rst-content section ul li>p:only-child,
.rst-content section ul li>p:only-child:last-child {
  margin-bottom: 0;
}

/* Lists nested directly inside a list item. */
.rst-content .section ol li>ol,
.rst-content .section ol li>ul,
.rst-content .section ul li>ol,
.rst-content .section ul li>ul,
.rst-content .toctree-wrapper ol li>ol,
.rst-content .toctree-wrapper ol li>ul,
.rst-content .toctree-wrapper ul li>ol,
.rst-content .toctree-wrapper ul li>ul,
.rst-content section ol li>ol,
.rst-content section ol li>ul,
.rst-content section ul li>ol,
.rst-content section ul li>ul {
  margin-bottom: 12px;
}

/* Lists with the "simple" class: no vertical margins on item children or on nested lists. */
.rst-content .section ol.simple li>*,
.rst-content .section ol.simple li ol,
.rst-content .section ol.simple li ul,
.rst-content .section ul.simple li>*,
.rst-content .section ul.simple li ol,
.rst-content .section ul.simple li ul,
.rst-content .toctree-wrapper ol.simple li>*,
.rst-content .toctree-wrapper ol.simple li ol,
.rst-content .toctree-wrapper ol.simple li ul,
.rst-content .toctree-wrapper ul.simple li>*,
.rst-content .toctree-wrapper ul.simple li ol,
.rst-content .toctree-wrapper ul.simple li ul,
.rst-content section ol.simple li>*,
.rst-content section ol.simple li ol,
.rst-content section ol.simple li ul,
.rst-content section ul.simple li>*,
.rst-content section ul.simple li ol,
.rst-content section ul.simple li ul {
  margin-top: 0;
  margin-bottom: 0;
}

/* ---- Line blocks, topics, alignment helpers ---------------------------- */

.rst-content .line-block {
  margin-left: 0;
  margin-bottom: 24px;
  line-height: 24px;
}

/* Nested line blocks are indented and add no extra bottom margin. */
.rst-content .line-block .line-block {
  margin-left: 24px;
  margin-bottom: 0;
}

.rst-content .topic-title {
  font-weight: 700;
  margin-bottom: 12px;
}

.rst-content .toc-backref {
  color: #404040;
}

.rst-content .align-right {
  float: right;
  margin: 0 0 24px 24px;
}

.rst-content .align-left {
  float: left;
  margin: 0 24px 24px 0;
}

.rst-content .align-center {
  margin: auto;
}

/* Tables are excluded here, so they keep their own display value. */
.rst-content .align-center:not(table) {
  display: block;
}

/* The selector list below continues on the following line of the file. */
.rst-content .code-block-caption .headerlink,
.rst-content .eqno .headerlink,
.rst-content .toctree-wrapper>p.caption .headerlink,
.rst-content dl dt .headerlink,
.rst-content h1 .headerlink,
.rst-content h2 .headerlink,
.rst-content h3 .headerlink,
.rst-content h4 .headerlink,
.rst-content h5 .headerlink,
.rst-content h6
.headerlink,
.rst-content p.caption .headerlink,
.rst-content p .headerlink,
.rst-content table>caption .headerlink {
  opacity: 0;
  font-size: 14px;
  font-family: FontAwesome;
  margin-left: .5em;
}

/* ---- Header permalinks -------------------------------------------------
   The rule above sets opacity to 0; the rule below restores it to 1 when the
   link has focus or its container is hovered. */
.rst-content .code-block-caption .headerlink:focus,
.rst-content .code-block-caption:hover .headerlink,
.rst-content .eqno .headerlink:focus,
.rst-content .eqno:hover .headerlink,
.rst-content .toctree-wrapper>p.caption .headerlink:focus,
.rst-content .toctree-wrapper>p.caption:hover .headerlink,
.rst-content dl dt .headerlink:focus,
.rst-content dl dt:hover .headerlink,
.rst-content h1 .headerlink:focus,
.rst-content h1:hover .headerlink,
.rst-content h2 .headerlink:focus,
.rst-content h2:hover .headerlink,
.rst-content h3 .headerlink:focus,
.rst-content h3:hover .headerlink,
.rst-content h4 .headerlink:focus,
.rst-content h4:hover .headerlink,
.rst-content h5 .headerlink:focus,
.rst-content h5:hover .headerlink,
.rst-content h6 .headerlink:focus,
.rst-content h6:hover .headerlink,
.rst-content p.caption .headerlink:focus,
.rst-content p.caption:hover .headerlink,
.rst-content p .headerlink:focus,
.rst-content p:hover .headerlink,
.rst-content table>caption .headerlink:focus,
.rst-content table>caption:hover .headerlink {
  opacity: 1;
}

/* Links inside paragraphs may break at any character. */
.rst-content p a {
  overflow-wrap: anywhere;
}

/* ---- Tables ------------------------------------------------------------ */

/* Paragraphs and lists inside table cells inherit the cell's font size. */
.rst-content .wy-table td p,
.rst-content .wy-table td ul,
.rst-content .wy-table th p,
.rst-content .wy-table th ul,
.rst-content table.docutils td p,
.rst-content table.docutils td ul,
.rst-content table.docutils th p,
.rst-content table.docutils th ul,
.rst-content table.field-list td p,
.rst-content table.field-list td ul,
.rst-content table.field-list th p,
.rst-content table.field-list th ul {
  font-size: inherit;
}

/* Focused buttons get a 2px solid outline. */
.rst-content .btn:focus {
  outline: 2px solid;
}

.rst-content table>caption .headerlink:after {
  font-size: 12px;
}

.rst-content .centered {
  text-align: center;
}

/* ---- Sidebar ----------------------------------------------------------- */

/* Floated box taking 40% of the content width on the right-hand side. */
.rst-content .sidebar {
  float: right;
  width: 40%;
  display: block;
  margin: 0 0 24px 24px;
  padding: 24px;
  background: #f3f6f6;
  border: 1px solid #e1e4e5;
}

/* The selector list below continues on the following line of the file. */
.rst-content .sidebar dl,
.rst-content .sidebar p,
.rst-content .sidebar
ul {
  font-size: 90%;
}

.rst-content .sidebar .last,
.rst-content .sidebar>:last-child {
  margin-bottom: 0;
}

/* The negative margins cancel the sidebar's 24px padding on the top and sides,
   so the title bar reaches the box edges. */
.rst-content .sidebar .sidebar-title {
  display: block;
  font-family: Roboto Slab,ff-tisa-web-pro,Georgia,Arial,sans-serif;
  font-weight: 700;
  background: #e1e4e5;
  padding: 6px 12px;
  margin: -24px -24px 24px;
  font-size: 100%;
}

/* The box-shadow extends the highlight colour 2px beyond the text box. */
.rst-content .highlighted {
  background: #f1c40f;
  box-shadow: 0 0 0 2px #f1c40f;
  display: inline;
  font-weight: 700;
}

/* ---- Footnote / citation references in running text -------------------- */

/* Raised via relative positioning; line-height 0 stops the marker from
   adding to the height of its line. */
.rst-content .citation-reference,
.rst-content .footnote-reference {
  vertical-align: baseline;
  position: relative;
  top: -.4em;
  line-height: 0;
  font-size: 90%;
}

.rst-content .citation-reference>span.fn-bracket,
.rst-content .footnote-reference>span.fn-bracket {
  display: none;
}

.rst-content .hlist {
  width: 100%;
}

/* The " : " separator is generated here; the delimiter element is hidden. */
.rst-content dl dt span.classifier:before {
  content: " : ";
}

.rst-content dl dt span.classifier-delimiter {
  display: none !important;
}

/* ---- html.writer-html4: table-based footnotes and citations ------------- */

.rst-content-placeholder-never-used-0,
html.writer-html4 .rst-content table.docutils.citation,
html.writer-html4 .rst-content table.docutils.footnote {
  background: none;
  border: none;
}

html.writer-html4 .rst-content table.docutils.citation td,
html.writer-html4 .rst-content table.docutils.citation tr,
html.writer-html4 .rst-content table.docutils.footnote td,
html.writer-html4 .rst-content table.docutils.footnote tr {
  border: none;
  background-color: transparent !important;
  white-space: normal;
}

html.writer-html4 .rst-content table.docutils.citation td.label,
html.writer-html4 .rst-content table.docutils.footnote td.label {
  padding-left: 0;
  padding-right: 0;
  vertical-align: top;
}

/* ---- html.writer-html5: grid-based footnotes, citations, field lists ---- */

/* dl variants: a two-column grid. */
html.writer-html5 .rst-content dl.citation,
html.writer-html5 .rst-content dl.field-list,
html.writer-html5 .rst-content dl.footnote {
  display: grid;
  grid-template-columns: auto minmax(80%,95%);
}

html.writer-html5 .rst-content dl.citation>dt,
html.writer-html5 .rst-content dl.field-list>dt,
html.writer-html5 .rst-content dl.footnote>dt {
  display: inline-grid;
  grid-template-columns: max-content auto;
}

/* aside/div variants: a four-column grid. */
html.writer-html5 .rst-content aside.citation,
html.writer-html5 .rst-content aside.footnote,
html.writer-html5 .rst-content div.citation {
  display: grid;
  grid-template-columns: auto auto minmax(.65rem,auto) minmax(40%,95%);
}

/* Label sits in column 1. */
html.writer-html5 .rst-content aside.citation>span.label,
html.writer-html5 .rst-content aside.footnote>span.label,
html.writer-html5 .rst-content div.citation>span.label {
  grid-column-start: 1;
  grid-column-end: 2;
}

/* Back-references sit in column 2 and span two rows. */
html.writer-html5 .rst-content aside.citation>span.backrefs,
html.writer-html5 .rst-content aside.footnote>span.backrefs,
html.writer-html5 .rst-content div.citation>span.backrefs {
  grid-column-start: 2;
  grid-column-end: 3;
  grid-row-start: 1;
  grid-row-end: 3;
}

/* Body paragraphs sit in column 4. */
html.writer-html5 .rst-content aside.citation>p,
html.writer-html5 .rst-content aside.footnote>p,
html.writer-html5 .rst-content div.citation>p {
  grid-column-start: 4;
  grid-column-end: 5;
}

html.writer-html5 .rst-content dl.citation,
html.writer-html5 .rst-content dl.field-list,
html.writer-html5 .rst-content dl.footnote {
  margin-bottom: 24px;
}

html.writer-html5 .rst-content dl.citation>dt,
html.writer-html5 .rst-content dl.field-list>dt,
html.writer-html5 .rst-content dl.footnote>dt {
  padding-left: 1rem;
}

html.writer-html5 .rst-content dl.citation>dd,
html.writer-html5 .rst-content dl.citation>dt,
html.writer-html5 .rst-content dl.field-list>dd,
html.writer-html5 .rst-content dl.field-list>dt,
html.writer-html5 .rst-content dl.footnote>dd,
html.writer-html5 .rst-content dl.footnote>dt {
  margin-bottom: 0;
}

html.writer-html5 .rst-content dl.citation,
html.writer-html5 .rst-content dl.footnote {
  font-size: .9rem;
}

/* This margin shorthand comes later in the file than the margin-bottom: 0
   rule above and has equal specificity. */
html.writer-html5 .rst-content dl.citation>dt,
html.writer-html5 .rst-content dl.footnote>dt {
  margin: 0 .5rem .5rem 0;
  line-height: 1.2rem;
  word-break: break-all;
  font-weight: 400;
}

/* Square brackets around the label are generated content. */
html.writer-html5 .rst-content dl.citation>dt>span.brackets:before,
html.writer-html5 .rst-content dl.footnote>dt>span.brackets:before {
  content: "[";
}

html.writer-html5 .rst-content dl.citation>dt>span.brackets:after,
html.writer-html5 .rst-content dl.footnote>dt>span.brackets:after {
  content: "]";
}

html.writer-html5 .rst-content dl.citation>dt>span.fn-backref,
html.writer-html5 .rst-content dl.footnote>dt>span.fn-backref {
  text-align: left;
  font-style: italic;
  margin-left: .65rem;
  word-break: break-word;
  word-spacing: -.1rem;
  max-width: 5rem;
}

html.writer-html5 .rst-content dl.citation>dt>span.fn-backref>a,
html.writer-html5 .rst-content dl.footnote>dt>span.fn-backref>a {
  word-break: keep-all;
}

/* A generated space precedes every back-reference link except the first. */
html.writer-html5 .rst-content dl.citation>dt>span.fn-backref>a:not(:first-child):before,
html.writer-html5 .rst-content dl.footnote>dt>span.fn-backref>a:not(:first-child):before {
  content: " ";
}

html.writer-html5 .rst-content dl.citation>dd,
html.writer-html5 .rst-content dl.footnote>dd {
  margin: 0 0 .5rem;
  line-height: 1.2rem;
}

html.writer-html5 .rst-content dl.citation>dd p,
html.writer-html5 .rst-content dl.footnote>dd p {
  font-size: .9rem;
}

html.writer-html5 .rst-content aside.citation,
html.writer-html5 .rst-content aside.footnote,
html.writer-html5 .rst-content div.citation {
  padding-left: 1rem;
  padding-right: 1rem;
  font-size: .9rem;
  line-height: 1.2rem;
}

html.writer-html5 .rst-content aside.citation p,
html.writer-html5 .rst-content aside.footnote p,
html.writer-html5 .rst-content div.citation p {
  font-size: .9rem;
  line-height: 1.2rem;
  margin-bottom: 12px;
}

/* Same declarations as the dl variant's span.fn-backref rule. */
html.writer-html5 .rst-content aside.citation span.backrefs,
html.writer-html5 .rst-content aside.footnote span.backrefs,
html.writer-html5 .rst-content div.citation span.backrefs {
  text-align: left;
  font-style: italic;
  margin-left: .65rem;
  word-break: break-word;
  word-spacing: -.1rem;
  max-width: 5rem;
}

html.writer-html5 .rst-content aside.citation span.backrefs>a,
html.writer-html5 .rst-content aside.footnote span.backrefs>a,
html.writer-html5 .rst-content div.citation span.backrefs>a {
  word-break: keep-all;
}

html.writer-html5 .rst-content aside.citation span.backrefs>a:not(:first-child):before,
html.writer-html5 .rst-content aside.footnote span.backrefs>a:not(:first-child):before,
html.writer-html5 .rst-content div.citation span.backrefs>a:not(:first-child):before {
  content: " ";
}

/* The selector list below continues on the following line of the file. */
html.writer-html5 .rst-content aside.citation span.label,
html.writer-html5 .rst-content aside.footnote
span.label,html.writer-html5 .rst-content div.citation span.label{line-height:1.2rem}html.writer-html5 .rst-content aside.citation-list,html.writer-html5 .rst-content aside.footnote-list,html.writer-html5 .rst-content div.citation-list{margin-bottom:24px}html.writer-html5 .rst-content dl.option-list kbd{font-size:.9rem}.rst-content table.docutils.footnote,html.writer-html4 .rst-content table.docutils.citation,html.writer-html5 .rst-content aside.footnote,html.writer-html5 .rst-content aside.footnote-list aside.footnote,html.writer-html5 .rst-content div.citation-list>div.citation,html.writer-html5 .rst-content dl.citation,html.writer-html5 .rst-content dl.footnote{color:grey}.rst-content table.docutils.footnote code,.rst-content table.docutils.footnote tt,html.writer-html4 .rst-content table.docutils.citation code,html.writer-html4 .rst-content table.docutils.citation tt,html.writer-html5 .rst-content aside.footnote-list aside.footnote code,html.writer-html5 .rst-content aside.footnote-list aside.footnote tt,html.writer-html5 .rst-content aside.footnote code,html.writer-html5 .rst-content aside.footnote tt,html.writer-html5 .rst-content div.citation-list>div.citation code,html.writer-html5 .rst-content div.citation-list>div.citation tt,html.writer-html5 .rst-content dl.citation code,html.writer-html5 .rst-content dl.citation tt,html.writer-html5 .rst-content dl.footnote code,html.writer-html5 .rst-content dl.footnote tt{color:#555}.rst-content .wy-table-responsive.citation,.rst-content .wy-table-responsive.footnote{margin-bottom:0}.rst-content .wy-table-responsive.citation+:not(.citation),.rst-content .wy-table-responsive.footnote+:not(.footnote){margin-top:24px}.rst-content .wy-table-responsive.citation:last-child,.rst-content .wy-table-responsive.footnote:last-child{margin-bottom:24px}.rst-content table.docutils th{border-color:#e1e4e5}html.writer-html5 .rst-content table.docutils th{border:1px solid #e1e4e5}html.writer-html5 .rst-content table.docutils 
td>p,html.writer-html5 .rst-content table.docutils th>p{line-height:1rem;margin-bottom:0;font-size:.9rem}.rst-content table.docutils td .last,.rst-content table.docutils td .last>:last-child{margin-bottom:0}.rst-content table.field-list,.rst-content table.field-list td{border:none}.rst-content table.field-list td p{line-height:inherit}.rst-content table.field-list td>strong{display:inline-block}.rst-content table.field-list .field-name{padding-right:10px;text-align:left;white-space:nowrap}.rst-content table.field-list .field-body{text-align:left}.rst-content code,.rst-content tt{color:#000;font-family:SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace;padding:2px 5px}.rst-content code big,.rst-content code em,.rst-content tt big,.rst-content tt em{font-size:100%!important;line-height:normal}.rst-content code.literal,.rst-content tt.literal{color:#e74c3c;white-space:normal}.rst-content code.xref,.rst-content tt.xref,a .rst-content code,a .rst-content tt{font-weight:700;color:#404040;overflow-wrap:normal}.rst-content kbd,.rst-content pre,.rst-content samp{font-family:SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace}.rst-content a code,.rst-content a tt{color:#2980b9}.rst-content dl{margin-bottom:24px}.rst-content dl dt{font-weight:700;margin-bottom:12px}.rst-content dl ol,.rst-content dl p,.rst-content dl table,.rst-content dl ul{margin-bottom:12px}.rst-content dl dd{margin:0 0 12px 24px;line-height:24px}.rst-content dl dd>ol:last-child,.rst-content dl dd>p:last-child,.rst-content dl dd>table:last-child,.rst-content dl dd>ul:last-child{margin-bottom:0}html.writer-html4 .rst-content dl:not(.docutils),html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple){margin-bottom:24px}html.writer-html4 .rst-content dl:not(.docutils)>dt,html.writer-html5 .rst-content 
dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple)>dt{display:table;margin:6px 0;font-size:90%;line-height:normal;background:#e7f2fa;color:#2980b9;border-top:3px solid #6ab0de;padding:6px;position:relative}html.writer-html4 .rst-content dl:not(.docutils)>dt:before,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple)>dt:before{color:#6ab0de}html.writer-html4 .rst-content dl:not(.docutils)>dt .headerlink,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple)>dt .headerlink{color:#404040;font-size:100%!important}html.writer-html4 .rst-content dl:not(.docutils) dl:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple)>dt,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) dl:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple)>dt{margin-bottom:6px;border:none;border-left:3px solid #ccc;background:#f0f0f0;color:#555}html.writer-html4 .rst-content dl:not(.docutils) dl:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple)>dt .headerlink,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) dl:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple)>dt .headerlink{color:#404040;font-size:100%!important}html.writer-html4 .rst-content dl:not(.docutils)>dt:first-child,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple)>dt:first-child{margin-top:0}html.writer-html4 .rst-content dl:not(.docutils) code.descclassname,html.writer-html4 .rst-content dl:not(.docutils) 
code.descname,html.writer-html4 .rst-content dl:not(.docutils) tt.descclassname,html.writer-html4 .rst-content dl:not(.docutils) tt.descname,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) code.descclassname,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) code.descname,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) tt.descclassname,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) tt.descname{background-color:transparent;border:none;padding:0;font-size:100%!important}html.writer-html4 .rst-content dl:not(.docutils) code.descname,html.writer-html4 .rst-content dl:not(.docutils) tt.descname,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) code.descname,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) tt.descname{font-weight:700}html.writer-html4 .rst-content dl:not(.docutils) .optional,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) .optional{display:inline-block;padding:0 4px;color:#000;font-weight:700}html.writer-html4 .rst-content dl:not(.docutils) .property,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) .property{display:inline-block;padding-right:8px;max-width:100%}html.writer-html4 .rst-content dl:not(.docutils) .k,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) .k{font-style:italic}html.writer-html4 
.rst-content dl:not(.docutils) .descclassname,html.writer-html4 .rst-content dl:not(.docutils) .descname,html.writer-html4 .rst-content dl:not(.docutils) .sig-name,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) .descclassname,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) .descname,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) .sig-name{font-family:SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace;color:#000}.rst-content .viewcode-back,.rst-content .viewcode-link{display:inline-block;color:#27ae60;font-size:80%;padding-left:24px}.rst-content .viewcode-back{display:block;float:right}.rst-content p.rubric{margin-bottom:12px;font-weight:700}.rst-content code.download,.rst-content tt.download{background:inherit;padding:inherit;font-weight:400;font-family:inherit;font-size:inherit;color:inherit;border:inherit;white-space:inherit}.rst-content code.download span:first-child,.rst-content tt.download span:first-child{-webkit-font-smoothing:subpixel-antialiased}.rst-content code.download span:first-child:before,.rst-content tt.download span:first-child:before{margin-right:4px}.rst-content .guilabel{border:1px solid #7fbbe3;background:#e7f2fa;font-size:80%;font-weight:700;border-radius:4px;padding:2.4px 6px;margin:auto 2px}.rst-content :not(dl.option-list)>:not(dt):not(kbd):not(.kbd)>.kbd,.rst-content :not(dl.option-list)>:not(dt):not(kbd):not(.kbd)>kbd{color:inherit;font-size:80%;background-color:#fff;border:1px solid #a6a6a6;border-radius:4px;box-shadow:0 2px grey;padding:2.4px 6px;margin:auto 0}.rst-content .versionmodified{font-style:italic}@media screen and (max-width:480px){.rst-content 
.sidebar{width:100%}}span[id*=MathJax-Span]{color:#404040}.math{text-align:center}@font-face{font-family:Lato;src:url(fonts/lato-normal.woff2?bd03a2cc277bbbc338d464e679fe9942) format("woff2"),url(fonts/lato-normal.woff?27bd77b9162d388cb8d4c4217c7c5e2a) format("woff");font-weight:400;font-style:normal;font-display:block}@font-face{font-family:Lato;src:url(fonts/lato-bold.woff2?cccb897485813c7c256901dbca54ecf2) format("woff2"),url(fonts/lato-bold.woff?d878b6c29b10beca227e9eef4246111b) format("woff");font-weight:700;font-style:normal;font-display:block}@font-face{font-family:Lato;src:url(fonts/lato-bold-italic.woff2?0b6bb6725576b072c5d0b02ecdd1900d) format("woff2"),url(fonts/lato-bold-italic.woff?9c7e4e9eb485b4a121c760e61bc3707c) format("woff");font-weight:700;font-style:italic;font-display:block}@font-face{font-family:Lato;src:url(fonts/lato-normal-italic.woff2?4eb103b4d12be57cb1d040ed5e162e9d) format("woff2"),url(fonts/lato-normal-italic.woff?f28f2d6482446544ef1ea1ccc6dd5892) format("woff");font-weight:400;font-style:italic;font-display:block}@font-face{font-family:Roboto Slab;font-style:normal;font-weight:400;src:url(fonts/Roboto-Slab-Regular.woff2?7abf5b8d04d26a2cafea937019bca958) format("woff2"),url(fonts/Roboto-Slab-Regular.woff?c1be9284088d487c5e3ff0a10a92e58c) format("woff");font-display:block}@font-face{font-family:Roboto Slab;font-style:normal;font-weight:700;src:url(fonts/Roboto-Slab-Bold.woff2?9984f4a9bda09be08e83f2506954adbe) format("woff2"),url(fonts/Roboto-Slab-Bold.woff?bed5564a116b05148e3b3bea6fb1162a) format("woff");font-display:block} diff --git a/css/theme_extra.css b/css/theme_extra.css new file mode 100644 index 00000000..9f4b063c --- /dev/null +++ b/css/theme_extra.css @@ -0,0 +1,191 @@ +/* + * Wrap inline code samples otherwise they shoot of the side and + * can't be read at all. 
+ * + * https://github.com/mkdocs/mkdocs/issues/313 + * https://github.com/mkdocs/mkdocs/issues/233 + * https://github.com/mkdocs/mkdocs/issues/834 + */ +.rst-content code { + white-space: pre-wrap; + word-wrap: break-word; + padding: 2px 5px; +} + +/** + * Make code blocks display as blocks and give them the appropriate + * font size and padding. + * + * https://github.com/mkdocs/mkdocs/issues/855 + * https://github.com/mkdocs/mkdocs/issues/834 + * https://github.com/mkdocs/mkdocs/issues/233 + */ +.rst-content pre code { + white-space: pre; + word-wrap: normal; + display: block; + padding: 12px; + font-size: 12px; +} + +/** + * Fix code colors + * + * https://github.com/mkdocs/mkdocs/issues/2027 + */ +.rst-content code { + color: #E74C3C; +} + +.rst-content pre code { + color: #000; + background: #f8f8f8; +} + +/* + * Fix link colors when the link text is inline code. + * + * https://github.com/mkdocs/mkdocs/issues/718 + */ +a code { + color: #2980B9; +} +a:hover code { + color: #3091d1; +} +a:visited code { + color: #9B59B6; +} + +/* + * The CSS classes from highlight.js seem to clash with the + * ReadTheDocs theme causing some code to be incorrectly made + * bold and italic. + * + * https://github.com/mkdocs/mkdocs/issues/411 + */ +pre .cs, pre .c { + font-weight: inherit; + font-style: inherit; +} + +/* + * Fix some issues with the theme and non-highlighted code + * samples. Without and highlighting styles attached the + * formatting is broken. 
+ * + * https://github.com/mkdocs/mkdocs/issues/319 + */ +.rst-content .no-highlight { + display: block; + padding: 0.5em; + color: #333; +} + + +/* + * Additions specific to the search functionality provided by MkDocs + */ + +.search-results { + margin-top: 23px; +} + +.search-results article { + border-top: 1px solid #E1E4E5; + padding-top: 24px; +} + +.search-results article:first-child { + border-top: none; +} + +form .search-query { + width: 100%; + border-radius: 50px; + padding: 6px 12px; /* csslint allow: box-model */ + border-color: #D1D4D5; +} + +/* + * Improve inline code blocks within admonitions. + * + * https://github.com/mkdocs/mkdocs/issues/656 + */ + .rst-content .admonition code { + color: #404040; + border: 1px solid #c7c9cb; + border: 1px solid rgba(0, 0, 0, 0.2); + background: #f8fbfd; + background: rgba(255, 255, 255, 0.7); +} + +/* + * Account for wide tables which go off the side. + * Override borders to avoid weirdness on narrow tables. + * + * https://github.com/mkdocs/mkdocs/issues/834 + * https://github.com/mkdocs/mkdocs/pull/1034 + */ +.rst-content .section .docutils { + width: 100%; + overflow: auto; + display: block; + border: none; +} + +td, th { + border: 1px solid #e1e4e5 !important; /* csslint allow: important */ + border-collapse: collapse; +} + +/* + * Without the following amendments, the navigation in the theme will be + * slightly cut off. This is due to the fact that the .wy-nav-side has a + * padding-bottom of 2em, which must not necessarily align with the font-size of + * 90 % on the .rst-current-version container, combined with the padding of 12px + * above and below. These amendments fix this in two steps: First, make sure the + * .rst-current-version container has a fixed height of 40px, achieved using + * line-height, and then applying a padding-bottom of 40px to this container. In + * a second step, the items within that container are re-aligned using flexbox. 
+ * + * https://github.com/mkdocs/mkdocs/issues/2012 + */ + .wy-nav-side { + padding-bottom: 40px; +} + +/* + * The second step of above amendment: Here we make sure the items are aligned + * correctly within the .rst-current-version container. Using flexbox, we + * achieve it in such a way that it will look like the following: + * + * [No repo_name] + * Next >> // On the first page + * << Previous Next >> // On all subsequent pages + * + * [With repo_name] + * Next >> // On the first page + * << Previous Next >> // On all subsequent pages + * + * https://github.com/mkdocs/mkdocs/issues/2012 + */ +.rst-versions .rst-current-version { + padding: 0 12px; + display: flex; + font-size: initial; + justify-content: space-between; + align-items: center; + line-height: 40px; +} + +/* + * Please note that this amendment also involves removing certain inline-styles + * from the file ./mkdocs/themes/readthedocs/versions.html. + * + * https://github.com/mkdocs/mkdocs/issues/2012 + */ +.rst-current-version span { + flex: 1; + text-align: center; +} diff --git a/diagram/gen.py b/diagram/gen.py new file mode 100644 index 00000000..37f0302d --- /dev/null +++ b/diagram/gen.py @@ -0,0 +1,294 @@ +elen = -1 +vlen = -1 +row_index = 0 +col_index = 0 + +row_space = 100 +index_x = 20 +arg_x = 100 +desc_y_off = 15 +element_x = 10 +element_x_unit = 2 +col_space = -1 +box_height = 26 + + +def init(f, rows, cols): + global col_space + if vlen == 256: + col_space = 570 + else: + col_space = 300 + print( + f"""""", + file=f, + ) + + +def end(f): + global row_index, elen, vlen + row_index = 0 + elen = -1 + vlen = -1 + print( + "", + file=f, + ) + + +def add_row(f): + global row_index, col_index + # row index + row_index = row_index + 1 + col_index = 0 + print( + f'({row_index})', file=f + ) + + +def add_box(f, arg, desc, indices=None): + global col_index + x_base = arg_x + col_index * col_space + col_index += 1 + # arg name + print( + f'{arg}', + file=f, + ) + # desc name + print( + f'{desc}', + 
file=f, + ) + # add element rects + for i in range(vlen // elen): + per_element_x = element_x_unit * elen + print( + f'', + file=f, + ) + if indices is not None: + print( + f'{indices[i]}', + file=f, + ) + + # add upper/lower indicator + if row_index == 1: + print( + f'upper', + file=f, + ) + print( + f'lower', + file=f, + ) + + +def add_line(f, from_row, from_col, from_index, to_row, to_col, to_index): + x_base = arg_x + from_col * col_space + per_element_x = element_x_unit * elen + x1 = x_base + element_x + per_element_x * from_index + per_element_x // 2 + y1 = row_space * from_row - 10 + box_height + + x_base = arg_x + to_col * col_space + per_element_x = element_x_unit * elen + x2 = x_base + element_x + per_element_x * to_index + per_element_x // 2 + y2 = row_space * to_row - 10 + + print(f'', file=f) + + +def xvshuf(): + global elen, vlen + for el, name in [(64, "d"), (32, "w"), (16, "h"), (8, "b")]: + elen = el + vlen = 256 + with open(f"xvshuf_{name}.svg", "w") as f: + init(f, 4, 2) + add_row(f) + add_box( + f, + "b" if elen > 8 else "a", + "data", + indices=list(range(vlen // elen - 1, vlen // elen // 2 - 1, -1)) * 2, + ) + add_box( + f, + "c" if elen > 8 else "b", + "data", + indices=list(range(vlen // elen // 2 - 1, -1, -1)) * 2, + ) + add_row(f) + add_box(f, "hi", "merged", indices=list(range(vlen // elen - 1, -1, -1))) + add_box(f, "lo", "merged", indices=list(range(vlen // elen - 1, -1, -1))) + add_row(f) + add_box(f, "a" if elen > 8 else "c", "indices") + add_row(f) + add_box(f, "ret", "returns") + + # b to merge & c to merge + for i in range(vlen // elen // 2): + add_line(f, 1, 0, i, 2, 0, i) + add_line(f, 1, 0, i + vlen // elen // 2, 2, 1, i) + for i in range(vlen // elen // 2): + add_line(f, 1, 1, i, 2, 0, i + vlen // elen // 2) + add_line(f, 1, 1, i + vlen // elen // 2, 2, 1, i + vlen // elen // 2) + + # merge to a + for i in range(vlen // elen): + for j in range(vlen // elen // 2): + add_line(f, 2, 0, i, 3, 0, j) + add_line(f, 2, 1, i, 3, 0, j + 
vlen // elen // 2) + + # a to ret + for i in range(vlen // elen): + add_line(f, 3, 0, i, 4, 0, i) + end(f) + + +def xvshuf4i_bhw(): + global elen, vlen + for vl, prefix in [(256, "xv"), (128, "v")]: + for el, name in [(32, "w"), (16, "h"), (8, "b")]: + elen = el + vlen = vl + with open(f"{prefix}shuf4i_{name}.svg", "w") as f: + init(f, 2, 1) + add_row(f) + add_box( + f, + "a", + "data", + indices=list(range(3, -1, -1)) * (vlen // el // 4), + ) + add_row(f) + add_box(f, "ret", "returns") + + # a to returns + for i in range(vlen // elen): + for j in range(4): + add_line(f, 1, 0, i, 2, 0, i // 4 * 4 + j) + end(f) + + +def xvshuf4i_d(): + global elen, vlen + elen = 64 + vlen = 256 + with open("xvshuf4i_d.svg", "w") as f: + init(f, 2, 2) + add_row(f) + add_box( + f, + "b", + "data", + indices=[3, 2, 3, 2], + ) + add_box( + f, + "a", + "data", + indices=[1, 0, 1, 0], + ) + add_row(f) + add_box(f, "ret", "returns") + + # a & b to returns + add_line(f, 1, 0, 2, 2, 0, 3) + add_line(f, 1, 0, 3, 2, 0, 3) + add_line(f, 1, 1, 2, 2, 0, 3) + add_line(f, 1, 1, 3, 2, 0, 3) + add_line(f, 1, 0, 2, 2, 0, 2) + add_line(f, 1, 0, 3, 2, 0, 2) + add_line(f, 1, 1, 2, 2, 0, 2) + add_line(f, 1, 1, 3, 2, 0, 2) + add_line(f, 1, 0, 0, 2, 0, 1) + add_line(f, 1, 0, 1, 2, 0, 1) + add_line(f, 1, 1, 0, 2, 0, 1) + add_line(f, 1, 1, 1, 2, 0, 1) + add_line(f, 1, 0, 0, 2, 0, 0) + add_line(f, 1, 0, 1, 2, 0, 0) + add_line(f, 1, 1, 0, 2, 0, 0) + add_line(f, 1, 1, 1, 2, 0, 0) + end(f) + + +def vshuf(): + global elen, vlen + for el, name in [(64, "d"), (32, "w"), (16, "h"), (8, "b")]: + elen = el + vlen = 128 + with open(f"vshuf_{name}.svg", "w") as f: + init(f, 3, 2) + add_row(f) + add_box( + f, + "b" if elen > 8 else "a", + "data", + indices=list(range(vlen // elen * 2 - 1, vlen // elen - 1, -1)), + ) + add_box( + f, + "c" if elen > 8 else "b", + "data", + indices=list(range(vlen // elen - 1, -1, -1)), + ) + add_row(f) + add_box(f, "a" if elen > 8 else "c", "indices") + add_row(f) + add_box(f, "ret", 
"returns") + + # b to a & c to a + for i in range(vlen // elen): + for j in range(vlen // elen): + add_line(f, 1, 0, j, 2, 0, i) + add_line(f, 1, 1, j, 2, 0, i) + + # a to ret + for i in range(vlen // elen): + add_line(f, 2, 0, i, 3, 0, i) + end(f) + +def vshuf4i_d(): + global elen, vlen + elen = 64 + vlen = 128 + with open("vshuf4i_d.svg", "w") as f: + init(f, 2, 2) + add_row(f) + add_box( + f, + "b", + "data", + indices=[3, 2], + ) + add_box( + f, + "a", + "data", + indices=[1, 0], + ) + add_row(f) + add_box(f, "ret", "returns") + + # a & b to returns + add_line(f, 1, 0, 0, 2, 0, 1) + add_line(f, 1, 0, 1, 2, 0, 1) + add_line(f, 1, 1, 0, 2, 0, 1) + add_line(f, 1, 1, 1, 2, 0, 1) + add_line(f, 1, 0, 0, 2, 0, 0) + add_line(f, 1, 0, 1, 2, 0, 0) + add_line(f, 1, 1, 0, 2, 0, 0) + add_line(f, 1, 1, 1, 2, 0, 0) + end(f) + + +if __name__ == "__main__": + xvshuf() + xvshuf4i_bhw() + xvshuf4i_d() + vshuf() + vshuf4i_d() diff --git a/diagram/vshuf4i_b.svg b/diagram/vshuf4i_b.svg new file mode 100644 index 00000000..c2bc66b4 --- /dev/null +++ b/diagram/vshuf4i_b.svg @@ -0,0 +1,124 @@ + +(1) +a +data + +3 + +2 + +1 + +0 + +3 + +2 + +1 + +0 + +3 + +2 + +1 + +0 + +3 + +2 + +1 + +0 +upper +lower +(2) +ret +returns + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/diagram/vshuf4i_d.svg b/diagram/vshuf4i_d.svg new file mode 100644 index 00000000..10a24caf --- /dev/null +++ b/diagram/vshuf4i_d.svg @@ -0,0 +1,34 @@ + +(1) +b +data + +3 + +2 +upper +lower +a +data + +1 + +0 +upper +lower +(2) +ret +returns + + + + + + + + + + + diff --git a/diagram/vshuf4i_h.svg b/diagram/vshuf4i_h.svg new file mode 100644 index 00000000..3893dc4d --- /dev/null +++ b/diagram/vshuf4i_h.svg @@ -0,0 +1,68 @@ + +(1) +a +data + +3 + +2 + +1 + +0 + +3 + +2 + +1 + +0 +upper +lower +(2) +ret +returns + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff 
--git a/diagram/vshuf4i_w.svg b/diagram/vshuf4i_w.svg new file mode 100644 index 00000000..1ae56c19 --- /dev/null +++ b/diagram/vshuf4i_w.svg @@ -0,0 +1,40 @@ + +(1) +a +data + +3 + +2 + +1 + +0 +upper +lower +(2) +ret +returns + + + + + + + + + + + + + + + + + + + + + diff --git a/diagram/vshuf_b.svg b/diagram/vshuf_b.svg new file mode 100644 index 00000000..8d80113f --- /dev/null +++ b/diagram/vshuf_b.svg @@ -0,0 +1,643 @@ + +(1) +a +data + +31 + +30 + +29 + +28 + +27 + +26 + +25 + +24 + +23 + +22 + +21 + +20 + +19 + +18 + +17 + +16 +upper +lower +b +data + +15 + +14 + +13 + +12 + +11 + +10 + +9 + +8 + +7 + +6 + +5 + +4 + +3 + +2 + +1 + +0 +upper +lower +(2) +c +indices + + + + + + + + + + + + + + + + +(3) +ret +returns + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/diagram/vshuf_d.svg b/diagram/vshuf_d.svg new file mode 100644 index 00000000..677c9548 --- /dev/null +++ b/diagram/vshuf_d.svg @@ -0,0 +1,41 @@ + +(1) +b +data + 
+3 + +2 +upper +lower +c +data + +1 + +0 +upper +lower +(2) +a +indices + + +(3) +ret +returns + + + + + + + + + + + + + diff --git a/diagram/vshuf_h.svg b/diagram/vshuf_h.svg new file mode 100644 index 00000000..5c528285 --- /dev/null +++ b/diagram/vshuf_h.svg @@ -0,0 +1,203 @@ + +(1) +b +data + +15 + +14 + +13 + +12 + +11 + +10 + +9 + +8 +upper +lower +c +data + +7 + +6 + +5 + +4 + +3 + +2 + +1 + +0 +upper +lower +(2) +a +indices + + + + + + + + +(3) +ret +returns + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/diagram/vshuf_w.svg b/diagram/vshuf_w.svg new file mode 100644 index 00000000..5ddf2000 --- /dev/null +++ b/diagram/vshuf_w.svg @@ -0,0 +1,79 @@ + +(1) +b +data + +7 + +6 + +5 + +4 +upper +lower +c +data + +3 + +2 + +1 + +0 +upper +lower +(2) +a +indices + + + + +(3) +ret +returns + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/diagram/xvshuf4i_b.svg b/diagram/xvshuf4i_b.svg new file mode 100644 index 00000000..e21a7c31 --- /dev/null +++ b/diagram/xvshuf4i_b.svg @@ -0,0 +1,236 @@ + +(1) +a +data + +3 + +2 + +1 + +0 + +3 + +2 + +1 + +0 + +3 + +2 + +1 + +0 + +3 + +2 + +1 + +0 + +3 + +2 + +1 + +0 + +3 + +2 + +1 + +0 + +3 + +2 + +1 + +0 + +3 + +2 + +1 + +0 +upper +lower +(2) +ret +returns + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/diagram/xvshuf4i_d.svg b/diagram/xvshuf4i_d.svg new file mode 100644 index 00000000..af8ed307 --- /dev/null +++ b/diagram/xvshuf4i_d.svg @@ -0,0 
+1,52 @@ + +(1) +b +data + +3 + +2 + +3 + +2 +upper +lower +a +data + +1 + +0 + +1 + +0 +upper +lower +(2) +ret +returns + + + + + + + + + + + + + + + + + + + + + diff --git a/diagram/xvshuf4i_h.svg b/diagram/xvshuf4i_h.svg new file mode 100644 index 00000000..2b4b1164 --- /dev/null +++ b/diagram/xvshuf4i_h.svg @@ -0,0 +1,124 @@ + +(1) +a +data + +3 + +2 + +1 + +0 + +3 + +2 + +1 + +0 + +3 + +2 + +1 + +0 + +3 + +2 + +1 + +0 +upper +lower +(2) +ret +returns + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/diagram/xvshuf4i_w.svg b/diagram/xvshuf4i_w.svg new file mode 100644 index 00000000..f8d4ca15 --- /dev/null +++ b/diagram/xvshuf4i_w.svg @@ -0,0 +1,68 @@ + +(1) +a +data + +3 + +2 + +1 + +0 + +3 + +2 + +1 + +0 +upper +lower +(2) +ret +returns + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/diagram/xvshuf_b.svg b/diagram/xvshuf_b.svg new file mode 100644 index 00000000..9dce3330 --- /dev/null +++ b/diagram/xvshuf_b.svg @@ -0,0 +1,1464 @@ + +(1) +a +data + +31 + +30 + +29 + +28 + +27 + +26 + +25 + +24 + +23 + +22 + +21 + +20 + +19 + +18 + +17 + +16 + +31 + +30 + +29 + +28 + +27 + +26 + +25 + +24 + +23 + +22 + +21 + +20 + +19 + +18 + +17 + +16 +upper +lower +b +data + +15 + +14 + +13 + +12 + +11 + +10 + +9 + +8 + +7 + +6 + +5 + +4 + +3 + +2 + +1 + +0 + +15 + +14 + +13 + +12 + +11 + +10 + +9 + +8 + +7 + +6 + +5 + +4 + +3 + +2 + +1 + +0 +upper +lower +(2) +hi +merged + +31 + +30 + +29 + +28 + +27 + +26 + +25 + +24 + +23 + +22 + +21 + +20 + +19 + +18 + +17 + +16 + +15 + +14 + +13 + +12 + +11 + +10 + +9 + +8 + +7 + +6 + +5 + +4 + +3 + +2 + +1 + +0 +lo +merged + +31 + +30 + +29 + +28 + +27 + +26 + +25 + +24 + +23 + +22 + +21 + +20 + +19 + +18 + +17 + +16 + +15 + +14 + +13 + +12 + +11 + +10 + +9 + +8 + +7 + +6 + +5 + +4 + +3 + +2 + +1 + +0 +(3) +c +indices + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + +(4) +ret +returns + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/diagram/xvshuf_d.svg b/diagram/xvshuf_d.svg new file mode 100644 index 00000000..835b9838 --- /dev/null +++ b/diagram/xvshuf_d.svg @@ -0,0 +1,92 @@ + +(1) +b +data + +3 + +2 + +3 + +2 +upper +lower +c +data + +1 + +0 + +1 + +0 +upper +lower +(2) +hi +merged + +3 + +2 + +1 + +0 +lo +merged + +3 + +2 + +1 + +0 +(3) +a +indices + + + + +(4) +ret +returns + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/diagram/xvshuf_h.svg b/diagram/xvshuf_h.svg new file mode 100644 index 00000000..731a277e --- /dev/null +++ b/diagram/xvshuf_h.svg @@ -0,0 +1,488 @@ + +(1) +b +data + +15 + +14 + +13 + +12 + +11 + +10 + +9 + +8 + +15 + +14 + +13 + +12 + +11 + +10 + +9 + +8 +upper +lower +c +data + +7 + +6 + +5 + +4 + +3 + +2 + +1 + +0 + +7 + +6 + +5 + +4 + +3 + +2 + +1 + +0 +upper +lower +(2) +hi +merged + +15 + +14 + +13 + +12 + +11 + +10 + +9 + +8 + +7 + +6 + +5 + +4 + +3 + +2 + +1 + +0 +lo +merged + +15 + +14 + +13 + +12 + +11 + +10 + +9 + +8 + +7 + +6 + +5 + +4 + +3 + +2 + +1 + +0 +(3) +a +indices + + + + + + + + + + + + + + + + +(4) +ret +returns + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/diagram/xvshuf_w.svg b/diagram/xvshuf_w.svg new file mode 100644 index 00000000..9e4999c2 --- /dev/null +++ b/diagram/xvshuf_w.svg @@ -0,0 +1,192 @@ + +(1) +b +data + +7 + +6 + +5 + +4 + +7 + +6 + +5 + +4 +upper +lower +c +data + +3 + +2 + +1 + +0 + +3 + +2 + +1 + +0 +upper +lower +(2) +hi +merged + +7 + +6 + +5 + +4 + +3 + +2 + +1 + +0 +lo +merged + +7 + +6 + +5 + +4 + +3 + +2 + +1 + +0 +(3) +a +indices + + + + + + + + +(4) +ret +returns + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/img/favicon.ico b/img/favicon.ico new file mode 100644 index 00000000..e85006a3 Binary files /dev/null and b/img/favicon.ico differ diff --git a/index.html b/index.html new file mode 100644 index 00000000..84fe14a0 --- /dev/null +++ b/index.html @@ -0,0 +1,227 @@ + + + + + + + + Unofficial LoongArch Intrinsics Guide + + + + + + + + + + + + + + +
+ + +
+ +
+
+
    +
  • + +
  • +
  • +
+
+
+
+
+ +

Unofficial LoongArch Intrinsics Guide

+

This is the Unofficial LoongArch Intrinsics Guide by Jiajie Chen et al. The documentation is compiled from the following sources:

+
    +
  • QEMU
  • +
  • GCC
  • +
  • Observations from real hardware incl. 3C5000 and 3A6000
  • +
+

The guide provides pseudo code for the SIMD intrinsics. The code assumes that the elements of the LSX/LASX vector registers can be accessed via members of a union:

+
union lsx_register {
+  uint8_t byte[16];
+  uint16_t half[8];
+  uint32_t word[4];
+  uint64_t dword[2];
+  uint128_t qword[1];
+  float fp32[4];
+  double fp64[2];
+};
+
+union lasx_register {
+  uint8_t byte[32];
+  uint16_t half[16];
+  uint32_t word[8];
+  uint64_t dword[4];
+  uint128_t qword[2];
+  float fp32[8];
+  double fp64[4];
+};
+
+ +
+
+ +
+
+ +
+ +
+ +
+ + + + + Next » + + +
+ + + + + + + + + + diff --git a/js/html5shiv.min.js b/js/html5shiv.min.js new file mode 100644 index 00000000..1a01c94b --- /dev/null +++ b/js/html5shiv.min.js @@ -0,0 +1,4 @@ +/** +* @preserve HTML5 Shiv 3.7.3 | @afarkas @jdalton @jon_neal @rem | MIT/GPL2 Licensed +*/ +!function(a,b){function c(a,b){var c=a.createElement("p"),d=a.getElementsByTagName("head")[0]||a.documentElement;return c.innerHTML="x",d.insertBefore(c.lastChild,d.firstChild)}function d(){var a=t.elements;return"string"==typeof a?a.split(" "):a}function e(a,b){var c=t.elements;"string"!=typeof c&&(c=c.join(" ")),"string"!=typeof a&&(a=a.join(" ")),t.elements=c+" "+a,j(b)}function f(a){var b=s[a[q]];return b||(b={},r++,a[q]=r,s[r]=b),b}function g(a,c,d){if(c||(c=b),l)return c.createElement(a);d||(d=f(c));var e;return e=d.cache[a]?d.cache[a].cloneNode():p.test(a)?(d.cache[a]=d.createElem(a)).cloneNode():d.createElem(a),!e.canHaveChildren||o.test(a)||e.tagUrn?e:d.frag.appendChild(e)}function h(a,c){if(a||(a=b),l)return a.createDocumentFragment();c=c||f(a);for(var e=c.frag.cloneNode(),g=0,h=d(),i=h.length;i>g;g++)e.createElement(h[g]);return e}function i(a,b){b.cache||(b.cache={},b.createElem=a.createElement,b.createFrag=a.createDocumentFragment,b.frag=b.createFrag()),a.createElement=function(c){return t.shivMethods?g(c,a,b):b.createElem(c)},a.createDocumentFragment=Function("h,f","return function(){var n=f.cloneNode(),c=n.createElement;h.shivMethods&&("+d().join().replace(/[\w\-:]+/g,function(a){return b.createElem(a),b.frag.createElement(a),'c("'+a+'")'})+");return n}")(t,b.frag)}function j(a){a||(a=b);var d=f(a);return!t.shivCSS||k||d.hasCSS||(d.hasCSS=!!c(a,"article,aside,dialog,figcaption,figure,footer,header,hgroup,main,nav,section{display:block}mark{background:#FF0;color:#000}template{display:none}")),l||i(a,d),a}var 
k,l,m="3.7.3",n=a.html5||{},o=/^<|^(?:button|map|select|textarea|object|iframe|option|optgroup)$/i,p=/^(?:a|b|code|div|fieldset|h1|h2|h3|h4|h5|h6|i|label|li|ol|p|q|span|strong|style|table|tbody|td|th|tr|ul)$/i,q="_html5shiv",r=0,s={};!function(){try{var a=b.createElement("a");a.innerHTML="",k="hidden"in a,l=1==a.childNodes.length||function(){b.createElement("a");var a=b.createDocumentFragment();return"undefined"==typeof a.cloneNode||"undefined"==typeof a.createDocumentFragment||"undefined"==typeof a.createElement}()}catch(c){k=!0,l=!0}}();var t={elements:n.elements||"abbr article aside audio bdi canvas data datalist details dialog figcaption figure footer header hgroup main mark meter nav output picture progress section summary template time video",version:m,shivCSS:n.shivCSS!==!1,supportsUnknownElements:l,shivMethods:n.shivMethods!==!1,type:"default",shivDocument:j,createElement:g,createDocumentFragment:h,addElements:e};a.html5=t,j(b),"object"==typeof module&&module.exports&&(module.exports=t)}("undefined"!=typeof window?window:this,document); diff --git a/js/jquery-3.6.0.min.js b/js/jquery-3.6.0.min.js new file mode 100644 index 00000000..c4c6022f --- /dev/null +++ b/js/jquery-3.6.0.min.js @@ -0,0 +1,2 @@ +/*! 
jQuery v3.6.0 | (c) OpenJS Foundation and other contributors | jquery.org/license */ +!function(e,t){"use strict";"object"==typeof module&&"object"==typeof module.exports?module.exports=e.document?t(e,!0):function(e){if(!e.document)throw new Error("jQuery requires a window with a document");return t(e)}:t(e)}("undefined"!=typeof window?window:this,function(C,e){"use strict";var t=[],r=Object.getPrototypeOf,s=t.slice,g=t.flat?function(e){return t.flat.call(e)}:function(e){return t.concat.apply([],e)},u=t.push,i=t.indexOf,n={},o=n.toString,v=n.hasOwnProperty,a=v.toString,l=a.call(Object),y={},m=function(e){return"function"==typeof e&&"number"!=typeof e.nodeType&&"function"!=typeof e.item},x=function(e){return null!=e&&e===e.window},E=C.document,c={type:!0,src:!0,nonce:!0,noModule:!0};function b(e,t,n){var r,i,o=(n=n||E).createElement("script");if(o.text=e,t)for(r in c)(i=t[r]||t.getAttribute&&t.getAttribute(r))&&o.setAttribute(r,i);n.head.appendChild(o).parentNode.removeChild(o)}function w(e){return null==e?e+"":"object"==typeof e||"function"==typeof e?n[o.call(e)]||"object":typeof e}var f="3.6.0",S=function(e,t){return new S.fn.init(e,t)};function p(e){var t=!!e&&"length"in e&&e.length,n=w(e);return!m(e)&&!x(e)&&("array"===n||0===t||"number"==typeof t&&0+~]|"+M+")"+M+"*"),U=new RegExp(M+"|>"),X=new RegExp(F),V=new RegExp("^"+I+"$"),G={ID:new RegExp("^#("+I+")"),CLASS:new RegExp("^\\.("+I+")"),TAG:new RegExp("^("+I+"|[*])"),ATTR:new RegExp("^"+W),PSEUDO:new RegExp("^"+F),CHILD:new RegExp("^:(only|first|last|nth|nth-last)-(child|of-type)(?:\\("+M+"*(even|odd|(([+-]|)(\\d*)n|)"+M+"*(?:([+-]|)"+M+"*(\\d+)|))"+M+"*\\)|)","i"),bool:new RegExp("^(?:"+R+")$","i"),needsContext:new RegExp("^"+M+"*[>+~]|:(even|odd|eq|gt|lt|nth|first|last)(?:\\("+M+"*((?:-\\d)?\\d*)"+M+"*\\)|)(?=[^-]|$)","i")},Y=/HTML$/i,Q=/^(?:input|select|textarea|button)$/i,J=/^h\d$/i,K=/^[^{]+\{\s*\[native \w/,Z=/^(?:#([\w-]+)|(\w+)|\.([\w-]+))$/,ee=/[+~]/,te=new 
RegExp("\\\\[\\da-fA-F]{1,6}"+M+"?|\\\\([^\\r\\n\\f])","g"),ne=function(e,t){var n="0x"+e.slice(1)-65536;return t||(n<0?String.fromCharCode(n+65536):String.fromCharCode(n>>10|55296,1023&n|56320))},re=/([\0-\x1f\x7f]|^-?\d)|^-$|[^\0-\x1f\x7f-\uFFFF\w-]/g,ie=function(e,t){return t?"\0"===e?"\ufffd":e.slice(0,-1)+"\\"+e.charCodeAt(e.length-1).toString(16)+" ":"\\"+e},oe=function(){T()},ae=be(function(e){return!0===e.disabled&&"fieldset"===e.nodeName.toLowerCase()},{dir:"parentNode",next:"legend"});try{H.apply(t=O.call(p.childNodes),p.childNodes),t[p.childNodes.length].nodeType}catch(e){H={apply:t.length?function(e,t){L.apply(e,O.call(t))}:function(e,t){var n=e.length,r=0;while(e[n++]=t[r++]);e.length=n-1}}}function se(t,e,n,r){var i,o,a,s,u,l,c,f=e&&e.ownerDocument,p=e?e.nodeType:9;if(n=n||[],"string"!=typeof t||!t||1!==p&&9!==p&&11!==p)return n;if(!r&&(T(e),e=e||C,E)){if(11!==p&&(u=Z.exec(t)))if(i=u[1]){if(9===p){if(!(a=e.getElementById(i)))return n;if(a.id===i)return n.push(a),n}else if(f&&(a=f.getElementById(i))&&y(e,a)&&a.id===i)return n.push(a),n}else{if(u[2])return H.apply(n,e.getElementsByTagName(t)),n;if((i=u[3])&&d.getElementsByClassName&&e.getElementsByClassName)return H.apply(n,e.getElementsByClassName(i)),n}if(d.qsa&&!N[t+" "]&&(!v||!v.test(t))&&(1!==p||"object"!==e.nodeName.toLowerCase())){if(c=t,f=e,1===p&&(U.test(t)||z.test(t))){(f=ee.test(t)&&ye(e.parentNode)||e)===e&&d.scope||((s=e.getAttribute("id"))?s=s.replace(re,ie):e.setAttribute("id",s=S)),o=(l=h(t)).length;while(o--)l[o]=(s?"#"+s:":scope")+" "+xe(l[o]);c=l.join(",")}try{return H.apply(n,f.querySelectorAll(c)),n}catch(e){N(t,!0)}finally{s===S&&e.removeAttribute("id")}}}return g(t.replace($,"$1"),e,n,r)}function ue(){var r=[];return function e(t,n){return r.push(t+" ")>b.cacheLength&&delete e[r.shift()],e[t+" "]=n}}function le(e){return e[S]=!0,e}function ce(e){var t=C.createElement("fieldset");try{return!!e(t)}catch(e){return!1}finally{t.parentNode&&t.parentNode.removeChild(t),t=null}}function 
fe(e,t){var n=e.split("|"),r=n.length;while(r--)b.attrHandle[n[r]]=t}function pe(e,t){var n=t&&e,r=n&&1===e.nodeType&&1===t.nodeType&&e.sourceIndex-t.sourceIndex;if(r)return r;if(n)while(n=n.nextSibling)if(n===t)return-1;return e?1:-1}function de(t){return function(e){return"input"===e.nodeName.toLowerCase()&&e.type===t}}function he(n){return function(e){var t=e.nodeName.toLowerCase();return("input"===t||"button"===t)&&e.type===n}}function ge(t){return function(e){return"form"in e?e.parentNode&&!1===e.disabled?"label"in e?"label"in e.parentNode?e.parentNode.disabled===t:e.disabled===t:e.isDisabled===t||e.isDisabled!==!t&&ae(e)===t:e.disabled===t:"label"in e&&e.disabled===t}}function ve(a){return le(function(o){return o=+o,le(function(e,t){var n,r=a([],e.length,o),i=r.length;while(i--)e[n=r[i]]&&(e[n]=!(t[n]=e[n]))})})}function ye(e){return e&&"undefined"!=typeof e.getElementsByTagName&&e}for(e in d=se.support={},i=se.isXML=function(e){var t=e&&e.namespaceURI,n=e&&(e.ownerDocument||e).documentElement;return!Y.test(t||n&&n.nodeName||"HTML")},T=se.setDocument=function(e){var t,n,r=e?e.ownerDocument||e:p;return r!=C&&9===r.nodeType&&r.documentElement&&(a=(C=r).documentElement,E=!i(C),p!=C&&(n=C.defaultView)&&n.top!==n&&(n.addEventListener?n.addEventListener("unload",oe,!1):n.attachEvent&&n.attachEvent("onunload",oe)),d.scope=ce(function(e){return a.appendChild(e).appendChild(C.createElement("div")),"undefined"!=typeof e.querySelectorAll&&!e.querySelectorAll(":scope fieldset div").length}),d.attributes=ce(function(e){return e.className="i",!e.getAttribute("className")}),d.getElementsByTagName=ce(function(e){return e.appendChild(C.createComment("")),!e.getElementsByTagName("*").length}),d.getElementsByClassName=K.test(C.getElementsByClassName),d.getById=ce(function(e){return a.appendChild(e).id=S,!C.getElementsByName||!C.getElementsByName(S).length}),d.getById?(b.filter.ID=function(e){var t=e.replace(te,ne);return function(e){return 
e.getAttribute("id")===t}},b.find.ID=function(e,t){if("undefined"!=typeof t.getElementById&&E){var n=t.getElementById(e);return n?[n]:[]}}):(b.filter.ID=function(e){var n=e.replace(te,ne);return function(e){var t="undefined"!=typeof e.getAttributeNode&&e.getAttributeNode("id");return t&&t.value===n}},b.find.ID=function(e,t){if("undefined"!=typeof t.getElementById&&E){var n,r,i,o=t.getElementById(e);if(o){if((n=o.getAttributeNode("id"))&&n.value===e)return[o];i=t.getElementsByName(e),r=0;while(o=i[r++])if((n=o.getAttributeNode("id"))&&n.value===e)return[o]}return[]}}),b.find.TAG=d.getElementsByTagName?function(e,t){return"undefined"!=typeof t.getElementsByTagName?t.getElementsByTagName(e):d.qsa?t.querySelectorAll(e):void 0}:function(e,t){var n,r=[],i=0,o=t.getElementsByTagName(e);if("*"===e){while(n=o[i++])1===n.nodeType&&r.push(n);return r}return o},b.find.CLASS=d.getElementsByClassName&&function(e,t){if("undefined"!=typeof t.getElementsByClassName&&E)return t.getElementsByClassName(e)},s=[],v=[],(d.qsa=K.test(C.querySelectorAll))&&(ce(function(e){var t;a.appendChild(e).innerHTML="",e.querySelectorAll("[msallowcapture^='']").length&&v.push("[*^$]="+M+"*(?:''|\"\")"),e.querySelectorAll("[selected]").length||v.push("\\["+M+"*(?:value|"+R+")"),e.querySelectorAll("[id~="+S+"-]").length||v.push("~="),(t=C.createElement("input")).setAttribute("name",""),e.appendChild(t),e.querySelectorAll("[name='']").length||v.push("\\["+M+"*name"+M+"*="+M+"*(?:''|\"\")"),e.querySelectorAll(":checked").length||v.push(":checked"),e.querySelectorAll("a#"+S+"+*").length||v.push(".#.+[+~]"),e.querySelectorAll("\\\f"),v.push("[\\r\\n\\f]")}),ce(function(e){e.innerHTML="";var 
t=C.createElement("input");t.setAttribute("type","hidden"),e.appendChild(t).setAttribute("name","D"),e.querySelectorAll("[name=d]").length&&v.push("name"+M+"*[*^$|!~]?="),2!==e.querySelectorAll(":enabled").length&&v.push(":enabled",":disabled"),a.appendChild(e).disabled=!0,2!==e.querySelectorAll(":disabled").length&&v.push(":enabled",":disabled"),e.querySelectorAll("*,:x"),v.push(",.*:")})),(d.matchesSelector=K.test(c=a.matches||a.webkitMatchesSelector||a.mozMatchesSelector||a.oMatchesSelector||a.msMatchesSelector))&&ce(function(e){d.disconnectedMatch=c.call(e,"*"),c.call(e,"[s!='']:x"),s.push("!=",F)}),v=v.length&&new RegExp(v.join("|")),s=s.length&&new RegExp(s.join("|")),t=K.test(a.compareDocumentPosition),y=t||K.test(a.contains)?function(e,t){var n=9===e.nodeType?e.documentElement:e,r=t&&t.parentNode;return e===r||!(!r||1!==r.nodeType||!(n.contains?n.contains(r):e.compareDocumentPosition&&16&e.compareDocumentPosition(r)))}:function(e,t){if(t)while(t=t.parentNode)if(t===e)return!0;return!1},j=t?function(e,t){if(e===t)return l=!0,0;var n=!e.compareDocumentPosition-!t.compareDocumentPosition;return n||(1&(n=(e.ownerDocument||e)==(t.ownerDocument||t)?e.compareDocumentPosition(t):1)||!d.sortDetached&&t.compareDocumentPosition(e)===n?e==C||e.ownerDocument==p&&y(p,e)?-1:t==C||t.ownerDocument==p&&y(p,t)?1:u?P(u,e)-P(u,t):0:4&n?-1:1)}:function(e,t){if(e===t)return l=!0,0;var n,r=0,i=e.parentNode,o=t.parentNode,a=[e],s=[t];if(!i||!o)return e==C?-1:t==C?1:i?-1:o?1:u?P(u,e)-P(u,t):0;if(i===o)return pe(e,t);n=e;while(n=n.parentNode)a.unshift(n);n=t;while(n=n.parentNode)s.unshift(n);while(a[r]===s[r])r++;return r?pe(a[r],s[r]):a[r]==p?-1:s[r]==p?1:0}),C},se.matches=function(e,t){return se(e,null,null,t)},se.matchesSelector=function(e,t){if(T(e),d.matchesSelector&&E&&!N[t+" "]&&(!s||!s.test(t))&&(!v||!v.test(t)))try{var n=c.call(e,t);if(n||d.disconnectedMatch||e.document&&11!==e.document.nodeType)return n}catch(e){N(t,!0)}return 0":{dir:"parentNode",first:!0}," 
":{dir:"parentNode"},"+":{dir:"previousSibling",first:!0},"~":{dir:"previousSibling"}},preFilter:{ATTR:function(e){return e[1]=e[1].replace(te,ne),e[3]=(e[3]||e[4]||e[5]||"").replace(te,ne),"~="===e[2]&&(e[3]=" "+e[3]+" "),e.slice(0,4)},CHILD:function(e){return e[1]=e[1].toLowerCase(),"nth"===e[1].slice(0,3)?(e[3]||se.error(e[0]),e[4]=+(e[4]?e[5]+(e[6]||1):2*("even"===e[3]||"odd"===e[3])),e[5]=+(e[7]+e[8]||"odd"===e[3])):e[3]&&se.error(e[0]),e},PSEUDO:function(e){var t,n=!e[6]&&e[2];return G.CHILD.test(e[0])?null:(e[3]?e[2]=e[4]||e[5]||"":n&&X.test(n)&&(t=h(n,!0))&&(t=n.indexOf(")",n.length-t)-n.length)&&(e[0]=e[0].slice(0,t),e[2]=n.slice(0,t)),e.slice(0,3))}},filter:{TAG:function(e){var t=e.replace(te,ne).toLowerCase();return"*"===e?function(){return!0}:function(e){return e.nodeName&&e.nodeName.toLowerCase()===t}},CLASS:function(e){var t=m[e+" "];return t||(t=new RegExp("(^|"+M+")"+e+"("+M+"|$)"))&&m(e,function(e){return t.test("string"==typeof e.className&&e.className||"undefined"!=typeof e.getAttribute&&e.getAttribute("class")||"")})},ATTR:function(n,r,i){return function(e){var t=se.attr(e,n);return null==t?"!="===r:!r||(t+="","="===r?t===i:"!="===r?t!==i:"^="===r?i&&0===t.indexOf(i):"*="===r?i&&-1:\x20\t\r\n\f]*)[\x20\t\r\n\f]*\/?>(?:<\/\1>|)$/i;function j(e,n,r){return m(n)?S.grep(e,function(e,t){return!!n.call(e,t,e)!==r}):n.nodeType?S.grep(e,function(e){return e===n!==r}):"string"!=typeof n?S.grep(e,function(e){return-1)[^>]*|#([\w-]+))$/;(S.fn.init=function(e,t,n){var r,i;if(!e)return this;if(n=n||D,"string"==typeof e){if(!(r="<"===e[0]&&">"===e[e.length-1]&&3<=e.length?[null,e,null]:q.exec(e))||!r[1]&&t)return!t||t.jquery?(t||n).find(e):this.constructor(t).find(e);if(r[1]){if(t=t instanceof S?t[0]:t,S.merge(this,S.parseHTML(r[1],t&&t.nodeType?t.ownerDocument||t:E,!0)),N.test(r[1])&&S.isPlainObject(t))for(r in t)m(this[r])?this[r](t[r]):this.attr(r,t[r]);return this}return(i=E.getElementById(r[2]))&&(this[0]=i,this.length=1),this}return 
e.nodeType?(this[0]=e,this.length=1,this):m(e)?void 0!==n.ready?n.ready(e):e(S):S.makeArray(e,this)}).prototype=S.fn,D=S(E);var L=/^(?:parents|prev(?:Until|All))/,H={children:!0,contents:!0,next:!0,prev:!0};function O(e,t){while((e=e[t])&&1!==e.nodeType);return e}S.fn.extend({has:function(e){var t=S(e,this),n=t.length;return this.filter(function(){for(var e=0;e\x20\t\r\n\f]*)/i,he=/^$|^module$|\/(?:java|ecma)script/i;ce=E.createDocumentFragment().appendChild(E.createElement("div")),(fe=E.createElement("input")).setAttribute("type","radio"),fe.setAttribute("checked","checked"),fe.setAttribute("name","t"),ce.appendChild(fe),y.checkClone=ce.cloneNode(!0).cloneNode(!0).lastChild.checked,ce.innerHTML="",y.noCloneChecked=!!ce.cloneNode(!0).lastChild.defaultValue,ce.innerHTML="",y.option=!!ce.lastChild;var ge={thead:[1,"","
"],col:[2,"","
"],tr:[2,"","
"],td:[3,"","
"],_default:[0,"",""]};function ve(e,t){var n;return n="undefined"!=typeof e.getElementsByTagName?e.getElementsByTagName(t||"*"):"undefined"!=typeof e.querySelectorAll?e.querySelectorAll(t||"*"):[],void 0===t||t&&A(e,t)?S.merge([e],n):n}function ye(e,t){for(var n=0,r=e.length;n",""]);var me=/<|&#?\w+;/;function xe(e,t,n,r,i){for(var o,a,s,u,l,c,f=t.createDocumentFragment(),p=[],d=0,h=e.length;d\s*$/g;function je(e,t){return A(e,"table")&&A(11!==t.nodeType?t:t.firstChild,"tr")&&S(e).children("tbody")[0]||e}function De(e){return e.type=(null!==e.getAttribute("type"))+"/"+e.type,e}function qe(e){return"true/"===(e.type||"").slice(0,5)?e.type=e.type.slice(5):e.removeAttribute("type"),e}function Le(e,t){var n,r,i,o,a,s;if(1===t.nodeType){if(Y.hasData(e)&&(s=Y.get(e).events))for(i in Y.remove(t,"handle events"),s)for(n=0,r=s[i].length;n").attr(n.scriptAttrs||{}).prop({charset:n.scriptCharset,src:n.url}).on("load error",i=function(e){r.remove(),i=null,e&&t("error"===e.type?404:200,e.type)}),E.head.appendChild(r[0])},abort:function(){i&&i()}}});var _t,zt=[],Ut=/(=)\?(?=&|$)|\?\?/;S.ajaxSetup({jsonp:"callback",jsonpCallback:function(){var e=zt.pop()||S.expando+"_"+wt.guid++;return this[e]=!0,e}}),S.ajaxPrefilter("json jsonp",function(e,t,n){var r,i,o,a=!1!==e.jsonp&&(Ut.test(e.url)?"url":"string"==typeof e.data&&0===(e.contentType||"").indexOf("application/x-www-form-urlencoded")&&Ut.test(e.data)&&"data");if(a||"jsonp"===e.dataTypes[0])return r=e.jsonpCallback=m(e.jsonpCallback)?e.jsonpCallback():e.jsonpCallback,a?e[a]=e[a].replace(Ut,"$1"+r):!1!==e.jsonp&&(e.url+=(Tt.test(e.url)?"&":"?")+e.jsonp+"="+r),e.converters["script json"]=function(){return o||S.error(r+" was not called"),o[0]},e.dataTypes[0]="json",i=C[r],C[r]=function(){o=arguments},n.always(function(){void 0===i?S(C).removeProp(r):C[r]=i,e[r]&&(e.jsonpCallback=t.jsonpCallback,zt.push(r)),o&&m(i)&&i(o[0]),o=i=void 0}),"script"}),y.createHTMLDocument=((_t=E.implementation.createHTMLDocument("").body).innerHTML="
",2===_t.childNodes.length),S.parseHTML=function(e,t,n){return"string"!=typeof e?[]:("boolean"==typeof t&&(n=t,t=!1),t||(y.createHTMLDocument?((r=(t=E.implementation.createHTMLDocument("")).createElement("base")).href=E.location.href,t.head.appendChild(r)):t=E),o=!n&&[],(i=N.exec(e))?[t.createElement(i[1])]:(i=xe([e],t,o),o&&o.length&&S(o).remove(),S.merge([],i.childNodes)));var r,i,o},S.fn.load=function(e,t,n){var r,i,o,a=this,s=e.indexOf(" ");return-1").append(S.parseHTML(e)).find(r):e)}).always(n&&function(e,t){a.each(function(){n.apply(this,o||[e.responseText,t,e])})}),this},S.expr.pseudos.animated=function(t){return S.grep(S.timers,function(e){return t===e.elem}).length},S.offset={setOffset:function(e,t,n){var r,i,o,a,s,u,l=S.css(e,"position"),c=S(e),f={};"static"===l&&(e.style.position="relative"),s=c.offset(),o=S.css(e,"top"),u=S.css(e,"left"),("absolute"===l||"fixed"===l)&&-1<(o+u).indexOf("auto")?(a=(r=c.position()).top,i=r.left):(a=parseFloat(o)||0,i=parseFloat(u)||0),m(t)&&(t=t.call(e,n,S.extend({},s))),null!=t.top&&(f.top=t.top-s.top+a),null!=t.left&&(f.left=t.left-s.left+i),"using"in t?t.using.call(e,f):c.css(f)}},S.fn.extend({offset:function(t){if(arguments.length)return void 0===t?this:this.each(function(e){S.offset.setOffset(this,t,e)});var e,n,r=this[0];return r?r.getClientRects().length?(e=r.getBoundingClientRect(),n=r.ownerDocument.defaultView,{top:e.top+n.pageYOffset,left:e.left+n.pageXOffset}):{top:0,left:0}:void 0},position:function(){if(this[0]){var 
e,t,n,r=this[0],i={top:0,left:0};if("fixed"===S.css(r,"position"))t=r.getBoundingClientRect();else{t=this.offset(),n=r.ownerDocument,e=r.offsetParent||n.documentElement;while(e&&(e===n.body||e===n.documentElement)&&"static"===S.css(e,"position"))e=e.parentNode;e&&e!==r&&1===e.nodeType&&((i=S(e).offset()).top+=S.css(e,"borderTopWidth",!0),i.left+=S.css(e,"borderLeftWidth",!0))}return{top:t.top-i.top-S.css(r,"marginTop",!0),left:t.left-i.left-S.css(r,"marginLeft",!0)}}},offsetParent:function(){return this.map(function(){var e=this.offsetParent;while(e&&"static"===S.css(e,"position"))e=e.offsetParent;return e||re})}}),S.each({scrollLeft:"pageXOffset",scrollTop:"pageYOffset"},function(t,i){var o="pageYOffset"===i;S.fn[t]=function(e){return $(this,function(e,t,n){var r;if(x(e)?r=e:9===e.nodeType&&(r=e.defaultView),void 0===n)return r?r[i]:e[t];r?r.scrollTo(o?r.pageXOffset:n,o?n:r.pageYOffset):e[t]=n},t,e,arguments.length)}}),S.each(["top","left"],function(e,n){S.cssHooks[n]=Fe(y.pixelPosition,function(e,t){if(t)return t=We(e,n),Pe.test(t)?S(e).position()[n]+"px":t})}),S.each({Height:"height",Width:"width"},function(a,s){S.each({padding:"inner"+a,content:s,"":"outer"+a},function(r,o){S.fn[o]=function(e,t){var n=arguments.length&&(r||"boolean"!=typeof e),i=r||(!0===e||!0===t?"margin":"border");return $(this,function(e,t,n){var r;return x(e)?0===o.indexOf("outer")?e["inner"+a]:e.document.documentElement["client"+a]:9===e.nodeType?(r=e.documentElement,Math.max(e.body["scroll"+a],r["scroll"+a],e.body["offset"+a],r["offset"+a],r["client"+a])):void 0===n?S.css(e,t,i):S.style(e,t,n,i)},s,n?e:void 0,n)}})}),S.each(["ajaxStart","ajaxStop","ajaxComplete","ajaxError","ajaxSuccess","ajaxSend"],function(e,t){S.fn[t]=function(e){return this.on(t,e)}}),S.fn.extend({bind:function(e,t,n){return this.on(e,null,t,n)},unbind:function(e,t){return this.off(e,null,t)},delegate:function(e,t,n,r){return this.on(t,e,n,r)},undelegate:function(e,t,n){return 
1===arguments.length?this.off(e,"**"):this.off(t,e||"**",n)},hover:function(e,t){return this.mouseenter(e).mouseleave(t||e)}}),S.each("blur focus focusin focusout resize scroll click dblclick mousedown mouseup mousemove mouseover mouseout mouseenter mouseleave change select submit keydown keypress keyup contextmenu".split(" "),function(e,n){S.fn[n]=function(e,t){return 0"),n("table.docutils.footnote").wrap("
"),n("table.docutils.citation").wrap("
"),n(".wy-menu-vertical ul").not(".simple").siblings("a").each((function(){var t=n(this);expand=n(''),expand.on("click",(function(n){return e.toggleCurrent(t),n.stopPropagation(),!1})),t.prepend(expand)}))},reset:function(){var n=encodeURI(window.location.hash)||"#";try{var e=$(".wy-menu-vertical"),t=e.find('[href="'+n+'"]');if(0===t.length){var i=$('.document [id="'+n.substring(1)+'"]').closest("div.section");0===(t=e.find('[href="#'+i.attr("id")+'"]')).length&&(t=e.find('[href="#"]'))}if(t.length>0){$(".wy-menu-vertical .current").removeClass("current").attr("aria-expanded","false"),t.addClass("current").attr("aria-expanded","true"),t.closest("li.toctree-l1").parent().addClass("current").attr("aria-expanded","true");for(let n=1;n<=10;n++)t.closest("li.toctree-l"+n).addClass("current").attr("aria-expanded","true");t[0].scrollIntoView()}}catch(n){console.log("Error expanding nav for anchor",n)}},onScroll:function(){this.winScroll=!1;var n=this.win.scrollTop(),e=n+this.winHeight,t=this.navBar.scrollTop()+(n-this.winPosition);n<0||e>this.docHeight||(this.navBar.scrollTop(t),this.winPosition=n)},onResize:function(){this.winResize=!1,this.winHeight=this.win.height(),this.docHeight=$(document).height()},hashChange:function(){this.linkScroll=!0,this.win.one("hashchange",(function(){this.linkScroll=!1}))},toggleCurrent:function(n){var e=n.closest("li");e.siblings("li.current").removeClass("current").attr("aria-expanded","false"),e.siblings().find("li.current").removeClass("current").attr("aria-expanded","false");var t=e.find("> ul li");t.length&&(t.removeClass("current").attr("aria-expanded","false"),e.toggleClass("current").attr("aria-expanded",(function(n,e){return"true"==e?"false":"true"})))}},"undefined"!=typeof window&&(window.SphinxRtdTheme={Navigation:n.exports.ThemeNav,StickyNav:n.exports.ThemeNav}),function(){for(var n=0,e=["ms","moz","webkit","o"],t=0;t + + + + + + + Bitwise Operations - Unofficial LoongArch Intrinsics Guide + + + + + + + + + + + + + + +
+ + +
+ +
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Bitwise Operations

+

__m256i __lasx_xvbitsel_v (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvbitsel_v (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvbitsel.v xr, xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute bitwise selection: for each bit position, if the bit in c equals one, copy the bit from b to dst; otherwise copy the bit from a.

+

Examples

+
__m256i __lasx_xvbitsel_v(__m256i{0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabababababababab, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678}, __m256i{0xffff0000aaaabbbb, 0x1111222233334444, 0x00000000ffffffff, 0xffffffff00000000})
+= 0xabab3344ffeeefab 0x98ba9beccfedfb00 0xabcdef1243214321 0x56785678ddeeddee
+
+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (c.dword[i] & b.dword[i]) | (~c.dword[i] & a.dword[i]);
+}
+
+

Tested on a real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 2
3C5000 | 1 | 2
+

__m256i __lasx_xvbitseli_b (__m256i a, __m256i b, imm0_255 imm)

+

Synopsis

+
__m256i __lasx_xvbitseli_b (__m256i a, __m256i b, imm0_255 imm)
+#include <lasxintrin.h>
+Instruction: xvbitseli.b xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compute bitwise selection: for each bit position, if the bit in a equals one, copy the bit from imm to dst; otherwise copy the bit from b.

+

Examples

+
__m256i __lasx_xvbitseli_b( __m256i{ 0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{ 0xabababababababab, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678}, 0x12)
+= 0xba8b9aabba8b9a23 0x1216123012031221 0x1230123653115311 0x5652565212121212
+
+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = (~a.byte[i] & b.byte[i]) | (a.byte[i] & (u8)imm);
+}
+
+

Tested on a real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 2
3C5000 | 1 | 2
+

__m256i __lasx_xvbitclr_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvbitclr_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvbitclr.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

For each 8-bit element in a, clear the bit whose index is given by the corresponding element in b (taken modulo 8), and save the result in dst.

+

Examples

+
__m256i __lasx_xvbitclr_b(__m256i{0xffffffffffffffff, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabababababababab, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})
+= 0xf7f7f7f7f7f7f7f7 0x99aabbccd5ecf700 0xabcdeb0212341234 0xaabaaaba9dee9dee
+
+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = a.byte[i] & (~((u8)1 << (b.byte[i] % 8)));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvbitclr_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvbitclr_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvbitclr.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Clear the bit specified by elements in b from 16-bit elements in a, save the result in dst.

+

Examples

+
__m256i __lasx_xvbitclr_h(__m256i{0xffffffffffffffff, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabababababababab, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})
+= 0xf7fff7fff7fff7ff 0x99aabbccddecff00 0xabcdef0212341234 0xaabbaabbdceedcee
+
+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = a.half[i] & (~((u16)1 << (b.half[i] % 16)));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvbitclr_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvbitclr_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvbitclr.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Clear the bit specified by elements in b from 32-bit elements in a, save the result in dst.

+

Examples

+
__m256i __lasx_xvbitclr_w(__m256i{0xffffffffffffffff, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabababababababab, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})
+= 0xfffff7fffffff7ff 0x99aabbccddeeff00 0xabcdef1212341234 0xaabbaabbdceeddee
+
+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = a.word[i] & (~((u32)1 << (b.word[i] % 32)));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvbitclr_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvbitclr_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvbitclr.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Clear the bit specified by elements in b from 64-bit elements in a, save the result in dst.

+

Examples

+
__m256i __lasx_xvbitclr_d(__m256i{0xffffffffffffffff, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabababababababab, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})
+= 0xfffff7ffffffffff 0x99aabbccddeeff00 0xabcdef1012341234 0xaabbaabbddeeddee
+
+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = a.dword[i] & (~((u64)1 << (b.dword[i] % 64)));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvbitclri_b (__m256i a, imm0_7 imm)

+

Synopsis

+
__m256i __lasx_xvbitclri_b (__m256i a, imm0_7 imm)
+#include <lasxintrin.h>
+Instruction: xvbitclri.b xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Clear the bit specified by imm from 8-bit elements in a, save the result in dst.

+

Examples

+
__m256i __lasx_xvbitclri_b( __m256i{ 0xffffffffffffffff, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, 1)
+= 0xfdfdfdfdfdfdfdfd 0x99a8b9ccddecfd00 0xa9cded1010341034 0xa8b9a8b9ddecddec
+
+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = a.byte[i] & (~((u8)1 << imm));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvbitclri_h (__m256i a, imm0_15 imm)

+

Synopsis

+
__m256i __lasx_xvbitclri_h (__m256i a, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvbitclri.h xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Clear the bit specified by imm from 16-bit elements in a, save the result in dst.

+

Examples

+
__m256i __lasx_xvbitclri_h( __m256i{ 0xffffffffffffffff, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, 1)
+= 0xfffdfffdfffdfffd 0x99a8bbccddecff00 0xabcdef1012341234 0xaab9aab9ddecddec
+
+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = a.half[i] & (~((u16)1 << imm));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvbitclri_w (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvbitclri_w (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvbitclri.w xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Clear the bit specified by imm from 32-bit elements in a, save the result in dst.

+

Examples

+
__m256i __lasx_xvbitclri_w( __m256i{ 0xffffffffffffffff, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, 1)
+= 0xfffffffdfffffffd 0x99aabbccddeeff00 0xabcdef1012341234 0xaabbaab9ddeeddec
+
+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = a.word[i] & (~((u32)1 << imm));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvbitclri_d (__m256i a, imm0_63 imm)

+

Synopsis

+
__m256i __lasx_xvbitclri_d (__m256i a, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvbitclri.d xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Clear the bit specified by imm from 64-bit elements in a, save the result in dst.

+

Examples

+
__m256i __lasx_xvbitclri_d( __m256i{ 0xffffffffffffffff, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, 1)
+= 0xfffffffffffffffd 0x99aabbccddeeff00 0xabcdef1212341234 0xaabbaabbddeeddec
+
+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = a.dword[i] & (~((u64)1 << imm));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvbitset_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvbitset_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvbitset.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Set the bit specified by elements in b from 8-bit elements in a, save the result in dst.

+

Examples

+
__m256i __lasx_xvbitset_b(__m256i{0x0000000000000000, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabababababababab, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})
+= 0x0808080808080808 0x9dbabfdcddeeff02 0xafddef121a361a36 0xeabbeabbddefddef
+
+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = a.byte[i] | ((u8)1 << (b.byte[i] % 8));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvbitset_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvbitset_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvbitset.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Set the bit specified by elements in b from 16-bit elements in a, save the result in dst.

+

Examples

+
__m256i __lasx_xvbitset_h(__m256i{0x0000000000000000, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabababababababab, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})
+= 0x0800080008000800 0x99babbdcddeeff02 0xabddef1212361236 0xabbbabbbddeeddee
+
+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = a.half[i] | ((u16)1 << (b.half[i] % 16));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvbitset_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvbitset_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvbitset.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Set the bit specified by elements in b from 32-bit elements in a, save the result in dst.

+

Examples

+
__m256i __lasx_xvbitset_w(__m256i{0x0000000000000000, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabababababababab, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})
+= 0x0000080000000800 0x99babbccddeeff02 0xabddef1212341236 0xabbbaabbddeeddee
+
+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = a.word[i] | ((u32)1 << (b.word[i] % 32));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvbitset_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvbitset_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvbitset.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Set the bit specified by elements in b from 64-bit elements in a, save the result in dst.

+

Examples

+
__m256i __lasx_xvbitset_d(__m256i{0x0000000000000000, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabababababababab, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})
+= 0x0000080000000000 0x99aabbceddeeff00 0xabcdef1212341234 0xabbbaabbddeeddee
+
+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = a.dword[i] | ((u64)1 << (b.dword[i] % 64));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvbitseti_b (__m256i a, imm0_7 imm)

+

Synopsis

+
__m256i __lasx_xvbitseti_b (__m256i a, imm0_7 imm)
+#include <lasxintrin.h>
+Instruction: xvbitseti.b xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Set the bit specified by imm from 8-bit elements in a, save the result in dst.

+

Examples

+
__m256i __lasx_xvbitseti_b( __m256i{ 0x0000000000000000, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, 1)
+= 0x0202020202020202 0x9baabbcedfeeff02 0xabcfef1212361236 0xaabbaabbdfeedfee
+
+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = a.byte[i] | ((u8)1 << imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvbitseti_h (__m256i a, imm0_15 imm)

+

Synopsis

+
__m256i __lasx_xvbitseti_h (__m256i a, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvbitseti.h xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Set the bit specified by imm from 16-bit elements in a, save the result in dst.

+

Examples

+
__m256i __lasx_xvbitseti_h( __m256i{ 0x0000000000000000, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, 1)
+= 0x0002000200020002 0x99aabbceddeeff02 0xabcfef1212361236 0xaabbaabbddeeddee
+
+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = a.half[i] | ((u16)1 << imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvbitseti_w (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvbitseti_w (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvbitseti.w xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Set the bit specified by imm from 32-bit elements in a, save the result in dst.

+

Examples

+
__m256i __lasx_xvbitseti_w( __m256i{ 0x0000000000000000, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, 1)
+= 0x0000000200000002 0x99aabbceddeeff02 0xabcdef1212341236 0xaabbaabbddeeddee
+
+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = a.word[i] | ((u32)1 << imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvbitseti_d (__m256i a, imm0_63 imm)

+

Synopsis

+
__m256i __lasx_xvbitseti_d (__m256i a, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvbitseti.d xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Set the bit specified by imm from 64-bit elements in a, save the result in dst.

+

Examples

+
__m256i __lasx_xvbitseti_d( __m256i{ 0x0000000000000000, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, 1)
+= 0x0000000000000002 0x99aabbccddeeff02 0xabcdef1212341236 0xaabbaabbddeeddee
+
+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = a.dword[i] | ((u64)1 << imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvbitrev_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvbitrev_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvbitrev.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Toggle the bit specified by elements in b from 8-bit elements in a, save the result in dst.

+

Examples

+
__m256i __lasx_xvbitrev_b(__m256i{0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabababababababab, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})
+= 0x0707070707070707 0x9dbabfdcd5ecf702 0xafddeb021a361a36 0xeabaeaba9def9def
+
+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = a.byte[i] ^ ((u8)1 << (b.byte[i] % 8));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvbitrev_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvbitrev_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvbitrev.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Toggle the bit specified by elements in b from 16-bit elements in a, save the result in dst.

+

Examples

+
__m256i __lasx_xvbitrev_h(__m256i{0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabababababababab, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})
+= 0x070f070f070f070f 0x99babbdcddecff02 0xabddef0212361236 0xabbbabbbdceedcee
+
+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = a.half[i] ^ ((u16)1 << (b.half[i] % 16));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvbitrev_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvbitrev_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvbitrev.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Toggle the bit specified by elements in b from 32-bit elements in a, save the result in dst.

+

Examples

+
__m256i __lasx_xvbitrev_w(__m256i{0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabababababababab, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})
+= 0x0f0f070f0f0f070f 0x99babbccddeeff02 0xabddef1212341236 0xabbbaabbdceeddee
+
+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = a.word[i] ^ ((u32)1 << (b.word[i] % 32));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvbitrev_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvbitrev_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvbitrev.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Toggle the bit specified by elements in b from 64-bit elements in a, save the result in dst.

+

Examples

+
__m256i __lasx_xvbitrev_d(__m256i{0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabababababababab, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})
+= 0x0f0f070f0f0f0f0f 0x99aabbceddeeff00 0xabcdef1012341234 0xabbbaabbddeeddee
+
+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = a.dword[i] ^ ((u64)1 << (b.dword[i] % 64));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvbitrevi_b (__m256i a, imm0_7 imm)

+

Synopsis

+
__m256i __lasx_xvbitrevi_b (__m256i a, imm0_7 imm)
+#include <lasxintrin.h>
+Instruction: xvbitrevi.b xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Toggle the bit specified by imm from 8-bit elements in a, save the result in dst.

+

Examples

+
__m256i __lasx_xvbitrevi_b( __m256i{ 0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, 1)
+= 0x0d0d0d0d0d0d0d0d 0x9ba8b9cedfecfd02 0xa9cfed1010361036 0xa8b9a8b9dfecdfec
+
+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = a.byte[i] ^ ((u8)1 << imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvbitrevi_h (__m256i a, imm0_15 imm)

+

Synopsis

+
__m256i __lasx_xvbitrevi_h (__m256i a, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvbitrevi.h xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Toggle the bit specified by imm from 16-bit elements in a, save the result in dst.

+

Examples

+
__m256i __lasx_xvbitrevi_h( __m256i{ 0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, 1)
+= 0x0f0d0f0d0f0d0f0d 0x99a8bbceddecff02 0xabcfef1012361236 0xaab9aab9ddecddec
+
+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = a.half[i] ^ ((u16)1 << imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvbitrevi_w (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvbitrevi_w (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvbitrevi.w xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Toggle the bit specified by imm from 32-bit elements in a, save the result in dst.

+

Examples

+
__m256i __lasx_xvbitrevi_w( __m256i{ 0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, 1)
+= 0x0f0f0f0d0f0f0f0d 0x99aabbceddeeff02 0xabcdef1012341236 0xaabbaab9ddeeddec
+
+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = a.word[i] ^ ((u32)1 << imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvbitrevi_d (__m256i a, imm0_63 imm)

+

Synopsis

+
__m256i __lasx_xvbitrevi_d (__m256i a, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvbitrevi.d xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Toggle the bit specified by imm from 64-bit elements in a, save the result in dst.

+

Examples

+
__m256i __lasx_xvbitrevi_d( __m256i{ 0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, 1)
+= 0x0f0f0f0f0f0f0f0d 0x99aabbccddeeff02 0xabcdef1212341236 0xaabbaabbddeeddec
+
+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = a.dword[i] ^ ((u64)1 << imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvclo_b (__m256i a)

+

Synopsis

+
__m256i __lasx_xvclo_b (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvclo.b xr, xr
+CPU Flags: LASX
+
+

Description

+

Count leading ones of 8-bit elements in a.

+

Examples

+
__m256i __lasx_xvclo_b(__m256i{ 0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee})
+= 0x0000000000000001 0x0101010202030800 0x0102030000000000 0x0101010102030203
+
+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = clo(a.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvclo_h (__m256i a)

+

Synopsis

+
__m256i __lasx_xvclo_h (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvclo.h xr, xr
+CPU Flags: LASX
+
+

Description

+

Count leading ones of 16-bit elements in a.

+

Examples

+
__m256i __lasx_xvclo_h(__m256i{ 0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee})
+= 0x0000000000000000 0x0001000100020008 0x0001000300000000 0x0001000100020002
+
+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = clo(a.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvclo_w (__m256i a)

+

Synopsis

+
__m256i __lasx_xvclo_w (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvclo.w xr, xr
+CPU Flags: LASX
+
+

Description

+

Count leading ones of 32-bit elements in a.

+

Examples

+
__m256i __lasx_xvclo_w(__m256i{ 0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee})
+= 0x0000000000000000 0x0000000100000002 0x0000000100000000 0x0000000100000002
+
+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = clo(a.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvclo_d (__m256i a)

+

Synopsis

+
__m256i __lasx_xvclo_d (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvclo.d xr, xr
+CPU Flags: LASX
+
+

Description

+

Count leading ones of 64-bit elements in a.

+

Examples

+
__m256i __lasx_xvclo_d(__m256i{ 0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee})
+= 0x0000000000000000 0x0000000000000001 0x0000000000000001 0x0000000000000001
+
+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = clo(a.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvclz_b (__m256i a)

+

Synopsis

+
__m256i __lasx_xvclz_b (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvclz.b xr, xr
+CPU Flags: LASX
+
+

Description

+

Count leading zeros of 8-bit elements in a.

+

Examples

+
__m256i __lasx_xvclz_b(__m256i{ 0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee})
+= 0x0302020101010100 0x0000000000000008 0x0000000303020302 0x0000000000000000
+
+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = clz(a.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvclz_h (__m256i a)

+

Synopsis

+
__m256i __lasx_xvclz_h (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvclz.h xr, xr
+CPU Flags: LASX
+
+

Description

+

Count leading zeros of 16-bit elements in a.

+

Examples

+
__m256i __lasx_xvclz_h(__m256i{ 0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee})
+= 0x0003000200010001 0x0000000000000000 0x0000000000030003 0x0000000000000000
+
+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = clz(a.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvclz_w (__m256i a)

+

Synopsis

+
__m256i __lasx_xvclz_w (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvclz.w xr, xr
+CPU Flags: LASX
+
+

Description

+

Count leading zeros of 32-bit elements in a.

+

Examples

+
__m256i __lasx_xvclz_w(__m256i{ 0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee})
+= 0x0000000300000001 0x0000000000000000 0x0000000000000003 0x0000000000000000
+
+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = clz(a.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvclz_d (__m256i a)

+

Synopsis

+
__m256i __lasx_xvclz_d (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvclz.d xr, xr
+CPU Flags: LASX
+
+

Description

+

Count leading zeros of 64-bit elements in a.

+

Examples

+
__m256i __lasx_xvclz_d(__m256i{ 0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee})
+= 0x0000000000000003 0x0000000000000000 0x0000000000000000 0x0000000000000000
+
+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = clz(a.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvpcnt_b (__m256i a)

+

Synopsis

+
__m256i __lasx_xvpcnt_b (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvpcnt.b xr, xr
+CPU Flags: LASX
+
+

Description

+

Count the number of ones (population, popcount) in 8-bit elements in a.

+

Examples

+
__m256i __lasx_xvpcnt_b(__m256i{ 0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee})
+= 0x0202040204040602 0x0404060406060800 0x0505070202030203 0x0406040606060606
+
+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = popcount(a.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvpcnt_h (__m256i a)

+

Synopsis

+
__m256i __lasx_xvpcnt_h (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvpcnt.h xr, xr
+CPU Flags: LASX
+
+

Description

+

Count the number of ones (population, popcount) in 16-bit elements in a.

+

Examples

+
__m256i __lasx_xvpcnt_h(__m256i{ 0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee})
+= 0x0004000600080008 0x0008000a000c0008 0x000a000900050005 0x000a000a000c000c
+
+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = popcount(a.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvpcnt_w (__m256i a)

+

Synopsis

+
__m256i __lasx_xvpcnt_w (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvpcnt.w xr, xr
+CPU Flags: LASX
+
+

Description

+

Count the number of ones (population, popcount) in 32-bit elements in a.

+

Examples

+
__m256i __lasx_xvpcnt_w(__m256i{ 0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee})
+= 0x0000000a00000010 0x0000001200000014 0x000000130000000a 0x0000001400000018
+
+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = popcount(a.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvpcnt_d (__m256i a)

+

Synopsis

+
__m256i __lasx_xvpcnt_d (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvpcnt.d xr, xr
+CPU Flags: LASX
+
+

Description

+

Count the number of ones (population, popcount) in 64-bit elements in a.

+

Examples

+
__m256i __lasx_xvpcnt_d(__m256i{ 0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee})
+= 0x000000000000001a 0x0000000000000026 0x000000000000001d 0x000000000000002c
+
+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = popcount(a.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+ +
+
+ +
+
+ +
+ +
+ +
+ + + + « Previous + + + Next » + + +
+ + + + + + + + diff --git a/lasx/branch/index.html b/lasx/branch/index.html new file mode 100644 index 00000000..77ee7cf8 --- /dev/null +++ b/lasx/branch/index.html @@ -0,0 +1,709 @@ + + + + + + + + Branch - Unofficial LoongArch Intrinsics Guide + + + + + + + + + + + + + + +
+ + +
+ +
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Branch

+

int __lasx_xbz_v (__m256i a)

+

Synopsis

+
int __lasx_xbz_v (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvseteqz.v fcc, xr; bcnez
+CPU Flags: LASX
+
+

Description

+

Expected to be used in branches: branch if the whole vector a equals zero.

+

Operation

+
dst = a.qword[0] == 0 && a.qword[1] == 0;
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A6000N/A2
3C5000N/A2
+

int __lasx_xbnz_v (__m256i a)

+

Synopsis

+
int __lasx_xbnz_v (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvsetnez.v fcc, xr; bcnez
+CPU Flags: LASX
+
+

Description

+

Expected to be used in branches: branch if the whole vector a is non-zero.

+

Operation

+
dst = a.qword[0] != 0 || a.qword[1] != 0;
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A6000N/A2
3C5000N/A2
+

int __lasx_xbz_b (__m256i a)

+

Synopsis

+
int __lasx_xbz_b (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvsetanyeqz.b fcc, xr; bcnez
+CPU Flags: LASX
+
+

Description

+

Expected to be used in branches: branch if any 8-bit element in a equals zero.

+

Operation

+
dst = 0;
+for (int i = 0; i < 32; i++) {
+  if (a.byte[i] == 0) {
+    dst = 1;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A6000N/A2
3C5000N/A2
+

int __lasx_xbz_h (__m256i a)

+

Synopsis

+
int __lasx_xbz_h (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvsetanyeqz.h fcc, xr; bcnez
+CPU Flags: LASX
+
+

Description

+

Expected to be used in branches: branch if any 16-bit element in a equals zero.

+

Operation

+
dst = 0;
+for (int i = 0; i < 16; i++) {
+  if (a.half[i] == 0) {
+    dst = 1;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A6000N/A2
3C5000N/A2
+

int __lasx_xbz_w (__m256i a)

+

Synopsis

+
int __lasx_xbz_w (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvsetanyeqz.w fcc, xr; bcnez
+CPU Flags: LASX
+
+

Description

+

Expected to be used in branches: branch if any 32-bit element in a equals zero.

+

Operation

+
dst = 0;
+for (int i = 0; i < 8; i++) {
+  if (a.word[i] == 0) {
+    dst = 1;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A6000N/A2
3C5000N/A2
+

int __lasx_xbz_d (__m256i a)

+

Synopsis

+
int __lasx_xbz_d (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvsetanyeqz.d fcc, xr; bcnez
+CPU Flags: LASX
+
+

Description

+

Expected to be used in branches: branch if any 64-bit element in a equals zero.

+

Operation

+
dst = 0;
+for (int i = 0; i < 4; i++) {
+  if (a.dword[i] == 0) {
+    dst = 1;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A6000N/A2
3C5000N/A2
+

int __lasx_xbnz_b (__m256i a)

+

Synopsis

+
int __lasx_xbnz_b (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvsetallnez.b fcc, xr; bcnez
+CPU Flags: LASX
+
+

Description

+

Expected to be used in branches: branch if all 8-bit elements in a are non-zero.

+

Operation

+
dst = 1;
+for (int i = 0; i < 32; i++) {
+  if (a.byte[i] == 0) {
+    dst = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A6000N/A2
3C5000N/A2
+

int __lasx_xbnz_h (__m256i a)

+

Synopsis

+
int __lasx_xbnz_h (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvsetallnez.h fcc, xr; bcnez
+CPU Flags: LASX
+
+

Description

+

Expected to be used in branches: branch if all 16-bit elements in a are non-zero.

+

Operation

+
dst = 1;
+for (int i = 0; i < 16; i++) {
+  if (a.half[i] == 0) {
+    dst = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A6000N/A2
3C5000N/A2
+

int __lasx_xbnz_w (__m256i a)

+

Synopsis

+
int __lasx_xbnz_w (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvsetallnez.w fcc, xr; bcnez
+CPU Flags: LASX
+
+

Description

+

Expected to be used in branches: branch if all 32-bit elements in a are non-zero.

+

Operation

+
dst = 1;
+for (int i = 0; i < 8; i++) {
+  if (a.word[i] == 0) {
+    dst = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A6000N/A2
3C5000N/A2
+

int __lasx_xbnz_d (__m256i a)

+

Synopsis

+
int __lasx_xbnz_d (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvsetallnez.d fcc, xr; bcnez
+CPU Flags: LASX
+
+

Description

+

Expected to be used in branches: branch if all 64-bit elements in a are non-zero.

+

Operation

+
dst = 1;
+for (int i = 0; i < 4; i++) {
+  if (a.dword[i] == 0) {
+    dst = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A6000N/A2
3C5000N/A2
+ +
+
+ +
+
+ +
+ +
+ +
+ + + + « Previous + + + Next » + + +
+ + + + + + + + diff --git a/lasx/float_comparison/index.html b/lasx/float_comparison/index.html new file mode 100644 index 00000000..a69fb159 --- /dev/null +++ b/lasx/float_comparison/index.html @@ -0,0 +1,2443 @@ + + + + + + + + Floating Point Comparison - Unofficial LoongArch Intrinsics Guide + + + + + + + + + + + + + + +
+ + +
+ +
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Floating Point Comparison

+

__m256i __lasx_xvfcmp_caf_s (__m256 a, __m256 b)

+

Synopsis

+
__m256i __lasx_xvfcmp_caf_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.caf.s xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if AF(Always False), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (fp_compare_caf(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_caf_d (__m256d a, __m256d b)

+

Synopsis

+
__m256i __lasx_xvfcmp_caf_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.caf.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if AF(Always False), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_caf(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_ceq_s (__m256 a, __m256 b)

+

Synopsis

+
__m256i __lasx_xvfcmp_ceq_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.ceq.s xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if EQ(Equal), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (fp_compare_ceq(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_ceq_d (__m256d a, __m256d b)

+

Synopsis

+
__m256i __lasx_xvfcmp_ceq_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.ceq.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if EQ(Equal), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_ceq(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_cle_s (__m256 a, __m256 b)

+

Synopsis

+
__m256i __lasx_xvfcmp_cle_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.cle.s xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if LE(Less than or Equal), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (fp_compare_cle(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_cle_d (__m256d a, __m256d b)

+

Synopsis

+
__m256i __lasx_xvfcmp_cle_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.cle.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if LE(Less than or Equal), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_cle(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_clt_s (__m256 a, __m256 b)

+

Synopsis

+
__m256i __lasx_xvfcmp_clt_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.clt.s xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if LT(Less than), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (fp_compare_clt(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_clt_d (__m256d a, __m256d b)

+

Synopsis

+
__m256i __lasx_xvfcmp_clt_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.clt.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if LT(Less than), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_clt(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_cne_s (__m256 a, __m256 b)

+

Synopsis

+
__m256i __lasx_xvfcmp_cne_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.cne.s xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if NE(Not Equal), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (fp_compare_cne(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_cne_d (__m256d a, __m256d b)

+

Synopsis

+
__m256i __lasx_xvfcmp_cne_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.cne.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if NE(Not Equal), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_cne(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_cor_s (__m256 a, __m256 b)

+

Synopsis

+
__m256i __lasx_xvfcmp_cor_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.cor.s xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if OR(Ordered), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (fp_compare_cor(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_cor_d (__m256d a, __m256d b)

+

Synopsis

+
__m256i __lasx_xvfcmp_cor_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.cor.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if OR(Ordered), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_cor(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_cueq_s (__m256 a, __m256 b)

+

Synopsis

+
__m256i __lasx_xvfcmp_cueq_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.cueq.s xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if UEQ(Unordered or Equal), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (fp_compare_cueq(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_cueq_d (__m256d a, __m256d b)

+

Synopsis

+
__m256i __lasx_xvfcmp_cueq_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.cueq.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if UEQ(Unordered or Equal), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_cueq(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_cule_s (__m256 a, __m256 b)

+

Synopsis

+
__m256i __lasx_xvfcmp_cule_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.cule.s xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if ULE(Unordered, or Less than or Equal), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (fp_compare_cule(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_cule_d (__m256d a, __m256d b)

+

Synopsis

+
__m256i __lasx_xvfcmp_cule_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.cule.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if ULE(Unordered, or Less than or Equal), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_cule(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_cult_s (__m256 a, __m256 b)

+

Synopsis

+
__m256i __lasx_xvfcmp_cult_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.cult.s xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if ULT(Unordered or Less than), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (fp_compare_cult(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_cult_d (__m256d a, __m256d b)

+

Synopsis

+
__m256i __lasx_xvfcmp_cult_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.cult.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if ULT(Unordered or Less than), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_cult(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_cun_s (__m256 a, __m256 b)

+

Synopsis

+
__m256i __lasx_xvfcmp_cun_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.cun.s xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if UN(Unordered), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (fp_compare_cun(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_cun_d (__m256d a, __m256d b)

+

Synopsis

+
__m256i __lasx_xvfcmp_cun_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.cun.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if UN(Unordered), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_cun(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_cune_s (__m256 a, __m256 b)

+

Synopsis

+
__m256i __lasx_xvfcmp_cune_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.cune.s xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if UNE(Unordered or Not Equal), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (fp_compare_cune(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_cune_d (__m256d a, __m256d b)

+

Synopsis

+
__m256i __lasx_xvfcmp_cune_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.cune.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if UNE(Unordered or Not Equal), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_cune(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_saf_s (__m256 a, __m256 b)

+

Synopsis

+
__m256i __lasx_xvfcmp_saf_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.saf.s xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if AF(Always False), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (fp_compare_saf(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_saf_d (__m256d a, __m256d b)

+

Synopsis

+
__m256i __lasx_xvfcmp_saf_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.saf.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if AF(Always False), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_saf(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_seq_s (__m256 a, __m256 b)

+

Synopsis

+
__m256i __lasx_xvfcmp_seq_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.seq.s xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if EQ(Equal), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (fp_compare_seq(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_seq_d (__m256d a, __m256d b)

+

Synopsis

+
__m256i __lasx_xvfcmp_seq_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.seq.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if EQ(Equal), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_seq(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_sle_s (__m256 a, __m256 b)

+

Synopsis

+
__m256i __lasx_xvfcmp_sle_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.sle.s xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if LE(Less than or Equal), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (fp_compare_sle(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_sle_d (__m256d a, __m256d b)

+

Synopsis

+
__m256i __lasx_xvfcmp_sle_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.sle.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if LE(Less than or Equal), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_sle(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_slt_s (__m256 a, __m256 b)

+

Synopsis

+
__m256i __lasx_xvfcmp_slt_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.slt.s xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if LT(Less than), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (fp_compare_slt(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_slt_d (__m256d a, __m256d b)

+

Synopsis

+
__m256i __lasx_xvfcmp_slt_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.slt.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if LT(Less than), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_slt(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_sne_s (__m256 a, __m256 b)

+

Synopsis

+
__m256i __lasx_xvfcmp_sne_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.sne.s xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if NE(Not Equal), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (fp_compare_sne(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_sne_d (__m256d a, __m256d b)

+

Synopsis

+
__m256i __lasx_xvfcmp_sne_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.sne.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if NE(Not Equal), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_sne(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_sor_s (__m256 a, __m256 b)

+

Synopsis

+
__m256i __lasx_xvfcmp_sor_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.sor.s xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if OR(Ordered), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (fp_compare_sor(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_sor_d (__m256d a, __m256d b)

+

Synopsis

+
__m256i __lasx_xvfcmp_sor_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.sor.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if OR(Ordered), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_sor(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_sueq_s (__m256 a, __m256 b)

+

Synopsis

+
__m256i __lasx_xvfcmp_sueq_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.sueq.s xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if UEQ(Unordered or Equal), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (fp_compare_sueq(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_sueq_d (__m256d a, __m256d b)

+

Synopsis

+
__m256i __lasx_xvfcmp_sueq_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.sueq.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if UEQ(Unordered or Equal), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_sueq(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_sule_s (__m256 a, __m256 b)

+

Synopsis

+
__m256i __lasx_xvfcmp_sule_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.sule.s xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if ULE(Unordered, or Less than or Equal), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (fp_compare_sule(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_sule_d (__m256d a, __m256d b)

+

Synopsis

+
__m256i __lasx_xvfcmp_sule_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.sule.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if ULE(Unordered, or Less than or Equal), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_sule(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_sult_s (__m256 a, __m256 b)

+

Synopsis

+
__m256i __lasx_xvfcmp_sult_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.sult.s xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if ULT(Unordered or Less than), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (fp_compare_sult(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_sult_d (__m256d a, __m256d b)

+

Synopsis

+
__m256i __lasx_xvfcmp_sult_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.sult.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if ULT(Unordered or Less than), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_sult(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_sun_s (__m256 a, __m256 b)

+

Synopsis

+
__m256i __lasx_xvfcmp_sun_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.sun.s xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if UN(Unordered), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (fp_compare_sun(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_sun_d (__m256d a, __m256d b)

+

Synopsis

+
__m256i __lasx_xvfcmp_sun_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.sun.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if UN(Unordered), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_sun(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_sune_s (__m256 a, __m256 b)

+

Synopsis

+
__m256i __lasx_xvfcmp_sune_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.sune.s xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if UNE(Unordered or Not Equal), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (fp_compare_sune(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_sune_d (__m256d a, __m256d b)

+

Synopsis

+
__m256i __lasx_xvfcmp_sune_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.sune.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if UNE(Unordered or Not Equal), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_sune(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+ +
+
+ +
+
+ +
+ +
+ +
+ + + + « Previous + + + Next » + + +
+ + + + + + + + diff --git a/lasx/float_computation/index.html b/lasx/float_computation/index.html new file mode 100644 index 00000000..f0021902 --- /dev/null +++ b/lasx/float_computation/index.html @@ -0,0 +1,1447 @@ + + + + + + + + Floating Point Computation - Unofficial LoongArch Intrinsics Guide + + + + + + + + + + + + + + +
+ + +
+ +
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Floating Point Computation

+

__m256 __lasx_xvfadd_s (__m256 a, __m256 b)

+

Synopsis

+
__m256 __lasx_xvfadd_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfadd.s xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add single precision floating point elements in a to elements in b.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.fp32[i] = a.fp32[i] + b.fp32[i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600034
3C500052
+

__m256d __lasx_xvfadd_d (__m256d a, __m256d b)

+

Synopsis

+
__m256d __lasx_xvfadd_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfadd.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add double precision floating point elements in a to elements in b.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp64[i] = a.fp64[i] + b.fp64[i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600034
3C500052
+

__m256 __lasx_xvfdiv_s (__m256 a, __m256 b)

+

Synopsis

+
__m256 __lasx_xvfdiv_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfdiv.s xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Divide single precision floating point elements in a by elements in b.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.fp32[i] = a.fp32[i] / b.fp32[i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A6000110.18(1/5.5)
3C500011, 19.50.1(1/10.5)
+

__m256d __lasx_xvfdiv_d (__m256d a, __m256d b)

+

Synopsis

+
__m256d __lasx_xvfdiv_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfdiv.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Divide double precision floating point elements in a by elements in b.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp64[i] = a.fp64[i] / b.fp64[i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A60008, 21.50.25(1/4)
3C50008, 170.08(1/12.5)
+

__m256 __lasx_xvfmax_s (__m256 a, __m256 b)

+

Synopsis

+
__m256 __lasx_xvfmax_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfmax.s xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute maximum of single precision floating point elements in a and b.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.fp32[i] = fmax(a.fp32[i], b.fp32[i]);
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256d __lasx_xvfmax_d (__m256d a, __m256d b)

+

Synopsis

+
__m256d __lasx_xvfmax_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfmax.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute maximum of double precision floating point elements in a and b.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp64[i] = fmax(a.fp64[i], b.fp64[i]);
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256 __lasx_xvfmaxa_s (__m256 a, __m256 b)

+

Synopsis

+
__m256 __lasx_xvfmaxa_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfmaxa.s xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute maximum of single precision floating point elements in a and b by magnitude.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.fp32[i] = (abs(a.fp32[i]) > abs(b.fp32[i])) ? a.fp32[i] : b.fp32[i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256d __lasx_xvfmaxa_d (__m256d a, __m256d b)

+

Synopsis

+
__m256d __lasx_xvfmaxa_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfmaxa.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute maximum of double precision floating point elements in a and b by magnitude.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp64[i] = (abs(a.fp64[i]) > abs(b.fp64[i])) ? a.fp64[i] : b.fp64[i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256 __lasx_xvfmin_s (__m256 a, __m256 b)

+

Synopsis

+
__m256 __lasx_xvfmin_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfmin.s xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute minimum of single precision floating point elements in a and b.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.fp32[i] = fmin(a.fp32[i], b.fp32[i]);
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256d __lasx_xvfmin_d (__m256d a, __m256d b)

+

Synopsis

+
__m256d __lasx_xvfmin_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfmin.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute minimum of double precision floating point elements in a and b.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp64[i] = fmin(a.fp64[i], b.fp64[i]);
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256 __lasx_xvfmina_s (__m256 a, __m256 b)

+

Synopsis

+
__m256 __lasx_xvfmina_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfmina.s xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute minimum of single precision floating point elements in a and b by magnitude.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.fp32[i] = (abs(a.fp32[i]) < abs(b.fp32[i])) ? a.fp32[i] : b.fp32[i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256d __lasx_xvfmina_d (__m256d a, __m256d b)

+

Synopsis

+
__m256d __lasx_xvfmina_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfmina.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute minimum of double precision floating point elements in a and b by magnitude.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp64[i] = (abs(a.fp64[i]) < abs(b.fp64[i])) ? a.fp64[i] : b.fp64[i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256 __lasx_xvfmul_s (__m256 a, __m256 b)

+

Synopsis

+
__m256 __lasx_xvfmul_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfmul.s xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply single precision floating point elements in a and elements in b.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.fp32[i] = a.fp32[i] * b.fp32[i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500052
+

__m256d __lasx_xvfmul_d (__m256d a, __m256d b)

+

Synopsis

+
__m256d __lasx_xvfmul_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfmul.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply double precision floating point elements in a and elements in b.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp64[i] = a.fp64[i] * b.fp64[i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500052
+

__m256 __lasx_xvfsub_s (__m256 a, __m256 b)

+

Synopsis

+
__m256 __lasx_xvfsub_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfsub.s xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Subtract single precision floating point elements in b from elements in a.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.fp32[i] = a.fp32[i] - b.fp32[i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600034
3C500052
+

__m256d __lasx_xvfsub_d (__m256d a, __m256d b)

+

Synopsis

+
__m256d __lasx_xvfsub_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfsub.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Subtract double precision floating point elements in b from elements in a.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp64[i] = a.fp64[i] - b.fp64[i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600034
3C500052
+

__m256 __lasx_xvflogb_s (__m256 a)

+

Synopsis

+
__m256 __lasx_xvflogb_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvflogb.s xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute base-2 logarithm of single precision floating point elements in a.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.fp32[i] = log2(a.fp32[i]);
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m256d __lasx_xvflogb_d (__m256d a)

+

Synopsis

+
__m256d __lasx_xvflogb_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvflogb.d xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute base-2 logarithm of double precision floating point elements in a.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp64[i] = log2(a.fp64[i]);
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m256 __lasx_xvfsqrt_s (__m256 a)

+

Synopsis

+
__m256 __lasx_xvfsqrt_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvfsqrt.s xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute square root of single precision floating point elements in a.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.fp32[i] = sqrt(a.fp32[i]);
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A6000150.08(1/12)
3C5000150.07(1/13.5)
+

__m256d __lasx_xvfsqrt_d (__m256d a)

+

Synopsis

+
__m256d __lasx_xvfsqrt_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvfsqrt.d xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute square root of double precision floating point elements in a.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp64[i] = sqrt(a.fp64[i]);
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A6000360.06(1/17.5)
3C5000360.05(1/18.5)
+

__m256 __lasx_xvfrsqrt_s (__m256 a)

+

Synopsis

+
__m256 __lasx_xvfrsqrt_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvfrsqrt.s xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute reciprocal of square root of single precision floating point elements in a.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.fp32[i] = 1.0 / sqrt(a.fp32[i]);
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A6000250.05(1/19)
3C5000250.03(1/32)
+

__m256d __lasx_xvfrsqrt_d (__m256d a)

+

Synopsis

+
__m256d __lasx_xvfrsqrt_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvfrsqrt.d xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute reciprocal of square root of double precision floating point elements in a.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp64[i] = 1.0 / sqrt(a.fp64[i]);
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A6000150.04(1/26.5)
3C5000150.04(1/27.5)
+

__m256 __lasx_xvfrecip_s (__m256 a)

+

Synopsis

+
__m256 __lasx_xvfrecip_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvfrecip.s xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute reciprocal of single precision floating point elements in a.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.fp32[i] = 1 / a.fp32[i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A6000270.18(1/5.5)
3C5000270.14(1/7)
+

__m256d __lasx_xvfrecip_d (__m256d a)

+

Synopsis

+
__m256d __lasx_xvfrecip_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvfrecip.d xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute reciprocal of double precision floating point elements in a.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp64[i] = 1 / a.fp64[i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A6000230.25(1/4)
3C5000230.08(1/12)
+

__m256 __lasx_xvfrsqrte_s (__m256 a)

+

Synopsis

+
__m256 __lasx_xvfrsqrte_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvfrsqrte.s xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute estimated reciprocal of square root of single precision floating point elements in a.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.fp32[i] = 1.0 / sqrt(a.fp32[i]); // estimated
+}
+
+

__m256d __lasx_xvfrsqrte_d (__m256d a)

+

Synopsis

+
__m256d __lasx_xvfrsqrte_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvfrsqrte.d xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute estimated reciprocal of square root of double precision floating point elements in a.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp64[i] = 1.0 / sqrt(a.fp64[i]); // estimated
+}
+
+

__m256 __lasx_xvfrecipe_s (__m256 a)

+

Synopsis

+
__m256 __lasx_xvfrecipe_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvfrecipe.s xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute estimated reciprocal of single precision floating point elements in a.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.fp32[i] = 1 / a.fp32[i]; // estimated
+}
+
+

__m256d __lasx_xvfrecipe_d (__m256d a)

+

Synopsis

+
__m256d __lasx_xvfrecipe_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvfrecipe.d xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute estimated reciprocal of double precision floating point elements in a.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp64[i] = 1 / a.fp64[i]; // estimated
+}
+
+ +
+
+ +
+
+ +
+ +
+ +
+ + + + « Previous + + + Next » + + +
+ + + + + + + + diff --git a/lasx/float_conversion/index.html b/lasx/float_conversion/index.html new file mode 100644 index 00000000..fce6577b --- /dev/null +++ b/lasx/float_conversion/index.html @@ -0,0 +1,2235 @@ + + + + + + + + Floating Point Conversion - Unofficial LoongArch Intrinsics Guide + + + + + + + + + + + + + + +
+ + +
+ +
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Floating Point Conversion

+

__m256d __lasx_xvfcvth_d_s (__m256 a)

+

Synopsis

+
__m256d __lasx_xvfcvth_d_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvfcvth.d.s xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert single precision floating point elements in higher half of a to double precision.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp64[i] = a.fp32[4 + i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500031
+

__m256d __lasx_xvfcvtl_d_s (__m256 a)

+

Synopsis

+
__m256d __lasx_xvfcvtl_d_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvfcvtl.d.s xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert single precision floating point elements in lower half of a to double precision.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp64[i] = a.fp32[i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500031
+

__m256 __lasx_xvfcvt_s_d (__m256d a, __m256d b)

+

Synopsis

+
__m256 __lasx_xvfcvt_s_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcvt.s.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert double precision floating point elements in a and b to single precision.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    dst.fp32[i] = b.fp64[i];
+  } else {
+    dst.fp32[i] = a.fp64[i - 4];
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500031
+

__m256 __lasx_xvfcvth_s_h (__m256i a)

+

Synopsis

+
__m256 __lasx_xvfcvth_s_h (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvfcvth.s.h xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert half precision floating point elements in higher half of a to single precision.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.fp32[i] = a.fp16[8 + i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500031
+

__m256 __lasx_xvfcvtl_s_h (__m256i a)

+

Synopsis

+
__m256 __lasx_xvfcvtl_s_h (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvfcvtl.s.h xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert half precision floating point elements in lower half of a to single precision.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.fp32[i] = a.fp16[i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500031
+

__m256i __lasx_xvfcvt_h_s (__m256 a, __m256 b)

+

Synopsis

+
__m256i __lasx_xvfcvt_h_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcvt.h.s xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert single precision floating point elements in a and b to half precision.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    dst.fp16[i] = b.fp32[i];
+  } else {
+    dst.fp16[i] = a.fp32[i - 8];
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500031
+

__m256d __lasx_xvffinth_d_w (__m256i a)

+

Synopsis

+
__m256d __lasx_xvffinth_d_w (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvffinth.d.w xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert 32-bit integer elements in higher part of a to double precision floating point numbers.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp64[i] = (f64)(s32)a.word[i + 4]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500051
+

__m256d __lasx_xvffintl_d_w (__m256i a)

+

Synopsis

+
__m256d __lasx_xvffintl_d_w (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvffintl.d.w xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert 32-bit integer elements in lower part of a to double precision floating point numbers.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp64[i] = (f64)(s32)a.word[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500051
+

__m256d __lasx_xvffint_d_l (__m256i a)

+

Synopsis

+
__m256d __lasx_xvffint_d_l (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvffint.d.l xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert signed 64-bit integer elements in a to double-precision floating point numbers.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp64[i] = (f64)(s64)a.dword[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m256d __lasx_xvffint_d_lu (__m256i a)

+

Synopsis

+
__m256d __lasx_xvffint_d_lu (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvffint.d.lu xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert unsigned 64-bit integer elements in a to double-precision floating point numbers.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp64[i] = (f64)(u64)a.dword[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m256 __lasx_xvffint_s_w (__m256i a)

+

Synopsis

+
__m256 __lasx_xvffint_s_w (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvffint.s.w xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert signed 32-bit integer elements in a to single-precision floating point numbers.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.fp32[i] = (f32)(s32)a.word[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m256 __lasx_xvffint_s_wu (__m256i a)

+

Synopsis

+
__m256 __lasx_xvffint_s_wu (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvffint.s.wu xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert unsigned 32-bit integer elements in a to single-precision floating point numbers.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.fp32[i] = (f32)(u32)a.word[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m256 __lasx_xvffint_s_l (__m256i a, __m256i b)

+

Synopsis

+
__m256 __lasx_xvffint_s_l (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvffint.s.l xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert 64-bit integer elements in a and b to single-precision floating point numbers.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.fp32[i] =
+      (i < 4) ? (f32)(s64)b.dword[i]
+              : (f32)(s64)a.dword[i - 4]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500051
+

__m256i __lasx_xvftintl_l_s (__m256 a)

+

Synopsis

+
__m256i __lasx_xvftintl_l_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvftintl.l.s xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert single-precision floating point elements in lower part of a to 64-bit integer, using current rounding mode specified in fcsr.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (s64)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500051
+

__m256i __lasx_xvftinth_l_s (__m256 a)

+

Synopsis

+
__m256i __lasx_xvftinth_l_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvftinth.l.s xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert single-precision floating point elements in higher part of a to 64-bit integer, using current rounding mode specified in fcsr.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (s64)a.fp32[i + 4]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500051
+

__m256i __lasx_xvftintrml_l_s (__m256 a)

+

Synopsis

+
__m256i __lasx_xvftintrml_l_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvftintrml.l.s xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert single-precision floating point elements in lower part of a to 64-bit integer, rounding towards negative infinity.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (s64)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500051
+

__m256i __lasx_xvftintrmh_l_s (__m256 a)

+

Synopsis

+
__m256i __lasx_xvftintrmh_l_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvftintrmh.l.s xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert single-precision floating point elements in higher part of a to 64-bit integer, rounding towards negative infinity.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (s64)a.fp32[i + 4]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500051
+

__m256i __lasx_xvftintrpl_l_s (__m256 a)

+

Synopsis

+
__m256i __lasx_xvftintrpl_l_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvftintrpl.l.s xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert single-precision floating point elements in lower part of a to 64-bit integer, rounding towards positive infinity.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (s64)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500051
+

__m256i __lasx_xvftintrph_l_s (__m256 a)

+

Synopsis

+
__m256i __lasx_xvftintrph_l_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvftintrph.l.s xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert single-precision floating point elements in higher part of a to 64-bit integer, rounding towards positive infinity.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (s64)a.fp32[i + 4]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500051
+

__m256i __lasx_xvftintrzl_l_s (__m256 a)

+

Synopsis

+
__m256i __lasx_xvftintrzl_l_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvftintrzl.l.s xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert single-precision floating point elements in lower part of a to 64-bit integer, rounding towards zero.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (s64)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500051
+

__m256i __lasx_xvftintrzh_l_s (__m256 a)

+

Synopsis

+
__m256i __lasx_xvftintrzh_l_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvftintrzh.l.s xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert single-precision floating point elements in higher part of a to 64-bit integer, rounding towards zero.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (s64)a.fp32[i + 4]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500051
+

__m256i __lasx_xvftintrnel_l_s (__m256 a)

+

Synopsis

+
__m256i __lasx_xvftintrnel_l_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvftintrnel.l.s xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert single-precision floating point elements in lower part of a to 64-bit integer, rounding towards nearest even.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (s64)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500051
+

__m256i __lasx_xvftintrneh_l_s (__m256 a)

+

Synopsis

+
__m256i __lasx_xvftintrneh_l_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvftintrneh.l.s xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert single-precision floating point elements in higher part of a to 64-bit integer, rounding towards nearest even.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (s64)a.fp32[i + 4]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500051
+

__m256i __lasx_xvftint_l_d (__m256d a)

+

Synopsis

+
__m256i __lasx_xvftint_l_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvftint.l.d xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert double-precision floating point elements in a to signed 64-bit integer, using current rounding mode specified in fcsr.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m256i __lasx_xvftint_w_s (__m256 a)

+

Synopsis

+
__m256i __lasx_xvftint_w_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvftint.w.s xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert single-precision floating point elements in a to signed 32-bit integer, using current rounding mode specified in fcsr.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m256i __lasx_xvftintrm_l_d (__m256d a)

+

Synopsis

+
__m256i __lasx_xvftintrm_l_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvftintrm.l.d xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert double-precision floating point elements in a to signed 64-bit integer, rounding towards negative infinity.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m256i __lasx_xvftintrm_w_s (__m256 a)

+

Synopsis

+
__m256i __lasx_xvftintrm_w_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvftintrm.w.s xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert single-precision floating point elements in a to signed 32-bit integer, rounding towards negative infinity.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m256i __lasx_xvftintrp_l_d (__m256d a)

+

Synopsis

+
__m256i __lasx_xvftintrp_l_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvftintrp.l.d xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert double-precision floating point elements in a to signed 64-bit integer, rounding towards positive infinity.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m256i __lasx_xvftintrp_w_s (__m256 a)

+

Synopsis

+
__m256i __lasx_xvftintrp_w_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvftintrp.w.s xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert single-precision floating point elements in a to signed 32-bit integer, rounding towards positive infinity.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m256i __lasx_xvftintrz_l_d (__m256d a)

+

Synopsis

+
__m256i __lasx_xvftintrz_l_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvftintrz.l.d xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert double-precision floating point elements in a to signed 64-bit integer, rounding towards zero.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m256i __lasx_xvftintrz_w_s (__m256 a)

+

Synopsis

+
__m256i __lasx_xvftintrz_w_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvftintrz.w.s xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert single-precision floating point elements in a to signed 32-bit integer, rounding towards zero.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m256i __lasx_xvftintrne_l_d (__m256d a)

+

Synopsis

+
__m256i __lasx_xvftintrne_l_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvftintrne.l.d xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert double-precision floating point elements in a to signed 64-bit integer, rounding towards nearest even.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m256i __lasx_xvftintrne_w_s (__m256 a)

+

Synopsis

+
__m256i __lasx_xvftintrne_w_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvftintrne.w.s xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert single-precision floating point elements in a to signed 32-bit integer, rounding towards nearest even.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m256i __lasx_xvftint_lu_d (__m256d a)

+

Synopsis

+
__m256i __lasx_xvftint_lu_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvftint.lu.d xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert double-precision floating point elements in a to unsigned 64-bit integer, using current rounding mode specified in fcsr.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (u64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m256i __lasx_xvftint_wu_s (__m256 a)

+

Synopsis

+
__m256i __lasx_xvftint_wu_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvftint.wu.s xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert single-precision floating point elements in a to unsigned 32-bit integer, using current rounding mode specified in fcsr.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (u32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m256i __lasx_xvftintrz_lu_d (__m256d a)

+

Synopsis

+
__m256i __lasx_xvftintrz_lu_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvftintrz.lu.d xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert double-precision floating point elements in a to unsigned 64-bit integer, rounding towards zero.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (u64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m256i __lasx_xvftintrz_wu_s (__m256 a)

+

Synopsis

+
__m256i __lasx_xvftintrz_wu_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvftintrz.wu.s xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert single-precision floating point elements in a to unsigned 32-bit integer, rounding towards zero.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (u32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m256i __lasx_xvftint_w_d (__m256d a, __m256d b)

+

Synopsis

+
__m256i __lasx_xvftint_w_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvftint.w.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert double-precision floating point elements in a and b to 32-bit integer, using current rounding mode specified in fcsr.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (i < 2)
+                     ? (s64)a.fp64[i]
+                     : (s64)b.fp64[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500051
+

__m256i __lasx_xvftintrm_w_d (__m256d a, __m256d b)

+

Synopsis

+
__m256i __lasx_xvftintrm_w_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvftintrm.w.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert double-precision floating point elements in a and b to 32-bit integer, rounding towards negative infinity.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (i < 2)
+                     ? (s64)a.fp64[i]
+                     : (s64)b.fp64[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500051
+

__m256i __lasx_xvftintrp_w_d (__m256d a, __m256d b)

+

Synopsis

+
__m256i __lasx_xvftintrp_w_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvftintrp.w.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert double-precision floating point elements in a and b to 32-bit integer, rounding towards positive infinity.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (i < 2)
+                     ? (s64)a.fp64[i]
+                     : (s64)b.fp64[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500051
+

__m256i __lasx_xvftintrz_w_d (__m256d a, __m256d b)

+

Synopsis

+
__m256i __lasx_xvftintrz_w_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvftintrz.w.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert double-precision floating point elements in a and b to 32-bit integer, rounding towards zero.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (i < 2)
+                     ? (s64)a.fp64[i]
+                     : (s64)b.fp64[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500051
+

__m256i __lasx_xvftintrne_w_d (__m256d a, __m256d b)

+

Synopsis

+
__m256i __lasx_xvftintrne_w_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvftintrne.w.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert double-precision floating point elements in a and b to 32-bit integer, rounding towards nearest even.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (i < 2)
+                     ? (s64)a.fp64[i]
+                     : (s64)b.fp64[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500051
+ +
+
+ +
+
+ +
+ +
+ +
+ + + + « Previous + + + Next » + + +
+ + + + + + + + diff --git a/lasx/float_misc/index.html b/lasx/float_misc/index.html new file mode 100644 index 00000000..c00faa6d --- /dev/null +++ b/lasx/float_misc/index.html @@ -0,0 +1,775 @@ + + + + + + + + Floating Point Misc - Unofficial LoongArch Intrinsics Guide + + + + + + + + + + + + + + +
+ + +
+ +
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Floating Point Misc

+

__m256i __lasx_xvfclass_d (__m256d a)

+

Synopsis

+
__m256i __lasx_xvfclass_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvfclass.d xr, xr
+CPU Flags: LASX
+
+

Description

+

Classify each double-precision floating point element in a.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = fp_classify(a.fp64[i]);
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfclass_s (__m256 a)

+

Synopsis

+
__m256i __lasx_xvfclass_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvfclass.s xr, xr
+CPU Flags: LASX
+
+

Description

+

Classify each single-precision floating point element in a.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = fp_classify(a.fp32[i]);
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256 __lasx_xvfrint_s (__m256 a)

+

Synopsis

+
__m256 __lasx_xvfrint_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvfrint.s xr, xr
+CPU Flags: LASX
+
+

Description

+

Round single-precision floating point elements in a to integers, using current rounding mode specified in fcsr, and store as floating point numbers.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m256d __lasx_xvfrint_d (__m256d a)

+

Synopsis

+
__m256d __lasx_xvfrint_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvfrint.d xr, xr
+CPU Flags: LASX
+
+

Description

+

Round double-precision floating point elements in a to integers, using current rounding mode specified in fcsr, and store as floating point numbers.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m256 __lasx_xvfrintrp_s (__m256 a)

+

Synopsis

+
__m256 __lasx_xvfrintrp_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvfrintrp.s xr, xr
+CPU Flags: LASX
+
+

Description

+

Round single-precision floating point elements in a to integers, rounding towards positive infinity, and store as floating point numbers.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m256d __lasx_xvfrintrp_d (__m256d a)

+

Synopsis

+
__m256d __lasx_xvfrintrp_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvfrintrp.d xr, xr
+CPU Flags: LASX
+
+

Description

+

Round double-precision floating point elements in a to integers, rounding towards positive infinity, and store as floating point numbers.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m256 __lasx_xvfrintrm_s (__m256 a)

+

Synopsis

+
__m256 __lasx_xvfrintrm_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvfrintrm.s xr, xr
+CPU Flags: LASX
+
+

Description

+

Round single-precision floating point elements in a to integers, rounding towards negative infinity, and store as floating point numbers.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m256d __lasx_xvfrintrm_d (__m256d a)

+

Synopsis

+
__m256d __lasx_xvfrintrm_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvfrintrm.d xr, xr
+CPU Flags: LASX
+
+

Description

+

Round double-precision floating point elements in a to integers, rounding towards negative infinity, and store as floating point numbers.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m256 __lasx_xvfrintrz_s (__m256 a)

+

Synopsis

+
__m256 __lasx_xvfrintrz_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvfrintrz.s xr, xr
+CPU Flags: LASX
+
+

Description

+

Round single-precision floating point elements in a to integers, rounding towards zero, and store as floating point numbers.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m256d __lasx_xvfrintrz_d (__m256d a)

+

Synopsis

+
__m256d __lasx_xvfrintrz_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvfrintrz.d xr, xr
+CPU Flags: LASX
+
+

Description

+

Round double-precision floating point elements in a to integers, rounding towards zero, and store as floating point numbers.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m256 __lasx_xvfrintrne_s (__m256 a)

+

Synopsis

+
__m256 __lasx_xvfrintrne_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvfrintrne.s xr, xr
+CPU Flags: LASX
+
+

Description

+

Round single-precision floating point elements in a to integers, rounding towards nearest even, and store as floating point numbers.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m256d __lasx_xvfrintrne_d (__m256d a)

+

Synopsis

+
__m256d __lasx_xvfrintrne_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvfrintrne.d xr, xr
+CPU Flags: LASX
+
+

Description

+

Round double-precision floating point elements in a to integers, rounding towards nearest even, and store as floating point numbers.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+ +
+
+ +
+
+ +
+ +
+ +
+ + + + « Previous + + + Next » + + +
+ + + + + + + + diff --git a/lasx/fma/index.html b/lasx/fma/index.html new file mode 100644 index 00000000..285180c4 --- /dev/null +++ b/lasx/fma/index.html @@ -0,0 +1,583 @@ + + + + + + + + Fused Multiply-Add - Unofficial LoongArch Intrinsics Guide + + + + + + + + + + + + + + +
+ + +
+ +
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Fused Multiply-Add

+

__m256d __lasx_xvfmadd_d (__m256d a, __m256d b, __m256d c)

+

Synopsis

+
__m256d __lasx_xvfmadd_d (__m256d a, __m256d b, __m256d c)
+#include <lasxintrin.h>
+Instruction: xvfmadd.d xr, xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in a and b, accumulate to elements in c and store the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp64[i] = a.fp64[i] * b.fp64[i] + c.fp64[i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500052
+

__m256 __lasx_xvfmadd_s (__m256 a, __m256 b, __m256 c)

+

Synopsis

+
__m256 __lasx_xvfmadd_s (__m256 a, __m256 b, __m256 c)
+#include <lasxintrin.h>
+Instruction: xvfmadd.s xr, xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in a and b, accumulate to elements in c and store the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.fp32[i] = a.fp32[i] * b.fp32[i] + c.fp32[i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500052
+

__m256d __lasx_xvfmsub_d (__m256d a, __m256d b, __m256d c)

+

Synopsis

+
__m256d __lasx_xvfmsub_d (__m256d a, __m256d b, __m256d c)
+#include <lasxintrin.h>
+Instruction: xvfmsub.d xr, xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in a and b, subtract elements in c and store the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp64[i] = a.fp64[i] * b.fp64[i] - c.fp64[i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500052
+

__m256 __lasx_xvfmsub_s (__m256 a, __m256 b, __m256 c)

+

Synopsis

+
__m256 __lasx_xvfmsub_s (__m256 a, __m256 b, __m256 c)
+#include <lasxintrin.h>
+Instruction: xvfmsub.s xr, xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in a and b, subtract elements in c and store the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.fp32[i] = a.fp32[i] * b.fp32[i] - c.fp32[i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500052
+

__m256d __lasx_xvfnmadd_d (__m256d a, __m256d b, __m256d c)

+

Synopsis

+
__m256d __lasx_xvfnmadd_d (__m256d a, __m256d b, __m256d c)
+#include <lasxintrin.h>
+Instruction: xvfnmadd.d xr, xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in a and b, accumulate to elements in c and store the negated result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp64[i] = -(a.fp64[i] * b.fp64[i] + c.fp64[i]);
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500052
+

__m256 __lasx_xvfnmadd_s (__m256 a, __m256 b, __m256 c)

+

Synopsis

+
__m256 __lasx_xvfnmadd_s (__m256 a, __m256 b, __m256 c)
+#include <lasxintrin.h>
+Instruction: xvfnmadd.s xr, xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in a and b, accumulate to elements in c and store the negated result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.fp32[i] = -(a.fp32[i] * b.fp32[i] + c.fp32[i]);
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500052
+

__m256d __lasx_xvfnmsub_d (__m256d a, __m256d b, __m256d c)

+

Synopsis

+
__m256d __lasx_xvfnmsub_d (__m256d a, __m256d b, __m256d c)
+#include <lasxintrin.h>
+Instruction: xvfnmsub.d xr, xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in a and b, subtract elements in c and store the negated result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp64[i] = -(a.fp64[i] * b.fp64[i] - c.fp64[i]);
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500052
+

__m256 __lasx_xvfnmsub_s (__m256 a, __m256 b, __m256 c)

+

Synopsis

+
__m256 __lasx_xvfnmsub_s (__m256 a, __m256 b, __m256 c)
+#include <lasxintrin.h>
+Instruction: xvfnmsub.s xr, xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in a and b, subtract elements in c and store the negated result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.fp32[i] = -(a.fp32[i] * b.fp32[i] - c.fp32[i]);
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500052
+ +
+
+ +
+
+ +
+ +
+ +
+ + + + « Previous + + + Next » + + +
+ + + + + + + + diff --git a/lasx/integer_comparison/index.html b/lasx/integer_comparison/index.html new file mode 100644 index 00000000..89463740 --- /dev/null +++ b/lasx/integer_comparison/index.html @@ -0,0 +1,2159 @@ + + + + + + + + Integer Comparison - Unofficial LoongArch Intrinsics Guide + + + + + + + + + + + + + + +
+ + +
+ +
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Integer Comparison

+

__m256i __lasx_xvseq_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvseq_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvseq.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare the 8-bit elements in a and b, store all-ones to dst if equal, zero otherwise.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = (a.byte[i] == b.byte[i]) ? 0xFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvseq_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvseq_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvseq.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare the 16-bit elements in a and b, store all-ones to dst if equal, zero otherwise.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (a.half[i] == b.half[i]) ? 0xFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvseq_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvseq_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvseq.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare the 32-bit elements in a and b, store all-ones to dst if equal, zero otherwise.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (a.word[i] == b.word[i]) ? 0xFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvseq_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvseq_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvseq.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare the 64-bit elements in a and b, store all-ones to dst if equal, zero otherwise.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (a.dword[i] == b.dword[i]) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvseqi_b (__m256i a, imm_n16_15 imm)

+

Synopsis

+
__m256i __lasx_xvseqi_b (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvseqi.b xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compare the 8-bit elements in a and imm, store all-ones to dst if equal, zero otherwise.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = ((s8)a.byte[i] == imm) ? 0xFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvseqi_h (__m256i a, imm_n16_15 imm)

+

Synopsis

+
__m256i __lasx_xvseqi_h (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvseqi.h xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compare the 16-bit elements in a and imm, store all-ones to dst if equal, zero otherwise.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = ((s16)a.half[i] == imm) ? 0xFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvseqi_w (__m256i a, imm_n16_15 imm)

+

Synopsis

+
__m256i __lasx_xvseqi_w (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvseqi.w xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compare the 32-bit elements in a and imm, store all-ones to dst if equal, zero otherwise.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = ((s32)a.word[i] == imm) ? 0xFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvseqi_d (__m256i a, imm_n16_15 imm)

+

Synopsis

+
__m256i __lasx_xvseqi_d (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvseqi.d xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compare the 64-bit elements in a and imm, store all-ones to dst if equal, zero otherwise.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = ((s64)a.dword[i] == imm) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvslt_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvslt_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvslt.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare the signed 8-bit elements in a and b, store all-ones to dst if corresponding element in a is less than b, zero otherwise.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = ((s8)a.byte[i] < (s8)b.byte[i]) ? 0xFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvslt_bu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvslt_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvslt.bu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare the unsigned 8-bit elements in a and b, store all-ones to dst if corresponding element in a is less than b, zero otherwise.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = ((u8)a.byte[i] < (u8)b.byte[i]) ? 0xFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvslt_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvslt_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvslt.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare the signed 16-bit elements in a and b, store all-ones to dst if corresponding element in a is less than b, zero otherwise.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = ((s16)a.half[i] < (s16)b.half[i]) ? 0xFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvslt_hu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvslt_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvslt.hu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare the unsigned 16-bit elements in a and b, store all-ones to dst if corresponding element in a is less than b, zero otherwise.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = ((u16)a.half[i] < (u16)b.half[i]) ? 0xFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvslt_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvslt_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvslt.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare the signed 32-bit elements in a and b, store all-ones to dst if corresponding element in a is less than b, zero otherwise.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = ((s32)a.word[i] < (s32)b.word[i]) ? 0xFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvslt_wu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvslt_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvslt.wu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare the unsigned 32-bit elements in a and b, store all-ones to dst if corresponding element in a is less than b, zero otherwise.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = ((u32)a.word[i] < (u32)b.word[i]) ? 0xFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvslt_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvslt_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvslt.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare the signed 64-bit elements in a and b, store all-ones to dst if corresponding element in a is less than b, zero otherwise.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = ((s64)a.dword[i] < (s64)b.dword[i]) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvslt_du (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvslt_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvslt.du xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare the unsigned 64-bit elements in a and b, store all-ones to dst if corresponding element in a is less than b, zero otherwise.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = ((u64)a.dword[i] < (u64)b.dword[i]) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvslti_b (__m256i a, imm_n16_15 imm)

+

Synopsis

+
__m256i __lasx_xvslti_b (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvslti.b xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compare the signed 8-bit elements in a and imm, store all-ones to dst if corresponding element in a is less than imm, zero otherwise.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = ((s8)a.byte[i] < imm) ? 0xFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvslti_bu (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvslti_bu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvslti.bu xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compare the unsigned 8-bit elements in a and imm, store all-ones to dst if corresponding element in a is less than imm, zero otherwise.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = ((u8)a.byte[i] < imm) ? 0xFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvslti_h (__m256i a, imm_n16_15 imm)

+

Synopsis

+
__m256i __lasx_xvslti_h (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvslti.h xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compare the signed 16-bit elements in a and imm, store all-ones to dst if corresponding element in a is less than imm, zero otherwise.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = ((s16)a.half[i] < imm) ? 0xFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvslti_hu (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvslti_hu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvslti.hu xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compare the unsigned 16-bit elements in a and imm, store all-ones to dst if corresponding element in a is less than imm, zero otherwise.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = ((u16)a.half[i] < imm) ? 0xFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvslti_w (__m256i a, imm_n16_15 imm)

+

Synopsis

+
__m256i __lasx_xvslti_w (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvslti.w xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compare the signed 32-bit elements in a and imm, store all-ones to dst if corresponding element in a is less than imm, zero otherwise.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = ((s32)a.word[i] < imm) ? 0xFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvslti_wu (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvslti_wu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvslti.wu xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compare the unsigned 32-bit elements in a and imm, store all-ones to dst if corresponding element in a is less than imm, zero otherwise.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = ((u32)a.word[i] < imm) ? 0xFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvslti_d (__m256i a, imm_n16_15 imm)

+

Synopsis

+
__m256i __lasx_xvslti_d (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvslti.d xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compare the signed 64-bit elements in a and imm, store all-ones to dst if corresponding element in a is less than imm, zero otherwise.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = ((s64)a.dword[i] < imm) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvslti_du (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvslti_du (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvslti.du xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compare the unsigned 64-bit elements in a and imm, store all-ones to dst if corresponding element in a is less than imm, zero otherwise.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = ((u64)a.dword[i] < imm) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvsle_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsle_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsle.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare the signed 8-bit elements in a and b, store all-ones to dst if corresponding element in a is less than or equal to b, zero otherwise.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = ((s8)a.byte[i] <= (s8)b.byte[i]) ? 0xFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsle_bu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsle_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsle.bu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare the unsigned 8-bit elements in a and b, store all-ones to dst if corresponding element in a is less than or equal to b, zero otherwise.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = ((u8)a.byte[i] <= (u8)b.byte[i]) ? 0xFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsle_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsle_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsle.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare the signed 16-bit elements in a and b, store all-ones to dst if corresponding element in a is less than or equal to b, zero otherwise.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = ((s16)a.half[i] <= (s16)b.half[i]) ? 0xFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsle_hu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsle_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsle.hu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare the unsigned 16-bit elements in a and b, store all-ones to dst if corresponding element in a is less than or equal to b, zero otherwise.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = ((u16)a.half[i] <= (u16)b.half[i]) ? 0xFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsle_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsle_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsle.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare the signed 32-bit elements in a and b, store all-ones to dst if corresponding element in a is less than or equal to b, zero otherwise.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = ((s32)a.word[i] <= (s32)b.word[i]) ? 0xFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsle_wu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsle_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsle.wu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare the unsigned 32-bit elements in a and b, store all-ones to dst if corresponding element in a is less than or equal to b, zero otherwise.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = ((u32)a.word[i] <= (u32)b.word[i]) ? 0xFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsle_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsle_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsle.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare the signed 64-bit elements in a and b, store all-ones to dst if corresponding element in a is less than or equal to b, zero otherwise.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = ((s64)a.dword[i] <= (s64)b.dword[i]) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvsle_du (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsle_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsle.du xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare the unsigned 64-bit elements in a and b, store all-ones to dst if corresponding element in a is less than or equal to b, zero otherwise.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = ((u64)a.dword[i] <= (u64)b.dword[i]) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvslei_b (__m256i a, imm_n16_15 imm)

+

Synopsis

+
__m256i __lasx_xvslei_b (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvslei.b xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compare the signed 8-bit elements in a and imm, store all-ones to dst if corresponding element in a is less than or equal to imm, zero otherwise.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = ((s8)a.byte[i] <= imm) ? 0xFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvslei_bu (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvslei_bu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvslei.bu xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compare the unsigned 8-bit elements in a and imm, store all-ones to dst if corresponding element in a is less than or equal to imm, zero otherwise.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = ((u8)a.byte[i] <= imm) ? 0xFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvslei_h (__m256i a, imm_n16_15 imm)

+

Synopsis

+
__m256i __lasx_xvslei_h (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvslei.h xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compare the signed 16-bit elements in a and imm, store all-ones to dst if corresponding element in a is less than or equal to imm, zero otherwise.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = ((s16)a.half[i] <= imm) ? 0xFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvslei_hu (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvslei_hu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvslei.hu xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compare the unsigned 16-bit elements in a and imm, store all-ones to dst if corresponding element in a is less than or equal to imm, zero otherwise.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = ((u16)a.half[i] <= imm) ? 0xFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvslei_w (__m256i a, imm_n16_15 imm)

+

Synopsis

+
__m256i __lasx_xvslei_w (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvslei.w xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compare the signed 32-bit elements in a and imm, store all-ones to dst if corresponding element in a is less than or equal to imm, zero otherwise.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = ((s32)a.word[i] <= imm) ? 0xFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvslei_wu (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvslei_wu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvslei.wu xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compare the unsigned 32-bit elements in a and imm, store all-ones to dst if corresponding element in a is less than or equal to imm, zero otherwise.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = ((u32)a.word[i] <= imm) ? 0xFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvslei_d (__m256i a, imm_n16_15 imm)

+

Synopsis

+
__m256i __lasx_xvslei_d (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvslei.d xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compare the signed 64-bit elements in a and imm, store all-ones to dst if corresponding element in a is less than or equal to imm, zero otherwise.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = ((s64)a.dword[i] <= imm) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvslei_du (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvslei_du (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvslei.du xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compare the unsigned 64-bit elements in a and imm, store all-ones to dst if corresponding element in a is less than or equal to imm, zero otherwise.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = ((u64)a.dword[i] <= imm) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+ +
+
+ +
+
+ +
+ +
+ +
+ + + + « Previous + + + Next » + + +
+ + + + + + + + diff --git a/lasx/integer_computation/index.html b/lasx/integer_computation/index.html new file mode 100644 index 00000000..c12b71cb --- /dev/null +++ b/lasx/integer_computation/index.html @@ -0,0 +1,11911 @@ + + + + + + + + Integer Computation - Unofficial LoongArch Intrinsics Guide + + + + + + + + + + + + + + +
+ + +
+ +
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Integer Computation

+

__m256i __lasx_xvadd_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvadd_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvadd.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add 8-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = a.byte[i] + b.byte[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvadd_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvadd_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvadd.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add 16-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = a.half[i] + b.half[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvadd_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvadd_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvadd.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add 32-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = a.word[i] + b.word[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvadd_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvadd_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvadd.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add 64-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = a.dword[i] + b.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvadd_q (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvadd_q (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvadd.q xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add 128-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.qword[i] = a.qword[i] + b.qword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m256i __lasx_xvabsd_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvabsd_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvabsd.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute absolute difference of signed 8-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = ((s8)a.byte[i] > (s8)b.byte[i]) ? (a.byte[i] - b.byte[i])
+                                                : (b.byte[i] - a.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvabsd_bu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvabsd_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvabsd.bu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute absolute difference of unsigned 8-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = ((u8)a.byte[i] > (u8)b.byte[i]) ? (a.byte[i] - b.byte[i])
+                                                : (b.byte[i] - a.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvabsd_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvabsd_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvabsd.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute absolute difference of signed 16-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = ((s16)a.half[i] > (s16)b.half[i]) ? (a.half[i] - b.half[i])
+                                                  : (b.half[i] - a.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvabsd_hu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvabsd_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvabsd.hu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute absolute difference of unsigned 16-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = ((u16)a.half[i] > (u16)b.half[i]) ? (a.half[i] - b.half[i])
+                                                  : (b.half[i] - a.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvabsd_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvabsd_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvabsd.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute absolute difference of signed 32-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = ((s32)a.word[i] > (s32)b.word[i]) ? (a.word[i] - b.word[i])
+                                                  : (b.word[i] - a.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvabsd_wu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvabsd_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvabsd.wu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute absolute difference of unsigned 32-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = ((u32)a.word[i] > (u32)b.word[i]) ? (a.word[i] - b.word[i])
+                                                  : (b.word[i] - a.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvabsd_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvabsd_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvabsd.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute absolute difference of signed 64-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = ((s64)a.dword[i] > (s64)b.dword[i])
+                     ? (a.dword[i] - b.dword[i])
+                     : (b.dword[i] - a.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvabsd_du (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvabsd_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvabsd.du xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute absolute difference of unsigned 64-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = ((u64)a.dword[i] > (u64)b.dword[i])
+                     ? (a.dword[i] - b.dword[i])
+                     : (b.dword[i] - a.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvadda_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvadda_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvadda.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add the absolute values of the signed 8-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = abs((s8)a.byte[i]) + abs((s8)b.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m256i __lasx_xvadda_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvadda_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvadda.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add the absolute values of the signed 16-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = abs((s16)a.half[i]) + abs((s16)b.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m256i __lasx_xvadda_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvadda_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvadda.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add the absolute values of the signed 32-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = abs((s32)a.word[i]) + abs((s32)b.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m256i __lasx_xvadda_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvadda_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvadda.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add the absolute values of the signed 64-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = abs((s64)a.dword[i]) + abs((s64)b.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m256i __lasx_xvaddi_bu (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvaddi_bu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvaddi.bu xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Add 8-bit elements in a and imm, save the result in dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = a.byte[i] + imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvaddi_hu (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvaddi_hu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvaddi.hu xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Add 16-bit elements in a and imm, save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = a.half[i] + imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvaddi_wu (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvaddi_wu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvaddi.wu xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Add 32-bit elements in a and imm, save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = a.word[i] + imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvaddi_du (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvaddi_du (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvaddi.du xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Add 64-bit elements in a and imm, save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = a.dword[i] + imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvaddwev_h_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvaddwev_h_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwev.h.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add even-positioned signed 8-bit elements in a and signed elements in b, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (s16)(s8)a.byte[2 * i] + (s16)(s8)b.byte[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvaddwev_h_bu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvaddwev_h_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwev.h.bu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add even-positioned unsigned 8-bit elements in a and unsigned elements in b, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (u16)(u8)a.byte[2 * i] + (u16)(u8)b.byte[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvaddwev_h_bu_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvaddwev_h_bu_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwev.h.bu.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add even-positioned unsigned 8-bit elements in a and signed elements in b, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (u16)(u8)a.byte[2 * i] + (s16)(s8)b.byte[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvaddwev_w_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvaddwev_w_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwev.w.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add even-positioned signed 16-bit elements in a and signed elements in b, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (s32)(s16)a.half[2 * i] + (s32)(s16)b.half[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvaddwev_w_hu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvaddwev_w_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwev.w.hu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add even-positioned unsigned 16-bit elements in a and unsigned elements in b, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (u32)(u16)a.half[2 * i] + (u32)(u16)b.half[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvaddwev_w_hu_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvaddwev_w_hu_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwev.w.hu.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add even-positioned unsigned 16-bit elements in a and signed elements in b, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (u32)(u16)a.half[2 * i] + (s32)(s16)b.half[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvaddwev_d_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvaddwev_d_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwev.d.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add even-positioned signed 32-bit elements in a and signed elements in b, save the 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (s64)(s32)a.word[2 * i] + (s64)(s32)b.word[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvaddwev_d_wu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvaddwev_d_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwev.d.wu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add even-positioned unsigned 32-bit elements in a and unsigned elements in b, save the 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 * i] + (u64)(u32)b.word[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvaddwev_d_wu_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvaddwev_d_wu_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwev.d.wu.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add even-positioned unsigned 32-bit elements in a and signed elements in b, save the 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 * i] + (s64)(s32)b.word[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvaddwev_q_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvaddwev_q_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwev.q.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add even-positioned signed 64-bit elements in a and signed elements in b, save the 128-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.qword[i] = (s128)(s64)a.dword[2 * i] + (s128)(s64)b.dword[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m256i __lasx_xvaddwev_q_du (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvaddwev_q_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwev.q.du xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add even-positioned unsigned 64-bit elements in a and unsigned elements in b, save the 128-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 * i] + (u128)(u64)b.dword[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m256i __lasx_xvaddwev_q_du_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvaddwev_q_du_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwev.q.du.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add even-positioned unsigned 64-bit elements in a and signed elements in b, save the 128-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 * i] + (s128)(s64)b.dword[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m256i __lasx_xvaddwod_h_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvaddwod_h_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwod.h.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add odd-positioned signed 8-bit elements in a and signed elements in b, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (s16)(s8)a.byte[2 * i + 1] + (s16)(s8)b.byte[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvaddwod_h_bu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvaddwod_h_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwod.h.bu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add odd-positioned unsigned 8-bit elements in a and unsigned elements in b, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] + (u16)(u8)b.byte[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvaddwod_h_bu_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvaddwod_h_bu_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwod.h.bu.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add odd-positioned unsigned 8-bit elements in a and signed elements in b, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] + (s16)(s8)b.byte[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvaddwod_w_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvaddwod_w_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwod.w.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add odd-positioned signed 16-bit elements in a and signed elements in b, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (s32)(s16)a.half[2 * i + 1] + (s32)(s16)b.half[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvaddwod_w_hu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvaddwod_w_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwod.w.hu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add odd-positioned unsigned 16-bit elements in a and unsigned elements in b, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (u32)(u16)a.half[2 * i + 1] + (u32)(u16)b.half[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvaddwod_w_hu_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvaddwod_w_hu_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwod.w.hu.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add odd-positioned unsigned 16-bit elements in a and signed elements in b, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (u32)(u16)a.half[2 * i + 1] + (s32)(s16)b.half[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvaddwod_d_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvaddwod_d_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwod.d.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add odd-positioned signed 32-bit elements in a and signed elements in b, save the 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (s64)(s32)a.word[2 * i + 1] + (s64)(s32)b.word[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvaddwod_d_wu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvaddwod_d_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwod.d.wu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add odd-positioned unsigned 32-bit elements in a and unsigned elements in b, save the 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] + (u64)(u32)b.word[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvaddwod_d_wu_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvaddwod_d_wu_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwod.d.wu.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add odd-positioned unsigned 32-bit elements in a and signed elements in b, save the 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] + (s64)(s32)b.word[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvaddwod_q_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvaddwod_q_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwod.q.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add odd-positioned signed 64-bit elements in a and signed elements in b, save the 128-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] + (s128)(s64)b.dword[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m256i __lasx_xvaddwod_q_du (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvaddwod_q_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwod.q.du xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add odd-positioned unsigned 64-bit elements in a and unsigned elements in b, save the 128-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] + (u128)(u64)b.dword[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m256i __lasx_xvaddwod_q_du_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvaddwod_q_du_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwod.q.du.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add odd-positioned unsigned 64-bit elements in a and signed elements in b, save the 128-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] + (s128)(s64)b.dword[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m256i __lasx_xvavg_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvavg_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvavg.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute the average (rounded towards negative infinity) of signed 8-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = ((s8)a.byte[i] >> 1) + ((s8)b.byte[i] >> 1) +
+                ((a.byte[i] & b.byte[i]) & 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvavg_bu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvavg_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvavg.bu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute the average (rounded towards negative infinity) of unsigned 8-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = ((u8)a.byte[i] >> 1) + ((u8)b.byte[i] >> 1) +
+                ((a.byte[i] & b.byte[i]) & 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvavg_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvavg_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvavg.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute the average (rounded towards negative infinity) of signed 16-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = ((s16)a.half[i] >> 1) + ((s16)b.half[i] >> 1) +
+                ((a.half[i] & b.half[i]) & 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvavg_hu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvavg_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvavg.hu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute the average (rounded towards negative infinity) of unsigned 16-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = ((u16)a.half[i] >> 1) + ((u16)b.half[i] >> 1) +
+                ((a.half[i] & b.half[i]) & 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvavg_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvavg_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvavg.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute the average (rounded towards negative infinity) of signed 32-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = ((s32)a.word[i] >> 1) + ((s32)b.word[i] >> 1) +
+                ((a.word[i] & b.word[i]) & 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvavg_wu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvavg_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvavg.wu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute the average (rounded towards negative infinity) of unsigned 32-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = ((u32)a.word[i] >> 1) + ((u32)b.word[i] >> 1) +
+                ((a.word[i] & b.word[i]) & 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvavg_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvavg_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvavg.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute the average (rounded towards negative infinity) of signed 64-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = ((s64)a.dword[i] >> 1) + ((s64)b.dword[i] >> 1) +
+                 ((a.dword[i] & b.dword[i]) & 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvavg_du (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvavg_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvavg.du xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute the average (rounded towards negative infinity) of unsigned 64-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = ((u64)a.dword[i] >> 1) + ((u64)b.dword[i] >> 1) +
+                 ((a.dword[i] & b.dword[i]) & 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvavgr_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvavgr_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvavgr.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute the average (rounded towards positive infinity) of signed 8-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = ((s8)a.byte[i] >> 1) + ((s8)b.byte[i] >> 1) +
+                ((a.byte[i] | b.byte[i]) & 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvavgr_bu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvavgr_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvavgr.bu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute the average (rounded towards positive infinity) of unsigned 8-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = ((u8)a.byte[i] >> 1) + ((u8)b.byte[i] >> 1) +
+                ((a.byte[i] | b.byte[i]) & 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvavgr_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvavgr_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvavgr.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute the average (rounded towards positive infinity) of signed 16-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = ((s16)a.half[i] >> 1) + ((s16)b.half[i] >> 1) +
+                ((a.half[i] | b.half[i]) & 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvavgr_hu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvavgr_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvavgr.hu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute the average (rounded towards positive infinity) of unsigned 16-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = ((u16)a.half[i] >> 1) + ((u16)b.half[i] >> 1) +
+                ((a.half[i] | b.half[i]) & 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvavgr_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvavgr_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvavgr.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute the average (rounded towards positive infinity) of signed 32-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = ((s32)a.word[i] >> 1) + ((s32)b.word[i] >> 1) +
+                ((a.word[i] | b.word[i]) & 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvavgr_wu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvavgr_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvavgr.wu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute the average (rounded towards positive infinity) of unsigned 32-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = ((u32)a.word[i] >> 1) + ((u32)b.word[i] >> 1) +
+                ((a.word[i] | b.word[i]) & 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvavgr_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvavgr_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvavgr.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute the average (rounded towards positive infinity) of signed 64-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = ((s64)a.dword[i] >> 1) + ((s64)b.dword[i] >> 1) +
+                 ((a.dword[i] | b.dword[i]) & 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvavgr_du (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvavgr_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvavgr.du xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute the average (rounded towards positive infinity) of unsigned 64-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = ((u64)a.dword[i] >> 1) + ((u64)b.dword[i] >> 1) +
+                 ((a.dword[i] | b.dword[i]) & 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvdiv_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvdiv_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvdiv.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Divide signed 8-bit elements in a by signed 8-bit elements in b, save the result in dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = (b.byte[i] == 0) ? 0 : ((s8)a.byte[i] / (s8)b.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 29, 32 | 0.06(1/15.5)
3C5000 | 32, 36 | 0.05(1/20.5)
+

__m256i __lasx_xvdiv_bu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvdiv_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvdiv.bu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Divide unsigned 8-bit elements in a by unsigned 8-bit elements in b, save the result in dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = (b.byte[i] == 0) ? 0 : ((u8)a.byte[i] / (u8)b.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 29, 33 | 0.06(1/16.5)
3C5000 | 29, 36 | 0.05(1/20.5)
+

__m256i __lasx_xvdiv_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvdiv_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvdiv.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Divide signed 16-bit elements in a by signed 16-bit elements in b, save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (b.half[i] == 0) ? 0 : ((s16)a.half[i] / (s16)b.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 17 | 0.12(1/8.5)
3C5000 | 21.5, 22 | 0.08(1/13)
+

__m256i __lasx_xvdiv_hu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvdiv_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvdiv.hu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Divide unsigned 16-bit elements in a by unsigned 16-bit elements in b, save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (b.half[i] == 0) ? 0 : ((u16)a.half[i] / (u16)b.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 17, 22 | 0.11(1/9)
3C5000 | 17, 21.5 | 0.07(1/15)
+

__m256i __lasx_xvdiv_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvdiv_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvdiv.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Divide signed 32-bit elements in a by signed 32-bit elements in b, save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (b.word[i] == 0) ? 0 : ((s32)a.word[i] / (s32)b.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 11 | 0.18(1/5.5)
3C5000 | 11, 17.5 | 0.09(1/11.5)
+

__m256i __lasx_xvdiv_wu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvdiv_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvdiv.wu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Divide unsigned 32-bit elements in a by unsigned 32-bit elements in b, save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (b.word[i] == 0) ? 0 : ((u32)a.word[i] / (u32)b.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 11 | 0.18(1/5.5)
3C5000 | 11, 17.5 | 0.07(1/15)
+

__m256i __lasx_xvdiv_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvdiv_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvdiv.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Divide signed 64-bit elements in a by signed 64-bit elements in b, save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (b.dword[i] == 0) ? 0 : ((s64)a.dword[i] / (s64)b.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 8 | 0.25(1/4)
3C5000 | 8, 18.5 | 0.11(1/9)
+

__m256i __lasx_xvdiv_du (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvdiv_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvdiv.du xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Divide unsigned 64-bit elements in a by unsigned 64-bit elements in b, save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (b.dword[i] == 0) ? 0 : ((u64)a.dword[i] / (u64)b.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 8 | 0.25(1/4)
3C5000 | 8, 18.5 | 0.11(1/9)
+

__m256i __lasx_xvhaddw_h_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvhaddw_h_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvhaddw.h.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add odd-positioned signed 8-bit elements in a to even-positioned signed 8-bit elements in b, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (s16)(s8)a.byte[2 * i + 1] + (s16)(s8)b.byte[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvhaddw_hu_bu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvhaddw_hu_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvhaddw.hu.bu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add odd-positioned unsigned 8-bit elements in a to even-positioned unsigned 8-bit elements in b, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] + (u16)(u8)b.byte[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvhaddw_w_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvhaddw_w_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvhaddw.w.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add odd-positioned signed 16-bit elements in a to even-positioned signed 16-bit elements in b, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (s32)(s16)a.half[2 * i + 1] + (s32)(s16)b.half[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvhaddw_wu_hu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvhaddw_wu_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvhaddw.wu.hu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add odd-positioned unsigned 16-bit elements in a to even-positioned unsigned 16-bit elements in b, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (u32)(u16)a.half[2 * i + 1] + (u32)(u16)b.half[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvhaddw_d_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvhaddw_d_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvhaddw.d.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add odd-positioned signed 32-bit elements in a to even-positioned signed 32-bit elements in b, save the 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (s64)(s32)a.word[2 * i + 1] + (s64)(s32)b.word[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvhaddw_du_wu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvhaddw_du_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvhaddw.du.wu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add odd-positioned unsigned 32-bit elements in a to even-positioned unsigned 32-bit elements in b, save the 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] + (u64)(u32)b.word[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvhaddw_q_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvhaddw_q_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvhaddw.q.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add odd-positioned signed 64-bit elements in a to even-positioned signed 64-bit elements in b, save the 128-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] + (s128)(s64)b.dword[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m256i __lasx_xvhaddw_qu_du (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvhaddw_qu_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvhaddw.qu.du xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add odd-positioned unsigned 64-bit elements in a to even-positioned unsigned 64-bit elements in b, save the 128-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] + (u128)(u64)b.dword[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m256i __lasx_xvhsubw_h_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvhsubw_h_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvhsubw.h.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Subtract even-positioned signed 8-bit elements in b from odd-positioned signed 8-bit elements in a, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (s16)(s8)a.byte[2 * i + 1] - (s16)(s8)b.byte[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvhsubw_hu_bu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvhsubw_hu_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvhsubw.hu.bu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Subtract even-positioned unsigned 8-bit elements in b from odd-positioned unsigned 8-bit elements in a, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] - (u16)(u8)b.byte[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvhsubw_w_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvhsubw_w_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvhsubw.w.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Subtract even-positioned signed 16-bit elements in b from odd-positioned signed 16-bit elements in a, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (s32)(s16)a.half[2 * i + 1] - (s32)(s16)b.half[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvhsubw_wu_hu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvhsubw_wu_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvhsubw.wu.hu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Subtract even-positioned unsigned 16-bit elements in b from odd-positioned unsigned 16-bit elements in a, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (u32)(u16)a.half[2 * i + 1] - (u32)(u16)b.half[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvhsubw_d_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvhsubw_d_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvhsubw.d.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Subtract even-positioned signed 32-bit elements in b from odd-positioned signed 32-bit elements in a, save the 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (s64)(s32)a.word[2 * i + 1] - (s64)(s32)b.word[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvhsubw_du_wu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvhsubw_du_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvhsubw.du.wu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Subtract even-positioned unsigned 32-bit elements in b from odd-positioned unsigned 32-bit elements in a, save the 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] - (u64)(u32)b.word[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvhsubw_q_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvhsubw_q_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvhsubw.q.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Subtract even-positioned signed 64-bit elements in b from odd-positioned signed 64-bit elements in a, save the 128-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] - (s128)(s64)b.dword[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m256i __lasx_xvhsubw_qu_du (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvhsubw_qu_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvhsubw.qu.du xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Subtract even-positioned unsigned 64-bit elements in b from odd-positioned unsigned 64-bit elements in a, save the 128-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] - (u128)(u64)b.dword[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m256i __lasx_xvmadd_b (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvmadd_b (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmadd.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply 8-bit elements in b and c, add to elements in a, save the result in dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = b.byte[i] * c.byte[i] + a.byte[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m256i __lasx_xvmadd_h (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvmadd_h (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmadd.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply 16-bit elements in b and c, add to elements in a, save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = b.half[i] * c.half[i] + a.half[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m256i __lasx_xvmadd_w (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvmadd_w (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmadd.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply 32-bit elements in b and c, add to elements in a, save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = b.word[i] * c.word[i] + a.word[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m256i __lasx_xvmadd_d (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvmadd_d (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmadd.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply 64-bit elements in b and c, add to elements in a, save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = b.dword[i] * c.dword[i] + a.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m256i __lasx_xvmaddwev_h_b (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvmaddwev_h_b (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwev.h.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply even-positioned signed 8-bit elements in b and signed elements in c, add to 16-bit elements in a.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] =
+      (s16)(s8)b.byte[2 * i] * (s16)(s8)c.byte[2 * i] + (s16)a.half[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m256i __lasx_xvmaddwev_h_bu (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvmaddwev_h_bu (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwev.h.bu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply even-positioned unsigned 8-bit elements in b and unsigned elements in c, add to 16-bit elements in a.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] =
+      (u16)(u8)b.byte[2 * i] * (u16)(u8)c.byte[2 * i] + (u16)a.half[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m256i __lasx_xvmaddwev_h_bu_b (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvmaddwev_h_bu_b (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwev.h.bu.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply even-positioned unsigned 8-bit elements in b and signed elements in c, add to 16-bit elements in a.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] =
+      (u16)(u8)b.byte[2 * i] * (s16)(s8)c.byte[2 * i] + (s16)a.half[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m256i __lasx_xvmaddwev_w_h (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvmaddwev_w_h (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwev.w.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply even-positioned signed 16-bit elements in b and signed elements in c, add to 32-bit elements in a.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] =
+      (s32)(s16)b.half[2 * i] * (s32)(s16)c.half[2 * i] + (s32)a.word[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m256i __lasx_xvmaddwev_w_hu (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvmaddwev_w_hu (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwev.w.hu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply even-positioned unsigned 16-bit elements in b and unsigned elements in c, add to 32-bit elements in a.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] =
+      (u32)(u16)b.half[2 * i] * (u32)(u16)c.half[2 * i] + (u32)a.word[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m256i __lasx_xvmaddwev_w_hu_h (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvmaddwev_w_hu_h (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwev.w.hu.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply even-positioned unsigned 16-bit elements in b and signed elements in c, add to 32-bit elements in a.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] =
+      (u32)(u16)b.half[2 * i] * (s32)(s16)c.half[2 * i] + (s32)a.word[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m256i __lasx_xvmaddwev_d_w (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvmaddwev_d_w (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwev.d.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply even-positioned signed 32-bit elements in b and signed elements in c, add to 64-bit elements in a.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] =
+      (s64)(s32)b.word[2 * i] * (s64)(s32)c.word[2 * i] + (s64)a.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m256i __lasx_xvmaddwev_d_wu (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvmaddwev_d_wu (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwev.d.wu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply even-positioned unsigned 32-bit elements in b and unsigned elements in c, add to 64-bit elements in a.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] =
+      (u64)(u32)b.word[2 * i] * (u64)(u32)c.word[2 * i] + (u64)a.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m256i __lasx_xvmaddwev_d_wu_w (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvmaddwev_d_wu_w (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwev.d.wu.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply even-positioned unsigned 32-bit elements in b and signed elements in c, add to 64-bit elements in a.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] =
+      (u64)(u32)b.word[2 * i] * (s64)(s32)c.word[2 * i] + (s64)a.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m256i __lasx_xvmaddwev_q_d (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvmaddwev_q_d (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwev.q.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply even-positioned signed 64-bit elements in b and signed elements in c, add to 128-bit elements in a.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.qword[i] =
+      (s128)(s64)b.dword[2 * i] * (s128)(s64)c.dword[2 * i] + (s128)a.qword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 7 | 1.14
3C5000 | 7 | 1.14
+

__m256i __lasx_xvmaddwev_q_du (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvmaddwev_q_du (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwev.q.du xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply even-positioned unsigned 64-bit elements in b and unsigned elements in c, add to 128-bit elements in a.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.qword[i] =
+      (u128)(u64)b.dword[2 * i] * (u128)(u64)c.dword[2 * i] + (u128)a.qword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 7 | 1.14
3C5000 | 7 | 1.14
+

__m256i __lasx_xvmaddwev_q_du_d (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvmaddwev_q_du_d (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwev.q.du.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply even-positioned unsigned 64-bit elements in b and signed elements in c, add to 128-bit elements in a.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.qword[i] =
+      (u128)(u64)b.dword[2 * i] * (s128)(s64)c.dword[2 * i] + (s128)a.qword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 7 | 1.14
3C5000 | 7 | 1.14
+

__m256i __lasx_xvmaddwod_h_b (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvmaddwod_h_b (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwod.h.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply odd-positioned signed 8-bit elements in b and signed elements in c, add to 16-bit elements in a.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] =
+      (s16)(s8)b.byte[2 * i + 1] * (s16)(s8)c.byte[2 * i + 1] + (s16)a.half[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m256i __lasx_xvmaddwod_h_bu (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvmaddwod_h_bu (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwod.h.bu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply odd-positioned unsigned 8-bit elements in b and unsigned elements in c, add to 16-bit elements in a.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] =
+      (u16)(u8)b.byte[2 * i + 1] * (u16)(u8)c.byte[2 * i + 1] + (u16)a.half[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m256i __lasx_xvmaddwod_h_bu_b (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvmaddwod_h_bu_b (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwod.h.bu.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply odd-positioned unsigned 8-bit elements in b and signed elements in c, add to 16-bit elements in a.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] =
+      (u16)(u8)b.byte[2 * i + 1] * (s16)(s8)c.byte[2 * i + 1] + (s16)a.half[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m256i __lasx_xvmaddwod_w_h (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvmaddwod_w_h (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwod.w.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply odd-positioned signed 16-bit elements in b and signed elements in c, add to 32-bit elements in a.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (s32)(s16)b.half[2 * i + 1] * (s32)(s16)c.half[2 * i + 1] +
+                (s32)a.word[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m256i __lasx_xvmaddwod_w_hu (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvmaddwod_w_hu (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwod.w.hu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply odd-positioned unsigned 16-bit elements in b and unsigned elements in c, add to 32-bit elements in a.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (u32)(u16)b.half[2 * i + 1] * (u32)(u16)c.half[2 * i + 1] +
+                (u32)a.word[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m256i __lasx_xvmaddwod_w_hu_h (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvmaddwod_w_hu_h (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwod.w.hu.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply odd-positioned unsigned 16-bit elements in b and signed elements in c, add to 32-bit elements in a.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (u32)(u16)b.half[2 * i + 1] * (s32)(s16)c.half[2 * i + 1] +
+                (s32)a.word[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m256i __lasx_xvmaddwod_d_w (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvmaddwod_d_w (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwod.d.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply odd-positioned signed 32-bit elements in b and signed elements in c, add to 64-bit elements in a.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (s64)(s32)b.word[2 * i + 1] * (s64)(s32)c.word[2 * i + 1] +
+                 (s64)a.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m256i __lasx_xvmaddwod_d_wu (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvmaddwod_d_wu (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwod.d.wu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply odd-positioned unsigned 32-bit elements in b and unsigned elements in c, add to 64-bit elements in a.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (u64)(u32)b.word[2 * i + 1] * (u64)(u32)c.word[2 * i + 1] +
+                 (u64)a.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m256i __lasx_xvmaddwod_d_wu_w (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvmaddwod_d_wu_w (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwod.d.wu.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply odd-positioned unsigned 32-bit elements in b and signed elements in c, add to 64-bit elements in a.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (u64)(u32)b.word[2 * i + 1] * (s64)(s32)c.word[2 * i + 1] +
+                 (s64)a.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m256i __lasx_xvmaddwod_q_d (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvmaddwod_q_d (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwod.q.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply odd-positioned signed 64-bit elements in b and signed elements in c, add to 128-bit elements in a.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.qword[i] = (s128)(s64)b.dword[2 * i + 1] * (s128)(s64)c.dword[2 * i + 1] +
+                 (s128)a.qword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 7 | 1.14
3C5000 | 7 | 1.14
+

__m256i __lasx_xvmaddwod_q_du (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvmaddwod_q_du (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwod.q.du xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply odd-positioned unsigned 64-bit elements in b and unsigned elements in c, add to 128-bit elements in a.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.qword[i] = (u128)(u64)b.dword[2 * i + 1] * (u128)(u64)c.dword[2 * i + 1] +
+                 (u128)a.qword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 7 | 1.14
3C5000 | 7 | 1.14
+

__m256i __lasx_xvmaddwod_q_du_d (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvmaddwod_q_du_d (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwod.q.du.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply odd-positioned unsigned 64-bit elements in b and signed elements in c, add to 128-bit elements in a.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.qword[i] = (u128)(u64)b.dword[2 * i + 1] * (s128)(s64)c.dword[2 * i + 1] +
+                 (s128)a.qword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 7 | 1.14
3C5000 | 7 | 1.14
+

__m256i __lasx_xvmax_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmax_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmax.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute elementwise maximum for signed 8-bit elements in a and b.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = max((s8)a.byte[i], (s8)b.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvmax_bu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmax_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmax.bu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute elementwise maximum for unsigned 8-bit elements in a and b.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = max((u8)a.byte[i], (u8)b.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvmax_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmax_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmax.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute elementwise maximum for signed 16-bit elements in a and b.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = max((s16)a.half[i], (s16)b.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvmax_hu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmax_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmax.hu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute elementwise maximum for unsigned 16-bit elements in a and b.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = max((u16)a.half[i], (u16)b.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvmax_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmax_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmax.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute elementwise maximum for signed 32-bit elements in a and b.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = max((s32)a.word[i], (s32)b.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvmax_wu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmax_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmax.wu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute elementwise maximum for unsigned 32-bit elements in a and b.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = max((u32)a.word[i], (u32)b.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvmax_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmax_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmax.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute elementwise maximum for signed 64-bit elements in a and b.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = max((s64)a.dword[i], (s64)b.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 2 | 4
3C5000 | 2 | 2
+

__m256i __lasx_xvmax_du (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmax_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmax.du xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute elementwise maximum for unsigned 64-bit elements in a and b.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = max((u64)a.dword[i], (u64)b.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 2 | 4
3C5000 | 2 | 2
+

__m256i __lasx_xvmaxi_b (__m256i a, imm_n16_15 imm)

+

Synopsis

+
__m256i __lasx_xvmaxi_b (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvmaxi.b xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compute elementwise maximum for signed 8-bit elements in a and imm.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = max((s8)a.byte[i], (s8)imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvmaxi_bu (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvmaxi_bu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvmaxi.bu xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compute elementwise maximum for unsigned 8-bit elements in a and imm.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = max((u8)a.byte[i], (u8)imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvmaxi_h (__m256i a, imm_n16_15 imm)

+

Synopsis

+
__m256i __lasx_xvmaxi_h (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvmaxi.h xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compute elementwise maximum for signed 16-bit elements in a and imm.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = max((s16)a.half[i], (s16)imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvmaxi_hu (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvmaxi_hu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvmaxi.hu xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compute elementwise maximum for unsigned 16-bit elements in a and imm.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = max((u16)a.half[i], (u16)imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvmaxi_w (__m256i a, imm_n16_15 imm)

+

Synopsis

+
__m256i __lasx_xvmaxi_w (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvmaxi.w xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compute elementwise maximum for signed 32-bit elements in a and imm.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = max((s32)a.word[i], (s32)imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvmaxi_wu (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvmaxi_wu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvmaxi.wu xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compute elementwise maximum for unsigned 32-bit elements in a and imm.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = max((u32)a.word[i], (u32)imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvmaxi_d (__m256i a, imm_n16_15 imm)

+

Synopsis

+
__m256i __lasx_xvmaxi_d (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvmaxi.d xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compute elementwise maximum for signed 64-bit elements in a and imm.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = max((s64)a.dword[i], (s64)imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 2 | 4
3C5000 | 2 | 2
+

__m256i __lasx_xvmaxi_du (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvmaxi_du (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvmaxi.du xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compute elementwise maximum for unsigned 64-bit elements in a and imm.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = max((u64)a.dword[i], (u64)imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 2 | 4
3C5000 | 2 | 2
+

__m256i __lasx_xvmin_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmin_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmin.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute elementwise minimum for signed 8-bit elements in a and b.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = min((s8)a.byte[i], (s8)b.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvmin_bu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmin_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmin.bu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute elementwise minimum for unsigned 8-bit elements in a and b.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = min((u8)a.byte[i], (u8)b.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvmin_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmin_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmin.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute elementwise minimum for signed 16-bit elements in a and b.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = min((s16)a.half[i], (s16)b.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvmin_hu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmin_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmin.hu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute elementwise minimum for unsigned 16-bit elements in a and b.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = min((u16)a.half[i], (u16)b.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvmin_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmin_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmin.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute elementwise minimum for signed 32-bit elements in a and b.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = min((s32)a.word[i], (s32)b.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvmin_wu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmin_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmin.wu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute elementwise minimum for unsigned 32-bit elements in a and b.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = min((u32)a.word[i], (u32)b.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvmin_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmin_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmin.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute elementwise minimum for signed 64-bit elements in a and b.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = min((s64)a.dword[i], (s64)b.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 2 | 4
3C5000 | 2 | 2
+

__m256i __lasx_xvmin_du (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmin_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmin.du xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute elementwise minimum for unsigned 64-bit elements in a and b.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = min((u64)a.dword[i], (u64)b.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 2 | 4
3C5000 | 2 | 2
+

__m256i __lasx_xvmini_b (__m256i a, imm_n16_15 imm)

+

Synopsis

+
__m256i __lasx_xvmini_b (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvmini.b xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compute elementwise minimum for signed 8-bit elements in a and imm.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = min((s8)a.byte[i], (s8)imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvmini_bu (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvmini_bu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvmini.bu xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compute elementwise minimum for unsigned 8-bit elements in a and imm.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = min((u8)a.byte[i], (u8)imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvmini_h (__m256i a, imm_n16_15 imm)

+

Synopsis

+
__m256i __lasx_xvmini_h (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvmini.h xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compute elementwise minimum for signed 16-bit elements in a and imm.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = min((s16)a.half[i], (s16)imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvmini_hu (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvmini_hu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvmini.hu xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compute elementwise minimum for unsigned 16-bit elements in a and imm.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = min((u16)a.half[i], (u16)imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvmini_w (__m256i a, imm_n16_15 imm)

+

Synopsis

+
__m256i __lasx_xvmini_w (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvmini.w xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compute elementwise minimum for signed 32-bit elements in a and imm.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = min((s32)a.word[i], (s32)imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvmini_wu (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvmini_wu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvmini.wu xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compute elementwise minimum for unsigned 32-bit elements in a and imm.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = min((u32)a.word[i], (u32)imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvmini_d (__m256i a, imm_n16_15 imm)

+

Synopsis

+
__m256i __lasx_xvmini_d (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvmini.d xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compute elementwise minimum for signed 64-bit elements in a and imm.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = min((s64)a.dword[i], (s64)imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 2 | 4
3C5000 | 2 | 2
+

__m256i __lasx_xvmini_du (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvmini_du (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvmini.du xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compute elementwise minimum for unsigned 64-bit elements in a and imm.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = min((u64)a.dword[i], (u64)imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 2 | 4
3C5000 | 2 | 2
+

__m256i __lasx_xvmod_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmod_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmod.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute the remainder of dividing signed 8-bit elements in a by elements in b.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = (b.byte[i] == 0) ? 0 : ((s8)a.byte[i] % (s8)b.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 29, 41 | 0.06(1/15.5)
3C5000 | 29, 33 | 0.05(1/21.5)
+

__m256i __lasx_xvmod_bu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmod_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmod.bu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute the remainder of dividing unsigned 8-bit elements in a by elements in b.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = (b.byte[i] == 0) ? 0 : ((u8)a.byte[i] % (u8)b.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 29, 37 | 0.06(1/17.5)
3C5000 | 29, 37 | 0.05(1/22)
+

__m256i __lasx_xvmod_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmod_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmod.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute the remainder of dividing signed 16-bit elements in a by elements in b.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (b.half[i] == 0) ? 0 : ((s16)a.half[i] % (s16)b.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600017, 210.12(1/8.5)
3C500017, 210.07(1/13.5)
+

__m256i __lasx_xvmod_hu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmod_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmod.hu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute the remainder of unsigned 16-bit elements in a divided by elements in b.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (b.half[i] == 0) ? 0 : ((u16)a.half[i] % (u16)b.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600017, 250.11(1/9.5)
3C500017, 230.06(1/16)
+

__m256i __lasx_xvmod_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmod_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmod.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute the remainder of signed 32-bit elements in a divided by elements in b.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (b.word[i] == 0) ? 0 : ((s32)a.word[i] % (s32)b.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600011, 130.18(1/5.5)
3C500011, 150.07(1/13.5)
+

__m256i __lasx_xvmod_wu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmod_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmod.wu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute the remainder of unsigned 32-bit elements in a divided by elements in b.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (b.word[i] == 0) ? 0 : ((u32)a.word[i] % (u32)b.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600011, 130.18(1/5.5)
3C500011, 150.06(1/16)
+

__m256i __lasx_xvmod_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmod_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmod.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute the remainder of signed 64-bit elements in a divided by elements in b.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (b.dword[i] == 0) ? 0 : ((s64)a.dword[i] % (s64)b.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A60008, 100.25(1/4)
3C50008, 100.11(1/9.5)
+

__m256i __lasx_xvmod_du (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmod_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmod.du xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute the remainder of unsigned 64-bit elements in a divided by elements in b.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (b.dword[i] == 0) ? 0 : ((u64)a.dword[i] % (u64)b.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A60008, 100.25(1/4)
3C50008, 100.11(1/9.5)
+

__m256i __lasx_xvmsub_b (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvmsub_b (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmsub.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply 8-bit elements in b and c, negate and add elements in a, save the result in dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = -b.byte[i] * c.byte[i] + a.byte[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m256i __lasx_xvmsub_h (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvmsub_h (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmsub.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply 16-bit elements in b and c, negate and add elements in a, save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = -b.half[i] * c.half[i] + a.half[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m256i __lasx_xvmsub_w (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvmsub_w (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmsub.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply 32-bit elements in b and c, negate and add elements in a, save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = -b.word[i] * c.word[i] + a.word[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m256i __lasx_xvmsub_d (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvmsub_d (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmsub.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply 64-bit elements in b and c, negate and add elements in a, save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = -b.dword[i] * c.dword[i] + a.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m256i __lasx_xvmuh_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmuh_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmuh.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply signed 8-bit elements in a and b, save the high 8-bit result in dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = (((s16)(s8)a.byte[i] * (s16)(s8)b.byte[i])) >> 8;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m256i __lasx_xvmuh_bu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmuh_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmuh.bu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply unsigned 8-bit elements in a and b, save the high 8-bit result in dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = (((u16)(u8)a.byte[i] * (u16)(u8)b.byte[i])) >> 8;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m256i __lasx_xvmuh_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmuh_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmuh.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply signed 16-bit elements in a and b, save the high 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (((s32)(s16)a.half[i] * (s32)(s16)b.half[i])) >> 16;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m256i __lasx_xvmuh_hu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmuh_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmuh.hu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply unsigned 16-bit elements in a and b, save the high 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (((u32)(u16)a.half[i] * (u32)(u16)b.half[i])) >> 16;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m256i __lasx_xvmuh_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmuh_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmuh.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply signed 32-bit elements in a and b, save the high 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (((s64)(s32)a.word[i] * (s64)(s32)b.word[i])) >> 32;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m256i __lasx_xvmuh_wu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmuh_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmuh.wu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply unsigned 32-bit elements in a and b, save the high 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (((u64)(u32)a.word[i] * (u64)(u32)b.word[i])) >> 32;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m256i __lasx_xvmuh_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmuh_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmuh.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply signed 64-bit elements in a and b, save the high 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (((s128)(s64)a.dword[i] * (s128)(s64)b.dword[i])) >> 64;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m256i __lasx_xvmuh_du (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmuh_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmuh.du xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply unsigned 64-bit elements in a and b, save the high 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (((u128)(u64)a.dword[i] * (u128)(u64)b.dword[i])) >> 64;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m256i __lasx_xvmul_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmul_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmul.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply 8-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = a.byte[i] * b.byte[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m256i __lasx_xvmul_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmul_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmul.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply 16-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = a.half[i] * b.half[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m256i __lasx_xvmul_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmul_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmul.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply 32-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = a.word[i] * b.word[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m256i __lasx_xvmul_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmul_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmul.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply 64-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = a.dword[i] * b.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m256i __lasx_xvmulwev_h_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmulwev_h_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwev.h.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply even-positioned signed 8-bit elements in a and signed elements in b, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (s16)(s8)a.byte[2 * i] * (s16)(s8)b.byte[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m256i __lasx_xvmulwev_h_bu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmulwev_h_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwev.h.bu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply even-positioned unsigned 8-bit elements in a and unsigned elements in b, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (u16)(u8)a.byte[2 * i] * (u16)(u8)b.byte[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m256i __lasx_xvmulwev_h_bu_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmulwev_h_bu_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwev.h.bu.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply even-positioned unsigned 8-bit elements in a and signed elements in b, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (u16)(u8)a.byte[2 * i] * (s16)(s8)b.byte[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m256i __lasx_xvmulwev_w_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmulwev_w_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwev.w.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply even-positioned signed 16-bit elements in a and signed elements in b, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (s32)(s16)a.half[2 * i] * (s32)(s16)b.half[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m256i __lasx_xvmulwev_w_hu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmulwev_w_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwev.w.hu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply even-positioned unsigned 16-bit elements in a and unsigned elements in b, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (u32)(u16)a.half[2 * i] * (u32)(u16)b.half[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m256i __lasx_xvmulwev_w_hu_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmulwev_w_hu_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwev.w.hu.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply even-positioned unsigned 16-bit elements in a and signed elements in b, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (u32)(u16)a.half[2 * i] * (s32)(s16)b.half[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m256i __lasx_xvmulwev_d_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmulwev_d_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwev.d.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply even-positioned signed 32-bit elements in a and signed elements in b, save the 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (s64)(s32)a.word[2 * i] * (s64)(s32)b.word[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m256i __lasx_xvmulwev_d_wu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmulwev_d_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwev.d.wu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply even-positioned unsigned 32-bit elements in a and unsigned elements in b, save the 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 * i] * (u64)(u32)b.word[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m256i __lasx_xvmulwev_d_wu_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmulwev_d_wu_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwev.d.wu.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply even-positioned unsigned 32-bit elements in a and signed elements in b, save the 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 * i] * (s64)(s32)b.word[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m256i __lasx_xvmulwev_q_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmulwev_q_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwev.q.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply even-positioned signed 64-bit elements in a and signed elements in b, save the 128-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.qword[i] = (s128)(s64)a.dword[2 * i] * (s128)(s64)b.dword[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600072
3C500072
+

__m256i __lasx_xvmulwev_q_du (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmulwev_q_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwev.q.du xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply even-positioned unsigned 64-bit elements in a and unsigned elements in b, save the 128-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 * i] * (u128)(u64)b.dword[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600072
3C500072
+

__m256i __lasx_xvmulwev_q_du_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmulwev_q_du_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwev.q.du.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply even-positioned unsigned 64-bit elements in a and signed elements in b, save the 128-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 * i] * (s128)(s64)b.dword[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600072
3C500072
+

__m256i __lasx_xvmulwod_h_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmulwod_h_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwod.h.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply odd-positioned signed 8-bit elements in a and signed elements in b, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (s16)(s8)a.byte[2 * i + 1] * (s16)(s8)b.byte[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m256i __lasx_xvmulwod_h_bu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmulwod_h_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwod.h.bu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply odd-positioned unsigned 8-bit elements in a and unsigned elements in b, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] * (u16)(u8)b.byte[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m256i __lasx_xvmulwod_h_bu_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmulwod_h_bu_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwod.h.bu.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply odd-positioned unsigned 8-bit elements in a and signed elements in b, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] * (s16)(s8)b.byte[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m256i __lasx_xvmulwod_w_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmulwod_w_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwod.w.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply odd-positioned signed 16-bit elements in a and signed elements in b, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (s32)(s16)a.half[2 * i + 1] * (s32)(s16)b.half[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m256i __lasx_xvmulwod_w_hu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmulwod_w_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwod.w.hu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply odd-positioned unsigned 16-bit elements in a and unsigned elements in b, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (u32)(u16)a.half[2 * i + 1] * (u32)(u16)b.half[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m256i __lasx_xvmulwod_w_hu_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmulwod_w_hu_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwod.w.hu.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply odd-positioned unsigned 16-bit elements in a and signed elements in b, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (u32)(u16)a.half[2 * i + 1] * (s32)(s16)b.half[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m256i __lasx_xvmulwod_d_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmulwod_d_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwod.d.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply odd-positioned signed 32-bit elements in a and signed elements in b, save the 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (s64)(s32)a.word[2 * i + 1] * (s64)(s32)b.word[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m256i __lasx_xvmulwod_d_wu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmulwod_d_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwod.d.wu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply odd-positioned unsigned 32-bit elements in a and unsigned elements in b, save the 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] * (u64)(u32)b.word[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m256i __lasx_xvmulwod_d_wu_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmulwod_d_wu_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwod.d.wu.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply odd-positioned unsigned 32-bit elements in a and signed elements in b, save the 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] * (s64)(s32)b.word[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m256i __lasx_xvmulwod_q_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmulwod_q_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwod.q.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply odd-positioned signed 64-bit elements in a and signed elements in b, save the 128-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] * (s128)(s64)b.dword[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600072
3C500072
+

__m256i __lasx_xvmulwod_q_du (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmulwod_q_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwod.q.du xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply odd-positioned unsigned 64-bit elements in a and unsigned elements in b, save the 128-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] * (u128)(u64)b.dword[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600072
3C500072
+

__m256i __lasx_xvmulwod_q_du_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmulwod_q_du_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwod.q.du.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply odd-positioned unsigned 64-bit elements in a and signed elements in b, save the 128-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] * (s128)(s64)b.dword[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600072
3C500072
+

__m256i __lasx_xvneg_b (__m256i a)

+

Synopsis

+
__m256i __lasx_xvneg_b (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvneg.b xr, xr
+CPU Flags: LASX
+
+

Description

+

Negate 8-bit elements in a and save the result in dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = -a.byte[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvneg_h (__m256i a)

+

Synopsis

+
__m256i __lasx_xvneg_h (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvneg.h xr, xr
+CPU Flags: LASX
+
+

Description

+

Negate 16-bit elements in a and save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = -a.half[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvneg_w (__m256i a)

+

Synopsis

+
__m256i __lasx_xvneg_w (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvneg.w xr, xr
+CPU Flags: LASX
+
+

Description

+

Negate 32-bit elements in a and save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = -a.word[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvneg_d (__m256i a)

+

Synopsis

+
__m256i __lasx_xvneg_d (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvneg.d xr, xr
+CPU Flags: LASX
+
+

Description

+

Negate 64-bit elements in a and save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = -a.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsadd_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsadd_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsadd.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Saturating add the signed 8-bit elements in a and b, store the result to dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = (s8)sadd((s8)a.byte[i], (s8)b.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsadd_bu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsadd_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsadd.bu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Saturating add the unsigned 8-bit elements in a and b, store the result to dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = (u8)sadd((u8)a.byte[i], (u8)b.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsadd_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsadd_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsadd.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Saturating add the signed 16-bit elements in a and b, store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (s16)sadd((s16)a.half[i], (s16)b.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsadd_hu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsadd_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsadd.hu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Saturating add the unsigned 16-bit elements in a and b, store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (u16)sadd((u16)a.half[i], (u16)b.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsadd_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsadd_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsadd.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Saturating add the signed 32-bit elements in a and b, store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (s32)sadd((s32)a.word[i], (s32)b.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsadd_wu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsadd_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsadd.wu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Saturating add the unsigned 32-bit elements in a and b, store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (u32)sadd((u32)a.word[i], (u32)b.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsadd_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsadd_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsadd.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Saturating add the signed 64-bit elements in a and b, store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (s64)sadd((s64)a.dword[i], (s64)b.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsadd_du (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsadd_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsadd.du xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Saturating add the unsigned 64-bit elements in a and b, store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (u64)sadd((u64)a.dword[i], (u64)b.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvssub_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvssub_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssub.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Saturating subtract the signed 8-bit elements in a and b, store the result to dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = (s8)ssub((s8)a.byte[i], (s8)b.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvssub_bu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvssub_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssub.bu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Saturating subtract the unsigned 8-bit elements in a and b, store the result to dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = (u8)ssub((u8)a.byte[i], (u8)b.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvssub_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvssub_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssub.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Saturating subtract the signed 16-bit elements in a and b, store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (s16)ssub((s16)a.half[i], (s16)b.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvssub_hu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvssub_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssub.hu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Saturating subtract the unsigned 16-bit elements in a and b, store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (u16)ssub((u16)a.half[i], (u16)b.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvssub_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvssub_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssub.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Saturating subtract the signed 32-bit elements in a and b, store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (s32)ssub((s32)a.word[i], (s32)b.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvssub_wu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvssub_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssub.wu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Saturating subtract the unsigned 32-bit elements in a and b, store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (u32)ssub((u32)a.word[i], (u32)b.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvssub_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvssub_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssub.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Saturating subtract the signed 64-bit elements in a and b, store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (s64)ssub((s64)a.dword[i], (s64)b.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvssub_du (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvssub_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssub.du xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Saturating subtract the unsigned 64-bit elements in a and b, store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (u64)ssub((u64)a.dword[i], (u64)b.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsub_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsub_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsub.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Subtract 8-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = a.byte[i] - b.byte[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsub_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsub_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsub.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Subtract 16-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = a.half[i] - b.half[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsub_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsub_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsub.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Subtract 32-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = a.word[i] - b.word[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsub_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsub_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsub.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Subtract 64-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = a.dword[i] - b.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsub_q (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsub_q (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsub.q xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Subtract 128-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.qword[i] = a.qword[i] - b.qword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m256i __lasx_xvsubi_bu (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvsubi_bu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvsubi.bu xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Subtract imm from each 8-bit element in a, save the result in dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = a.byte[i] - imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsubi_hu (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvsubi_hu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvsubi.hu xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Subtract imm from each 16-bit element in a, save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = a.half[i] - imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsubi_wu (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvsubi_wu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvsubi.wu xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Subtract imm from each 32-bit element in a, save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = a.word[i] - imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsubi_du (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvsubi_du (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvsubi.du xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Subtract imm from each 64-bit element in a, save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = a.dword[i] - imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsubwev_h_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsubwev_h_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsubwev.h.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Subtract even-positioned signed 8-bit elements in a and signed elements in b, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (s16)(s8)a.byte[2 * i] - (s16)(s8)b.byte[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvsubwev_h_bu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsubwev_h_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsubwev.h.bu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Subtract even-positioned unsigned 8-bit elements in a and unsigned elements in b, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (u16)(u8)a.byte[2 * i] - (u16)(u8)b.byte[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvsubwev_w_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsubwev_w_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsubwev.w.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Subtract even-positioned signed 16-bit elements in a and signed elements in b, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (s32)(s16)a.half[2 * i] - (s32)(s16)b.half[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvsubwev_w_hu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsubwev_w_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsubwev.w.hu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Subtract even-positioned unsigned 16-bit elements in a and unsigned elements in b, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (u32)(u16)a.half[2 * i] - (u32)(u16)b.half[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvsubwev_d_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsubwev_d_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsubwev.d.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Subtract even-positioned signed 32-bit elements in a and signed elements in b, save the 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (s64)(s32)a.word[2 * i] - (s64)(s32)b.word[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvsubwev_d_wu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsubwev_d_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsubwev.d.wu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Subtract even-positioned unsigned 32-bit elements in a and unsigned elements in b, save the 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 * i] - (u64)(u32)b.word[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvsubwev_q_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsubwev_q_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsubwev.q.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Subtract even-positioned signed 64-bit elements in a and signed elements in b, save the 128-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.qword[i] = (s128)(s64)a.dword[2 * i] - (s128)(s64)b.dword[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m256i __lasx_xvsubwev_q_du (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsubwev_q_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsubwev.q.du xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Subtract even-positioned unsigned 64-bit elements in a and unsigned elements in b, save the 128-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 * i] - (u128)(u64)b.dword[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m256i __lasx_xvsubwod_h_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsubwod_h_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsubwod.h.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Subtract odd-positioned signed 8-bit elements in a and signed elements in b, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (s16)(s8)a.byte[2 * i + 1] - (s16)(s8)b.byte[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvsubwod_h_bu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsubwod_h_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsubwod.h.bu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Subtract odd-positioned unsigned 8-bit elements in a and unsigned elements in b, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] - (u16)(u8)b.byte[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvsubwod_w_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsubwod_w_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsubwod.w.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Subtract odd-positioned signed 16-bit elements in a and signed elements in b, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (s32)(s16)a.half[2 * i + 1] - (s32)(s16)b.half[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvsubwod_w_hu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsubwod_w_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsubwod.w.hu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Subtract odd-positioned unsigned 16-bit elements in a and unsigned elements in b, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (u32)(u16)a.half[2 * i + 1] - (u32)(u16)b.half[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvsubwod_d_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsubwod_d_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsubwod.d.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Subtract odd-positioned signed 32-bit elements in a and signed elements in b, save the 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (s64)(s32)a.word[2 * i + 1] - (s64)(s32)b.word[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvsubwod_d_wu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsubwod_d_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsubwod.d.wu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Subtract odd-positioned unsigned 32-bit elements in a and unsigned elements in b, save the 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] - (u64)(u32)b.word[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvsubwod_q_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsubwod_q_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsubwod.q.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Subtract odd-positioned signed 64-bit elements in a and signed elements in b, save the 128-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] - (s128)(s64)b.dword[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m256i __lasx_xvsubwod_q_du (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsubwod_q_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsubwod.q.du xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Subtract odd-positioned unsigned 64-bit elements in a and unsigned elements in b, save the 128-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] - (u128)(u64)b.dword[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+ +
+
+ +
+
+ +
+ +
+ +
+ + + + « Previous + + + Next » + + +
+ + + + + + + + diff --git a/lasx/logical/index.html b/lasx/logical/index.html new file mode 100644 index 00000000..4f8026c4 --- /dev/null +++ b/lasx/logical/index.html @@ -0,0 +1,689 @@ + + + + + + + + Logical - Unofficial LoongArch Intrinsics Guide + + + + + + + + + + + + + + +
+ + +
+ +
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Logical

+

__m256i __lasx_xvand_v (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvand_v (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvand.v xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute bitwise AND between elements in a and b.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = a.dword[i] & b.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvandi_b (__m256i a, imm0_255 imm)

+

Synopsis

+
__m256i __lasx_xvandi_b (__m256i a, imm0_255 imm)
+#include <lasxintrin.h>
+Instruction: xvandi.b xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compute bitwise AND between elements in a and imm.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = a.byte[i] & imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvandn_v (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvandn_v (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvandn.v xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute bitwise ANDN between elements in a and b, i.e. b AND (NOT a).

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = b.dword[i] & (~a.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvnor_v (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvnor_v (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvnor.v xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute bitwise NOR between elements in a and b.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = ~(a.dword[i] | b.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvnori_b (__m256i a, imm0_255 imm)

+

Synopsis

+
__m256i __lasx_xvnori_b (__m256i a, imm0_255 imm)
+#include <lasxintrin.h>
+Instruction: xvnori.b xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compute bitwise NOR between elements in a and imm.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = ~(a.byte[i] | imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvor_v (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvor_v (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvor.v xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute bitwise OR between elements in a and b.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = a.dword[i] | b.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvori_b (__m256i a, imm0_255 imm)

+

Synopsis

+
__m256i __lasx_xvori_b (__m256i a, imm0_255 imm)
+#include <lasxintrin.h>
+Instruction: xvori.b xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compute bitwise OR between elements in a and imm.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = a.byte[i] | imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvorn_v (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvorn_v (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvorn.v xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute bitwise ORN between elements in a and b, i.e. a OR (NOT b).

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = a.dword[i] | (~b.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvxor_v (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvxor_v (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvxor.v xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute bitwise XOR between elements in a and b.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = a.dword[i] ^ b.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvxori_b (__m256i a, imm0_255 imm)

+

Synopsis

+
__m256i __lasx_xvxori_b (__m256i a, imm0_255 imm)
+#include <lasxintrin.h>
+Instruction: xvxori.b xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compute bitwise XOR between elements in a and imm.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = a.byte[i] ^ imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+ +
+
+ +
+
+ +
+ +
+ +
+ + + + « Previous + + + Next » + + +
+ + + + + + + + diff --git a/lasx/memory/index.html b/lasx/memory/index.html new file mode 100644 index 00000000..f8135271 --- /dev/null +++ b/lasx/memory/index.html @@ -0,0 +1,475 @@ + + + + + + + + Memory Load & Store - Unofficial LoongArch Intrinsics Guide + + + + + + + + + + + + + + +
+ + +
+ +
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Memory Load & Store

+

__m256i __lasx_xvld (void * addr, imm_n2048_2047 offset)

+

Synopsis

+
__m256i __lasx_xvld (void * addr, imm_n2048_2047 offset)
+#include <lasxintrin.h>
+Instruction: xvld xr, r, imm
+CPU Flags: LASX
+
+

Description

+

Read whole vector from memory address addr + offset, save the data into dst. Note that you can use this intrinsic to load floating point vectors, even though the return type represents integer vectors.

+

Operation

+
dst = memory_load(256, addr + offset);
+
+

__m256i __lasx_xvldx (void * addr, long int offset)

+

Synopsis

+
__m256i __lasx_xvldx (void * addr, long int offset)
+#include <lasxintrin.h>
+Instruction: xvldx xr, r, r
+CPU Flags: LASX
+
+

Description

+

Read whole vector from memory address addr + offset, save the data into dst. Note that you can use this intrinsic to load floating point vectors, even though the return type represents integer vectors.

+

Operation

+
dst = memory_load(256, addr + offset);
+
+

__m256i __lasx_xvldrepl_b (void * addr, imm_n2048_2047 offset)

+

Synopsis

+
__m256i __lasx_xvldrepl_b (void * addr, imm_n2048_2047 offset)
+#include <lasxintrin.h>
+Instruction: xvldrepl.b xr, r, imm
+CPU Flags: LASX
+
+

Description

+

Read 8-bit data from memory address addr + (offset << 0), replicate the data to all vector lanes and save into dst.

+

Operation

+
u8 data = memory_load(8, addr + offset);
+for (int i = 0; i < 32; i++) {
+  dst.byte[i] = data;
+}
+
+

__m256i __lasx_xvldrepl_h (void * addr, imm_n1024_1023 offset)

+

Synopsis

+
__m256i __lasx_xvldrepl_h (void * addr, imm_n1024_1023 offset)
+#include <lasxintrin.h>
+Instruction: xvldrepl.h xr, r, imm
+CPU Flags: LASX
+
+

Description

+

Read 16-bit data from memory address addr + (offset << 1), replicate the data to all vector lanes and save into dst.

+

Operation

+
u16 data = memory_load(16, addr + (offset << 1));
+for (int i = 0; i < 16; i++) {
+  dst.half[i] = data;
+}
+
+

__m256i __lasx_xvldrepl_w (void * addr, imm_n512_511 offset)

+

Synopsis

+
__m256i __lasx_xvldrepl_w (void * addr, imm_n512_511 offset)
+#include <lasxintrin.h>
+Instruction: xvldrepl.w xr, r, imm
+CPU Flags: LASX
+
+

Description

+

Read 32-bit data from memory address addr + (offset << 2), replicate the data to all vector lanes and save into dst.

+

Operation

+
u32 data = memory_load(32, addr + (offset << 2));
+for (int i = 0; i < 8; i++) {
+  dst.word[i] = data;
+}
+
+

__m256i __lasx_xvldrepl_d (void * addr, imm_n256_255 offset)

+

Synopsis

+
__m256i __lasx_xvldrepl_d (void * addr, imm_n256_255 offset)
+#include <lasxintrin.h>
+Instruction: xvldrepl.d xr, r, imm
+CPU Flags: LASX
+
+

Description

+

Read 64-bit data from memory address addr + (offset << 3), replicate the data to all vector lanes and save into dst.

+

Operation

+
u64 data = memory_load(64, addr + (offset << 3));
+for (int i = 0; i < 4; i++) {
+  dst.dword[i] = data;
+}
+
+

void __lasx_xvst (__m256i data, void * addr, imm_n2048_2047 offset)

+

Synopsis

+
void __lasx_xvst (__m256i data, void * addr, imm_n2048_2047 offset)
+#include <lasxintrin.h>
+Instruction: xvst xr, r, imm
+CPU Flags: LASX
+
+

Description

+

Write whole vector data in data to memory address addr + offset.

+

Operation

+
memory_store(256, data, addr + offset);
+
+

void __lasx_xvstx (__m256i data, void * addr, long int offset)

+

Synopsis

+
void __lasx_xvstx (__m256i data, void * addr, long int offset)
+#include <lasxintrin.h>
+Instruction: xvstx xr, r, r
+CPU Flags: LASX
+
+

Description

+

Write whole-vector data in data to memory address addr + offset.

+

Operation

+
memory_store(256, data, addr + offset);
+
+

void __lasx_xvstelm_b (__m256i data, void * addr, imm_n128_127 offset, imm0_31 lane)

+

Synopsis

+
void __lasx_xvstelm_b (__m256i data, void * addr, imm_n128_127 offset, imm0_31 lane)
+#include <lasxintrin.h>
+Instruction: xvstelm.b xr, r, imm, imm
+CPU Flags: LASX
+
+

Description

+

Store the 8-bit element in data specified by lane to memory address addr + offset.

+

Operation

+
memory_store(8, data.byte[lane], addr + offset);
+
+

void __lasx_xvstelm_h (__m256i data, void * addr, imm_n128_127 offset, imm0_15 lane)

+

Synopsis

+
void __lasx_xvstelm_h (__m256i data, void * addr, imm_n128_127 offset, imm0_15 lane)
+#include <lasxintrin.h>
+Instruction: xvstelm.h xr, r, imm, imm
+CPU Flags: LASX
+
+

Description

+

Store the 16-bit element in data specified by lane to memory address addr + offset.

+

Operation

+
memory_store(16, data.half[lane], addr + offset);
+
+

void __lasx_xvstelm_w (__m256i data, void * addr, imm_n128_127 offset, imm0_7 lane)

+

Synopsis

+
void __lasx_xvstelm_w (__m256i data, void * addr, imm_n128_127 offset, imm0_7 lane)
+#include <lasxintrin.h>
+Instruction: xvstelm.w xr, r, imm, imm
+CPU Flags: LASX
+
+

Description

+

Store the 32-bit element in data specified by lane to memory address addr + offset.

+

Operation

+
memory_store(32, data.word[lane], addr + offset);
+
+

void __lasx_xvstelm_d (__m256i data, void * addr, imm_n128_127 offset, imm0_3 lane)

+

Synopsis

+
void __lasx_xvstelm_d (__m256i data, void * addr, imm_n128_127 offset, imm0_3 lane)
+#include <lasxintrin.h>
+Instruction: xvstelm.d xr, r, imm, imm
+CPU Flags: LASX
+
+

Description

+

Store the 64-bit element in data specified by lane to memory address addr + offset.

+

Operation

+
memory_store(64, data.dword[lane], addr + offset);
+
+ +
+
+ +
+
+ +
+ +
+ +
+ + + + « Previous + + + Next » + + +
+ + + + + + + + diff --git a/lasx/misc/index.html b/lasx/misc/index.html new file mode 100644 index 00000000..14bdfcd3 --- /dev/null +++ b/lasx/misc/index.html @@ -0,0 +1,5745 @@ + + + + + + + + Misc - Unofficial LoongArch Intrinsics Guide + + + + + + + + + + + + + + +
+ + +
+ +
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Misc

+

__m256i __lasx_xvexth_h_b (__m256i a)

+

Synopsis

+
__m256i __lasx_xvexth_h_b (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvexth.h.b xr, xr
+CPU Flags: LASX
+
+

Description

+

Extend signed 8-bit elements in the higher half of each 128-bit lane of a to 16-bit.

+

Operation

+
int i;
+for (i = 0; i < 8; i++) {
+  dst.half[i] = (s16)(s8)a.byte[8 + i];
+}
+for (; i < 16; i++) {
+  dst.half[i] = (s16)(s8)a.byte[16 + i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvexth_hu_bu (__m256i a)

+

Synopsis

+
__m256i __lasx_xvexth_hu_bu (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvexth.hu.bu xr, xr
+CPU Flags: LASX
+
+

Description

+

Extend unsigned 8-bit elements in the higher half of each 128-bit lane of a to 16-bit.

+

Operation

+
int i;
+for (i = 0; i < 8; i++) {
+  dst.half[i] = (u16)(u8)a.byte[8 + i];
+}
+for (; i < 16; i++) {
+  dst.half[i] = (u16)(u8)a.byte[16 + i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvexth_w_h (__m256i a)

+

Synopsis

+
__m256i __lasx_xvexth_w_h (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvexth.w.h xr, xr
+CPU Flags: LASX
+
+

Description

+

Extend signed 16-bit elements in the higher half of each 128-bit lane of a to 32-bit.

+

Operation

+
int i;
+for (i = 0; i < 4; i++) {
+  dst.word[i] = (s32)(s16)a.half[4 + i];
+}
+for (; i < 8; i++) {
+  dst.word[i] = (s32)(s16)a.half[8 + i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvexth_wu_hu (__m256i a)

+

Synopsis

+
__m256i __lasx_xvexth_wu_hu (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvexth.wu.hu xr, xr
+CPU Flags: LASX
+
+

Description

+

Extend unsigned 16-bit elements in the higher half of each 128-bit lane of a to 32-bit.

+

Operation

+
int i;
+for (i = 0; i < 4; i++) {
+  dst.word[i] = (u32)(u16)a.half[4 + i];
+}
+for (; i < 8; i++) {
+  dst.word[i] = (u32)(u16)a.half[8 + i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvexth_d_w (__m256i a)

+

Synopsis

+
__m256i __lasx_xvexth_d_w (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvexth.d.w xr, xr
+CPU Flags: LASX
+
+

Description

+

Extend signed 32-bit elements in the higher half of each 128-bit lane of a to 64-bit.

+

Operation

+
int i;
+for (i = 0; i < 2; i++) {
+  dst.dword[i] = (s64)(s32)a.word[2 + i];
+}
+for (; i < 4; i++) {
+  dst.dword[i] = (s64)(s32)a.word[4 + i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvexth_du_wu (__m256i a)

+

Synopsis

+
__m256i __lasx_xvexth_du_wu (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvexth.du.wu xr, xr
+CPU Flags: LASX
+
+

Description

+

Extend unsigned 32-bit elements in the higher half of each 128-bit lane of a to 64-bit.

+

Operation

+
int i;
+for (i = 0; i < 2; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 + i];
+}
+for (; i < 4; i++) {
+  dst.dword[i] = (u64)(u32)a.word[4 + i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvexth_q_d (__m256i a)

+

Synopsis

+
__m256i __lasx_xvexth_q_d (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvexth.q.d xr, xr
+CPU Flags: LASX
+
+

Description

+

Extend the signed 64-bit element in the higher half of each 128-bit lane of a to 128-bit.

+

Operation

+
int i;
+for (i = 0; i < 1; i++) {
+  dst.qword[i] = (s128)(s64)a.dword[1 + i];
+}
+for (; i < 2; i++) {
+  dst.qword[i] = (s128)(s64)a.dword[2 + i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvexth_qu_du (__m256i a)

+

Synopsis

+
__m256i __lasx_xvexth_qu_du (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvexth.qu.du xr, xr
+CPU Flags: LASX
+
+

Description

+

Extend the unsigned 64-bit element in the higher half of each 128-bit lane of a to 128-bit.

+

Operation

+
int i;
+for (i = 0; i < 1; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[1 + i];
+}
+for (; i < 2; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 + i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvextl_q_d (__m256i a)

+

Synopsis

+
__m256i __lasx_xvextl_q_d (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvextl.q.d xr, xr
+CPU Flags: LASX
+
+

Description

+

Extend the signed 64-bit element in the lower half of each 128-bit lane of a to 128-bit.

+

Operation

+
int i;
+for (i = 0; i < 1; i++) {
+  dst.qword[i] = (s128)(s64)a.dword[i];
+}
+for (; i < 2; i++) {
+  dst.qword[i] = (s128)(s64)a.dword[i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvextl_qu_du (__m256i a)

+

Synopsis

+
__m256i __lasx_xvextl_qu_du (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvextl.qu.du xr, xr
+CPU Flags: LASX
+
+

Description

+

Extend the unsigned 64-bit element in the lower half of each 128-bit lane of a to 128-bit.

+

Operation

+
int i;
+for (i = 0; i < 1; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[i];
+}
+for (; i < 2; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvextrins_b (__m256i a, __m256i b, imm0_255 imm)

+

Synopsis

+
__m256i __lasx_xvextrins_b (__m256i a, __m256i b, imm0_255 imm)
+#include <lasxintrin.h>
+Instruction: xvextrins.b xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

In each 128-bit lane, extract one 8-bit element from b and insert it into a according to imm.

+

Operation

+
int i;
+for (i = 0; i < 16; i++) {
+  dst.byte[i] = (i == ((imm >> 4) & 15)) ? b.byte[imm & 15] : a.byte[i];
+}
+for (; i < 32; i++) {
+  dst.byte[i] =
+      (i - 16 == ((imm >> 4) & 15)) ? b.byte[(imm & 15) + 16] : a.byte[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvextrins_h (__m256i a, __m256i b, imm0_255 imm)

+

Synopsis

+
__m256i __lasx_xvextrins_h (__m256i a, __m256i b, imm0_255 imm)
+#include <lasxintrin.h>
+Instruction: xvextrins.h xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

In each 128-bit lane, extract one 16-bit element from b and insert it into a according to imm.

+

Operation

+
int i;
+for (i = 0; i < 8; i++) {
+  dst.half[i] = (i == ((imm >> 4) & 7)) ? b.half[imm & 7] : a.half[i];
+}
+for (; i < 16; i++) {
+  dst.half[i] = (i - 8 == ((imm >> 4) & 7)) ? b.half[(imm & 7) + 8] : a.half[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvextrins_w (__m256i a, __m256i b, imm0_255 imm)

+

Synopsis

+
__m256i __lasx_xvextrins_w (__m256i a, __m256i b, imm0_255 imm)
+#include <lasxintrin.h>
+Instruction: xvextrins.w xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

In each 128-bit lane, extract one 32-bit element from b and insert it into a according to imm.

+

Operation

+
int i;
+for (i = 0; i < 4; i++) {
+  dst.word[i] = (i == ((imm >> 4) & 3)) ? b.word[imm & 3] : a.word[i];
+}
+for (; i < 8; i++) {
+  dst.word[i] = (i - 4 == ((imm >> 4) & 3)) ? b.word[(imm & 3) + 4] : a.word[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvextrins_d (__m256i a, __m256i b, imm0_255 imm)

+

Synopsis

+
__m256i __lasx_xvextrins_d (__m256i a, __m256i b, imm0_255 imm)
+#include <lasxintrin.h>
+Instruction: xvextrins.d xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

In each 128-bit lane, extract one 64-bit element from b and insert it into a according to imm.

+

Operation

+
int i;
+for (i = 0; i < 2; i++) {
+  dst.dword[i] = (i == ((imm >> 4) & 1)) ? b.dword[imm & 1] : a.dword[i];
+}
+for (; i < 4; i++) {
+  dst.dword[i] =
+      (i - 2 == ((imm >> 4) & 1)) ? b.dword[(imm & 1) + 2] : a.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_vext2xv_h_b (__m256i a)

+

Synopsis

+
__m256i __lasx_vext2xv_h_b (__m256i a)
+#include <lasxintrin.h>
+Instruction: vext2xv.h.b xr, xr
+CPU Flags: LASX
+
+

Description

+

Extend signed 8-bit lane of a to signed 16-bit elements.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (s16)(s8)a.byte[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600034
3C500032
+

__m256i __lasx_vext2xv_hu_bu (__m256i a)

+

Synopsis

+
__m256i __lasx_vext2xv_hu_bu (__m256i a)
+#include <lasxintrin.h>
+Instruction: vext2xv.hu.bu xr, xr
+CPU Flags: LASX
+
+

Description

+

Extend unsigned 8-bit lane of a to unsigned 16-bit elements.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (u16)(u8)a.byte[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600034
3C500032
+

__m256i __lasx_vext2xv_w_b (__m256i a)

+

Synopsis

+
__m256i __lasx_vext2xv_w_b (__m256i a)
+#include <lasxintrin.h>
+Instruction: vext2xv.w.b xr, xr
+CPU Flags: LASX
+
+

Description

+

Extend signed 8-bit lane of a to signed 32-bit elements.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (s32)(s8)a.byte[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600034
3C500032
+

__m256i __lasx_vext2xv_wu_bu (__m256i a)

+

Synopsis

+
__m256i __lasx_vext2xv_wu_bu (__m256i a)
+#include <lasxintrin.h>
+Instruction: vext2xv.wu.bu xr, xr
+CPU Flags: LASX
+
+

Description

+

Extend unsigned 8-bit lane of a to unsigned 32-bit elements.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (u32)(u8)a.byte[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600034
3C500032
+

__m256i __lasx_vext2xv_w_h (__m256i a)

+

Synopsis

+
__m256i __lasx_vext2xv_w_h (__m256i a)
+#include <lasxintrin.h>
+Instruction: vext2xv.w.h xr, xr
+CPU Flags: LASX
+
+

Description

+

Extend signed 16-bit lane of a to signed 32-bit elements.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (s32)(s16)a.half[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600034
3C500032
+

__m256i __lasx_vext2xv_wu_hu (__m256i a)

+

Synopsis

+
__m256i __lasx_vext2xv_wu_hu (__m256i a)
+#include <lasxintrin.h>
+Instruction: vext2xv.wu.hu xr, xr
+CPU Flags: LASX
+
+

Description

+

Extend unsigned 16-bit lane of a to unsigned 32-bit elements.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (u32)(u16)a.half[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600034
3C500032
+

__m256i __lasx_vext2xv_d_b (__m256i a)

+

Synopsis

+
__m256i __lasx_vext2xv_d_b (__m256i a)
+#include <lasxintrin.h>
+Instruction: vext2xv.d.b xr, xr
+CPU Flags: LASX
+
+

Description

+

Extend signed 8-bit lane of a to signed 64-bit elements.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (s64)(s8)a.byte[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600034
3C500032
+

__m256i __lasx_vext2xv_du_bu (__m256i a)

+

Synopsis

+
__m256i __lasx_vext2xv_du_bu (__m256i a)
+#include <lasxintrin.h>
+Instruction: vext2xv.du.bu xr, xr
+CPU Flags: LASX
+
+

Description

+

Extend unsigned 8-bit lane of a to unsigned 64-bit elements.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (u64)(u8)a.byte[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600034
3C500032
+

__m256i __lasx_vext2xv_d_h (__m256i a)

+

Synopsis

+
__m256i __lasx_vext2xv_d_h (__m256i a)
+#include <lasxintrin.h>
+Instruction: vext2xv.d.h xr, xr
+CPU Flags: LASX
+
+

Description

+

Extend signed 16-bit lane of a to signed 64-bit elements.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (s64)(s16)a.half[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600034
3C500032
+

__m256i __lasx_vext2xv_du_hu (__m256i a)

+

Synopsis

+
__m256i __lasx_vext2xv_du_hu (__m256i a)
+#include <lasxintrin.h>
+Instruction: vext2xv.du.hu xr, xr
+CPU Flags: LASX
+
+

Description

+

Extend unsigned 16-bit lane of a to unsigned 64-bit elements.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (u64)(u16)a.half[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600034
3C500032
+

__m256i __lasx_vext2xv_d_w (__m256i a)

+

Synopsis

+
__m256i __lasx_vext2xv_d_w (__m256i a)
+#include <lasxintrin.h>
+Instruction: vext2xv.d.w xr, xr
+CPU Flags: LASX
+
+

Description

+

Extend signed 32-bit lane of a to signed 64-bit elements.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (s64)(s32)a.word[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600034
3C500032
+

__m256i __lasx_vext2xv_du_wu (__m256i a)

+

Synopsis

+
__m256i __lasx_vext2xv_du_wu (__m256i a)
+#include <lasxintrin.h>
+Instruction: vext2xv.du.wu xr, xr
+CPU Flags: LASX
+
+

Description

+

Extend unsigned 32-bit lane of a to unsigned 64-bit elements.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (u64)(u32)a.word[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600034
3C500032
+

__m256i __lasx_xvilvh_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvilvh_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvilvh.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Interleave 8-bit elements in the higher half of each 128-bit lane of a and b.

+

Operation

+
int i;
+for (i = 0; i < 16; i++) {
+  dst.byte[i] = (i % 2 == 1) ? a.byte[i / 2 + 8] : b.byte[i / 2 + 8];
+}
+for (; i < 32; i++) {
+  dst.byte[i] = (i % 2 == 1) ? a.byte[i / 2 + 16] : b.byte[i / 2 + 16];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvilvh_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvilvh_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvilvh.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Interleave 16-bit elements in the higher half of each 128-bit lane of a and b.

+

Operation

+
int i;
+for (i = 0; i < 8; i++) {
+  dst.half[i] = (i % 2 == 1) ? a.half[i / 2 + 4] : b.half[i / 2 + 4];
+}
+for (; i < 16; i++) {
+  dst.half[i] = (i % 2 == 1) ? a.half[i / 2 + 8] : b.half[i / 2 + 8];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvilvh_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvilvh_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvilvh.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Interleave 32-bit elements in the higher half of each 128-bit lane of a and b.

+

Operation

+
int i;
+for (i = 0; i < 4; i++) {
+  dst.word[i] = (i % 2 == 1) ? a.word[i / 2 + 2] : b.word[i / 2 + 2];
+}
+for (; i < 8; i++) {
+  dst.word[i] = (i % 2 == 1) ? a.word[i / 2 + 4] : b.word[i / 2 + 4];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvilvh_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvilvh_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvilvh.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Interleave 64-bit elements in the higher half of each 128-bit lane of a and b.

+

Operation

+
int i;
+for (i = 0; i < 2; i++) {
+  dst.dword[i] = (i % 2 == 1) ? a.dword[i / 2 + 1] : b.dword[i / 2 + 1];
+}
+for (; i < 4; i++) {
+  dst.dword[i] = (i % 2 == 1) ? a.dword[i / 2 + 2] : b.dword[i / 2 + 2];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvilvl_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvilvl_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvilvl.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Interleave 8-bit elements in the lower half of each 128-bit lane of a and b.

+

Operation

+
int i;
+for (i = 0; i < 16; i++) {
+  dst.byte[i] = (i % 2 == 1) ? a.byte[i / 2] : b.byte[i / 2];
+}
+for (; i < 32; i++) {
+  dst.byte[i] = (i % 2 == 1) ? a.byte[i / 2 + 8] : b.byte[i / 2 + 8];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvilvl_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvilvl_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvilvl.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Interleave 16-bit elements in the lower half of each 128-bit lane of a and b.

+

Operation

+
int i;
+for (i = 0; i < 8; i++) {
+  dst.half[i] = (i % 2 == 1) ? a.half[i / 2] : b.half[i / 2];
+}
+for (; i < 16; i++) {
+  dst.half[i] = (i % 2 == 1) ? a.half[i / 2 + 4] : b.half[i / 2 + 4];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvilvl_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvilvl_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvilvl.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Interleave 32-bit elements in the lower half of each 128-bit lane of a and b.

+

Operation

+
int i;
+for (i = 0; i < 4; i++) {
+  dst.word[i] = (i % 2 == 1) ? a.word[i / 2] : b.word[i / 2];
+}
+for (; i < 8; i++) {
+  dst.word[i] = (i % 2 == 1) ? a.word[i / 2 + 2] : b.word[i / 2 + 2];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvilvl_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvilvl_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvilvl.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Interleave 64-bit elements in the lower half of each 128-bit lane of a and b.

+

Operation

+
int i;
+for (i = 0; i < 2; i++) {
+  dst.dword[i] = (i % 2 == 1) ? a.dword[i / 2] : b.dword[i / 2];
+}
+for (; i < 4; i++) {
+  dst.dword[i] = (i % 2 == 1) ? a.dword[i / 2 + 1] : b.dword[i / 2 + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvinsgr2vr_w (__m256i a, int b, imm0_7 imm)

+

Synopsis

+
__m256i __lasx_xvinsgr2vr_w (__m256i a, int b, imm0_7 imm)
+#include <lasxintrin.h>
+Instruction: xvinsgr2vr.w xr, r, imm
+CPU Flags: LASX
+
+

Description

+

Insert 32-bit element into lane indexed imm.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (i == imm) ? b : a.word[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600011
3C500011
+

__m256i __lasx_xvinsgr2vr_d (__m256i a, long int b, imm0_3 imm)

+

Synopsis

+
__m256i __lasx_xvinsgr2vr_d (__m256i a, long int b, imm0_3 imm)
+#include <lasxintrin.h>
+Instruction: xvinsgr2vr.d xr, r, imm
+CPU Flags: LASX
+
+

Description

+

Insert 64-bit element into lane indexed imm.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (i == imm) ? b : a.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600011
3C500011
+

__m256i __lasx_xvinsve0_w (__m256i a, __m256i b, imm0_7 imm)

+

Synopsis

+
__m256i __lasx_xvinsve0_w (__m256i a, __m256i b, imm0_7 imm)
+#include <lasxintrin.h>
+Instruction: xvinsve0.w xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Insert the first 32-bit lane of b into lane indexed imm of a.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (i == imm) ? b.word[0] : a.word[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvinsve0_d (__m256i a, __m256i b, imm0_3 imm)

+

Synopsis

+
__m256i __lasx_xvinsve0_d (__m256i a, __m256i b, imm0_3 imm)
+#include <lasxintrin.h>
+Instruction: xvinsve0.d xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Insert the first 64-bit lane of b into lane indexed imm of a.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (i == imm) ? b.dword[0] : a.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvfrstp_b (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvfrstp_b (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvfrstp.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

For each 128-bit lane, find the first negative 8-bit element in b and write its index (16 if none is found) into the 8-bit element of a selected by c; all other elements are copied from a.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = a.byte[i];
+}
+int i;
+for (i = 0; i < 16; i++) {
+  if ((s8)b.byte[i] < 0) {
+    break;
+  }
+}
+dst.byte[c.byte[0] % 16] = i;
+for (i = 16; i < 32; i++) {
+  if ((s8)b.byte[i] < 0) {
+    break;
+  }
+}
+dst.byte[(c.byte[16] % 16) + 16] = i - 16;
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvfrstp_h (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvfrstp_h (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvfrstp.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

For each 128-bit lane, find the first negative 16-bit element in b and write its index (8 if none is found) into the 16-bit element of a selected by c; all other elements are copied from a.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = a.half[i];
+}
+int i;
+for (i = 0; i < 8; i++) {
+  if ((s16)b.half[i] < 0) {
+    break;
+  }
+}
+dst.half[c.half[0] % 8] = i;
+for (i = 8; i < 16; i++) {
+  if ((s16)b.half[i] < 0) {
+    break;
+  }
+}
+dst.half[(c.half[8] % 8) + 8] = i - 8;
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvfrstpi_b (__m256i a, __m256i b, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvfrstpi_b (__m256i a, __m256i b, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvfrstpi.b xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

For each 128-bit lane, find the first negative 8-bit element in b and write its index (16 if none is found) into the 8-bit element of a selected by imm; all other elements are copied from a.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = a.byte[i];
+}
+int i;
+for (i = 0; i < 16; i++) {
+  if ((s8)b.byte[i] < 0) {
+    break;
+  }
+}
+dst.byte[imm % 16] = i;
+for (i = 16; i < 32; i++) {
+  if ((s8)b.byte[i] < 0) {
+    break;
+  }
+}
+dst.byte[(imm % 16) + 16] = i - 16;
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvfrstpi_h (__m256i a, __m256i b, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvfrstpi_h (__m256i a, __m256i b, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvfrstpi.h xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

For each 128-bit lane, find the first negative 16-bit element in b and write its index (8 if none is found) into the 16-bit element of a selected by imm; all other elements are copied from a.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = a.half[i];
+}
+int i;
+for (i = 0; i < 8; i++) {
+  if ((s16)b.half[i] < 0) {
+    break;
+  }
+}
+dst.half[imm % 8] = i;
+for (i = 8; i < 16; i++) {
+  if ((s16)b.half[i] < 0) {
+    break;
+  }
+}
+dst.half[(imm % 8) + 8] = i - 8;
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvmskgez_b (__m256i a)

+

Synopsis

+
__m256i __lasx_xvmskgez_b (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvmskgez.b xr, xr
+CPU Flags: LASX
+
+

Description

+

For each 8-bit element in a, if the element is greater than or equal to zero, set one bit in dst, otherwise clear it.

+

Examples

+
__m256i __lasx_xvmskgez_b(__m256i{0x1122334455667788, 0x99aabbccddeeff00, 0xabababab12121212, 0x1234567812345678})
+= 0x00000000000001fe 0x0000000000000000 0x000000000000ff0f 0x0000000000000000
+__m256i __lasx_xvmskgez_b(__m256i{0x0000191100000000, 0x00a1000011b11c11, 0x1181000008010101, 0x0000000000000000})
+= 0x000000000000bbff 0x0000000000000000 0x000000000000ffbf 0x0000000000000000
+
+

Operation

+
u64 m = 0x8080808080808080;
+u64 c = m & a.dword[0];
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[0] = c;
+c = m & a.dword[1];
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[0] |= c << 8;
+dst.dword[0] = (u16)~dst.dword[0];
+dst.dword[1] = 0;
+
+c = m & a.dword[2];
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[2] = c;
+c = m & a.dword[3];
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[2] |= c << 8;
+dst.dword[2] = (u16)~dst.dword[2];
+dst.dword[3] = 0;
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvmskltz_b (__m256i a)

+

Synopsis

+
__m256i __lasx_xvmskltz_b (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvmskltz.b xr, xr
+CPU Flags: LASX
+
+

Description

+

For each 8-bit element in a, if the element is less than zero, set one bit in dst, otherwise clear it.

+

Examples

+
__m256i __lasx_xvmskltz_b(__m256i{0x1122334455667788, 0x99aabbccddeeff00, 0xabababab12121212, 0x1234567812345678})
+= 0x000000000000fe01 0x0000000000000000 0x00000000000000f0 0x0000000000000000
+__m256i __lasx_xvmskltz_b(__m256i{0x0000118100000000, 0x0081000081111118, 0x1181000001010801, 0x0000000000000000})
+= 0x0000000000004810 0x0000000000000000 0x0000000000000040 0x0000000000000000
+
+

Operation

+
u64 m = 0x8080808080808080;
+u64 c = m & a.dword[0];
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[0] = c;
+c = m & a.dword[1];
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[0] |= c << 8;
+dst.dword[1] = 0;
+
+c = m & a.dword[2];
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[2] = c;
+c = m & a.dword[3];
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[2] |= c << 8;
+dst.dword[3] = 0;
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvmskltz_h (__m256i a)

+

Synopsis

+
__m256i __lasx_xvmskltz_h (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvmskltz.h xr, xr
+CPU Flags: LASX
+
+

Description

+

For each 16-bit element in a, if the element is less than zero, set one bit in dst, otherwise clear it.

+

Examples

+
__m256i __lasx_xvmskltz_h(__m256i{0x1122334455667788, 0x99aabbccddeeff00, 0xabababab12121212, 0x1234567812345678})
+= 0x00000000000000f0 0x0000000000000000 0x000000000000000c 0x0000000000000000
+__m256i __lasx_xvmskltz_h(__m256i{0x0000818100000000, 0x0018000018181881, 0x1181000008080808, 0x0000000000000000})
+= 0x0000000000000004 0x0000000000000000 0x0000000000000000 0x0000000000000000
+
+

Operation

+
u64 m = 0x8000800080008000;
+u64 c = m & a.dword[0];
+c |= c << 15;
+c |= c << 30;
+c >>= 60;
+dst.dword[0] = c;
+c = m & a.dword[1];
+c |= c << 15;
+c |= c << 30;
+c >>= 60;
+dst.dword[0] |= c << 4;
+dst.dword[1] = 0;
+
+c = m & a.dword[2];
+c |= c << 15;
+c |= c << 30;
+c >>= 60;
+dst.dword[2] = c;
+c = m & a.dword[3];
+c |= c << 15;
+c |= c << 30;
+c >>= 60;
+dst.dword[2] |= c << 4;
+dst.dword[3] = 0;
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvmskltz_w (__m256i a)

+

Synopsis

+
__m256i __lasx_xvmskltz_w (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvmskltz.w xr, xr
+CPU Flags: LASX
+
+

Description

+

For each 32-bit element in a, if the element is less than zero, set one bit in dst, otherwise clear it.

+

Examples

+
__m256i __lasx_xvmskltz_w(__m256i{0x1122334455667788, 0x99aabbccddeeff00, 0xabababab12121212, 0x1234567812345678})
+= 0x000000000000000c 0x0000000000000000 0x0000000000000002 0x0000000000000000
+__m256i __lasx_xvmskltz_w(__m256i{0x0000811100000000, 0x0018000081111111, 0x8111000001010108, 0x0000000000000000})
+= 0x0000000000000004 0x0000000000000000 0x0000000000000002 0x0000000000000000
+
+

Operation

+
u64 m = 0x8000000080000000;
+u64 c = m & a.dword[0];
+c |= c << 31;
+c >>= 62;
+dst.dword[0] = c;
+c = m & a.dword[1];
+c |= c << 31;
+c >>= 62;
+dst.dword[0] |= c << 2;
+dst.dword[1] = 0;
+
+c = m & a.dword[2];
+c |= c << 31;
+c >>= 62;
+dst.dword[2] = c;
+c = m & a.dword[3];
+c |= c << 31;
+c >>= 62;
+dst.dword[2] |= c << 2;
+dst.dword[3] = 0;
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvmskltz_d (__m256i a)

+

Synopsis

+
__m256i __lasx_xvmskltz_d (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvmskltz.d xr, xr
+CPU Flags: LASX
+
+

Description

+

For each 64-bit element in a, if the element is less than zero, set one bit in dst, otherwise clear it.

+

Examples

+
__m256i __lasx_xvmskltz_d(__m256i{0x1122334455667788, 0x99aabbccddeeff00, 0xabababab12121212, 0x1234567812345678})
+= 0x0000000000000002 0x0000000000000000 0x0000000000000001 0x0000000000000000
+__m256i __lasx_xvmskltz_d(__m256i{0x0000111800000000, 0x0081000081111111, 0x8111000008010101, 0x0000000000000000})
+= 0x0000000000000000 0x0000000000000000 0x0000000000000001 0x0000000000000000
+
+

Operation

+
u64 m = 0x8000000000000000;
+u64 c = m & a.dword[0];
+c >>= 63;
+dst.dword[0] = c;
+c = m & a.dword[1];
+c >>= 63;
+dst.dword[0] |= c << 1;
+dst.dword[1] = 0;
+
+c = m & a.dword[2];
+c >>= 63;
+dst.dword[2] = c;
+c = m & a.dword[3];
+c >>= 63;
+dst.dword[2] |= c << 1;
+dst.dword[3] = 0;
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvmsknz_b (__m256i a)

+

Synopsis

+
__m256i __lasx_xvmsknz_b (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvmsknz.b xr, xr
+CPU Flags: LASX
+
+

Description

+

For each 8-bit element in a, if the element is non-zero, set one bit in dst, otherwise clear it.

+

Examples

+
__m256i __lasx_xvmsknz_b(__m256i{0x1122334455667788, 0x99aabbccddeeff00, 0xabababab12121212, 0x1234567812345678})
+= 0x000000000000feff 0x0000000000000000 0x000000000000ffff 0x0000000000000000
+__m256i __lasx_xvmsknz_b(__m256i{0x0000111100000000, 0x0011000011111111, 0x1111000001010101, 0x0000000000000000})
+= 0x0000000000004f30 0x0000000000000000 0x00000000000000cf 0x0000000000000000
+
+

Operation

+
u64 m = 0x7F7F7F7F7F7F7F7F;
+u64 c = ~(((a.dword[0] & m) + m) | a.dword[0] | m);
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[0] = c;
+c = ~(((a.dword[1] & m) + m) | a.dword[1] | m);
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[0] |= c << 8;
+dst.dword[0] = (u16)~dst.dword[0];
+dst.dword[1] = 0;
+
+c = ~(((a.dword[2] & m) + m) | a.dword[2] | m);
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[2] = c;
+c = ~(((a.dword[3] & m) + m) | a.dword[3] | m);
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[2] |= c << 8;
+dst.dword[2] = (u16)~dst.dword[2];
+dst.dword[3] = 0;
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvpackev_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvpackev_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvpackev.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Collect and pack even-positioned 8-bit elements in a and b and store the result in dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = (i % 2 == 1) ? a.byte[i - 1] : b.byte[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvpackev_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvpackev_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvpackev.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Collect and pack even-positioned 16-bit elements in a and b and store the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (i % 2 == 1) ? a.half[i - 1] : b.half[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvpackev_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvpackev_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvpackev.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Collect and pack the even-positioned 32-bit elements of a and b, and store the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (i % 2 == 1) ? a.word[i - 1] : b.word[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvpackev_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvpackev_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvpackev.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Collect and pack the even-positioned 64-bit elements of a and b, and store the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (i % 2 == 1) ? a.dword[i - 1] : b.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvpackod_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvpackod_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvpackod.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Collect and pack the odd-positioned 8-bit elements of a and b, and store the result in dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = (i % 2 == 1) ? a.byte[i] : b.byte[i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvpackod_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvpackod_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvpackod.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Collect and pack the odd-positioned 16-bit elements of a and b, and store the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (i % 2 == 1) ? a.half[i] : b.half[i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvpackod_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvpackod_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvpackod.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Collect and pack the odd-positioned 32-bit elements of a and b, and store the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (i % 2 == 1) ? a.word[i] : b.word[i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvpackod_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvpackod_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvpackod.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Collect and pack the odd-positioned 64-bit elements of a and b, and store the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (i % 2 == 1) ? a.dword[i] : b.dword[i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvpickev_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvpickev_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvpickev.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

In each 128-bit lane, pick the even-positioned 8-bit elements of b first, then the even-positioned 8-bit elements of a, and store them in the corresponding 128-bit lane of dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = (i < 8) ? b.byte[i * 2] : a.byte[(i - 8) * 2];
+}
+for (int i = 16; i < 32; i++) {
+  dst.byte[i] = (i < 24) ? b.byte[(i - 8) * 2] : a.byte[(i - 16) * 2];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvpickev_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvpickev_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvpickev.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

In each 128-bit lane, pick the even-positioned 16-bit elements of b first, then the even-positioned 16-bit elements of a, and store them in the corresponding 128-bit lane of dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (i < 4) ? b.half[i * 2] : a.half[(i - 4) * 2];
+}
+for (int i = 8; i < 16; i++) {
+  dst.half[i] = (i < 12) ? b.half[(i - 4) * 2] : a.half[(i - 8) * 2];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvpickev_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvpickev_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvpickev.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

In each 128-bit lane, pick the even-positioned 32-bit elements of b first, then the even-positioned 32-bit elements of a, and store them in the corresponding 128-bit lane of dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (i < 2) ? b.word[i * 2] : a.word[(i - 2) * 2];
+}
+for (int i = 4; i < 8; i++) {
+  dst.word[i] = (i < 6) ? b.word[(i - 2) * 2] : a.word[(i - 4) * 2];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvpickev_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvpickev_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvpickev.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

In each 128-bit lane, pick the even-positioned 64-bit element of b first, then the even-positioned 64-bit element of a, and store them in the corresponding 128-bit lane of dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (i < 1) ? b.dword[i * 2] : a.dword[(i - 1) * 2];
+}
+for (int i = 2; i < 4; i++) {
+  dst.dword[i] = (i < 3) ? b.dword[(i - 1) * 2] : a.dword[(i - 2) * 2];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvpickve_w (__m256i a, imm0_7 imm)

+

Synopsis

+
__m256i __lasx_xvpickve_w (__m256i a, imm0_7 imm)
+#include <lasxintrin.h>
+Instruction: xvpickve.w xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Copy one 32-bit lane from a specified by imm to the first lane of dst, and set the other lanes to zero.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (i == 0) ? a.word[imm] : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 3 | 4
3C5000 | 3 | 2
+

__m256i __lasx_xvpickve_d (__m256i a, imm0_3 imm)

+

Synopsis

+
__m256i __lasx_xvpickve_d (__m256i a, imm0_3 imm)
+#include <lasxintrin.h>
+Instruction: xvpickve.d xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Copy one 64-bit lane from a specified by imm to the first lane of dst, and set the other lanes to zero.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (i == 0) ? a.dword[imm] : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 3 | 4
3C5000 | 3 | 2
+

__m256 __lasx_xvpickve_w_f (__m256 a, imm0_7 imm)

+

Synopsis

+
__m256 __lasx_xvpickve_w_f (__m256 a, imm0_7 imm)
+#include <lasxintrin.h>
+Instruction: xvpickve.w xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Copy one 32-bit lane from a specified by imm to the first lane of dst, and set the other lanes to zero.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (i == 0) ? a.word[imm] : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 3 | 4
3C5000 | 3 | 2
+

__m256d __lasx_xvpickve_d_f (__m256d a, imm0_3 imm)

+

Synopsis

+
__m256d __lasx_xvpickve_d_f (__m256d a, imm0_3 imm)
+#include <lasxintrin.h>
+Instruction: xvpickve.d xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Copy one 64-bit lane from a specified by imm to the first lane of dst, and set the other lanes to zero.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (i == 0) ? a.dword[imm] : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 3 | 4
3C5000 | 3 | 2
+

int __lasx_xvpickve2gr_w (__m256i a, imm0_7 idx)

+

Synopsis

+
int __lasx_xvpickve2gr_w (__m256i a, imm0_7 idx)
+#include <lasxintrin.h>
+Instruction: xvpickve2gr.w r, xr, imm
+CPU Flags: LASX
+
+

Description

+

Pick the lane specified by idx from a and store into dst.

+

Operation

+
dst = (s32)a.word[idx];
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 1
3C5000 | 1 | 1
+

unsigned int __lasx_xvpickve2gr_wu (__m256i a, imm0_7 idx)

+

Synopsis

+
unsigned int __lasx_xvpickve2gr_wu (__m256i a, imm0_7 idx)
+#include <lasxintrin.h>
+Instruction: xvpickve2gr.wu r, xr, imm
+CPU Flags: LASX
+
+

Description

+

Pick the lane specified by idx from a and store into dst.

+

Operation

+
dst = (u32)a.word[idx];
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 1
3C5000 | 1 | 1
+

long int __lasx_xvpickve2gr_d (__m256i a, imm0_3 idx)

+

Synopsis

+
long int __lasx_xvpickve2gr_d (__m256i a, imm0_3 idx)
+#include <lasxintrin.h>
+Instruction: xvpickve2gr.d r, xr, imm
+CPU Flags: LASX
+
+

Description

+

Pick the lane specified by idx from a and store into dst.

+

Operation

+
dst = (s64)a.dword[idx];
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 1
3C5000 | 1 | 1
+

unsigned long int __lasx_xvpickve2gr_du (__m256i a, imm0_3 idx)

+

Synopsis

+
unsigned long int __lasx_xvpickve2gr_du (__m256i a, imm0_3 idx)
+#include <lasxintrin.h>
+Instruction: xvpickve2gr.du r, xr, imm
+CPU Flags: LASX
+
+

Description

+

Pick the lane specified by idx from a and store into dst.

+

Operation

+
dst = (u64)a.dword[idx];
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 1
3C5000 | 1 | 1
+

__m256i __lasx_xvpickod_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvpickod_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvpickod.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

In each 128-bit lane, pick the odd-positioned 8-bit elements of b first, then the odd-positioned 8-bit elements of a, and store them in the corresponding 128-bit lane of dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = (i < 8) ? b.byte[i * 2 + 1] : a.byte[(i - 8) * 2 + 1];
+}
+for (int i = 16; i < 32; i++) {
+  dst.byte[i] = (i < 24) ? b.byte[(i - 8) * 2 + 1] : a.byte[(i - 16) * 2 + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvpickod_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvpickod_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvpickod.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

In each 128-bit lane, pick the odd-positioned 16-bit elements of b first, then the odd-positioned 16-bit elements of a, and store them in the corresponding 128-bit lane of dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (i < 4) ? b.half[i * 2 + 1] : a.half[(i - 4) * 2 + 1];
+}
+for (int i = 8; i < 16; i++) {
+  dst.half[i] = (i < 12) ? b.half[(i - 4) * 2 + 1] : a.half[(i - 8) * 2 + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvpickod_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvpickod_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvpickod.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

In each 128-bit lane, pick the odd-positioned 32-bit elements of b first, then the odd-positioned 32-bit elements of a, and store them in the corresponding 128-bit lane of dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (i < 2) ? b.word[i * 2 + 1] : a.word[(i - 2) * 2 + 1];
+}
+for (int i = 4; i < 8; i++) {
+  dst.word[i] = (i < 6) ? b.word[(i - 2) * 2 + 1] : a.word[(i - 4) * 2 + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvpickod_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvpickod_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvpickod.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

In each 128-bit lane, pick the odd-positioned 64-bit element of b first, then the odd-positioned 64-bit element of a, and store them in the corresponding 128-bit lane of dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (i < 1) ? b.dword[i * 2 + 1] : a.dword[(i - 1) * 2 + 1];
+}
+for (int i = 2; i < 4; i++) {
+  dst.dword[i] = (i < 3) ? b.dword[(i - 1) * 2 + 1] : a.dword[(i - 2) * 2 + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvrepli_b (imm_n512_511 imm)

+

Synopsis

+
__m256i __lasx_xvrepli_b (imm_n512_511 imm)
+#include <lasxintrin.h>
+Instruction: xvldi xr, imm
+CPU Flags: LASX
+
+

Description

+

Repeat imm as 8-bit elements to fill the whole vector.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = imm;
+}
+
+

Tested on real machine.

+

__m256i __lasx_xvrepli_h (imm_n512_511 imm)

+

Synopsis

+
__m256i __lasx_xvrepli_h (imm_n512_511 imm)
+#include <lasxintrin.h>
+Instruction: xvldi xr, imm
+CPU Flags: LASX
+
+

Description

+

Repeat imm as 16-bit elements to fill the whole vector.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = imm;
+}
+
+

Tested on real machine.

+

__m256i __lasx_xvrepli_w (imm_n512_511 imm)

+

Synopsis

+
__m256i __lasx_xvrepli_w (imm_n512_511 imm)
+#include <lasxintrin.h>
+Instruction: xvldi xr, imm
+CPU Flags: LASX
+
+

Description

+

Repeat imm as 32-bit elements to fill the whole vector.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = imm;
+}
+
+

Tested on real machine.

+

__m256i __lasx_xvrepli_d (imm_n512_511 imm)

+

Synopsis

+
__m256i __lasx_xvrepli_d (imm_n512_511 imm)
+#include <lasxintrin.h>
+Instruction: xvldi xr, imm
+CPU Flags: LASX
+
+

Description

+

Repeat imm as 64-bit elements to fill the whole vector.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = imm;
+}
+
+

Tested on real machine.

+

__m256i __lasx_xvreplgr2vr_b (int val)

+

Synopsis

+
__m256i __lasx_xvreplgr2vr_b (int val)
+#include <lasxintrin.h>
+Instruction: xvreplgr2vr.b xr, r
+CPU Flags: LASX
+
+

Description

+

Repeat val as 8-bit elements to fill the whole vector.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = val;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | N/A | 1
3C5000 | N/A | 1
+

__m256i __lasx_xvreplgr2vr_h (int val)

+

Synopsis

+
__m256i __lasx_xvreplgr2vr_h (int val)
+#include <lasxintrin.h>
+Instruction: xvreplgr2vr.h xr, r
+CPU Flags: LASX
+
+

Description

+

Repeat val as 16-bit elements to fill the whole vector.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = val;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | N/A | 1
3C5000 | N/A | 1
+

__m256i __lasx_xvreplgr2vr_w (int val)

+

Synopsis

+
__m256i __lasx_xvreplgr2vr_w (int val)
+#include <lasxintrin.h>
+Instruction: xvreplgr2vr.w xr, r
+CPU Flags: LASX
+
+

Description

+

Repeat val as 32-bit elements to fill the whole vector.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = val;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | N/A | 1
3C5000 | N/A | 1
+

__m256i __lasx_xvreplgr2vr_d (long int val)

+

Synopsis

+
__m256i __lasx_xvreplgr2vr_d (long int val)
+#include <lasxintrin.h>
+Instruction: xvreplgr2vr.d xr, r
+CPU Flags: LASX
+
+

Description

+

Repeat val as 64-bit elements to fill the whole vector.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = val;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | N/A | 1
3C5000 | N/A | 1
+

__m256i __lasx_xvreplve_b (__m256i a, int idx)

+

Synopsis

+
__m256i __lasx_xvreplve_b (__m256i a, int idx)
+#include <lasxintrin.h>
+Instruction: xvreplve.b xr, xr, r
+CPU Flags: LASX
+
+

Description

+

Repeat the 8-bit element at index idx (taken modulo 16) within each 128-bit lane of a to fill the corresponding 128-bit lane of dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = a.byte[idx % 16];
+}
+for (int i = 16; i < 32; i++) {
+  dst.byte[i] = a.byte[(idx % 16) + 16];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 1
3C5000 | 1 | 1
+

__m256i __lasx_xvreplve_h (__m256i a, int idx)

+

Synopsis

+
__m256i __lasx_xvreplve_h (__m256i a, int idx)
+#include <lasxintrin.h>
+Instruction: xvreplve.h xr, xr, r
+CPU Flags: LASX
+
+

Description

+

Repeat the 16-bit element at index idx (taken modulo 8) within each 128-bit lane of a to fill the corresponding 128-bit lane of dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = a.half[idx % 8];
+}
+for (int i = 8; i < 16; i++) {
+  dst.half[i] = a.half[(idx % 8) + 8];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 1
3C5000 | 1 | 1
+

__m256i __lasx_xvreplve_w (__m256i a, int idx)

+

Synopsis

+
__m256i __lasx_xvreplve_w (__m256i a, int idx)
+#include <lasxintrin.h>
+Instruction: xvreplve.w xr, xr, r
+CPU Flags: LASX
+
+

Description

+

Repeat the 32-bit element at index idx (taken modulo 4) within each 128-bit lane of a to fill the corresponding 128-bit lane of dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = a.word[idx % 4];
+}
+for (int i = 4; i < 8; i++) {
+  dst.word[i] = a.word[(idx % 4) + 4];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 1
3C5000 | 1 | 1
+

__m256i __lasx_xvreplve_d (__m256i a, int idx)

+

Synopsis

+
__m256i __lasx_xvreplve_d (__m256i a, int idx)
+#include <lasxintrin.h>
+Instruction: xvreplve.d xr, xr, r
+CPU Flags: LASX
+
+

Description

+

Repeat the 64-bit element at index idx (taken modulo 2) within each 128-bit lane of a to fill the corresponding 128-bit lane of dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = a.dword[idx % 2];
+}
+for (int i = 2; i < 4; i++) {
+  dst.dword[i] = a.dword[(idx % 2) + 2];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 1
3C5000 | 1 | 1
+

__m256i __lasx_xvreplve0_b (__m256i a)

+

Synopsis

+
__m256i __lasx_xvreplve0_b (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvreplve0.b xr, xr
+CPU Flags: LASX
+
+

Description

+

Repeat the first 8-bit lane from a to all lanes of dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = a.byte[0];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 3 | 4
3C5000 | 3 | 2
+

__m256i __lasx_xvreplve0_h (__m256i a)

+

Synopsis

+
__m256i __lasx_xvreplve0_h (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvreplve0.h xr, xr
+CPU Flags: LASX
+
+

Description

+

Repeat the first 16-bit lane from a to all lanes of dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = a.half[0];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 3 | 4
3C5000 | 3 | 2
+

__m256i __lasx_xvreplve0_w (__m256i a)

+

Synopsis

+
__m256i __lasx_xvreplve0_w (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvreplve0.w xr, xr
+CPU Flags: LASX
+
+

Description

+

Repeat the first 32-bit lane from a to all lanes of dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = a.word[0];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 3 | 4
3C5000 | 3 | 2
+

__m256i __lasx_xvreplve0_d (__m256i a)

+

Synopsis

+
__m256i __lasx_xvreplve0_d (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvreplve0.d xr, xr
+CPU Flags: LASX
+
+

Description

+

Repeat the first 64-bit lane from a to all lanes of dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = a.dword[0];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 3 | 4
3C5000 | 3 | 2
+

__m256i __lasx_xvreplve0_q (__m256i a)

+

Synopsis

+
__m256i __lasx_xvreplve0_q (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvreplve0.q xr, xr
+CPU Flags: LASX
+
+

Description

+

Repeat the first 128-bit lane from a to all lanes of dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.qword[i] = a.qword[0];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 3 | 4
3C5000 | 3 | 2
+

__m256i __lasx_xvrepl128vei_b (__m256i a, imm0_15 idx)

+

Synopsis

+
__m256i __lasx_xvrepl128vei_b (__m256i a, imm0_15 idx)
+#include <lasxintrin.h>
+Instruction: xvrepl128vei.b xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Repeat the 8-bit element at index idx within each 128-bit lane of a to fill the corresponding 128-bit lane of dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = a.byte[idx];
+}
+for (int i = 16; i < 32; i++) {
+  dst.byte[i] = a.byte[idx + 16];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvrepl128vei_h (__m256i a, imm0_7 idx)

+

Synopsis

+
__m256i __lasx_xvrepl128vei_h (__m256i a, imm0_7 idx)
+#include <lasxintrin.h>
+Instruction: xvrepl128vei.h xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Repeat the 16-bit element at index idx within each 128-bit lane of a to fill the corresponding 128-bit lane of dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = a.half[idx];
+}
+for (int i = 8; i < 16; i++) {
+  dst.half[i] = a.half[idx + 8];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvrepl128vei_w (__m256i a, imm0_3 idx)

+

Synopsis

+
__m256i __lasx_xvrepl128vei_w (__m256i a, imm0_3 idx)
+#include <lasxintrin.h>
+Instruction: xvrepl128vei.w xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Repeat the 32-bit element at index idx within each 128-bit lane of a to fill the corresponding 128-bit lane of dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = a.word[idx];
+}
+for (int i = 4; i < 8; i++) {
+  dst.word[i] = a.word[idx + 4];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvrepl128vei_d (__m256i a, imm0_1 idx)

+

Synopsis

+
__m256i __lasx_xvrepl128vei_d (__m256i a, imm0_1 idx)
+#include <lasxintrin.h>
+Instruction: xvrepl128vei.d xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Repeat the 64-bit element at index idx within each 128-bit lane of a to fill the corresponding 128-bit lane of dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = a.dword[idx];
+}
+for (int i = 2; i < 4; i++) {
+  dst.dword[i] = a.dword[idx + 2];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvsat_b (__m256i a, imm0_7 imm)

+

Synopsis

+
__m256i __lasx_xvsat_b (__m256i a, imm0_7 imm)
+#include <lasxintrin.h>
+Instruction: xvsat.b xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Clamp signed 8-bit elements in a to the range [-2^imm, 2^imm - 1].

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = clamp<s8>(a.byte[i], -(1 << imm), (1 << imm) - 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m256i __lasx_xvsat_bu (__m256i a, imm0_7 imm)

+

Synopsis

+
__m256i __lasx_xvsat_bu (__m256i a, imm0_7 imm)
+#include <lasxintrin.h>
+Instruction: xvsat.bu xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Clamp unsigned 8-bit elements in a to the range [0, 2^(imm+1) - 1].

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = clamp<u8>(a.byte[i], 0, (1 << (imm + 1)) - 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m256i __lasx_xvsat_h (__m256i a, imm0_15 imm)

+

Synopsis

+
__m256i __lasx_xvsat_h (__m256i a, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvsat.h xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Clamp signed 16-bit elements in a to the range [-2^imm, 2^imm - 1].

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = clamp<s16>(a.half[i], -(1 << imm), (1 << imm) - 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m256i __lasx_xvsat_hu (__m256i a, imm0_15 imm)

+

Synopsis

+
__m256i __lasx_xvsat_hu (__m256i a, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvsat.hu xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Clamp unsigned 16-bit elements in a to the range [0, 2^(imm+1) - 1].

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = clamp<u16>(a.half[i], 0, (1 << (imm + 1)) - 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m256i __lasx_xvsat_w (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvsat_w (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvsat.w xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Clamp signed 32-bit elements in a to the range [-2^imm, 2^imm - 1].

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = clamp<s32>(a.word[i], -(1 << imm), (1 << imm) - 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m256i __lasx_xvsat_wu (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvsat_wu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvsat.wu xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Clamp unsigned 32-bit elements in a to the range [0, 2^(imm+1) - 1].

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = clamp<u32>(a.word[i], 0, (1 << (imm + 1)) - 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m256i __lasx_xvsat_d (__m256i a, imm0_63 imm)

+

Synopsis

+
__m256i __lasx_xvsat_d (__m256i a, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvsat.d xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Clamp signed 64-bit elements in a to the range [-2^imm, 2^imm - 1].

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = clamp<s64>(a.dword[i], -(1 << imm), (1 << imm) - 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m256i __lasx_xvsat_du (__m256i a, imm0_63 imm)

+

Synopsis

+
__m256i __lasx_xvsat_du (__m256i a, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvsat.du xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Clamp unsigned 64-bit elements in a to the range [0, 2^(imm+1) - 1].

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = clamp<u64>(a.dword[i], 0, (1 << (imm + 1)) - 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m256i __lasx_xvsigncov_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsigncov_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsigncov.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

If the 8-bit element in a equals zero, set the result to zero. If the signed 8-bit element in a is positive, copy the element in b to the result. Otherwise, copy the negated element in b to the result. If a and b are the same vector, this is equivalent to computing the absolute value.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] =
+      (a.byte[i] == 0) ? 0 : ((s8)a.byte[i] > 0 ? b.byte[i] : -b.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 2
3C5000 | 1 | 2
+

__m256i __lasx_xvsigncov_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsigncov_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsigncov.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

If the 16-bit element in a equals zero, set the result to zero. If the signed 16-bit element in a is positive, copy the element in b to the result. Otherwise, copy the negated element in b to the result. If a and b are the same vector, this is equivalent to computing the absolute value.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] =
+      (a.half[i] == 0) ? 0 : ((s16)a.half[i] > 0 ? b.half[i] : -b.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 2
3C5000 | 1 | 2
+

__m256i __lasx_xvsigncov_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsigncov_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsigncov.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

If the 32-bit element in a equals zero, set the result to zero. If the signed 32-bit element in a is positive, copy the element in b to the result. Otherwise, copy the negated element in b to the result. If a and b are the same vector, this is equivalent to computing the absolute value.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] =
+      (a.word[i] == 0) ? 0 : ((s32)a.word[i] > 0 ? b.word[i] : -b.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 2
3C5000 | 1 | 2
+

__m256i __lasx_xvsigncov_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsigncov_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsigncov.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

If the 64-bit element in a equals zero, set the result to zero. If the signed 64-bit element in a is positive, copy the element in b to the result. Otherwise, copy the negated element in b to the result. If a and b are the same vector, this is equivalent to computing the absolute value.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] =
+      (a.dword[i] == 0) ? 0 : ((s64)a.dword[i] > 0 ? b.dword[i] : -b.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 2
3C5000 | 1 | 2
+

__m256i __lasx_xvldi (imm_n1024_1023 imm)

+

Synopsis

+
__m256i __lasx_xvldi (imm_n1024_1023 imm)
+#include <lasxintrin.h>
+Instruction: xvldi xr, imm
+CPU Flags: LASX
+
+

Description

+

Initialize dst using predefined patterns:

+
    +
  • imm[12:10]=0b000: broadcast imm[7:0] as 8-bit elements to all lanes
  • +
  • imm[12:10]=0b001: broadcast sign-extended imm[9:0] as 16-bit elements to all lanes
  • +
  • imm[12:10]=0b010: broadcast sign-extended imm[9:0] as 32-bit elements to all lanes
  • +
  • imm[12:10]=0b011: broadcast sign-extended imm[9:0] as 64-bit elements to all lanes
  • +
  • imm[12:8]=0b10000: broadcast imm[7:0] as 32-bit elements to all lanes
  • +
  • imm[12:8]=0b10001: broadcast imm[7:0] << 8 as 32-bit elements to all lanes
  • +
  • imm[12:8]=0b10010: broadcast imm[7:0] << 16 as 32-bit elements to all lanes
  • +
  • imm[12:8]=0b10011: broadcast imm[7:0] << 24 as 32-bit elements to all lanes
  • +
  • imm[12:8]=0b10100: broadcast imm[7:0] as 16-bit elements to all lanes
  • +
  • imm[12:8]=0b10101: broadcast imm[7:0] << 8 as 16-bit elements to all lanes
  • +
  • imm[12:8]=0b10110: broadcast (imm[7:0] << 8) | 0xFF as 32-bit elements to all lanes
  • +
  • imm[12:8]=0b10111: broadcast (imm[7:0] << 16) | 0xFFFF as 32-bit elements to all lanes
  • +
  • imm[12:8]=0b11000: broadcast imm[7:0] as 8-bit elements to all lanes
  • +
  • imm[12:8]=0b11001: repeat each bit of imm[7:0] eight times, and broadcast the result as 64-bit elements to all lanes
  • +
  • imm[12:8]=0b11010: broadcast (imm[7] << 31) | ((1-imm[6]) << 30) | ((imm[6] * 0x1F) << 25) | (imm[5:0] << 19) as 32-bit elements to all lanes
  • +
  • imm[12:8]=0b11011: broadcast (imm[7] << 31) | ((1-imm[6]) << 30) | ((imm[6] * 0x1F) << 25) | (imm[5:0] << 19) as 64-bit elements to all lanes
  • +
  • imm[12:8]=0b11100: broadcast (imm[7] << 63) | ((1-imm[6]) << 62) | ((imm[6] * 0xFF) << 54) | (imm[5:0] << 48) as 64-bit elements to all lanes
  • +
+

Operation

+
u64 imm12_10 = (imm >> 10) & 0b111;
+u64 imm12_8 = (imm >> 8) & 0b11111;
+u64 imm9_0 = imm & 0x3FF;
+s64 simm9_0 = ((s64)imm9_0 << 54) >> 54;
+u64 imm7_0 = imm & 0xFF;
+u64 imm7 = (imm >> 7) & 0x1;
+u64 imm6 = (imm >> 6) & 0x1;
+u64 imm5 = (imm >> 5) & 0x1;
+u64 imm5_0 = imm & 0x3F;
+u64 imm4 = (imm >> 4) & 0x1;
+u64 imm3 = (imm >> 3) & 0x1;
+u64 imm2 = (imm >> 2) & 0x1;
+u64 imm1 = (imm >> 1) & 0x1;
+u64 imm0 = imm & 0x1;
+
+u64 broadcast_value;
+u64 broadcast_width;
+if (imm12_10 == 0b000) {
+  broadcast_value = imm7_0;
+  broadcast_width = 8;
+} else if (imm12_10 == 0b001) {
+  broadcast_value = simm9_0;
+  broadcast_width = 16;
+} else if (imm12_10 == 0b010) {
+  broadcast_value = simm9_0;
+  broadcast_width = 32;
+} else if (imm12_10 == 0b011) {
+  broadcast_value = simm9_0;
+  broadcast_width = 64;
+} else if (imm12_8 == 0b10000) {
+  broadcast_value = imm7_0;
+  broadcast_width = 32;
+} else if (imm12_8 == 0b10001) {
+  broadcast_value = imm7_0 << 8;
+  broadcast_width = 32;
+} else if (imm12_8 == 0b10010) {
+  broadcast_value = imm7_0 << 16;
+  broadcast_width = 32;
+} else if (imm12_8 == 0b10011) {
+  broadcast_value = imm7_0 << 24;
+  broadcast_width = 32;
+} else if (imm12_8 == 0b10100) {
+  broadcast_value = imm7_0;
+  broadcast_width = 16;
+} else if (imm12_8 == 0b10101) {
+  broadcast_value = imm7_0 << 8;
+  broadcast_width = 16;
+} else if (imm12_8 == 0b10110) {
+  broadcast_value = (imm7_0 << 8) | 0xFF;
+  broadcast_width = 32;
+} else if (imm12_8 == 0b10111) {
+  broadcast_value = (imm7_0 << 16) | 0xFFFF;
+  broadcast_width = 32;
+} else if (imm12_8 == 0b11000) {
+  broadcast_value = imm7_0;
+  broadcast_width = 8;
+} else if (imm12_8 == 0b11001) {
+  broadcast_value = imm0 * 0xFF + imm1 * 0xFF00 + imm2 * 0xFF0000 +
+                    imm3 * 0xFF000000 + imm4 * 0xFF00000000 +
+                    imm5 * 0xFF0000000000 + imm6 * 0xFF000000000000 +
+                    imm7 * 0xFF00000000000000;
+  broadcast_width = 64;
+} else if (imm12_8 == 0b11010) {
+  broadcast_value = (imm7 << 31) | ((1 - imm6) << 30) | ((imm6 * 0x1F) << 25) |
+                    (imm5_0 << 19);
+  broadcast_width = 32;
+} else if (imm12_8 == 0b11011) {
+  broadcast_value = (imm7 << 31) | ((1 - imm6) << 30) | ((imm6 * 0x1F) << 25) |
+                    (imm5_0 << 19);
+  broadcast_width = 64;
+} else if (imm12_8 == 0b11100) {
+  broadcast_value = (imm7 << 63) | ((1 - imm6) << 62) | ((imm6 * 0xFF) << 54) |
+                    (imm5_0 << 48);
+  broadcast_width = 64;
+}
+
+if (broadcast_width == 8) {
+  for (int i = 0; i < 32; i++) {
+    dst.byte[i] = broadcast_value;
+  }
+} else if (broadcast_width == 16) {
+  for (int i = 0; i < 16; i++) {
+    dst.half[i] = broadcast_value;
+  }
+} else if (broadcast_width == 32) {
+  for (int i = 0; i < 8; i++) {
+    dst.word[i] = broadcast_value;
+  }
+} else if (broadcast_width == 64) {
+  for (int i = 0; i < 4; i++) {
+    dst.dword[i] = broadcast_value;
+  }
+}
+
+ +
+
+ +
+
+ +
+ +
+ +
+ + + + « Previous + + + Next » + + +
+ + + + + + + + diff --git a/lasx/permutation/index.html b/lasx/permutation/index.html new file mode 100644 index 00000000..26b7fbfe --- /dev/null +++ b/lasx/permutation/index.html @@ -0,0 +1,411 @@ + + + + + + + + Permutation - Unofficial LoongArch Intrinsics Guide + + + + + + + + + + + + + + +
+ + +
+ +
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Permutation

+

__m256i __lasx_xvpermi_w (__m256i a, __m256i b, imm0_255 imm)

+

Synopsis

+
__m256i __lasx_xvpermi_w (__m256i a, __m256i b, imm0_255 imm)
+#include <lasxintrin.h>
+Instruction: xvpermi.w xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Permute words from a and b with indices recorded in imm and store into dst.

+

Operation

+
dst.word[0] = b.word[imm & 0x3];
+dst.word[1] = b.word[(imm >> 2) & 0x3];
+dst.word[2] = a.word[(imm >> 4) & 0x3];
+dst.word[3] = a.word[(imm >> 6) & 0x3];
+dst.word[4] = b.word[4 + (imm & 0x3)];
+dst.word[5] = b.word[4 + ((imm >> 2) & 0x3)];
+dst.word[6] = a.word[4 + ((imm >> 4) & 0x3)];
+dst.word[7] = a.word[4 + ((imm >> 6) & 0x3)];
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvpermi_d (__m256i a, imm0_255 imm)

+

Synopsis

+
__m256i __lasx_xvpermi_d (__m256i a, imm0_255 imm)
+#include <lasxintrin.h>
+Instruction: xvpermi.d xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Permute double words from a with indices recorded in imm and store into dst.

+

Operation

+
dst.dword[0] = a.dword[imm & 0x3];
+dst.dword[1] = a.dword[(imm >> 2) & 0x3];
+dst.dword[2] = a.dword[(imm >> 4) & 0x3];
+dst.dword[3] = a.dword[(imm >> 6) & 0x3];
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600034
3C500032
+

__m256i __lasx_xvpermi_q (__m256i a, __m256i b, imm0_255 imm)

+

Synopsis

+
__m256i __lasx_xvpermi_q (__m256i a, __m256i b, imm0_255 imm)
+#include <lasxintrin.h>
+Instruction: xvpermi.q xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Permute quad words from a and b with indices recorded in imm and store into dst.

+

Operation

+
if ((imm & 0x4) && MACHINE_3C5000) {
+  // Caveat: observed in 3C5000
+  dst.qword[0] = 0;
+} else {
+  dst.qword[0] = (imm & 2) ? a.qword[imm & 0x1] : b.qword[imm & 0x1];
+}
+if ((imm & 0x80) && MACHINE_3C5000) {
+  // Caveat: observed in 3C5000
+  dst.qword[1] = 0;
+} else {
+  dst.qword[1] =
+      (imm & 0x20) ? a.qword[(imm >> 4) & 0x1] : b.qword[(imm >> 4) & 0x1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600034
3C500032
+

__m256i __lasx_xvperm_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvperm_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvperm.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Permute words from a with indices recorded in b and store into dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = a.word[b.word[i] % 0x8];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600034
3C500032
+ +
+
+ +
+
+ +
+ +
+ +
+ + + + « Previous + + + Next » + + +
+ + + + + + + + diff --git a/lasx/shift/index.html b/lasx/shift/index.html new file mode 100644 index 00000000..b0788626 --- /dev/null +++ b/lasx/shift/index.html @@ -0,0 +1,8908 @@ + + + + + + + + Shift - Unofficial LoongArch Intrinsics Guide + + + + + + + + + + + + + + +
+ + +
+ +
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Shift

+

__m256i __lasx_xvbsll_v (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvbsll_v (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvbsll.v xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compute each 128-bit lane of a shifted left by imm * 8 bits (the shift amount is taken modulo 128).

+

Operation

+
int shift = (imm * 8) % 128;
+dst.qword[0] = (u128)a.qword[0] << shift;
+dst.qword[1] = (u128)a.qword[1] << shift;
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvbsrl_v (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvbsrl_v (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvbsrl.v xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compute each 128-bit lane of a shifted right by imm * 8 bits (the shift amount is taken modulo 128).

+

Operation

+
int shift = (imm * 8) % 128;
+dst.qword[0] = (u128)a.qword[0] >> shift;
+dst.qword[1] = (u128)a.qword[1] >> shift;
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsll_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsll_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsll.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Logical left shift the unsigned 8-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = a.byte[i] << (b.byte[i] & 0x7);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsll_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsll_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsll.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Logical left shift the unsigned 16-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = a.half[i] << (b.half[i] & 0xf);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsll_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsll_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsll.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Logical left shift the unsigned 32-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = a.word[i] << (b.word[i] & 0x1f);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsll_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsll_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsll.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Logical left shift the unsigned 64-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = a.dword[i] << (b.dword[i] & 0x3f);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvslli_b (__m256i a, imm0_7 imm)

+

Synopsis

+
__m256i __lasx_xvslli_b (__m256i a, imm0_7 imm)
+#include <lasxintrin.h>
+Instruction: xvslli.b xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Logical left shift the unsigned 8-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = a.byte[i] << imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvslli_h (__m256i a, imm0_15 imm)

+

Synopsis

+
__m256i __lasx_xvslli_h (__m256i a, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvslli.h xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Logical left shift the unsigned 16-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = a.half[i] << imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvslli_w (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvslli_w (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvslli.w xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Logical left shift the unsigned 32-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = a.word[i] << imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvslli_d (__m256i a, imm0_63 imm)

+

Synopsis

+
__m256i __lasx_xvslli_d (__m256i a, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvslli.d xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Logical left shift the unsigned 64-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = a.dword[i] << imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsllwil_h_b (__m256i a, imm0_7 imm)

+

Synopsis

+
__m256i __lasx_xvsllwil_h_b (__m256i a, imm0_7 imm)
+#include <lasxintrin.h>
+Instruction: xvsllwil.h.b xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Extend and shift signed 8-bit elements in a by imm to signed 16-bit result.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (s16)(s8)a.byte[i] << imm;
+}
+for (int i = 8; i < 16; i++) {
+  dst.half[i] = (s16)(s8)a.byte[i + 8] << imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500021
+

__m256i __lasx_xvsllwil_hu_bu (__m256i a, imm0_7 imm)

+

Synopsis

+
__m256i __lasx_xvsllwil_hu_bu (__m256i a, imm0_7 imm)
+#include <lasxintrin.h>
+Instruction: xvsllwil.hu.bu xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Extend and shift unsigned 8-bit elements in a by imm to unsigned 16-bit result.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (u16)(u8)a.byte[i] << imm;
+}
+for (int i = 8; i < 16; i++) {
+  dst.half[i] = (u16)(u8)a.byte[i + 8] << imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500021
+

__m256i __lasx_xvsllwil_w_h (__m256i a, imm0_15 imm)

+

Synopsis

+
__m256i __lasx_xvsllwil_w_h (__m256i a, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvsllwil.w.h xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Extend and shift signed 16-bit elements in a by imm to signed 32-bit result.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (s32)(s16)a.half[i] << imm;
+}
+for (int i = 4; i < 8; i++) {
+  dst.word[i] = (s32)(s16)a.half[i + 4] << imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500021
+

__m256i __lasx_xvsllwil_wu_hu (__m256i a, imm0_15 imm)

+

Synopsis

+
__m256i __lasx_xvsllwil_wu_hu (__m256i a, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvsllwil.wu.hu xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Extend and shift unsigned 16-bit elements in a by imm to unsigned 32-bit result.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (u32)(u16)a.half[i] << imm;
+}
+for (int i = 4; i < 8; i++) {
+  dst.word[i] = (u32)(u16)a.half[i + 4] << imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500021
+

__m256i __lasx_xvsllwil_d_w (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvsllwil_d_w (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvsllwil.d.w xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Extend and shift signed 32-bit elements in a by imm to signed 64-bit result.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (s64)(s32)a.word[i] << imm;
+}
+for (int i = 2; i < 4; i++) {
+  dst.dword[i] = (s64)(s32)a.word[i + 2] << imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500021
+

__m256i __lasx_xvsllwil_du_wu (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvsllwil_du_wu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvsllwil.du.wu xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Extend and shift unsigned 32-bit elements in a by imm to unsigned 64-bit result.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (u64)(u32)a.word[i] << imm;
+}
+for (int i = 2; i < 4; i++) {
+  dst.dword[i] = (u64)(u32)a.word[i + 2] << imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500021
+

__m256i __lasx_xvsra_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsra_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsra.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift the signed 8-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = ((s8)a.byte[i]) >> (b.byte[i] & 0x7);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsra_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsra_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsra.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift the signed 16-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = ((s16)a.half[i]) >> (b.half[i] & 0xf);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsra_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsra_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsra.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift the signed 32-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = ((s32)a.word[i]) >> (b.word[i] & 0x1f);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsra_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsra_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsra.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift the signed 64-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = ((s64)a.dword[i]) >> (b.dword[i] & 0x3f);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsrai_b (__m256i a, imm0_7 imm)

+

Synopsis

+
__m256i __lasx_xvsrai_b (__m256i a, imm0_7 imm)
+#include <lasxintrin.h>
+Instruction: xvsrai.b xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift the signed 8-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = ((s8)a.byte[i]) >> imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsrai_h (__m256i a, imm0_15 imm)

+

Synopsis

+
__m256i __lasx_xvsrai_h (__m256i a, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvsrai.h xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift the signed 16-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = ((s16)a.half[i]) >> imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsrai_w (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvsrai_w (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvsrai.w xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift the signed 32-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = ((s32)a.word[i]) >> imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsrai_d (__m256i a, imm0_63 imm)

+

Synopsis

+
__m256i __lasx_xvsrai_d (__m256i a, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvsrai.d xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift the signed 64-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = ((s64)a.dword[i]) >> imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsran_b_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsran_b_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsran.b.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift the signed 16-bit elements in a by elements in b, truncate to 8-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = (i < 8) ? (s8)((s16)a.half[i] >> (b.half[i] & 15)) : 0;
+}
+for (int i = 16; i < 32; i++) {
+  dst.byte[i] = (i < 24) ? (s8)((s16)a.half[i - 8] >> (b.half[i - 8] & 15)) : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500021
+

__m256i __lasx_xvsran_h_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsran_h_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsran.h.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift the signed 32-bit elements in a by elements in b, truncate to 16-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (i < 4) ? (s16)((s32)a.word[i] >> (b.word[i] & 31)) : 0;
+}
+for (int i = 8; i < 16; i++) {
+  dst.half[i] =
+      (i < 12) ? (s16)((s32)a.word[i - 4] >> (b.word[i - 4] & 31)) : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500021
+

__m256i __lasx_xvsran_w_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsran_w_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsran.w.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift the signed 64-bit elements in a by elements in b, truncate to 32-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (i < 2) ? (s32)((s64)a.dword[i] >> (b.dword[i] & 63)) : 0;
+}
+for (int i = 4; i < 8; i++) {
+  dst.word[i] =
+      (i < 6) ? (s32)((s64)a.dword[i - 2] >> (b.dword[i - 2] & 63)) : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500021
+

__m256i __lasx_xvsrani_b_h (__m256i a, __m256i b, imm0_15 imm)

+

Synopsis

+
__m256i __lasx_xvsrani_b_h (__m256i a, __m256i b, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvsrani.b.h xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift the signed 16-bit elements in a and b by imm, truncate to 8-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] =
+      (i < 8) ? (s8)((s16)b.half[i] >> imm) : (s8)((s16)a.half[i - 8] >> imm);
+}
+for (int i = 16; i < 32; i++) {
+  dst.byte[i] = (i < 24) ? (s8)((s16)b.half[i - 8] >> imm)
+                         : (s8)((s16)a.half[i - 16] >> imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m256i __lasx_xvsrani_h_w (__m256i a, __m256i b, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvsrani_h_w (__m256i a, __m256i b, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvsrani.h.w xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift the signed 32-bit elements in a and b by imm, truncate to 16-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] =
+      (i < 4) ? (s16)((s32)b.word[i] >> imm) : (s16)((s32)a.word[i - 4] >> imm);
+}
+for (int i = 8; i < 16; i++) {
+  dst.half[i] = (i < 12) ? (s16)((s32)b.word[i - 4] >> imm)
+                         : (s16)((s32)a.word[i - 8] >> imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m256i __lasx_xvsrani_w_d (__m256i a, __m256i b, imm0_63 imm)

+

Synopsis

+
__m256i __lasx_xvsrani_w_d (__m256i a, __m256i b, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvsrani.w.d xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift the signed 64-bit elements in a and b by imm, truncate to 32-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (i < 2) ? (s32)((s64)b.dword[i] >> imm)
+                        : (s32)((s64)a.dword[i - 2] >> imm);
+}
+for (int i = 4; i < 8; i++) {
+  dst.word[i] = (i < 6) ? (s32)((s64)b.dword[i - 2] >> imm)
+                        : (s32)((s64)a.dword[i - 4] >> imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m256i __lasx_xvsrani_d_q (__m256i a, __m256i b, imm0_127 imm)

+

Synopsis

+
__m256i __lasx_xvsrani_d_q (__m256i a, __m256i b, imm0_127 imm)
+#include <lasxintrin.h>
+Instruction: xvsrani.d.q xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift the signed 128-bit elements in a and b by imm, truncate to 64-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (i < 1) ? (s64)((s128)b.qword[i] >> imm)
+                         : (s64)((s128)a.qword[i - 1] >> imm);
+}
+for (int i = 2; i < 4; i++) {
+  dst.dword[i] = (i < 3) ? (s64)((s128)b.qword[i - 1] >> imm)
+                         : (s64)((s128)a.qword[i - 2] >> imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m256i __lasx_xvsrar_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsrar_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrar.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 8-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  if ((b.byte[i] & 0x7) == 0) {
+    dst.byte[i] = a.byte[i];
+  } else {
+    dst.byte[i] = ((s8)a.byte[i] >> (b.byte[i] & 0x7)) +
+                  (((s8)a.byte[i] >> ((b.byte[i] & 0x7) - 1)) & 0x1);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m256i __lasx_xvsrar_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsrar_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrar.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 16-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if ((b.half[i] & 0xf) == 0) {
+    dst.half[i] = a.half[i];
+  } else {
+    dst.half[i] = ((s16)a.half[i] >> (b.half[i] & 0xf)) +
+                  (((s16)a.half[i] >> ((b.half[i] & 0xf) - 1)) & 0x1);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m256i __lasx_xvsrar_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsrar_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrar.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 32-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if ((b.word[i] & 0x1f) == 0) {
+    dst.word[i] = a.word[i];
+  } else {
+    dst.word[i] = ((s32)a.word[i] >> (b.word[i] & 0x1f)) +
+                  (((s32)a.word[i] >> ((b.word[i] & 0x1f) - 1)) & 0x1);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m256i __lasx_xvsrar_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsrar_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrar.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 64-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if ((b.dword[i] & 0x3f) == 0) {
+    dst.dword[i] = a.dword[i];
+  } else {
+    dst.dword[i] = ((s64)a.dword[i] >> (b.dword[i] & 0x3f)) +
+                   (((s64)a.dword[i] >> ((b.dword[i] & 0x3f) - 1)) & 0x1);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m256i __lasx_xvsrari_b (__m256i a, imm0_7 imm)

+

Synopsis

+
__m256i __lasx_xvsrari_b (__m256i a, imm0_7 imm)
+#include <lasxintrin.h>
+Instruction: xvsrari.b xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 8-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  if (imm == 0) {
+    dst.byte[i] = a.byte[i];
+  } else {
+    dst.byte[i] = ((s8)a.byte[i] >> imm) + (((s8)a.byte[i] >> (imm - 1)) & 0x1);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m256i __lasx_xvsrari_h (__m256i a, imm0_15 imm)

+

Synopsis

+
__m256i __lasx_xvsrari_h (__m256i a, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvsrari.h xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 16-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (imm == 0) {
+    dst.half[i] = a.half[i];
+  } else {
+    dst.half[i] =
+        ((s16)a.half[i] >> imm) + (((s16)a.half[i] >> (imm - 1)) & 0x1);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m256i __lasx_xvsrari_w (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvsrari_w (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvsrari.w xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 32-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (imm == 0) {
+    dst.word[i] = a.word[i];
+  } else {
+    dst.word[i] =
+        ((s32)a.word[i] >> imm) + (((s32)a.word[i] >> (imm - 1)) & 0x1);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m256i __lasx_xvsrari_d (__m256i a, imm0_63 imm)

+

Synopsis

+
__m256i __lasx_xvsrari_d (__m256i a, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvsrari.d xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 64-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (imm == 0) {
+    dst.dword[i] = a.dword[i];
+  } else {
+    dst.dword[i] =
+        ((s64)a.dword[i] >> imm) + (((s64)a.dword[i] >> (imm - 1)) & 0x1);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m256i __lasx_xvsrarn_b_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsrarn_b_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrarn.b.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

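Arithmetic right shift (with rounding) the signed 16-bit elements in a by the corresponding elements in b (only the low 4 bits of each shift amount are used), truncate the results to 8 bits, and store them in the lower half of each 128-bit lane of dst. The upper half of each 128-bit lane of dst is set to zero.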

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    u8 shift = (b.half[i] & 15);
+    if (shift == 0) {
+      dst.byte[i] = (s8)(s16)a.half[i];
+    } else {
+      dst.byte[i] = (s8)(((s16)a.half[i] >> shift) +
+                         (((s16)a.half[i] >> (shift - 1)) & 0x1));
+    }
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+for (int i = 16; i < 32; i++) {
+  if (i < 24) {
+    u8 shift = (b.half[i - 8] & 15);
+    if (shift == 0) {
+      dst.byte[i] = (s8)(s16)a.half[i - 8];
+    } else {
+      dst.byte[i] = (s8)(((s16)a.half[i - 8] >> shift) +
+                         (((s16)a.half[i - 8] >> (shift - 1)) & 0x1));
+    }
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
+

__m256i __lasx_xvsrarn_h_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsrarn_h_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrarn.h.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

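Arithmetic right shift (with rounding) the signed 32-bit elements in a by the corresponding elements in b (only the low 5 bits of each shift amount are used), truncate the results to 16 bits, and store them in the lower half of each 128-bit lane of dst. The upper half of each 128-bit lane of dst is set to zero.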

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    u8 shift = (b.word[i] & 31);
+    if (shift == 0) {
+      dst.half[i] = (s16)(s32)a.word[i];
+    } else {
+      dst.half[i] = (s16)(((s32)a.word[i] >> shift) +
+                          (((s32)a.word[i] >> (shift - 1)) & 0x1));
+    }
+  } else {
+    dst.half[i] = 0;
+  }
+}
+for (int i = 8; i < 16; i++) {
+  if (i < 12) {
+    u8 shift = (b.word[i - 4] & 31);
+    if (shift == 0) {
+      dst.half[i] = (s16)(s32)a.word[i - 4];
+    } else {
+      dst.half[i] = (s16)(((s32)a.word[i - 4] >> shift) +
+                          (((s32)a.word[i - 4] >> (shift - 1)) & 0x1));
+    }
+  } else {
+    dst.half[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
+

__m256i __lasx_xvsrarn_w_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsrarn_w_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrarn.w.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

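Arithmetic right shift (with rounding) the signed 64-bit elements in a by the corresponding elements in b (only the low 6 bits of each shift amount are used), truncate the results to 32 bits, and store them in the lower half of each 128-bit lane of dst. The upper half of each 128-bit lane of dst is set to zero.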

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    u8 shift = (b.dword[i] & 63);
+    if (shift == 0) {
+      dst.word[i] = (s32)(s64)a.dword[i];
+    } else {
+      dst.word[i] = (s32)(((s64)a.dword[i] >> shift) +
+                          (((s64)a.dword[i] >> (shift - 1)) & 0x1));
+    }
+  } else {
+    dst.word[i] = 0;
+  }
+}
+for (int i = 4; i < 8; i++) {
+  if (i < 6) {
+    u8 shift = (b.dword[i - 2] & 63);
+    if (shift == 0) {
+      dst.word[i] = (s32)(s64)a.dword[i - 2];
+    } else {
+      dst.word[i] = (s32)(((s64)a.dword[i - 2] >> shift) +
+                          (((s64)a.dword[i - 2] >> (shift - 1)) & 0x1));
+    }
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
+

__m256i __lasx_xvsrarni_b_h (__m256i a, __m256i b, imm0_15 imm)

+

Synopsis

+
__m256i __lasx_xvsrarni_b_h (__m256i a, __m256i b, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvsrarni.b.h xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

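Arithmetic right shift (with rounding) the signed 16-bit elements in a and b by imm, truncate the results to 8 bits, and store them in dst. Within each 128-bit lane of dst, the results from b fill the lower half and the results from a fill the upper half.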

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    if (imm == 0) {
+      dst.byte[i] = (s8)(s16)b.half[i];
+    } else {
+      dst.byte[i] =
+          (s8)(((s16)b.half[i] >> imm) + (((s16)b.half[i] >> (imm - 1)) & 0x1));
+    }
+  } else {
+    if (imm == 0) {
+      dst.byte[i] = (s8)(s16)a.half[i - 8];
+    } else {
+      dst.byte[i] = (s8)(((s16)a.half[i - 8] >> imm) +
+                         (((s16)a.half[i - 8] >> (imm - 1)) & 0x1));
+    }
+  }
+}
+for (int i = 16; i < 32; i++) {
+  if (i < 24) {
+    if (imm == 0) {
+      dst.byte[i] = (s8)(s16)b.half[i - 8];
+    } else {
+      dst.byte[i] = (s8)(((s16)b.half[i - 8] >> imm) +
+                         (((s16)b.half[i - 8] >> (imm - 1)) & 0x1));
+    }
+  } else {
+    if (imm == 0) {
+      dst.byte[i] = (s8)(s16)a.half[i - 16];
+    } else {
+      dst.byte[i] = (s8)(((s16)a.half[i - 16] >> imm) +
+                         (((s16)a.half[i - 16] >> (imm - 1)) & 0x1));
+    }
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
+

__m256i __lasx_xvsrarni_h_w (__m256i a, __m256i b, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvsrarni_h_w (__m256i a, __m256i b, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvsrarni.h.w xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

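Arithmetic right shift (with rounding) the signed 32-bit elements in a and b by imm, truncate the results to 16 bits, and store them in dst. Within each 128-bit lane of dst, the results from b fill the lower half and the results from a fill the upper half.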

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    if (imm == 0) {
+      dst.half[i] = (s16)(s32)b.word[i];
+    } else {
+      dst.half[i] = (s16)(((s32)b.word[i] >> imm) +
+                          (((s32)b.word[i] >> (imm - 1)) & 0x1));
+    }
+  } else {
+    if (imm == 0) {
+      dst.half[i] = (s16)(s32)a.word[i - 4];
+    } else {
+      dst.half[i] = (s16)(((s32)a.word[i - 4] >> imm) +
+                          (((s32)a.word[i - 4] >> (imm - 1)) & 0x1));
+    }
+  }
+}
+for (int i = 8; i < 16; i++) {
+  if (i < 12) {
+    if (imm == 0) {
+      dst.half[i] = (s16)(s32)b.word[i - 4];
+    } else {
+      dst.half[i] = (s16)(((s32)b.word[i - 4] >> imm) +
+                          (((s32)b.word[i - 4] >> (imm - 1)) & 0x1));
+    }
+  } else {
+    if (imm == 0) {
+      dst.half[i] = (s16)(s32)a.word[i - 8];
+    } else {
+      dst.half[i] = (s16)(((s32)a.word[i - 8] >> imm) +
+                          (((s32)a.word[i - 8] >> (imm - 1)) & 0x1));
+    }
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
+

__m256i __lasx_xvsrarni_w_d (__m256i a, __m256i b, imm0_63 imm)

+

Synopsis

+
__m256i __lasx_xvsrarni_w_d (__m256i a, __m256i b, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvsrarni.w.d xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

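Arithmetic right shift (with rounding) the signed 64-bit elements in a and b by imm, truncate the results to 32 bits, and store them in dst. Within each 128-bit lane of dst, the results from b fill the lower half and the results from a fill the upper half.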

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    if (imm == 0) {
+      dst.word[i] = (s32)(s64)b.dword[i];
+    } else {
+      dst.word[i] = (s32)(((s64)b.dword[i] >> imm) +
+                          (((s64)b.dword[i] >> (imm - 1)) & 0x1));
+    }
+  } else {
+    if (imm == 0) {
+      dst.word[i] = (s32)(s64)a.dword[i - 2];
+    } else {
+      dst.word[i] = (s32)(((s64)a.dword[i - 2] >> imm) +
+                          (((s64)a.dword[i - 2] >> (imm - 1)) & 0x1));
+    }
+  }
+}
+for (int i = 4; i < 8; i++) {
+  if (i < 6) {
+    if (imm == 0) {
+      dst.word[i] = (s32)(s64)b.dword[i - 2];
+    } else {
+      dst.word[i] = (s32)(((s64)b.dword[i - 2] >> imm) +
+                          (((s64)b.dword[i - 2] >> (imm - 1)) & 0x1));
+    }
+  } else {
+    if (imm == 0) {
+      dst.word[i] = (s32)(s64)a.dword[i - 4];
+    } else {
+      dst.word[i] = (s32)(((s64)a.dword[i - 4] >> imm) +
+                          (((s64)a.dword[i - 4] >> (imm - 1)) & 0x1));
+    }
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
+

__m256i __lasx_xvsrarni_d_q (__m256i a, __m256i b, imm0_127 imm)

+

Synopsis

+
__m256i __lasx_xvsrarni_d_q (__m256i a, __m256i b, imm0_127 imm)
+#include <lasxintrin.h>
+Instruction: xvsrarni.d.q xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

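Arithmetic right shift (with rounding) the signed 128-bit elements in a and b by imm, truncate the results to 64 bits, and store them in dst. Within each 128-bit lane of dst, the result from b fills the lower 64 bits and the result from a fills the upper 64 bits.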

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (i < 1) {
+    if (imm == 0) {
+      dst.dword[i] = (s64)(s128)b.qword[i];
+    } else {
+      dst.dword[i] = (s64)(((s128)b.qword[i] >> imm) +
+                           (((s128)b.qword[i] >> (imm - 1)) & 0x1));
+    }
+  } else {
+    if (imm == 0) {
+      dst.dword[i] = (s64)(s128)a.qword[i - 1];
+    } else {
+      dst.dword[i] = (s64)(((s128)a.qword[i - 1] >> imm) +
+                           (((s128)a.qword[i - 1] >> (imm - 1)) & 0x1));
+    }
+  }
+}
+for (int i = 2; i < 4; i++) {
+  if (i < 3) {
+    if (imm == 0) {
+      dst.dword[i] = (s64)(s128)b.qword[i - 1];
+    } else {
+      dst.dword[i] = (s64)(((s128)b.qword[i - 1] >> imm) +
+                           (((s128)b.qword[i - 1] >> (imm - 1)) & 0x1));
+    }
+  } else {
+    if (imm == 0) {
+      dst.dword[i] = (s64)(s128)a.qword[i - 2];
+    } else {
+      dst.dword[i] = (s64)(((s128)a.qword[i - 2] >> imm) +
+                           (((s128)a.qword[i - 2] >> (imm - 1)) & 0x1));
+    }
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 3 | 2 |
| 3C5000 | 3 | 2 |
+

__m256i __lasx_xvsrl_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsrl_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrl.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Logical right shift the unsigned 8-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = a.byte[i] >> (b.byte[i] & 0x7);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 1 | 4 |
| 3C5000 | 1 | 2 |
+

__m256i __lasx_xvsrl_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsrl_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrl.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Logical right shift the unsigned 16-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = a.half[i] >> (b.half[i] & 0xf);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 1 | 4 |
| 3C5000 | 1 | 2 |
+

__m256i __lasx_xvsrl_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsrl_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrl.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Logical right shift the unsigned 32-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = a.word[i] >> (b.word[i] & 0x1f);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 1 | 4 |
| 3C5000 | 1 | 2 |
+

__m256i __lasx_xvsrl_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsrl_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrl.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Logical right shift the unsigned 64-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = a.dword[i] >> (b.dword[i] & 0x3f);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 1 | 4 |
| 3C5000 | 1 | 2 |
+

__m256i __lasx_xvsrli_b (__m256i a, imm0_7 imm)

+

Synopsis

+
__m256i __lasx_xvsrli_b (__m256i a, imm0_7 imm)
+#include <lasxintrin.h>
+Instruction: xvsrli.b xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Logical right shift the unsigned 8-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = a.byte[i] >> imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 1 | 4 |
| 3C5000 | 1 | 2 |
+

__m256i __lasx_xvsrli_h (__m256i a, imm0_15 imm)

+

Synopsis

+
__m256i __lasx_xvsrli_h (__m256i a, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvsrli.h xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Logical right shift the unsigned 16-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = a.half[i] >> imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 1 | 4 |
| 3C5000 | 1 | 2 |
+

__m256i __lasx_xvsrli_w (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvsrli_w (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvsrli.w xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Logical right shift the unsigned 32-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = a.word[i] >> imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 1 | 4 |
| 3C5000 | 1 | 2 |
+

__m256i __lasx_xvsrli_d (__m256i a, imm0_63 imm)

+

Synopsis

+
__m256i __lasx_xvsrli_d (__m256i a, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvsrli.d xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Logical right shift the unsigned 64-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = a.dword[i] >> imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 1 | 4 |
| 3C5000 | 1 | 2 |
+

__m256i __lasx_xvsrln_b_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsrln_b_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrln.b.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

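Logical right shift the unsigned 16-bit elements in a by the corresponding elements in b (only the low 4 bits of each shift amount are used), truncate the results to 8 bits, and store them in the lower half of each 128-bit lane of dst. The upper half of each 128-bit lane of dst is set to zero.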

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = (i < 8) ? (u8)((u16)a.half[i] >> (b.half[i] & 15)) : 0;
+}
+for (int i = 16; i < 32; i++) {
+  dst.byte[i] = (i < 24) ? (u8)((u16)a.half[i - 8] >> (b.half[i - 8] & 15)) : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 2 | 2 |
| 3C5000 | 2 | 1 |
+

__m256i __lasx_xvsrln_h_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsrln_h_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrln.h.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

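Logical right shift the unsigned 32-bit elements in a by the corresponding elements in b (only the low 5 bits of each shift amount are used), truncate the results to 16 bits, and store them in the lower half of each 128-bit lane of dst. The upper half of each 128-bit lane of dst is set to zero.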

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (i < 4) ? (u16)((u32)a.word[i] >> (b.word[i] & 31)) : 0;
+}
+for (int i = 8; i < 16; i++) {
+  dst.half[i] =
+      (i < 12) ? (u16)((u32)a.word[i - 4] >> (b.word[i - 4] & 31)) : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 2 | 2 |
| 3C5000 | 2 | 1 |
+

__m256i __lasx_xvsrln_w_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsrln_w_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrln.w.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

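Logical right shift the unsigned 64-bit elements in a by the corresponding elements in b (only the low 6 bits of each shift amount are used), truncate the results to 32 bits, and store them in the lower half of each 128-bit lane of dst. The upper half of each 128-bit lane of dst is set to zero.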

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (i < 2) ? (u32)((u64)a.dword[i] >> (b.dword[i] & 63)) : 0;
+}
+for (int i = 4; i < 8; i++) {
+  dst.word[i] =
+      (i < 6) ? (u32)((u64)a.dword[i - 2] >> (b.dword[i - 2] & 63)) : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 2 | 2 |
| 3C5000 | 2 | 1 |
+

__m256i __lasx_xvsrlni_b_h (__m256i a, __m256i b, imm0_15 imm)

+

Synopsis

+
__m256i __lasx_xvsrlni_b_h (__m256i a, __m256i b, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvsrlni.b.h xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

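Logical right shift the unsigned 16-bit elements in a and b by imm, truncate the results to 8 bits, and store them in dst. Within each 128-bit lane of dst, the results from b fill the lower half and the results from a fill the upper half.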

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] =
+      (i < 8) ? (u8)((u16)b.half[i] >> imm) : (u8)((u16)a.half[i - 8] >> imm);
+}
+for (int i = 16; i < 32; i++) {
+  dst.byte[i] = (i < 24) ? (u8)((u16)b.half[i - 8] >> imm)
+                         : (u8)((u16)a.half[i - 16] >> imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
+

__m256i __lasx_xvsrlni_h_w (__m256i a, __m256i b, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvsrlni_h_w (__m256i a, __m256i b, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvsrlni.h.w xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

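Logical right shift the unsigned 32-bit elements in a and b by imm, truncate the results to 16 bits, and store them in dst. Within each 128-bit lane of dst, the results from b fill the lower half and the results from a fill the upper half.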

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] =
+      (i < 4) ? (u16)((u32)b.word[i] >> imm) : (u16)((u32)a.word[i - 4] >> imm);
+}
+for (int i = 8; i < 16; i++) {
+  dst.half[i] = (i < 12) ? (u16)((u32)b.word[i - 4] >> imm)
+                         : (u16)((u32)a.word[i - 8] >> imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
+

__m256i __lasx_xvsrlni_w_d (__m256i a, __m256i b, imm0_63 imm)

+

Synopsis

+
__m256i __lasx_xvsrlni_w_d (__m256i a, __m256i b, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvsrlni.w.d xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

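Logical right shift the unsigned 64-bit elements in a and b by imm, truncate the results to 32 bits, and store them in dst. Within each 128-bit lane of dst, the results from b fill the lower half and the results from a fill the upper half.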

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (i < 2) ? (u32)((u64)b.dword[i] >> imm)
+                        : (u32)((u64)a.dword[i - 2] >> imm);
+}
+for (int i = 4; i < 8; i++) {
+  dst.word[i] = (i < 6) ? (u32)((u64)b.dword[i - 2] >> imm)
+                        : (u32)((u64)a.dword[i - 4] >> imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
+

__m256i __lasx_xvsrlni_d_q (__m256i a, __m256i b, imm0_127 imm)

+

Synopsis

+
__m256i __lasx_xvsrlni_d_q (__m256i a, __m256i b, imm0_127 imm)
+#include <lasxintrin.h>
+Instruction: xvsrlni.d.q xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

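Logical right shift the unsigned 128-bit elements in a and b by imm, truncate the results to 64 bits, and store them in dst. Within each 128-bit lane of dst, the result from b fills the lower 64 bits and the result from a fills the upper 64 bits.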

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (i < 1) ? (u64)((u128)b.qword[i] >> imm)
+                         : (u64)((u128)a.qword[i - 1] >> imm);
+}
+for (int i = 2; i < 4; i++) {
+  dst.dword[i] = (i < 3) ? (u64)((u128)b.qword[i - 1] >> imm)
+                         : (u64)((u128)a.qword[i - 2] >> imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 3 | 2 |
| 3C5000 | 3 | 2 |
+

__m256i __lasx_xvsrlr_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsrlr_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrlr.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Logical right shift (with rounding) the unsigned 8-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  if ((b.byte[i] & 0x7) == 0) {
+    dst.byte[i] = a.byte[i];
+  } else {
+    dst.byte[i] = (a.byte[i] >> (b.byte[i] & 0x7)) +
+                  ((a.byte[i] >> ((b.byte[i] & 0x7) - 1)) & 0x1);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 3 | 2 |
| 3C5000 | 3 | 2 |
+

__m256i __lasx_xvsrlr_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsrlr_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrlr.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Logical right shift (with rounding) the unsigned 16-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if ((b.half[i] & 0xf) == 0) {
+    dst.half[i] = a.half[i];
+  } else {
+    dst.half[i] = (a.half[i] >> (b.half[i] & 0xf)) +
+                  ((a.half[i] >> ((b.half[i] & 0xf) - 1)) & 0x1);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 3 | 2 |
| 3C5000 | 3 | 2 |
+

__m256i __lasx_xvsrlr_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsrlr_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrlr.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Logical right shift (with rounding) the unsigned 32-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if ((b.word[i] & 0x1f) == 0) {
+    dst.word[i] = a.word[i];
+  } else {
+    dst.word[i] = (a.word[i] >> (b.word[i] & 0x1f)) +
+                  ((a.word[i] >> ((b.word[i] & 0x1f) - 1)) & 0x1);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 3 | 2 |
| 3C5000 | 3 | 2 |
+

__m256i __lasx_xvsrlr_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsrlr_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrlr.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Logical right shift (with rounding) the unsigned 64-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if ((b.dword[i] & 0x3f) == 0) {
+    dst.dword[i] = a.dword[i];
+  } else {
+    dst.dword[i] = (a.dword[i] >> (b.dword[i] & 0x3f)) +
+                   ((a.dword[i] >> ((b.dword[i] & 0x3f) - 1)) & 0x1);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 3 | 2 |
| 3C5000 | 3 | 2 |
+

__m256i __lasx_xvsrlri_b (__m256i a, imm0_7 imm)

+

Synopsis

+
__m256i __lasx_xvsrlri_b (__m256i a, imm0_7 imm)
+#include <lasxintrin.h>
+Instruction: xvsrlri.b xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Logical right shift (with rounding) the unsigned 8-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  if (imm == 0) {
+    dst.byte[i] = a.byte[i];
+  } else {
+    dst.byte[i] = (a.byte[i] >> imm) + ((a.byte[i] >> (imm - 1)) & 0x1);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 3 | 2 |
| 3C5000 | 3 | 2 |
+

__m256i __lasx_xvsrlri_h (__m256i a, imm0_15 imm)

+

Synopsis

+
__m256i __lasx_xvsrlri_h (__m256i a, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvsrlri.h xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Logical right shift (with rounding) the unsigned 16-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (imm == 0) {
+    dst.half[i] = a.half[i];
+  } else {
+    dst.half[i] = (a.half[i] >> imm) + ((a.half[i] >> (imm - 1)) & 0x1);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 3 | 2 |
| 3C5000 | 3 | 2 |
+

__m256i __lasx_xvsrlri_w (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvsrlri_w (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvsrlri.w xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Logical right shift (with rounding) the unsigned 32-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (imm == 0) {
+    dst.word[i] = a.word[i];
+  } else {
+    dst.word[i] = (a.word[i] >> imm) + ((a.word[i] >> (imm - 1)) & 0x1);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 3 | 2 |
| 3C5000 | 3 | 2 |
+

__m256i __lasx_xvsrlri_d (__m256i a, imm0_63 imm)

+

Synopsis

+
__m256i __lasx_xvsrlri_d (__m256i a, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvsrlri.d xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Logical right shift (with rounding) the unsigned 64-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (imm == 0) {
+    dst.dword[i] = a.dword[i];
+  } else {
+    dst.dword[i] = (a.dword[i] >> imm) + ((a.dword[i] >> (imm - 1)) & 0x1);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 3 | 2 |
| 3C5000 | 3 | 2 |
+

__m256i __lasx_xvsrlrn_b_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsrlrn_b_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrlrn.b.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

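Logical right shift (with rounding) the unsigned 16-bit elements in a by the corresponding elements in b (only the low 4 bits of each shift amount are used), truncate the results to 8 bits, and store them in the lower half of each 128-bit lane of dst. The upper half of each 128-bit lane of dst is set to zero.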

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    u8 shift = (b.half[i] & 15);
+    if (shift == 0) {
+      dst.byte[i] = (u8)(u16)a.half[i];
+    } else {
+      dst.byte[i] = (u8)(((u16)a.half[i] >> shift) +
+                         (((u16)a.half[i] >> (shift - 1)) & 0x1));
+    }
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+for (int i = 16; i < 32; i++) {
+  if (i < 24) {
+    u8 shift = (b.half[i - 8] & 15);
+    if (shift == 0) {
+      dst.byte[i] = (u8)(u16)a.half[i - 8];
+    } else {
+      dst.byte[i] = (u8)(((u16)a.half[i - 8] >> shift) +
+                         (((u16)a.half[i - 8] >> (shift - 1)) & 0x1));
+    }
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
+

__m256i __lasx_xvsrlrn_h_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsrlrn_h_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrlrn.h.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

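Logical right shift (with rounding) the unsigned 32-bit elements in a by the corresponding elements in b (only the low 5 bits of each shift amount are used), truncate the results to 16 bits, and store them in the lower half of each 128-bit lane of dst. The upper half of each 128-bit lane of dst is set to zero.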

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    u8 shift = (b.word[i] & 31);
+    if (shift == 0) {
+      dst.half[i] = (u16)(u32)a.word[i];
+    } else {
+      dst.half[i] = (u16)(((u32)a.word[i] >> shift) +
+                          (((u32)a.word[i] >> (shift - 1)) & 0x1));
+    }
+  } else {
+    dst.half[i] = 0;
+  }
+}
+for (int i = 8; i < 16; i++) {
+  if (i < 12) {
+    u8 shift = (b.word[i - 4] & 31);
+    if (shift == 0) {
+      dst.half[i] = (u16)(u32)a.word[i - 4];
+    } else {
+      dst.half[i] = (u16)(((u32)a.word[i - 4] >> shift) +
+                          (((u32)a.word[i - 4] >> (shift - 1)) & 0x1));
+    }
+  } else {
+    dst.half[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
+

__m256i __lasx_xvsrlrn_w_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsrlrn_w_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrlrn.w.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

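Logical right shift (with rounding) the unsigned 64-bit elements in a by the corresponding elements in b (only the low 6 bits of each shift amount are used), truncate the results to 32 bits, and store them in the lower half of each 128-bit lane of dst. The upper half of each 128-bit lane of dst is set to zero.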

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    u8 shift = (b.dword[i] & 63);
+    if (shift == 0) {
+      dst.word[i] = (u32)(u64)a.dword[i];
+    } else {
+      dst.word[i] = (u32)(((u64)a.dword[i] >> shift) +
+                          (((u64)a.dword[i] >> (shift - 1)) & 0x1));
+    }
+  } else {
+    dst.word[i] = 0;
+  }
+}
+for (int i = 4; i < 8; i++) {
+  if (i < 6) {
+    u8 shift = (b.dword[i - 2] & 63);
+    if (shift == 0) {
+      dst.word[i] = (u32)(u64)a.dword[i - 2];
+    } else {
+      dst.word[i] = (u32)(((u64)a.dword[i - 2] >> shift) +
+                          (((u64)a.dword[i - 2] >> (shift - 1)) & 0x1));
+    }
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
+

__m256i __lasx_xvsrlrni_b_h (__m256i a, __m256i b, imm0_15 imm)

+

Synopsis

+
__m256i __lasx_xvsrlrni_b_h (__m256i a, __m256i b, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvsrlrni.b.h xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

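Logical right shift (with rounding) the unsigned 16-bit elements in a and b by imm, truncate the results to 8 bits, and store them in dst. Within each 128-bit lane of dst, the results from b fill the lower half and the results from a fill the upper half.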

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    if (imm == 0) {
+      dst.byte[i] = (u8)(u16)b.half[i];
+    } else {
+      dst.byte[i] =
+          (u8)(((u16)b.half[i] >> imm) + (((u16)b.half[i] >> (imm - 1)) & 0x1));
+    }
+  } else {
+    if (imm == 0) {
+      dst.byte[i] = (u8)(u16)a.half[i - 8];
+    } else {
+      dst.byte[i] = (u8)(((u16)a.half[i - 8] >> imm) +
+                         (((u16)a.half[i - 8] >> (imm - 1)) & 0x1));
+    }
+  }
+}
+for (int i = 16; i < 32; i++) {
+  if (i < 24) {
+    if (imm == 0) {
+      dst.byte[i] = (u8)(u16)b.half[i - 8];
+    } else {
+      dst.byte[i] = (u8)(((u16)b.half[i - 8] >> imm) +
+                         (((u16)b.half[i - 8] >> (imm - 1)) & 0x1));
+    }
+  } else {
+    if (imm == 0) {
+      dst.byte[i] = (u8)(u16)a.half[i - 16];
+    } else {
+      dst.byte[i] = (u8)(((u16)a.half[i - 16] >> imm) +
+                         (((u16)a.half[i - 16] >> (imm - 1)) & 0x1));
+    }
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
+

__m256i __lasx_xvsrlrni_h_w (__m256i a, __m256i b, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvsrlrni_h_w (__m256i a, __m256i b, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvsrlrni.h.w xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

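Logical right shift (with rounding) the unsigned 32-bit elements in a and b by imm, truncate the results to 16 bits, and store them in dst. Within each 128-bit lane of dst, the results from b fill the lower half and the results from a fill the upper half.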

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    if (imm == 0) {
+      dst.half[i] = (u16)(u32)b.word[i];
+    } else {
+      dst.half[i] = (u16)(((u32)b.word[i] >> imm) +
+                          (((u32)b.word[i] >> (imm - 1)) & 0x1));
+    }
+  } else {
+    if (imm == 0) {
+      dst.half[i] = (u16)(u32)a.word[i - 4];
+    } else {
+      dst.half[i] = (u16)(((u32)a.word[i - 4] >> imm) +
+                          (((u32)a.word[i - 4] >> (imm - 1)) & 0x1));
+    }
+  }
+}
+for (int i = 8; i < 16; i++) {
+  if (i < 12) {
+    if (imm == 0) {
+      dst.half[i] = (u16)(u32)b.word[i - 4];
+    } else {
+      dst.half[i] = (u16)(((u32)b.word[i - 4] >> imm) +
+                          (((u32)b.word[i - 4] >> (imm - 1)) & 0x1));
+    }
+  } else {
+    if (imm == 0) {
+      dst.half[i] = (u16)(u32)a.word[i - 8];
+    } else {
+      dst.half[i] = (u16)(((u32)a.word[i - 8] >> imm) +
+                          (((u32)a.word[i - 8] >> (imm - 1)) & 0x1));
+    }
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
+

__m256i __lasx_xvsrlrni_w_d (__m256i a, __m256i b, imm0_63 imm)

+

Synopsis

+
__m256i __lasx_xvsrlrni_w_d (__m256i a, __m256i b, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvsrlrni.w.d xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

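Logical right shift (with rounding) the unsigned 64-bit elements in a and b by imm, truncate the results to 32 bits, and store them in dst. Within each 128-bit lane of dst, the results from b fill the lower half and the results from a fill the upper half.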

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    if (imm == 0) {
+      dst.word[i] = (u32)(u64)b.dword[i];
+    } else {
+      dst.word[i] = (u32)(((u64)b.dword[i] >> imm) +
+                          (((u64)b.dword[i] >> (imm - 1)) & 0x1));
+    }
+  } else {
+    if (imm == 0) {
+      dst.word[i] = (u32)(u64)a.dword[i - 2];
+    } else {
+      dst.word[i] = (u32)(((u64)a.dword[i - 2] >> imm) +
+                          (((u64)a.dword[i - 2] >> (imm - 1)) & 0x1));
+    }
+  }
+}
+for (int i = 4; i < 8; i++) {
+  if (i < 6) {
+    if (imm == 0) {
+      dst.word[i] = (u32)(u64)b.dword[i - 2];
+    } else {
+      dst.word[i] = (u32)(((u64)b.dword[i - 2] >> imm) +
+                          (((u64)b.dword[i - 2] >> (imm - 1)) & 0x1));
+    }
+  } else {
+    if (imm == 0) {
+      dst.word[i] = (u32)(u64)a.dword[i - 4];
+    } else {
+      dst.word[i] = (u32)(((u64)a.dword[i - 4] >> imm) +
+                          (((u64)a.dword[i - 4] >> (imm - 1)) & 0x1));
+    }
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m256i __lasx_xvsrlrni_d_q (__m256i a, __m256i b, imm0_127 imm)

+

Synopsis

+
__m256i __lasx_xvsrlrni_d_q (__m256i a, __m256i b, imm0_127 imm)
+#include <lasxintrin.h>
+Instruction: xvsrlrni.d.q xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

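Logically right-shift (with rounding) the unsigned 128-bit elements in a and b by imm, truncate each result to 64 bits, and store it in dst. Within each 128-bit lane, the result from b fills the lower half and the result from a fills the upper half.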

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (i < 1) {
+    if (imm == 0) {
+      dst.dword[i] = (u64)(u128)b.qword[i];
+    } else {
+      dst.dword[i] = (u64)(((u128)b.qword[i] >> imm) +
+                           (((u128)b.qword[i] >> (imm - 1)) & 0x1));
+    }
+  } else {
+    if (imm == 0) {
+      dst.dword[i] = (u64)(u128)a.qword[i - 1];
+    } else {
+      dst.dword[i] = (u64)(((u128)a.qword[i - 1] >> imm) +
+                           (((u128)a.qword[i - 1] >> (imm - 1)) & 0x1));
+    }
+  }
+}
+for (int i = 2; i < 4; i++) {
+  if (i < 3) {
+    if (imm == 0) {
+      dst.dword[i] = (u64)(u128)b.qword[i - 1];
+    } else {
+      dst.dword[i] = (u64)(((u128)b.qword[i - 1] >> imm) +
+                           (((u128)b.qword[i - 1] >> (imm - 1)) & 0x1));
+    }
+  } else {
+    if (imm == 0) {
+      dst.dword[i] = (u64)(u128)a.qword[i - 2];
+    } else {
+      dst.dword[i] = (u64)(((u128)a.qword[i - 2] >> imm) +
+                           (((u128)a.qword[i - 2] >> (imm - 1)) & 0x1));
+    }
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 3 | 2
3C5000 | 3 | 2
+

__m256i __lasx_xvssran_b_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvssran_b_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssran.b.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

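Arithmetically right-shift the signed 16-bit elements in a by the corresponding elements in b (shift amount taken modulo 16), clamp each result to the signed 8-bit range, and store it in the lower half of each 128-bit lane of dst. The upper half of each lane is set to zero.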

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    s16 temp = (s16)a.half[i] >> (b.half[i] & 15);
+    dst.byte[i] = clamp<s16>(temp, -128, 127);
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+for (int i = 16; i < 32; i++) {
+  if (i < 24) {
+    s16 temp = (s16)a.half[i - 8] >> (b.half[i - 8] & 15);
+    dst.byte[i] = clamp<s16>(temp, -128, 127);
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m256i __lasx_xvssran_bu_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvssran_bu_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssran.bu.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

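Arithmetically right-shift the signed 16-bit elements in a by the corresponding elements in b (shift amount taken modulo 16), clamp each result to the unsigned 8-bit range, and store it in the lower half of each 128-bit lane of dst. The upper half of each lane is set to zero.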

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    s16 temp = (s16)a.half[i] >> (b.half[i] & 15);
+    dst.byte[i] = clamp<s16>(temp, 0, 255);
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+for (int i = 16; i < 32; i++) {
+  if (i < 24) {
+    s16 temp = (s16)a.half[i - 8] >> (b.half[i - 8] & 15);
+    dst.byte[i] = clamp<s16>(temp, 0, 255);
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m256i __lasx_xvssran_h_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvssran_h_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssran.h.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

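Arithmetically right-shift the signed 32-bit elements in a by the corresponding elements in b (shift amount taken modulo 32), clamp each result to the signed 16-bit range, and store it in the lower half of each 128-bit lane of dst. The upper half of each lane is set to zero.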

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    s32 temp = (s32)a.word[i] >> (b.word[i] & 31);
+    dst.half[i] = clamp<s32>(temp, -32768, 32767);
+  } else {
+    dst.half[i] = 0;
+  }
+}
+for (int i = 8; i < 16; i++) {
+  if (i < 12) {
+    s32 temp = (s32)a.word[i - 4] >> (b.word[i - 4] & 31);
+    dst.half[i] = clamp<s32>(temp, -32768, 32767);
+  } else {
+    dst.half[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m256i __lasx_xvssran_hu_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvssran_hu_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssran.hu.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

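Arithmetically right-shift the signed 32-bit elements in a by the corresponding elements in b (shift amount taken modulo 32), clamp each result to the unsigned 16-bit range, and store it in the lower half of each 128-bit lane of dst. The upper half of each lane is set to zero.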

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    s32 temp = (s32)a.word[i] >> (b.word[i] & 31);
+    dst.half[i] = clamp<s32>(temp, 0, 65535);
+  } else {
+    dst.half[i] = 0;
+  }
+}
+for (int i = 8; i < 16; i++) {
+  if (i < 12) {
+    s32 temp = (s32)a.word[i - 4] >> (b.word[i - 4] & 31);
+    dst.half[i] = clamp<s32>(temp, 0, 65535);
+  } else {
+    dst.half[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m256i __lasx_xvssran_w_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvssran_w_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssran.w.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

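Arithmetically right-shift the signed 64-bit elements in a by the corresponding elements in b (shift amount taken modulo 64), clamp each result to the signed 32-bit range, and store it in the lower half of each 128-bit lane of dst. The upper half of each lane is set to zero.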

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    s64 temp = (s64)a.dword[i] >> (b.dword[i] & 63);
+    dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+  } else {
+    dst.word[i] = 0;
+  }
+}
+for (int i = 4; i < 8; i++) {
+  if (i < 6) {
+    s64 temp = (s64)a.dword[i - 2] >> (b.dword[i - 2] & 63);
+    dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m256i __lasx_xvssran_wu_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvssran_wu_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssran.wu.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

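Arithmetically right-shift the signed 64-bit elements in a by the corresponding elements in b (shift amount taken modulo 64), clamp each result to the unsigned 32-bit range, and store it in the lower half of each 128-bit lane of dst. The upper half of each lane is set to zero.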

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    s64 temp = (s64)a.dword[i] >> (b.dword[i] & 63);
+    dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+  } else {
+    dst.word[i] = 0;
+  }
+}
+for (int i = 4; i < 8; i++) {
+  if (i < 6) {
+    s64 temp = (s64)a.dword[i - 2] >> (b.dword[i - 2] & 63);
+    dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m256i __lasx_xvssrani_b_h (__m256i a, __m256i b, imm0_15 imm)

+

Synopsis

+
__m256i __lasx_xvssrani_b_h (__m256i a, __m256i b, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvssrani.b.h xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

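Arithmetically right-shift the signed 16-bit elements in a and b by imm, clamp each result to the signed 8-bit range, and store it in dst. Within each 128-bit lane, results from b fill the lower half and results from a fill the upper half.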

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    s16 temp = (s16)b.half[i] >> imm;
+    dst.byte[i] = clamp<s16>(temp, -128, 127);
+  } else {
+    s16 temp = (s16)a.half[i - 8] >> imm;
+    dst.byte[i] = clamp<s16>(temp, -128, 127);
+  }
+}
+for (int i = 16; i < 32; i++) {
+  if (i < 24) {
+    s16 temp = (s16)b.half[i - 8] >> imm;
+    dst.byte[i] = clamp<s16>(temp, -128, 127);
+  } else {
+    s16 temp = (s16)a.half[i - 16] >> imm;
+    dst.byte[i] = clamp<s16>(temp, -128, 127);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m256i __lasx_xvssrani_bu_h (__m256i a, __m256i b, imm0_15 imm)

+

Synopsis

+
__m256i __lasx_xvssrani_bu_h (__m256i a, __m256i b, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvssrani.bu.h xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

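Arithmetically right-shift the signed 16-bit elements in a and b by imm, clamp each result to the unsigned 8-bit range, and store it in dst. Within each 128-bit lane, results from b fill the lower half and results from a fill the upper half.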

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    s16 temp = (s16)b.half[i] >> imm;
+    dst.byte[i] = clamp<s16>(temp, 0, 255);
+  } else {
+    s16 temp = (s16)a.half[i - 8] >> imm;
+    dst.byte[i] = clamp<s16>(temp, 0, 255);
+  }
+}
+for (int i = 16; i < 32; i++) {
+  if (i < 24) {
+    s16 temp = (s16)b.half[i - 8] >> imm;
+    dst.byte[i] = clamp<s16>(temp, 0, 255);
+  } else {
+    s16 temp = (s16)a.half[i - 16] >> imm;
+    dst.byte[i] = clamp<s16>(temp, 0, 255);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m256i __lasx_xvssrani_h_w (__m256i a, __m256i b, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvssrani_h_w (__m256i a, __m256i b, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvssrani.h.w xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

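Arithmetically right-shift the signed 32-bit elements in a and b by imm, clamp each result to the signed 16-bit range, and store it in dst. Within each 128-bit lane, results from b fill the lower half and results from a fill the upper half.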

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    s32 temp = (s32)b.word[i] >> imm;
+    dst.half[i] = clamp<s32>(temp, -32768, 32767);
+  } else {
+    s32 temp = (s32)a.word[i - 4] >> imm;
+    dst.half[i] = clamp<s32>(temp, -32768, 32767);
+  }
+}
+for (int i = 8; i < 16; i++) {
+  if (i < 12) {
+    s32 temp = (s32)b.word[i - 4] >> imm;
+    dst.half[i] = clamp<s32>(temp, -32768, 32767);
+  } else {
+    s32 temp = (s32)a.word[i - 8] >> imm;
+    dst.half[i] = clamp<s32>(temp, -32768, 32767);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m256i __lasx_xvssrani_hu_w (__m256i a, __m256i b, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvssrani_hu_w (__m256i a, __m256i b, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvssrani.hu.w xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

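Arithmetically right-shift the signed 32-bit elements in a and b by imm, clamp each result to the unsigned 16-bit range, and store it in dst. Within each 128-bit lane, results from b fill the lower half and results from a fill the upper half.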

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    s32 temp = (s32)b.word[i] >> imm;
+    dst.half[i] = clamp<s32>(temp, 0, 65535);
+  } else {
+    s32 temp = (s32)a.word[i - 4] >> imm;
+    dst.half[i] = clamp<s32>(temp, 0, 65535);
+  }
+}
+for (int i = 8; i < 16; i++) {
+  if (i < 12) {
+    s32 temp = (s32)b.word[i - 4] >> imm;
+    dst.half[i] = clamp<s32>(temp, 0, 65535);
+  } else {
+    s32 temp = (s32)a.word[i - 8] >> imm;
+    dst.half[i] = clamp<s32>(temp, 0, 65535);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m256i __lasx_xvssrani_w_d (__m256i a, __m256i b, imm0_63 imm)

+

Synopsis

+
__m256i __lasx_xvssrani_w_d (__m256i a, __m256i b, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvssrani.w.d xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

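Arithmetically right-shift the signed 64-bit elements in a and b by imm, clamp each result to the signed 32-bit range, and store it in dst. Within each 128-bit lane, results from b fill the lower half and results from a fill the upper half.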

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    s64 temp = (s64)b.dword[i] >> imm;
+    dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+  } else {
+    s64 temp = (s64)a.dword[i - 2] >> imm;
+    dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+  }
+}
+for (int i = 4; i < 8; i++) {
+  if (i < 6) {
+    s64 temp = (s64)b.dword[i - 2] >> imm;
+    dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+  } else {
+    s64 temp = (s64)a.dword[i - 4] >> imm;
+    dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m256i __lasx_xvssrani_wu_d (__m256i a, __m256i b, imm0_63 imm)

+

Synopsis

+
__m256i __lasx_xvssrani_wu_d (__m256i a, __m256i b, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvssrani.wu.d xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

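Arithmetically right-shift the signed 64-bit elements in a and b by imm, clamp each result to the unsigned 32-bit range, and store it in dst. Within each 128-bit lane, results from b fill the lower half and results from a fill the upper half.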

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    s64 temp = (s64)b.dword[i] >> imm;
+    dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+  } else {
+    s64 temp = (s64)a.dword[i - 2] >> imm;
+    dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+  }
+}
+for (int i = 4; i < 8; i++) {
+  if (i < 6) {
+    s64 temp = (s64)b.dword[i - 2] >> imm;
+    dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+  } else {
+    s64 temp = (s64)a.dword[i - 4] >> imm;
+    dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m256i __lasx_xvssrani_d_q (__m256i a, __m256i b, imm0_127 imm)

+

Synopsis

+
__m256i __lasx_xvssrani_d_q (__m256i a, __m256i b, imm0_127 imm)
+#include <lasxintrin.h>
+Instruction: xvssrani.d.q xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

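Arithmetically right-shift the signed 128-bit elements in a and b by imm, clamp each result to the signed 64-bit range, and store it in dst. Within each 128-bit lane, the result from b fills the lower half and the result from a fills the upper half.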

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (i < 1) {
+    s128 temp = (s128)b.qword[i] >> imm;
+    dst.dword[i] = clamp<s128>(temp, -9223372036854775808, 9223372036854775807);
+  } else {
+    s128 temp = (s128)a.qword[i - 1] >> imm;
+    dst.dword[i] = clamp<s128>(temp, -9223372036854775808, 9223372036854775807);
+  }
+}
+for (int i = 2; i < 4; i++) {
+  if (i < 3) {
+    s128 temp = (s128)b.qword[i - 1] >> imm;
+    dst.dword[i] = clamp<s128>(temp, -9223372036854775808, 9223372036854775807);
+  } else {
+    s128 temp = (s128)a.qword[i - 2] >> imm;
+    dst.dword[i] = clamp<s128>(temp, -9223372036854775808, 9223372036854775807);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 3 | 2
3C5000 | 3 | 2
+

__m256i __lasx_xvssrani_du_q (__m256i a, __m256i b, imm0_127 imm)

+

Synopsis

+
__m256i __lasx_xvssrani_du_q (__m256i a, __m256i b, imm0_127 imm)
+#include <lasxintrin.h>
+Instruction: xvssrani.du.q xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

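Arithmetically right-shift the signed 128-bit elements in a and b by imm, clamp each result to the unsigned 64-bit range, and store it in dst. Within each 128-bit lane, the result from b fills the lower half and the result from a fills the upper half.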

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (i < 1) {
+    s128 temp = (s128)b.qword[i] >> imm;
+    dst.dword[i] = clamp<s128>(temp, 0, 18446744073709551615);
+  } else {
+    s128 temp = (s128)a.qword[i - 1] >> imm;
+    dst.dword[i] = clamp<s128>(temp, 0, 18446744073709551615);
+  }
+}
+for (int i = 2; i < 4; i++) {
+  if (i < 3) {
+    s128 temp = (s128)b.qword[i - 1] >> imm;
+    dst.dword[i] = clamp<s128>(temp, 0, 18446744073709551615);
+  } else {
+    s128 temp = (s128)a.qword[i - 2] >> imm;
+    dst.dword[i] = clamp<s128>(temp, 0, 18446744073709551615);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 3 | 2
3C5000 | 3 | 2
+

__m256i __lasx_xvssrarn_b_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvssrarn_b_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssrarn.b.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

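Arithmetically right-shift (with rounding) the signed 16-bit elements in a by the corresponding elements in b (shift amount taken modulo 16), clamp each result to the signed 8-bit range, and store it in the lower half of each 128-bit lane of dst. The upper half of each lane is set to zero.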

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    s16 temp;
+    if ((b.half[i] & 15) == 0) {
+      temp = (s16)a.half[i];
+    } else {
+      temp = ((s16)a.half[i] >> (b.half[i] & 15)) +
+             (((s16)a.half[i] >> ((b.half[i] & 15) - 1)) & 1);
+    }
+    dst.byte[i] = clamp<s16>(temp, -128, 127);
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+for (int i = 16; i < 32; i++) {
+  if (i < 24) {
+    s16 temp;
+    if ((b.half[i - 8] & 15) == 0) {
+      temp = (s16)a.half[i - 8];
+    } else {
+      temp = ((s16)a.half[i - 8] >> (b.half[i - 8] & 15)) +
+             (((s16)a.half[i - 8] >> ((b.half[i - 8] & 15) - 1)) & 1);
+    }
+    dst.byte[i] = clamp<s16>(temp, -128, 127);
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m256i __lasx_xvssrarn_bu_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvssrarn_bu_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssrarn.bu.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

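Arithmetically right-shift (with rounding) the signed 16-bit elements in a by the corresponding elements in b (shift amount taken modulo 16), clamp each result to the unsigned 8-bit range, and store it in the lower half of each 128-bit lane of dst. The upper half of each lane is set to zero.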

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    s16 temp;
+    if ((b.half[i] & 15) == 0) {
+      temp = (s16)a.half[i];
+    } else {
+      temp = ((s16)a.half[i] >> (b.half[i] & 15)) +
+             (((s16)a.half[i] >> ((b.half[i] & 15) - 1)) & 1);
+    }
+    dst.byte[i] = clamp<s16>(temp, 0, 255);
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+for (int i = 16; i < 32; i++) {
+  if (i < 24) {
+    s16 temp;
+    if ((b.half[i - 8] & 15) == 0) {
+      temp = (s16)a.half[i - 8];
+    } else {
+      temp = ((s16)a.half[i - 8] >> (b.half[i - 8] & 15)) +
+             (((s16)a.half[i - 8] >> ((b.half[i - 8] & 15) - 1)) & 1);
+    }
+    dst.byte[i] = clamp<s16>(temp, 0, 255);
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m256i __lasx_xvssrarn_h_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvssrarn_h_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssrarn.h.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

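Arithmetically right-shift (with rounding) the signed 32-bit elements in a by the corresponding elements in b (shift amount taken modulo 32), clamp each result to the signed 16-bit range, and store it in the lower half of each 128-bit lane of dst. The upper half of each lane is set to zero.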

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    s32 temp;
+    if ((b.word[i] & 31) == 0) {
+      temp = (s32)a.word[i];
+    } else {
+      temp = ((s32)a.word[i] >> (b.word[i] & 31)) +
+             (((s32)a.word[i] >> ((b.word[i] & 31) - 1)) & 1);
+    }
+    dst.half[i] = clamp<s32>(temp, -32768, 32767);
+  } else {
+    dst.half[i] = 0;
+  }
+}
+for (int i = 8; i < 16; i++) {
+  if (i < 12) {
+    s32 temp;
+    if ((b.word[i - 4] & 31) == 0) {
+      temp = (s32)a.word[i - 4];
+    } else {
+      temp = ((s32)a.word[i - 4] >> (b.word[i - 4] & 31)) +
+             (((s32)a.word[i - 4] >> ((b.word[i - 4] & 31) - 1)) & 1);
+    }
+    dst.half[i] = clamp<s32>(temp, -32768, 32767);
+  } else {
+    dst.half[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m256i __lasx_xvssrarn_hu_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvssrarn_hu_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssrarn.hu.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

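Arithmetically right-shift (with rounding) the signed 32-bit elements in a by the corresponding elements in b (shift amount taken modulo 32), clamp each result to the unsigned 16-bit range, and store it in the lower half of each 128-bit lane of dst. The upper half of each lane is set to zero.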

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    s32 temp;
+    if ((b.word[i] & 31) == 0) {
+      temp = (s32)a.word[i];
+    } else {
+      temp = ((s32)a.word[i] >> (b.word[i] & 31)) +
+             (((s32)a.word[i] >> ((b.word[i] & 31) - 1)) & 1);
+    }
+    dst.half[i] = clamp<s32>(temp, 0, 65535);
+  } else {
+    dst.half[i] = 0;
+  }
+}
+for (int i = 8; i < 16; i++) {
+  if (i < 12) {
+    s32 temp;
+    if ((b.word[i - 4] & 31) == 0) {
+      temp = (s32)a.word[i - 4];
+    } else {
+      temp = ((s32)a.word[i - 4] >> (b.word[i - 4] & 31)) +
+             (((s32)a.word[i - 4] >> ((b.word[i - 4] & 31) - 1)) & 1);
+    }
+    dst.half[i] = clamp<s32>(temp, 0, 65535);
+  } else {
+    dst.half[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m256i __lasx_xvssrarn_w_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvssrarn_w_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssrarn.w.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

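Arithmetically right-shift (with rounding) the signed 64-bit elements in a by the corresponding elements in b (shift amount taken modulo 64), clamp each result to the signed 32-bit range, and store it in the lower half of each 128-bit lane of dst. The upper half of each lane is set to zero.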

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    s64 temp;
+    if ((b.dword[i] & 63) == 0) {
+      temp = (s64)a.dword[i];
+    } else {
+      temp = ((s64)a.dword[i] >> (b.dword[i] & 63)) +
+             (((s64)a.dword[i] >> ((b.dword[i] & 63) - 1)) & 1);
+    }
+    dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+  } else {
+    dst.word[i] = 0;
+  }
+}
+for (int i = 4; i < 8; i++) {
+  if (i < 6) {
+    s64 temp;
+    if ((b.dword[i - 2] & 63) == 0) {
+      temp = (s64)a.dword[i - 2];
+    } else {
+      temp = ((s64)a.dword[i - 2] >> (b.dword[i - 2] & 63)) +
+             (((s64)a.dword[i - 2] >> ((b.dword[i - 2] & 63) - 1)) & 1);
+    }
+    dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m256i __lasx_xvssrarn_wu_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvssrarn_wu_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssrarn.wu.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

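Arithmetically right-shift (with rounding) the signed 64-bit elements in a by the corresponding elements in b (shift amount taken modulo 64), clamp each result to the unsigned 32-bit range, and store it in the lower half of each 128-bit lane of dst. The upper half of each lane is set to zero.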

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    s64 temp;
+    if ((b.dword[i] & 63) == 0) {
+      temp = (s64)a.dword[i];
+    } else {
+      temp = ((s64)a.dword[i] >> (b.dword[i] & 63)) +
+             (((s64)a.dword[i] >> ((b.dword[i] & 63) - 1)) & 1);
+    }
+    dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+  } else {
+    dst.word[i] = 0;
+  }
+}
+for (int i = 4; i < 8; i++) {
+  if (i < 6) {
+    s64 temp;
+    if ((b.dword[i - 2] & 63) == 0) {
+      temp = (s64)a.dword[i - 2];
+    } else {
+      temp = ((s64)a.dword[i - 2] >> (b.dword[i - 2] & 63)) +
+             (((s64)a.dword[i - 2] >> ((b.dword[i - 2] & 63) - 1)) & 1);
+    }
+    dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m256i __lasx_xvssrarni_b_h (__m256i a, __m256i b, imm0_15 imm)

+

Synopsis

+
__m256i __lasx_xvssrarni_b_h (__m256i a, __m256i b, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvssrarni.b.h xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

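Arithmetically right-shift (with rounding) the signed 16-bit elements in a and b by imm, clamp each result to the signed 8-bit range, and store it in dst. Within each 128-bit lane, results from b fill the lower half and results from a fill the upper half.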

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    s16 temp;
+    if (imm == 0) {
+      temp = (s16)b.half[i];
+    } else {
+      temp = ((s16)b.half[i] >> imm) + (((s16)b.half[i] >> (imm - 1)) & 1);
+    }
+    dst.byte[i] = clamp<s16>(temp, -128, 127);
+  } else {
+    s16 temp;
+    if (imm == 0) {
+      temp = (s16)a.half[i - 8];
+    } else {
+      temp =
+          ((s16)a.half[i - 8] >> imm) + (((s16)a.half[i - 8] >> (imm - 1)) & 1);
+    }
+    dst.byte[i] = clamp<s16>(temp, -128, 127);
+  }
+}
+for (int i = 16; i < 32; i++) {
+  if (i < 24) {
+    s16 temp;
+    if (imm == 0) {
+      temp = (s16)b.half[i - 8];
+    } else {
+      temp =
+          ((s16)b.half[i - 8] >> imm) + (((s16)b.half[i - 8] >> (imm - 1)) & 1);
+    }
+    dst.byte[i] = clamp<s16>(temp, -128, 127);
+  } else {
+    s16 temp;
+    if (imm == 0) {
+      temp = (s16)a.half[i - 16];
+    } else {
+      temp = ((s16)a.half[i - 16] >> imm) +
+             (((s16)a.half[i - 16] >> (imm - 1)) & 1);
+    }
+    dst.byte[i] = clamp<s16>(temp, -128, 127);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m256i __lasx_xvssrarni_bu_h (__m256i a, __m256i b, imm0_15 imm)

+

Synopsis

+
__m256i __lasx_xvssrarni_bu_h (__m256i a, __m256i b, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvssrarni.bu.h xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

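Arithmetically right-shift (with rounding) the signed 16-bit elements in a and b by imm, clamp each result to the unsigned 8-bit range, and store it in dst. Within each 128-bit lane, results from b fill the lower half and results from a fill the upper half.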

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    s16 temp;
+    if (imm == 0) {
+      temp = (s16)b.half[i];
+    } else {
+      temp = ((s16)b.half[i] >> imm) + (((s16)b.half[i] >> (imm - 1)) & 1);
+    }
+    dst.byte[i] = clamp<s16>(temp, 0, 255);
+  } else {
+    s16 temp;
+    if (imm == 0) {
+      temp = (s16)a.half[i - 8];
+    } else {
+      temp =
+          ((s16)a.half[i - 8] >> imm) + (((s16)a.half[i - 8] >> (imm - 1)) & 1);
+    }
+    dst.byte[i] = clamp<s16>(temp, 0, 255);
+  }
+}
+for (int i = 16; i < 32; i++) {
+  if (i < 24) {
+    s16 temp;
+    if (imm == 0) {
+      temp = (s16)b.half[i - 8];
+    } else {
+      temp =
+          ((s16)b.half[i - 8] >> imm) + (((s16)b.half[i - 8] >> (imm - 1)) & 1);
+    }
+    dst.byte[i] = clamp<s16>(temp, 0, 255);
+  } else {
+    s16 temp;
+    if (imm == 0) {
+      temp = (s16)a.half[i - 16];
+    } else {
+      temp = ((s16)a.half[i - 16] >> imm) +
+             (((s16)a.half[i - 16] >> (imm - 1)) & 1);
+    }
+    dst.byte[i] = clamp<s16>(temp, 0, 255);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m256i __lasx_xvssrarni_h_w (__m256i a, __m256i b, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvssrarni_h_w (__m256i a, __m256i b, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvssrarni.h.w xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

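Arithmetically right-shift (with rounding) the signed 32-bit elements in a and b by imm, clamp each result to the signed 16-bit range, and store it in dst. Within each 128-bit lane, results from b fill the lower half and results from a fill the upper half.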

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    s32 temp;
+    if (imm == 0) {
+      temp = (s32)b.word[i];
+    } else {
+      temp = ((s32)b.word[i] >> imm) + (((s32)b.word[i] >> (imm - 1)) & 1);
+    }
+    dst.half[i] = clamp<s32>(temp, -32768, 32767);
+  } else {
+    s32 temp;
+    if (imm == 0) {
+      temp = (s32)a.word[i - 4];
+    } else {
+      temp =
+          ((s32)a.word[i - 4] >> imm) + (((s32)a.word[i - 4] >> (imm - 1)) & 1);
+    }
+    dst.half[i] = clamp<s32>(temp, -32768, 32767);
+  }
+}
+for (int i = 8; i < 16; i++) {
+  if (i < 12) {
+    s32 temp;
+    if (imm == 0) {
+      temp = (s32)b.word[i - 4];
+    } else {
+      temp =
+          ((s32)b.word[i - 4] >> imm) + (((s32)b.word[i - 4] >> (imm - 1)) & 1);
+    }
+    dst.half[i] = clamp<s32>(temp, -32768, 32767);
+  } else {
+    s32 temp;
+    if (imm == 0) {
+      temp = (s32)a.word[i - 8];
+    } else {
+      temp =
+          ((s32)a.word[i - 8] >> imm) + (((s32)a.word[i - 8] >> (imm - 1)) & 1);
+    }
+    dst.half[i] = clamp<s32>(temp, -32768, 32767);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m256i __lasx_xvssrarni_hu_w (__m256i a, __m256i b, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvssrarni_hu_w (__m256i a, __m256i b, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvssrarni.hu.w xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

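Arithmetically right-shift (with rounding) the signed 32-bit elements in a and b by imm, clamp each result to the unsigned 16-bit range, and store it in dst. Within each 128-bit lane, results from b fill the lower half and results from a fill the upper half.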

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    s32 temp;
+    if (imm == 0) {
+      temp = (s32)b.word[i];
+    } else {
+      temp = ((s32)b.word[i] >> imm) + (((s32)b.word[i] >> (imm - 1)) & 1);
+    }
+    dst.half[i] = clamp<s32>(temp, 0, 65535);
+  } else {
+    s32 temp;
+    if (imm == 0) {
+      temp = (s32)a.word[i - 4];
+    } else {
+      temp =
+          ((s32)a.word[i - 4] >> imm) + (((s32)a.word[i - 4] >> (imm - 1)) & 1);
+    }
+    dst.half[i] = clamp<s32>(temp, 0, 65535);
+  }
+}
+for (int i = 8; i < 16; i++) {
+  if (i < 12) {
+    s32 temp;
+    if (imm == 0) {
+      temp = (s32)b.word[i - 4];
+    } else {
+      temp =
+          ((s32)b.word[i - 4] >> imm) + (((s32)b.word[i - 4] >> (imm - 1)) & 1);
+    }
+    dst.half[i] = clamp<s32>(temp, 0, 65535);
+  } else {
+    s32 temp;
+    if (imm == 0) {
+      temp = (s32)a.word[i - 8];
+    } else {
+      temp =
+          ((s32)a.word[i - 8] >> imm) + (((s32)a.word[i - 8] >> (imm - 1)) & 1);
+    }
+    dst.half[i] = clamp<s32>(temp, 0, 65535);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m256i __lasx_xvssrarni_w_d (__m256i a, __m256i b, imm0_63 imm)

+

Synopsis

+
__m256i __lasx_xvssrarni_w_d (__m256i a, __m256i b, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvssrarni.w.d xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

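Arithmetically right-shift (with rounding) the signed 64-bit elements in a and b by imm, clamp each result to the signed 32-bit range, and store it in dst. Within each 128-bit lane, results from b fill the lower half and results from a fill the upper half.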

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    s64 temp;
+    if (imm == 0) {
+      temp = (s64)b.dword[i];
+    } else {
+      temp = ((s64)b.dword[i] >> imm) + (((s64)b.dword[i] >> (imm - 1)) & 1);
+    }
+    dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+  } else {
+    s64 temp;
+    if (imm == 0) {
+      temp = (s64)a.dword[i - 2];
+    } else {
+      temp = ((s64)a.dword[i - 2] >> imm) +
+             (((s64)a.dword[i - 2] >> (imm - 1)) & 1);
+    }
+    dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+  }
+}
+for (int i = 4; i < 8; i++) {
+  if (i < 6) {
+    s64 temp;
+    if (imm == 0) {
+      temp = (s64)b.dword[i - 2];
+    } else {
+      temp = ((s64)b.dword[i - 2] >> imm) +
+             (((s64)b.dword[i - 2] >> (imm - 1)) & 1);
+    }
+    dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+  } else {
+    s64 temp;
+    if (imm == 0) {
+      temp = (s64)a.dword[i - 4];
+    } else {
+      temp = ((s64)a.dword[i - 4] >> imm) +
+             (((s64)a.dword[i - 4] >> (imm - 1)) & 1);
+    }
+    dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (CPI) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
+

__m256i __lasx_xvssrarni_wu_d (__m256i a, __m256i b, imm0_63 imm)

+

Synopsis

+
__m256i __lasx_xvssrarni_wu_d (__m256i a, __m256i b, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvssrarni.wu.d xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 64-bit elements in a and b by imm, clamp to fit in unsigned 32-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    s64 temp;
+    if (imm == 0) {
+      temp = (s64)b.dword[i];
+    } else {
+      temp = ((s64)b.dword[i] >> imm) + (((s64)b.dword[i] >> (imm - 1)) & 1);
+    }
+    dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+  } else {
+    s64 temp;
+    if (imm == 0) {
+      temp = (s64)a.dword[i - 2];
+    } else {
+      temp = ((s64)a.dword[i - 2] >> imm) +
+             (((s64)a.dword[i - 2] >> (imm - 1)) & 1);
+    }
+    dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+  }
+}
+for (int i = 4; i < 8; i++) {
+  if (i < 6) {
+    s64 temp;
+    if (imm == 0) {
+      temp = (s64)b.dword[i - 2];
+    } else {
+      temp = ((s64)b.dword[i - 2] >> imm) +
+             (((s64)b.dword[i - 2] >> (imm - 1)) & 1);
+    }
+    dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+  } else {
+    s64 temp;
+    if (imm == 0) {
+      temp = (s64)a.dword[i - 4];
+    } else {
+      temp = ((s64)a.dword[i - 4] >> imm) +
+             (((s64)a.dword[i - 4] >> (imm - 1)) & 1);
+    }
+    dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (CPI) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
+

__m256i __lasx_xvssrarni_d_q (__m256i a, __m256i b, imm0_127 imm)

+

Synopsis

+
__m256i __lasx_xvssrarni_d_q (__m256i a, __m256i b, imm0_127 imm)
+#include <lasxintrin.h>
+Instruction: xvssrarni.d.q xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 128-bit elements in a and b by imm, clamp to fit in signed 64-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (i < 1) {
+    s128 temp;
+    if (imm == 0) {
+      temp = (s128)b.qword[i];
+    } else {
+      temp = ((s128)b.qword[i] >> imm) + (((s128)b.qword[i] >> (imm - 1)) & 1);
+    }
+    dst.dword[i] = clamp<s128>(temp, -9223372036854775808, 9223372036854775807);
+  } else {
+    s128 temp;
+    if (imm == 0) {
+      temp = (s128)a.qword[i - 1];
+    } else {
+      temp = ((s128)a.qword[i - 1] >> imm) +
+             (((s128)a.qword[i - 1] >> (imm - 1)) & 1);
+    }
+    dst.dword[i] = clamp<s128>(temp, -9223372036854775808, 9223372036854775807);
+  }
+}
+for (int i = 2; i < 4; i++) {
+  if (i < 3) {
+    s128 temp;
+    if (imm == 0) {
+      temp = (s128)b.qword[i - 1];
+    } else {
+      temp = ((s128)b.qword[i - 1] >> imm) +
+             (((s128)b.qword[i - 1] >> (imm - 1)) & 1);
+    }
+    dst.dword[i] = clamp<s128>(temp, -9223372036854775808, 9223372036854775807);
+  } else {
+    s128 temp;
+    if (imm == 0) {
+      temp = (s128)a.qword[i - 2];
+    } else {
+      temp = ((s128)a.qword[i - 2] >> imm) +
+             (((s128)a.qword[i - 2] >> (imm - 1)) & 1);
+    }
+    dst.dword[i] = clamp<s128>(temp, -9223372036854775808, 9223372036854775807);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (CPI) |
| --- | --- | --- |
| 3A6000 | 3 | 2 |
| 3C5000 | 3 | 2 |
+

__m256i __lasx_xvssrarni_du_q (__m256i a, __m256i b, imm0_127 imm)

+

Synopsis

+
__m256i __lasx_xvssrarni_du_q (__m256i a, __m256i b, imm0_127 imm)
+#include <lasxintrin.h>
+Instruction: xvssrarni.du.q xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 128-bit elements in a and b by imm, clamp to fit in unsigned 64-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (i < 1) {
+    s128 temp;
+    if (imm == 0) {
+      temp = (s128)b.qword[i];
+    } else {
+      temp = ((s128)b.qword[i] >> imm) + (((s128)b.qword[i] >> (imm - 1)) & 1);
+    }
+    dst.dword[i] = clamp<s128>(temp, 0, 18446744073709551615);
+  } else {
+    s128 temp;
+    if (imm == 0) {
+      temp = (s128)a.qword[i - 1];
+    } else {
+      temp = ((s128)a.qword[i - 1] >> imm) +
+             (((s128)a.qword[i - 1] >> (imm - 1)) & 1);
+    }
+    dst.dword[i] = clamp<s128>(temp, 0, 18446744073709551615);
+  }
+}
+for (int i = 2; i < 4; i++) {
+  if (i < 3) {
+    s128 temp;
+    if (imm == 0) {
+      temp = (s128)b.qword[i - 1];
+    } else {
+      temp = ((s128)b.qword[i - 1] >> imm) +
+             (((s128)b.qword[i - 1] >> (imm - 1)) & 1);
+    }
+    dst.dword[i] = clamp<s128>(temp, 0, 18446744073709551615);
+  } else {
+    s128 temp;
+    if (imm == 0) {
+      temp = (s128)a.qword[i - 2];
+    } else {
+      temp = ((s128)a.qword[i - 2] >> imm) +
+             (((s128)a.qword[i - 2] >> (imm - 1)) & 1);
+    }
+    dst.dword[i] = clamp<s128>(temp, 0, 18446744073709551615);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (CPI) |
| --- | --- | --- |
| 3A6000 | 3 | 2 |
| 3C5000 | 3 | 2 |
+

__m256i __lasx_xvssrln_b_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvssrln_b_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssrln.b.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

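Logical right shift the unsigned 16-bit elements in a by the corresponding elements in b (shift amount taken modulo 16), clamp to fit in signed 8-bit integer and store the results to the lower 64 bits of each 128-bit lane of dst; the upper 64 bits of each lane are set to zero.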

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    u16 temp = (u16)a.half[i] >> (b.half[i] & 15);
+    dst.byte[i] = clamp<u16>(temp, 0, 127);
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+for (int i = 16; i < 32; i++) {
+  if (i < 24) {
+    u16 temp = (u16)a.half[i - 8] >> (b.half[i - 8] & 15);
+    dst.byte[i] = clamp<u16>(temp, 0, 127);
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (CPI) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
+

__m256i __lasx_xvssrln_bu_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvssrln_bu_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssrln.bu.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

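Logical right shift the unsigned 16-bit elements in a by the corresponding elements in b (shift amount taken modulo 16), clamp to fit in unsigned 8-bit integer and store the results to the lower 64 bits of each 128-bit lane of dst; the upper 64 bits of each lane are set to zero.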

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    u16 temp = (u16)a.half[i] >> (b.half[i] & 15);
+    dst.byte[i] = clamp<u16>(temp, 0, 255);
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+for (int i = 16; i < 32; i++) {
+  if (i < 24) {
+    u16 temp = (u16)a.half[i - 8] >> (b.half[i - 8] & 15);
+    dst.byte[i] = clamp<u16>(temp, 0, 255);
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (CPI) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
+

__m256i __lasx_xvssrln_h_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvssrln_h_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssrln.h.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

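Logical right shift the unsigned 32-bit elements in a by the corresponding elements in b (shift amount taken modulo 32), clamp to fit in signed 16-bit integer and store the results to the lower 64 bits of each 128-bit lane of dst; the upper 64 bits of each lane are set to zero.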

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    u32 temp = (u32)a.word[i] >> (b.word[i] & 31);
+    dst.half[i] = clamp<u32>(temp, 0, 32767);
+  } else {
+    dst.half[i] = 0;
+  }
+}
+for (int i = 8; i < 16; i++) {
+  if (i < 12) {
+    u32 temp = (u32)a.word[i - 4] >> (b.word[i - 4] & 31);
+    dst.half[i] = clamp<u32>(temp, 0, 32767);
+  } else {
+    dst.half[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (CPI) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
+

__m256i __lasx_xvssrln_hu_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvssrln_hu_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssrln.hu.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

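Logical right shift the unsigned 32-bit elements in a by the corresponding elements in b (shift amount taken modulo 32), clamp to fit in unsigned 16-bit integer and store the results to the lower 64 bits of each 128-bit lane of dst; the upper 64 bits of each lane are set to zero.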

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    u32 temp = (u32)a.word[i] >> (b.word[i] & 31);
+    dst.half[i] = clamp<u32>(temp, 0, 65535);
+  } else {
+    dst.half[i] = 0;
+  }
+}
+for (int i = 8; i < 16; i++) {
+  if (i < 12) {
+    u32 temp = (u32)a.word[i - 4] >> (b.word[i - 4] & 31);
+    dst.half[i] = clamp<u32>(temp, 0, 65535);
+  } else {
+    dst.half[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (CPI) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
+

__m256i __lasx_xvssrln_w_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvssrln_w_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssrln.w.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

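Logical right shift the unsigned 64-bit elements in a by the corresponding elements in b (shift amount taken modulo 64), clamp to fit in signed 32-bit integer and store the results to the lower 64 bits of each 128-bit lane of dst; the upper 64 bits of each lane are set to zero.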

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    u64 temp = (u64)a.dword[i] >> (b.dword[i] & 63);
+    dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+  } else {
+    dst.word[i] = 0;
+  }
+}
+for (int i = 4; i < 8; i++) {
+  if (i < 6) {
+    u64 temp = (u64)a.dword[i - 2] >> (b.dword[i - 2] & 63);
+    dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (CPI) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
+

__m256i __lasx_xvssrln_wu_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvssrln_wu_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssrln.wu.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

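Logical right shift the unsigned 64-bit elements in a by the corresponding elements in b (shift amount taken modulo 64), clamp to fit in unsigned 32-bit integer and store the results to the lower 64 bits of each 128-bit lane of dst; the upper 64 bits of each lane are set to zero.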

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    u64 temp = (u64)a.dword[i] >> (b.dword[i] & 63);
+    dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+  } else {
+    dst.word[i] = 0;
+  }
+}
+for (int i = 4; i < 8; i++) {
+  if (i < 6) {
+    u64 temp = (u64)a.dword[i - 2] >> (b.dword[i - 2] & 63);
+    dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (CPI) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
+

__m256i __lasx_xvssrlni_b_h (__m256i a, __m256i b, imm0_15 imm)

+

Synopsis

+
__m256i __lasx_xvssrlni_b_h (__m256i a, __m256i b, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvssrlni.b.h xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Logical right shift the unsigned 16-bit elements in a and b by imm, clamp to fit in signed 8-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    u16 temp = (u16)b.half[i] >> imm;
+    dst.byte[i] = clamp<u16>(temp, 0, 127);
+  } else {
+    u16 temp = (u16)a.half[i - 8] >> imm;
+    dst.byte[i] = clamp<u16>(temp, 0, 127);
+  }
+}
+for (int i = 16; i < 32; i++) {
+  if (i < 24) {
+    u16 temp = (u16)b.half[i - 8] >> imm;
+    dst.byte[i] = clamp<u16>(temp, 0, 127);
+  } else {
+    u16 temp = (u16)a.half[i - 16] >> imm;
+    dst.byte[i] = clamp<u16>(temp, 0, 127);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (CPI) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
+

__m256i __lasx_xvssrlni_bu_h (__m256i a, __m256i b, imm0_15 imm)

+

Synopsis

+
__m256i __lasx_xvssrlni_bu_h (__m256i a, __m256i b, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvssrlni.bu.h xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Logical right shift the unsigned 16-bit elements in a and b by imm, clamp to fit in unsigned 8-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    u16 temp = (u16)b.half[i] >> imm;
+    dst.byte[i] = clamp<u16>(temp, 0, 255);
+  } else {
+    u16 temp = (u16)a.half[i - 8] >> imm;
+    dst.byte[i] = clamp<u16>(temp, 0, 255);
+  }
+}
+for (int i = 16; i < 32; i++) {
+  if (i < 24) {
+    u16 temp = (u16)b.half[i - 8] >> imm;
+    dst.byte[i] = clamp<u16>(temp, 0, 255);
+  } else {
+    u16 temp = (u16)a.half[i - 16] >> imm;
+    dst.byte[i] = clamp<u16>(temp, 0, 255);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (CPI) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
+

__m256i __lasx_xvssrlni_h_w (__m256i a, __m256i b, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvssrlni_h_w (__m256i a, __m256i b, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvssrlni.h.w xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Logical right shift the unsigned 32-bit elements in a and b by imm, clamp to fit in signed 16-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    u32 temp = (u32)b.word[i] >> imm;
+    dst.half[i] = clamp<u32>(temp, 0, 32767);
+  } else {
+    u32 temp = (u32)a.word[i - 4] >> imm;
+    dst.half[i] = clamp<u32>(temp, 0, 32767);
+  }
+}
+for (int i = 8; i < 16; i++) {
+  if (i < 12) {
+    u32 temp = (u32)b.word[i - 4] >> imm;
+    dst.half[i] = clamp<u32>(temp, 0, 32767);
+  } else {
+    u32 temp = (u32)a.word[i - 8] >> imm;
+    dst.half[i] = clamp<u32>(temp, 0, 32767);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (CPI) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
+

__m256i __lasx_xvssrlni_hu_w (__m256i a, __m256i b, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvssrlni_hu_w (__m256i a, __m256i b, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvssrlni.hu.w xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Logical right shift the unsigned 32-bit elements in a and b by imm, clamp to fit in unsigned 16-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    u32 temp = (u32)b.word[i] >> imm;
+    dst.half[i] = clamp<u32>(temp, 0, 65535);
+  } else {
+    u32 temp = (u32)a.word[i - 4] >> imm;
+    dst.half[i] = clamp<u32>(temp, 0, 65535);
+  }
+}
+for (int i = 8; i < 16; i++) {
+  if (i < 12) {
+    u32 temp = (u32)b.word[i - 4] >> imm;
+    dst.half[i] = clamp<u32>(temp, 0, 65535);
+  } else {
+    u32 temp = (u32)a.word[i - 8] >> imm;
+    dst.half[i] = clamp<u32>(temp, 0, 65535);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (CPI) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
+

__m256i __lasx_xvssrlni_w_d (__m256i a, __m256i b, imm0_63 imm)

+

Synopsis

+
__m256i __lasx_xvssrlni_w_d (__m256i a, __m256i b, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvssrlni.w.d xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Logical right shift the unsigned 64-bit elements in a and b by imm, clamp to fit in signed 32-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    u64 temp = (u64)b.dword[i] >> imm;
+    dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+  } else {
+    u64 temp = (u64)a.dword[i - 2] >> imm;
+    dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+  }
+}
+for (int i = 4; i < 8; i++) {
+  if (i < 6) {
+    u64 temp = (u64)b.dword[i - 2] >> imm;
+    dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+  } else {
+    u64 temp = (u64)a.dword[i - 4] >> imm;
+    dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (CPI) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
+

__m256i __lasx_xvssrlni_wu_d (__m256i a, __m256i b, imm0_63 imm)

+

Synopsis

+
__m256i __lasx_xvssrlni_wu_d (__m256i a, __m256i b, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvssrlni.wu.d xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Logical right shift the unsigned 64-bit elements in a and b by imm, clamp to fit in unsigned 32-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    u64 temp = (u64)b.dword[i] >> imm;
+    dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+  } else {
+    u64 temp = (u64)a.dword[i - 2] >> imm;
+    dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+  }
+}
+for (int i = 4; i < 8; i++) {
+  if (i < 6) {
+    u64 temp = (u64)b.dword[i - 2] >> imm;
+    dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+  } else {
+    u64 temp = (u64)a.dword[i - 4] >> imm;
+    dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (CPI) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
+

__m256i __lasx_xvssrlni_d_q (__m256i a, __m256i b, imm0_127 imm)

+

Synopsis

+
__m256i __lasx_xvssrlni_d_q (__m256i a, __m256i b, imm0_127 imm)
+#include <lasxintrin.h>
+Instruction: xvssrlni.d.q xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Logical right shift the unsigned 128-bit elements in a and b by imm, clamp to fit in signed 64-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (i < 1) {
+    u128 temp = (u128)b.qword[i] >> imm;
+    dst.dword[i] = clamp<u128>(temp, 0, 9223372036854775807);
+  } else {
+    u128 temp = (u128)a.qword[i - 1] >> imm;
+    dst.dword[i] = clamp<u128>(temp, 0, 9223372036854775807);
+  }
+}
+for (int i = 2; i < 4; i++) {
+  if (i < 3) {
+    u128 temp = (u128)b.qword[i - 1] >> imm;
+    dst.dword[i] = clamp<u128>(temp, 0, 9223372036854775807);
+  } else {
+    u128 temp = (u128)a.qword[i - 2] >> imm;
+    dst.dword[i] = clamp<u128>(temp, 0, 9223372036854775807);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (CPI) |
| --- | --- | --- |
| 3A6000 | 3 | 2 |
| 3C5000 | 3 | 2 |
+

__m256i __lasx_xvssrlni_du_q (__m256i a, __m256i b, imm0_127 imm)

+

Synopsis

+
__m256i __lasx_xvssrlni_du_q (__m256i a, __m256i b, imm0_127 imm)
+#include <lasxintrin.h>
+Instruction: xvssrlni.du.q xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Logical right shift the unsigned 128-bit elements in a and b by imm, clamp to fit in unsigned 64-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (i < 1) {
+    u128 temp = (u128)b.qword[i] >> imm;
+    dst.dword[i] = clamp<u128>(temp, 0, 18446744073709551615);
+  } else {
+    u128 temp = (u128)a.qword[i - 1] >> imm;
+    dst.dword[i] = clamp<u128>(temp, 0, 18446744073709551615);
+  }
+}
+for (int i = 2; i < 4; i++) {
+  if (i < 3) {
+    u128 temp = (u128)b.qword[i - 1] >> imm;
+    dst.dword[i] = clamp<u128>(temp, 0, 18446744073709551615);
+  } else {
+    u128 temp = (u128)a.qword[i - 2] >> imm;
+    dst.dword[i] = clamp<u128>(temp, 0, 18446744073709551615);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (CPI) |
| --- | --- | --- |
| 3A6000 | 3 | 2 |
| 3C5000 | 3 | 2 |
+

__m256i __lasx_xvssrlrn_b_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvssrlrn_b_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssrlrn.b.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

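Logical right shift (with rounding) the unsigned 16-bit elements in a by the corresponding elements in b (shift amount taken modulo 16), clamp to fit in signed 8-bit integer and store the results to the lower 64 bits of each 128-bit lane of dst; the upper 64 bits of each lane are set to zero.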

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    u16 temp;
+    if ((b.half[i] & 15) == 0) {
+      temp = (u16)a.half[i];
+    } else {
+      temp = ((u16)a.half[i] >> (b.half[i] & 15)) +
+             (((u16)a.half[i] >> ((b.half[i] & 15) - 1)) & 1);
+    }
+    dst.byte[i] = clamp<u16>(temp, 0, 127);
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+for (int i = 16; i < 32; i++) {
+  if (i < 24) {
+    u16 temp;
+    if ((b.half[i - 8] & 15) == 0) {
+      temp = (u16)a.half[i - 8];
+    } else {
+      temp = ((u16)a.half[i - 8] >> (b.half[i - 8] & 15)) +
+             (((u16)a.half[i - 8] >> ((b.half[i - 8] & 15) - 1)) & 1);
+    }
+    dst.byte[i] = clamp<u16>(temp, 0, 127);
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (CPI) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
+

__m256i __lasx_xvssrlrn_bu_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvssrlrn_bu_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssrlrn.bu.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

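Logical right shift (with rounding) the unsigned 16-bit elements in a by the corresponding elements in b (shift amount taken modulo 16), clamp to fit in unsigned 8-bit integer and store the results to the lower 64 bits of each 128-bit lane of dst; the upper 64 bits of each lane are set to zero.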

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    u16 temp;
+    if ((b.half[i] & 15) == 0) {
+      temp = (u16)a.half[i];
+    } else {
+      temp = ((u16)a.half[i] >> (b.half[i] & 15)) +
+             (((u16)a.half[i] >> ((b.half[i] & 15) - 1)) & 1);
+    }
+    dst.byte[i] = clamp<u16>(temp, 0, 255);
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+for (int i = 16; i < 32; i++) {
+  if (i < 24) {
+    u16 temp;
+    if ((b.half[i - 8] & 15) == 0) {
+      temp = (u16)a.half[i - 8];
+    } else {
+      temp = ((u16)a.half[i - 8] >> (b.half[i - 8] & 15)) +
+             (((u16)a.half[i - 8] >> ((b.half[i - 8] & 15) - 1)) & 1);
+    }
+    dst.byte[i] = clamp<u16>(temp, 0, 255);
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (CPI) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
+

__m256i __lasx_xvssrlrn_h_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvssrlrn_h_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssrlrn.h.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

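Logical right shift (with rounding) the unsigned 32-bit elements in a by the corresponding elements in b (shift amount taken modulo 32), clamp to fit in signed 16-bit integer and store the results to the lower 64 bits of each 128-bit lane of dst; the upper 64 bits of each lane are set to zero.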

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    u32 temp;
+    if ((b.word[i] & 31) == 0) {
+      temp = (u32)a.word[i];
+    } else {
+      temp = ((u32)a.word[i] >> (b.word[i] & 31)) +
+             (((u32)a.word[i] >> ((b.word[i] & 31) - 1)) & 1);
+    }
+    dst.half[i] = clamp<u32>(temp, 0, 32767);
+  } else {
+    dst.half[i] = 0;
+  }
+}
+for (int i = 8; i < 16; i++) {
+  if (i < 12) {
+    u32 temp;
+    if ((b.word[i - 4] & 31) == 0) {
+      temp = (u32)a.word[i - 4];
+    } else {
+      temp = ((u32)a.word[i - 4] >> (b.word[i - 4] & 31)) +
+             (((u32)a.word[i - 4] >> ((b.word[i - 4] & 31) - 1)) & 1);
+    }
+    dst.half[i] = clamp<u32>(temp, 0, 32767);
+  } else {
+    dst.half[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (CPI) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
+

__m256i __lasx_xvssrlrn_hu_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvssrlrn_hu_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssrlrn.hu.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

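Logical right shift (with rounding) the unsigned 32-bit elements in a by the corresponding elements in b (shift amount taken modulo 32), clamp to fit in unsigned 16-bit integer and store the results to the lower 64 bits of each 128-bit lane of dst; the upper 64 bits of each lane are set to zero.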

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    u32 temp;
+    if ((b.word[i] & 31) == 0) {
+      temp = (u32)a.word[i];
+    } else {
+      temp = ((u32)a.word[i] >> (b.word[i] & 31)) +
+             (((u32)a.word[i] >> ((b.word[i] & 31) - 1)) & 1);
+    }
+    dst.half[i] = clamp<u32>(temp, 0, 65535);
+  } else {
+    dst.half[i] = 0;
+  }
+}
+for (int i = 8; i < 16; i++) {
+  if (i < 12) {
+    u32 temp;
+    if ((b.word[i - 4] & 31) == 0) {
+      temp = (u32)a.word[i - 4];
+    } else {
+      temp = ((u32)a.word[i - 4] >> (b.word[i - 4] & 31)) +
+             (((u32)a.word[i - 4] >> ((b.word[i - 4] & 31) - 1)) & 1);
+    }
+    dst.half[i] = clamp<u32>(temp, 0, 65535);
+  } else {
+    dst.half[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (CPI) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
+

__m256i __lasx_xvssrlrn_w_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvssrlrn_w_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssrlrn.w.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

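Logical right shift (with rounding) the unsigned 64-bit elements in a by the corresponding elements in b (shift amount taken modulo 64), clamp to fit in signed 32-bit integer and store the results to the lower 64 bits of each 128-bit lane of dst; the upper 64 bits of each lane are set to zero.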

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    u64 temp;
+    if ((b.dword[i] & 63) == 0) {
+      temp = (u64)a.dword[i];
+    } else {
+      temp = ((u64)a.dword[i] >> (b.dword[i] & 63)) +
+             (((u64)a.dword[i] >> ((b.dword[i] & 63) - 1)) & 1);
+    }
+    dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+  } else {
+    dst.word[i] = 0;
+  }
+}
+for (int i = 4; i < 8; i++) {
+  if (i < 6) {
+    u64 temp;
+    if ((b.dword[i - 2] & 63) == 0) {
+      temp = (u64)a.dword[i - 2];
+    } else {
+      temp = ((u64)a.dword[i - 2] >> (b.dword[i - 2] & 63)) +
+             (((u64)a.dword[i - 2] >> ((b.dword[i - 2] & 63) - 1)) & 1);
+    }
+    dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (CPI) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
+

__m256i __lasx_xvssrlrn_wu_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvssrlrn_wu_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssrlrn.wu.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

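Logical right shift (with rounding) the unsigned 64-bit elements in a by the corresponding elements in b (shift amount taken modulo 64), clamp to fit in unsigned 32-bit integer and store the results to the lower 64 bits of each 128-bit lane of dst; the upper 64 bits of each lane are set to zero.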

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    u64 temp;
+    if ((b.dword[i] & 63) == 0) {
+      temp = (u64)a.dword[i];
+    } else {
+      temp = ((u64)a.dword[i] >> (b.dword[i] & 63)) +
+             (((u64)a.dword[i] >> ((b.dword[i] & 63) - 1)) & 1);
+    }
+    dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+  } else {
+    dst.word[i] = 0;
+  }
+}
+for (int i = 4; i < 8; i++) {
+  if (i < 6) {
+    u64 temp;
+    if ((b.dword[i - 2] & 63) == 0) {
+      temp = (u64)a.dword[i - 2];
+    } else {
+      temp = ((u64)a.dword[i - 2] >> (b.dword[i - 2] & 63)) +
+             (((u64)a.dword[i - 2] >> ((b.dword[i - 2] & 63) - 1)) & 1);
+    }
+    dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (CPI) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
+

__m256i __lasx_xvssrlrni_b_h (__m256i a, __m256i b, imm0_15 imm)

+

Synopsis

+
__m256i __lasx_xvssrlrni_b_h (__m256i a, __m256i b, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvssrlrni.b.h xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Logical right shift (with rounding) the unsigned 16-bit elements in a and b by imm, clamp to fit in signed 8-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    u16 temp;
+    if (imm == 0) {
+      temp = (u16)b.half[i];
+    } else {
+      temp = ((u16)b.half[i] >> imm) + (((u16)b.half[i] >> (imm - 1)) & 1);
+    }
+    dst.byte[i] = clamp<u16>(temp, 0, 127);
+  } else {
+    u16 temp;
+    if (imm == 0) {
+      temp = (u16)a.half[i - 8];
+    } else {
+      temp =
+          ((u16)a.half[i - 8] >> imm) + (((u16)a.half[i - 8] >> (imm - 1)) & 1);
+    }
+    dst.byte[i] = clamp<u16>(temp, 0, 127);
+  }
+}
+for (int i = 16; i < 32; i++) {
+  if (i < 24) {
+    u16 temp;
+    if (imm == 0) {
+      temp = (u16)b.half[i - 8];
+    } else {
+      temp =
+          ((u16)b.half[i - 8] >> imm) + (((u16)b.half[i - 8] >> (imm - 1)) & 1);
+    }
+    dst.byte[i] = clamp<u16>(temp, 0, 127);
+  } else {
+    u16 temp;
+    if (imm == 0) {
+      temp = (u16)a.half[i - 16];
+    } else {
+      temp = ((u16)a.half[i - 16] >> imm) +
+             (((u16)a.half[i - 16] >> (imm - 1)) & 1);
+    }
+    dst.byte[i] = clamp<u16>(temp, 0, 127);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (CPI) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
+

__m256i __lasx_xvssrlrni_bu_h (__m256i a, __m256i b, imm0_15 imm)

+

Synopsis

+
__m256i __lasx_xvssrlrni_bu_h (__m256i a, __m256i b, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvssrlrni.bu.h xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Logical right shift (with rounding) the unsigned 16-bit elements in a and b by imm, clamp to fit in unsigned 8-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    u16 temp;
+    if (imm == 0) {
+      temp = (u16)b.half[i];
+    } else {
+      temp = ((u16)b.half[i] >> imm) + (((u16)b.half[i] >> (imm - 1)) & 1);
+    }
+    dst.byte[i] = clamp<u16>(temp, 0, 255);
+  } else {
+    u16 temp;
+    if (imm == 0) {
+      temp = (u16)a.half[i - 8];
+    } else {
+      temp =
+          ((u16)a.half[i - 8] >> imm) + (((u16)a.half[i - 8] >> (imm - 1)) & 1);
+    }
+    dst.byte[i] = clamp<u16>(temp, 0, 255);
+  }
+}
+for (int i = 16; i < 32; i++) {
+  if (i < 24) {
+    u16 temp;
+    if (imm == 0) {
+      temp = (u16)b.half[i - 8];
+    } else {
+      temp =
+          ((u16)b.half[i - 8] >> imm) + (((u16)b.half[i - 8] >> (imm - 1)) & 1);
+    }
+    dst.byte[i] = clamp<u16>(temp, 0, 255);
+  } else {
+    u16 temp;
+    if (imm == 0) {
+      temp = (u16)a.half[i - 16];
+    } else {
+      temp = ((u16)a.half[i - 16] >> imm) +
+             (((u16)a.half[i - 16] >> (imm - 1)) & 1);
+    }
+    dst.byte[i] = clamp<u16>(temp, 0, 255);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (CPI) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
+

__m256i __lasx_xvssrlrni_h_w (__m256i a, __m256i b, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvssrlrni_h_w (__m256i a, __m256i b, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvssrlrni.h.w xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Logical right shift (with rounding) the unsigned 32-bit elements in a and b by imm, clamp to fit in signed 16-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    u32 temp;
+    if (imm == 0) {
+      temp = (u32)b.word[i];
+    } else {
+      temp = ((u32)b.word[i] >> imm) + (((u32)b.word[i] >> (imm - 1)) & 1);
+    }
+    dst.half[i] = clamp<u32>(temp, 0, 32767);
+  } else {
+    u32 temp;
+    if (imm == 0) {
+      temp = (u32)a.word[i - 4];
+    } else {
+      temp =
+          ((u32)a.word[i - 4] >> imm) + (((u32)a.word[i - 4] >> (imm - 1)) & 1);
+    }
+    dst.half[i] = clamp<u32>(temp, 0, 32767);
+  }
+}
+for (int i = 8; i < 16; i++) {
+  if (i < 12) {
+    u32 temp;
+    if (imm == 0) {
+      temp = (u32)b.word[i - 4];
+    } else {
+      temp =
+          ((u32)b.word[i - 4] >> imm) + (((u32)b.word[i - 4] >> (imm - 1)) & 1);
+    }
+    dst.half[i] = clamp<u32>(temp, 0, 32767);
+  } else {
+    u32 temp;
+    if (imm == 0) {
+      temp = (u32)a.word[i - 8];
+    } else {
+      temp =
+          ((u32)a.word[i - 8] >> imm) + (((u32)a.word[i - 8] >> (imm - 1)) & 1);
+    }
+    dst.half[i] = clamp<u32>(temp, 0, 32767);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m256i __lasx_xvssrlrni_hu_w (__m256i a, __m256i b, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvssrlrni_hu_w (__m256i a, __m256i b, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvssrlrni.hu.w xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Logical right shift (with rounding) the unsigned 32-bit elements in a and b by imm, clamp to fit in unsigned 16-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    u32 temp;
+    if (imm == 0) {
+      temp = (u32)b.word[i];
+    } else {
+      temp = ((u32)b.word[i] >> imm) + (((u32)b.word[i] >> (imm - 1)) & 1);
+    }
+    dst.half[i] = clamp<u32>(temp, 0, 65535);
+  } else {
+    u32 temp;
+    if (imm == 0) {
+      temp = (u32)a.word[i - 4];
+    } else {
+      temp =
+          ((u32)a.word[i - 4] >> imm) + (((u32)a.word[i - 4] >> (imm - 1)) & 1);
+    }
+    dst.half[i] = clamp<u32>(temp, 0, 65535);
+  }
+}
+for (int i = 8; i < 16; i++) {
+  if (i < 12) {
+    u32 temp;
+    if (imm == 0) {
+      temp = (u32)b.word[i - 4];
+    } else {
+      temp =
+          ((u32)b.word[i - 4] >> imm) + (((u32)b.word[i - 4] >> (imm - 1)) & 1);
+    }
+    dst.half[i] = clamp<u32>(temp, 0, 65535);
+  } else {
+    u32 temp;
+    if (imm == 0) {
+      temp = (u32)a.word[i - 8];
+    } else {
+      temp =
+          ((u32)a.word[i - 8] >> imm) + (((u32)a.word[i - 8] >> (imm - 1)) & 1);
+    }
+    dst.half[i] = clamp<u32>(temp, 0, 65535);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m256i __lasx_xvssrlrni_w_d (__m256i a, __m256i b, imm0_63 imm)

+

Synopsis

+
__m256i __lasx_xvssrlrni_w_d (__m256i a, __m256i b, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvssrlrni.w.d xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Logical right shift (with rounding) the unsigned 64-bit elements in a and b by imm, clamp to fit in signed 32-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    u64 temp;
+    if (imm == 0) {
+      temp = (u64)b.dword[i];
+    } else {
+      temp = ((u64)b.dword[i] >> imm) + (((u64)b.dword[i] >> (imm - 1)) & 1);
+    }
+    dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+  } else {
+    u64 temp;
+    if (imm == 0) {
+      temp = (u64)a.dword[i - 2];
+    } else {
+      temp = ((u64)a.dword[i - 2] >> imm) +
+             (((u64)a.dword[i - 2] >> (imm - 1)) & 1);
+    }
+    dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+  }
+}
+for (int i = 4; i < 8; i++) {
+  if (i < 6) {
+    u64 temp;
+    if (imm == 0) {
+      temp = (u64)b.dword[i - 2];
+    } else {
+      temp = ((u64)b.dword[i - 2] >> imm) +
+             (((u64)b.dword[i - 2] >> (imm - 1)) & 1);
+    }
+    dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+  } else {
+    u64 temp;
+    if (imm == 0) {
+      temp = (u64)a.dword[i - 4];
+    } else {
+      temp = ((u64)a.dword[i - 4] >> imm) +
+             (((u64)a.dword[i - 4] >> (imm - 1)) & 1);
+    }
+    dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m256i __lasx_xvssrlrni_wu_d (__m256i a, __m256i b, imm0_63 imm)

+

Synopsis

+
__m256i __lasx_xvssrlrni_wu_d (__m256i a, __m256i b, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvssrlrni.wu.d xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Logical right shift (with rounding) the unsigned 64-bit elements in a and b by imm, clamp to fit in unsigned 32-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    u64 temp;
+    if (imm == 0) {
+      temp = (u64)b.dword[i];
+    } else {
+      temp = ((u64)b.dword[i] >> imm) + (((u64)b.dword[i] >> (imm - 1)) & 1);
+    }
+    dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+  } else {
+    u64 temp;
+    if (imm == 0) {
+      temp = (u64)a.dword[i - 2];
+    } else {
+      temp = ((u64)a.dword[i - 2] >> imm) +
+             (((u64)a.dword[i - 2] >> (imm - 1)) & 1);
+    }
+    dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+  }
+}
+for (int i = 4; i < 8; i++) {
+  if (i < 6) {
+    u64 temp;
+    if (imm == 0) {
+      temp = (u64)b.dword[i - 2];
+    } else {
+      temp = ((u64)b.dword[i - 2] >> imm) +
+             (((u64)b.dword[i - 2] >> (imm - 1)) & 1);
+    }
+    dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+  } else {
+    u64 temp;
+    if (imm == 0) {
+      temp = (u64)a.dword[i - 4];
+    } else {
+      temp = ((u64)a.dword[i - 4] >> imm) +
+             (((u64)a.dword[i - 4] >> (imm - 1)) & 1);
+    }
+    dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m256i __lasx_xvssrlrni_d_q (__m256i a, __m256i b, imm0_127 imm)

+

Synopsis

+
__m256i __lasx_xvssrlrni_d_q (__m256i a, __m256i b, imm0_127 imm)
+#include <lasxintrin.h>
+Instruction: xvssrlrni.d.q xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Logical right shift (with rounding) the unsigned 128-bit elements in a and b by imm, clamp to fit in signed 64-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (i < 1) {
+    u128 temp;
+    if (imm == 0) {
+      temp = (u128)b.qword[i];
+    } else {
+      temp = ((u128)b.qword[i] >> imm) + (((u128)b.qword[i] >> (imm - 1)) & 1);
+    }
+    dst.dword[i] = clamp<u128>(temp, 0, 9223372036854775807);
+  } else {
+    u128 temp;
+    if (imm == 0) {
+      temp = (u128)a.qword[i - 1];
+    } else {
+      temp = ((u128)a.qword[i - 1] >> imm) +
+             (((u128)a.qword[i - 1] >> (imm - 1)) & 1);
+    }
+    dst.dword[i] = clamp<u128>(temp, 0, 9223372036854775807);
+  }
+}
+for (int i = 2; i < 4; i++) {
+  if (i < 3) {
+    u128 temp;
+    if (imm == 0) {
+      temp = (u128)b.qword[i - 1];
+    } else {
+      temp = ((u128)b.qword[i - 1] >> imm) +
+             (((u128)b.qword[i - 1] >> (imm - 1)) & 1);
+    }
+    dst.dword[i] = clamp<u128>(temp, 0, 9223372036854775807);
+  } else {
+    u128 temp;
+    if (imm == 0) {
+      temp = (u128)a.qword[i - 2];
+    } else {
+      temp = ((u128)a.qword[i - 2] >> imm) +
+             (((u128)a.qword[i - 2] >> (imm - 1)) & 1);
+    }
+    dst.dword[i] = clamp<u128>(temp, 0, 9223372036854775807);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m256i __lasx_xvssrlrni_du_q (__m256i a, __m256i b, imm0_127 imm)

+

Synopsis

+
__m256i __lasx_xvssrlrni_du_q (__m256i a, __m256i b, imm0_127 imm)
+#include <lasxintrin.h>
+Instruction: xvssrlrni.du.q xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Logical right shift (with rounding) the unsigned 128-bit elements in a and b by imm, clamp to fit in unsigned 64-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (i < 1) {
+    u128 temp;
+    if (imm == 0) {
+      temp = (u128)b.qword[i];
+    } else {
+      temp = ((u128)b.qword[i] >> imm) + (((u128)b.qword[i] >> (imm - 1)) & 1);
+    }
+    dst.dword[i] = clamp<u128>(temp, 0, 18446744073709551615);
+  } else {
+    u128 temp;
+    if (imm == 0) {
+      temp = (u128)a.qword[i - 1];
+    } else {
+      temp = ((u128)a.qword[i - 1] >> imm) +
+             (((u128)a.qword[i - 1] >> (imm - 1)) & 1);
+    }
+    dst.dword[i] = clamp<u128>(temp, 0, 18446744073709551615);
+  }
+}
+for (int i = 2; i < 4; i++) {
+  if (i < 3) {
+    u128 temp;
+    if (imm == 0) {
+      temp = (u128)b.qword[i - 1];
+    } else {
+      temp = ((u128)b.qword[i - 1] >> imm) +
+             (((u128)b.qword[i - 1] >> (imm - 1)) & 1);
+    }
+    dst.dword[i] = clamp<u128>(temp, 0, 18446744073709551615);
+  } else {
+    u128 temp;
+    if (imm == 0) {
+      temp = (u128)a.qword[i - 2];
+    } else {
+      temp = ((u128)a.qword[i - 2] >> imm) +
+             (((u128)a.qword[i - 2] >> (imm - 1)) & 1);
+    }
+    dst.dword[i] = clamp<u128>(temp, 0, 18446744073709551615);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m256i __lasx_xvrotr_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvrotr_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvrotr.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Rotate right the unsigned 8-bit elements in a by the amount given in the low 3 bits of the corresponding elements in b, and store the result to dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] =
+      (a.byte[i] >> (b.byte[i] & 0x7)) | (a.byte[i] << (8 - (b.byte[i] & 0x7)));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500022
+

__m256i __lasx_xvrotr_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvrotr_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvrotr.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Rotate right the unsigned 16-bit elements in a by the amount given in the low 4 bits of the corresponding elements in b, and store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (a.half[i] >> (b.half[i] & 0xf)) |
+                (a.half[i] << (16 - (b.half[i] & 0xf)));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500022
+

__m256i __lasx_xvrotr_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvrotr_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvrotr.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Rotate right the unsigned 32-bit elements in a by the amount given in the low 5 bits of the corresponding elements in b, and store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (a.word[i] >> (b.word[i] & 0x1f)) |
+                (a.word[i] << (32 - (b.word[i] & 0x1f)));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500022
+

__m256i __lasx_xvrotr_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvrotr_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvrotr.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Rotate right the unsigned 64-bit elements in a by the amount given in the low 6 bits of the corresponding elements in b, and store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (a.dword[i] >> (b.dword[i] & 0x3f)) |
+                 (a.dword[i] << (64 - (b.dword[i] & 0x3f)));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500022
+

__m256i __lasx_xvrotri_b (__m256i a, imm0_7 imm)

+

Synopsis

+
__m256i __lasx_xvrotri_b (__m256i a, imm0_7 imm)
+#include <lasxintrin.h>
+Instruction: xvrotri.b xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Rotate right the unsigned 8-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = (a.byte[i] >> imm) | (a.byte[i] << (8 - imm));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500022
+

__m256i __lasx_xvrotri_h (__m256i a, imm0_15 imm)

+

Synopsis

+
__m256i __lasx_xvrotri_h (__m256i a, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvrotri.h xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Rotate right the unsigned 16-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (a.half[i] >> imm) | (a.half[i] << (16 - imm));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500022
+

__m256i __lasx_xvrotri_w (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvrotri_w (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvrotri.w xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Rotate right the unsigned 32-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (a.word[i] >> imm) | (a.word[i] << (32 - imm));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500022
+

__m256i __lasx_xvrotri_d (__m256i a, imm0_63 imm)

+

Synopsis

+
__m256i __lasx_xvrotri_d (__m256i a, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvrotri.d xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Rotate right the unsigned 64-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (a.dword[i] >> imm) | (a.dword[i] << (64 - imm));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500022
+ +
+
+ +
+
+ +
+ +
+ +
+ + + + « Previous + + + Next » + + +
+ + + + + + + + diff --git a/lasx/shuffling/index.html b/lasx/shuffling/index.html new file mode 100644 index 00000000..8c527ddf --- /dev/null +++ b/lasx/shuffling/index.html @@ -0,0 +1,679 @@ + + + + + + + + Shuffling - Unofficial LoongArch Intrinsics Guide + + + + + + + + + + + + + + +
+ + +
+ +
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Shuffling

+

__m256i __lasx_xvshuf_b (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvshuf_b (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvshuf.b xr, xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Shuffle bytes from a and b with indices from c.

+

Caveat: the indices are placed in c, while in the other xvshuf intrinsics (xvshuf_h, xvshuf_w and xvshuf_d) they are placed in a.

+

+

Examples

+
__m256i __lasx_xvshuf_b(__m256i{0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabcdef1314156678, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678}, __m256i{0x1f1f00001a0a1b0b, 0x1111120213031404, 0x0102030405060708, 0x1112131405060708})
+= 0x99997878ee21dd43 0x7777661555144413 0x4321433412341278 0x1234121212341278
+
+

Operation

+
for (int i = 0; i < 32; i++) {
+  if ((c.byte[i] % 256) >= 64 && MACHINE_3C5000) {
+    // Caveat: observed in 3C5000
+    dst.byte[i] = 0;
+  } else if ((c.byte[i] % 32) < 16) {
+    dst.byte[i] = b.byte[(c.byte[i] % 32) + ((i >= 16) ? 16 : 0)];
+  } else {
+    dst.byte[i] = a.byte[(c.byte[i] % 32) + ((i >= 16) ? 0 : -16)];
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600012
3C500012
+

__m256i __lasx_xvshuf_h (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvshuf_h (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvshuf.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Shuffle 16-bit elements in b and c with indices from a, save the result to dst.

+

+

Examples

+
__m256i __lasx_xvshuf_h(__m256i{0x0001000200030004, 0x0005000a000b000c, 0x000f000e00010002, 0x0008000900020001}, __m256i{0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabcdef1314156678, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})
+= 0x1415ef13abcd4321 0x432133441122ff00 0xaabbaabb43211234 0x1234123412344321
+
+

Operation

+
for (int i = 0; i < 16; i++) {
+  if ((a.half[i] % 256) >= 64 && MACHINE_3C5000) {
+    // Caveat: observed in 3C5000
+    dst.half[i] = 0;
+  } else if ((a.half[i] % 16) < 8) {
+    dst.half[i] = c.half[(a.half[i] % 16) + ((i >= 8) ? 8 : 0)];
+  } else {
+    dst.half[i] = b.half[(a.half[i] % 16) + ((i >= 8) ? 0 : -8)];
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600012
3C500012
+

__m256i __lasx_xvshuf_w (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvshuf_w (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvshuf.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Shuffle 32-bit elements in b and c with indices from a, save the result to dst.

+

+

Examples

+
__m256i __lasx_xvshuf_w(__m256i{0x0000000200000004, 0x0000000700000005, 0x0000000100000003, 0x0000000400000000}, __m256i{0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabcdef1314156678, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})
+= 0x4321432155667788 0x99aabbcc11223344 0x1234123456785678 0x1234123443214321
+
+

Operation

+
for (int i = 0; i < 8; i++) {
+  if ((a.word[i] % 256) >= 64 && MACHINE_3C5000) {
+    // Caveat: observed in 3C5000
+    dst.word[i] = 0;
+  } else if ((a.word[i] % 8) < 4) {
+    dst.word[i] = c.word[(a.word[i] % 8) + ((i >= 4) ? 4 : 0)];
+  } else {
+    dst.word[i] = b.word[(a.word[i] % 8) + ((i >= 4) ? 0 : -4)];
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600012
3C500012
+

__m256i __lasx_xvshuf_d (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvshuf_d (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvshuf.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Shuffle 64-bit elements in b and c with indices from a, save the result to dst.

+

+

Examples

+
__m256i __lasx_xvshuf_d(__m256i{0x0000000000000000, 0x0000000000000003, 0x0000000000000002, 0x0000000000000001}, __m256i{0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabcdef1314156678, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})
+= 0xabcdef1314156678 0x99aabbccddeeff00 0xabcdef1212341234 0x5678567856785678
+
+

Operation

+
for (int i = 0; i < 4; i++) {
+  if ((a.dword[i] % 256) >= 64 && MACHINE_3C5000) {
+    // Caveat: observed in 3C5000
+    dst.dword[i] = 0;
+  } else if ((a.dword[i] % 4) < 2) {
+    dst.dword[i] = c.dword[(a.dword[i] % 4) + ((i >= 2) ? 2 : 0)];
+  } else {
+    dst.dword[i] = b.dword[(a.dword[i] % 4) + ((i >= 2) ? 0 : -2)];
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600012
3C500012
+

__m256i __lasx_xvshuf4i_b (__m256i a, imm0_255 imm)

+

Synopsis

+
__m256i __lasx_xvshuf4i_b (__m256i a, imm0_255 imm)
+#include <lasxintrin.h>
+Instruction: xvshuf4i.b xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Shuffle every four 8-bit elements in a with indices packed in imm, save the result to dst.

+

+

Examples

+
__m256i __lasx_xvshuf4i_b( __m256i{ 0xabcdef1314156678, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678}, 0x12)
+= 0x13ef13cd78667815 0x3412343421432121 0x3412343421432121 0x7856787878567878
+
+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = a.byte[(i & ~0x3) + ((imm >> (2 * (i & 0x3))) & 0x3)];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvshuf4i_h (__m256i a, imm0_255 imm)

+

Synopsis

+
__m256i __lasx_xvshuf4i_h (__m256i a, imm0_255 imm)
+#include <lasxintrin.h>
+Instruction: xvshuf4i.h xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Shuffle every four 16-bit elements in a with indices packed in imm, save the result to dst.

+

+

Examples

+
__m256i __lasx_xvshuf4i_h( __m256i{ 0xabcdef1314156678, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678}, 0x12)
+= 0x667814156678ef13 0x4321432143211234 0x4321432143211234 0x5678567856785678
+
+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = a.half[(i & ~0x3) + ((imm >> (2 * (i & 0x3))) & 0x3)];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvshuf4i_w (__m256i a, imm0_255 imm)

+

Synopsis

+
__m256i __lasx_xvshuf4i_w (__m256i a, imm0_255 imm)
+#include <lasxintrin.h>
+Instruction: xvshuf4i.w xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Shuffle every four 32-bit elements in a with indices packed in imm, save the result to dst.

+

+

Examples

+
__m256i __lasx_xvshuf4i_w( __m256i{ 0xabcdef1314156678, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678}, 0x12)
+= 0x1415667843214321 0x14156678abcdef13 0x4321432156785678 0x4321432112341234
+
+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = a.word[(i & ~0x3) + ((imm >> (2 * (i & 0x3))) & 0x3)];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvshuf4i_d (__m256i a, __m256i b, imm0_255 imm)

+

Synopsis

+
__m256i __lasx_xvshuf4i_d (__m256i a, __m256i b, imm0_255 imm)
+#include <lasxintrin.h>
+Instruction: xvshuf4i.d xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Within each 128-bit lane, shuffle the four 64-bit elements of a and b (two from each) with the 2-bit indices packed in the low four bits of imm, and save the result to dst.

+

+

Examples

+
__m256i __lasx_xvshuf4i_d( __m256i{ 0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{ 0xabcdef1314156678, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678}, 0x12)
+= 0xabcdef1314156678 0x1122334455667788 0x1234123443214321 0xabcdef1212341234
+
+

Operation

+
dst.dword[0] = (imm & 2) ? b.dword[(imm & 1)] : a.dword[(imm & 1)];
+dst.dword[1] =
+    (imm & 8) ? b.dword[((imm >> 2) & 1)] : a.dword[((imm >> 2) & 1)];
+dst.dword[2] = (imm & 2) ? b.dword[(imm & 1) + 2] : a.dword[(imm & 1) + 2];
+dst.dword[3] =
+    (imm & 8) ? b.dword[((imm >> 2) & 1) + 2] : a.dword[((imm >> 2) & 1) + 2];
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+ +
+
+ +
+
+ +
+ +
+ +
+ + + + « Previous + + + Next » + + +
+ + + + + + + + diff --git a/latency_throughput/index.html b/latency_throughput/index.html new file mode 100644 index 00000000..5b048e7d --- /dev/null +++ b/latency_throughput/index.html @@ -0,0 +1,200 @@ + + + + + + + + Latency and Throughput of Instructions - Unofficial LoongArch Intrinsics Guide + + + + + + + + + + + + + + +
+ + +
+ +
+
+
    +
  • + +
  • +
  • +
+
+
+
+
+ +

Latency and Throughput of Instructions

+

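Latency and throughput of each instruction. Although the throughput column is labelled CPI, its values are instructions per cycle (higher is better; for example, 0.25(1/4) means one instruction every 4 cycles):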

+ + + + + + + + + + + + +
+ + + + « Previous + + + Next » + + +
+ + + + + + + + diff --git a/lsx/bitwise_operations/index.html b/lsx/bitwise_operations/index.html new file mode 100644 index 00000000..6fa9df32 --- /dev/null +++ b/lsx/bitwise_operations/index.html @@ -0,0 +1,2289 @@ + + + + + + + + Bitwise Operations - Unofficial LoongArch Intrinsics Guide + + + + + + + + + + + + + + +
+ + +
+ +
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Bitwise Operations

+

__m128i __lsx_vbitsel_v (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vbitsel_v (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vbitsel.v vr, vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute bitwise selection: for each bit position, if the bit in c is one, copy the bit from b to dst; otherwise copy the bit from a.

+

Examples

+
__m128i __lsx_vbitsel_v(__m128i{0x1122334455667788, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321}, __m128i{0xffff0000aaaabbbb, 0x1111222233334444})
+= 0xabab3344ffeeefab 0x98ba9beccfedfb00
+
+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (c.dword[i] & b.dword[i]) | (~c.dword[i] & a.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+
Instruction3A60003C5000
LatencyThroughput (CPI)LatencyThroughput (CPI)
vabsd.b2222
vabsd.bu2222
vabsd.d2222
vabsd.du2222
vabsd.h2222
vabsd.hu2222
vabsd.w2222
vabsd.wu2222
vadd.b1412
vadd.d1412
vadd.h1412
vadd.q3232
vadd.w1412
vadda.b3232
vadda.d3232
vadda.h3232
vadda.w3232
vaddi.bu1412
vaddi.du1412
vaddi.hu1412
vaddi.wu1412
vaddwev.d.w2222
vaddwev.d.wu2222
vaddwev.d.wu.w2222
vaddwev.h.b2222
vaddwev.h.bu2222
vaddwev.h.bu.b2222
vaddwev.q.d3232
vaddwev.q.du3232
vaddwev.q.du.d3232
vaddwev.w.h2222
vaddwev.w.hu2222
vaddwev.w.hu.h2222
vaddwod.d.w2222
vaddwod.d.wu2222
vaddwod.d.wu.w2222
vaddwod.h.b2222
vaddwod.h.bu2222
vaddwod.h.bu.b2222
vaddwod.q.d3232
vaddwod.q.du3232
vaddwod.q.du.d3232
vaddwod.w.h2222
vaddwod.w.hu2222
vaddwod.w.hu.h2222
vand.v1412
vandi.b1412
vandn.v1412
vavg.b1412
vavg.bu1412
vavg.d2422
vavg.du2422
vavg.h1412
vavg.hu1412
vavg.w1412
vavg.wu1412
vavgr.b1412
vavgr.bu1412
vavgr.d2422
vavgr.du2422
vavgr.h1412
vavgr.hu1412
vavgr.w1412
vavgr.wu1412
vbitclr.b2222
vbitclr.d2222
vbitclr.h2222
vbitclr.w2222
vbitclri.b2222
vbitclri.d2222
vbitclri.h2222
vbitclri.w2222
vbitrev.b2222
vbitrev.d2222
vbitrev.h2222
vbitrev.w2222
vbitrevi.b2222
vbitrevi.d2222
vbitrevi.h2222
vbitrevi.w2222
vbitsel.v1212
vbitseli.b1212
vbitset.b2222
vbitset.d2222
vbitset.h2222
vbitset.w2222
vbitseti.b2222
vbitseti.d2222
vbitseti.h2222
vbitseti.w2222
vbsll.v1412
vbsrl.v1412
vclo.b2422
vclo.d2422
vclo.h2422
vclo.w2422
vclz.b2422
vclz.d2422
vclz.h2422
vclz.w2422
vdiv.b29, 320.06(1/15.5)29, 320.06(1/17)
vdiv.bu29, 330.06(1/16.5)29, 360.06(1/18)
vdiv.d80.25(1/4)8, 18.50.11(1/9)
vdiv.du80.25(1/4)8, 18.50.11(1/9)
vdiv.h170.12(1/8.5)17, 21.50.09(1/11)
vdiv.hu17, 220.11(1/9)17, 21.50.07(1/14)
vdiv.w110.18(1/5.5)11, 17.50.09(1/11.5)
vdiv.wu110.18(1/5.5)11, 17.50.07(1/15)
vext2xv.d.b3432
vext2xv.d.h3432
vext2xv.d.w3432
vext2xv.du.bu3432
vext2xv.du.hu3432
vext2xv.du.wu3432
vext2xv.h.b3432
vext2xv.hu.bu3432
vext2xv.w.b3432
vext2xv.w.h3432
vext2xv.wu.bu3432
vext2xv.wu.hu3432
vexth.d.w1412
vexth.du.wu1412
vexth.h.b1412
vexth.hu.bu1412
vexth.q.d1412
vexth.qu.du1412
vexth.w.h1412
vexth.wu.hu1412
vextl.q.d1412
vextl.qu.du1412
vextrins.b1412
vextrins.d1412
vextrins.h1412
vextrins.w1412
vfadd.d3452
vfadd.s3452
vfclass.d2422
vfclass.s2422
vfcmp.caf.d2422
vfcmp.caf.s2422
vfcmp.ceq.d2422
vfcmp.ceq.s2422
vfcmp.cle.d2422
vfcmp.cle.s2422
vfcmp.clt.d2422
vfcmp.clt.s2422
vfcmp.cne.d2422
vfcmp.cne.s2422
vfcmp.cor.d2422
vfcmp.cor.s2422
vfcmp.cueq.d2422
vfcmp.cueq.s2422
vfcmp.cule.d2422
vfcmp.cule.s2422
vfcmp.cult.d2422
vfcmp.cult.s2422
vfcmp.cun.d2422
vfcmp.cun.s2422
vfcmp.cune.d2422
vfcmp.cune.s2422
vfcmp.saf.d2422
vfcmp.saf.s2422
vfcmp.seq.d2422
vfcmp.seq.s2422
vfcmp.sle.d2422
vfcmp.sle.s2422
vfcmp.slt.d2422
vfcmp.slt.s2422
vfcmp.sne.d2422
vfcmp.sne.s2422
vfcmp.sor.d2422
vfcmp.sor.s2422
vfcmp.sueq.d2422
vfcmp.sueq.s2422
vfcmp.sule.d2422
vfcmp.sule.s2422
vfcmp.sult.d2422
vfcmp.sult.s2422
vfcmp.sun.d2422
vfcmp.sun.s2422
vfcmp.sune.d2422
vfcmp.sune.s2422
vfcvt.h.s3231
vfcvt.s.d3231
vfcvth.d.s3231
vfcvth.s.h3231
vfcvtl.d.s3231
vfcvtl.s.h3231
vfdiv.d8, 21.50.25(1/4)8, 16.50.08(1/12.5)
vfdiv.s110.18(1/5.5)11, 19.50.13(1/7.5)
vffint.d.l4442
vffint.d.lu4442
vffint.s.l5251
vffint.s.w4442
vffint.s.wu4442
vffinth.d.w5251
vffintl.d.w5251
vflogb.d4442
vflogb.s4442
vfmadd.d5252
vfmadd.s5252
vfmax.d2422
vfmax.s2422
vfmaxa.d2422
vfmaxa.s2422
vfmin.d2422
vfmin.s2422
vfmina.d2422
vfmina.s2422
vfmsub.d5252
vfmsub.s5252
vfmul.d5252
vfmul.s5252
vfnmadd.d5252
vfnmadd.s5252
vfnmsub.d5252
vfnmsub.s5252
vfrecip.d80.25(1/4)230.08(1/12)
vfrecip.s110.18(1/5.5)270.14(1/7)
vfrint.d4242
vfrint.s4242
vfrintrm.d4242
vfrintrm.s4242
vfrintrne.d4242
vfrintrne.s4242
vfrintrp.d4242
vfrintrp.s4242
vfrintrz.d4242
vfrintrz.s4242
vfrsqrt.d150.04(1/26.5)150.04(1/27.5)
vfrsqrt.s170.05(1/19)210.11(1/9)
vfrstp.b2222
vfrstp.h2222
vfrstpi.b2222
vfrstpi.h2222
vfsqrt.d360.06(1/17.5)360.05(1/18.5)
vfsqrt.s110.08(1/12)270.17(1/6)
vfsub.d3452
vfsub.s3452
vftint.l.d4442
vftint.lu.d4442
vftint.w.d5251
vftint.w.s4442
vftint.wu.s4442
vftinth.l.s5251
vftintl.l.s5251
vftintrm.l.d4442
vftintrm.w.d5251
vftintrm.w.s4442
vftintrmh.l.s5251
vftintrml.l.s5251
vftintrne.l.d4442
vftintrne.w.d5251
vftintrne.w.s4442
vftintrneh.l.s5251
vftintrnel.l.s5251
vftintrp.l.d4442
vftintrp.w.d5251
vftintrp.w.s4442
vftintrph.l.s5251
vftintrpl.l.s5251
vftintrz.l.d4442
vftintrz.lu.d4442
vftintrz.w.d5251
vftintrz.w.s4442
vftintrz.wu.s4442
vftintrzh.l.s5251
vftintrzl.l.s5251
vhaddw.d.w2222
vhaddw.du.wu2222
vhaddw.h.b2222
vhaddw.hu.bu2222
vhaddw.q.d3232
vhaddw.qu.du3232
vhaddw.w.h2222
vhaddw.wu.hu2222
vhsubw.d.w2222
vhsubw.du.wu2222
vhsubw.h.b2222
vhsubw.hu.bu2222
vhsubw.q.d3232
vhsubw.qu.du3232
vhsubw.w.h2222
vhsubw.wu.hu2222
vilvh.b1412
vilvh.d1412
vilvh.h1412
vilvh.w1412
vilvl.b1412
vilvl.d1412
vilvl.h1412
vilvl.w1412
vinsgr2vr.b1111
vinsgr2vr.d1111
vinsgr2vr.h1111
vinsgr2vr.w1111
vmadd.b4242
vmadd.d4242
vmadd.h4242
vmadd.w4242
vmaddwev.d.w4242
vmaddwev.d.wu4242
vmaddwev.d.wu.w4242
vmaddwev.h.b4242
vmaddwev.h.bu4242
vmaddwev.h.bu.b4242
vmaddwev.q.d71.1471.14
vmaddwev.q.du71.1471.14
vmaddwev.q.du.d71.1471.14
vmaddwev.w.h4242
vmaddwev.w.hu4242
vmaddwev.w.hu.h4242
vmaddwod.d.w4242
vmaddwod.d.wu4242
vmaddwod.d.wu.w4242
vmaddwod.h.b4242
vmaddwod.h.bu4242
vmaddwod.h.bu.b4242
vmaddwod.q.d71.1471.14
vmaddwod.q.du71.1471.14
vmaddwod.q.du.d71.1471.14
vmaddwod.w.h4242
vmaddwod.w.hu4242
vmaddwod.w.hu.h4242
vmax.b1412
vmax.bu1412
vmax.d2422
vmax.du2422
vmax.h1412
vmax.hu1412
vmax.w1412
vmax.wu1412
vmaxi.b1412
vmaxi.bu1412
vmaxi.d2422
vmaxi.du2422
vmaxi.h1412
vmaxi.hu1412
vmaxi.w1412
vmaxi.wu1412
vmin.b1412
vmin.bu1412
vmin.d2422
vmin.du2422
vmin.h1412
vmin.hu1412
vmin.w1412
vmin.wu1412
vmini.b1412
vmini.bu1412
vmini.d2422
vmini.du2422
vmini.h1412
vmini.hu1412
vmini.w1412
vmini.wu1412
vmod.b29, 350.06(1/15.5)29, 330.06(1/17)
vmod.bu29, 370.06(1/17.5)29, 330.05(1/19)
vmod.d8, 100.25(1/4)8, 100.11(1/9.5)
vmod.du8, 100.25(1/4)8, 100.11(1/9.5)
vmod.h17, 210.12(1/8.5)17, 210.09(1/11)
vmod.hu17, 210.11(1/9.5)17, 210.07(1/15)
vmod.w11, 130.18(1/5.5)11, 150.08(1/12)
vmod.wu11, 130.18(1/5.5)11, 150.06(1/16)
vmskgez.b1412
vmskltz.b1412
vmskltz.d1412
vmskltz.h1412
vmskltz.w1412
vmsknz.b1412
vmsub.b4242
vmsub.d4242
vmsub.h4242
vmsub.w4242
vmuh.b4242
vmuh.bu4242
vmuh.d4242
vmuh.du4242
vmuh.h4242
vmuh.hu4242
vmuh.w4242
vmuh.wu4242
vmul.b4242
vmul.d4242
vmul.h4242
vmul.w4242
vmulwev.d.w4242
vmulwev.d.wu4242
vmulwev.d.wu.w4242
vmulwev.h.b4242
vmulwev.h.bu4242
vmulwev.h.bu.b4242
vmulwev.q.d7272
vmulwev.q.du7272
vmulwev.q.du.d7272
vmulwev.w.h4242
vmulwev.w.hu4242
vmulwev.w.hu.h4242
vmulwod.d.w4242
vmulwod.d.wu4242
vmulwod.d.wu.w4242
vmulwod.h.b4242
vmulwod.h.bu4242
vmulwod.h.bu.b4242
vmulwod.q.d7272
vmulwod.q.du7272
vmulwod.q.du.d7272
vmulwod.w.h4242
vmulwod.w.hu4242
vmulwod.w.hu.h4242
vneg.b1412
vneg.d1412
vneg.h1412
vneg.w1412
vnor.v1412
vnori.b1412
vor.v1412
vori.b1412
vorn.v1412
vpackev.b1412
vpackev.d1412
vpackev.h1412
vpackev.w1412
vpackod.b1412
vpackod.d1412
vpackod.h1412
vpackod.w1412
vpcnt.b2222
vpcnt.d2222
vpcnt.h2222
vpcnt.w2222
vpermi.w1412
vpickev.b1412
vpickev.d1412
vpickev.h1412
vpickev.w1412
vpickod.b1412
vpickod.d1412
vpickod.h1412
vpickod.w1412
vpickve2gr.b1111
vpickve2gr.bu1111
vpickve2gr.d1111
vpickve2gr.du1111
vpickve2gr.h1111
vpickve2gr.hu1111
vpickve2gr.w1111
vpickve2gr.wu1111
vreplgr2vr.bN/A1N/A1
vreplgr2vr.dN/A1N/A1
vreplgr2vr.hN/A1N/A1
vreplgr2vr.wN/A1N/A1
vrepli.bN/A6N/A2
vrepli.dN/A4N/A2
vrepli.hN/A4N/A2
vrepli.wN/A4N/A2
vreplve.b1111
vreplve.d1111
vreplve.h1111
vreplve.w1111
vreplvei.b1412
vreplvei.d1412
vreplvei.h1412
vreplvei.w1412
vrotr.b1422
vrotr.d1422
vrotr.h1422
vrotr.w1422
vrotri.b1422
vrotri.d1422
vrotri.h1422
vrotri.w1422
vsadd.b1412
vsadd.bu1412
vsadd.d1412
vsadd.du1412
vsadd.h1412
vsadd.hu1412
vsadd.w1412
vsadd.wu1412
vsat.b2222
vsat.bu2222
vsat.d2222
vsat.du2222
vsat.h2222
vsat.hu2222
vsat.w2222
vsat.wu2222
vseq.b1412
vseq.d1412
vseq.h1412
vseq.w1412
vseqi.b1412
vseqi.d1412
vseqi.h1412
vseqi.w1412
vsetallnez.bN/A2N/A2
vsetallnez.dN/A2N/A2
vsetallnez.hN/A2N/A2
vsetallnez.wN/A2N/A2
vsetanyeqz.bN/A2N/A2
vsetanyeqz.dN/A2N/A2
vsetanyeqz.hN/A2N/A2
vsetanyeqz.wN/A2N/A2
vseteqz.vN/A2N/A2
vsetnez.vN/A2N/A2
vshuf4i.b1412
vshuf4i.d1412
vshuf4i.h1412
vshuf4i.w1412
vshuf.b1212
vshuf.d1212
vshuf.h1212
vshuf.w1212
vsigncov.b1212
vsigncov.d1212
vsigncov.h1212
vsigncov.w1212
vsle.b1412
vsle.bu1412
vsle.d2422
vsle.du2422
vsle.h1412
vsle.hu1412
vsle.w1412
vsle.wu1412
vslei.b1412
vslei.bu1412
vslei.d2422
vslei.du2422
vslei.h1412
vslei.hu1412
vslei.w1412
vslei.wu1412
vsll.b1412
vsll.d1412
vsll.h1412
vsll.w1412
vslli.b1412
vslli.d1412
vslli.h1412
vslli.w1412
vsllwil.d.w2221
vsllwil.du.wu2221
vsllwil.h.b2221
vsllwil.hu.bu2221
vsllwil.w.h2221
vsllwil.wu.hu2221
vslt.b1412
vslt.bu1412
vslt.d2422
vslt.du2422
vslt.h1412
vslt.hu1412
vslt.w1412
vslt.wu1412
vslti.b1412
vslti.bu1412
vslti.d2422
vslti.du2422
vslti.h1412
vslti.hu1412
vslti.w1412
vslti.wu1412
vsra.b1412
vsra.d1412
vsra.h1412
vsra.w1412
vsrai.b1412
vsrai.d1412
vsrai.h1412
vsrai.w1412
vsran.b.h2221
vsran.h.w2221
vsran.w.d2221
vsrani.b.h4241
vsrani.d.q3232
vsrani.h.w4241
vsrani.w.d4241
vsrar.b3232
vsrar.d3232
vsrar.h3232
vsrar.w3232
vsrari.b3232
vsrari.d3232
vsrari.h3232
vsrari.w3232
vsrarn.b.h4241
vsrarn.h.w4241
vsrarn.w.d4241
vsrarni.b.h4241
vsrarni.d.q3232
vsrarni.h.w4241
vsrarni.w.d4241
vsrl.b1412
vsrl.d1412
vsrl.h1412
vsrl.w1412
vsrli.b1412
vsrli.d1412
vsrli.h1412
vsrli.w1412
vsrln.b.h2221
vsrln.h.w2221
vsrln.w.d2221
vsrlni.b.h4241
vsrlni.d.q3232
vsrlni.h.w4241
vsrlni.w.d4241
vsrlr.b3232
vsrlr.d3232
vsrlr.h3232
vsrlr.w3232
vsrlri.b3232
vsrlri.d3232
vsrlri.h3232
vsrlri.w3232
vsrlrn.b.h4241
vsrlrn.h.w4241
vsrlrn.w.d4241
vsrlrni.b.h4241
vsrlrni.d.q3232
vsrlrni.h.w4241
vsrlrni.w.d4241
vssran.b.h4241
vssran.bu.h4241
vssran.h.w4241
vssran.hu.w4241
vssran.w.d4241
vssran.wu.d4241
vssrani.b.h4241
vssrani.bu.h4241
vssrani.d.q3232
vssrani.du.q3232
vssrani.h.w4241
vssrani.hu.w4241
vssrani.w.d4241
vssrani.wu.d4241
vssrarn.b.h4241
vssrarn.bu.h4241
vssrarn.h.w4241
vssrarn.hu.w4241
vssrarn.w.d4241
vssrarn.wu.d4241
vssrarni.b.h4241
vssrarni.bu.h4241
vssrarni.d.q3232
vssrarni.du.q3232
vssrarni.h.w4241
vssrarni.hu.w4241
vssrarni.w.d4241
vssrarni.wu.d4241
vssrln.b.h4241
vssrln.bu.h4241
vssrln.h.w4241
vssrln.hu.w4241
vssrln.w.d4241
vssrln.wu.d4241
vssrlni.b.h4241
vssrlni.bu.h4241
vssrlni.d.q3232
vssrlni.du.q3232
vssrlni.h.w4241
vssrlni.hu.w4241
vssrlni.w.d4241
vssrlni.wu.d4241
vssrlrn.b.h4241
vssrlrn.bu.h4241
vssrlrn.h.w4241
vssrlrn.hu.w4241
vssrlrn.w.d4241
vssrlrn.wu.d4241
vssrlrni.b.h4241
vssrlrni.bu.h4241
vssrlrni.d.q3232
vssrlrni.du.q3232
vssrlrni.h.w4241
vssrlrni.hu.w4241
vssrlrni.w.d4241
vssrlrni.wu.d4241
vssub.b1412
vssub.bu1412
vssub.d1412
vssub.du1412
vssub.h1412
vssub.hu1412
vssub.w1412
vssub.wu1412
vsub.b1412
vsub.d1412
vsub.h1412
vsub.q3232
vsub.w1412
vsubi.bu1412
vsubi.du1412
vsubi.hu1412
vsubi.wu1412
vsubwev.d.w2222
vsubwev.d.wu2222
vsubwev.h.b2222
vsubwev.h.bu2222
vsubwev.q.d3232
vsubwev.q.du3232
vsubwev.w.h2222
vsubwev.w.hu2222
vsubwod.d.w2222
vsubwod.d.wu2222
vsubwod.h.b2222
vsubwod.h.bu2222
vsubwod.q.d3232
vsubwod.q.du3232
vsubwod.w.h2222
vsubwod.w.hu2222
vxor.v1412
vxori.b1412
xvabsd.b2222
xvabsd.bu2222
xvabsd.d2222
xvabsd.du2222
xvabsd.h2222
xvabsd.hu2222
xvabsd.w2222
xvabsd.wu2222
xvadd.b1412
xvadd.d1412
xvadd.h1412
xvadd.q3232
xvadd.w1412
xvadda.b3232
xvadda.d3232
xvadda.h3232
xvadda.w3232
xvaddi.bu1412
xvaddi.du1412
xvaddi.hu1412
xvaddi.wu1412
xvaddwev.d.w2222
xvaddwev.d.wu2222
xvaddwev.d.wu.w2222
xvaddwev.h.b2222
xvaddwev.h.bu2222
xvaddwev.h.bu.b2222
xvaddwev.q.d3232
xvaddwev.q.du3232
xvaddwev.q.du.d3232
xvaddwev.w.h2222
xvaddwev.w.hu2222
xvaddwev.w.hu.h2222
xvaddwod.d.w2222
xvaddwod.d.wu2222
xvaddwod.d.wu.w2222
xvaddwod.h.b2222
xvaddwod.h.bu2222
xvaddwod.h.bu.b2222
xvaddwod.q.d3232
xvaddwod.q.du3232
xvaddwod.q.du.d3232
xvaddwod.w.h2222
xvaddwod.w.hu2222
xvaddwod.w.hu.h2222
xvand.v1412
xvandi.b1412
xvandn.v1412
xvavg.b1412
xvavg.bu1412
xvavg.d2422
xvavg.du2422
xvavg.h1412
xvavg.hu1412
xvavg.w1412
xvavg.wu1412
xvavgr.b1412
xvavgr.bu1412
xvavgr.d2422
xvavgr.du2422
xvavgr.h1412
xvavgr.hu1412
xvavgr.w1412
xvavgr.wu1412
xvbitclr.b2222
xvbitclr.d2222
xvbitclr.h2222
xvbitclr.w2222
xvbitclri.b2222
xvbitclri.d2222
xvbitclri.h2222
xvbitclri.w2222
xvbitrev.b2222
xvbitrev.d2222
xvbitrev.h2222
xvbitrev.w2222
xvbitrevi.b2222
xvbitrevi.d2222
xvbitrevi.h2222
xvbitrevi.w2222
xvbitsel.v1212
xvbitseli.b1212
xvbitset.b2222
xvbitset.d2222
xvbitset.h2222
xvbitset.w2222
xvbitseti.b2222
xvbitseti.d2222
xvbitseti.h2222
xvbitseti.w2222
xvbsll.v1412
xvbsrl.v1412
xvclo.b2422
xvclo.d2422
xvclo.h2422
xvclo.w2422
xvclz.b2422
xvclz.d2422
xvclz.h2422
xvclz.w2422
xvdiv.b29, 320.06(1/15.5)32, 360.05(1/20.5)
xvdiv.bu29, 330.06(1/16.5)29, 360.05(1/20.5)
xvdiv.d80.25(1/4)8, 18.50.11(1/9)
xvdiv.du80.25(1/4)8, 18.50.11(1/9)
xvdiv.h170.12(1/8.5)21.5, 220.08(1/13)
xvdiv.hu17, 220.11(1/9)17, 21.50.07(1/15)
xvdiv.w110.18(1/5.5)11, 17.50.09(1/11.5)
xvdiv.wu110.18(1/5.5)11, 17.50.07(1/15)
xvexth.d.w1412
xvexth.du.wu1412
xvexth.h.b1412
xvexth.hu.bu1412
xvexth.q.d1412
xvexth.qu.du1412
xvexth.w.h1412
xvexth.wu.hu1412
xvextl.q.d1412
xvextl.qu.du1412
xvextrins.b1412
xvextrins.d1412
xvextrins.h1412
xvextrins.w1412
xvfadd.d3452
xvfadd.s3452
xvfclass.d2422
xvfclass.s2422
xvfcmp.caf.d2422
xvfcmp.caf.s2422
xvfcmp.ceq.d2422
xvfcmp.ceq.s2422
xvfcmp.cle.d2422
xvfcmp.cle.s2422
xvfcmp.clt.d2422
xvfcmp.clt.s2422
xvfcmp.cne.d2422
xvfcmp.cne.s2422
xvfcmp.cor.d2422
xvfcmp.cor.s2422
xvfcmp.cueq.d2422
xvfcmp.cueq.s2422
xvfcmp.cule.d2422
xvfcmp.cule.s2422
xvfcmp.cult.d2422
xvfcmp.cult.s2422
xvfcmp.cun.d2422
xvfcmp.cun.s2422
xvfcmp.cune.d2422
xvfcmp.cune.s2422
xvfcmp.saf.d2422
xvfcmp.saf.s2422
xvfcmp.seq.d2422
xvfcmp.seq.s2422
xvfcmp.sle.d2422
xvfcmp.sle.s2422
xvfcmp.slt.d2422
xvfcmp.slt.s2422
xvfcmp.sne.d2422
xvfcmp.sne.s2422
xvfcmp.sor.d2422
xvfcmp.sor.s2422
xvfcmp.sueq.d2422
xvfcmp.sueq.s2422
xvfcmp.sule.d2422
xvfcmp.sule.s2422
xvfcmp.sult.d2422
xvfcmp.sult.s2422
xvfcmp.sun.d2422
xvfcmp.sun.s2422
xvfcmp.sune.d2422
xvfcmp.sune.s2422
xvfcvt.h.s3231
xvfcvt.s.d3231
xvfcvth.d.s3231
xvfcvth.s.h3231
xvfcvtl.d.s3231
xvfcvtl.s.h3231
xvfdiv.d8, 21.50.25(1/4)8, 170.08(1/12.5)
xvfdiv.s110.18(1/5.5)11, 19.50.1(1/10.5)
xvffint.d.l4442
xvffint.d.lu4442
xvffint.s.l5251
xvffint.s.w4442
xvffint.s.wu4442
xvffinth.d.w5251
xvffintl.d.w5251
xvflogb.d4442
xvflogb.s4442
xvfmadd.d5252
xvfmadd.s5252
xvfmax.d2422
xvfmax.s2422
xvfmaxa.d2422
xvfmaxa.s2422
xvfmin.d2422
xvfmin.s2422
xvfmina.d2422
xvfmina.s2422
xvfmsub.d5252
xvfmsub.s5252
xvfmul.d5252
xvfmul.s5252
xvfnmadd.d5252
xvfnmadd.s5252
xvfnmsub.d5252
xvfnmsub.s5252
xvfrecip.d230.25(1/4)230.08(1/12)
xvfrecip.s270.18(1/5.5)270.14(1/7)
xvfrint.d4242
xvfrint.s4242
xvfrintrm.d4242
xvfrintrm.s4242
xvfrintrne.d4242
xvfrintrne.s4242
xvfrintrp.d4242
xvfrintrp.s4242
xvfrintrz.d4242
xvfrintrz.s4242
xvfrsqrt.d150.04(1/26.5)150.04(1/27.5)
xvfrsqrt.s250.05(1/19)250.03(1/32)
xvfrstp.b2222
xvfrstp.h2222
xvfrstpi.b2222
xvfrstpi.h2222
xvfsqrt.d360.06(1/17.5)360.05(1/18.5)
xvfsqrt.s150.08(1/12)150.07(1/13.5)
xvfsub.d3452
xvfsub.s3452
xvftint.l.d4442
xvftint.lu.d4442
xvftint.w.d5251
xvftint.w.s4442
xvftint.wu.s4442
xvftinth.l.s5251
xvftintl.l.s5251
xvftintrm.l.d4442
xvftintrm.w.d5251
xvftintrm.w.s4442
xvftintrmh.l.s5251
xvftintrml.l.s5251
xvftintrne.l.d4442
xvftintrne.w.d5251
xvftintrne.w.s4442
xvftintrneh.l.s5251
xvftintrnel.l.s5251
xvftintrp.l.d4442
xvftintrp.w.d5251
xvftintrp.w.s4442
xvftintrph.l.s5251
xvftintrpl.l.s5251
xvftintrz.l.d4442
xvftintrz.lu.d4442
xvftintrz.w.d5251
xvftintrz.w.s4442
xvftintrz.wu.s4442
xvftintrzh.l.s5251
xvftintrzl.l.s5251
xvhaddw.d.w2222
xvhaddw.du.wu2222
xvhaddw.h.b2222
xvhaddw.hu.bu2222
xvhaddw.q.d3232
xvhaddw.qu.du3232
xvhaddw.w.h2222
xvhaddw.wu.hu2222
xvhseli.d1111
xvhsubw.d.w2222
xvhsubw.du.wu2222
xvhsubw.h.b2222
xvhsubw.hu.bu2222
xvhsubw.q.d3232
xvhsubw.qu.du3232
xvhsubw.w.h2222
xvhsubw.wu.hu2222
xvilvh.b1412
xvilvh.d1412
xvilvh.h1412
xvilvh.w1412
xvilvl.b1412
xvilvl.d1412
xvilvl.h1412
xvilvl.w1412
xvinsgr2vr.d1111
xvinsgr2vr.w1111
xvinsve0.d1412
xvinsve0.w1412
xvmadd.b4242
xvmadd.d4242
xvmadd.h4242
xvmadd.w4242
xvmaddwev.d.w4242
xvmaddwev.d.wu4242
xvmaddwev.d.wu.w4242
xvmaddwev.h.b4242
xvmaddwev.h.bu4242
xvmaddwev.h.bu.b4242
xvmaddwev.q.d71.1471.14
xvmaddwev.q.du71.1471.14
xvmaddwev.q.du.d71.1471.14
xvmaddwev.w.h4242
xvmaddwev.w.hu4242
xvmaddwev.w.hu.h4242
xvmaddwod.d.w4242
xvmaddwod.d.wu4242
xvmaddwod.d.wu.w4242
xvmaddwod.h.b4242
xvmaddwod.h.bu4242
xvmaddwod.h.bu.b4242
xvmaddwod.q.d71.1471.14
xvmaddwod.q.du71.1471.14
xvmaddwod.q.du.d71.1471.14
xvmaddwod.w.h4242
xvmaddwod.w.hu4242
xvmaddwod.w.hu.h4242
xvmax.b1412
xvmax.bu1412
xvmax.d2422
xvmax.du2422
xvmax.h1412
xvmax.hu1412
xvmax.w1412
xvmax.wu1412
xvmaxi.b1412
xvmaxi.bu1412
xvmaxi.d2422
xvmaxi.du2422
xvmaxi.h1412
xvmaxi.hu1412
xvmaxi.w1412
xvmaxi.wu1412
xvmin.b1412
xvmin.bu1412
xvmin.d2422
xvmin.du2422
xvmin.h1412
xvmin.hu1412
xvmin.w1412
xvmin.wu1412
xvmini.b1412
xvmini.bu1412
xvmini.d2422
xvmini.du2422
xvmini.h1412
xvmini.hu1412
xvmini.w1412
xvmini.wu1412
xvmod.b29, 410.06(1/15.5)29, 330.05(1/21.5)
xvmod.bu29, 370.06(1/17.5)29, 370.05(1/22)
xvmod.d8, 100.25(1/4)8, 100.11(1/9.5)
xvmod.du8, 100.25(1/4)8, 100.11(1/9.5)
xvmod.h17, 210.12(1/8.5)17, 210.07(1/13.5)
xvmod.hu17, 250.11(1/9.5)17, 230.06(1/16)
xvmod.w11, 130.18(1/5.5)11, 150.07(1/13.5)
xvmod.wu11, 130.18(1/5.5)11, 150.06(1/16)
xvmskgez.b1412
xvmskltz.b1412
xvmskltz.d1412
xvmskltz.h1412
xvmskltz.w1412
xvmsknz.b1412
xvmsub.b4242
xvmsub.d4242
xvmsub.h4242
xvmsub.w4242
xvmuh.b4242
xvmuh.bu4242
xvmuh.d4242
xvmuh.du4242
xvmuh.h4242
xvmuh.hu4242
xvmuh.w4242
xvmuh.wu4242
xvmul.b4242
xvmul.d4242
xvmul.h4242
xvmul.w4242
xvmulwev.d.w4242
xvmulwev.d.wu4242
xvmulwev.d.wu.w4242
xvmulwev.h.b4242
xvmulwev.h.bu4242
xvmulwev.h.bu.b4242
xvmulwev.q.d7272
xvmulwev.q.du7272
xvmulwev.q.du.d7272
xvmulwev.w.h4242
xvmulwev.w.hu4242
xvmulwev.w.hu.h4242
xvmulwod.d.w4242
xvmulwod.d.wu4242
xvmulwod.d.wu.w4242
xvmulwod.h.b4242
xvmulwod.h.bu4242
xvmulwod.h.bu.b4242
xvmulwod.q.d7272
xvmulwod.q.du7272
xvmulwod.q.du.d7272
xvmulwod.w.h4242
xvmulwod.w.hu4242
xvmulwod.w.hu.h4242
xvneg.b1412
xvneg.d1412
xvneg.h1412
xvneg.w1412
xvnor.v1412
xvnori.b1412
xvor.v1412
xvori.b1412
xvorn.v1412
xvpackev.b1412
xvpackev.d1412
xvpackev.h1412
xvpackev.w1412
xvpackod.b1412
xvpackod.d1412
xvpackod.h1412
xvpackod.w1412
xvpcnt.b2222
xvpcnt.d2222
xvpcnt.h2222
xvpcnt.w2222
xvperm.w3432
xvpermi.d3432
xvpermi.q32.6732
xvpermi.w1412
xvpickev.b1412
xvpickev.d1412
xvpickev.h1412
xvpickev.w1412
xvpickod.b1412
xvpickod.d1412
xvpickod.h1412
xvpickod.w1412
xvpickve2gr.d1111
xvpickve2gr.du1111
xvpickve2gr.w1111
xvpickve2gr.wu1111
xvpickve.d3432
xvpickve.w3432
xvrepl128vei.b1412
xvrepl128vei.d1412
xvrepl128vei.h1412
xvrepl128vei.w1412
xvreplgr2vr.bN/A1N/A1
xvreplgr2vr.dN/A1N/A1
xvreplgr2vr.hN/A1N/A1
xvreplgr2vr.wN/A1N/A1
xvrepli.bN/A6N/A2
xvrepli.dN/A4N/A2
xvrepli.hN/A4N/A2
xvrepli.wN/A4N/A2
xvreplve0.b3432
xvreplve0.d3432
xvreplve0.h3432
xvreplve0.q3432
xvreplve0.w3432
xvreplve.b1111
xvreplve.d1111
xvreplve.h1111
xvreplve.w1111
xvrotr.b1422
xvrotr.d1422
xvrotr.h1422
xvrotr.w1422
xvrotri.b1422
xvrotri.d1422
xvrotri.h1422
xvrotri.w1422
xvsadd.b1412
xvsadd.bu1412
xvsadd.d1412
xvsadd.du1412
xvsadd.h1412
xvsadd.hu1412
xvsadd.w1412
xvsadd.wu1412
xvsat.b2222
xvsat.bu2222
xvsat.d2222
xvsat.du2222
xvsat.h2222
xvsat.hu2222
xvsat.w2222
xvsat.wu2222
xvseq.b1412
xvseq.d1412
xvseq.h1412
xvseq.w1412
xvseqi.b1412
xvseqi.d1412
xvseqi.h1412
xvseqi.w1412
xvsetallnez.bN/A2N/A2
xvsetallnez.dN/A2N/A2
xvsetallnez.hN/A2N/A2
xvsetallnez.wN/A2N/A2
xvsetanyeqz.bN/A2N/A2
xvsetanyeqz.dN/A2N/A2
xvsetanyeqz.hN/A2N/A2
xvsetanyeqz.wN/A2N/A2
xvseteqz.vN/A2N/A2
xvsetnez.vN/A2N/A2
xvshuf4i.b1412
xvshuf4i.d1412
xvshuf4i.h1412
xvshuf4i.w1412
xvshuf.b1212
xvshuf.d1212
xvshuf.h1212
xvshuf.w1212
xvsigncov.b1212
xvsigncov.d1212
xvsigncov.h1212
xvsigncov.w1212
xvsle.b1412
xvsle.bu1412
xvsle.d2422
xvsle.du2422
xvsle.h1412
xvsle.hu1412
xvsle.w1412
xvsle.wu1412
xvslei.b1412
xvslei.bu1412
xvslei.d2422
xvslei.du2422
xvslei.h1412
xvslei.hu1412
xvslei.w1412
xvslei.wu1412
xvsll.b1412
xvsll.d1412
xvsll.h1412
xvsll.w1412
xvslli.b1412
xvslli.d1412
xvslli.h1412
xvslli.w1412
xvsllwil.d.w2221
xvsllwil.du.wu2221
xvsllwil.h.b2221
xvsllwil.hu.bu2221
xvsllwil.w.h2221
xvsllwil.wu.hu2221
xvslt.b1412
xvslt.bu1412
xvslt.d2422
xvslt.du2422
xvslt.h1412
xvslt.hu1412
xvslt.w1412
xvslt.wu1412
xvslti.b1412
xvslti.bu1412
xvslti.d2422
xvslti.du2422
xvslti.h1412
xvslti.hu1412
xvslti.w1412
xvslti.wu1412
xvsra.b1412
xvsra.d1412
xvsra.h1412
xvsra.w1412
xvsrai.b1412
xvsrai.d1412
xvsrai.h1412
xvsrai.w1412
xvsran.b.h2221
xvsran.h.w2221
xvsran.w.d2221
xvsrani.b.h4241
xvsrani.d.q3232
xvsrani.h.w4241
xvsrani.w.d4241
xvsrar.b3232
xvsrar.d3232
xvsrar.h3232
xvsrar.w3232
xvsrari.b3232
xvsrari.d3232
xvsrari.h3232
xvsrari.w3232
xvsrarn.b.h4241
xvsrarn.h.w4241
xvsrarn.w.d4241
xvsrarni.b.h4241
xvsrarni.d.q3232
xvsrarni.h.w4241
xvsrarni.w.d4241
xvsrl.b1412
xvsrl.d1412
xvsrl.h1412
xvsrl.w1412
xvsrli.b1412
xvsrli.d1412
xvsrli.h1412
xvsrli.w1412
xvsrln.b.h2221
xvsrln.h.w2221
xvsrln.w.d2221
xvsrlni.b.h4241
xvsrlni.d.q3232
xvsrlni.h.w4241
xvsrlni.w.d4241
xvsrlr.b3232
xvsrlr.d3232
xvsrlr.h3232
xvsrlr.w3232
xvsrlri.b3232
xvsrlri.d3232
xvsrlri.h3232
xvsrlri.w3232
xvsrlrn.b.h4241
xvsrlrn.h.w4241
xvsrlrn.w.d4241
xvsrlrni.b.h4241
xvsrlrni.d.q3232
xvsrlrni.h.w4241
xvsrlrni.w.d4241
xvssran.b.h4241
xvssran.bu.h4241
xvssran.h.w4241
xvssran.hu.w4241
xvssran.w.d4241
xvssran.wu.d4241
xvssrani.b.h4241
xvssrani.bu.h4241
xvssrani.d.q3232
xvssrani.du.q3232
xvssrani.h.w4241
xvssrani.hu.w4241
xvssrani.w.d4241
xvssrani.wu.d4241
xvssrarn.b.h4241
xvssrarn.bu.h4241
xvssrarn.h.w4241
xvssrarn.hu.w4241
xvssrarn.w.d4241
xvssrarn.wu.d4241
xvssrarni.b.h4241
xvssrarni.bu.h4241
xvssrarni.d.q3232
xvssrarni.du.q3232
xvssrarni.h.w4241
xvssrarni.hu.w4241
xvssrarni.w.d4241
xvssrarni.wu.d4241
xvssrln.b.h4241
xvssrln.bu.h4241
xvssrln.h.w4241
xvssrln.hu.w4241
xvssrln.w.d4241
xvssrln.wu.d4241
xvssrlni.b.h4241
xvssrlni.bu.h4241
xvssrlni.d.q3232
xvssrlni.du.q3232
xvssrlni.h.w4241
xvssrlni.hu.w4241
xvssrlni.w.d4241
xvssrlni.wu.d4241
xvssrlrn.b.h4241
xvssrlrn.bu.h4241
xvssrlrn.h.w4241
xvssrlrn.hu.w4241
xvssrlrn.w.d4241
xvssrlrn.wu.d4241
xvssrlrni.b.h4241
xvssrlrni.bu.h4241
xvssrlrni.d.q3232
xvssrlrni.du.q3232
xvssrlrni.h.w4241
xvssrlrni.hu.w4241
xvssrlrni.w.d4241
xvssrlrni.wu.d4241
xvssub.b1412
xvssub.bu1412
xvssub.d1412
xvssub.du1412
xvssub.h1412
xvssub.hu1412
xvssub.w1412
xvssub.wu1412
xvsub.b1412
xvsub.d1412
xvsub.h1412
xvsub.q3232
xvsub.w1412
xvsubi.bu1412
xvsubi.du1412
xvsubi.hu1412
xvsubi.wu1412
xvsubwev.d.w2222
xvsubwev.d.wu2222
xvsubwev.h.b2222
xvsubwev.h.bu2222
xvsubwev.q.d3232
xvsubwev.q.du3232
xvsubwev.w.h2222
xvsubwev.w.hu2222
xvsubwod.d.w2222
xvsubwod.d.wu2222
xvsubwod.h.b2222
xvsubwod.h.bu2222
xvsubwod.q.d3232
xvsubwod.q.du3232
xvsubwod.w.h2222
xvsubwod.w.hu2222
xvxor.v1412
xvxori.b1412
+ + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (IPC)
3A600012
3C500012
+

__m128i __lsx_vbitseli_b (__m128i a, __m128i b, imm0_255 imm)

+

Synopsis

+
__m128i __lsx_vbitseli_b (__m128i a, __m128i b, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vbitseli.b vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compute bitwise selection: for each bit position, if the bit in a is one, copy the corresponding bit of imm to dst; otherwise copy it from b.

+

Examples

+
__m128i __lsx_vbitseli_b(__m128i{0x1122334455667788, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321}, 0x12)
+= 0xba8b9aabba8b9a23 0x1216123012031221
+
+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = (~a.byte[i] & b.byte[i]) | (a.byte[i] & (u8)imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (IPC)
3A600012
3C500012
+

__m128i __lsx_vbitclr_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vbitclr_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vbitclr.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

For each 8-bit element in a, clear the bit whose index is given by the corresponding element in b (taken modulo 8), and save the result in dst.

+

Examples

+
__m128i __lsx_vbitclr_b(__m128i{0xffffffffffffffff, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321})
+= 0xf7f7f7f7f7f7f7f7 0x99aabbccd5ecf700
+
+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = a.byte[i] & (~((u8)1 << (b.byte[i] % 8)));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (IPC)
3A600022
3C500022
+

__m128i __lsx_vbitclr_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vbitclr_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vbitclr.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

For each 16-bit element in a, clear the bit whose index is given by the corresponding element in b (taken modulo 16), and save the result in dst.

+

Examples

+
__m128i __lsx_vbitclr_h(__m128i{0xffffffffffffffff, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321})
+= 0xf7fff7fff7fff7ff 0x99aabbccddecff00
+
+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = a.half[i] & (~((u16)1 << (b.half[i] % 16)));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (IPC)
3A600022
3C500022
+

__m128i __lsx_vbitclr_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vbitclr_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vbitclr.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

For each 32-bit element in a, clear the bit whose index is given by the corresponding element in b (taken modulo 32), and save the result in dst.

+

Examples

+
__m128i __lsx_vbitclr_w(__m128i{0xffffffffffffffff, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321})
+= 0xfffff7fffffff7ff 0x99aabbccddeeff00
+
+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = a.word[i] & (~((u32)1 << (b.word[i] % 32)));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (IPC)
3A600022
3C500022
+

__m128i __lsx_vbitclr_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vbitclr_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vbitclr.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

For each 64-bit element in a, clear the bit whose index is given by the corresponding element in b (taken modulo 64), and save the result in dst.

+

Examples

+
__m128i __lsx_vbitclr_d(__m128i{0xffffffffffffffff, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321})
+= 0xfffff7ffffffffff 0x99aabbccddeeff00
+
+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = a.dword[i] & (~((u64)1 << (b.dword[i] % 64)));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (IPC)
3A600022
3C500022
+

__m128i __lsx_vbitclri_b (__m128i a, imm0_7 imm)

+

Synopsis

+
__m128i __lsx_vbitclri_b (__m128i a, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vbitclri.b vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Clear the bit at index imm in each 8-bit element of a, and save the result in dst.

+

Examples

+
__m128i __lsx_vbitclri_b(__m128i{0xffffffffffffffff, 0x99aabbccddeeff00}, 1)
+= 0xfdfdfdfdfdfdfdfd 0x99a8b9ccddecfd00
+
+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = a.byte[i] & (~((u8)1 << imm));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (IPC)
3A600022
3C500022
+

__m128i __lsx_vbitclri_h (__m128i a, imm0_15 imm)

+

Synopsis

+
__m128i __lsx_vbitclri_h (__m128i a, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vbitclri.h vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Clear the bit at index imm in each 16-bit element of a, and save the result in dst.

+

Examples

+
__m128i __lsx_vbitclri_h(__m128i{0xffffffffffffffff, 0x99aabbccddeeff00}, 1)
+= 0xfffdfffdfffdfffd 0x99a8bbccddecff00
+
+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = a.half[i] & (~((u16)1 << imm));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (IPC)
3A600022
3C500022
+

__m128i __lsx_vbitclri_w (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vbitclri_w (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vbitclri.w vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Clear the bit at index imm in each 32-bit element of a, and save the result in dst.

+

Examples

+
__m128i __lsx_vbitclri_w(__m128i{0xffffffffffffffff, 0x99aabbccddeeff00}, 1)
+= 0xfffffffdfffffffd 0x99aabbccddeeff00
+
+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = a.word[i] & (~((u32)1 << imm));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (IPC)
3A600022
3C500022
+

__m128i __lsx_vbitclri_d (__m128i a, imm0_63 imm)

+

Synopsis

+
__m128i __lsx_vbitclri_d (__m128i a, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vbitclri.d vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Clear the bit at index imm in each 64-bit element of a, and save the result in dst.

+

Examples

+
__m128i __lsx_vbitclri_d(__m128i{0xffffffffffffffff, 0x99aabbccddeeff00}, 1)
+= 0xfffffffffffffffd 0x99aabbccddeeff00
+
+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = a.dword[i] & (~((u64)1 << imm));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (IPC)
3A600022
3C500022
+

__m128i __lsx_vbitset_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vbitset_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vbitset.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

For each 8-bit element in a, set the bit whose index is given by the corresponding element in b (taken modulo 8), and save the result in dst.

+

Examples

+
__m128i __lsx_vbitset_b(__m128i{0x0000000000000000, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321})
+= 0x0808080808080808 0x9dbabfdcddeeff02
+
+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = a.byte[i] | ((u8)1 << (b.byte[i] % 8));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (IPC)
3A600022
3C500022
+

__m128i __lsx_vbitset_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vbitset_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vbitset.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

For each 16-bit element in a, set the bit whose index is given by the corresponding element in b (taken modulo 16), and save the result in dst.

+

Examples

+
__m128i __lsx_vbitset_h(__m128i{0x0000000000000000, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321})
+= 0x0800080008000800 0x99babbdcddeeff02
+
+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = a.half[i] | ((u16)1 << (b.half[i] % 16));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (IPC)
3A600022
3C500022
+

__m128i __lsx_vbitset_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vbitset_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vbitset.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

For each 32-bit element in a, set the bit whose index is given by the corresponding element in b (taken modulo 32), and save the result in dst.

+

Examples

+
__m128i __lsx_vbitset_w(__m128i{0x0000000000000000, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321})
+= 0x0000080000000800 0x99babbccddeeff02
+
+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = a.word[i] | ((u32)1 << (b.word[i] % 32));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (IPC)
3A600022
3C500022
+

__m128i __lsx_vbitset_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vbitset_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vbitset.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

For each 64-bit element in a, set the bit whose index is given by the corresponding element in b (taken modulo 64), and save the result in dst.

+

Examples

+
__m128i __lsx_vbitset_d(__m128i{0x0000000000000000, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321})
+= 0x0000080000000000 0x99aabbceddeeff00
+
+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = a.dword[i] | ((u64)1 << (b.dword[i] % 64));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (IPC)
3A600022
3C500022
+

__m128i __lsx_vbitseti_b (__m128i a, imm0_7 imm)

+

Synopsis

+
__m128i __lsx_vbitseti_b (__m128i a, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vbitseti.b vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Set the bit at index imm in each 8-bit element of a, and save the result in dst.

+

Examples

+
__m128i __lsx_vbitseti_b(__m128i{0x0000000000000000, 0x99aabbccddeeff00}, 1)
+= 0x0202020202020202 0x9baabbcedfeeff02
+
+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = a.byte[i] | ((u8)1 << imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (IPC)
3A600022
3C500022
+

__m128i __lsx_vbitseti_h (__m128i a, imm0_15 imm)

+

Synopsis

+
__m128i __lsx_vbitseti_h (__m128i a, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vbitseti.h vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Set the bit at index imm in each 16-bit element of a, and save the result in dst.

+

Examples

+
__m128i __lsx_vbitseti_h(__m128i{0x0000000000000000, 0x99aabbccddeeff00}, 1)
+= 0x0002000200020002 0x99aabbceddeeff02
+
+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = a.half[i] | ((u16)1 << imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (IPC)
3A600022
3C500022
+

__m128i __lsx_vbitseti_w (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vbitseti_w (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vbitseti.w vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Set the bit at index imm in each 32-bit element of a, and save the result in dst.

+

Examples

+
__m128i __lsx_vbitseti_w(__m128i{0x0000000000000000, 0x99aabbccddeeff00}, 1)
+= 0x0000000200000002 0x99aabbceddeeff02
+
+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = a.word[i] | ((u32)1 << imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (IPC)
3A600022
3C500022
+

__m128i __lsx_vbitseti_d (__m128i a, imm0_63 imm)

+

Synopsis

+
__m128i __lsx_vbitseti_d (__m128i a, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vbitseti.d vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Set the bit at index imm in each 64-bit element of a, and save the result in dst.

+

Examples

+
__m128i __lsx_vbitseti_d(__m128i{0x0000000000000000, 0x99aabbccddeeff00}, 1)
+= 0x0000000000000002 0x99aabbccddeeff02
+
+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = a.dword[i] | ((u64)1 << imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (IPC)
3A600022
3C500022
+

__m128i __lsx_vbitrev_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vbitrev_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vbitrev.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

For each 8-bit element in a, toggle the bit whose index is given by the corresponding element in b (taken modulo 8), and save the result in dst.

+

Examples

+
__m128i __lsx_vbitrev_b(__m128i{0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321})
+= 0x0707070707070707 0x9dbabfdcd5ecf702
+
+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = a.byte[i] ^ ((u8)1 << (b.byte[i] % 8));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (IPC)
3A600022
3C500022
+

__m128i __lsx_vbitrev_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vbitrev_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vbitrev.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

For each 16-bit element in a, toggle the bit whose index is given by the corresponding element in b (taken modulo 16), and save the result in dst.

+

Examples

+
__m128i __lsx_vbitrev_h(__m128i{0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321})
+= 0x070f070f070f070f 0x99babbdcddecff02
+
+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = a.half[i] ^ ((u16)1 << (b.half[i] % 16));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (IPC)
3A600022
3C500022
+

__m128i __lsx_vbitrev_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vbitrev_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vbitrev.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

For each 32-bit element in a, toggle the bit whose index is given by the corresponding element in b (taken modulo 32), and save the result in dst.

+

Examples

+
__m128i __lsx_vbitrev_w(__m128i{0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321})
+= 0x0f0f070f0f0f070f 0x99babbccddeeff02
+
+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = a.word[i] ^ ((u32)1 << (b.word[i] % 32));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (IPC)
3A600022
3C500022
+

__m128i __lsx_vbitrev_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vbitrev_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vbitrev.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

For each 64-bit element in a, toggle the bit whose index is given by the corresponding element in b (taken modulo 64), and save the result in dst.

+

Examples

+
__m128i __lsx_vbitrev_d(__m128i{0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321})
+= 0x0f0f070f0f0f0f0f 0x99aabbceddeeff00
+
+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = a.dword[i] ^ ((u64)1 << (b.dword[i] % 64));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (IPC)
3A600022
3C500022
+

__m128i __lsx_vbitrevi_b (__m128i a, imm0_7 imm)

+

Synopsis

+
__m128i __lsx_vbitrevi_b (__m128i a, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vbitrevi.b vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Toggle the bit at index imm in each 8-bit element of a, and save the result in dst.

+

Examples

+
__m128i __lsx_vbitrevi_b(__m128i{0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00}, 1)
+= 0x0d0d0d0d0d0d0d0d 0x9ba8b9cedfecfd02
+
+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = a.byte[i] ^ ((u8)1 << imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (IPC)
3A600022
3C500022
+

__m128i __lsx_vbitrevi_h (__m128i a, imm0_15 imm)

+

Synopsis

+
__m128i __lsx_vbitrevi_h (__m128i a, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vbitrevi.h vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Toggle the bit at index imm in each 16-bit element of a, and save the result in dst.

+

Examples

+
__m128i __lsx_vbitrevi_h(__m128i{0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00}, 1)
+= 0x0f0d0f0d0f0d0f0d 0x99a8bbceddecff02
+
+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = a.half[i] ^ ((u16)1 << imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (IPC)
3A600022
3C500022
+

__m128i __lsx_vbitrevi_w (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vbitrevi_w (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vbitrevi.w vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Toggle the bit at index imm in each 32-bit element of a, and save the result in dst.

+

Examples

+
__m128i __lsx_vbitrevi_w(__m128i{0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00}, 1)
+= 0x0f0f0f0d0f0f0f0d 0x99aabbceddeeff02
+
+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = a.word[i] ^ ((u32)1 << imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (IPC)
3A600022
3C500022
+

__m128i __lsx_vbitrevi_d (__m128i a, imm0_63 imm)

+

Synopsis

+
__m128i __lsx_vbitrevi_d (__m128i a, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vbitrevi.d vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Toggle the bit specified by imm in each 64-bit element of a, and save the result in dst.

+

Examples

+
__m128i __lsx_vbitrevi_d(__m128i{0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00}, 1)
+= 0x0f0f0f0f0f0f0f0d 0x99aabbccddeeff02
+
+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = a.dword[i] ^ ((u64)1 << imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vclo_b (__m128i a)

+

Synopsis

+
__m128i __lsx_vclo_b (__m128i a)
+#include <lsxintrin.h>
+Instruction: vclo.b vr, vr
+CPU Flags: LSX
+
+

Description

+

Count leading ones of 8-bit elements in a.

+

Examples

+
__m128i __lsx_vclo_b(__m128i{0x1122334455667788, 0x99aabbccddeeff00})
+= 0x0000000000000001 0x0101010202030800
+
+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = clo(a.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vclo_h (__m128i a)

+

Synopsis

+
__m128i __lsx_vclo_h (__m128i a)
+#include <lsxintrin.h>
+Instruction: vclo.h vr, vr
+CPU Flags: LSX
+
+

Description

+

Count leading ones of 16-bit elements in a.

+

Examples

+
__m128i __lsx_vclo_h(__m128i{0x1122334455667788, 0x99aabbccddeeff00})
+= 0x0000000000000000 0x0001000100020008
+
+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = clo(a.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vclo_w (__m128i a)

+

Synopsis

+
__m128i __lsx_vclo_w (__m128i a)
+#include <lsxintrin.h>
+Instruction: vclo.w vr, vr
+CPU Flags: LSX
+
+

Description

+

Count leading ones of 32-bit elements in a.

+

Examples

+
__m128i __lsx_vclo_w(__m128i{0x1122334455667788, 0x99aabbccddeeff00})
+= 0x0000000000000000 0x0000000100000002
+
+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = clo(a.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vclo_d (__m128i a)

+

Synopsis

+
__m128i __lsx_vclo_d (__m128i a)
+#include <lsxintrin.h>
+Instruction: vclo.d vr, vr
+CPU Flags: LSX
+
+

Description

+

Count leading ones of 64-bit elements in a.

+

Examples

+
__m128i __lsx_vclo_d(__m128i{0x1122334455667788, 0x99aabbccddeeff00})
+= 0x0000000000000000 0x0000000000000001
+
+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = clo(a.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vclz_b (__m128i a)

+

Synopsis

+
__m128i __lsx_vclz_b (__m128i a)
+#include <lsxintrin.h>
+Instruction: vclz.b vr, vr
+CPU Flags: LSX
+
+

Description

+

Count leading zeros of 8-bit elements in a.

+

Examples

+
__m128i __lsx_vclz_b(__m128i{0x1122334455667788, 0x99aabbccddeeff00})
+= 0x0302020101010100 0x0000000000000008
+
+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = clz(a.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vclz_h (__m128i a)

+

Synopsis

+
__m128i __lsx_vclz_h (__m128i a)
+#include <lsxintrin.h>
+Instruction: vclz.h vr, vr
+CPU Flags: LSX
+
+

Description

+

Count leading zeros of 16-bit elements in a.

+

Examples

+
__m128i __lsx_vclz_h(__m128i{0x1122334455667788, 0x99aabbccddeeff00})
+= 0x0003000200010001 0x0000000000000000
+
+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = clz(a.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vclz_w (__m128i a)

+

Synopsis

+
__m128i __lsx_vclz_w (__m128i a)
+#include <lsxintrin.h>
+Instruction: vclz.w vr, vr
+CPU Flags: LSX
+
+

Description

+

Count leading zeros of 32-bit elements in a.

+

Examples

+
__m128i __lsx_vclz_w(__m128i{0x1122334455667788, 0x99aabbccddeeff00})
+= 0x0000000300000001 0x0000000000000000
+
+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = clz(a.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vclz_d (__m128i a)

+

Synopsis

+
__m128i __lsx_vclz_d (__m128i a)
+#include <lsxintrin.h>
+Instruction: vclz.d vr, vr
+CPU Flags: LSX
+
+

Description

+

Count leading zeros of 64-bit elements in a.

+

Examples

+
__m128i __lsx_vclz_d(__m128i{0x1122334455667788, 0x99aabbccddeeff00})
+= 0x0000000000000003 0x0000000000000000
+
+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = clz(a.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vpcnt_b (__m128i a)

+

Synopsis

+
__m128i __lsx_vpcnt_b (__m128i a)
+#include <lsxintrin.h>
+Instruction: vpcnt.b vr, vr
+CPU Flags: LSX
+
+

Description

+

Count the number of ones (population, popcount) in 8-bit elements in a.

+

Examples

+
__m128i __lsx_vpcnt_b(__m128i{0x1122334455667788, 0x99aabbccddeeff00})
+= 0x0202040204040602 0x0404060406060800
+
+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = popcount(a.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vpcnt_h (__m128i a)

+

Synopsis

+
__m128i __lsx_vpcnt_h (__m128i a)
+#include <lsxintrin.h>
+Instruction: vpcnt.h vr, vr
+CPU Flags: LSX
+
+

Description

+

Count the number of ones (population, popcount) in 16-bit elements in a.

+

Examples

+
__m128i __lsx_vpcnt_h(__m128i{0x1122334455667788, 0x99aabbccddeeff00})
+= 0x0004000600080008 0x0008000a000c0008
+
+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = popcount(a.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vpcnt_w (__m128i a)

+

Synopsis

+
__m128i __lsx_vpcnt_w (__m128i a)
+#include <lsxintrin.h>
+Instruction: vpcnt.w vr, vr
+CPU Flags: LSX
+
+

Description

+

Count the number of ones (population, popcount) in 32-bit elements in a.

+

Examples

+
__m128i __lsx_vpcnt_w(__m128i{0x1122334455667788, 0x99aabbccddeeff00})
+= 0x0000000a00000010 0x0000001200000014
+
+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = popcount(a.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vpcnt_d (__m128i a)

+

Synopsis

+
__m128i __lsx_vpcnt_d (__m128i a)
+#include <lsxintrin.h>
+Instruction: vpcnt.d vr, vr
+CPU Flags: LSX
+
+

Description

+

Count the number of ones (population, popcount) in 64-bit elements in a.

+

Examples

+
__m128i __lsx_vpcnt_d(__m128i{0x1122334455667788, 0x99aabbccddeeff00})
+= 0x000000000000001a 0x0000000000000026
+
+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = popcount(a.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+ +
+
+ +
+
+ +
+ +
+ +
+ + + + « Previous + + + Next » + + +
+ + + + + + + + diff --git a/lsx/branch/index.html b/lsx/branch/index.html new file mode 100644 index 00000000..3770b352 --- /dev/null +++ b/lsx/branch/index.html @@ -0,0 +1,709 @@ + + + + + + + + Branch - Unofficial LoongArch Intrinsics Guide + + + + + + + + + + + + + + +
+ + +
+ +
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Branch

+

int __lsx_bz_v (__m128i a)

+

Synopsis

+
int __lsx_bz_v (__m128i a)
+#include <lsxintrin.h>
+Instruction: vseteqz.v fcc, vr; bcnez
+CPU Flags: LSX
+
+

Description

+

Expected to be used in branches: branch if the whole vector a is equal to zero.

+

Operation

+
dst = a.qword[0] == 0;
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A6000N/A2
3C5000N/A2
+

int __lsx_bnz_v (__m128i a)

+

Synopsis

+
int __lsx_bnz_v (__m128i a)
+#include <lsxintrin.h>
+Instruction: vsetnez.v fcc, vr; bcnez
+CPU Flags: LSX
+
+

Description

+

Expected to be used in branches: branch if the whole vector a is non-zero.

+

Operation

+
dst = a.qword[0] != 0;
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A6000N/A2
3C5000N/A2
+

int __lsx_bz_b (__m128i a)

+

Synopsis

+
int __lsx_bz_b (__m128i a)
+#include <lsxintrin.h>
+Instruction: vsetanyeqz.b fcc, vr; bcnez
+CPU Flags: LSX
+
+

Description

+

Expected to be used in branches: branch if any 8-bit element in a is equal to zero.

+

Operation

+
dst = 0;
+for (int i = 0; i < 16; i++) {
+  if (a.byte[i] == 0) {
+    dst = 1;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A6000N/A2
3C5000N/A2
+

int __lsx_bz_h (__m128i a)

+

Synopsis

+
int __lsx_bz_h (__m128i a)
+#include <lsxintrin.h>
+Instruction: vsetanyeqz.h fcc, vr; bcnez
+CPU Flags: LSX
+
+

Description

+

Expected to be used in branches: branch if any 16-bit element in a is equal to zero.

+

Operation

+
dst = 0;
+for (int i = 0; i < 8; i++) {
+  if (a.half[i] == 0) {
+    dst = 1;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A6000N/A2
3C5000N/A2
+

int __lsx_bz_w (__m128i a)

+

Synopsis

+
int __lsx_bz_w (__m128i a)
+#include <lsxintrin.h>
+Instruction: vsetanyeqz.w fcc, vr; bcnez
+CPU Flags: LSX
+
+

Description

+

Expected to be used in branches: branch if any 32-bit element in a is equal to zero.

+

Operation

+
dst = 0;
+for (int i = 0; i < 4; i++) {
+  if (a.word[i] == 0) {
+    dst = 1;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A6000N/A2
3C5000N/A2
+

int __lsx_bz_d (__m128i a)

+

Synopsis

+
int __lsx_bz_d (__m128i a)
+#include <lsxintrin.h>
+Instruction: vsetanyeqz.d fcc, vr; bcnez
+CPU Flags: LSX
+
+

Description

+

Expected to be used in branches: branch if any 64-bit element in a is equal to zero.

+

Operation

+
dst = 0;
+for (int i = 0; i < 2; i++) {
+  if (a.dword[i] == 0) {
+    dst = 1;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A6000N/A2
3C5000N/A2
+

int __lsx_bnz_b (__m128i a)

+

Synopsis

+
int __lsx_bnz_b (__m128i a)
+#include <lsxintrin.h>
+Instruction: vsetallnez.b fcc, vr; bcnez
+CPU Flags: LSX
+
+

Description

+

Expected to be used in branches: branch if all 8-bit elements in a are non-zero.

+

Operation

+
dst = 1;
+for (int i = 0; i < 16; i++) {
+  if (a.byte[i] == 0) {
+    dst = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A6000N/A2
3C5000N/A2
+

int __lsx_bnz_h (__m128i a)

+

Synopsis

+
int __lsx_bnz_h (__m128i a)
+#include <lsxintrin.h>
+Instruction: vsetallnez.h fcc, vr; bcnez
+CPU Flags: LSX
+
+

Description

+

Expected to be used in branches: branch if all 16-bit elements in a are non-zero.

+

Operation

+
dst = 1;
+for (int i = 0; i < 8; i++) {
+  if (a.half[i] == 0) {
+    dst = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A6000N/A2
3C5000N/A2
+

int __lsx_bnz_w (__m128i a)

+

Synopsis

+
int __lsx_bnz_w (__m128i a)
+#include <lsxintrin.h>
+Instruction: vsetallnez.w fcc, vr; bcnez
+CPU Flags: LSX
+
+

Description

+

Expected to be used in branches: branch if all 32-bit elements in a are non-zero.

+

Operation

+
dst = 1;
+for (int i = 0; i < 4; i++) {
+  if (a.word[i] == 0) {
+    dst = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A6000N/A2
3C5000N/A2
+

int __lsx_bnz_d (__m128i a)

+

Synopsis

+
int __lsx_bnz_d (__m128i a)
+#include <lsxintrin.h>
+Instruction: vsetallnez.d fcc, vr; bcnez
+CPU Flags: LSX
+
+

Description

+

Expected to be used in branches: branch if all 64-bit elements in a are non-zero.

+

Operation

+
dst = 1;
+for (int i = 0; i < 2; i++) {
+  if (a.dword[i] == 0) {
+    dst = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A6000N/A2
3C5000N/A2
+ +
+
+ +
+
+ +
+ +
+ +
+ + + + « Previous + + + Next » + + +
+ + + + + + + + diff --git a/lsx/float_comparison/index.html b/lsx/float_comparison/index.html new file mode 100644 index 00000000..f01ca71d --- /dev/null +++ b/lsx/float_comparison/index.html @@ -0,0 +1,2443 @@ + + + + + + + + Floating Point Comparison - Unofficial LoongArch Intrinsics Guide + + + + + + + + + + + + + + +
+ + +
+ +
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Floating Point Comparison

+

__m128i __lsx_vfcmp_caf_s (__m128 a, __m128 b)

+

Synopsis

+
__m128i __lsx_vfcmp_caf_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.caf.s vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if AF(Always False), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_caf(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_caf_d (__m128d a, __m128d b)

+

Synopsis

+
__m128i __lsx_vfcmp_caf_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.caf.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if AF(Always False), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (fp_compare_caf(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_ceq_s (__m128 a, __m128 b)

+

Synopsis

+
__m128i __lsx_vfcmp_ceq_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.ceq.s vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if EQ(Equal), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_ceq(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_ceq_d (__m128d a, __m128d b)

+

Synopsis

+
__m128i __lsx_vfcmp_ceq_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.ceq.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if EQ(Equal), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (fp_compare_ceq(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_cle_s (__m128 a, __m128 b)

+

Synopsis

+
__m128i __lsx_vfcmp_cle_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cle.s vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if LE(Less than or Equal), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_cle(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_cle_d (__m128d a, __m128d b)

+

Synopsis

+
__m128i __lsx_vfcmp_cle_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cle.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if LE(Less than or Equal), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (fp_compare_cle(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_clt_s (__m128 a, __m128 b)

+

Synopsis

+
__m128i __lsx_vfcmp_clt_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.clt.s vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if LT(Less than), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_clt(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_clt_d (__m128d a, __m128d b)

+

Synopsis

+
__m128i __lsx_vfcmp_clt_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.clt.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if LT(Less than), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (fp_compare_clt(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_cne_s (__m128 a, __m128 b)

+

Synopsis

+
__m128i __lsx_vfcmp_cne_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cne.s vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if NE(Not Equal), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_cne(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_cne_d (__m128d a, __m128d b)

+

Synopsis

+
__m128i __lsx_vfcmp_cne_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cne.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if NE(Not Equal), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (fp_compare_cne(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_cor_s (__m128 a, __m128 b)

+

Synopsis

+
__m128i __lsx_vfcmp_cor_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cor.s vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if OR(Ordered), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_cor(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_cor_d (__m128d a, __m128d b)

+

Synopsis

+
__m128i __lsx_vfcmp_cor_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cor.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if OR(Ordered), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (fp_compare_cor(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_cueq_s (__m128 a, __m128 b)

+

Synopsis

+
__m128i __lsx_vfcmp_cueq_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cueq.s vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if UEQ(Unordered or Equal), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_cueq(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_cueq_d (__m128d a, __m128d b)

+

Synopsis

+
__m128i __lsx_vfcmp_cueq_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cueq.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if UEQ(Unordered or Equal), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (fp_compare_cueq(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_cule_s (__m128 a, __m128 b)

+

Synopsis

+
__m128i __lsx_vfcmp_cule_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cule.s vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if ULE(Unordered, Less than or Equal), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_cule(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_cule_d (__m128d a, __m128d b)

+

Synopsis

+
__m128i __lsx_vfcmp_cule_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cule.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if ULE(Unordered, Less than or Equal), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (fp_compare_cule(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_cult_s (__m128 a, __m128 b)

+

Synopsis

+
__m128i __lsx_vfcmp_cult_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cult.s vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if ULT(Unordered or Less than), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_cult(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_cult_d (__m128d a, __m128d b)

+

Synopsis

+
__m128i __lsx_vfcmp_cult_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cult.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if ULT(Unordered or Less than), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (fp_compare_cult(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_cun_s (__m128 a, __m128 b)

+

Synopsis

+
__m128i __lsx_vfcmp_cun_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cun.s vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if UN(Unordered), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_cun(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_cun_d (__m128d a, __m128d b)

+

Synopsis

+
__m128i __lsx_vfcmp_cun_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cun.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if UN(Unordered), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (fp_compare_cun(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_cune_s (__m128 a, __m128 b)

+

Synopsis

+
__m128i __lsx_vfcmp_cune_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cune.s vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if UNE(Unordered or Not Equal), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_cune(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_cune_d (__m128d a, __m128d b)

+

Synopsis

+
__m128i __lsx_vfcmp_cune_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cune.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if UNE(Unordered or Not Equal), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (fp_compare_cune(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_saf_s (__m128 a, __m128 b)

+

Synopsis

+
__m128i __lsx_vfcmp_saf_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.saf.s vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if AF(Always False), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_saf(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_saf_d (__m128d a, __m128d b)

+

Synopsis

+
__m128i __lsx_vfcmp_saf_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.saf.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if AF(Always False), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (fp_compare_saf(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_seq_s (__m128 a, __m128 b)

+

Synopsis

+
__m128i __lsx_vfcmp_seq_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.seq.s vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if EQ(Equal), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_seq(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_seq_d (__m128d a, __m128d b)

+

Synopsis

+
__m128i __lsx_vfcmp_seq_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.seq.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if EQ(Equal), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (fp_compare_seq(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_sle_s (__m128 a, __m128 b)

+

Synopsis

+
__m128i __lsx_vfcmp_sle_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sle.s vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if LE(Less than or Equal), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_sle(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_sle_d (__m128d a, __m128d b)

+

Synopsis

+
__m128i __lsx_vfcmp_sle_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sle.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if LE(Less than or Equal), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (fp_compare_sle(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_slt_s (__m128 a, __m128 b)

+

Synopsis

+
__m128i __lsx_vfcmp_slt_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.slt.s vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if LT(Less than), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_slt(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_slt_d (__m128d a, __m128d b)

+

Synopsis

+
__m128i __lsx_vfcmp_slt_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.slt.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if LT(Less than), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (fp_compare_slt(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_sne_s (__m128 a, __m128 b)

+

Synopsis

+
__m128i __lsx_vfcmp_sne_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sne.s vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if NE(Not Equal), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_sne(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_sne_d (__m128d a, __m128d b)

+

Synopsis

+
__m128i __lsx_vfcmp_sne_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sne.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if NE(Not Equal), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (fp_compare_sne(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_sor_s (__m128 a, __m128 b)

+

Synopsis

+
__m128i __lsx_vfcmp_sor_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sor.s vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if OR(Ordered), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_sor(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_sor_d (__m128d a, __m128d b)

+

Synopsis

+
__m128i __lsx_vfcmp_sor_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sor.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if OR(Ordered), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (fp_compare_sor(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_sueq_s (__m128 a, __m128 b)

+

Synopsis

+
__m128i __lsx_vfcmp_sueq_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sueq.s vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if UEQ(Unordered or Equal), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_sueq(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_sueq_d (__m128d a, __m128d b)

+

Synopsis

+
__m128i __lsx_vfcmp_sueq_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sueq.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if UEQ(Unordered or Equal), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (fp_compare_sueq(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_sule_s (__m128 a, __m128 b)

+

Synopsis

+
__m128i __lsx_vfcmp_sule_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sule.s vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if ULE(Unordered, Less than or Equal), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_sule(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_sule_d (__m128d a, __m128d b)

+

Synopsis

+
__m128i __lsx_vfcmp_sule_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sule.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if ULE(Unordered, Less than or Equal), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (fp_compare_sule(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_sult_s (__m128 a, __m128 b)

+

Synopsis

+
__m128i __lsx_vfcmp_sult_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sult.s vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if ULT(Unordered or Less than), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_sult(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_sult_d (__m128d a, __m128d b)

+

Synopsis

+
__m128i __lsx_vfcmp_sult_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sult.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if ULT(Unordered or Less than), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (fp_compare_sult(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_sun_s (__m128 a, __m128 b)

+

Synopsis

+
__m128i __lsx_vfcmp_sun_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sun.s vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if UN(Unordered), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_sun(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_sun_d (__m128d a, __m128d b)

+

Synopsis

+
__m128i __lsx_vfcmp_sun_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sun.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if UN(Unordered), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (fp_compare_sun(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_sune_s (__m128 a, __m128 b)

+

Synopsis

+
__m128i __lsx_vfcmp_sune_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sune.s vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if UNE(Unordered or Not Equal), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_sune(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_sune_d (__m128d a, __m128d b)

+

Synopsis

+
__m128i __lsx_vfcmp_sune_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sune.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if UNE(Unordered or Not Equal), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (fp_compare_sune(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+ +
+
+ +
+
+ +
+ +
+ +
+ + + + « Previous + + + Next » + + +
+ + + + + + + + diff --git a/lsx/float_computation/index.html b/lsx/float_computation/index.html new file mode 100644 index 00000000..a130d12d --- /dev/null +++ b/lsx/float_computation/index.html @@ -0,0 +1,1451 @@ + + + + + + + + Floating Point Computation - Unofficial LoongArch Intrinsics Guide + + + + + + + + + + + + + + +
+ + +
+ +
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Floating Point Computation

+

__m128 __lsx_vfadd_s (__m128 a, __m128 b)

+

Synopsis

+
__m128 __lsx_vfadd_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfadd.s vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add single precision floating point elements in a to elements in b.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp32[i] = a.fp32[i] + b.fp32[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600034
3C500052
+

__m128d __lsx_vfadd_d (__m128d a, __m128d b)

+

Synopsis

+
__m128d __lsx_vfadd_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfadd.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add double precision floating point elements in a to elements in b.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.fp64[i] = a.fp64[i] + b.fp64[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600034
3C500052
+

__m128 __lsx_vfdiv_s (__m128 a, __m128 b)

+

Synopsis

+
__m128 __lsx_vfdiv_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfdiv.s vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Divide single precision floating point elements in a by elements in b.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp32[i] = a.fp32[i] / b.fp32[i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A6000110.18(1/5.5)
3C500011, 19.50.13(1/7.5)
+

__m128d __lsx_vfdiv_d (__m128d a, __m128d b)

+

Synopsis

+
__m128d __lsx_vfdiv_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfdiv.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Divide double precision floating point elements in a by elements in b.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.fp64[i] = a.fp64[i] / b.fp64[i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A60008, 21.50.25(1/4)
3C50008, 16.50.08(1/12.5)
+

__m128 __lsx_vfmax_s (__m128 a, __m128 b)

+

Synopsis

+
__m128 __lsx_vfmax_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfmax.s vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute maximum of single precision floating point elements in a and b.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp32[i] = fmax(a.fp32[i], b.fp32[i]);
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128d __lsx_vfmax_d (__m128d a, __m128d b)

+

Synopsis

+
__m128d __lsx_vfmax_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfmax.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute maximum of double precision floating point elements in a and b.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.fp64[i] = fmax(a.fp64[i], b.fp64[i]);
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128 __lsx_vfmaxa_s (__m128 a, __m128 b)

+

Synopsis

+
__m128 __lsx_vfmaxa_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfmaxa.s vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute maximum of single precision floating point elements in a and b by magnitude.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp32[i] = (abs(a.fp32[i]) > abs(b.fp32[i])) ? a.fp32[i] : b.fp32[i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128d __lsx_vfmaxa_d (__m128d a, __m128d b)

+

Synopsis

+
__m128d __lsx_vfmaxa_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfmaxa.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute maximum of double precision floating point elements in a and b by magnitude.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.fp64[i] = (abs(a.fp64[i]) > abs(b.fp64[i])) ? a.fp64[i] : b.fp64[i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128 __lsx_vfmin_s (__m128 a, __m128 b)

+

Synopsis

+
__m128 __lsx_vfmin_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfmin.s vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute minimum of single precision floating point elements in a and b.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp32[i] = fmin(a.fp32[i], b.fp32[i]);
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128d __lsx_vfmin_d (__m128d a, __m128d b)

+

Synopsis

+
__m128d __lsx_vfmin_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfmin.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute minimum of double precision floating point elements in a and b.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.fp64[i] = fmin(a.fp64[i], b.fp64[i]);
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128 __lsx_vfmina_s (__m128 a, __m128 b)

+

Synopsis

+
__m128 __lsx_vfmina_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfmina.s vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute minimum of single precision floating point elements in a and b by magnitude.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp32[i] = (abs(a.fp32[i]) < abs(b.fp32[i])) ? a.fp32[i] : b.fp32[i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128d __lsx_vfmina_d (__m128d a, __m128d b)

+

Synopsis

+
__m128d __lsx_vfmina_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfmina.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute minimum of double precision floating point elements in a and b by magnitude.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.fp64[i] = (abs(a.fp64[i]) < abs(b.fp64[i])) ? a.fp64[i] : b.fp64[i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128 __lsx_vfmul_s (__m128 a, __m128 b)

+

Synopsis

+
__m128 __lsx_vfmul_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfmul.s vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply single precision floating point elements in a and elements in b.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp32[i] = a.fp32[i] * b.fp32[i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500052
+

__m128d __lsx_vfmul_d (__m128d a, __m128d b)

+

Synopsis

+
__m128d __lsx_vfmul_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfmul.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply double precision floating point elements in a and elements in b.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.fp64[i] = a.fp64[i] * b.fp64[i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500052
+

__m128 __lsx_vfsub_s (__m128 a, __m128 b)

+

Synopsis

+
__m128 __lsx_vfsub_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfsub.s vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Subtract single precision floating point elements in b from elements in a.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp32[i] = a.fp32[i] - b.fp32[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600034
3C500052
+

__m128d __lsx_vfsub_d (__m128d a, __m128d b)

+

Synopsis

+
__m128d __lsx_vfsub_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfsub.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Subtract double precision floating point elements in b from elements in a.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.fp64[i] = a.fp64[i] - b.fp64[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600034
3C500052
+

__m128 __lsx_vflogb_s (__m128 a)

+

Synopsis

+
__m128 __lsx_vflogb_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vflogb.s vr, vr
+CPU Flags: LSX
+
+

Description

+

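Compute 2-based logarithm of single precision floating point elements in a, following the IEEE 754-2008 logB operation (the result is the integral exponent of each element, not a fractional logarithm).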

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp32[i] = logb(a.fp32[i]); // IEEE 754-2008 logB: integral base-2 exponent
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m128d __lsx_vflogb_d (__m128d a)

+

Synopsis

+
__m128d __lsx_vflogb_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vflogb.d vr, vr
+CPU Flags: LSX
+
+

Description

+

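Compute 2-based logarithm of double precision floating point elements in a, following the IEEE 754-2008 logB operation (the result is the integral exponent of each element, not a fractional logarithm).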

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.fp64[i] = logb(a.fp64[i]); // IEEE 754-2008 logB: integral base-2 exponent
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m128 __lsx_vfsqrt_s (__m128 a)

+

Synopsis

+
__m128 __lsx_vfsqrt_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vfsqrt.s vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute square root of single precision floating point elements in a.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp32[i] = sqrt(a.fp32[i]);
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A6000110.08(1/12)
3C5000270.17(1/6)
+

__m128d __lsx_vfsqrt_d (__m128d a)

+

Synopsis

+
__m128d __lsx_vfsqrt_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vfsqrt.d vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute square root of double precision floating point elements in a.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.fp64[i] = sqrt(a.fp64[i]);
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A6000360.06(1/17.5)
3C5000360.05(1/18.5)
+

__m128 __lsx_vfrsqrt_s (__m128 a)

+

Synopsis

+
__m128 __lsx_vfrsqrt_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vfrsqrt.s vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute reciprocal of square root of single precision floating point elements in a.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp32[i] = 1.0 / sqrt(a.fp32[i]);
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A6000170.05(1/19)
3C5000210.11(1/9)
+

__m128d __lsx_vfrsqrt_d (__m128d a)

+

Synopsis

+
__m128d __lsx_vfrsqrt_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vfrsqrt.d vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute reciprocal of square root of double precision floating point elements in a.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.fp64[i] = 1.0 / sqrt(a.fp64[i]);
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A6000150.04(1/26.5)
3C5000150.04(1/27.5)
+

__m128 __lsx_vfrecip_s (__m128 a)

+

Synopsis

+
__m128 __lsx_vfrecip_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vfrecip.s vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute reciprocal of single precision floating point elements in a.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp32[i] = 1 / a.fp32[i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A6000110.18(1/5.5)
3C5000270.14(1/7)
+

__m128d __lsx_vfrecip_d (__m128d a)

+

Synopsis

+
__m128d __lsx_vfrecip_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vfrecip.d vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute reciprocal of double precision floating point elements in a.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.fp64[i] = 1 / a.fp64[i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600080.25(1/4)
3C5000230.08(1/12)
+

__m128 __lsx_vfrsqrte_s (__m128 a)

+

Synopsis

+
__m128 __lsx_vfrsqrte_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vfrsqrte.s vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute estimated reciprocal of square root of single precision floating point elements in a.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp32[i] = 1.0 / sqrt(a.fp32[i]); // estimated
+}
+
+

__m128d __lsx_vfrsqrte_d (__m128d a)

+

Synopsis

+
__m128d __lsx_vfrsqrte_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vfrsqrte.d vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute estimated reciprocal of square root of double precision floating point elements in a.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.fp64[i] = 1.0 / sqrt(a.fp64[i]); // estimated
+}
+
+

__m128 __lsx_vfrecipe_s (__m128 a)

+

Synopsis

+
__m128 __lsx_vfrecipe_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vfrecipe.s vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute estimated reciprocal of single precision floating point elements in a.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp32[i] = 1 / a.fp32[i]; // estimated
+}
+
+

__m128d __lsx_vfrecipe_d (__m128d a)

+

Synopsis

+
__m128d __lsx_vfrecipe_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vfrecipe.d vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute estimated reciprocal of double precision floating point elements in a.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.fp64[i] = 1 / a.fp64[i]; // estimated
+}
+
+ +
+
+ +
+
+ +
+ +
+ +
+ + + + « Previous + + + Next » + + +
+ + + + + + + + diff --git a/lsx/float_conversion/index.html b/lsx/float_conversion/index.html new file mode 100644 index 00000000..b6f92c39 --- /dev/null +++ b/lsx/float_conversion/index.html @@ -0,0 +1,2240 @@ + + + + + + + + Floating Point Conversion - Unofficial LoongArch Intrinsics Guide + + + + + + + + + + + + + + +
+ + +
+ +
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Floating Point Conversion

+

__m128d __lsx_vfcvth_d_s (__m128 a)

+

Synopsis

+
__m128d __lsx_vfcvth_d_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vfcvth.d.s vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert single precision floating point elements in higher half of a to double precision.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.fp64[i] = a.fp32[2 + i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500031
+

__m128d __lsx_vfcvtl_d_s (__m128 a)

+

Synopsis

+
__m128d __lsx_vfcvtl_d_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vfcvtl.d.s vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert single precision floating point elements in lower half of a to double precision.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.fp64[i] = a.fp32[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500031
+

__m128 __lsx_vfcvt_s_d (__m128d a, __m128d b)

+

Synopsis

+
__m128 __lsx_vfcvt_s_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcvt.s.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert double precision floating point elements in a and b to single precision.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    dst.fp32[i] = b.fp64[i];
+  } else {
+    dst.fp32[i] = a.fp64[i - 2];
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500031
+

__m128 __lsx_vfcvth_s_h (__m128i a)

+

Synopsis

+
__m128 __lsx_vfcvth_s_h (__m128i a)
+#include <lsxintrin.h>
+Instruction: vfcvth.s.h vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert half precision floating point elements in higher half of a to single precision.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp32[i] = a.fp16[4 + i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500031
+

__m128 __lsx_vfcvtl_s_h (__m128i a)

+

Synopsis

+
__m128 __lsx_vfcvtl_s_h (__m128i a)
+#include <lsxintrin.h>
+Instruction: vfcvtl.s.h vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert half precision floating point elements in lower half of a to single precision.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp32[i] = a.fp16[i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500031
+

__m128i __lsx_vfcvt_h_s (__m128 a, __m128 b)

+

Synopsis

+
__m128i __lsx_vfcvt_h_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcvt.h.s vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert single precision floating point elements in a and b to half precision.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    dst.fp16[i] = b.fp32[i];
+  } else {
+    dst.fp16[i] = a.fp32[i - 4];
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500031
+

__m128d __lsx_vffinth_d_w (__m128i a)

+

Synopsis

+
__m128d __lsx_vffinth_d_w (__m128i a)
+#include <lsxintrin.h>
+Instruction: vffinth.d.w vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert 32-bit integer elements in higher part of a to double precision floating point numbers.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.fp64[i] = (f64)(s32)a.word[i + 2]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500051
+

__m128d __lsx_vffintl_d_w (__m128i a)

+

Synopsis

+
__m128d __lsx_vffintl_d_w (__m128i a)
+#include <lsxintrin.h>
+Instruction: vffintl.d.w vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert 32-bit integer elements in lower part of a to double precision floating point numbers.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.fp64[i] = (f64)(s32)a.word[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500051
+

__m128d __lsx_vffint_d_l (__m128i a)

+

Synopsis

+
__m128d __lsx_vffint_d_l (__m128i a)
+#include <lsxintrin.h>
+Instruction: vffint.d.l vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert signed 64-bit integer elements in a to double-precision floating point numbers.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.fp64[i] = (f64)(s64)a.dword[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m128d __lsx_vffint_d_lu (__m128i a)

+

Synopsis

+
__m128d __lsx_vffint_d_lu (__m128i a)
+#include <lsxintrin.h>
+Instruction: vffint.d.lu vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert unsigned 64-bit integer elements in a to double-precision floating point numbers.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.fp64[i] = (f64)(u64)a.dword[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m128 __lsx_vffint_s_w (__m128i a)

+

Synopsis

+
__m128 __lsx_vffint_s_w (__m128i a)
+#include <lsxintrin.h>
+Instruction: vffint.s.w vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert signed 32-bit integer elements in a to single-precision floating point numbers.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp32[i] = (f32)(s32)a.word[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m128 __lsx_vffint_s_wu (__m128i a)

+

Synopsis

+
__m128 __lsx_vffint_s_wu (__m128i a)
+#include <lsxintrin.h>
+Instruction: vffint.s.wu vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert unsigned 32-bit integer elements in a to single-precision floating point numbers.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp32[i] = (f32)(u32)a.word[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m128 __lsx_vffint_s_l (__m128i a, __m128i b)

+

Synopsis

+
__m128 __lsx_vffint_s_l (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vffint.s.l vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert 64-bit integer elements in a and b to single-precision floating point numbers.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp32[i] =
+      (i < 2) ? (f32)(s64)b.dword[i]
+              : (f32)(s64)a.dword[i - 2]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500051
+

__m128i __lsx_vftintl_l_s (__m128 a)

+

Synopsis

+
__m128i __lsx_vftintl_l_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftintl.l.s vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert single-precision floating point elements in lower part of a to 64-bit integer, using current rounding mode specified in fcsr.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (s64)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500051
+

__m128i __lsx_vftinth_l_s (__m128 a)

+

Synopsis

+
__m128i __lsx_vftinth_l_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftinth.l.s vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert single-precision floating point elements in higher part of a to 64-bit integer, using current rounding mode specified in fcsr.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (s64)a.fp32[i + 2]; // rounding mode is not expressed in C
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500051
+

__m128i __lsx_vftintrml_l_s (__m128 a)

+

Synopsis

+
__m128i __lsx_vftintrml_l_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftintrml.l.s vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert single-precision floating point elements in lower part of a to 64-bit integer, rounding towards negative infinity.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (s64)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500051
+

__m128i __lsx_vftintrmh_l_s (__m128 a)

+

Synopsis

+
__m128i __lsx_vftintrmh_l_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftintrmh.l.s vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert single-precision floating point elements in higher part of a to 64-bit integer, rounding towards negative infinity.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (s64)a.fp32[i + 2]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500051
+

__m128i __lsx_vftintrpl_l_s (__m128 a)

+

Synopsis

+
__m128i __lsx_vftintrpl_l_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftintrpl.l.s vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert single-precision floating point elements in lower part of a to 64-bit integer, rounding towards positive infinity.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (s64)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500051
+

__m128i __lsx_vftintrph_l_s (__m128 a)

+

Synopsis

+
__m128i __lsx_vftintrph_l_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftintrph.l.s vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert single-precision floating point elements in higher part of a to 64-bit integer, rounding towards positive infinity.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (s64)a.fp32[i + 2]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500051
+

__m128i __lsx_vftintrzl_l_s (__m128 a)

+

Synopsis

+
__m128i __lsx_vftintrzl_l_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftintrzl.l.s vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert single-precision floating point elements in lower part of a to 64-bit integer, rounding towards zero.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (s64)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500051
+

__m128i __lsx_vftintrzh_l_s (__m128 a)

+

Synopsis

+
__m128i __lsx_vftintrzh_l_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftintrzh.l.s vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert single-precision floating point elements in higher part of a to 64-bit integer, rounding towards zero.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (s64)a.fp32[i + 2]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500051
+

__m128i __lsx_vftintrnel_l_s (__m128 a)

+

Synopsis

+
__m128i __lsx_vftintrnel_l_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftintrnel.l.s vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert single-precision floating point elements in lower part of a to 64-bit integer, rounding towards nearest even.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (s64)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500051
+

__m128i __lsx_vftintrneh_l_s (__m128 a)

+

Synopsis

+
__m128i __lsx_vftintrneh_l_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftintrneh.l.s vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert single-precision floating point elements in higher part of a to 64-bit integer, rounding towards nearest even.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (s64)a.fp32[i + 2]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500051
+

__m128i __lsx_vftint_l_d (__m128d a)

+

Synopsis

+
__m128i __lsx_vftint_l_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vftint.l.d vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert double-precision floating point elements in a to signed 64-bit integer, using current rounding mode specified in fcsr.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m128i __lsx_vftint_w_s (__m128 a)

+

Synopsis

+
__m128i __lsx_vftint_w_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftint.w.s vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert single-precision floating point elements in a to signed 32-bit integer, using current rounding mode specified in fcsr.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m128i __lsx_vftintrm_l_d (__m128d a)

+

Synopsis

+
__m128i __lsx_vftintrm_l_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vftintrm.l.d vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert double-precision floating point elements in a to signed 64-bit integer, rounding towards negative infinity.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m128i __lsx_vftintrm_w_s (__m128 a)

+

Synopsis

+
__m128i __lsx_vftintrm_w_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftintrm.w.s vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert single-precision floating point elements in a to signed 32-bit integer, rounding towards negative infinity.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m128i __lsx_vftintrp_l_d (__m128d a)

+

Synopsis

+
__m128i __lsx_vftintrp_l_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vftintrp.l.d vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert double-precision floating point elements in a to signed 64-bit integer, rounding towards positive infinity.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m128i __lsx_vftintrp_w_s (__m128 a)

+

Synopsis

+
__m128i __lsx_vftintrp_w_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftintrp.w.s vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert single-precision floating point elements in a to signed 32-bit integer, rounding towards positive infinity.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m128i __lsx_vftintrz_l_d (__m128d a)

+

Synopsis

+
__m128i __lsx_vftintrz_l_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vftintrz.l.d vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert double-precision floating point elements in a to signed 64-bit integer, rounding towards zero.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m128i __lsx_vftintrz_w_s (__m128 a)

+

Synopsis

+
__m128i __lsx_vftintrz_w_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftintrz.w.s vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert single-precision floating point elements in a to signed 32-bit integer, rounding towards zero.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m128i __lsx_vftintrne_l_d (__m128d a)

+

Synopsis

+
__m128i __lsx_vftintrne_l_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vftintrne.l.d vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert double-precision floating point elements in a to signed 64-bit integer, rounding towards nearest even.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m128i __lsx_vftintrne_w_s (__m128 a)

+

Synopsis

+
__m128i __lsx_vftintrne_w_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftintrne.w.s vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert single-precision floating point elements in a to signed 32-bit integer, rounding towards nearest even.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m128i __lsx_vftint_lu_d (__m128d a)

+

Synopsis

+
__m128i __lsx_vftint_lu_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vftint.lu.d vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert double-precision floating point elements in a to unsigned 64-bit integer, using current rounding mode specified in fcsr.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (u64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m128i __lsx_vftint_wu_s (__m128 a)

+

Synopsis

+
__m128i __lsx_vftint_wu_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftint.wu.s vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert single-precision floating point elements in a to unsigned 32-bit integer, using current rounding mode specified in fcsr.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (u32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m128i __lsx_vftintrz_lu_d (__m128d a)

+

Synopsis

+
__m128i __lsx_vftintrz_lu_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vftintrz.lu.d vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert double-precision floating point elements in a to unsigned 64-bit integer, rounding towards zero.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (u64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m128i __lsx_vftintrz_wu_s (__m128 a)

+

Synopsis

+
__m128i __lsx_vftintrz_wu_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftintrz.wu.s vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert single-precision floating point elements in a to unsigned 32-bit integer, rounding towards zero.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (u32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m128i __lsx_vftint_w_d (__m128d a, __m128d b)

+

Synopsis

+
__m128i __lsx_vftint_w_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vftint.w.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert double-precision floating point elements in a and b to 32-bit integer, using current rounding mode specified in fcsr.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (i < 2)
+                    ? (s32)b.fp64[i]
+                    : (s32)a.fp64[i - 2]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500051
+

__m128i __lsx_vftintrm_w_d (__m128d a, __m128d b)

+

Synopsis

+
__m128i __lsx_vftintrm_w_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vftintrm.w.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert double-precision floating point elements in a and b to 32-bit integer, rounding towards negative infinity.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (i < 2)
+                    ? (s32)b.fp64[i]
+                    : (s32)a.fp64[i - 2]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500051
+

__m128i __lsx_vftintrp_w_d (__m128d a, __m128d b)

+

Synopsis

+
__m128i __lsx_vftintrp_w_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vftintrp.w.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert double-precision floating point elements in a and b to 32-bit integer, rounding towards positive infinity.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (i < 2)
+                    ? (s32)b.fp64[i]
+                    : (s32)a.fp64[i - 2]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500051
+

__m128i __lsx_vftintrz_w_d (__m128d a, __m128d b)

+

Synopsis

+
__m128i __lsx_vftintrz_w_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vftintrz.w.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert double-precision floating point elements in a and b to 32-bit integer, rounding towards zero.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (i < 2)
+                    ? (s32)b.fp64[i]
+                    : (s32)a.fp64[i - 2]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500051
+

__m128i __lsx_vftintrne_w_d (__m128d a, __m128d b)

+

Synopsis

+
__m128i __lsx_vftintrne_w_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vftintrne.w.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert double-precision floating point elements in a and b to 32-bit integer, rounding towards nearest even.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (i < 2)
+                    ? (s32)b.fp64[i]
+                    : (s32)a.fp64[i - 2]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500051
+ +
+
+ +
+
+ +
+ +
+ +
+ + + + « Previous + + + Next » + + +
+ + + + + + + + diff --git a/lsx/float_misc/index.html b/lsx/float_misc/index.html new file mode 100644 index 00000000..b0fe8122 --- /dev/null +++ b/lsx/float_misc/index.html @@ -0,0 +1,775 @@ + + + + + + + + Floating Point Misc - Unofficial LoongArch Intrinsics Guide + + + + + + + + + + + + + + +
+ + +
+ +
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Floating Point Misc

+

__m128i __lsx_vfclass_d (__m128d a)

+

Synopsis

+
__m128i __lsx_vfclass_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vfclass.d vr, vr
+CPU Flags: LSX
+
+

Description

+

Classify each double-precision floating point element in a.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = fp_classify(a.fp64[i]);
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfclass_s (__m128 a)

+

Synopsis

+
__m128i __lsx_vfclass_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vfclass.s vr, vr
+CPU Flags: LSX
+
+

Description

+

Classify each single-precision floating point element in a.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = fp_classify(a.fp32[i]);
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128 __lsx_vfrint_s (__m128 a)

+

Synopsis

+
__m128 __lsx_vfrint_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vfrint.s vr, vr
+CPU Flags: LSX
+
+

Description

+

Round single-precision floating point elements in a to integers, using current rounding mode specified in fcsr, and store as floating point numbers.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m128d __lsx_vfrint_d (__m128d a)

+

Synopsis

+
__m128d __lsx_vfrint_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vfrint.d vr, vr
+CPU Flags: LSX
+
+

Description

+

Round double-precision floating point elements in a to integers, using current rounding mode specified in fcsr, and store as floating point numbers.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m128 __lsx_vfrintrp_s (__m128 a)

+

Synopsis

+
__m128 __lsx_vfrintrp_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vfrintrp.s vr, vr
+CPU Flags: LSX
+
+

Description

+

Round single-precision floating point elements in a to integers, rounding towards positive infinity, and store as floating point numbers.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m128d __lsx_vfrintrp_d (__m128d a)

+

Synopsis

+
__m128d __lsx_vfrintrp_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vfrintrp.d vr, vr
+CPU Flags: LSX
+
+

Description

+

Round double-precision floating point elements in a to integers, rounding towards positive infinity, and store as floating point numbers.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m128 __lsx_vfrintrm_s (__m128 a)

+

Synopsis

+
__m128 __lsx_vfrintrm_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vfrintrm.s vr, vr
+CPU Flags: LSX
+
+

Description

+

Round single-precision floating point elements in a to integers, rounding towards negative infinity, and store as floating point numbers.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m128d __lsx_vfrintrm_d (__m128d a)

+

Synopsis

+
__m128d __lsx_vfrintrm_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vfrintrm.d vr, vr
+CPU Flags: LSX
+
+

Description

+

Round double-precision floating point elements in a to integers, rounding towards negative infinity, and store as floating point numbers.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m128 __lsx_vfrintrz_s (__m128 a)

+

Synopsis

+
__m128 __lsx_vfrintrz_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vfrintrz.s vr, vr
+CPU Flags: LSX
+
+

Description

+

Round single-precision floating point elements in a to integers, rounding towards zero, and store as floating point numbers.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m128d __lsx_vfrintrz_d (__m128d a)

+

Synopsis

+
__m128d __lsx_vfrintrz_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vfrintrz.d vr, vr
+CPU Flags: LSX
+
+

Description

+

Round double-precision floating point elements in a to integers, rounding towards zero, and store as floating point numbers.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m128 __lsx_vfrintrne_s (__m128 a)

+

Synopsis

+
__m128 __lsx_vfrintrne_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vfrintrne.s vr, vr
+CPU Flags: LSX
+
+

Description

+

Round single-precision floating point elements in a to integers, rounding towards nearest even, and store as floating point numbers.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m128d __lsx_vfrintrne_d (__m128d a)

+

Synopsis

+
__m128d __lsx_vfrintrne_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vfrintrne.d vr, vr
+CPU Flags: LSX
+
+

Description

+

Round double-precision floating point elements in a to integers, rounding towards nearest even, and store as floating point numbers.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+ +
+
+ +
+
+ +
+ +
+ +
+ + + + « Previous + + + Next » + + +
+ + + + + + + + diff --git a/lsx/fma/index.html b/lsx/fma/index.html new file mode 100644 index 00000000..e033d461 --- /dev/null +++ b/lsx/fma/index.html @@ -0,0 +1,583 @@ + + + + + + + + Fused Multiply-Add - Unofficial LoongArch Intrinsics Guide + + + + + + + + + + + + + + +
+ + +
+ +
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Fused Multiply-Add

+

__m128d __lsx_vfmadd_d (__m128d a, __m128d b, __m128d c)

+

Synopsis

+
__m128d __lsx_vfmadd_d (__m128d a, __m128d b, __m128d c)
+#include <lsxintrin.h>
+Instruction: vfmadd.d vr, vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in a and b, accumulate to elements in c and store the result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.fp64[i] = a.fp64[i] * b.fp64[i] + c.fp64[i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500052
+

__m128 __lsx_vfmadd_s (__m128 a, __m128 b, __m128 c)

+

Synopsis

+
__m128 __lsx_vfmadd_s (__m128 a, __m128 b, __m128 c)
+#include <lsxintrin.h>
+Instruction: vfmadd.s vr, vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in a and b, accumulate to elements in c and store the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp32[i] = a.fp32[i] * b.fp32[i] + c.fp32[i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500052
+

__m128d __lsx_vfmsub_d (__m128d a, __m128d b, __m128d c)

+

Synopsis

+
__m128d __lsx_vfmsub_d (__m128d a, __m128d b, __m128d c)
+#include <lsxintrin.h>
+Instruction: vfmsub.d vr, vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in a and b, subtract elements in c and store the result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.fp64[i] = a.fp64[i] * b.fp64[i] - c.fp64[i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500052
+

__m128 __lsx_vfmsub_s (__m128 a, __m128 b, __m128 c)

+

Synopsis

+
__m128 __lsx_vfmsub_s (__m128 a, __m128 b, __m128 c)
+#include <lsxintrin.h>
+Instruction: vfmsub.s vr, vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in a and b, subtract elements in c and store the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp32[i] = a.fp32[i] * b.fp32[i] - c.fp32[i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500052
+

__m128d __lsx_vfnmadd_d (__m128d a, __m128d b, __m128d c)

+

Synopsis

+
__m128d __lsx_vfnmadd_d (__m128d a, __m128d b, __m128d c)
+#include <lsxintrin.h>
+Instruction: vfnmadd.d vr, vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in a and b, accumulate to elements in c and store the negated result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.fp64[i] = -(a.fp64[i] * b.fp64[i] + c.fp64[i]);
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500052
+

__m128 __lsx_vfnmadd_s (__m128 a, __m128 b, __m128 c)

+

Synopsis

+
__m128 __lsx_vfnmadd_s (__m128 a, __m128 b, __m128 c)
+#include <lsxintrin.h>
+Instruction: vfnmadd.s vr, vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in a and b, accumulate to elements in c and store the negated result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp32[i] = -(a.fp32[i] * b.fp32[i] + c.fp32[i]);
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500052
+

__m128d __lsx_vfnmsub_d (__m128d a, __m128d b, __m128d c)

+

Synopsis

+
__m128d __lsx_vfnmsub_d (__m128d a, __m128d b, __m128d c)
+#include <lsxintrin.h>
+Instruction: vfnmsub.d vr, vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in a and b, subtract elements in c and store the negated result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.fp64[i] = -(a.fp64[i] * b.fp64[i] - c.fp64[i]);
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500052
+

__m128 __lsx_vfnmsub_s (__m128 a, __m128 b, __m128 c)

+

Synopsis

+
__m128 __lsx_vfnmsub_s (__m128 a, __m128 b, __m128 c)
+#include <lsxintrin.h>
+Instruction: vfnmsub.s vr, vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in a and b, subtract elements in c and store the negated result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp32[i] = -(a.fp32[i] * b.fp32[i] - c.fp32[i]);
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500052
+ +
+
+ +
+
+ +
+ +
+ +
+ + + + « Previous + + + Next » + + +
+ + + + + + + + diff --git a/lsx/integer_comparison/index.html b/lsx/integer_comparison/index.html new file mode 100644 index 00000000..0d136b62 --- /dev/null +++ b/lsx/integer_comparison/index.html @@ -0,0 +1,2159 @@ + + + + + + + + Integer Comparison - Unofficial LoongArch Intrinsics Guide + + + + + + + + + + + + + + +
+ + +
+ +
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Integer Comparison

+

__m128i __lsx_vseq_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vseq_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vseq.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare the 8-bit elements in a and b, store all-ones to dst if equal, zero otherwise.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = (a.byte[i] == b.byte[i]) ? 0xFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vseq_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vseq_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vseq.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare the 16-bit elements in a and b, store all-ones to dst if equal, zero otherwise.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (a.half[i] == b.half[i]) ? 0xFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vseq_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vseq_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vseq.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare the 32-bit elements in a and b, store all-ones to dst if equal, zero otherwise.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (a.word[i] == b.word[i]) ? 0xFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vseq_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vseq_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vseq.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare the 64-bit elements in a and b, store all-ones to dst if equal, zero otherwise.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (a.dword[i] == b.dword[i]) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vseqi_b (__m128i a, imm_n16_15 imm)

+

Synopsis

+
__m128i __lsx_vseqi_b (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vseqi.b vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compare the 8-bit elements in a and imm, store all-ones to dst if equal, zero otherwise.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = ((s8)a.byte[i] == imm) ? 0xFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vseqi_h (__m128i a, imm_n16_15 imm)

+

Synopsis

+
__m128i __lsx_vseqi_h (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vseqi.h vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compare the 16-bit elements in a and imm, store all-ones to dst if equal, zero otherwise.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = ((s16)a.half[i] == imm) ? 0xFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vseqi_w (__m128i a, imm_n16_15 imm)

+

Synopsis

+
__m128i __lsx_vseqi_w (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vseqi.w vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compare the 32-bit elements in a and imm, store all-ones to dst if equal, zero otherwise.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = ((s32)a.word[i] == imm) ? 0xFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vseqi_d (__m128i a, imm_n16_15 imm)

+

Synopsis

+
__m128i __lsx_vseqi_d (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vseqi.d vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compare the 64-bit elements in a and imm, store all-ones to dst if equal, zero otherwise.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = ((s64)a.dword[i] == imm) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vslt_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vslt_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vslt.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare the signed 8-bit elements in a and b, store all-ones to dst if corresponding element in a is less than b, zero otherwise.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = ((s8)a.byte[i] < (s8)b.byte[i]) ? 0xFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vslt_bu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vslt_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vslt.bu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare the unsigned 8-bit elements in a and b, store all-ones to dst if corresponding element in a is less than b, zero otherwise.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = ((u8)a.byte[i] < (u8)b.byte[i]) ? 0xFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vslt_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vslt_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vslt.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare the signed 16-bit elements in a and b, store all-ones to dst if corresponding element in a is less than b, zero otherwise.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = ((s16)a.half[i] < (s16)b.half[i]) ? 0xFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vslt_hu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vslt_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vslt.hu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare the unsigned 16-bit elements in a and b, store all-ones to dst if corresponding element in a is less than b, zero otherwise.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = ((u16)a.half[i] < (u16)b.half[i]) ? 0xFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vslt_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vslt_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vslt.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare the signed 32-bit elements in a and b, store all-ones to dst if corresponding element in a is less than b, zero otherwise.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = ((s32)a.word[i] < (s32)b.word[i]) ? 0xFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vslt_wu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vslt_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vslt.wu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare the unsigned 32-bit elements in a and b, store all-ones to dst if corresponding element in a is less than b, zero otherwise.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = ((u32)a.word[i] < (u32)b.word[i]) ? 0xFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vslt_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vslt_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vslt.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare the signed 64-bit elements in a and b, store all-ones to dst if corresponding element in a is less than b, zero otherwise.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = ((s64)a.dword[i] < (s64)b.dword[i]) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vslt_du (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vslt_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vslt.du vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare the unsigned 64-bit elements in a and b, store all-ones to dst if corresponding element in a is less than b, zero otherwise.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = ((u64)a.dword[i] < (u64)b.dword[i]) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vslti_b (__m128i a, imm_n16_15 imm)

+

Synopsis

+
__m128i __lsx_vslti_b (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vslti.b vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compare the signed 8-bit elements in a and imm, store all-ones to dst if corresponding element in a is less than imm, zero otherwise.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = ((s8)a.byte[i] < imm) ? 0xFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vslti_bu (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vslti_bu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vslti.bu vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compare the unsigned 8-bit elements in a and imm, store all-ones to dst if corresponding element in a is less than imm, zero otherwise.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = ((u8)a.byte[i] < imm) ? 0xFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vslti_h (__m128i a, imm_n16_15 imm)

+

Synopsis

+
__m128i __lsx_vslti_h (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vslti.h vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compare the signed 16-bit elements in a and imm, store all-ones to dst if corresponding element in a is less than imm, zero otherwise.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = ((s16)a.half[i] < imm) ? 0xFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vslti_hu (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vslti_hu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vslti.hu vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compare the unsigned 16-bit elements in a and imm, store all-ones to dst if corresponding element in a is less than imm, zero otherwise.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = ((u16)a.half[i] < imm) ? 0xFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vslti_w (__m128i a, imm_n16_15 imm)

+

Synopsis

+
__m128i __lsx_vslti_w (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vslti.w vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compare the signed 32-bit elements in a and imm, store all-ones to dst if corresponding element in a is less than imm, zero otherwise.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = ((s32)a.word[i] < imm) ? 0xFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vslti_wu (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vslti_wu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vslti.wu vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compare the unsigned 32-bit elements in a and imm, store all-ones to dst if corresponding element in a is less than imm, zero otherwise.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = ((u32)a.word[i] < imm) ? 0xFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vslti_d (__m128i a, imm_n16_15 imm)

+

Synopsis

+
__m128i __lsx_vslti_d (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vslti.d vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compare the signed 64-bit elements in a and imm, store all-ones to dst if corresponding element in a is less than imm, zero otherwise.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = ((s64)a.dword[i] < imm) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vslti_du (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vslti_du (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vslti.du vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compare the unsigned 64-bit elements in a and imm, store all-ones to dst if corresponding element in a is less than imm, zero otherwise.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = ((u64)a.dword[i] < imm) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vsle_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsle_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsle.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare the signed 8-bit elements in a and b, store all-ones to dst if corresponding element in a is less than or equal to b, zero otherwise.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = ((s8)a.byte[i] <= (s8)b.byte[i]) ? 0xFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsle_bu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsle_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsle.bu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare the unsigned 8-bit elements in a and b, store all-ones to dst if corresponding element in a is less than or equal to b, zero otherwise.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = ((u8)a.byte[i] <= (u8)b.byte[i]) ? 0xFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsle_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsle_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsle.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare the signed 16-bit elements in a and b, store all-ones to dst if corresponding element in a is less than or equal to b, zero otherwise.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = ((s16)a.half[i] <= (s16)b.half[i]) ? 0xFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsle_hu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsle_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsle.hu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare the unsigned 16-bit elements in a and b, store all-ones to dst if corresponding element in a is less than or equal to b, zero otherwise.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = ((u16)a.half[i] <= (u16)b.half[i]) ? 0xFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsle_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsle_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsle.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare the signed 32-bit elements in a and b, store all-ones to dst if corresponding element in a is less than or equal to b, zero otherwise.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = ((s32)a.word[i] <= (s32)b.word[i]) ? 0xFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsle_wu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsle_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsle.wu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare the unsigned 32-bit elements in a and b, store all-ones to dst if corresponding element in a is less than or equal to b, zero otherwise.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = ((u32)a.word[i] <= (u32)b.word[i]) ? 0xFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsle_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsle_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsle.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare the signed 64-bit elements in a and b, store all-ones to dst if corresponding element in a is less than or equal to b, zero otherwise.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = ((s64)a.dword[i] <= (s64)b.dword[i]) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vsle_du (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsle_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsle.du vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare the unsigned 64-bit elements in a and b, store all-ones to dst if corresponding element in a is less than or equal to b, zero otherwise.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = ((u64)a.dword[i] <= (u64)b.dword[i]) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vslei_b (__m128i a, imm_n16_15 imm)

+

Synopsis

+
__m128i __lsx_vslei_b (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vslei.b vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compare the signed 8-bit elements in a and imm, store all-ones to dst if corresponding element in a is less than or equal to imm, zero otherwise.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = ((s8)a.byte[i] <= imm) ? 0xFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vslei_bu (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vslei_bu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vslei.bu vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compare the unsigned 8-bit elements in a and imm, store all-ones to dst if corresponding element in a is less than or equal to imm, zero otherwise.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = ((u8)a.byte[i] <= imm) ? 0xFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vslei_h (__m128i a, imm_n16_15 imm)

+

Synopsis

+
__m128i __lsx_vslei_h (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vslei.h vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compare the signed 16-bit elements in a and imm, store all-ones to dst if corresponding element in a is less than or equal to imm, zero otherwise.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = ((s16)a.half[i] <= imm) ? 0xFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vslei_hu (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vslei_hu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vslei.hu vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compare the unsigned 16-bit elements in a and imm, store all-ones to dst if corresponding element in a is less than or equal to imm, zero otherwise.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = ((u16)a.half[i] <= imm) ? 0xFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vslei_w (__m128i a, imm_n16_15 imm)

+

Synopsis

+
__m128i __lsx_vslei_w (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vslei.w vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compare the signed 32-bit elements in a and imm, store all-ones to dst if corresponding element in a is less than or equal to imm, zero otherwise.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = ((s32)a.word[i] <= imm) ? 0xFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vslei_wu (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vslei_wu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vslei.wu vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compare the unsigned 32-bit elements in a and imm, store all-ones to dst if corresponding element in a is less than or equal to imm, zero otherwise.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = ((u32)a.word[i] <= imm) ? 0xFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vslei_d (__m128i a, imm_n16_15 imm)

+

Synopsis

+
__m128i __lsx_vslei_d (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vslei.d vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compare the signed 64-bit elements in a and imm, store all-ones to dst if corresponding element in a is less than or equal to imm, zero otherwise.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = ((s64)a.dword[i] <= imm) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vslei_du (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vslei_du (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vslei.du vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compare the unsigned 64-bit elements in a and imm, store all-ones to dst if corresponding element in a is less than or equal to imm, zero otherwise.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = ((u64)a.dword[i] <= imm) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+ +
+
+ +
+
+ +
+ +
+ +
+ + + + « Previous + + + Next » + + +
+ + + + + + + + diff --git a/lsx/integer_computation/index.html b/lsx/integer_computation/index.html new file mode 100644 index 00000000..eb2b459a --- /dev/null +++ b/lsx/integer_computation/index.html @@ -0,0 +1,11907 @@ + + + + + + + + Integer Computation - Unofficial LoongArch Intrinsics Guide + + + + + + + + + + + + + + +
+ + +
+ +
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Integer Computation

+

__m128i __lsx_vadd_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vadd_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vadd.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add 8-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = a.byte[i] + b.byte[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vadd_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vadd_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vadd.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add 16-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = a.half[i] + b.half[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vadd_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vadd_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vadd.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add 32-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = a.word[i] + b.word[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vadd_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vadd_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vadd.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add 64-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = a.dword[i] + b.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vadd_q (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vadd_q (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vadd.q vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add 128-bit elements in a and b, save the result in dst.

+

Operation

+
dst.qword[0] = a.qword[0] + b.qword[0];
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m128i __lsx_vabsd_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vabsd_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vabsd.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute absolute difference of signed 8-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = ((s8)a.byte[i] > (s8)b.byte[i]) ? (a.byte[i] - b.byte[i])
+                                                : (b.byte[i] - a.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vabsd_bu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vabsd_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vabsd.bu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute absolute difference of unsigned 8-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = ((u8)a.byte[i] > (u8)b.byte[i]) ? (a.byte[i] - b.byte[i])
+                                                : (b.byte[i] - a.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vabsd_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vabsd_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vabsd.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute absolute difference of signed 16-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = ((s16)a.half[i] > (s16)b.half[i]) ? (a.half[i] - b.half[i])
+                                                  : (b.half[i] - a.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vabsd_hu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vabsd_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vabsd.hu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute absolute difference of unsigned 16-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = ((u16)a.half[i] > (u16)b.half[i]) ? (a.half[i] - b.half[i])
+                                                  : (b.half[i] - a.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vabsd_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vabsd_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vabsd.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute absolute difference of signed 32-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = ((s32)a.word[i] > (s32)b.word[i]) ? (a.word[i] - b.word[i])
+                                                  : (b.word[i] - a.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vabsd_wu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vabsd_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vabsd.wu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute absolute difference of unsigned 32-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = ((u32)a.word[i] > (u32)b.word[i]) ? (a.word[i] - b.word[i])
+                                                  : (b.word[i] - a.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vabsd_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vabsd_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vabsd.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute absolute difference of signed 64-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = ((s64)a.dword[i] > (s64)b.dword[i])
+                     ? (a.dword[i] - b.dword[i])
+                     : (b.dword[i] - a.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vabsd_du (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vabsd_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vabsd.du vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute absolute difference of unsigned 64-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = ((u64)a.dword[i] > (u64)b.dword[i])
+                     ? (a.dword[i] - b.dword[i])
+                     : (b.dword[i] - a.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vadda_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vadda_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vadda.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add the absolute values of the signed 8-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = abs((s8)a.byte[i]) + abs((s8)b.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m128i __lsx_vadda_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vadda_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vadda.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add the absolute values of the signed 16-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = abs((s16)a.half[i]) + abs((s16)b.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m128i __lsx_vadda_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vadda_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vadda.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add the absolute values of the signed 32-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = abs((s32)a.word[i]) + abs((s32)b.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m128i __lsx_vadda_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vadda_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vadda.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add the absolute values of the signed 64-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = abs((s64)a.dword[i]) + abs((s64)b.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m128i __lsx_vaddi_bu (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vaddi_bu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vaddi.bu vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Add 8-bit elements in a and imm, save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = a.byte[i] + imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vaddi_hu (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vaddi_hu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vaddi.hu vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Add 16-bit elements in a and imm, save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = a.half[i] + imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vaddi_wu (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vaddi_wu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vaddi.wu vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Add 32-bit elements in a and imm, save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = a.word[i] + imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vaddi_du (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vaddi_du (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vaddi.du vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Add 64-bit elements in a and imm, save the result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = a.dword[i] + imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vaddwev_h_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vaddwev_h_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.h.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add even-positioned signed 8-bit elements in a and signed elements in b, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (s16)(s8)a.byte[2 * i] + (s16)(s8)b.byte[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m128i __lsx_vaddwev_h_bu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vaddwev_h_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.h.bu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add even-positioned unsigned 8-bit elements in a and unsigned elements in b, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (u16)(u8)a.byte[2 * i] + (u16)(u8)b.byte[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m128i __lsx_vaddwev_h_bu_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vaddwev_h_bu_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.h.bu.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add even-positioned unsigned 8-bit elements in a and signed elements in b, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (u16)(u8)a.byte[2 * i] + (s16)(s8)b.byte[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m128i __lsx_vaddwev_w_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vaddwev_w_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.w.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add even-positioned signed 16-bit elements in a and signed elements in b, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (s32)(s16)a.half[2 * i] + (s32)(s16)b.half[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m128i __lsx_vaddwev_w_hu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vaddwev_w_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.w.hu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add even-positioned unsigned 16-bit elements in a and unsigned elements in b, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (u32)(u16)a.half[2 * i] + (u32)(u16)b.half[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m128i __lsx_vaddwev_w_hu_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vaddwev_w_hu_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.w.hu.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add even-positioned unsigned 16-bit elements in a and signed elements in b, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (u32)(u16)a.half[2 * i] + (s32)(s16)b.half[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m128i __lsx_vaddwev_d_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vaddwev_d_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.d.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add even-positioned signed 32-bit elements in a and signed elements in b, save the 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (s64)(s32)a.word[2 * i] + (s64)(s32)b.word[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m128i __lsx_vaddwev_d_wu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vaddwev_d_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.d.wu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add even-positioned unsigned 32-bit elements in a and unsigned elements in b, save the 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 * i] + (u64)(u32)b.word[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m128i __lsx_vaddwev_d_wu_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vaddwev_d_wu_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.d.wu.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add even-positioned unsigned 32-bit elements in a and signed elements in b, save the 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 * i] + (s64)(s32)b.word[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m128i __lsx_vaddwev_q_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vaddwev_q_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.q.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add even-positioned signed 64-bit elements in a and signed elements in b, save the 128-bit result in dst.

+

Operation

+
for (int i = 0; i < 1; i++) {
+  dst.qword[i] = (s128)(s64)a.dword[2 * i] + (s128)(s64)b.dword[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 3 | 2
3C5000 | 3 | 2
+

__m128i __lsx_vaddwev_q_du (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vaddwev_q_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.q.du vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add even-positioned unsigned 64-bit elements in a and unsigned elements in b, save the 128-bit result in dst.

+

Operation

+
for (int i = 0; i < 1; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 * i] + (u128)(u64)b.dword[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 3 | 2
3C5000 | 3 | 2
+

__m128i __lsx_vaddwev_q_du_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vaddwev_q_du_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.q.du.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add even-positioned unsigned 64-bit elements in a and signed elements in b, save the 128-bit result in dst.

+

Operation

+
for (int i = 0; i < 1; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 * i] + (s128)(s64)b.dword[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 3 | 2
3C5000 | 3 | 2
+

__m128i __lsx_vaddwod_h_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vaddwod_h_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwod.h.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add odd-positioned signed 8-bit elements in a and signed elements in b, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (s16)(s8)a.byte[2 * i + 1] + (s16)(s8)b.byte[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m128i __lsx_vaddwod_h_bu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vaddwod_h_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwod.h.bu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add odd-positioned unsigned 8-bit elements in a and unsigned elements in b, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] + (u16)(u8)b.byte[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m128i __lsx_vaddwod_h_bu_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vaddwod_h_bu_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwod.h.bu.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add odd-positioned unsigned 8-bit elements in a and signed elements in b, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] + (s16)(s8)b.byte[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m128i __lsx_vaddwod_w_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vaddwod_w_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwod.w.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add odd-positioned signed 16-bit elements in a and signed elements in b, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (s32)(s16)a.half[2 * i + 1] + (s32)(s16)b.half[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m128i __lsx_vaddwod_w_hu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vaddwod_w_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwod.w.hu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add odd-positioned unsigned 16-bit elements in a and unsigned elements in b, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (u32)(u16)a.half[2 * i + 1] + (u32)(u16)b.half[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m128i __lsx_vaddwod_w_hu_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vaddwod_w_hu_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwod.w.hu.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add odd-positioned unsigned 16-bit elements in a and signed elements in b, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (u32)(u16)a.half[2 * i + 1] + (s32)(s16)b.half[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m128i __lsx_vaddwod_d_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vaddwod_d_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwod.d.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add odd-positioned signed 32-bit elements in a and signed elements in b, save the 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (s64)(s32)a.word[2 * i + 1] + (s64)(s32)b.word[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m128i __lsx_vaddwod_d_wu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vaddwod_d_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwod.d.wu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add odd-positioned unsigned 32-bit elements in a and unsigned elements in b, save the 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] + (u64)(u32)b.word[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m128i __lsx_vaddwod_d_wu_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vaddwod_d_wu_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwod.d.wu.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add odd-positioned unsigned 32-bit elements in a and signed elements in b, save the 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] + (s64)(s32)b.word[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m128i __lsx_vaddwod_q_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vaddwod_q_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwod.q.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add odd-positioned signed 64-bit elements in a and signed elements in b, save the 128-bit result in dst.

+

Operation

+
for (int i = 0; i < 1; i++) {
+  dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] + (s128)(s64)b.dword[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 3 | 2
3C5000 | 3 | 2
+

__m128i __lsx_vaddwod_q_du (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vaddwod_q_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwod.q.du vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add odd-positioned unsigned 64-bit elements in a and unsigned elements in b, save the 128-bit result in dst.

+

Operation

+
for (int i = 0; i < 1; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] + (u128)(u64)b.dword[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 3 | 2
3C5000 | 3 | 2
+

__m128i __lsx_vaddwod_q_du_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vaddwod_q_du_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwod.q.du.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add odd-positioned unsigned 64-bit elements in a and signed elements in b, save the 128-bit result in dst.

+

Operation

+
for (int i = 0; i < 1; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] + (s128)(s64)b.dword[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 3 | 2
3C5000 | 3 | 2
+

__m128i __lsx_vavg_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vavg_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavg.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute the average (rounded towards negative infinity) of signed 8-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = ((s8)a.byte[i] >> 1) + ((s8)b.byte[i] >> 1) +
+                ((a.byte[i] & b.byte[i]) & 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vavg_bu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vavg_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavg.bu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute the average (rounded towards negative infinity) of unsigned 8-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = ((u8)a.byte[i] >> 1) + ((u8)b.byte[i] >> 1) +
+                ((a.byte[i] & b.byte[i]) & 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vavg_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vavg_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavg.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute the average (rounded towards negative infinity) of signed 16-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = ((s16)a.half[i] >> 1) + ((s16)b.half[i] >> 1) +
+                ((a.half[i] & b.half[i]) & 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vavg_hu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vavg_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavg.hu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute the average (rounded towards negative infinity) of unsigned 16-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = ((u16)a.half[i] >> 1) + ((u16)b.half[i] >> 1) +
+                ((a.half[i] & b.half[i]) & 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vavg_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vavg_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavg.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute the average (rounded towards negative infinity) of signed 32-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = ((s32)a.word[i] >> 1) + ((s32)b.word[i] >> 1) +
+                ((a.word[i] & b.word[i]) & 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vavg_wu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vavg_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavg.wu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute the average (rounded towards negative infinity) of unsigned 32-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = ((u32)a.word[i] >> 1) + ((u32)b.word[i] >> 1) +
+                ((a.word[i] & b.word[i]) & 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vavg_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vavg_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavg.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute the average (rounded towards negative infinity) of signed 64-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = ((s64)a.dword[i] >> 1) + ((s64)b.dword[i] >> 1) +
+                 ((a.dword[i] & b.dword[i]) & 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 2 | 4
3C5000 | 2 | 2
+

__m128i __lsx_vavg_du (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vavg_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavg.du vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute the average (rounded towards negative infinity) of unsigned 64-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = ((u64)a.dword[i] >> 1) + ((u64)b.dword[i] >> 1) +
+                 ((a.dword[i] & b.dword[i]) & 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 2 | 4
3C5000 | 2 | 2
+

__m128i __lsx_vavgr_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vavgr_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavgr.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute the average (rounded towards positive infinity) of signed 8-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = ((s8)a.byte[i] >> 1) + ((s8)b.byte[i] >> 1) +
+                ((a.byte[i] | b.byte[i]) & 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vavgr_bu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vavgr_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavgr.bu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute the average (rounded towards positive infinity) of unsigned 8-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = ((u8)a.byte[i] >> 1) + ((u8)b.byte[i] >> 1) +
+                ((a.byte[i] | b.byte[i]) & 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vavgr_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vavgr_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavgr.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute the average (rounded towards positive infinity) of signed 16-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = ((s16)a.half[i] >> 1) + ((s16)b.half[i] >> 1) +
+                ((a.half[i] | b.half[i]) & 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vavgr_hu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vavgr_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavgr.hu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute the average (rounded towards positive infinity) of unsigned 16-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = ((u16)a.half[i] >> 1) + ((u16)b.half[i] >> 1) +
+                ((a.half[i] | b.half[i]) & 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vavgr_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vavgr_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavgr.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute the average (rounded towards positive infinity) of signed 32-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = ((s32)a.word[i] >> 1) + ((s32)b.word[i] >> 1) +
+                ((a.word[i] | b.word[i]) & 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vavgr_wu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vavgr_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavgr.wu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute the average (rounded towards positive infinity) of unsigned 32-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = ((u32)a.word[i] >> 1) + ((u32)b.word[i] >> 1) +
+                ((a.word[i] | b.word[i]) & 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vavgr_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vavgr_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavgr.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute the average (rounded towards positive infinity) of signed 64-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = ((s64)a.dword[i] >> 1) + ((s64)b.dword[i] >> 1) +
+                 ((a.dword[i] | b.dword[i]) & 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 2 | 4
3C5000 | 2 | 2
+

__m128i __lsx_vavgr_du (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vavgr_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavgr.du vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute the average (rounded towards positive infinity) of unsigned 64-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = ((u64)a.dword[i] >> 1) + ((u64)b.dword[i] >> 1) +
+                 ((a.dword[i] | b.dword[i]) & 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 2 | 4
3C5000 | 2 | 2
+

__m128i __lsx_vdiv_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vdiv_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vdiv.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Divide signed 8-bit elements in a by the corresponding elements in b, save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = (b.byte[i] == 0) ? 0 : ((s8)a.byte[i] / (s8)b.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 29, 32 | 0.06(1/15.5)
3C5000 | 29, 32 | 0.06(1/17)
+

__m128i __lsx_vdiv_bu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vdiv_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vdiv.bu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Divide unsigned 8-bit elements in a by the corresponding elements in b, save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = (b.byte[i] == 0) ? 0 : ((u8)a.byte[i] / (u8)b.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 29, 33 | 0.06(1/16.5)
3C5000 | 29, 36 | 0.06(1/18)
+

__m128i __lsx_vdiv_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vdiv_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vdiv.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Divide signed 16-bit elements in a by the corresponding elements in b, save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (b.half[i] == 0) ? 0 : ((s16)a.half[i] / (s16)b.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 17 | 0.12(1/8.5)
3C5000 | 17, 21.5 | 0.09(1/11)
+

__m128i __lsx_vdiv_hu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vdiv_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vdiv.hu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Divide unsigned 16-bit elements in a by the corresponding elements in b, save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (b.half[i] == 0) ? 0 : ((u16)a.half[i] / (u16)b.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 17, 22 | 0.11(1/9)
3C5000 | 17, 21.5 | 0.07(1/14)
+

__m128i __lsx_vdiv_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vdiv_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vdiv.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Divide signed 32-bit elements in a by the corresponding elements in b, save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (b.word[i] == 0) ? 0 : ((s32)a.word[i] / (s32)b.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 11 | 0.18(1/5.5)
3C5000 | 11, 17.5 | 0.09(1/11.5)
+

__m128i __lsx_vdiv_wu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vdiv_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vdiv.wu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Divide unsigned 32-bit elements in a by the corresponding elements in b, save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (b.word[i] == 0) ? 0 : ((u32)a.word[i] / (u32)b.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 11 | 0.18(1/5.5)
3C5000 | 11, 17.5 | 0.07(1/15)
+

__m128i __lsx_vdiv_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vdiv_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vdiv.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Divide signed 64-bit elements in a by the corresponding elements in b, save the result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (b.dword[i] == 0) ? 0 : ((s64)a.dword[i] / (s64)b.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 8 | 0.25(1/4)
3C5000 | 8, 18.5 | 0.11(1/9)
+

__m128i __lsx_vdiv_du (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vdiv_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vdiv.du vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Divide unsigned 64-bit elements in a by the corresponding elements in b, save the result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (b.dword[i] == 0) ? 0 : ((u64)a.dword[i] / (u64)b.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 8 | 0.25(1/4)
3C5000 | 8, 18.5 | 0.11(1/9)
+

__m128i __lsx_vhaddw_h_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vhaddw_h_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhaddw.h.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add odd-positioned signed 8-bit elements in a to even-positioned signed 8-bit elements in b, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (s16)(s8)a.byte[2 * i + 1] + (s16)(s8)b.byte[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m128i __lsx_vhaddw_hu_bu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vhaddw_hu_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhaddw.hu.bu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add odd-positioned unsigned 8-bit elements in a to even-positioned unsigned 8-bit elements in b, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] + (u16)(u8)b.byte[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m128i __lsx_vhaddw_w_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vhaddw_w_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhaddw.w.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add odd-positioned signed 16-bit elements in a to even-positioned signed 16-bit elements in b, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (s32)(s16)a.half[2 * i + 1] + (s32)(s16)b.half[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m128i __lsx_vhaddw_wu_hu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vhaddw_wu_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhaddw.wu.hu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add odd-positioned unsigned 16-bit elements in a to even-positioned unsigned 16-bit elements in b, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (u32)(u16)a.half[2 * i + 1] + (u32)(u16)b.half[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m128i __lsx_vhaddw_d_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vhaddw_d_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhaddw.d.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add odd-positioned signed 32-bit elements in a to even-positioned signed 32-bit elements in b, save the 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (s64)(s32)a.word[2 * i + 1] + (s64)(s32)b.word[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m128i __lsx_vhaddw_du_wu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vhaddw_du_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhaddw.du.wu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add odd-positioned unsigned 32-bit elements in a to even-positioned unsigned 32-bit elements in b, save the 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] + (u64)(u32)b.word[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m128i __lsx_vhaddw_q_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vhaddw_q_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhaddw.q.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add odd-positioned signed 64-bit elements in a to even-positioned signed 64-bit elements in b to get 128-bit result.

+

Operation

+
for (int i = 0; i < 1; i++) {
+  dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] + (s128)(s64)b.dword[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 3 | 2
3C5000 | 3 | 2
+

__m128i __lsx_vhaddw_qu_du (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vhaddw_qu_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhaddw.qu.du vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add odd-positioned unsigned 64-bit elements in a to even-positioned unsigned 64-bit elements in b to get 128-bit result.

+

Operation

+
for (int i = 0; i < 1; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] + (u128)(u64)b.dword[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 3 | 2
3C5000 | 3 | 2
+

__m128i __lsx_vhsubw_h_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vhsubw_h_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhsubw.h.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Subtract even-positioned signed 8-bit elements in b from odd-positioned signed 8-bit elements in a to get 16-bit result.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (s16)(s8)a.byte[2 * i + 1] - (s16)(s8)b.byte[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m128i __lsx_vhsubw_hu_bu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vhsubw_hu_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhsubw.hu.bu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Subtract even-positioned unsigned 8-bit elements in b from odd-positioned unsigned 8-bit elements in a to get 16-bit result.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] - (u16)(u8)b.byte[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m128i __lsx_vhsubw_w_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vhsubw_w_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhsubw.w.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Subtract even-positioned signed 16-bit elements in b from odd-positioned signed 16-bit elements in a to get 32-bit result.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (s32)(s16)a.half[2 * i + 1] - (s32)(s16)b.half[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m128i __lsx_vhsubw_wu_hu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vhsubw_wu_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhsubw.wu.hu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Subtract even-positioned unsigned 16-bit elements in b from odd-positioned unsigned 16-bit elements in a to get 32-bit result.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (u32)(u16)a.half[2 * i + 1] - (u32)(u16)b.half[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m128i __lsx_vhsubw_d_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vhsubw_d_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhsubw.d.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Subtract even-positioned signed 32-bit elements in b from odd-positioned signed 32-bit elements in a to get 64-bit result.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (s64)(s32)a.word[2 * i + 1] - (s64)(s32)b.word[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m128i __lsx_vhsubw_du_wu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vhsubw_du_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhsubw.du.wu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Subtract even-positioned unsigned 32-bit elements in b from odd-positioned unsigned 32-bit elements in a to get 64-bit result.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] - (u64)(u32)b.word[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m128i __lsx_vhsubw_q_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vhsubw_q_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhsubw.q.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Subtract even-positioned signed 64-bit elements in b from odd-positioned signed 64-bit elements in a to get 128-bit result.

+

Operation

+
for (int i = 0; i < 1; i++) {
+  dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] - (s128)(s64)b.dword[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 3 | 2
3C5000 | 3 | 2
+

__m128i __lsx_vhsubw_qu_du (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vhsubw_qu_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhsubw.qu.du vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Subtract even-positioned unsigned 64-bit elements in b from odd-positioned unsigned 64-bit elements in a to get 128-bit result.

+

Operation

+
for (int i = 0; i < 1; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] - (u128)(u64)b.dword[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 3 | 2
3C5000 | 3 | 2
+

__m128i __lsx_vmadd_b (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vmadd_b (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmadd.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply 8-bit elements in b and c, add to elements in a, save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = b.byte[i] * c.byte[i] + a.byte[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmadd_h (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vmadd_h (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmadd.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply 16-bit elements in b and c, add to elements in a, save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = b.half[i] * c.half[i] + a.half[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmadd_w (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vmadd_w (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmadd.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply 32-bit elements in b and c, add to elements in a, save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = b.word[i] * c.word[i] + a.word[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmadd_d (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vmadd_d (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmadd.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply 64-bit elements in b and c, add to elements in a, save the result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = b.dword[i] * c.dword[i] + a.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmaddwev_h_b (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vmaddwev_h_b (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwev.h.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply even-positioned signed 8-bit elements in b and signed elements in c, add to 16-bit elements in a.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] =
+      (s16)(s8)b.byte[2 * i] * (s16)(s8)c.byte[2 * i] + (s16)a.half[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmaddwev_h_bu (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vmaddwev_h_bu (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwev.h.bu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply even-positioned unsigned 8-bit elements in b and unsigned elements in c, add to 16-bit elements in a.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] =
+      (u16)(u8)b.byte[2 * i] * (u16)(u8)c.byte[2 * i] + (u16)a.half[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmaddwev_h_bu_b (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vmaddwev_h_bu_b (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwev.h.bu.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply even-positioned unsigned 8-bit elements in b and signed elements in c, add to 16-bit elements in a.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] =
+      (u16)(u8)b.byte[2 * i] * (s16)(s8)c.byte[2 * i] + (s16)a.half[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmaddwev_w_h (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vmaddwev_w_h (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwev.w.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply even-positioned signed 16-bit elements in b and signed elements in c, add to 32-bit elements in a.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] =
+      (s32)(s16)b.half[2 * i] * (s32)(s16)c.half[2 * i] + (s32)a.word[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmaddwev_w_hu (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vmaddwev_w_hu (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwev.w.hu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply even-positioned unsigned 16-bit elements in b and unsigned elements in c, add to 32-bit elements in a.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] =
+      (u32)(u16)b.half[2 * i] * (u32)(u16)c.half[2 * i] + (u32)a.word[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmaddwev_w_hu_h (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vmaddwev_w_hu_h (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwev.w.hu.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply even-positioned unsigned 16-bit elements in b and signed elements in c, add to 32-bit elements in a.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] =
+      (u32)(u16)b.half[2 * i] * (s32)(s16)c.half[2 * i] + (s32)a.word[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmaddwev_d_w (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vmaddwev_d_w (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwev.d.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply even-positioned signed 32-bit elements in b and signed elements in c, add to 64-bit elements in a.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] =
+      (s64)(s32)b.word[2 * i] * (s64)(s32)c.word[2 * i] + (s64)a.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmaddwev_d_wu (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vmaddwev_d_wu (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwev.d.wu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply even-positioned unsigned 32-bit elements in b and unsigned elements in c, add to 64-bit elements in a.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] =
+      (u64)(u32)b.word[2 * i] * (u64)(u32)c.word[2 * i] + (u64)a.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmaddwev_d_wu_w (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vmaddwev_d_wu_w (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwev.d.wu.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply even-positioned unsigned 32-bit elements in b and signed elements in c, add to 64-bit elements in a.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] =
+      (u64)(u32)b.word[2 * i] * (s64)(s32)c.word[2 * i] + (s64)a.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmaddwev_q_d (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vmaddwev_q_d (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwev.q.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply even-positioned signed 64-bit elements in b and signed elements in c, add to 128-bit elements in a.

+

Operation

+
for (int i = 0; i < 1; i++) {
+  dst.qword[i] =
+      (s128)(s64)b.dword[2 * i] * (s128)(s64)c.dword[2 * i] + (s128)a.qword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 7 | 1.14
3C5000 | 7 | 1.14
+

__m128i __lsx_vmaddwev_q_du (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vmaddwev_q_du (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwev.q.du vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply even-positioned unsigned 64-bit elements in b and unsigned elements in c, add to 128-bit elements in a.

+

Operation

+
for (int i = 0; i < 1; i++) {
+  dst.qword[i] =
+      (u128)(u64)b.dword[2 * i] * (u128)(u64)c.dword[2 * i] + (u128)a.qword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 7 | 1.14
3C5000 | 7 | 1.14
+

__m128i __lsx_vmaddwev_q_du_d (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vmaddwev_q_du_d (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwev.q.du.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply even-positioned unsigned 64-bit elements in b and signed elements in c, add to 128-bit elements in a.

+

Operation

+
for (int i = 0; i < 1; i++) {
+  dst.qword[i] =
+      (u128)(u64)b.dword[2 * i] * (s128)(s64)c.dword[2 * i] + (s128)a.qword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 7 | 1.14
3C5000 | 7 | 1.14
+

__m128i __lsx_vmaddwod_h_b (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vmaddwod_h_b (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwod.h.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply odd-positioned signed 8-bit elements in b and signed elements in c, add to 16-bit elements in a.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] =
+      (s16)(s8)b.byte[2 * i + 1] * (s16)(s8)c.byte[2 * i + 1] + (s16)a.half[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmaddwod_h_bu (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vmaddwod_h_bu (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwod.h.bu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply odd-positioned unsigned 8-bit elements in b and unsigned elements in c, add to 16-bit elements in a.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] =
+      (u16)(u8)b.byte[2 * i + 1] * (u16)(u8)c.byte[2 * i + 1] + (u16)a.half[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmaddwod_h_bu_b (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vmaddwod_h_bu_b (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwod.h.bu.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply odd-positioned unsigned 8-bit elements in b and signed elements in c, add to 16-bit elements in a.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] =
+      (u16)(u8)b.byte[2 * i + 1] * (s16)(s8)c.byte[2 * i + 1] + (s16)a.half[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmaddwod_w_h (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vmaddwod_w_h (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwod.w.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply odd-positioned signed 16-bit elements in b and signed elements in c, add to 32-bit elements in a.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (s32)(s16)b.half[2 * i + 1] * (s32)(s16)c.half[2 * i + 1] +
+                (s32)a.word[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmaddwod_w_hu (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vmaddwod_w_hu (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwod.w.hu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply odd-positioned unsigned 16-bit elements in b and unsigned elements in c, add to 32-bit elements in a.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (u32)(u16)b.half[2 * i + 1] * (u32)(u16)c.half[2 * i + 1] +
+                (u32)a.word[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmaddwod_w_hu_h (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vmaddwod_w_hu_h (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwod.w.hu.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply odd-positioned unsigned 16-bit elements in b and signed elements in c, add to 32-bit elements in a.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (u32)(u16)b.half[2 * i + 1] * (s32)(s16)c.half[2 * i + 1] +
+                (s32)a.word[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmaddwod_d_w (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vmaddwod_d_w (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwod.d.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply odd-positioned signed 32-bit elements in b and signed elements in c, add to 64-bit elements in a.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (s64)(s32)b.word[2 * i + 1] * (s64)(s32)c.word[2 * i + 1] +
+                 (s64)a.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmaddwod_d_wu (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vmaddwod_d_wu (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwod.d.wu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply odd-positioned unsigned 32-bit elements in b and unsigned elements in c, add to 64-bit elements in a.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (u64)(u32)b.word[2 * i + 1] * (u64)(u32)c.word[2 * i + 1] +
+                 (u64)a.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmaddwod_d_wu_w (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vmaddwod_d_wu_w (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwod.d.wu.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply odd-positioned unsigned 32-bit elements in b and signed elements in c, add to 64-bit elements in a.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (u64)(u32)b.word[2 * i + 1] * (s64)(s32)c.word[2 * i + 1] +
+                 (s64)a.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmaddwod_q_d (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vmaddwod_q_d (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwod.q.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply odd-positioned signed 64-bit elements in b and signed elements in c, add to 128-bit elements in a.

+

Operation

+
for (int i = 0; i < 1; i++) {
+  dst.qword[i] = (s128)(s64)b.dword[2 * i + 1] * (s128)(s64)c.dword[2 * i + 1] +
+                 (s128)a.qword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 7 | 1.14
3C5000 | 7 | 1.14
+

__m128i __lsx_vmaddwod_q_du (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vmaddwod_q_du (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwod.q.du vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply odd-positioned unsigned 64-bit elements in b and unsigned elements in c, add to 128-bit elements in a.

+

Operation

+
for (int i = 0; i < 1; i++) {
+  dst.qword[i] = (u128)(u64)b.dword[2 * i + 1] * (u128)(u64)c.dword[2 * i + 1] +
+                 (u128)a.qword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 7 | 1.14
3C5000 | 7 | 1.14
+

__m128i __lsx_vmaddwod_q_du_d (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vmaddwod_q_du_d (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwod.q.du.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply odd-positioned unsigned 64-bit elements in b and signed elements in c, add to 128-bit elements in a.

+

Operation

+
for (int i = 0; i < 1; i++) {
+  dst.qword[i] = (u128)(u64)b.dword[2 * i + 1] * (s128)(s64)c.dword[2 * i + 1] +
+                 (s128)a.qword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 7 | 1.14
3C5000 | 7 | 1.14
+

__m128i __lsx_vmax_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmax_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmax.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute elementwise maximum for signed 8-bit elements in a and b.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = max((s8)a.byte[i], (s8)b.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vmax_bu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmax_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmax.bu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute elementwise maximum for unsigned 8-bit elements in a and b.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = max((u8)a.byte[i], (u8)b.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vmax_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmax_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmax.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute elementwise maximum for signed 16-bit elements in a and b.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = max((s16)a.half[i], (s16)b.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vmax_hu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmax_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmax.hu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute elementwise maximum for unsigned 16-bit elements in a and b.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = max((u16)a.half[i], (u16)b.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vmax_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmax_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmax.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute elementwise maximum for signed 32-bit elements in a and b.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = max((s32)a.word[i], (s32)b.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vmax_wu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmax_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmax.wu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute elementwise maximum for unsigned 32-bit elements in a and b.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = max((u32)a.word[i], (u32)b.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vmax_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmax_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmax.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute elementwise maximum for signed 64-bit elements in a and b.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = max((s64)a.dword[i], (s64)b.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 2 | 4
3C5000 | 2 | 2
+

__m128i __lsx_vmax_du (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmax_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmax.du vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute elementwise maximum for unsigned 64-bit elements in a and b.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = max((u64)a.dword[i], (u64)b.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 2 | 4
3C5000 | 2 | 2
+

__m128i __lsx_vmaxi_b (__m128i a, imm_n16_15 imm)

+

Synopsis

+
__m128i __lsx_vmaxi_b (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vmaxi.b vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compute elementwise maximum for signed 8-bit elements in a and imm.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = max((s8)a.byte[i], (s8)imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vmaxi_bu (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vmaxi_bu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vmaxi.bu vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compute elementwise maximum for unsigned 8-bit elements in a and imm.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = max((u8)a.byte[i], (u8)imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vmaxi_h (__m128i a, imm_n16_15 imm)

+

Synopsis

+
__m128i __lsx_vmaxi_h (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vmaxi.h vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compute elementwise maximum for signed 16-bit elements in a and imm.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = max((s16)a.half[i], (s16)imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vmaxi_hu (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vmaxi_hu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vmaxi.hu vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compute elementwise maximum for unsigned 16-bit elements in a and imm.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = max((u16)a.half[i], (u16)imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vmaxi_w (__m128i a, imm_n16_15 imm)

+

Synopsis

+
__m128i __lsx_vmaxi_w (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vmaxi.w vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compute elementwise maximum for signed 32-bit elements in a and imm.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = max((s32)a.word[i], (s32)imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vmaxi_wu (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vmaxi_wu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vmaxi.wu vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compute elementwise maximum for unsigned 32-bit elements in a and imm.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = max((u32)a.word[i], (u32)imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vmaxi_d (__m128i a, imm_n16_15 imm)

+

Synopsis

+
__m128i __lsx_vmaxi_d (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vmaxi.d vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compute elementwise maximum for signed 64-bit elements in a and imm.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = max((s64)a.dword[i], (s64)imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 2 | 4
3C5000 | 2 | 2
+

__m128i __lsx_vmaxi_du (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vmaxi_du (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vmaxi.du vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compute elementwise maximum for unsigned 64-bit elements in a and imm.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = max((u64)a.dword[i], (u64)imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 2 | 4
3C5000 | 2 | 2
+

__m128i __lsx_vmin_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmin_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmin.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute elementwise minimum for signed 8-bit elements in a and b.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = min((s8)a.byte[i], (s8)b.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vmin_bu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmin_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmin.bu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute elementwise minimum for unsigned 8-bit elements in a and b.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = min((u8)a.byte[i], (u8)b.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vmin_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmin_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmin.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute elementwise minimum for signed 16-bit elements in a and b.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = min((s16)a.half[i], (s16)b.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vmin_hu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmin_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmin.hu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute elementwise minimum for unsigned 16-bit elements in a and b.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = min((u16)a.half[i], (u16)b.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vmin_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmin_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmin.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute elementwise minimum for signed 32-bit elements in a and b.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = min((s32)a.word[i], (s32)b.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vmin_wu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmin_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmin.wu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute elementwise minimum for unsigned 32-bit elements in a and b.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = min((u32)a.word[i], (u32)b.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vmin_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmin_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmin.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute elementwise minimum for signed 64-bit elements in a and b.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = min((s64)a.dword[i], (s64)b.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 2 | 4
3C5000 | 2 | 2
+

__m128i __lsx_vmin_du (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmin_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmin.du vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute elementwise minimum for unsigned 64-bit elements in a and b.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = min((u64)a.dword[i], (u64)b.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 2 | 4
3C5000 | 2 | 2
+

__m128i __lsx_vmini_b (__m128i a, imm_n16_15 imm)

+

Synopsis

+
__m128i __lsx_vmini_b (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vmini.b vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compute elementwise minimum for signed 8-bit elements in a and imm.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = min((s8)a.byte[i], (s8)imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vmini_bu (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vmini_bu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vmini.bu vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compute elementwise minimum for unsigned 8-bit elements in a and imm.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = min((u8)a.byte[i], (u8)imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vmini_h (__m128i a, imm_n16_15 imm)

+

Synopsis

+
__m128i __lsx_vmini_h (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vmini.h vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compute elementwise minimum for signed 16-bit elements in a and imm.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = min((s16)a.half[i], (s16)imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vmini_hu (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vmini_hu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vmini.hu vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compute elementwise minimum for unsigned 16-bit elements in a and imm.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = min((u16)a.half[i], (u16)imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vmini_w (__m128i a, imm_n16_15 imm)

+

Synopsis

+
__m128i __lsx_vmini_w (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vmini.w vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compute elementwise minimum for signed 32-bit elements in a and imm.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = min((s32)a.word[i], (s32)imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vmini_wu (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vmini_wu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vmini.wu vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compute elementwise minimum for unsigned 32-bit elements in a and imm.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = min((u32)a.word[i], (u32)imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vmini_d (__m128i a, imm_n16_15 imm)

+

Synopsis

+
__m128i __lsx_vmini_d (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vmini.d vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compute elementwise minimum for signed 64-bit elements in a and imm.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = min((s64)a.dword[i], (s64)imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 2 | 4
3C5000 | 2 | 2
+

__m128i __lsx_vmini_du (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vmini_du (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vmini.du vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compute elementwise minimum for unsigned 64-bit elements in a and imm.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = min((u64)a.dword[i], (u64)imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 2 | 4
3C5000 | 2 | 2
+

__m128i __lsx_vmod_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmod_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmod.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute the remainder of dividing signed 8-bit elements in a by elements in b.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = (b.byte[i] == 0) ? 0 : ((s8)a.byte[i] % (s8)b.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 29, 35 | 0.06(1/15.5)
3C5000 | 29, 33 | 0.06(1/17)
+

__m128i __lsx_vmod_bu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmod_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmod.bu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute the remainder of dividing unsigned 8-bit elements in a by elements in b.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = (b.byte[i] == 0) ? 0 : ((u8)a.byte[i] % (u8)b.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 29, 37 | 0.06(1/17.5)
3C5000 | 29, 33 | 0.05(1/19)
+

__m128i __lsx_vmod_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmod_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmod.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute the remainder of dividing signed 16-bit elements in a by elements in b.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (b.half[i] == 0) ? 0 : ((s16)a.half[i] % (s16)b.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 17, 21 | 0.12(1/8.5)
3C5000 | 17, 21 | 0.09(1/11)
+

__m128i __lsx_vmod_hu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmod_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmod.hu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute the remainder of dividing unsigned 16-bit elements in a by elements in b.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (b.half[i] == 0) ? 0 : ((u16)a.half[i] % (u16)b.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 17, 21 | 0.11(1/9.5)
3C5000 | 17, 21 | 0.07(1/15)
+

__m128i __lsx_vmod_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmod_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmod.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute the remainder of dividing signed 32-bit elements in a by elements in b.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (b.word[i] == 0) ? 0 : ((s32)a.word[i] % (s32)b.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 11, 13 | 0.18(1/5.5)
3C5000 | 11, 15 | 0.08(1/12)
+

__m128i __lsx_vmod_wu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmod_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmod.wu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute the remainder of dividing unsigned 32-bit elements in a by elements in b.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (b.word[i] == 0) ? 0 : ((u32)a.word[i] % (u32)b.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 11, 13 | 0.18(1/5.5)
3C5000 | 11, 15 | 0.06(1/16)
+

__m128i __lsx_vmod_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmod_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmod.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute the remainder of dividing signed 64-bit elements in a by elements in b.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (b.dword[i] == 0) ? 0 : ((s64)a.dword[i] % (s64)b.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 8, 10 | 0.25(1/4)
3C5000 | 8, 10 | 0.11(1/9.5)
+

__m128i __lsx_vmod_du (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmod_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmod.du vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute the remainder of dividing unsigned 64-bit elements in a by elements in b.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (b.dword[i] == 0) ? 0 : ((u64)a.dword[i] % (u64)b.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 8, 10 | 0.25(1/4)
3C5000 | 8, 10 | 0.11(1/9.5)
+

__m128i __lsx_vmsub_b (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vmsub_b (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmsub.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply 8-bit elements in b and c, subtract the product from elements in a, and save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = -b.byte[i] * c.byte[i] + a.byte[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmsub_h (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vmsub_h (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmsub.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply 16-bit elements in b and c, subtract the product from elements in a, and save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = -b.half[i] * c.half[i] + a.half[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmsub_w (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vmsub_w (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmsub.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply 32-bit elements in b and c, subtract the product from elements in a, and save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = -b.word[i] * c.word[i] + a.word[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmsub_d (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vmsub_d (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmsub.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply 64-bit elements in b and c, subtract the product from elements in a, and save the result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = -b.dword[i] * c.dword[i] + a.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmuh_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmuh_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmuh.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply signed 8-bit elements in a and b, save the high 8-bit result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = (((s16)(s8)a.byte[i] * (s16)(s8)b.byte[i])) >> 8;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmuh_bu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmuh_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmuh.bu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply unsigned 8-bit elements in a and b, save the high 8-bit result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = (((u16)(u8)a.byte[i] * (u16)(u8)b.byte[i])) >> 8;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmuh_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmuh_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmuh.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply signed 16-bit elements in a and b, save the high 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (((s32)(s16)a.half[i] * (s32)(s16)b.half[i])) >> 16;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmuh_hu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmuh_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmuh.hu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply unsigned 16-bit elements in a and b, save the high 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (((u32)(u16)a.half[i] * (u32)(u16)b.half[i])) >> 16;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmuh_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmuh_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmuh.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply signed 32-bit elements in a and b, save the high 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (((s64)(s32)a.word[i] * (s64)(s32)b.word[i])) >> 32;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmuh_wu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmuh_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmuh.wu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply unsigned 32-bit elements in a and b, save the high 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (((u64)(u32)a.word[i] * (u64)(u32)b.word[i])) >> 32;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmuh_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmuh_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmuh.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply signed 64-bit elements in a and b, save the high 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (((s128)(s64)a.dword[i] * (s128)(s64)b.dword[i])) >> 64;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmuh_du (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmuh_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmuh.du vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply unsigned 64-bit elements in a and b, save the high 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (((u128)(u64)a.dword[i] * (u128)(u64)b.dword[i])) >> 64;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmul_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmul_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmul.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply 8-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = a.byte[i] * b.byte[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmul_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmul_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmul.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply 16-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = a.half[i] * b.half[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmul_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmul_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmul.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply 32-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = a.word[i] * b.word[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmul_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmul_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmul.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply 64-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = a.dword[i] * b.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmulwev_h_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmulwev_h_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwev.h.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply even-positioned signed 8-bit elements in a and signed elements in b, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (s16)(s8)a.byte[2 * i] * (s16)(s8)b.byte[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmulwev_h_bu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmulwev_h_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwev.h.bu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply even-positioned unsigned 8-bit elements in a and unsigned elements in b, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (u16)(u8)a.byte[2 * i] * (u16)(u8)b.byte[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmulwev_h_bu_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmulwev_h_bu_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwev.h.bu.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply even-positioned unsigned 8-bit elements in a and signed elements in b, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (u16)(u8)a.byte[2 * i] * (s16)(s8)b.byte[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmulwev_w_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmulwev_w_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwev.w.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply even-positioned signed 16-bit elements in a and signed elements in b, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (s32)(s16)a.half[2 * i] * (s32)(s16)b.half[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmulwev_w_hu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmulwev_w_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwev.w.hu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply even-positioned unsigned 16-bit elements in a and unsigned elements in b, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (u32)(u16)a.half[2 * i] * (u32)(u16)b.half[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmulwev_w_hu_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmulwev_w_hu_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwev.w.hu.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply even-positioned unsigned 16-bit elements in a and signed elements in b, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (u32)(u16)a.half[2 * i] * (s32)(s16)b.half[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmulwev_d_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmulwev_d_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwev.d.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply even-positioned signed 32-bit elements in a and signed elements in b, save the 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (s64)(s32)a.word[2 * i] * (s64)(s32)b.word[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmulwev_d_wu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmulwev_d_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwev.d.wu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply even-positioned unsigned 32-bit elements in a and unsigned elements in b, save the 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 * i] * (u64)(u32)b.word[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmulwev_d_wu_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmulwev_d_wu_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwev.d.wu.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply even-positioned unsigned 32-bit elements in a and signed elements in b, save the 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 * i] * (s64)(s32)b.word[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmulwev_q_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmulwev_q_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwev.q.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply even-positioned signed 64-bit elements in a and signed elements in b, save the 128-bit result in dst.

+

Operation

+
for (int i = 0; i < 1; i++) {
+  dst.qword[i] = (s128)(s64)a.dword[2 * i] * (s128)(s64)b.dword[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 7 | 2
3C5000 | 7 | 2
+

__m128i __lsx_vmulwev_q_du (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmulwev_q_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwev.q.du vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply even-positioned unsigned 64-bit elements in a and unsigned elements in b, save the 128-bit result in dst.

+

Operation

+
for (int i = 0; i < 1; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 * i] * (u128)(u64)b.dword[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 7 | 2
3C5000 | 7 | 2
+

__m128i __lsx_vmulwev_q_du_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmulwev_q_du_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwev.q.du.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply even-positioned unsigned 64-bit elements in a and signed elements in b, save the 128-bit result in dst.

+

Operation

+
for (int i = 0; i < 1; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 * i] * (s128)(s64)b.dword[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 7 | 2
3C5000 | 7 | 2
+

__m128i __lsx_vmulwod_h_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmulwod_h_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwod.h.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply odd-positioned signed 8-bit elements in a and signed elements in b, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (s16)(s8)a.byte[2 * i + 1] * (s16)(s8)b.byte[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmulwod_h_bu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmulwod_h_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwod.h.bu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply odd-positioned unsigned 8-bit elements in a and unsigned elements in b, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] * (u16)(u8)b.byte[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmulwod_h_bu_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmulwod_h_bu_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwod.h.bu.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply odd-positioned unsigned 8-bit elements in a and signed elements in b, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] * (s16)(s8)b.byte[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmulwod_w_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmulwod_w_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwod.w.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply odd-positioned signed 16-bit elements in a and signed elements in b, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (s32)(s16)a.half[2 * i + 1] * (s32)(s16)b.half[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmulwod_w_hu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmulwod_w_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwod.w.hu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply odd-positioned unsigned 16-bit elements in a and unsigned elements in b, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (u32)(u16)a.half[2 * i + 1] * (u32)(u16)b.half[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmulwod_w_hu_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmulwod_w_hu_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwod.w.hu.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply odd-positioned unsigned 16-bit elements in a and signed elements in b, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (u32)(u16)a.half[2 * i + 1] * (s32)(s16)b.half[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m128i __lsx_vmulwod_d_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmulwod_d_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwod.d.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply odd-positioned signed 32-bit elements in a and signed elements in b, save the 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (s64)(s32)a.word[2 * i + 1] * (s64)(s32)b.word[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m128i __lsx_vmulwod_d_wu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmulwod_d_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwod.d.wu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply odd-positioned unsigned 32-bit elements in a and unsigned elements in b, save the 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] * (u64)(u32)b.word[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m128i __lsx_vmulwod_d_wu_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmulwod_d_wu_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwod.d.wu.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply odd-positioned unsigned 32-bit elements in a and signed elements in b, save the 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] * (s64)(s32)b.word[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m128i __lsx_vmulwod_q_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmulwod_q_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwod.q.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply odd-positioned signed 64-bit elements in a and signed elements in b, save the 128-bit result in dst.

+

Operation

+
for (int i = 0; i < 1; i++) {
+  dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] * (s128)(s64)b.dword[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600072
3C500072
+

__m128i __lsx_vmulwod_q_du (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmulwod_q_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwod.q.du vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply odd-positioned unsigned 64-bit elements in a and unsigned elements in b, save the 128-bit result in dst.

+

Operation

+
for (int i = 0; i < 1; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] * (u128)(u64)b.dword[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600072
3C500072
+

__m128i __lsx_vmulwod_q_du_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmulwod_q_du_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwod.q.du.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply odd-positioned unsigned 64-bit elements in a and signed elements in b, save the 128-bit result in dst.

+

Operation

+
for (int i = 0; i < 1; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] * (s128)(s64)b.dword[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600072
3C500072
+

__m128i __lsx_vneg_b (__m128i a)

+

Synopsis

+
__m128i __lsx_vneg_b (__m128i a)
+#include <lsxintrin.h>
+Instruction: vneg.b vr, vr
+CPU Flags: LSX
+
+

Description

+

Negate 8-bit elements in a and save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = -a.byte[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vneg_h (__m128i a)

+

Synopsis

+
__m128i __lsx_vneg_h (__m128i a)
+#include <lsxintrin.h>
+Instruction: vneg.h vr, vr
+CPU Flags: LSX
+
+

Description

+

Negate 16-bit elements in a and save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = -a.half[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vneg_w (__m128i a)

+

Synopsis

+
__m128i __lsx_vneg_w (__m128i a)
+#include <lsxintrin.h>
+Instruction: vneg.w vr, vr
+CPU Flags: LSX
+
+

Description

+

Negate 32-bit elements in a and save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = -a.word[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vneg_d (__m128i a)

+

Synopsis

+
__m128i __lsx_vneg_d (__m128i a)
+#include <lsxintrin.h>
+Instruction: vneg.d vr, vr
+CPU Flags: LSX
+
+

Description

+

Negate 64-bit elements in a and save the result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = -a.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsadd_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsadd_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsadd.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Saturating add the signed 8-bit elements in a and b, store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = (s8)sadd((s8)a.byte[i], (s8)b.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsadd_bu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsadd_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsadd.bu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Saturating add the unsigned 8-bit elements in a and b, store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = (u8)sadd((u8)a.byte[i], (u8)b.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsadd_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsadd_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsadd.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Saturating add the signed 16-bit elements in a and b, store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (s16)sadd((s16)a.half[i], (s16)b.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsadd_hu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsadd_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsadd.hu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Saturating add the unsigned 16-bit elements in a and b, store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (u16)sadd((u16)a.half[i], (u16)b.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsadd_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsadd_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsadd.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Saturating add the signed 32-bit elements in a and b, store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (s32)sadd((s32)a.word[i], (s32)b.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsadd_wu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsadd_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsadd.wu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Saturating add the unsigned 32-bit elements in a and b, store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (u32)sadd((u32)a.word[i], (u32)b.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsadd_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsadd_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsadd.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Saturating add the signed 64-bit elements in a and b, store the result to dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (s64)sadd((s64)a.dword[i], (s64)b.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsadd_du (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsadd_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsadd.du vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Saturating add the unsigned 64-bit elements in a and b, store the result to dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (u64)sadd((u64)a.dword[i], (u64)b.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vssub_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vssub_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssub.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Saturating subtract the signed 8-bit elements in a and b, store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = (s8)ssub((s8)a.byte[i], (s8)b.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vssub_bu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vssub_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssub.bu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Saturating subtract the unsigned 8-bit elements in a and b, store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = (u8)ssub((u8)a.byte[i], (u8)b.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vssub_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vssub_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssub.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Saturating subtract the signed 16-bit elements in a and b, store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (s16)ssub((s16)a.half[i], (s16)b.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vssub_hu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vssub_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssub.hu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Saturating subtract the unsigned 16-bit elements in a and b, store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (u16)ssub((u16)a.half[i], (u16)b.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vssub_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vssub_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssub.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Saturating subtract the signed 32-bit elements in a and b, store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (s32)ssub((s32)a.word[i], (s32)b.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vssub_wu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vssub_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssub.wu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Saturating subtract the unsigned 32-bit elements in a and b, store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (u32)ssub((u32)a.word[i], (u32)b.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vssub_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vssub_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssub.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Saturating subtract the signed 64-bit elements in a and b, store the result to dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (s64)ssub((s64)a.dword[i], (s64)b.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vssub_du (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vssub_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssub.du vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Saturating subtract the unsigned 64-bit elements in a and b, store the result to dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (u64)ssub((u64)a.dword[i], (u64)b.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsub_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsub_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsub.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Subtract 8-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = a.byte[i] - b.byte[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsub_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsub_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsub.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Subtract 16-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = a.half[i] - b.half[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsub_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsub_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsub.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Subtract 32-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = a.word[i] - b.word[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsub_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsub_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsub.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Subtract 64-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = a.dword[i] - b.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsub_q (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsub_q (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsub.q vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Subtract 128-bit elements in a and b, save the result in dst.

+

Operation

+
dst.qword[0] = a.qword[0] - b.qword[0];
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m128i __lsx_vsubi_bu (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vsubi_bu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsubi.bu vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Subtract imm from each 8-bit element in a, save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = a.byte[i] - imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsubi_hu (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vsubi_hu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsubi.hu vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Subtract imm from each 16-bit element in a, save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = a.half[i] - imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsubi_wu (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vsubi_wu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsubi.wu vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Subtract imm from each 32-bit element in a, save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = a.word[i] - imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsubi_du (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vsubi_du (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsubi.du vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Subtract imm from each 64-bit element in a, save the result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = a.dword[i] - imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsubwev_h_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsubwev_h_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwev.h.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Subtract even-positioned signed 8-bit elements in a and signed elements in b, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (s16)(s8)a.byte[2 * i] - (s16)(s8)b.byte[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vsubwev_h_bu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsubwev_h_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwev.h.bu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Subtract even-positioned unsigned 8-bit elements in a and unsigned elements in b, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (u16)(u8)a.byte[2 * i] - (u16)(u8)b.byte[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vsubwev_w_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsubwev_w_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwev.w.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Subtract even-positioned signed 16-bit elements in a and signed elements in b, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (s32)(s16)a.half[2 * i] - (s32)(s16)b.half[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vsubwev_w_hu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsubwev_w_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwev.w.hu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Subtract even-positioned unsigned 16-bit elements in a and unsigned elements in b, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (u32)(u16)a.half[2 * i] - (u32)(u16)b.half[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vsubwev_d_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsubwev_d_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwev.d.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Subtract even-positioned signed 32-bit elements in a and signed elements in b, save the 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (s64)(s32)a.word[2 * i] - (s64)(s32)b.word[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vsubwev_d_wu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsubwev_d_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwev.d.wu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Subtract even-positioned unsigned 32-bit elements in a and unsigned elements in b, save the 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 * i] - (u64)(u32)b.word[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vsubwev_q_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsubwev_q_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwev.q.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Subtract even-positioned signed 64-bit elements in a and signed elements in b, save the 128-bit result in dst.

+

Operation

+
for (int i = 0; i < 1; i++) {
+  dst.qword[i] = (s128)(s64)a.dword[2 * i] - (s128)(s64)b.dword[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m128i __lsx_vsubwev_q_du (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsubwev_q_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwev.q.du vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Subtract even-positioned unsigned 64-bit elements in a and unsigned elements in b, save the 128-bit result in dst.

+

Operation

+
for (int i = 0; i < 1; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 * i] - (u128)(u64)b.dword[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m128i __lsx_vsubwod_h_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsubwod_h_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwod.h.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Subtract odd-positioned signed 8-bit elements in a and signed elements in b, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (s16)(s8)a.byte[2 * i + 1] - (s16)(s8)b.byte[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vsubwod_h_bu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsubwod_h_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwod.h.bu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Subtract odd-positioned unsigned 8-bit elements in a and unsigned elements in b, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] - (u16)(u8)b.byte[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vsubwod_w_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsubwod_w_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwod.w.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Subtract odd-positioned signed 16-bit elements in a and signed elements in b, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (s32)(s16)a.half[2 * i + 1] - (s32)(s16)b.half[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vsubwod_w_hu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsubwod_w_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwod.w.hu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Subtract odd-positioned unsigned 16-bit elements in a and unsigned elements in b, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (u32)(u16)a.half[2 * i + 1] - (u32)(u16)b.half[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vsubwod_d_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsubwod_d_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwod.d.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Subtract odd-positioned signed 32-bit elements in a and signed elements in b, save the 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (s64)(s32)a.word[2 * i + 1] - (s64)(s32)b.word[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vsubwod_d_wu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsubwod_d_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwod.d.wu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Subtract odd-positioned unsigned 32-bit elements in a and unsigned elements in b, save the 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] - (u64)(u32)b.word[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vsubwod_q_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsubwod_q_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwod.q.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Subtract odd-positioned signed 64-bit elements in a and signed elements in b, save the 128-bit result in dst.

+

Operation

+
for (int i = 0; i < 1; i++) {
+  dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] - (s128)(s64)b.dword[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m128i __lsx_vsubwod_q_du (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsubwod_q_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwod.q.du vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Subtract odd-positioned unsigned 64-bit elements in a and unsigned elements in b, save the 128-bit result in dst.

+

Operation

+
for (int i = 0; i < 1; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] - (u128)(u64)b.dword[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+ +
+
+ +
+
+ +
+ +
+ +
+ + + + « Previous + + + Next » + + +
+ + + + + + + + diff --git a/lsx/logical/index.html b/lsx/logical/index.html new file mode 100644 index 00000000..8c2636cd --- /dev/null +++ b/lsx/logical/index.html @@ -0,0 +1,689 @@ + + + + + + + + Logical - Unofficial LoongArch Intrinsics Guide + + + + + + + + + + + + + + +
+ + +
+ +
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Logical

+

__m128i __lsx_vand_v (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vand_v (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vand.v vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute bitwise AND between elements in a and b.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = a.dword[i] & b.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vandi_b (__m128i a, imm0_255 imm)

+

Synopsis

+
__m128i __lsx_vandi_b (__m128i a, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vandi.b vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compute bitwise AND between elements in a and imm.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = a.byte[i] & imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vandn_v (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vandn_v (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vandn.v vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute bitwise ANDN between elements in a and b.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = b.dword[i] & (~a.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vnor_v (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vnor_v (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vnor.v vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute bitwise NOR between elements in a and b.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = ~(a.dword[i] | b.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vnori_b (__m128i a, imm0_255 imm)

+

Synopsis

+
__m128i __lsx_vnori_b (__m128i a, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vnori.b vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compute bitwise NOR between elements in a and imm.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = ~(a.byte[i] | imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vor_v (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vor_v (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vor.v vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute bitwise OR between elements in a and b.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = a.dword[i] | b.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vori_b (__m128i a, imm0_255 imm)

+

Synopsis

+
__m128i __lsx_vori_b (__m128i a, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vori.b vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compute bitwise OR between elements in a and imm.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = a.byte[i] | imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vorn_v (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vorn_v (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vorn.v vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute bitwise ORN between elements in a and b: each result bit is the corresponding bit of a ORed with the complement of the bit of b (a | ~b).

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = a.dword[i] | (~b.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vxor_v (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vxor_v (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vxor.v vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute bitwise XOR between elements in a and b.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = a.dword[i] ^ b.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vxori_b (__m128i a, imm0_255 imm)

+

Synopsis

+
__m128i __lsx_vxori_b (__m128i a, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vxori.b vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compute bitwise XOR between elements in a and imm.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = a.byte[i] ^ imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+ +
+
+ +
+
+ +
+ +
+ +
+ + + + « Previous + + + Next » + + +
+ + + + + + + + diff --git a/lsx/memory/index.html b/lsx/memory/index.html new file mode 100644 index 00000000..16cfc29f --- /dev/null +++ b/lsx/memory/index.html @@ -0,0 +1,475 @@ + + + + + + + + Memory Load & Store - Unofficial LoongArch Intrinsics Guide + + + + + + + + + + + + + + +
+ + +
+ +
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Memory Load & Store

+

__m128i __lsx_vld (void * addr, imm_n2048_2047 offset)

+

Synopsis

+
__m128i __lsx_vld (void * addr, imm_n2048_2047 offset)
+#include <lsxintrin.h>
+Instruction: vld vr, r, imm
+CPU Flags: LSX
+
+

Description

+

Read whole vector from memory address addr + offset, save the data into dst. Note that you can use this intrinsic to load floating point vectors, even though the return type represents integer vectors.

+

Operation

+
dst = memory_load(128, addr + offset);
+
+

__m128i __lsx_vldx (void * addr, long int offset)

+

Synopsis

+
__m128i __lsx_vldx (void * addr, long int offset)
+#include <lsxintrin.h>
+Instruction: vldx vr, r, r
+CPU Flags: LSX
+
+

Description

+

Read whole vector from memory address addr + offset, save the data into dst. Note that you can use this intrinsic to load floating point vectors, even though the return type represents integer vectors.

+

Operation

+
dst = memory_load(128, addr + offset);
+
+

__m128i __lsx_vldrepl_b (void * addr, imm_n2048_2047 offset)

+

Synopsis

+
__m128i __lsx_vldrepl_b (void * addr, imm_n2048_2047 offset)
+#include <lsxintrin.h>
+Instruction: vldrepl.b vr, r, imm
+CPU Flags: LSX
+
+

Description

+

Read 8-bit data from memory address addr + (offset << 0), replicate the data to all vector lanes and save into dst.

+

Operation

+
u8 data = memory_load(8, addr + offset);
+for (int i = 0; i < 16; i++) {
+  dst.byte[i] = data;
+}
+
+

__m128i __lsx_vldrepl_h (void * addr, imm_n1024_1023 offset)

+

Synopsis

+
__m128i __lsx_vldrepl_h (void * addr, imm_n1024_1023 offset)
+#include <lsxintrin.h>
+Instruction: vldrepl.h vr, r, imm
+CPU Flags: LSX
+
+

Description

+

Read 16-bit data from memory address addr + (offset << 1), replicate the data to all vector lanes and save into dst.

+

Operation

+
u16 data = memory_load(16, addr + (offset << 1));
+for (int i = 0; i < 8; i++) {
+  dst.half[i] = data;
+}
+
+

__m128i __lsx_vldrepl_w (void * addr, imm_n512_511 offset)

+

Synopsis

+
__m128i __lsx_vldrepl_w (void * addr, imm_n512_511 offset)
+#include <lsxintrin.h>
+Instruction: vldrepl.w vr, r, imm
+CPU Flags: LSX
+
+

Description

+

Read 32-bit data from memory address addr + (offset << 2), replicate the data to all vector lanes and save into dst.

+

Operation

+
u32 data = memory_load(32, addr + (offset << 2));
+for (int i = 0; i < 4; i++) {
+  dst.word[i] = data;
+}
+
+

__m128i __lsx_vldrepl_d (void * addr, imm_n256_255 offset)

+

Synopsis

+
__m128i __lsx_vldrepl_d (void * addr, imm_n256_255 offset)
+#include <lsxintrin.h>
+Instruction: vldrepl.d vr, r, imm
+CPU Flags: LSX
+
+

Description

+

Read 64-bit data from memory address addr + (offset << 3), replicate the data to all vector lanes and save into dst.

+

Operation

+
u64 data = memory_load(64, addr + (offset << 3));
+for (int i = 0; i < 2; i++) {
+  dst.dword[i] = data;
+}
+
+

void __lsx_vst (__m128i data, void * addr, imm_n2048_2047 offset)

+

Synopsis

+
void __lsx_vst (__m128i data, void * addr, imm_n2048_2047 offset)
+#include <lsxintrin.h>
+Instruction: vst vr, r, imm
+CPU Flags: LSX
+
+

Description

+

Write the whole vector in data to memory address addr + offset.

+

Operation

+
memory_store(128, data, addr + offset);
+
+

void __lsx_vstx (__m128i data, void * addr, long int offset)

+

Synopsis

+
void __lsx_vstx (__m128i data, void * addr, long int offset)
+#include <lsxintrin.h>
+Instruction: vstx vr, r, r
+CPU Flags: LSX
+
+

Description

+

Write the whole vector in data to memory address addr + offset.

+

Operation

+
memory_store(128, data, addr + offset);
+
+

void __lsx_vstelm_b (__m128i data, void * addr, imm_n128_127 offset, imm0_15 lane)

+

Synopsis

+
void __lsx_vstelm_b (__m128i data, void * addr, imm_n128_127 offset, imm0_15 lane)
+#include <lsxintrin.h>
+Instruction: vstelm.b vr, r, imm, imm
+CPU Flags: LSX
+
+

Description

+

Store the 8-bit element in data specified by lane to memory address addr + offset.

+

Operation

+
memory_store(8, data.byte[lane], addr + offset);
+
+

void __lsx_vstelm_h (__m128i data, void * addr, imm_n128_127 offset, imm0_7 lane)

+

Synopsis

+
void __lsx_vstelm_h (__m128i data, void * addr, imm_n128_127 offset, imm0_7 lane)
+#include <lsxintrin.h>
+Instruction: vstelm.h vr, r, imm, imm
+CPU Flags: LSX
+
+

Description

+

Store the 16-bit element in data specified by lane to memory address addr + offset.

+

Operation

+
memory_store(16, data.half[lane], addr + offset);
+
+

void __lsx_vstelm_w (__m128i data, void * addr, imm_n128_127 offset, imm0_3 lane)

+

Synopsis

+
void __lsx_vstelm_w (__m128i data, void * addr, imm_n128_127 offset, imm0_3 lane)
+#include <lsxintrin.h>
+Instruction: vstelm.w vr, r, imm, imm
+CPU Flags: LSX
+
+

Description

+

Store the 32-bit element in data specified by lane to memory address addr + offset.

+

Operation

+
memory_store(32, data.word[lane], addr + offset);
+
+

void __lsx_vstelm_d (__m128i data, void * addr, imm_n128_127 offset, imm0_1 lane)

+

Synopsis

+
void __lsx_vstelm_d (__m128i data, void * addr, imm_n128_127 offset, imm0_1 lane)
+#include <lsxintrin.h>
+Instruction: vstelm.d vr, r, imm, imm
+CPU Flags: LSX
+
+

Description

+

Store the 64-bit element in data specified by lane to memory address addr + offset.

+

Operation

+
memory_store(64, data.dword[lane], addr + offset);
+
+ +
+
+ +
+
+ +
+ +
+ +
+ + + + « Previous + + + Next » + + +
+ + + + + + + + diff --git a/lsx/misc/index.html b/lsx/misc/index.html new file mode 100644 index 00000000..ba3d554f --- /dev/null +++ b/lsx/misc/index.html @@ -0,0 +1,4669 @@ + + + + + + + + Misc - Unofficial LoongArch Intrinsics Guide + + + + + + + + + + + + + + +
+ + +
+ +
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Misc

+

__m128i __lsx_vexth_h_b (__m128i a)

+

Synopsis

+
__m128i __lsx_vexth_h_b (__m128i a)
+#include <lsxintrin.h>
+Instruction: vexth.h.b vr, vr
+CPU Flags: LSX
+
+

Description

+

Extend signed 8-bit elements in the higher half of a to 16-bit.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (s16)(s8)a.byte[8 + i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vexth_hu_bu (__m128i a)

+

Synopsis

+
__m128i __lsx_vexth_hu_bu (__m128i a)
+#include <lsxintrin.h>
+Instruction: vexth.hu.bu vr, vr
+CPU Flags: LSX
+
+

Description

+

Extend unsigned 8-bit elements in the higher half of a to 16-bit.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (u16)(u8)a.byte[8 + i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vexth_w_h (__m128i a)

+

Synopsis

+
__m128i __lsx_vexth_w_h (__m128i a)
+#include <lsxintrin.h>
+Instruction: vexth.w.h vr, vr
+CPU Flags: LSX
+
+

Description

+

Extend signed 16-bit elements in the higher half of a to 32-bit.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (s32)(s16)a.half[4 + i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vexth_wu_hu (__m128i a)

+

Synopsis

+
__m128i __lsx_vexth_wu_hu (__m128i a)
+#include <lsxintrin.h>
+Instruction: vexth.wu.hu vr, vr
+CPU Flags: LSX
+
+

Description

+

Extend unsigned 16-bit elements in the higher half of a to 32-bit.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (u32)(u16)a.half[4 + i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vexth_d_w (__m128i a)

+

Synopsis

+
__m128i __lsx_vexth_d_w (__m128i a)
+#include <lsxintrin.h>
+Instruction: vexth.d.w vr, vr
+CPU Flags: LSX
+
+

Description

+

Extend signed 32-bit elements in the higher half of a to 64-bit.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (s64)(s32)a.word[2 + i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vexth_du_wu (__m128i a)

+

Synopsis

+
__m128i __lsx_vexth_du_wu (__m128i a)
+#include <lsxintrin.h>
+Instruction: vexth.du.wu vr, vr
+CPU Flags: LSX
+
+

Description

+

Extend unsigned 32-bit elements in the higher half of a to 64-bit.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 + i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vexth_q_d (__m128i a)

+

Synopsis

+
__m128i __lsx_vexth_q_d (__m128i a)
+#include <lsxintrin.h>
+Instruction: vexth.q.d vr, vr
+CPU Flags: LSX
+
+

Description

+

Extend signed 64-bit elements in the higher half of a to 128-bit.

+

Operation

+
for (int i = 0; i < 1; i++) {
+  dst.qword[i] = (s128)(s64)a.dword[1 + i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vexth_qu_du (__m128i a)

+

Synopsis

+
__m128i __lsx_vexth_qu_du (__m128i a)
+#include <lsxintrin.h>
+Instruction: vexth.qu.du vr, vr
+CPU Flags: LSX
+
+

Description

+

Extend unsigned 64-bit elements in the higher half of a to 128-bit.

+

Operation

+
for (int i = 0; i < 1; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[1 + i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vextl_q_d (__m128i a)

+

Synopsis

+
__m128i __lsx_vextl_q_d (__m128i a)
+#include <lsxintrin.h>
+Instruction: vextl.q.d vr, vr
+CPU Flags: LSX
+
+

Description

+

Extend signed 64-bit elements in the lower half of a to 128-bit.

+

Operation

+
for (int i = 0; i < 1; i++) {
+  dst.qword[i] = (s128)(s64)a.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vextl_qu_du (__m128i a)

+

Synopsis

+
__m128i __lsx_vextl_qu_du (__m128i a)
+#include <lsxintrin.h>
+Instruction: vextl.qu.du vr, vr
+CPU Flags: LSX
+
+

Description

+

Extend unsigned 64-bit elements in the lower half of a to 128-bit.

+

Operation

+
for (int i = 0; i < 1; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vextrins_b (__m128i a, __m128i b, imm0_255 imm)

+

Synopsis

+
__m128i __lsx_vextrins_b (__m128i a, __m128i b, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vextrins.b vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

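Extract one 8-bit element from b and insert it into a according to imm: bits 3:0 of imm select the source element in b, and bits 7:4 select the destination lane; all other lanes are copied from a.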

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = (i == ((imm >> 4) & 15)) ? b.byte[imm & 15] : a.byte[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vextrins_h (__m128i a, __m128i b, imm0_255 imm)

+

Synopsis

+
__m128i __lsx_vextrins_h (__m128i a, __m128i b, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vextrins.h vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

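Extract one 16-bit element from b and insert it into a according to imm: bits 2:0 of imm select the source element in b, and bits 6:4 select the destination lane; all other lanes are copied from a.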

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (i == ((imm >> 4) & 7)) ? b.half[imm & 7] : a.half[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vextrins_w (__m128i a, __m128i b, imm0_255 imm)

+

Synopsis

+
__m128i __lsx_vextrins_w (__m128i a, __m128i b, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vextrins.w vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

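Extract one 32-bit element from b and insert it into a according to imm: bits 1:0 of imm select the source element in b, and bits 5:4 select the destination lane; all other lanes are copied from a.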

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (i == ((imm >> 4) & 3)) ? b.word[imm & 3] : a.word[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vextrins_d (__m128i a, __m128i b, imm0_255 imm)

+

Synopsis

+
__m128i __lsx_vextrins_d (__m128i a, __m128i b, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vextrins.d vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

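Extract one 64-bit element from b and insert it into a according to imm: bit 0 of imm selects the source element in b, and bit 4 selects the destination lane; the other lane is copied from a.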

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (i == ((imm >> 4) & 1)) ? b.dword[imm & 1] : a.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vilvh_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vilvh_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vilvh.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Interleave 8-bit elements in the higher half of a and b: elements from b are placed at even positions of dst and elements from a at odd positions.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = (i % 2 == 1) ? a.byte[i / 2 + 8] : b.byte[i / 2 + 8];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vilvh_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vilvh_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vilvh.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Interleave 16-bit elements in the higher half of a and b: elements from b are placed at even positions of dst and elements from a at odd positions.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (i % 2 == 1) ? a.half[i / 2 + 4] : b.half[i / 2 + 4];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vilvh_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vilvh_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vilvh.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Interleave 32-bit elements in the higher half of a and b: elements from b are placed at even positions of dst and elements from a at odd positions.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (i % 2 == 1) ? a.word[i / 2 + 2] : b.word[i / 2 + 2];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vilvh_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vilvh_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vilvh.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

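Interleave 64-bit elements in the higher half of a and b: the element from b is placed at the even position of dst and the element from a at the odd position.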

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (i % 2 == 1) ? a.dword[i / 2 + 1] : b.dword[i / 2 + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vilvl_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vilvl_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vilvl.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Interleave 8-bit elements in the lower half of a and b: elements from b are placed at even positions of dst and elements from a at odd positions.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = (i % 2 == 1) ? a.byte[i / 2] : b.byte[i / 2];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vilvl_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vilvl_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vilvl.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Interleave 16-bit elements in the lower half of a and b: elements from b are placed at even positions of dst and elements from a at odd positions.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (i % 2 == 1) ? a.half[i / 2] : b.half[i / 2];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vilvl_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vilvl_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vilvl.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Interleave 32-bit elements in the lower half of a and b: elements from b are placed at even positions of dst and elements from a at odd positions.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (i % 2 == 1) ? a.word[i / 2] : b.word[i / 2];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vilvl_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vilvl_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vilvl.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

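Interleave 64-bit elements in the lower half of a and b: the element from b is placed at the even position of dst and the element from a at the odd position.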

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (i % 2 == 1) ? a.dword[i / 2] : b.dword[i / 2];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vinsgr2vr_b (__m128i a, int b, imm0_15 imm)

+

Synopsis

+
__m128i __lsx_vinsgr2vr_b (__m128i a, int b, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vinsgr2vr.b vr, r, imm
+CPU Flags: LSX
+
+

Description

+

Insert b as an 8-bit element into the lane of a indexed by imm; all other lanes are copied from a.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = (i == imm) ? b : a.byte[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 1
3C5000 | 1 | 1
+

__m128i __lsx_vinsgr2vr_h (__m128i a, int b, imm0_7 imm)

+

Synopsis

+
__m128i __lsx_vinsgr2vr_h (__m128i a, int b, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vinsgr2vr.h vr, r, imm
+CPU Flags: LSX
+
+

Description

+

Insert b as a 16-bit element into the lane of a indexed by imm; all other lanes are copied from a.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (i == imm) ? b : a.half[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 1
3C5000 | 1 | 1
+

__m128i __lsx_vinsgr2vr_w (__m128i a, int b, imm0_3 imm)

+

Synopsis

+
__m128i __lsx_vinsgr2vr_w (__m128i a, int b, imm0_3 imm)
+#include <lsxintrin.h>
+Instruction: vinsgr2vr.w vr, r, imm
+CPU Flags: LSX
+
+

Description

+

Insert b as a 32-bit element into the lane of a indexed by imm; all other lanes are copied from a.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (i == imm) ? b : a.word[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 1
3C5000 | 1 | 1
+

__m128i __lsx_vinsgr2vr_d (__m128i a, long int b, imm0_1 imm)

+

Synopsis

+
__m128i __lsx_vinsgr2vr_d (__m128i a, long int b, imm0_1 imm)
+#include <lsxintrin.h>
+Instruction: vinsgr2vr.d vr, r, imm
+CPU Flags: LSX
+
+

Description

+

Insert b as a 64-bit element into the lane of a indexed by imm; the other lane is copied from a.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (i == imm) ? b : a.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 1
3C5000 | 1 | 1
+

__m128i __lsx_vfrstp_b (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vfrstp_b (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vfrstp.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

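Find the first negative 8-bit element in b and write its index to the lane of a selected by the lowest 8-bit element of c (modulo 16); all other lanes are copied from a. If no element of b is negative, 16 is written.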

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = a.byte[i];
+}
+int i;
+for (i = 0; i < 16; i++) {
+  if ((s8)b.byte[i] < 0) {
+    break;
+  }
+}
+dst.byte[c.byte[0] % 16] = i;
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m128i __lsx_vfrstp_h (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vfrstp_h (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vfrstp.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

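Find the first negative 16-bit element in b and write its index to the lane of a selected by the lowest 16-bit element of c (modulo 8); all other lanes are copied from a. If no element of b is negative, 8 is written.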

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = a.half[i];
+}
+int i;
+for (i = 0; i < 8; i++) {
+  if ((s16)b.half[i] < 0) {
+    break;
+  }
+}
+dst.half[c.half[0] % 8] = i;
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m128i __lsx_vfrstpi_b (__m128i a, __m128i b, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vfrstpi_b (__m128i a, __m128i b, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vfrstpi.b vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Find the first negative 8-bit element in b and write its index to the lane of a selected by imm (modulo 16); all other lanes are copied from a. If no element of b is negative, 16 is written.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = a.byte[i];
+}
+int i;
+for (i = 0; i < 16; i++) {
+  if ((s8)b.byte[i] < 0) {
+    break;
+  }
+}
+dst.byte[imm % 16] = i;
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m128i __lsx_vfrstpi_h (__m128i a, __m128i b, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vfrstpi_h (__m128i a, __m128i b, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vfrstpi.h vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Find the first negative 16-bit element in b and write its index to the lane of a selected by imm (modulo 8); all other lanes are copied from a. If no element of b is negative, 8 is written.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = a.half[i];
+}
+int i;
+for (i = 0; i < 8; i++) {
+  if ((s16)b.half[i] < 0) {
+    break;
+  }
+}
+dst.half[imm % 8] = i;
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m128i __lsx_vmskgez_b (__m128i a)

+

Synopsis

+
__m128i __lsx_vmskgez_b (__m128i a)
+#include <lsxintrin.h>
+Instruction: vmskgez.b vr, vr
+CPU Flags: LSX
+
+

Description

+

For each 8-bit element in a, if the element is greater than or equal to zero, set the corresponding bit (bit i for element i) in the lowest 16 bits of dst, otherwise clear it. All remaining bits of dst are zero.

+

Examples

+
__m128i __lsx_vmskgez_b(__m128i{0x1122334455667788, 0x99aabbccddeeff00})
+= 0x00000000000001fe 0x0000000000000000
+__m128i __lsx_vmskgez_b(__m128i{0x0000808000000000, 0x0081000081716151})
+= 0x000000000000b7cf 0x0000000000000000
+
+

Operation

+
u64 m = 0x8080808080808080;
+u64 c = m & a.dword[0];
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[0] = c;
+c = m & a.dword[1];
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[0] |= c << 8;
+dst.dword[0] = (u16)~dst.dword[0];
+dst.dword[1] = 0;
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vmskltz_b (__m128i a)

+

Synopsis

+
__m128i __lsx_vmskltz_b (__m128i a)
+#include <lsxintrin.h>
+Instruction: vmskltz.b vr, vr
+CPU Flags: LSX
+
+

Description

+

For each 8-bit element in a, if the element is less than zero, set the corresponding bit (bit i for element i) in the lowest 16 bits of dst, otherwise clear it. All remaining bits of dst are zero.

+

Examples

+
__m128i __lsx_vmskltz_b(__m128i{0x1122334455667788, 0x99aabbccddeeff00})
+= 0x000000000000fe01 0x0000000000000000
+__m128i __lsx_vmskltz_b(__m128i{0x0000808000000000, 0x0081000081716151})
+= 0x0000000000004830 0x0000000000000000
+
+

Operation

+
u64 m = 0x8080808080808080;
+u64 c = m & a.dword[0];
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[0] = c;
+c = m & a.dword[1];
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[0] |= c << 8;
+dst.dword[1] = 0;
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vmskltz_h (__m128i a)

+

Synopsis

+
__m128i __lsx_vmskltz_h (__m128i a)
+#include <lsxintrin.h>
+Instruction: vmskltz.h vr, vr
+CPU Flags: LSX
+
+

Description

+

For each 16-bit element in a, if the element is less than zero, set the corresponding bit (bit i for element i) in the lowest 8 bits of dst, otherwise clear it. All remaining bits of dst are zero.

+

Examples

+
__m128i __lsx_vmskltz_h(__m128i{0x1122334455667788, 0x99aabbccddeeff00})
+= 0x00000000000000f0 0x0000000000000000
+__m128i __lsx_vmskltz_h(__m128i{0x0000808000000000, 0x0081000081716151})
+= 0x0000000000000024 0x0000000000000000
+
+

Operation

+
u64 m = 0x8000800080008000;
+u64 c = m & a.dword[0];
+c |= c << 15;
+c |= c << 30;
+c >>= 60;
+dst.dword[0] = c;
+c = m & a.dword[1];
+c |= c << 15;
+c |= c << 30;
+c >>= 60;
+dst.dword[0] |= c << 4;
+dst.dword[1] = 0;
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vmskltz_w (__m128i a)

+

Synopsis

+
__m128i __lsx_vmskltz_w (__m128i a)
+#include <lsxintrin.h>
+Instruction: vmskltz.w vr, vr
+CPU Flags: LSX
+
+

Description

+

For each 32-bit element in a, if the element is less than zero, set the corresponding bit (bit i for element i) in the lowest 4 bits of dst, otherwise clear it. All remaining bits of dst are zero.

+

Examples

+
__m128i __lsx_vmskltz_w(__m128i{0x1122334455667788, 0x99aabbccddeeff00})
+= 0x000000000000000c 0x0000000000000000
+__m128i __lsx_vmskltz_w(__m128i{0x0000808000000000, 0x0081000081716151})
+= 0x0000000000000004 0x0000000000000000
+
+

Operation

+
u64 m = 0x8000000080000000;
+u64 c = m & a.dword[0];
+c |= c << 31;
+c >>= 62;
+dst.dword[0] = c;
+c = m & a.dword[1];
+c |= c << 31;
+c >>= 62;
+dst.dword[0] |= c << 2;
+dst.dword[1] = 0;
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vmskltz_d (__m128i a)

+

Synopsis

+
__m128i __lsx_vmskltz_d (__m128i a)
+#include <lsxintrin.h>
+Instruction: vmskltz.d vr, vr
+CPU Flags: LSX
+
+

Description

+

For each 64-bit element in a, if the element is less than zero, set the corresponding bit (bit i for element i) in the lowest 2 bits of dst, otherwise clear it. All remaining bits of dst are zero.

+

Examples

+
__m128i __lsx_vmskltz_d(__m128i{0x1122334455667788, 0x99aabbccddeeff00})
+= 0x0000000000000002 0x0000000000000000
+__m128i __lsx_vmskltz_d(__m128i{0x0000808000000000, 0x0081000081716151})
+= 0x0000000000000000 0x0000000000000000
+
+

Operation

+
u64 m = 0x8000000000000000;
+u64 c = m & a.dword[0];
+c >>= 63;
+dst.dword[0] = c;
+c = m & a.dword[1];
+c >>= 63;
+dst.dword[0] |= c << 1;
+dst.dword[1] = 0;
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vmsknz_b (__m128i a)

+

Synopsis

+
__m128i __lsx_vmsknz_b (__m128i a)
+#include <lsxintrin.h>
+Instruction: vmsknz.b vr, vr
+CPU Flags: LSX
+
+

Description

+

For each 8-bit element in a, if the element is non-zero, set the corresponding bit (bit i for element i) in the lowest 16 bits of dst, otherwise clear it. All remaining bits of dst are zero.

+

Examples

+
__m128i __lsx_vmsknz_b(__m128i{0x1122334455667788, 0x99aabbccddeeff00})
+= 0x000000000000feff 0x0000000000000000
+__m128i __lsx_vmsknz_b(__m128i{0x0000111100000000, 0x0011000011111111})
+= 0x0000000000004f30 0x0000000000000000
+
+

Operation

+
u64 m = 0x7F7F7F7F7F7F7F7F;
+u64 c = ~(((a.dword[0] & m) + m) | a.dword[0] | m);
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[0] = c;
+c = ~(((a.dword[1] & m) + m) | a.dword[1] | m);
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[0] |= c << 8;
+dst.dword[0] = (u16)~dst.dword[0];
+dst.dword[1] = 0;
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vpackev_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vpackev_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpackev.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Collect and pack even-positioned 8-bit elements in a and b and store them in dst: elements from b are placed at even positions of dst and elements from a at odd positions.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = (i % 2 == 1) ? a.byte[i - 1] : b.byte[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vpackev_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vpackev_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpackev.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Collect and pack even-positioned 16-bit elements in a and b and store them in dst: elements from b are placed at even positions of dst and elements from a at odd positions.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (i % 2 == 1) ? a.half[i - 1] : b.half[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vpackev_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vpackev_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpackev.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Collect and pack even-positioned 32-bit elements in a and b and store them in dst: elements from b are placed at even positions of dst and elements from a at odd positions.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (i % 2 == 1) ? a.word[i - 1] : b.word[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vpackev_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vpackev_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpackev.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

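Collect and pack even-positioned 64-bit elements in a and b and store them in dst: the element from b is placed at the even position of dst and the element from a at the odd position.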

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (i % 2 == 1) ? a.dword[i - 1] : b.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vpackod_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vpackod_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpackod.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Collect and pack odd-positioned 8-bit elements in a and b and store into dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = (i % 2 == 1) ? a.byte[i] : b.byte[i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vpackod_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vpackod_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpackod.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Collect and pack odd-positioned 16-bit elements in a and b and store into dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (i % 2 == 1) ? a.half[i] : b.half[i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vpackod_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vpackod_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpackod.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Collect and pack odd-positioned 32-bit elements in a and b and store into dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (i % 2 == 1) ? a.word[i] : b.word[i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vpackod_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vpackod_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpackod.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Collect and pack odd-positioned 64-bit elements in a and b and store into dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (i % 2 == 1) ? a.dword[i] : b.dword[i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vpickev_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vpickev_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpickev.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Pick even-positioned 8-bit elements in b first, then pick even-positioned 8-bit elements in a.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = (i < 8) ? b.byte[i * 2] : a.byte[(i - 8) * 2];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vpickev_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vpickev_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpickev.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Pick even-positioned 16-bit elements in b first, then pick even-positioned 16-bit elements in a.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (i < 4) ? b.half[i * 2] : a.half[(i - 4) * 2];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vpickev_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vpickev_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpickev.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Pick even-positioned 32-bit elements in b first, then pick even-positioned 32-bit elements in a.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (i < 2) ? b.word[i * 2] : a.word[(i - 2) * 2];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vpickev_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vpickev_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpickev.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Pick even-positioned 64-bit elements in b first, then pick even-positioned 64-bit elements in a.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (i < 1) ? b.dword[i * 2] : a.dword[(i - 1) * 2];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

int __lsx_vpickve2gr_b (__m128i a, imm0_15 idx)

+

Synopsis

+
int __lsx_vpickve2gr_b (__m128i a, imm0_15 idx)
+#include <lsxintrin.h>
+Instruction: vpickve2gr.b r, vr, imm
+CPU Flags: LSX
+
+

Description

+

Pick the lane specified by idx from a and store into dst.

+

Operation

+
dst = (s8)a.byte[idx];
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600011
3C500011
+

unsigned int __lsx_vpickve2gr_bu (__m128i a, imm0_15 idx)

+

Synopsis

+
unsigned int __lsx_vpickve2gr_bu (__m128i a, imm0_15 idx)
+#include <lsxintrin.h>
+Instruction: vpickve2gr.bu r, vr, imm
+CPU Flags: LSX
+
+

Description

+

Pick the lane specified by idx from a and store into dst.

+

Operation

+
dst = (u8)a.byte[idx];
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600011
3C500011
+

int __lsx_vpickve2gr_h (__m128i a, imm0_7 idx)

+

Synopsis

+
int __lsx_vpickve2gr_h (__m128i a, imm0_7 idx)
+#include <lsxintrin.h>
+Instruction: vpickve2gr.h r, vr, imm
+CPU Flags: LSX
+
+

Description

+

Pick the lane specified by idx from a and store into dst.

+

Operation

+
dst = (s16)a.half[idx];
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600011
3C500011
+

unsigned int __lsx_vpickve2gr_hu (__m128i a, imm0_7 idx)

+

Synopsis

+
unsigned int __lsx_vpickve2gr_hu (__m128i a, imm0_7 idx)
+#include <lsxintrin.h>
+Instruction: vpickve2gr.hu r, vr, imm
+CPU Flags: LSX
+
+

Description

+

Pick the lane specified by idx from a and store into dst.

+

Operation

+
dst = (u16)a.half[idx];
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600011
3C500011
+

int __lsx_vpickve2gr_w (__m128i a, imm0_3 idx)

+

Synopsis

+
int __lsx_vpickve2gr_w (__m128i a, imm0_3 idx)
+#include <lsxintrin.h>
+Instruction: vpickve2gr.w r, vr, imm
+CPU Flags: LSX
+
+

Description

+

Pick the lane specified by idx from a and store into dst.

+

Operation

+
dst = (s32)a.word[idx];
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600011
3C500011
+

unsigned int __lsx_vpickve2gr_wu (__m128i a, imm0_3 idx)

+

Synopsis

+
unsigned int __lsx_vpickve2gr_wu (__m128i a, imm0_3 idx)
+#include <lsxintrin.h>
+Instruction: vpickve2gr.wu r, vr, imm
+CPU Flags: LSX
+
+

Description

+

Pick the lane specified by idx from a and store into dst.

+

Operation

+
dst = (u32)a.word[idx];
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600011
3C500011
+

long int __lsx_vpickve2gr_d (__m128i a, imm0_1 idx)

+

Synopsis

+
long int __lsx_vpickve2gr_d (__m128i a, imm0_1 idx)
+#include <lsxintrin.h>
+Instruction: vpickve2gr.d r, vr, imm
+CPU Flags: LSX
+
+

Description

+

Pick the lane specified by idx from a and store into dst.

+

Operation

+
dst = (s64)a.dword[idx];
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600011
3C500011
+

unsigned long int __lsx_vpickve2gr_du (__m128i a, imm0_1 idx)

+

Synopsis

+
unsigned long int __lsx_vpickve2gr_du (__m128i a, imm0_1 idx)
+#include <lsxintrin.h>
+Instruction: vpickve2gr.du r, vr, imm
+CPU Flags: LSX
+
+

Description

+

Pick the lane specified by idx from a and store into dst.

+

Operation

+
dst = (u64)a.dword[idx];
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600011
3C500011
+

__m128i __lsx_vpickod_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vpickod_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpickod.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Pick odd-positioned 8-bit elements in b first, then pick odd-positioned 8-bit elements in a.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = (i < 8) ? b.byte[i * 2 + 1] : a.byte[(i - 8) * 2 + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vpickod_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vpickod_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpickod.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Pick odd-positioned 16-bit elements in b first, then pick odd-positioned 16-bit elements in a.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (i < 4) ? b.half[i * 2 + 1] : a.half[(i - 4) * 2 + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vpickod_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vpickod_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpickod.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Pick odd-positioned 32-bit elements in b first, then pick odd-positioned 32-bit elements in a.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (i < 2) ? b.word[i * 2 + 1] : a.word[(i - 2) * 2 + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vpickod_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vpickod_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpickod.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Pick odd-positioned 64-bit elements in b first, then pick odd-positioned 64-bit elements in a.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (i < 1) ? b.dword[i * 2 + 1] : a.dword[(i - 1) * 2 + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vrepli_b (imm_n512_511 imm)

+

Synopsis

+
__m128i __lsx_vrepli_b (imm_n512_511 imm)
+#include <lsxintrin.h>
+Instruction: vldi vr, imm
+CPU Flags: LSX
+
+

Description

+

Repeat imm to fill whole vector.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = imm;
+}
+
+

Tested on real machine.

+

__m128i __lsx_vrepli_h (imm_n512_511 imm)

+

Synopsis

+
__m128i __lsx_vrepli_h (imm_n512_511 imm)
+#include <lsxintrin.h>
+Instruction: vldi vr, imm
+CPU Flags: LSX
+
+

Description

+

Repeat imm to fill whole vector.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = imm;
+}
+
+

Tested on real machine.

+

__m128i __lsx_vrepli_w (imm_n512_511 imm)

+

Synopsis

+
__m128i __lsx_vrepli_w (imm_n512_511 imm)
+#include <lsxintrin.h>
+Instruction: vldi vr, imm
+CPU Flags: LSX
+
+

Description

+

Repeat imm to fill whole vector.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = imm;
+}
+
+

Tested on real machine.

+

__m128i __lsx_vrepli_d (imm_n512_511 imm)

+

Synopsis

+
__m128i __lsx_vrepli_d (imm_n512_511 imm)
+#include <lsxintrin.h>
+Instruction: vldi vr, imm
+CPU Flags: LSX
+
+

Description

+

Repeat imm to fill whole vector.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = imm;
+}
+
+

Tested on real machine.

+

__m128i __lsx_vreplgr2vr_b (int val)

+

Synopsis

+
__m128i __lsx_vreplgr2vr_b (int val)
+#include <lsxintrin.h>
+Instruction: vreplgr2vr.b vr, r
+CPU Flags: LSX
+
+

Description

+

Repeat val to fill whole vector.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = val;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A6000N/A1
3C5000N/A1
+

__m128i __lsx_vreplgr2vr_h (int val)

+

Synopsis

+
__m128i __lsx_vreplgr2vr_h (int val)
+#include <lsxintrin.h>
+Instruction: vreplgr2vr.h vr, r
+CPU Flags: LSX
+
+

Description

+

Repeat val to fill whole vector.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = val;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A6000N/A1
3C5000N/A1
+

__m128i __lsx_vreplgr2vr_w (int val)

+

Synopsis

+
__m128i __lsx_vreplgr2vr_w (int val)
+#include <lsxintrin.h>
+Instruction: vreplgr2vr.w vr, r
+CPU Flags: LSX
+
+

Description

+

Repeat val to fill whole vector.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = val;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A6000N/A1
3C5000N/A1
+

__m128i __lsx_vreplgr2vr_d (long int val)

+

Synopsis

+
__m128i __lsx_vreplgr2vr_d (long int val)
+#include <lsxintrin.h>
+Instruction: vreplgr2vr.d vr, r
+CPU Flags: LSX
+
+

Description

+

Repeat val to fill whole vector.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = val;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A6000N/A1
3C5000N/A1
+

__m128i __lsx_vreplve_b (__m128i a, int idx)

+

Synopsis

+
__m128i __lsx_vreplve_b (__m128i a, int idx)
+#include <lsxintrin.h>
+Instruction: vreplve.b vr, vr, r
+CPU Flags: LSX
+
+

Description

+

Repeat the element in lane idx of a to fill whole vector.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = a.byte[idx % 16];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600011
3C500011
+

__m128i __lsx_vreplve_h (__m128i a, int idx)

+

Synopsis

+
__m128i __lsx_vreplve_h (__m128i a, int idx)
+#include <lsxintrin.h>
+Instruction: vreplve.h vr, vr, r
+CPU Flags: LSX
+
+

Description

+

Repeat the element in lane idx of a to fill whole vector.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = a.half[idx % 8];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600011
3C500011
+

__m128i __lsx_vreplve_w (__m128i a, int idx)

+

Synopsis

+
__m128i __lsx_vreplve_w (__m128i a, int idx)
+#include <lsxintrin.h>
+Instruction: vreplve.w vr, vr, r
+CPU Flags: LSX
+
+

Description

+

Repeat the element in lane idx of a to fill whole vector.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = a.word[idx % 4];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600011
3C500011
+

__m128i __lsx_vreplve_d (__m128i a, int idx)

+

Synopsis

+
__m128i __lsx_vreplve_d (__m128i a, int idx)
+#include <lsxintrin.h>
+Instruction: vreplve.d vr, vr, r
+CPU Flags: LSX
+
+

Description

+

Repeat the element in lane idx of a to fill whole vector.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = a.dword[idx % 2];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600011
3C500011
+

__m128i __lsx_vreplvei_b (__m128i a, imm0_15 idx)

+

Synopsis

+
__m128i __lsx_vreplvei_b (__m128i a, imm0_15 idx)
+#include <lsxintrin.h>
+Instruction: vreplvei.b vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Repeat the element in lane idx of a to fill whole vector.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = a.byte[idx];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vreplvei_h (__m128i a, imm0_7 idx)

+

Synopsis

+
__m128i __lsx_vreplvei_h (__m128i a, imm0_7 idx)
+#include <lsxintrin.h>
+Instruction: vreplvei.h vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Repeat the element in lane idx of a to fill whole vector.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = a.half[idx];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vreplvei_w (__m128i a, imm0_3 idx)

+

Synopsis

+
__m128i __lsx_vreplvei_w (__m128i a, imm0_3 idx)
+#include <lsxintrin.h>
+Instruction: vreplvei.w vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Repeat the element in lane idx of a to fill whole vector.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = a.word[idx];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vreplvei_d (__m128i a, imm0_1 idx)

+

Synopsis

+
__m128i __lsx_vreplvei_d (__m128i a, imm0_1 idx)
+#include <lsxintrin.h>
+Instruction: vreplvei.d vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Repeat the element in lane idx of a to fill whole vector.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = a.dword[idx];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsat_b (__m128i a, imm0_7 imm)

+

Synopsis

+
__m128i __lsx_vsat_b (__m128i a, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vsat.b vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Clamp signed 8-bit elements in a to the range [-2^imm, 2^imm - 1].

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = clamp<s8>(a.byte[i], -(1 << imm), (1 << imm) - 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vsat_bu (__m128i a, imm0_7 imm)

+

Synopsis

+
__m128i __lsx_vsat_bu (__m128i a, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vsat.bu vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Clamp unsigned 8-bit elements in a to the range [0, 2^(imm+1) - 1].

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = clamp<u8>(a.byte[i], 0, (1 << (imm + 1)) - 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vsat_h (__m128i a, imm0_15 imm)

+

Synopsis

+
__m128i __lsx_vsat_h (__m128i a, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vsat.h vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Clamp signed 16-bit elements in a to the range [-2^imm, 2^imm - 1].

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = clamp<s16>(a.half[i], -(1 << imm), (1 << imm) - 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vsat_hu (__m128i a, imm0_15 imm)

+

Synopsis

+
__m128i __lsx_vsat_hu (__m128i a, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vsat.hu vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Clamp unsigned 16-bit elements in a to the range [0, 2^(imm+1) - 1].

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = clamp<u16>(a.half[i], 0, (1 << (imm + 1)) - 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vsat_w (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vsat_w (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsat.w vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Clamp signed 32-bit elements in a to the range [-2^imm, 2^imm - 1].

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = clamp<s32>(a.word[i], -(1 << imm), (1 << imm) - 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vsat_wu (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vsat_wu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsat.wu vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Clamp unsigned 32-bit elements in a to the range [0, 2^(imm+1) - 1].

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = clamp<u32>(a.word[i], 0, (1 << (imm + 1)) - 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vsat_d (__m128i a, imm0_63 imm)

+

Synopsis

+
__m128i __lsx_vsat_d (__m128i a, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vsat.d vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Clamp signed 64-bit elements in a to the range [-2^imm, 2^imm - 1].

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = clamp<s64>(a.dword[i], -(1 << imm), (1 << imm) - 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vsat_du (__m128i a, imm0_63 imm)

+

Synopsis

+
__m128i __lsx_vsat_du (__m128i a, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vsat.du vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Clamp unsigned 64-bit elements in a to the range [0, 2^(imm+1) - 1].

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = clamp<u64>(a.dword[i], 0, (1 << (imm + 1)) - 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vsigncov_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsigncov_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsigncov.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

If the 8-bit element in a equals zero, set the result to zero. If the signed 8-bit element in a is positive, copy the element in b to the result. Otherwise, copy the negated element in b to the result. If a and b are the same vector, it is equivalent to computing the absolute value.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] =
+      (a.byte[i] == 0) ? 0 : ((s8)a.byte[i] > 0 ? b.byte[i] : -b.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600012
3C500012
+

__m128i __lsx_vsigncov_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsigncov_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsigncov.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

If the 16-bit element in a equals zero, set the result to zero. If the signed 16-bit element in a is positive, copy the element in b to the result. Otherwise, copy the negated element in b to the result. If a and b are the same vector, it is equivalent to computing the absolute value.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] =
+      (a.half[i] == 0) ? 0 : ((s16)a.half[i] > 0 ? b.half[i] : -b.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600012
3C500012
+

__m128i __lsx_vsigncov_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsigncov_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsigncov.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

If the 32-bit element in a equals zero, set the result to zero. If the signed 32-bit element in a is positive, copy the element in b to the result. Otherwise, copy the negated element in b to the result. If a and b are the same vector, it is equivalent to computing the absolute value.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] =
+      (a.word[i] == 0) ? 0 : ((s32)a.word[i] > 0 ? b.word[i] : -b.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600012
3C500012
+

__m128i __lsx_vsigncov_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsigncov_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsigncov.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

If the 64-bit element in a equals zero, set the result to zero. If the signed 64-bit element in a is positive, copy the element in b to the result. Otherwise, copy the negated element in b to the result. If a and b are the same vector, it is equivalent to computing the absolute value.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] =
+      (a.dword[i] == 0) ? 0 : ((s64)a.dword[i] > 0 ? b.dword[i] : -b.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600012
3C500012
+

__m128i __lsx_vldi (imm_n1024_1023 imm)

+

Synopsis

+
__m128i __lsx_vldi (imm_n1024_1023 imm)
+#include <lsxintrin.h>
+Instruction: vldi vr, imm
+CPU Flags: LSX
+
+

Description

+

Initialize dst using predefined patterns:

+
    +
  • imm[12:10]=0b000: broadcast imm[7:0] as 8-bit elements to all lanes
  • +
  • imm[12:10]=0b001: broadcast sign-extended imm[9:0] as 16-bit elements to all lanes
  • +
  • imm[12:10]=0b010: broadcast sign-extended imm[9:0] as 32-bit elements to all lanes
  • +
  • imm[12:10]=0b011: broadcast sign-extended imm[9:0] as 64-bit elements to all lanes
  • +
  • imm[12:8]=0b10000: broadcast imm[7:0] as 32-bit elements to all lanes
  • +
  • imm[12:8]=0b10001: broadcast imm[7:0] << 8 as 32-bit elements to all lanes
  • +
  • imm[12:8]=0b10010: broadcast imm[7:0] << 16 as 32-bit elements to all lanes
  • +
  • imm[12:8]=0b10011: broadcast imm[7:0] << 24 as 32-bit elements to all lanes
  • +
  • imm[12:8]=0b10100: broadcast imm[7:0] as 16-bit elements to all lanes
  • +
  • imm[12:8]=0b10101: broadcast imm[7:0] << 8 as 16-bit elements to all lanes
  • +
  • imm[12:8]=0b10110: broadcast (imm[7:0] << 8) | 0xFF as 32-bit elements to all lanes
  • +
  • imm[12:8]=0b10111: broadcast (imm[7:0] << 16) | 0xFFFF as 32-bit elements to all lanes
  • +
  • imm[12:8]=0b11000: broadcast imm[7:0] as 8-bit elements to all lanes
  • +
  • imm[12:8]=0b11001: repeat each bit of imm[7:0] eight times, and broadcast the result as 64-bit elements to all lanes
  • +
  • imm[12:8]=0b11010: broadcast (imm[7] << 31) | ((1-imm[6]) << 30) | ((imm[6] * 0x1F) << 25) | (imm[5:0] << 19) as 32-bit elements to all lanes
  • +
  • imm[12:8]=0b11011: broadcast (imm[7] << 31) | ((1-imm[6]) << 30) | ((imm[6] * 0x1F) << 25) | (imm[5:0] << 19) as 64-bit elements to all lanes
  • +
  • imm[12:8]=0b11100: broadcast (imm[7] << 63) | ((1-imm[6]) << 62) | ((imm[6] * 0xFF) << 54) | (imm[5:0] << 48) as 64-bit elements to all lanes
  • +
+

Operation

+
u64 imm12_10 = (imm >> 10) & 0b111;
+u64 imm12_8 = (imm >> 8) & 0b11111;
+u64 imm9_0 = imm & 0x3FF;
+s64 simm9_0 = ((s64)imm9_0 << 54) >> 54;
+u64 imm7_0 = imm & 0xFF;
+u64 imm7 = (imm >> 7) & 0x1;
+u64 imm6 = (imm >> 6) & 0x1;
+u64 imm5 = (imm >> 5) & 0x1;
+u64 imm5_0 = imm & 0x3F;
+u64 imm4 = (imm >> 4) & 0x1;
+u64 imm3 = (imm >> 3) & 0x1;
+u64 imm2 = (imm >> 2) & 0x1;
+u64 imm1 = (imm >> 1) & 0x1;
+u64 imm0 = imm & 0x1;
+
+u64 broadcast_value;
+u64 broadcast_width;
+if (imm12_10 == 0b000) {
+  broadcast_value = imm7_0;
+  broadcast_width = 8;
+} else if (imm12_10 == 0b001) {
+  broadcast_value = simm9_0;
+  broadcast_width = 16;
+} else if (imm12_10 == 0b010) {
+  broadcast_value = simm9_0;
+  broadcast_width = 32;
+} else if (imm12_10 == 0b011) {
+  broadcast_value = simm9_0;
+  broadcast_width = 64;
+} else if (imm12_8 == 0b10000) {
+  broadcast_value = imm7_0;
+  broadcast_width = 32;
+} else if (imm12_8 == 0b10001) {
+  broadcast_value = imm7_0 << 8;
+  broadcast_width = 32;
+} else if (imm12_8 == 0b10010) {
+  broadcast_value = imm7_0 << 16;
+  broadcast_width = 32;
+} else if (imm12_8 == 0b10011) {
+  broadcast_value = imm7_0 << 24;
+  broadcast_width = 32;
+} else if (imm12_8 == 0b10100) {
+  broadcast_value = imm7_0;
+  broadcast_width = 16;
+} else if (imm12_8 == 0b10101) {
+  broadcast_value = imm7_0 << 8;
+  broadcast_width = 16;
+} else if (imm12_8 == 0b10110) {
+  broadcast_value = (imm7_0 << 8) | 0xFF;
+  broadcast_width = 32;
+} else if (imm12_8 == 0b10111) {
+  broadcast_value = (imm7_0 << 16) | 0xFFFF;
+  broadcast_width = 32;
+} else if (imm12_8 == 0b11000) {
+  broadcast_value = imm7_0;
+  broadcast_width = 8;
+} else if (imm12_8 == 0b11001) {
+  broadcast_value = imm0 * 0xFF + imm1 * 0xFF00 + imm2 * 0xFF0000 +
+                    imm3 * 0xFF000000 + imm4 * 0xFF00000000 +
+                    imm5 * 0xFF0000000000 + imm6 * 0xFF000000000000 +
+                    imm7 * 0xFF00000000000000;
+  broadcast_width = 64;
+} else if (imm12_8 == 0b11010) {
+  broadcast_value = (imm7 << 31) | ((1 - imm6) << 30) | ((imm6 * 0x1F) << 25) |
+                    (imm5_0 << 19);
+  broadcast_width = 32;
+} else if (imm12_8 == 0b11011) {
+  broadcast_value = (imm7 << 31) | ((1 - imm6) << 30) | ((imm6 * 0x1F) << 25) |
+                    (imm5_0 << 19);
+  broadcast_width = 64;
+} else if (imm12_8 == 0b11100) {
+  broadcast_value = (imm7 << 63) | ((1 - imm6) << 62) | ((imm6 * 0xFF) << 54) |
+                    (imm5_0 << 48);
+  broadcast_width = 64;
+}
+
+if (broadcast_width == 8) {
+  for (int i = 0; i < 16; i++) {
+    dst.byte[i] = broadcast_value;
+  }
+} else if (broadcast_width == 16) {
+  for (int i = 0; i < 8; i++) {
+    dst.half[i] = broadcast_value;
+  }
+} else if (broadcast_width == 32) {
+  for (int i = 0; i < 4; i++) {
+    dst.word[i] = broadcast_value;
+  }
+} else if (broadcast_width == 64) {
+  for (int i = 0; i < 2; i++) {
+    dst.dword[i] = broadcast_value;
+  }
+}
+
+

Tested on real machine.

+ +
+
+ +
+
+ +
+ +
+ +
+ + + + « Previous + + + Next » + + +
+ + + + + + + + diff --git a/lsx/permutation/index.html b/lsx/permutation/index.html new file mode 100644 index 00000000..41aacfdf --- /dev/null +++ b/lsx/permutation/index.html @@ -0,0 +1,249 @@ + + + + + + + + Permutation - Unofficial LoongArch Intrinsics Guide + + + + + + + + + + + + + + +
+ + +
+ +
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Permutation

+

__m128i __lsx_vpermi_w (__m128i a, __m128i b, imm0_255 imm)

+

Synopsis

+
__m128i __lsx_vpermi_w (__m128i a, __m128i b, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vpermi.w vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Permute words from a and b with indices recorded in imm and store into dst.

+

Operation

+
dst.word[0] = b.word[imm & 0x3];
+dst.word[1] = b.word[(imm >> 2) & 0x3];
+dst.word[2] = a.word[(imm >> 4) & 0x3];
+dst.word[3] = a.word[(imm >> 6) & 0x3];
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+ +
+
+ +
+
+ +
+ +
+ +
+ + + + « Previous + + + Next » + + +
+ + + + + + + + diff --git a/lsx/shift/index.html b/lsx/shift/index.html new file mode 100644 index 00000000..aade3246 --- /dev/null +++ b/lsx/shift/index.html @@ -0,0 +1,7876 @@ + + + + + + + + Shift - Unofficial LoongArch Intrinsics Guide + + + + + + + + + + + + + + +
+ + +
+ +
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Shift

+

__m128i __lsx_vbsll_v (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vbsll_v (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vbsll.v vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compute whole vector a shifted left by imm * 8 bits.

+

Operation

+
int shift = (imm * 8) % 128;
+dst.qword[0] = (u128)a.qword[0] << shift;
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vbsrl_v (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vbsrl_v (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vbsrl.v vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compute whole vector a shifted right by imm * 8 bits.

+

Operation

+
int shift = (imm * 8) % 128;
+dst.qword[0] = (u128)a.qword[0] >> shift;
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsll_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsll_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsll.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Logical left shift the unsigned 8-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = a.byte[i] << (b.byte[i] & 0x7);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsll_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsll_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsll.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Logical left shift the unsigned 16-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = a.half[i] << (b.half[i] & 0xf);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsll_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsll_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsll.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Logical left shift the unsigned 32-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = a.word[i] << (b.word[i] & 0x1f);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsll_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsll_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsll.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Logical left shift the unsigned 64-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = a.dword[i] << (b.dword[i] & 0x3f);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vslli_b (__m128i a, imm0_7 imm)

+

Synopsis

+
__m128i __lsx_vslli_b (__m128i a, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vslli.b vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Logical left shift the unsigned 8-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = a.byte[i] << imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vslli_h (__m128i a, imm0_15 imm)

+

Synopsis

+
__m128i __lsx_vslli_h (__m128i a, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vslli.h vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Logical left shift the unsigned 16-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = a.half[i] << imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vslli_w (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vslli_w (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vslli.w vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Logical left shift the unsigned 32-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = a.word[i] << imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vslli_d (__m128i a, imm0_63 imm)

+

Synopsis

+
__m128i __lsx_vslli_d (__m128i a, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vslli.d vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Logical left shift the unsigned 64-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = a.dword[i] << imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsllwil_h_b (__m128i a, imm0_7 imm)

+

Synopsis

+
__m128i __lsx_vsllwil_h_b (__m128i a, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vsllwil.h.b vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Sign-extend the lower eight signed 8-bit elements in a to 16 bits, left shift each by imm, and store the signed 16-bit results to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (s16)(s8)a.byte[i] << imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500021
+

__m128i __lsx_vsllwil_hu_bu (__m128i a, imm0_7 imm)

+

Synopsis

+
__m128i __lsx_vsllwil_hu_bu (__m128i a, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vsllwil.hu.bu vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Zero-extend the lower eight unsigned 8-bit elements in a to 16 bits, left shift each by imm, and store the unsigned 16-bit results to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (u16)(u8)a.byte[i] << imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500021
+

__m128i __lsx_vsllwil_w_h (__m128i a, imm0_15 imm)

+

Synopsis

+
__m128i __lsx_vsllwil_w_h (__m128i a, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vsllwil.w.h vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Sign-extend the lower four signed 16-bit elements in a to 32 bits, left shift each by imm, and store the signed 32-bit results to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (s32)(s16)a.half[i] << imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500021
+

__m128i __lsx_vsllwil_wu_hu (__m128i a, imm0_15 imm)

+

Synopsis

+
__m128i __lsx_vsllwil_wu_hu (__m128i a, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vsllwil.wu.hu vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Zero-extend the lower four unsigned 16-bit elements in a to 32 bits, left shift each by imm, and store the unsigned 32-bit results to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (u32)(u16)a.half[i] << imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500021
+

__m128i __lsx_vsllwil_d_w (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vsllwil_d_w (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsllwil.d.w vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Sign-extend the lower two signed 32-bit elements in a to 64 bits, left shift each by imm, and store the signed 64-bit results to dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (s64)(s32)a.word[i] << imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500021
+

__m128i __lsx_vsllwil_du_wu (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vsllwil_du_wu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsllwil.du.wu vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Zero-extend the lower two unsigned 32-bit elements in a to 64 bits, left shift each by imm, and store the unsigned 64-bit results to dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (u64)(u32)a.word[i] << imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500021
+

__m128i __lsx_vsra_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsra_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsra.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift the signed 8-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = ((s8)a.byte[i]) >> (b.byte[i] & 0x7);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsra_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsra_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsra.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift the signed 16-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = ((s16)a.half[i]) >> (b.half[i] & 0xf);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsra_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsra_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsra.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift the signed 32-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = ((s32)a.word[i]) >> (b.word[i] & 0x1f);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsra_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsra_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsra.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift the signed 64-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = ((s64)a.dword[i]) >> (b.dword[i] & 0x3f);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsrai_b (__m128i a, imm0_7 imm)

+

Synopsis

+
__m128i __lsx_vsrai_b (__m128i a, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vsrai.b vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift the signed 8-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = ((s8)a.byte[i]) >> imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsrai_h (__m128i a, imm0_15 imm)

+

Synopsis

+
__m128i __lsx_vsrai_h (__m128i a, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vsrai.h vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift the signed 16-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = ((s16)a.half[i]) >> imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsrai_w (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vsrai_w (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsrai.w vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift the signed 32-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = ((s32)a.word[i]) >> imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsrai_d (__m128i a, imm0_63 imm)

+

Synopsis

+
__m128i __lsx_vsrai_d (__m128i a, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vsrai.d vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift the signed 64-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = ((s64)a.dword[i]) >> imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsran_b_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsran_b_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsran.b.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift the signed 16-bit elements in a by elements in b, truncate to 8-bit and store the result to the lower half of dst; the upper half of dst is set to zero.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = (i < 8) ? (s8)((s16)a.half[i] >> (b.half[i] & 15)) : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500021
+

__m128i __lsx_vsran_h_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsran_h_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsran.h.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift the signed 32-bit elements in a by elements in b, truncate to 16-bit and store the result to the lower half of dst; the upper half of dst is set to zero.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (i < 4) ? (s16)((s32)a.word[i] >> (b.word[i] & 31)) : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500021
+

__m128i __lsx_vsran_w_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsran_w_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsran.w.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift the signed 64-bit elements in a by elements in b, truncate to 32-bit and store the result to the lower half of dst; the upper half of dst is set to zero.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (i < 2) ? (s32)((s64)a.dword[i] >> (b.dword[i] & 63)) : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500021
+

__m128i __lsx_vsrani_b_h (__m128i a, __m128i b, imm0_15 imm)

+

Synopsis

+
__m128i __lsx_vsrani_b_h (__m128i a, __m128i b, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vsrani.b.h vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift the signed 16-bit elements in a and b by imm, truncate to 8-bit and store the result to dst: results from b fill the lower half of dst and results from a fill the upper half.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] =
+      (i < 8) ? (s8)((s16)b.half[i] >> imm) : (s8)((s16)a.half[i - 8] >> imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m128i __lsx_vsrani_h_w (__m128i a, __m128i b, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vsrani_h_w (__m128i a, __m128i b, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsrani.h.w vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift the signed 32-bit elements in a and b by imm, truncate to 16-bit and store the result to dst: results from b fill the lower half of dst and results from a fill the upper half.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] =
+      (i < 4) ? (s16)((s32)b.word[i] >> imm) : (s16)((s32)a.word[i - 4] >> imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m128i __lsx_vsrani_w_d (__m128i a, __m128i b, imm0_63 imm)

+

Synopsis

+
__m128i __lsx_vsrani_w_d (__m128i a, __m128i b, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vsrani.w.d vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift the signed 64-bit elements in a and b by imm, truncate to 32-bit and store the result to dst: results from b fill the lower half of dst and results from a fill the upper half.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (i < 2) ? (s32)((s64)b.dword[i] >> imm)
+                        : (s32)((s64)a.dword[i - 2] >> imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m128i __lsx_vsrani_d_q (__m128i a, __m128i b, imm0_127 imm)

+

Synopsis

+
__m128i __lsx_vsrani_d_q (__m128i a, __m128i b, imm0_127 imm)
+#include <lsxintrin.h>
+Instruction: vsrani.d.q vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift the signed 128-bit elements in a and b by imm, truncate to 64-bit and store the result to dst: the result from b fills the lower half of dst and the result from a fills the upper half.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (i < 1) ? (s64)((s128)b.qword[i] >> imm)
+                         : (s64)((s128)a.qword[i - 1] >> imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m128i __lsx_vsrar_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsrar_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrar.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 8-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if ((b.byte[i] & 0x7) == 0) {
+    dst.byte[i] = a.byte[i];
+  } else {
+    dst.byte[i] = ((s8)a.byte[i] >> (b.byte[i] & 0x7)) +
+                  (((s8)a.byte[i] >> ((b.byte[i] & 0x7) - 1)) & 0x1);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m128i __lsx_vsrar_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsrar_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrar.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 16-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if ((b.half[i] & 0xf) == 0) {
+    dst.half[i] = a.half[i];
+  } else {
+    dst.half[i] = ((s16)a.half[i] >> (b.half[i] & 0xf)) +
+                  (((s16)a.half[i] >> ((b.half[i] & 0xf) - 1)) & 0x1);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m128i __lsx_vsrar_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsrar_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrar.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 32-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if ((b.word[i] & 0x1f) == 0) {
+    dst.word[i] = a.word[i];
+  } else {
+    dst.word[i] = ((s32)a.word[i] >> (b.word[i] & 0x1f)) +
+                  (((s32)a.word[i] >> ((b.word[i] & 0x1f) - 1)) & 0x1);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m128i __lsx_vsrar_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsrar_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrar.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 64-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if ((b.dword[i] & 0x3f) == 0) {
+    dst.dword[i] = a.dword[i];
+  } else {
+    dst.dword[i] = ((s64)a.dword[i] >> (b.dword[i] & 0x3f)) +
+                   (((s64)a.dword[i] >> ((b.dword[i] & 0x3f) - 1)) & 0x1);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m128i __lsx_vsrari_b (__m128i a, imm0_7 imm)

+

Synopsis

+
__m128i __lsx_vsrari_b (__m128i a, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vsrari.b vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 8-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (imm == 0) {
+    dst.byte[i] = a.byte[i];
+  } else {
+    dst.byte[i] = ((s8)a.byte[i] >> imm) + (((s8)a.byte[i] >> (imm - 1)) & 0x1);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m128i __lsx_vsrari_h (__m128i a, imm0_15 imm)

+

Synopsis

+
__m128i __lsx_vsrari_h (__m128i a, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vsrari.h vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 16-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (imm == 0) {
+    dst.half[i] = a.half[i];
+  } else {
+    dst.half[i] =
+        ((s16)a.half[i] >> imm) + (((s16)a.half[i] >> (imm - 1)) & 0x1);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m128i __lsx_vsrari_w (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vsrari_w (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsrari.w vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 32-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (imm == 0) {
+    dst.word[i] = a.word[i];
+  } else {
+    dst.word[i] =
+        ((s32)a.word[i] >> imm) + (((s32)a.word[i] >> (imm - 1)) & 0x1);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m128i __lsx_vsrari_d (__m128i a, imm0_63 imm)

+

Synopsis

+
__m128i __lsx_vsrari_d (__m128i a, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vsrari.d vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 64-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (imm == 0) {
+    dst.dword[i] = a.dword[i];
+  } else {
+    dst.dword[i] =
+        ((s64)a.dword[i] >> imm) + (((s64)a.dword[i] >> (imm - 1)) & 0x1);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m128i __lsx_vsrarn_b_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsrarn_b_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrarn.b.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 16-bit elements in a by elements in b, truncate to 8-bit and store the result to the lower half of dst; the upper half of dst is set to zero.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    u8 shift = (b.half[i] & 15);
+    if (shift == 0) {
+      dst.byte[i] = (s8)(s16)a.half[i];
+    } else {
+      dst.byte[i] = (s8)(((s16)a.half[i] >> shift) +
+                         (((s16)a.half[i] >> (shift - 1)) & 0x1));
+    }
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m128i __lsx_vsrarn_h_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsrarn_h_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrarn.h.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 32-bit elements in a by elements in b, truncate to 16-bit and store the result to the lower half of dst; the upper half of dst is set to zero.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    u8 shift = (b.word[i] & 31);
+    if (shift == 0) {
+      dst.half[i] = (s16)(s32)a.word[i];
+    } else {
+      dst.half[i] = (s16)(((s32)a.word[i] >> shift) +
+                          (((s32)a.word[i] >> (shift - 1)) & 0x1));
+    }
+  } else {
+    dst.half[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m128i __lsx_vsrarn_w_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsrarn_w_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrarn.w.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 64-bit elements in a by elements in b, truncate to 32-bit and store the result to the lower half of dst; the upper half of dst is set to zero.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    u8 shift = (b.dword[i] & 63);
+    if (shift == 0) {
+      dst.word[i] = (s32)(s64)a.dword[i];
+    } else {
+      dst.word[i] = (s32)(((s64)a.dword[i] >> shift) +
+                          (((s64)a.dword[i] >> (shift - 1)) & 0x1));
+    }
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m128i __lsx_vsrarni_b_h (__m128i a, __m128i b, imm0_15 imm)

+

Synopsis

+
__m128i __lsx_vsrarni_b_h (__m128i a, __m128i b, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vsrarni.b.h vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 16-bit elements in a and b by imm, truncate to 8-bit and store the result to dst: results from b fill the lower half of dst and results from a fill the upper half.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    if (imm == 0) {
+      dst.byte[i] = (s8)(s16)b.half[i];
+    } else {
+      dst.byte[i] =
+          (s8)(((s16)b.half[i] >> imm) + (((s16)b.half[i] >> (imm - 1)) & 0x1));
+    }
+  } else {
+    if (imm == 0) {
+      dst.byte[i] = (s8)(s16)a.half[i - 8];
+    } else {
+      dst.byte[i] = (s8)(((s16)a.half[i - 8] >> imm) +
+                         (((s16)a.half[i - 8] >> (imm - 1)) & 0x1));
+    }
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m128i __lsx_vsrarni_h_w (__m128i a, __m128i b, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vsrarni_h_w (__m128i a, __m128i b, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsrarni.h.w vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 32-bit elements in a and b by imm, truncate to 16-bit and store the result to dst: results from b fill the lower half of dst and results from a fill the upper half.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    if (imm == 0) {
+      dst.half[i] = (s16)(s32)b.word[i];
+    } else {
+      dst.half[i] = (s16)(((s32)b.word[i] >> imm) +
+                          (((s32)b.word[i] >> (imm - 1)) & 0x1));
+    }
+  } else {
+    if (imm == 0) {
+      dst.half[i] = (s16)(s32)a.word[i - 4];
+    } else {
+      dst.half[i] = (s16)(((s32)a.word[i - 4] >> imm) +
+                          (((s32)a.word[i - 4] >> (imm - 1)) & 0x1));
+    }
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m128i __lsx_vsrarni_w_d (__m128i a, __m128i b, imm0_63 imm)

+

Synopsis

+
__m128i __lsx_vsrarni_w_d (__m128i a, __m128i b, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vsrarni.w.d vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 64-bit elements in a and b by imm, truncate to 32-bit and store the result to dst: results from b fill the lower half of dst and results from a fill the upper half.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    if (imm == 0) {
+      dst.word[i] = (s32)(s64)b.dword[i];
+    } else {
+      dst.word[i] = (s32)(((s64)b.dword[i] >> imm) +
+                          (((s64)b.dword[i] >> (imm - 1)) & 0x1));
+    }
+  } else {
+    if (imm == 0) {
+      dst.word[i] = (s32)(s64)a.dword[i - 2];
+    } else {
+      dst.word[i] = (s32)(((s64)a.dword[i - 2] >> imm) +
+                          (((s64)a.dword[i - 2] >> (imm - 1)) & 0x1));
+    }
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m128i __lsx_vsrarni_d_q (__m128i a, __m128i b, imm0_127 imm)

+

Synopsis

+
__m128i __lsx_vsrarni_d_q (__m128i a, __m128i b, imm0_127 imm)
+#include <lsxintrin.h>
+Instruction: vsrarni.d.q vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 128-bit elements in a and b by imm, truncate to 64-bit and store the result to dst: the result from b fills the lower half of dst and the result from a fills the upper half.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (i < 1) {
+    if (imm == 0) {
+      dst.dword[i] = (s64)(s128)b.qword[i];
+    } else {
+      dst.dword[i] = (s64)(((s128)b.qword[i] >> imm) +
+                           (((s128)b.qword[i] >> (imm - 1)) & 0x1));
+    }
+  } else {
+    if (imm == 0) {
+      dst.dword[i] = (s64)(s128)a.qword[i - 1];
+    } else {
+      dst.dword[i] = (s64)(((s128)a.qword[i - 1] >> imm) +
+                           (((s128)a.qword[i - 1] >> (imm - 1)) & 0x1));
+    }
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m128i __lsx_vsrl_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsrl_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrl.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Logical right shift the unsigned 8-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = a.byte[i] >> (b.byte[i] & 0x7);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsrl_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsrl_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrl.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Logical right shift the unsigned 16-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = a.half[i] >> (b.half[i] & 0xf);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsrl_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsrl_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrl.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Logical right shift the unsigned 32-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = a.word[i] >> (b.word[i] & 0x1f);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsrl_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsrl_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrl.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Logical right shift the unsigned 64-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = a.dword[i] >> (b.dword[i] & 0x3f);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsrli_b (__m128i a, imm0_7 imm)

+

Synopsis

+
__m128i __lsx_vsrli_b (__m128i a, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vsrli.b vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Logical right shift the unsigned 8-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = a.byte[i] >> imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsrli_h (__m128i a, imm0_15 imm)

+

Synopsis

+
__m128i __lsx_vsrli_h (__m128i a, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vsrli.h vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Logical right shift the unsigned 16-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = a.half[i] >> imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsrli_w (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vsrli_w (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsrli.w vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Logical right shift the unsigned 32-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = a.word[i] >> imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 1 | 4 |
| 3C5000 | 1 | 2 |
+

__m128i __lsx_vsrli_d (__m128i a, imm0_63 imm)

+

Synopsis

+
__m128i __lsx_vsrli_d (__m128i a, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vsrli.d vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Logical right shift the unsigned 64-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = a.dword[i] >> imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 1 | 4 |
| 3C5000 | 1 | 2 |
+

__m128i __lsx_vsrln_b_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsrln_b_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrln.b.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Logical right shift the unsigned 16-bit elements in a by elements in b, truncate to 8-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = (i < 8) ? (u8)((u16)a.half[i] >> (b.half[i] & 15)) : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 2 | 2 |
| 3C5000 | 2 | 1 |
+

__m128i __lsx_vsrln_h_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsrln_h_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrln.h.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Logical right shift the unsigned 32-bit elements in a by elements in b, truncate to 16-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (i < 4) ? (u16)((u32)a.word[i] >> (b.word[i] & 31)) : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 2 | 2 |
| 3C5000 | 2 | 1 |
+

__m128i __lsx_vsrln_w_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsrln_w_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrln.w.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Logical right shift the unsigned 64-bit elements in a by elements in b, truncate to 32-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (i < 2) ? (u32)((u64)a.dword[i] >> (b.dword[i] & 63)) : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 2 | 2 |
| 3C5000 | 2 | 1 |
+

__m128i __lsx_vsrlni_b_h (__m128i a, __m128i b, imm0_15 imm)

+

Synopsis

+
__m128i __lsx_vsrlni_b_h (__m128i a, __m128i b, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vsrlni.b.h vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Logical right shift the unsigned 16-bit elements in a and b by imm, truncate to 8-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] =
+      (i < 8) ? (u8)((u16)b.half[i] >> imm) : (u8)((u16)a.half[i - 8] >> imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
+

__m128i __lsx_vsrlni_h_w (__m128i a, __m128i b, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vsrlni_h_w (__m128i a, __m128i b, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsrlni.h.w vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Logical right shift the unsigned 32-bit elements in a and b by imm, truncate to 16-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] =
+      (i < 4) ? (u16)((u32)b.word[i] >> imm) : (u16)((u32)a.word[i - 4] >> imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
+

__m128i __lsx_vsrlni_w_d (__m128i a, __m128i b, imm0_63 imm)

+

Synopsis

+
__m128i __lsx_vsrlni_w_d (__m128i a, __m128i b, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vsrlni.w.d vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Logical right shift the unsigned 64-bit elements in a and b by imm, truncate to 32-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (i < 2) ? (u32)((u64)b.dword[i] >> imm)
+                        : (u32)((u64)a.dword[i - 2] >> imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
+

__m128i __lsx_vsrlni_d_q (__m128i a, __m128i b, imm0_127 imm)

+

Synopsis

+
__m128i __lsx_vsrlni_d_q (__m128i a, __m128i b, imm0_127 imm)
+#include <lsxintrin.h>
+Instruction: vsrlni.d.q vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Logical right shift the unsigned 128-bit elements in a and b by imm, truncate to 64-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (i < 1) ? (u64)((u128)b.qword[i] >> imm)
+                         : (u64)((u128)a.qword[i - 1] >> imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 3 | 2 |
| 3C5000 | 3 | 2 |
+

__m128i __lsx_vsrlr_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsrlr_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrlr.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Logical right shift (with rounding) the unsigned 8-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if ((b.byte[i] & 0x7) == 0) {
+    dst.byte[i] = a.byte[i];
+  } else {
+    dst.byte[i] = (a.byte[i] >> (b.byte[i] & 0x7)) +
+                  ((a.byte[i] >> ((b.byte[i] & 0x7) - 1)) & 0x1);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 3 | 2 |
| 3C5000 | 3 | 2 |
+

__m128i __lsx_vsrlr_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsrlr_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrlr.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Logical right shift (with rounding) the unsigned 16-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if ((b.half[i] & 0xf) == 0) {
+    dst.half[i] = a.half[i];
+  } else {
+    dst.half[i] = (a.half[i] >> (b.half[i] & 0xf)) +
+                  ((a.half[i] >> ((b.half[i] & 0xf) - 1)) & 0x1);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 3 | 2 |
| 3C5000 | 3 | 2 |
+

__m128i __lsx_vsrlr_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsrlr_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrlr.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Logical right shift (with rounding) the unsigned 32-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if ((b.word[i] & 0x1f) == 0) {
+    dst.word[i] = a.word[i];
+  } else {
+    dst.word[i] = (a.word[i] >> (b.word[i] & 0x1f)) +
+                  ((a.word[i] >> ((b.word[i] & 0x1f) - 1)) & 0x1);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 3 | 2 |
| 3C5000 | 3 | 2 |
+

__m128i __lsx_vsrlr_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsrlr_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrlr.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Logical right shift (with rounding) the unsigned 64-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if ((b.dword[i] & 0x3f) == 0) {
+    dst.dword[i] = a.dword[i];
+  } else {
+    dst.dword[i] = (a.dword[i] >> (b.dword[i] & 0x3f)) +
+                   ((a.dword[i] >> ((b.dword[i] & 0x3f) - 1)) & 0x1);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 3 | 2 |
| 3C5000 | 3 | 2 |
+

__m128i __lsx_vsrlri_b (__m128i a, imm0_7 imm)

+

Synopsis

+
__m128i __lsx_vsrlri_b (__m128i a, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vsrlri.b vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Logical right shift (with rounding) the unsigned 8-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (imm == 0) {
+    dst.byte[i] = a.byte[i];
+  } else {
+    dst.byte[i] = (a.byte[i] >> imm) + ((a.byte[i] >> (imm - 1)) & 0x1);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 3 | 2 |
| 3C5000 | 3 | 2 |
+

__m128i __lsx_vsrlri_h (__m128i a, imm0_15 imm)

+

Synopsis

+
__m128i __lsx_vsrlri_h (__m128i a, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vsrlri.h vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Logical right shift (with rounding) the unsigned 16-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (imm == 0) {
+    dst.half[i] = a.half[i];
+  } else {
+    dst.half[i] = (a.half[i] >> imm) + ((a.half[i] >> (imm - 1)) & 0x1);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 3 | 2 |
| 3C5000 | 3 | 2 |
+

__m128i __lsx_vsrlri_w (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vsrlri_w (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsrlri.w vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Logical right shift (with rounding) the unsigned 32-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (imm == 0) {
+    dst.word[i] = a.word[i];
+  } else {
+    dst.word[i] = (a.word[i] >> imm) + ((a.word[i] >> (imm - 1)) & 0x1);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 3 | 2 |
| 3C5000 | 3 | 2 |
+

__m128i __lsx_vsrlri_d (__m128i a, imm0_63 imm)

+

Synopsis

+
__m128i __lsx_vsrlri_d (__m128i a, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vsrlri.d vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Logical right shift (with rounding) the unsigned 64-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (imm == 0) {
+    dst.dword[i] = a.dword[i];
+  } else {
+    dst.dword[i] = (a.dword[i] >> imm) + ((a.dword[i] >> (imm - 1)) & 0x1);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 3 | 2 |
| 3C5000 | 3 | 2 |
+

__m128i __lsx_vsrlrn_b_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsrlrn_b_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrlrn.b.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Logical right shift (with rounding) the unsigned 16-bit elements in a by elements in b, truncate to 8-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    u8 shift = (b.half[i] & 15);
+    if (shift == 0) {
+      dst.byte[i] = (u8)(u16)a.half[i];
+    } else {
+      dst.byte[i] = (u8)(((u16)a.half[i] >> shift) +
+                         (((u16)a.half[i] >> (shift - 1)) & 0x1));
+    }
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
+

__m128i __lsx_vsrlrn_h_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsrlrn_h_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrlrn.h.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Logical right shift (with rounding) the unsigned 32-bit elements in a by elements in b, truncate to 16-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    u8 shift = (b.word[i] & 31);
+    if (shift == 0) {
+      dst.half[i] = (u16)(u32)a.word[i];
+    } else {
+      dst.half[i] = (u16)(((u32)a.word[i] >> shift) +
+                          (((u32)a.word[i] >> (shift - 1)) & 0x1));
+    }
+  } else {
+    dst.half[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
+

__m128i __lsx_vsrlrn_w_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsrlrn_w_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrlrn.w.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Logical right shift (with rounding) the unsigned 64-bit elements in a by elements in b, truncate to 32-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    u8 shift = (b.dword[i] & 63);
+    if (shift == 0) {
+      dst.word[i] = (u32)(u64)a.dword[i];
+    } else {
+      dst.word[i] = (u32)(((u64)a.dword[i] >> shift) +
+                          (((u64)a.dword[i] >> (shift - 1)) & 0x1));
+    }
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
+

__m128i __lsx_vsrlrni_b_h (__m128i a, __m128i b, imm0_15 imm)

+

Synopsis

+
__m128i __lsx_vsrlrni_b_h (__m128i a, __m128i b, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vsrlrni.b.h vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Logical right shift (with rounding) the unsigned 16-bit elements in a and b by imm, truncate to 8-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    if (imm == 0) {
+      dst.byte[i] = (u8)(u16)b.half[i];
+    } else {
+      dst.byte[i] =
+          (u8)(((u16)b.half[i] >> imm) + (((u16)b.half[i] >> (imm - 1)) & 0x1));
+    }
+  } else {
+    if (imm == 0) {
+      dst.byte[i] = (u8)(u16)a.half[i - 8];
+    } else {
+      dst.byte[i] = (u8)(((u16)a.half[i - 8] >> imm) +
+                         (((u16)a.half[i - 8] >> (imm - 1)) & 0x1));
+    }
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
+

__m128i __lsx_vsrlrni_h_w (__m128i a, __m128i b, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vsrlrni_h_w (__m128i a, __m128i b, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsrlrni.h.w vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Logical right shift (with rounding) the unsigned 32-bit elements in a and b by imm, truncate to 16-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    if (imm == 0) {
+      dst.half[i] = (u16)(u32)b.word[i];
+    } else {
+      dst.half[i] = (u16)(((u32)b.word[i] >> imm) +
+                          (((u32)b.word[i] >> (imm - 1)) & 0x1));
+    }
+  } else {
+    if (imm == 0) {
+      dst.half[i] = (u16)(u32)a.word[i - 4];
+    } else {
+      dst.half[i] = (u16)(((u32)a.word[i - 4] >> imm) +
+                          (((u32)a.word[i - 4] >> (imm - 1)) & 0x1));
+    }
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
+

__m128i __lsx_vsrlrni_w_d (__m128i a, __m128i b, imm0_63 imm)

+

Synopsis

+
__m128i __lsx_vsrlrni_w_d (__m128i a, __m128i b, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vsrlrni.w.d vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Logical right shift (with rounding) the unsigned 64-bit elements in a and b by imm, truncate to 32-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    if (imm == 0) {
+      dst.word[i] = (u32)(u64)b.dword[i];
+    } else {
+      dst.word[i] = (u32)(((u64)b.dword[i] >> imm) +
+                          (((u64)b.dword[i] >> (imm - 1)) & 0x1));
+    }
+  } else {
+    if (imm == 0) {
+      dst.word[i] = (u32)(u64)a.dword[i - 2];
+    } else {
+      dst.word[i] = (u32)(((u64)a.dword[i - 2] >> imm) +
+                          (((u64)a.dword[i - 2] >> (imm - 1)) & 0x1));
+    }
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
+

__m128i __lsx_vsrlrni_d_q (__m128i a, __m128i b, imm0_127 imm)

+

Synopsis

+
__m128i __lsx_vsrlrni_d_q (__m128i a, __m128i b, imm0_127 imm)
+#include <lsxintrin.h>
+Instruction: vsrlrni.d.q vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Logical right shift (with rounding) the unsigned 128-bit elements in a and b by imm, truncate to 64-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (i < 1) {
+    if (imm == 0) {
+      dst.dword[i] = (u64)(u128)b.qword[i];
+    } else {
+      dst.dword[i] = (u64)(((u128)b.qword[i] >> imm) +
+                           (((u128)b.qword[i] >> (imm - 1)) & 0x1));
+    }
+  } else {
+    if (imm == 0) {
+      dst.dword[i] = (u64)(u128)a.qword[i - 1];
+    } else {
+      dst.dword[i] = (u64)(((u128)a.qword[i - 1] >> imm) +
+                           (((u128)a.qword[i - 1] >> (imm - 1)) & 0x1));
+    }
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 3 | 2 |
| 3C5000 | 3 | 2 |
+

__m128i __lsx_vssran_b_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vssran_b_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssran.b.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift the signed 16-bit elements in a by elements in b, clamp to fit in signed 8-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    s16 temp = (s16)a.half[i] >> (b.half[i] & 15);
+    dst.byte[i] = clamp<s16>(temp, -128, 127);
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
+

__m128i __lsx_vssran_bu_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vssran_bu_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssran.bu.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift the signed 16-bit elements in a by elements in b, clamp to fit in unsigned 8-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    s16 temp = (s16)a.half[i] >> (b.half[i] & 15);
+    dst.byte[i] = clamp<s16>(temp, 0, 255);
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
+

__m128i __lsx_vssran_h_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vssran_h_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssran.h.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift the signed 32-bit elements in a by elements in b, clamp to fit in signed 16-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    s32 temp = (s32)a.word[i] >> (b.word[i] & 31);
+    dst.half[i] = clamp<s32>(temp, -32768, 32767);
+  } else {
+    dst.half[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
+

__m128i __lsx_vssran_hu_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vssran_hu_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssran.hu.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift the signed 32-bit elements in a by elements in b, clamp to fit in unsigned 16-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    s32 temp = (s32)a.word[i] >> (b.word[i] & 31);
+    dst.half[i] = clamp<s32>(temp, 0, 65535);
+  } else {
+    dst.half[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
+

__m128i __lsx_vssran_w_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vssran_w_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssran.w.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift the signed 64-bit elements in a by elements in b, clamp to fit in signed 32-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    s64 temp = (s64)a.dword[i] >> (b.dword[i] & 63);
+    dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
+

__m128i __lsx_vssran_wu_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vssran_wu_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssran.wu.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift the signed 64-bit elements in a by elements in b, clamp to fit in unsigned 32-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    s64 temp = (s64)a.dword[i] >> (b.dword[i] & 63);
+    dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
+

__m128i __lsx_vssrani_b_h (__m128i a, __m128i b, imm0_15 imm)

+

Synopsis

+
__m128i __lsx_vssrani_b_h (__m128i a, __m128i b, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vssrani.b.h vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift the signed 16-bit elements in a and b by imm, clamp to fit in signed 8-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    s16 temp = (s16)b.half[i] >> imm;
+    dst.byte[i] = clamp<s16>(temp, -128, 127);
+  } else {
+    s16 temp = (s16)a.half[i - 8] >> imm;
+    dst.byte[i] = clamp<s16>(temp, -128, 127);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
+

__m128i __lsx_vssrani_bu_h (__m128i a, __m128i b, imm0_15 imm)

+

Synopsis

+
__m128i __lsx_vssrani_bu_h (__m128i a, __m128i b, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vssrani.bu.h vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift the signed 16-bit elements in a and b by imm, clamp to fit in unsigned 8-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    s16 temp = (s16)b.half[i] >> imm;
+    dst.byte[i] = clamp<s16>(temp, 0, 255);
+  } else {
+    s16 temp = (s16)a.half[i - 8] >> imm;
+    dst.byte[i] = clamp<s16>(temp, 0, 255);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
+

__m128i __lsx_vssrani_h_w (__m128i a, __m128i b, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vssrani_h_w (__m128i a, __m128i b, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vssrani.h.w vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift the signed 32-bit elements in a and b by imm, clamp to fit in signed 16-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    s32 temp = (s32)b.word[i] >> imm;
+    dst.half[i] = clamp<s32>(temp, -32768, 32767);
+  } else {
+    s32 temp = (s32)a.word[i - 4] >> imm;
+    dst.half[i] = clamp<s32>(temp, -32768, 32767);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
+

__m128i __lsx_vssrani_hu_w (__m128i a, __m128i b, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vssrani_hu_w (__m128i a, __m128i b, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vssrani.hu.w vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift the signed 32-bit elements in a and b by imm, clamp to fit in unsigned 16-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    s32 temp = (s32)b.word[i] >> imm;
+    dst.half[i] = clamp<s32>(temp, 0, 65535);
+  } else {
+    s32 temp = (s32)a.word[i - 4] >> imm;
+    dst.half[i] = clamp<s32>(temp, 0, 65535);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
+

__m128i __lsx_vssrani_w_d (__m128i a, __m128i b, imm0_63 imm)

+

Synopsis

+
__m128i __lsx_vssrani_w_d (__m128i a, __m128i b, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vssrani.w.d vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift the signed 64-bit elements in a and b by imm, clamp to fit in signed 32-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    s64 temp = (s64)b.dword[i] >> imm;
+    dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+  } else {
+    s64 temp = (s64)a.dword[i - 2] >> imm;
+    dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
+

__m128i __lsx_vssrani_wu_d (__m128i a, __m128i b, imm0_63 imm)

+

Synopsis

+
__m128i __lsx_vssrani_wu_d (__m128i a, __m128i b, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vssrani.wu.d vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift the signed 64-bit elements in a and b by imm, clamp to fit in unsigned 32-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    s64 temp = (s64)b.dword[i] >> imm;
+    dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+  } else {
+    s64 temp = (s64)a.dword[i - 2] >> imm;
+    dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
+

__m128i __lsx_vssrani_d_q (__m128i a, __m128i b, imm0_127 imm)

+

Synopsis

+
__m128i __lsx_vssrani_d_q (__m128i a, __m128i b, imm0_127 imm)
+#include <lsxintrin.h>
+Instruction: vssrani.d.q vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift the signed 128-bit elements in a and b by imm, clamp to fit in signed 64-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (i < 1) {
+    s128 temp = (s128)b.qword[i] >> imm;
+    dst.dword[i] = clamp<s128>(temp, -9223372036854775808, 9223372036854775807);
+  } else {
+    s128 temp = (s128)a.qword[i - 1] >> imm;
+    dst.dword[i] = clamp<s128>(temp, -9223372036854775808, 9223372036854775807);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 3 | 2 |
| 3C5000 | 3 | 2 |
+

__m128i __lsx_vssrani_du_q (__m128i a, __m128i b, imm0_127 imm)

+

Synopsis

+
__m128i __lsx_vssrani_du_q (__m128i a, __m128i b, imm0_127 imm)
+#include <lsxintrin.h>
+Instruction: vssrani.du.q vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift the signed 128-bit elements in a and b by imm, clamp to fit in unsigned 64-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (i < 1) {
+    s128 temp = (s128)b.qword[i] >> imm;
+    dst.dword[i] = clamp<s128>(temp, 0, 18446744073709551615);
+  } else {
+    s128 temp = (s128)a.qword[i - 1] >> imm;
+    dst.dword[i] = clamp<s128>(temp, 0, 18446744073709551615);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 3 | 2 |
| 3C5000 | 3 | 2 |
+

__m128i __lsx_vssrarn_b_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vssrarn_b_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrarn.b.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 16-bit elements in a by elements in b, clamp to fit in signed 8-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    s16 temp;
+    if ((b.half[i] & 15) == 0) {
+      temp = (s16)a.half[i];
+    } else {
+      temp = ((s16)a.half[i] >> (b.half[i] & 15)) +
+             (((s16)a.half[i] >> ((b.half[i] & 15) - 1)) & 1);
+    }
+    dst.byte[i] = clamp<s16>(temp, -128, 127);
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
+

__m128i __lsx_vssrarn_bu_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vssrarn_bu_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrarn.bu.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 16-bit elements in a by elements in b, clamp to fit in unsigned 8-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    s16 temp;
+    if ((b.half[i] & 15) == 0) {
+      temp = (s16)a.half[i];
+    } else {
+      temp = ((s16)a.half[i] >> (b.half[i] & 15)) +
+             (((s16)a.half[i] >> ((b.half[i] & 15) - 1)) & 1);
+    }
+    dst.byte[i] = clamp<s16>(temp, 0, 255);
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
+

__m128i __lsx_vssrarn_h_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vssrarn_h_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrarn.h.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 32-bit elements in a by elements in b, clamp to fit in signed 16-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    s32 temp;
+    if ((b.word[i] & 31) == 0) {
+      temp = (s32)a.word[i];
+    } else {
+      temp = ((s32)a.word[i] >> (b.word[i] & 31)) +
+             (((s32)a.word[i] >> ((b.word[i] & 31) - 1)) & 1);
+    }
+    dst.half[i] = clamp<s32>(temp, -32768, 32767);
+  } else {
+    dst.half[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
+

__m128i __lsx_vssrarn_hu_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vssrarn_hu_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrarn.hu.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 32-bit elements in a by elements in b, clamp to fit in unsigned 16-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    s32 temp;
+    if ((b.word[i] & 31) == 0) {
+      temp = (s32)a.word[i];
+    } else {
+      temp = ((s32)a.word[i] >> (b.word[i] & 31)) +
+             (((s32)a.word[i] >> ((b.word[i] & 31) - 1)) & 1);
+    }
+    dst.half[i] = clamp<s32>(temp, 0, 65535);
+  } else {
+    dst.half[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
+

__m128i __lsx_vssrarn_w_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vssrarn_w_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrarn.w.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 64-bit elements in a by elements in b, clamp to fit in signed 32-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    s64 temp;
+    if ((b.dword[i] & 63) == 0) {
+      temp = (s64)a.dword[i];
+    } else {
+      temp = ((s64)a.dword[i] >> (b.dword[i] & 63)) +
+             (((s64)a.dword[i] >> ((b.dword[i] & 63) - 1)) & 1);
+    }
+    dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vssrarn_wu_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vssrarn_wu_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrarn.wu.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

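Arithmetic right shift (with rounding) each signed 64-bit element in a by the low 6 bits of the corresponding 64-bit element in b, clamp each result to the unsigned 32-bit range (0 to 4294967295), and store the results in 32-bit elements 0 and 1 of dst. Elements 2 and 3 of dst are set to zero.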

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    s64 temp;
+    if ((b.dword[i] & 63) == 0) {
+      temp = (s64)a.dword[i];
+    } else {
+      temp = ((s64)a.dword[i] >> (b.dword[i] & 63)) +
+             (((s64)a.dword[i] >> ((b.dword[i] & 63) - 1)) & 1);
+    }
+    dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vssrarni_b_h (__m128i a, __m128i b, imm0_15 imm)

+

Synopsis

+
__m128i __lsx_vssrarni_b_h (__m128i a, __m128i b, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vssrarni.b.h vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

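Arithmetic right shift (with rounding) the signed 16-bit elements in a and b by imm, clamp each result to the signed 8-bit range (-128 to 127), and store the results in dst: results from b go to 8-bit elements 0 to 7 and results from a go to elements 8 to 15.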

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    s16 temp;
+    if (imm == 0) {
+      temp = (s16)b.half[i];
+    } else {
+      temp = ((s16)b.half[i] >> imm) + (((s16)b.half[i] >> (imm - 1)) & 1);
+    }
+    dst.byte[i] = clamp<s16>(temp, -128, 127);
+  } else {
+    s16 temp;
+    if (imm == 0) {
+      temp = (s16)a.half[i - 8];
+    } else {
+      temp =
+          ((s16)a.half[i - 8] >> imm) + (((s16)a.half[i - 8] >> (imm - 1)) & 1);
+    }
+    dst.byte[i] = clamp<s16>(temp, -128, 127);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vssrarni_bu_h (__m128i a, __m128i b, imm0_15 imm)

+

Synopsis

+
__m128i __lsx_vssrarni_bu_h (__m128i a, __m128i b, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vssrarni.bu.h vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

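Arithmetic right shift (with rounding) the signed 16-bit elements in a and b by imm, clamp each result to the unsigned 8-bit range (0 to 255), and store the results in dst: results from b go to 8-bit elements 0 to 7 and results from a go to elements 8 to 15.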

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    s16 temp;
+    if (imm == 0) {
+      temp = (s16)b.half[i];
+    } else {
+      temp = ((s16)b.half[i] >> imm) + (((s16)b.half[i] >> (imm - 1)) & 1);
+    }
+    dst.byte[i] = clamp<s16>(temp, 0, 255);
+  } else {
+    s16 temp;
+    if (imm == 0) {
+      temp = (s16)a.half[i - 8];
+    } else {
+      temp =
+          ((s16)a.half[i - 8] >> imm) + (((s16)a.half[i - 8] >> (imm - 1)) & 1);
+    }
+    dst.byte[i] = clamp<s16>(temp, 0, 255);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vssrarni_h_w (__m128i a, __m128i b, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vssrarni_h_w (__m128i a, __m128i b, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vssrarni.h.w vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

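Arithmetic right shift (with rounding) the signed 32-bit elements in a and b by imm, clamp each result to the signed 16-bit range (-32768 to 32767), and store the results in dst: results from b go to 16-bit elements 0 to 3 and results from a go to elements 4 to 7.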

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    s32 temp;
+    if (imm == 0) {
+      temp = (s32)b.word[i];
+    } else {
+      temp = ((s32)b.word[i] >> imm) + (((s32)b.word[i] >> (imm - 1)) & 1);
+    }
+    dst.half[i] = clamp<s32>(temp, -32768, 32767);
+  } else {
+    s32 temp;
+    if (imm == 0) {
+      temp = (s32)a.word[i - 4];
+    } else {
+      temp =
+          ((s32)a.word[i - 4] >> imm) + (((s32)a.word[i - 4] >> (imm - 1)) & 1);
+    }
+    dst.half[i] = clamp<s32>(temp, -32768, 32767);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vssrarni_hu_w (__m128i a, __m128i b, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vssrarni_hu_w (__m128i a, __m128i b, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vssrarni.hu.w vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

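Arithmetic right shift (with rounding) the signed 32-bit elements in a and b by imm, clamp each result to the unsigned 16-bit range (0 to 65535), and store the results in dst: results from b go to 16-bit elements 0 to 3 and results from a go to elements 4 to 7.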

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    s32 temp;
+    if (imm == 0) {
+      temp = (s32)b.word[i];
+    } else {
+      temp = ((s32)b.word[i] >> imm) + (((s32)b.word[i] >> (imm - 1)) & 1);
+    }
+    dst.half[i] = clamp<s32>(temp, 0, 65535);
+  } else {
+    s32 temp;
+    if (imm == 0) {
+      temp = (s32)a.word[i - 4];
+    } else {
+      temp =
+          ((s32)a.word[i - 4] >> imm) + (((s32)a.word[i - 4] >> (imm - 1)) & 1);
+    }
+    dst.half[i] = clamp<s32>(temp, 0, 65535);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vssrarni_w_d (__m128i a, __m128i b, imm0_63 imm)

+

Synopsis

+
__m128i __lsx_vssrarni_w_d (__m128i a, __m128i b, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vssrarni.w.d vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

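Arithmetic right shift (with rounding) the signed 64-bit elements in a and b by imm, clamp each result to the signed 32-bit range (-2147483648 to 2147483647), and store the results in dst: results from b go to 32-bit elements 0 and 1 and results from a go to elements 2 and 3.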

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    s64 temp;
+    if (imm == 0) {
+      temp = (s64)b.dword[i];
+    } else {
+      temp = ((s64)b.dword[i] >> imm) + (((s64)b.dword[i] >> (imm - 1)) & 1);
+    }
+    dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+  } else {
+    s64 temp;
+    if (imm == 0) {
+      temp = (s64)a.dword[i - 2];
+    } else {
+      temp = ((s64)a.dword[i - 2] >> imm) +
+             (((s64)a.dword[i - 2] >> (imm - 1)) & 1);
+    }
+    dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vssrarni_wu_d (__m128i a, __m128i b, imm0_63 imm)

+

Synopsis

+
__m128i __lsx_vssrarni_wu_d (__m128i a, __m128i b, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vssrarni.wu.d vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

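Arithmetic right shift (with rounding) the signed 64-bit elements in a and b by imm, clamp each result to the unsigned 32-bit range (0 to 4294967295), and store the results in dst: results from b go to 32-bit elements 0 and 1 and results from a go to elements 2 and 3.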

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    s64 temp;
+    if (imm == 0) {
+      temp = (s64)b.dword[i];
+    } else {
+      temp = ((s64)b.dword[i] >> imm) + (((s64)b.dword[i] >> (imm - 1)) & 1);
+    }
+    dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+  } else {
+    s64 temp;
+    if (imm == 0) {
+      temp = (s64)a.dword[i - 2];
+    } else {
+      temp = ((s64)a.dword[i - 2] >> imm) +
+             (((s64)a.dword[i - 2] >> (imm - 1)) & 1);
+    }
+    dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vssrarni_d_q (__m128i a, __m128i b, imm0_127 imm)

+

Synopsis

+
__m128i __lsx_vssrarni_d_q (__m128i a, __m128i b, imm0_127 imm)
+#include <lsxintrin.h>
+Instruction: vssrarni.d.q vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

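Arithmetic right shift (with rounding) a and b, each treated as a single signed 128-bit value, by imm, clamp each result to the signed 64-bit range, and store the results in dst: the result from b goes to 64-bit element 0 and the result from a goes to element 1.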

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (i < 1) {
+    s128 temp;
+    if (imm == 0) {
+      temp = (s128)b.qword[i];
+    } else {
+      temp = ((s128)b.qword[i] >> imm) + (((s128)b.qword[i] >> (imm - 1)) & 1);
+    }
+    dst.dword[i] = clamp<s128>(temp, -9223372036854775808, 9223372036854775807);
+  } else {
+    s128 temp;
+    if (imm == 0) {
+      temp = (s128)a.qword[i - 1];
+    } else {
+      temp = ((s128)a.qword[i - 1] >> imm) +
+             (((s128)a.qword[i - 1] >> (imm - 1)) & 1);
+    }
+    dst.dword[i] = clamp<s128>(temp, -9223372036854775808, 9223372036854775807);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 3 | 2
3C5000 | 3 | 2
+

__m128i __lsx_vssrarni_du_q (__m128i a, __m128i b, imm0_127 imm)

+

Synopsis

+
__m128i __lsx_vssrarni_du_q (__m128i a, __m128i b, imm0_127 imm)
+#include <lsxintrin.h>
+Instruction: vssrarni.du.q vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

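Arithmetic right shift (with rounding) a and b, each treated as a single signed 128-bit value, by imm, clamp each result to the unsigned 64-bit range (0 to 18446744073709551615), and store the results in dst: the result from b goes to 64-bit element 0 and the result from a goes to element 1.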

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (i < 1) {
+    s128 temp;
+    if (imm == 0) {
+      temp = (s128)b.qword[i];
+    } else {
+      temp = ((s128)b.qword[i] >> imm) + (((s128)b.qword[i] >> (imm - 1)) & 1);
+    }
+    dst.dword[i] = clamp<s128>(temp, 0, 18446744073709551615);
+  } else {
+    s128 temp;
+    if (imm == 0) {
+      temp = (s128)a.qword[i - 1];
+    } else {
+      temp = ((s128)a.qword[i - 1] >> imm) +
+             (((s128)a.qword[i - 1] >> (imm - 1)) & 1);
+    }
+    dst.dword[i] = clamp<s128>(temp, 0, 18446744073709551615);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 3 | 2
3C5000 | 3 | 2
+

__m128i __lsx_vssrln_b_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vssrln_b_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrln.b.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

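Logical right shift each unsigned 16-bit element in a by the low 4 bits of the corresponding 16-bit element in b, clamp each result to fit in a signed 8-bit integer (0 to 127), and store the results in 8-bit elements 0 to 7 of dst. Elements 8 to 15 of dst are set to zero.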

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    u16 temp = (u16)a.half[i] >> (b.half[i] & 15);
+    dst.byte[i] = clamp<u16>(temp, 0, 127);
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vssrln_bu_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vssrln_bu_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrln.bu.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

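Logical right shift each unsigned 16-bit element in a by the low 4 bits of the corresponding 16-bit element in b, clamp each result to fit in an unsigned 8-bit integer (0 to 255), and store the results in 8-bit elements 0 to 7 of dst. Elements 8 to 15 of dst are set to zero.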

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    u16 temp = (u16)a.half[i] >> (b.half[i] & 15);
+    dst.byte[i] = clamp<u16>(temp, 0, 255);
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vssrln_h_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vssrln_h_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrln.h.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

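Logical right shift each unsigned 32-bit element in a by the low 5 bits of the corresponding 32-bit element in b, clamp each result to fit in a signed 16-bit integer (0 to 32767), and store the results in 16-bit elements 0 to 3 of dst. Elements 4 to 7 of dst are set to zero.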

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    u32 temp = (u32)a.word[i] >> (b.word[i] & 31);
+    dst.half[i] = clamp<u32>(temp, 0, 32767);
+  } else {
+    dst.half[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vssrln_hu_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vssrln_hu_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrln.hu.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

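Logical right shift each unsigned 32-bit element in a by the low 5 bits of the corresponding 32-bit element in b, clamp each result to fit in an unsigned 16-bit integer (0 to 65535), and store the results in 16-bit elements 0 to 3 of dst. Elements 4 to 7 of dst are set to zero.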

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    u32 temp = (u32)a.word[i] >> (b.word[i] & 31);
+    dst.half[i] = clamp<u32>(temp, 0, 65535);
+  } else {
+    dst.half[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vssrln_w_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vssrln_w_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrln.w.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

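Logical right shift each unsigned 64-bit element in a by the low 6 bits of the corresponding 64-bit element in b, clamp each result to fit in a signed 32-bit integer (0 to 2147483647), and store the results in 32-bit elements 0 and 1 of dst. Elements 2 and 3 of dst are set to zero.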

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    u64 temp = (u64)a.dword[i] >> (b.dword[i] & 63);
+    dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vssrln_wu_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vssrln_wu_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrln.wu.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

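Logical right shift each unsigned 64-bit element in a by the low 6 bits of the corresponding 64-bit element in b, clamp each result to fit in an unsigned 32-bit integer (0 to 4294967295), and store the results in 32-bit elements 0 and 1 of dst. Elements 2 and 3 of dst are set to zero.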

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    u64 temp = (u64)a.dword[i] >> (b.dword[i] & 63);
+    dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vssrlni_b_h (__m128i a, __m128i b, imm0_15 imm)

+

Synopsis

+
__m128i __lsx_vssrlni_b_h (__m128i a, __m128i b, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vssrlni.b.h vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

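Logical right shift the unsigned 16-bit elements in a and b by imm, clamp each result to fit in a signed 8-bit integer (0 to 127), and store the results in dst: results from b go to 8-bit elements 0 to 7 and results from a go to elements 8 to 15.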

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    u16 temp = (u16)b.half[i] >> imm;
+    dst.byte[i] = clamp<u16>(temp, 0, 127);
+  } else {
+    u16 temp = (u16)a.half[i - 8] >> imm;
+    dst.byte[i] = clamp<u16>(temp, 0, 127);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vssrlni_bu_h (__m128i a, __m128i b, imm0_15 imm)

+

Synopsis

+
__m128i __lsx_vssrlni_bu_h (__m128i a, __m128i b, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vssrlni.bu.h vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

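Logical right shift the unsigned 16-bit elements in a and b by imm, clamp each result to fit in an unsigned 8-bit integer (0 to 255), and store the results in dst: results from b go to 8-bit elements 0 to 7 and results from a go to elements 8 to 15.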

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    u16 temp = (u16)b.half[i] >> imm;
+    dst.byte[i] = clamp<u16>(temp, 0, 255);
+  } else {
+    u16 temp = (u16)a.half[i - 8] >> imm;
+    dst.byte[i] = clamp<u16>(temp, 0, 255);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vssrlni_h_w (__m128i a, __m128i b, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vssrlni_h_w (__m128i a, __m128i b, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vssrlni.h.w vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

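Logical right shift the unsigned 32-bit elements in a and b by imm, clamp each result to fit in a signed 16-bit integer (0 to 32767), and store the results in dst: results from b go to 16-bit elements 0 to 3 and results from a go to elements 4 to 7.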

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    u32 temp = (u32)b.word[i] >> imm;
+    dst.half[i] = clamp<u32>(temp, 0, 32767);
+  } else {
+    u32 temp = (u32)a.word[i - 4] >> imm;
+    dst.half[i] = clamp<u32>(temp, 0, 32767);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vssrlni_hu_w (__m128i a, __m128i b, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vssrlni_hu_w (__m128i a, __m128i b, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vssrlni.hu.w vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

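Logical right shift the unsigned 32-bit elements in a and b by imm, clamp each result to fit in an unsigned 16-bit integer (0 to 65535), and store the results in dst: results from b go to 16-bit elements 0 to 3 and results from a go to elements 4 to 7.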

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    u32 temp = (u32)b.word[i] >> imm;
+    dst.half[i] = clamp<u32>(temp, 0, 65535);
+  } else {
+    u32 temp = (u32)a.word[i - 4] >> imm;
+    dst.half[i] = clamp<u32>(temp, 0, 65535);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vssrlni_w_d (__m128i a, __m128i b, imm0_63 imm)

+

Synopsis

+
__m128i __lsx_vssrlni_w_d (__m128i a, __m128i b, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vssrlni.w.d vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

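Logical right shift the unsigned 64-bit elements in a and b by imm, clamp each result to fit in a signed 32-bit integer (0 to 2147483647), and store the results in dst: results from b go to 32-bit elements 0 and 1 and results from a go to elements 2 and 3.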

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    u64 temp = (u64)b.dword[i] >> imm;
+    dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+  } else {
+    u64 temp = (u64)a.dword[i - 2] >> imm;
+    dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vssrlni_wu_d (__m128i a, __m128i b, imm0_63 imm)

+

Synopsis

+
__m128i __lsx_vssrlni_wu_d (__m128i a, __m128i b, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vssrlni.wu.d vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

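Logical right shift the unsigned 64-bit elements in a and b by imm, clamp each result to fit in an unsigned 32-bit integer (0 to 4294967295), and store the results in dst: results from b go to 32-bit elements 0 and 1 and results from a go to elements 2 and 3.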

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    u64 temp = (u64)b.dword[i] >> imm;
+    dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+  } else {
+    u64 temp = (u64)a.dword[i - 2] >> imm;
+    dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vssrlni_d_q (__m128i a, __m128i b, imm0_127 imm)

+

Synopsis

+
__m128i __lsx_vssrlni_d_q (__m128i a, __m128i b, imm0_127 imm)
+#include <lsxintrin.h>
+Instruction: vssrlni.d.q vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

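Logical right shift a and b, each treated as a single unsigned 128-bit value, by imm, clamp each result to fit in a signed 64-bit integer (0 to 9223372036854775807), and store the results in dst: the result from b goes to 64-bit element 0 and the result from a goes to element 1.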

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (i < 1) {
+    u128 temp = (u128)b.qword[i] >> imm;
+    dst.dword[i] = clamp<u128>(temp, 0, 9223372036854775807);
+  } else {
+    u128 temp = (u128)a.qword[i - 1] >> imm;
+    dst.dword[i] = clamp<u128>(temp, 0, 9223372036854775807);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 3 | 2
3C5000 | 3 | 2
+

__m128i __lsx_vssrlni_du_q (__m128i a, __m128i b, imm0_127 imm)

+

Synopsis

+
__m128i __lsx_vssrlni_du_q (__m128i a, __m128i b, imm0_127 imm)
+#include <lsxintrin.h>
+Instruction: vssrlni.du.q vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

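Logical right shift a and b, each treated as a single unsigned 128-bit value, by imm, clamp each result to fit in an unsigned 64-bit integer (0 to 18446744073709551615), and store the results in dst: the result from b goes to 64-bit element 0 and the result from a goes to element 1.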

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (i < 1) {
+    u128 temp = (u128)b.qword[i] >> imm;
+    dst.dword[i] = clamp<u128>(temp, 0, 18446744073709551615);
+  } else {
+    u128 temp = (u128)a.qword[i - 1] >> imm;
+    dst.dword[i] = clamp<u128>(temp, 0, 18446744073709551615);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 3 | 2
3C5000 | 3 | 2
+

__m128i __lsx_vssrlrn_b_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vssrlrn_b_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrlrn.b.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

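Logical right shift (with rounding) each unsigned 16-bit element in a by the low 4 bits of the corresponding 16-bit element in b, clamp each result to fit in a signed 8-bit integer (0 to 127), and store the results in 8-bit elements 0 to 7 of dst. Elements 8 to 15 of dst are set to zero.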

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    u16 temp;
+    if ((b.half[i] & 15) == 0) {
+      temp = (u16)a.half[i];
+    } else {
+      temp = ((u16)a.half[i] >> (b.half[i] & 15)) +
+             (((u16)a.half[i] >> ((b.half[i] & 15) - 1)) & 1);
+    }
+    dst.byte[i] = clamp<u16>(temp, 0, 127);
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vssrlrn_bu_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vssrlrn_bu_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrlrn.bu.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

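Logical right shift (with rounding) each unsigned 16-bit element in a by the low 4 bits of the corresponding 16-bit element in b, clamp each result to fit in an unsigned 8-bit integer (0 to 255), and store the results in 8-bit elements 0 to 7 of dst. Elements 8 to 15 of dst are set to zero.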

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    u16 temp;
+    if ((b.half[i] & 15) == 0) {
+      temp = (u16)a.half[i];
+    } else {
+      temp = ((u16)a.half[i] >> (b.half[i] & 15)) +
+             (((u16)a.half[i] >> ((b.half[i] & 15) - 1)) & 1);
+    }
+    dst.byte[i] = clamp<u16>(temp, 0, 255);
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vssrlrn_h_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vssrlrn_h_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrlrn.h.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

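Logical right shift (with rounding) each unsigned 32-bit element in a by the low 5 bits of the corresponding 32-bit element in b, clamp each result to fit in a signed 16-bit integer (0 to 32767), and store the results in 16-bit elements 0 to 3 of dst. Elements 4 to 7 of dst are set to zero.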

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    u32 temp;
+    if ((b.word[i] & 31) == 0) {
+      temp = (u32)a.word[i];
+    } else {
+      temp = ((u32)a.word[i] >> (b.word[i] & 31)) +
+             (((u32)a.word[i] >> ((b.word[i] & 31) - 1)) & 1);
+    }
+    dst.half[i] = clamp<u32>(temp, 0, 32767);
+  } else {
+    dst.half[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vssrlrn_hu_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vssrlrn_hu_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrlrn.hu.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

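Logical right shift (with rounding) each unsigned 32-bit element in a by the low 5 bits of the corresponding 32-bit element in b, clamp each result to fit in an unsigned 16-bit integer (0 to 65535), and store the results in 16-bit elements 0 to 3 of dst. Elements 4 to 7 of dst are set to zero.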

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    u32 temp;
+    if ((b.word[i] & 31) == 0) {
+      temp = (u32)a.word[i];
+    } else {
+      temp = ((u32)a.word[i] >> (b.word[i] & 31)) +
+             (((u32)a.word[i] >> ((b.word[i] & 31) - 1)) & 1);
+    }
+    dst.half[i] = clamp<u32>(temp, 0, 65535);
+  } else {
+    dst.half[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vssrlrn_w_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vssrlrn_w_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrlrn.w.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

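Logical right shift (with rounding) each unsigned 64-bit element in a by the low 6 bits of the corresponding 64-bit element in b, clamp each result to fit in a signed 32-bit integer (0 to 2147483647), and store the results in 32-bit elements 0 and 1 of dst. Elements 2 and 3 of dst are set to zero.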

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    u64 temp;
+    if ((b.dword[i] & 63) == 0) {
+      temp = (u64)a.dword[i];
+    } else {
+      temp = ((u64)a.dword[i] >> (b.dword[i] & 63)) +
+             (((u64)a.dword[i] >> ((b.dword[i] & 63) - 1)) & 1);
+    }
+    dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vssrlrn_wu_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vssrlrn_wu_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrlrn.wu.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

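Logical right shift (with rounding) each unsigned 64-bit element in a by the low 6 bits of the corresponding 64-bit element in b, clamp each result to fit in an unsigned 32-bit integer (0 to 4294967295), and store the results in 32-bit elements 0 and 1 of dst. Elements 2 and 3 of dst are set to zero.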

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    u64 temp;
+    if ((b.dword[i] & 63) == 0) {
+      temp = (u64)a.dword[i];
+    } else {
+      temp = ((u64)a.dword[i] >> (b.dword[i] & 63)) +
+             (((u64)a.dword[i] >> ((b.dword[i] & 63) - 1)) & 1);
+    }
+    dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vssrlrni_b_h (__m128i a, __m128i b, imm0_15 imm)

+

Synopsis

+
__m128i __lsx_vssrlrni_b_h (__m128i a, __m128i b, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vssrlrni.b.h vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

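Logical right shift (with rounding) the unsigned 16-bit elements in a and b by imm, clamp each result to fit in a signed 8-bit integer (0 to 127), and store the results in dst: results from b go to 8-bit elements 0 to 7 and results from a go to elements 8 to 15.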

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    u16 temp;
+    if (imm == 0) {
+      temp = (u16)b.half[i];
+    } else {
+      temp = ((u16)b.half[i] >> imm) + (((u16)b.half[i] >> (imm - 1)) & 1);
+    }
+    dst.byte[i] = clamp<u16>(temp, 0, 127);
+  } else {
+    u16 temp;
+    if (imm == 0) {
+      temp = (u16)a.half[i - 8];
+    } else {
+      temp =
+          ((u16)a.half[i - 8] >> imm) + (((u16)a.half[i - 8] >> (imm - 1)) & 1);
+    }
+    dst.byte[i] = clamp<u16>(temp, 0, 127);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vssrlrni_bu_h (__m128i a, __m128i b, imm0_15 imm)

+

Synopsis

+
__m128i __lsx_vssrlrni_bu_h (__m128i a, __m128i b, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vssrlrni.bu.h vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

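Logical right shift (with rounding) the unsigned 16-bit elements in a and b by imm, clamp each result to fit in an unsigned 8-bit integer (0 to 255), and store the results in dst: results from b go to 8-bit elements 0 to 7 and results from a go to elements 8 to 15.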

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    u16 temp;
+    if (imm == 0) {
+      temp = (u16)b.half[i];
+    } else {
+      temp = ((u16)b.half[i] >> imm) + (((u16)b.half[i] >> (imm - 1)) & 1);
+    }
+    dst.byte[i] = clamp<u16>(temp, 0, 255);
+  } else {
+    u16 temp;
+    if (imm == 0) {
+      temp = (u16)a.half[i - 8];
+    } else {
+      temp =
+          ((u16)a.half[i - 8] >> imm) + (((u16)a.half[i - 8] >> (imm - 1)) & 1);
+    }
+    dst.byte[i] = clamp<u16>(temp, 0, 255);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vssrlrni_h_w (__m128i a, __m128i b, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vssrlrni_h_w (__m128i a, __m128i b, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vssrlrni.h.w vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

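Logical right shift (with rounding) the unsigned 32-bit elements in a and b by imm, clamp each result to fit in a signed 16-bit integer (0 to 32767), and store the results in dst: results from b go to 16-bit elements 0 to 3 and results from a go to elements 4 to 7.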

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    u32 temp;
+    if (imm == 0) {
+      temp = (u32)b.word[i];
+    } else {
+      temp = ((u32)b.word[i] >> imm) + (((u32)b.word[i] >> (imm - 1)) & 1);
+    }
+    dst.half[i] = clamp<u32>(temp, 0, 32767);
+  } else {
+    u32 temp;
+    if (imm == 0) {
+      temp = (u32)a.word[i - 4];
+    } else {
+      temp =
+          ((u32)a.word[i - 4] >> imm) + (((u32)a.word[i - 4] >> (imm - 1)) & 1);
+    }
+    dst.half[i] = clamp<u32>(temp, 0, 32767);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vssrlrni_hu_w (__m128i a, __m128i b, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vssrlrni_hu_w (__m128i a, __m128i b, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vssrlrni.hu.w vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

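Logical right shift (with rounding) the unsigned 32-bit elements in a and b by imm, clamp each result to fit in an unsigned 16-bit integer (0 to 65535), and store the results in dst: results from b go to 16-bit elements 0 to 3 and results from a go to elements 4 to 7.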

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    u32 temp;
+    if (imm == 0) {
+      temp = (u32)b.word[i];
+    } else {
+      temp = ((u32)b.word[i] >> imm) + (((u32)b.word[i] >> (imm - 1)) & 1);
+    }
+    dst.half[i] = clamp<u32>(temp, 0, 65535);
+  } else {
+    u32 temp;
+    if (imm == 0) {
+      temp = (u32)a.word[i - 4];
+    } else {
+      temp =
+          ((u32)a.word[i - 4] >> imm) + (((u32)a.word[i - 4] >> (imm - 1)) & 1);
+    }
+    dst.half[i] = clamp<u32>(temp, 0, 65535);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vssrlrni_w_d (__m128i a, __m128i b, imm0_63 imm)

+

Synopsis

+
__m128i __lsx_vssrlrni_w_d (__m128i a, __m128i b, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vssrlrni.w.d vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

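Logical right shift (with rounding) the unsigned 64-bit elements in a and b by imm, clamp each result to fit in a signed 32-bit integer (0 to 2147483647), and store the results in dst: results from b go to 32-bit elements 0 and 1 and results from a go to elements 2 and 3.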

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    u64 temp;
+    if (imm == 0) {
+      temp = (u64)b.dword[i];
+    } else {
+      temp = ((u64)b.dword[i] >> imm) + (((u64)b.dword[i] >> (imm - 1)) & 1);
+    }
+    dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+  } else {
+    u64 temp;
+    if (imm == 0) {
+      temp = (u64)a.dword[i - 2];
+    } else {
+      temp = ((u64)a.dword[i - 2] >> imm) +
+             (((u64)a.dword[i - 2] >> (imm - 1)) & 1);
+    }
+    dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vssrlrni_wu_d (__m128i a, __m128i b, imm0_63 imm)

+

Synopsis

+
__m128i __lsx_vssrlrni_wu_d (__m128i a, __m128i b, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vssrlrni.wu.d vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

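Logical right shift (with rounding) the unsigned 64-bit elements in a and b by imm, clamp each result to fit in an unsigned 32-bit integer (0 to 4294967295), and store the results in dst: results from b go to 32-bit elements 0 and 1 and results from a go to elements 2 and 3.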

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    u64 temp;
+    if (imm == 0) {
+      temp = (u64)b.dword[i];
+    } else {
+      temp = ((u64)b.dword[i] >> imm) + (((u64)b.dword[i] >> (imm - 1)) & 1);
+    }
+    dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+  } else {
+    u64 temp;
+    if (imm == 0) {
+      temp = (u64)a.dword[i - 2];
+    } else {
+      temp = ((u64)a.dword[i - 2] >> imm) +
+             (((u64)a.dword[i - 2] >> (imm - 1)) & 1);
+    }
+    dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m128i __lsx_vssrlrni_d_q (__m128i a, __m128i b, imm0_127 imm)

+

Synopsis

+
__m128i __lsx_vssrlrni_d_q (__m128i a, __m128i b, imm0_127 imm)
+#include <lsxintrin.h>
+Instruction: vssrlrni.d.q vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Logical right shift (with rounding) the unsigned 128-bit elements in a and b by imm, clamp to fit in signed 64-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (i < 1) {
+    u128 temp;
+    if (imm == 0) {
+      temp = (u128)b.qword[i];
+    } else {
+      temp = ((u128)b.qword[i] >> imm) + (((u128)b.qword[i] >> (imm - 1)) & 1);
+    }
+    dst.dword[i] = clamp<u128>(temp, 0, 9223372036854775807);
+  } else {
+    u128 temp;
+    if (imm == 0) {
+      temp = (u128)a.qword[i - 1];
+    } else {
+      temp = ((u128)a.qword[i - 1] >> imm) +
+             (((u128)a.qword[i - 1] >> (imm - 1)) & 1);
+    }
+    dst.dword[i] = clamp<u128>(temp, 0, 9223372036854775807);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m128i __lsx_vssrlrni_du_q (__m128i a, __m128i b, imm0_127 imm)

+

Synopsis

+
__m128i __lsx_vssrlrni_du_q (__m128i a, __m128i b, imm0_127 imm)
+#include <lsxintrin.h>
+Instruction: vssrlrni.du.q vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Logical right shift (with rounding) the unsigned 128-bit elements in a and b by imm, clamp to fit in unsigned 64-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (i < 1) {
+    u128 temp;
+    if (imm == 0) {
+      temp = (u128)b.qword[i];
+    } else {
+      temp = ((u128)b.qword[i] >> imm) + (((u128)b.qword[i] >> (imm - 1)) & 1);
+    }
+    dst.dword[i] = clamp<u128>(temp, 0, 18446744073709551615);
+  } else {
+    u128 temp;
+    if (imm == 0) {
+      temp = (u128)a.qword[i - 1];
+    } else {
+      temp = ((u128)a.qword[i - 1] >> imm) +
+             (((u128)a.qword[i - 1] >> (imm - 1)) & 1);
+    }
+    dst.dword[i] = clamp<u128>(temp, 0, 18446744073709551615);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m128i __lsx_vrotr_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vrotr_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vrotr.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Rotate right the unsigned 8-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] =
+      (a.byte[i] >> (b.byte[i] & 0x7)) | (a.byte[i] << (8 - (b.byte[i] & 0x7)));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500022
+

__m128i __lsx_vrotr_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vrotr_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vrotr.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Rotate right the unsigned 16-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (a.half[i] >> (b.half[i] & 0xf)) |
+                (a.half[i] << (16 - (b.half[i] & 0xf)));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500022
+

__m128i __lsx_vrotr_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vrotr_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vrotr.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Rotate right the unsigned 32-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (a.word[i] >> (b.word[i] & 0x1f)) |
+                (a.word[i] << (32 - (b.word[i] & 0x1f)));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500022
+

__m128i __lsx_vrotr_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vrotr_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vrotr.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Rotate right the unsigned 64-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (a.dword[i] >> (b.dword[i] & 0x3f)) |
+                 (a.dword[i] << (64 - (b.dword[i] & 0x3f)));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500022
+

__m128i __lsx_vrotri_b (__m128i a, imm0_7 imm)

+

Synopsis

+
__m128i __lsx_vrotri_b (__m128i a, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vrotri.b vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Rotate right the unsigned 8-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = (a.byte[i] >> imm) | (a.byte[i] << (8 - imm));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500022
+

__m128i __lsx_vrotri_h (__m128i a, imm0_15 imm)

+

Synopsis

+
__m128i __lsx_vrotri_h (__m128i a, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vrotri.h vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Rotate right the unsigned 16-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (a.half[i] >> imm) | (a.half[i] << (16 - imm));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500022
+

__m128i __lsx_vrotri_w (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vrotri_w (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vrotri.w vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Rotate right the unsigned 32-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (a.word[i] >> imm) | (a.word[i] << (32 - imm));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500022
+

__m128i __lsx_vrotri_d (__m128i a, imm0_63 imm)

+

Synopsis

+
__m128i __lsx_vrotri_d (__m128i a, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vrotri.d vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Rotate right the unsigned 64-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (a.dword[i] >> imm) | (a.dword[i] << (64 - imm));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500022
+ +
+
+ +
+
+ +
+ +
+ +
+ + + + « Previous + + + Next » + + +
+ + + + + + + + diff --git a/lsx/shuffling/index.html b/lsx/shuffling/index.html new file mode 100644 index 00000000..ceadbba7 --- /dev/null +++ b/lsx/shuffling/index.html @@ -0,0 +1,673 @@ + + + + + + + + Shuffling - Unofficial LoongArch Intrinsics Guide + + + + + + + + + + + + + + +
+ + +
+ +
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Shuffling

+

__m128i __lsx_vshuf_b (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vshuf_b (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vshuf.b vr, vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Shuffle bytes from a and b with indices from c.

+

Caveat: the indices are placed in c, while in other vshuf intrinsics, they are placed in a.

+

+

Examples

+
__m128i __lsx_vshuf_b(__m128i{0x1122334455667788, 0x99aabbccddeeff00}, __m128i{0xabcdef1314156678, 0x1234123443214321}, __m128i{0x0011021304050607, 0x0811120213031404})
+= 0x7877155513efcdab 0x2177661555144413
+
+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (c.byte[i] >= 64 && MACHINE_3C5000) {
+    // Caveat: observed in 3C5000
+    dst.byte[i] = 0;
+  } else if ((c.byte[i] % 32) < 16) {
+    dst.byte[i] = b.byte[c.byte[i] % 16];
+  } else {
+    dst.byte[i] = a.byte[c.byte[i] % 16];
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600012
3C500012
+

__m128i __lsx_vshuf_h (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vshuf_h (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vshuf.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Shuffle 16-bit elements in b and c with indices from a, save the result to dst.

+

+

Examples

+
__m128i __lsx_vshuf_h(__m128i{0x0001000200030004, 0x0005000a000b000c}, __m128i{0x1122334455667788, 0x99aabbccddeeff00}, __m128i{0xabcdef1314156678, 0x1234123443214321})
+= 0x1415ef13abcd4321 0x432133441122ff00
+
+

Operation

+
for (int i = 0; i < 8; i++) {
+  if ((a.half[i] % 256) >= 64 && MACHINE_3C5000) {
+    // Caveat: observed in 3C5000
+    dst.half[i] = 0;
+  } else if ((a.half[i] % 16) < 8) {
+    dst.half[i] = c.half[a.half[i] % 8];
+  } else {
+    dst.half[i] = b.half[a.half[i] % 8];
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600012
3C500012
+

__m128i __lsx_vshuf_w (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vshuf_w (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vshuf.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Shuffle 32-bit elements in b and c with indices from a, save the result to dst.

+

+

Examples

+
__m128i __lsx_vshuf_w(__m128i{0x0000000200000004, 0x0000000700000005}, __m128i{0x1122334455667788, 0x99aabbccddeeff00}, __m128i{0xabcdef1314156678, 0x1234123443214321})
+= 0x4321432155667788 0x99aabbcc11223344
+
+

Operation

+
for (int i = 0; i < 4; i++) {
+  if ((a.word[i] % 256) >= 64 && MACHINE_3C5000) {
+    // Caveat: observed in 3C5000
+    dst.word[i] = 0;
+  } else if ((a.word[i] % 8) < 4) {
+    dst.word[i] = c.word[a.word[i] % 4];
+  } else {
+    dst.word[i] = b.word[a.word[i] % 4];
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600012
3C500012
+

__m128i __lsx_vshuf_d (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vshuf_d (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vshuf.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Shuffle 64-bit elements in b and c with indices from a, save the result to dst.

+

+

Examples

+
__m128i __lsx_vshuf_d(__m128i{0x0000000000000001, 0x0000000000000002}, __m128i{0x1122334455667788, 0x99aabbccddeeff00}, __m128i{0xabcdef1314156678, 0x1234123443214321})
+= 0x1234123443214321 0x1122334455667788
+
+

Operation

+
for (int i = 0; i < 2; i++) {
+  if ((a.dword[i] % 256) >= 64 && MACHINE_3C5000) {
+    // Caveat: observed in 3C5000
+    dst.dword[i] = 0;
+  } else if ((a.dword[i] % 4) < 2) {
+    dst.dword[i] = c.dword[a.dword[i] % 2];
+  } else {
+    dst.dword[i] = b.dword[a.dword[i] % 2];
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600012
3C500012
+

__m128i __lsx_vshuf4i_b (__m128i a, imm0_255 imm)

+

Synopsis

+
__m128i __lsx_vshuf4i_b (__m128i a, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vshuf4i.b vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Shuffle every four 8-bit elements in a with indices packed in imm, save the result to dst.

+

+

Examples

+
__m128i __lsx_vshuf4i_b(__m128i{0xabcdef1314156678, 0x1234123443214321}, 0x12)
+= 0x13ef13cd78667815 0x3412343421432121
+
+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = a.byte[(i & ~0x3) + ((imm >> (2 * (i & 0x3))) & 0x3)];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vshuf4i_h (__m128i a, imm0_255 imm)

+

Synopsis

+
__m128i __lsx_vshuf4i_h (__m128i a, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vshuf4i.h vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Shuffle every four 16-bit elements in a with indices packed in imm, save the result to dst.

+

+

Examples

+
__m128i __lsx_vshuf4i_h(__m128i{0xabcdef1314156678, 0x1234123443214321}, 0x12)
+= 0x667814156678ef13 0x4321432143211234
+
+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = a.half[(i & ~0x3) + ((imm >> (2 * (i & 0x3))) & 0x3)];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vshuf4i_w (__m128i a, imm0_255 imm)

+

Synopsis

+
__m128i __lsx_vshuf4i_w (__m128i a, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vshuf4i.w vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Shuffle every four 32-bit elements in a with indices packed in imm, save the result to dst.

+

+

Examples

+
__m128i __lsx_vshuf4i_w(__m128i{0xabcdef1314156678, 0x1234123443214321}, 0x12)
+= 0x1415667843214321 0x14156678abcdef13
+
+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = a.word[(i & ~0x3) + ((imm >> (2 * (i & 0x3))) & 0x3)];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vshuf4i_d (__m128i a, __m128i b, imm0_255 imm)

+

Synopsis

+
__m128i __lsx_vshuf4i_d (__m128i a, __m128i b, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vshuf4i.d vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Shuffle the four 64-bit elements in a and b (two from each) with indices packed in imm, save the result to dst.

+

+

Examples

+
__m128i __lsx_vshuf4i_d(__m128i{0x1122334455667788, 0x99aabbccddeeff00}, __m128i{0xabcdef1314156678, 0x1234123443214321}, 0x12)
+= 0xabcdef1314156678 0x1122334455667788
+
+

Operation

+
dst.dword[0] = (imm & 2) ? b.dword[(imm & 1)] : a.dword[(imm & 1)];
+dst.dword[1] =
+    (imm & 8) ? b.dword[((imm >> 2) & 1)] : a.dword[((imm >> 2) & 1)];
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+ +
+
+ +
+
+ +
+ +
+ +
+ + + + « Previous + + + +
+ + + + + + + + diff --git a/main.css b/main.css new file mode 100644 index 00000000..f7ffbf8f --- /dev/null +++ b/main.css @@ -0,0 +1,3 @@ +[v-cloak] { + display: none +} \ No newline at end of file diff --git a/migrating_avx/index.html b/migrating_avx/index.html new file mode 100644 index 00000000..2b2c8661 --- /dev/null +++ b/migrating_avx/index.html @@ -0,0 +1,2185 @@ + + + + + + + + Migrating from AVX to LASX - Unofficial LoongArch Intrinsics Guide + + + + + + + + + + + + + + +
+ + +
+ +
+
+
    +
  • + +
  • +
  • +
+
+
+
+
+ +

Migrating from AVX to LASX

+

AVX is a 256-bit SIMD extension to X86. It is possible to migrate existing AVX code to leverage LoongArch LASX extension by rewriting the intrinsics or instructions manually, or by using tools like SIMD Everywhere to implement AVX intrinsics with LASX counterparts. But to unleash the full performance, you may want to port your code to LASX manually.

+

Thankfully, LASX intrinsics adopt the same types as AVX: you can use the following familiar types for SIMD:

+
    +
  • __m256: 256-bit vector of single precision floating point numbers
  • +
  • __m256d: 256-bit vector of double precision floating point numbers
  • +
  • __m256i: 256-bit vector of integers, which can be of any width
  • +
+

Here is a table mapping AVX intrinsics to their LASX counterparts (WIP):

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
AVXLASX
_mm256_abs_epi16__lasx_xvsigncov_h
_mm256_abs_epi32__lasx_xvsigncov_w
_mm256_abs_epi8__lasx_xvsigncov_b
_mm256_add_epi16__lasx_xvadd_h
_mm256_add_epi32__lasx_xvadd_w
_mm256_add_epi64__lasx_xvadd_d
_mm256_add_epi8__lasx_xvadd_b
_mm256_add_pd__lasx_xvfadd_d
_mm256_add_ps__lasx_xvfadd_s
_mm256_adds_epi16__lasx_xvsadd_h
_mm256_adds_epi8__lasx_xvsadd_b
_mm256_adds_epu16__lasx_xvsadd_hu
_mm256_adds_epu8__lasx_xvsadd_bu
_mm256_addsub_pd
_mm256_addsub_ps
_mm256_alignr_epi8
_mm256_and_pd__lasx_xvand_v
_mm256_and_ps__lasx_xvand_v
_mm256_and_si256__lasx_xvand_v
_mm256_andnot_pd__lasx_xvandn_v
_mm256_andnot_ps__lasx_xvandn_v
_mm256_andnot_si256__lasx_xvandn_v
_mm256_avg_epu16__lasx_xvavgr_hu
_mm256_avg_epu8__lasx_xvavgr_bu
_mm256_bcstnebf16_ps
_mm256_bcstnesh_ps
_mm256_blend_epi16
_mm256_blend_epi32
_mm256_blend_pd
_mm256_blend_ps
_mm256_blendv_epi8
_mm256_blendv_pd
_mm256_blendv_ps
_mm256_broadcast_pd
_mm256_broadcast_ps
_mm256_broadcast_sd
_mm256_broadcast_ss
_mm256_broadcastb_epi8
_mm256_broadcastd_epi32
_mm256_broadcastq_epi64
_mm256_broadcastsd_pd
_mm256_broadcastsi128_si256
_mm256_broadcastss_ps
_mm256_broadcastw_epi16
_mm256_bslli_epi128
_mm256_bsrli_epi128
_mm256_castpd128_pd256
_mm256_castpd256_pd128
_mm256_castpd_ps
_mm256_castpd_si256
_mm256_castps128_ps256
_mm256_castps256_ps128
_mm256_castps_pd
_mm256_castps_si256
_mm256_castsi128_si256
_mm256_castsi256_pd
_mm256_castsi256_ps
_mm256_castsi256_si128
_mm256_ceil_pd
_mm256_ceil_ps
_mm256_cmp_pd
_mm256_cmp_ps
_mm256_cmpeq_epi16
_mm256_cmpeq_epi32
_mm256_cmpeq_epi64
_mm256_cmpeq_epi8
_mm256_cmpgt_epi16
_mm256_cmpgt_epi32
_mm256_cmpgt_epi64
_mm256_cmpgt_epi8
_mm256_cvtepi16_epi32
_mm256_cvtepi16_epi64
_mm256_cvtepi32_epi64
_mm256_cvtepi32_pd
_mm256_cvtepi32_ps
_mm256_cvtepi8_epi16
_mm256_cvtepi8_epi32
_mm256_cvtepi8_epi64
_mm256_cvtepu16_epi32
_mm256_cvtepu16_epi64
_mm256_cvtepu32_epi64
_mm256_cvtepu8_epi16
_mm256_cvtepu8_epi32
_mm256_cvtepu8_epi64
_mm256_cvtneebf16_ps
_mm256_cvtneeph_ps
_mm256_cvtneobf16_ps
_mm256_cvtneoph_ps
_mm256_cvtneps_avx_pbh
_mm256_cvtneps_pbh
_mm256_cvtpd_epi32
_mm256_cvtpd_ps
_mm256_cvtph_ps
_mm256_cvtps_epi32
_mm256_cvtps_pd
_mm256_cvtps_ph
_mm256_cvtsd_f64
_mm256_cvtsi256_si32
_mm256_cvtss_f32
_mm256_cvttpd_epi32
_mm256_cvttps_epi32
_mm256_div_pd__lasx_xvfdiv_d
_mm256_div_ps__lasx_xvfdiv_s
_mm256_dp_ps
_mm256_dpbssd_epi32
_mm256_dpbssds_epi32
_mm256_dpbsud_epi32
_mm256_dpbsuds_epi32
_mm256_dpbusd_avx_epi32
_mm256_dpbusd_epi32
_mm256_dpbusds_avx_epi32
_mm256_dpbusds_epi32
_mm256_dpbuud_epi32
_mm256_dpbuuds_epi32
_mm256_dpwssd_avx_epi32
_mm256_dpwssd_epi32
_mm256_dpwssds_avx_epi32
_mm256_dpwssds_epi32
_mm256_dpwsud_epi32
_mm256_dpwsuds_epi32
_mm256_dpwusd_epi32
_mm256_dpwusds_epi32
_mm256_dpwuud_epi32
_mm256_dpwuuds_epi32
_mm256_extract_epi16
_mm256_extract_epi32
_mm256_extract_epi64
_mm256_extract_epi8
_mm256_extractf128_pd
_mm256_extractf128_ps
_mm256_extractf128_si256
_mm256_extracti128_si256
_mm256_floor_pd
_mm256_floor_ps
_mm256_fmadd_pd
_mm256_fmadd_ps
_mm256_fmaddsub_pd
_mm256_fmaddsub_ps
_mm256_fmsub_pd
_mm256_fmsub_ps
_mm256_fmsubadd_pd
_mm256_fmsubadd_ps
_mm256_fnmadd_pd
_mm256_fnmadd_ps
_mm256_fnmsub_pd
_mm256_fnmsub_ps
_mm256_hadd_epi16
_mm256_hadd_epi32
_mm256_hadd_pd
_mm256_hadd_ps
_mm256_hadds_epi16
_mm256_hsub_epi16
_mm256_hsub_epi32
_mm256_hsub_pd
_mm256_hsub_ps
_mm256_hsubs_epi16
_mm256_i32gather_epi32
_mm256_i32gather_epi64
_mm256_i32gather_pd
_mm256_i32gather_ps
_mm256_i64gather_epi32
_mm256_i64gather_epi64
_mm256_i64gather_pd
_mm256_i64gather_ps
_mm256_insert_epi16
_mm256_insert_epi32
_mm256_insert_epi64
_mm256_insert_epi8
_mm256_insertf128_pd
_mm256_insertf128_ps
_mm256_insertf128_si256
_mm256_inserti128_si256
_mm256_lddqu_si256
_mm256_load_pd
_mm256_load_ps
_mm256_load_si256
_mm256_loadu2_m128d
_mm256_loadu2_m128i
_mm256_loadu2_m128
_mm256_loadu_pd
_mm256_loadu_ps
_mm256_loadu_si256
_mm256_madd52hi_avx_epu64
_mm256_madd52hi_epu64
_mm256_madd52lo_avx_epu64
_mm256_madd52lo_epu64
_mm256_madd_epi16
_mm256_maddubs_epi16
_mm256_mask_i32gather_epi32
_mm256_mask_i32gather_epi64
_mm256_mask_i32gather_pd
_mm256_mask_i32gather_ps
_mm256_mask_i64gather_epi32
_mm256_mask_i64gather_epi64
_mm256_mask_i64gather_pd
_mm256_mask_i64gather_ps
_mm256_maskload_epi32
_mm256_maskload_epi64
_mm256_maskload_pd
_mm256_maskload_ps
_mm256_maskstore_epi32
_mm256_maskstore_epi64
_mm256_maskstore_pd
_mm256_maskstore_ps
_mm256_max_epi16__lasx_xvmax_h
_mm256_max_epi32__lasx_xvmax_w
_mm256_max_epi8__lasx_xvmax_b
_mm256_max_epu16__lasx_xvmax_hu
_mm256_max_epu32__lasx_xvmax_wu
_mm256_max_epu8__lasx_xvmax_bu
_mm256_max_pd__lasx_xvfmax_d
_mm256_max_ps__lasx_xvfmax_s
_mm256_min_epi16__lasx_xvmin_h
_mm256_min_epi32__lasx_xvmin_w
_mm256_min_epi8__lasx_xvmin_b
_mm256_min_epu16__lasx_xvmin_hu
_mm256_min_epu32__lasx_xvmin_wu
_mm256_min_epu8__lasx_xvmin_bu
_mm256_min_pd__lasx_xvfmin_d
_mm256_min_ps__lasx_xvfmin_s
_mm256_movedup_pd
_mm256_movehdup_ps
_mm256_moveldup_ps
_mm256_movemask_epi8
_mm256_movemask_pd
_mm256_movemask_ps
_mm256_mpsadbw_epu8
_mm256_mul_epi32
_mm256_mul_epu32
_mm256_mul_pd__lasx_xvfmul_d
_mm256_mul_ps__lasx_xvfmul_s
_mm256_mulhi_epi16
_mm256_mulhi_epu16
_mm256_mulhrs_epi16
_mm256_mullo_epi16
_mm256_mullo_epi32
_mm256_or_pd
_mm256_or_ps
_mm256_or_si256__lasx_xvor_v
_mm256_packs_epi16
_mm256_packs_epi32
_mm256_packus_epi16
_mm256_packus_epi32
_mm256_permute2f128_pd
_mm256_permute2f128_ps
_mm256_permute2f128_si256
_mm256_permute2x128_si256
_mm256_permute4x64_epi64
_mm256_permute4x64_pd
_mm256_permute_pd
_mm256_permute_ps
_mm256_permutevar8x32_epi32
_mm256_permutevar8x32_ps
_mm256_permutevar_pd
_mm256_permutevar_ps
_mm256_rcp_ps
_mm256_round_pd
_mm256_round_ps
_mm256_rsqrt_ps
_mm256_sad_epu8
_mm256_set1_epi16
_mm256_set1_epi32
_mm256_set1_epi64x
_mm256_set1_epi8
_mm256_set1_pd
_mm256_set1_ps
_mm256_set_epi16
_mm256_set_epi32
_mm256_set_epi64x
_mm256_set_epi8
_mm256_set_m128d
_mm256_set_m128i
_mm256_set_m128
_mm256_set_pd
_mm256_set_ps
_mm256_setr_epi16
_mm256_setr_epi32
_mm256_setr_epi64x
_mm256_setr_epi8
_mm256_setr_m128d
_mm256_setr_m128i
_mm256_setr_m128
_mm256_setr_pd
_mm256_setr_ps
_mm256_setzero_pd
_mm256_setzero_ps
_mm256_setzero_si256
_mm256_sha512msg1_epi64
_mm256_sha512msg2_epi64
_mm256_sha512rnds2_epi64
_mm256_shuffle_epi32
_mm256_shuffle_epi8
_mm256_shuffle_pd
_mm256_shuffle_ps
_mm256_shufflehi_epi16
_mm256_shufflelo_epi16
_mm256_sign_epi16
_mm256_sign_epi32
_mm256_sign_epi8
_mm256_sll_epi16__lasx_xvsll_h
_mm256_sll_epi32__lasx_xvsll_w
_mm256_sll_epi64__lasx_xvsll_d
_mm256_slli_epi16__lasx_xvslli_h
_mm256_slli_epi32__lasx_xvslli_w
_mm256_slli_epi64__lasx_xvslli_d
_mm256_slli_si256
_mm256_sllv_epi32
_mm256_sllv_epi64
_mm256_sm4key4_epi32
_mm256_sm4rnds4_epi32
_mm256_sqrt_pd
_mm256_sqrt_ps
_mm256_sra_epi16__lasx_xvsra_h
_mm256_sra_epi32__lasx_xvsra_w
_mm256_srai_epi16__lasx_xvsrai_h
_mm256_srai_epi32__lasx_xvsrai_w
_mm256_srav_epi32
_mm256_srl_epi16__lasx_xvsrl_h
_mm256_srl_epi32__lasx_xvsrl_w
_mm256_srl_epi64__lasx_xvsrl_d
_mm256_srli_epi16__lasx_xvsrli_h
_mm256_srli_epi32__lasx_xvsrli_w
_mm256_srli_epi64__lasx_xvsrli_d
_mm256_srli_si256
_mm256_srlv_epi32
_mm256_srlv_epi64
_mm256_store_pd
_mm256_store_ps
_mm256_store_si256
_mm256_storeu2_m128d
_mm256_storeu2_m128i
_mm256_storeu2_m128
_mm256_storeu_pd
_mm256_storeu_ps
_mm256_storeu_si256
_mm256_stream_load_si256
_mm256_stream_pd
_mm256_stream_ps
_mm256_stream_si256
_mm256_sub_epi16__lasx_xvsub_h
_mm256_sub_epi32__lasx_xvsub_w
_mm256_sub_epi64__lasx_xvsub_d
_mm256_sub_epi8__lasx_xvsub_b
_mm256_sub_pd__lasx_xvfsub_d
_mm256_sub_ps__lasx_xvfsub_s
_mm256_subs_epi16
_mm256_subs_epi8
_mm256_subs_epu16
_mm256_subs_epu8
_mm256_testc_pd
_mm256_testc_ps
_mm256_testc_si256
_mm256_testnzc_pd
_mm256_testnzc_ps
_mm256_testnzc_si256
_mm256_testz_pd
_mm256_testz_ps
_mm256_testz_si256
_mm256_undefined_pd
_mm256_undefined_ps
_mm256_undefined_si256
_mm256_unpackhi_epi16__lasx_xvilvh_h
_mm256_unpackhi_epi32__lasx_xvilvh_w
_mm256_unpackhi_epi64__lasx_xvilvh_d
_mm256_unpackhi_epi8__lasx_xvilvh_b
_mm256_unpackhi_pd
_mm256_unpackhi_ps
_mm256_unpacklo_epi16__lasx_xvilvl_h
_mm256_unpacklo_epi32__lasx_xvilvl_w
_mm256_unpacklo_epi64__lasx_xvilvl_d
_mm256_unpacklo_epi8__lasx_xvilvl_b
_mm256_unpacklo_pd
_mm256_unpacklo_ps
_mm256_xor_pd
_mm256_xor_ps
_mm256_xor_si256
_mm256_zeroall
_mm256_zeroupper
_mm256_zextpd128_pd256
_mm256_zextps128_ps256
_mm256_zextsi128_si256
_mm_bcstnebf16_ps
_mm_bcstnesh_ps
_mm_blend_epi32
_mm_broadcast_ss
_mm_broadcastb_epi8
_mm_broadcastd_epi32
_mm_broadcastq_epi64
_mm_broadcastsd_pd
_mm_broadcastsi128_si256
_mm_broadcastss_ps
_mm_broadcastw_epi16
_mm_cmp_pd
_mm_cmp_ps
_mm_cmp_sd
_mm_cmp_ss
_mm_cvtneebf16_ps
_mm_cvtneeph_ps
_mm_cvtneobf16_ps
_mm_cvtneoph_ps
_mm_cvtneps_avx_pbh
_mm_cvtneps_pbh
_mm_cvtph_ps
_mm_cvtps_ph
_mm_dpbssd_epi32
_mm_dpbssds_epi32
_mm_dpbsud_epi32
_mm_dpbsuds_epi32
_mm_dpbusd_avx_epi32
_mm_dpbusd_epi32
_mm_dpbusds_avx_epi32
_mm_dpbusds_epi32
_mm_dpbuud_epi32
_mm_dpbuuds_epi32
_mm_dpwssd_avx_epi32
_mm_dpwssd_epi32
_mm_dpwssds_avx_epi32
_mm_dpwssds_epi32
_mm_dpwsud_epi32
_mm_dpwsuds_epi32
_mm_dpwusd_epi32
_mm_dpwusds_epi32
_mm_dpwuud_epi32
_mm_dpwuuds_epi32
_mm_fmadd_pd
_mm_fmadd_ps
_mm_fmadd_sd
_mm_fmadd_ss
_mm_fmaddsub_pd
_mm_fmaddsub_ps
_mm_fmsub_pd
_mm_fmsub_ps
_mm_fmsub_sd
_mm_fmsub_ss
_mm_fmsubadd_pd
_mm_fmsubadd_ps
_mm_fnmadd_pd
_mm_fnmadd_ps
_mm_fnmadd_sd
_mm_fnmadd_ss
_mm_fnmsub_pd
_mm_fnmsub_ps
_mm_fnmsub_sd
_mm_fnmsub_ss
_mm_i32gather_epi32
_mm_i32gather_epi64
_mm_i32gather_pd
_mm_i32gather_ps
_mm_i64gather_epi32
_mm_i64gather_epi64
_mm_i64gather_pd
_mm_i64gather_ps
_mm_madd52hi_avx_epu64
_mm_madd52hi_epu64
_mm_madd52lo_avx_epu64
_mm_madd52lo_epu64
_mm_mask_i32gather_epi32
_mm_mask_i32gather_epi64
_mm_mask_i32gather_pd
_mm_mask_i32gather_ps
_mm_mask_i64gather_epi32
_mm_mask_i64gather_epi64
_mm_mask_i64gather_pd
_mm_mask_i64gather_ps
_mm_maskload_epi32
_mm_maskload_epi64
_mm_maskload_pd
_mm_maskload_ps
_mm_maskstore_epi32
_mm_maskstore_epi64
_mm_maskstore_pd
_mm_maskstore_ps
_mm_permute_pd
_mm_permute_ps
_mm_permutevar_pd
_mm_permutevar_ps
_mm_sllv_epi32
_mm_sllv_epi64
_mm_sm3msg1_epi32
_mm_sm3msg2_epi32
_mm_sm3rnds2_epi32
_mm_sm4key4_epi32
_mm_sm4rnds4_epi32
_mm_srav_epi32
_mm_srlv_epi32
_mm_srlv_epi64
_mm_testc_pd
_mm_testc_ps
_mm_testnzc_pd
_mm_testnzc_ps
_mm_testz_pd
_mm_testz_ps
+

The list of AVX intrinsics came from Intel Intrinsics Guide.

+ +
+
+ +
+
+ +
+ +
+ +
+ + + + « Previous + + + Next » + + +
+ + + + + + + + diff --git a/migrating_sse/index.html b/migrating_sse/index.html new file mode 100644 index 00000000..f0ca3bb1 --- /dev/null +++ b/migrating_sse/index.html @@ -0,0 +1,1997 @@ + + + + + + + + Migrating from SSE to LSX - Unofficial LoongArch Intrinsics Guide + + + + + + + + + + + + + + +
+ + +
+ +
+
+
    +
  • + +
  • +
  • +
+
+
+
+
+ +

Migrating from SSE to LSX

+

SSE is a 128-bit SIMD extension to X86. It is possible to migrate existing SSE code to leverage LoongArch LSX extension by rewriting the intrinsics or instructions manually, or by using tools like SIMD Everywhere to implement SSE intrinsics with LSX counterparts. But to unleash the full performance, you may want to port your code to LSX manually.

+

Thankfully, LSX intrinsics adopt the same types as SSE: you can use the following familiar types for SIMD:

+
    +
  • __m128: 128-bit vector of single precision floating point numbers
  • +
  • __m128d: 128-bit vector of double precision floating point numbers
  • +
  • __m128i: 128-bit vector of integers, which can be of any width
  • +
+

Here is a table mapping SSE intrinsics to their LSX counterparts (WIP):

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
SSELSX
_mm_abs_epi16__lsx_vsigncov_h
_mm_abs_epi32__lsx_vsigncov_w
_mm_abs_epi8__lsx_vsigncov_b
_mm_add_epi16__lsx_vadd_h
_mm_add_epi32__lsx_vadd_w
_mm_add_epi64__lsx_vadd_d
_mm_add_epi8__lsx_vadd_b
_mm_add_pd__lsx_vfadd_d
_mm_add_ps__lsx_vfadd_s
_mm_add_sd__lsx_vfadd_d + __lsx_vextrins_d
_mm_add_ss__lsx_vfadd_s + __lsx_vextrins_w
_mm_adds_epi16__lsx_vsadd_h
_mm_adds_epi8__lsx_vsadd_b
_mm_adds_epu16__lsx_vsadd_hu
_mm_adds_epu8__lsx_vsadd_bu
_mm_addsub_pd
_mm_addsub_ps
_mm_alignr_epi8
_mm_and_pd__lsx_vand_v
_mm_and_ps__lsx_vand_v
_mm_and_si128__lsx_vand_v
_mm_andnot_pd__lsx_vandn_v
_mm_andnot_ps__lsx_vandn_v
_mm_andnot_si128__lsx_vandn_v
_mm_avg_epu16__lsx_vavgr_hu
_mm_avg_epu8__lsx_vavgr_bu
_mm_blend_epi16
_mm_blend_pd
_mm_blend_ps
_mm_blendv_epi8
_mm_blendv_pd
_mm_blendv_ps
_mm_bslli_si128__lsx_vbsll_v
_mm_bsrli_si128__lsx_vbsrl_v
_mm_castpd_pstype conversion
_mm_castpd_si128type conversion
_mm_castps_pdtype conversion
_mm_castps_si128type conversion
_mm_castsi128_pdtype conversion
_mm_castsi128_pstype conversion
_mm_ceil_pd__lsx_vfrintrp_d
_mm_ceil_ps__lsx_vfrintrp_s
_mm_ceil_sd__lsx_vfrintrp_d + __lsx_vextrins_d
_mm_ceil_ss__lsx_vfrintrp_s + __lsx_vextrins_w
_mm_cmpeq_epi16__lsx_vseq_h
_mm_cmpeq_epi32__lsx_vseq_w
_mm_cmpeq_epi64__lsx_vseq_d
_mm_cmpeq_epi8__lsx_vseq_b
_mm_cmpeq_pd__lsx_vfcmp_ceq_d
_mm_cmpeq_ps__lsx_vfcmp_ceq_s
_mm_cmpeq_sd__lsx_vfcmp_ceq_d + __lsx_vextrins_d
_mm_cmpeq_ss__lsx_vfcmp_ceq_s + __lsx_vextrins_w
_mm_cmpestra
_mm_cmpestrc
_mm_cmpestri
_mm_cmpestrm
_mm_cmpestro
_mm_cmpestrs
_mm_cmpestrz
_mm_cmpge_pd__lsx_vfcmp_cle_d
_mm_cmpge_ps__lsx_vfcmp_cle_s
_mm_cmpge_sd__lsx_vfcmp_cle_d + __lsx_vextrins_d
_mm_cmpge_ss__lsx_vfcmp_cle_s + __lsx_vextrins_w
_mm_cmpgt_epi16__lsx_vslt_h
_mm_cmpgt_epi32__lsx_vslt_w
_mm_cmpgt_epi64__lsx_vslt_d
_mm_cmpgt_epi8__lsx_vslt_b
_mm_cmpgt_pd__lsx_vfcmp_clt_d
_mm_cmpgt_ps__lsx_vfcmp_clt_s
_mm_cmpgt_sd__lsx_vfcmp_clt_d + __lsx_vextrins_d
_mm_cmpgt_ss__lsx_vfcmp_clt_s + __lsx_vextrins_w
_mm_cmpistra
_mm_cmpistrc
_mm_cmpistri
_mm_cmpistrm
_mm_cmpistro
_mm_cmpistrs
_mm_cmpistrz
_mm_cmple_pd__lsx_vfcmp_cle_d
_mm_cmple_ps__lsx_vfcmp_cle_s
_mm_cmple_sd__lsx_vfcmp_cle_d + __lsx_vextrins_d
_mm_cmple_ss__lsx_vfcmp_cle_s + __lsx_vextrins_w
_mm_cmplt_epi16__lsx_vslt_h
_mm_cmplt_epi32__lsx_vslt_w
_mm_cmplt_epi8__lsx_vslt_b
_mm_cmplt_pd__lsx_vfcmp_clt_d
_mm_cmplt_ps__lsx_vfcmp_clt_s
_mm_cmplt_sd__lsx_vfcmp_clt_d + __lsx_vextrins_d
_mm_cmplt_ss__lsx_vfcmp_clt_s + __lsx_vextrins_w
_mm_cmpneq_pd__lsx_vfcmp_cune_d
_mm_cmpneq_ps__lsx_vfcmp_cune_s
_mm_cmpneq_sd__lsx_vfcmp_cune_d + __lsx_vextrins_d
_mm_cmpneq_ss__lsx_vfcmp_cune_s + __lsx_vextrins_w
_mm_cmpnge_pd__lsx_vfcmp_cult_d
_mm_cmpnge_ps__lsx_vfcmp_cult_s
_mm_cmpnge_sd__lsx_vfcmp_cult_d + __lsx_vextrins_d
_mm_cmpnge_ss__lsx_vfcmp_cult_s + __lsx_vextrins_w
_mm_cmpngt_pd__lsx_vfcmp_cule_d
_mm_cmpngt_ps__lsx_vfcmp_cule_s
_mm_cmpngt_sd__lsx_vfcmp_cule_d + __lsx_vextrins_d
_mm_cmpngt_ss__lsx_vfcmp_cule_s + __lsx_vextrins_w
_mm_cmpnle_pd__lsx_vfcmp_cult_d
_mm_cmpnle_ps__lsx_vfcmp_cult_s
_mm_cmpnle_sd__lsx_vfcmp_cult_d + __lsx_vextrins_d
_mm_cmpnle_ss__lsx_vfcmp_cult_s + __lsx_vextrins_w
_mm_cmpnlt_pd__lsx_vfcmp_cule_d
_mm_cmpnlt_ps__lsx_vfcmp_cule_s
_mm_cmpnlt_sd__lsx_vfcmp_cule_d + __lsx_vextrins_d
_mm_cmpnlt_ss__lsx_vfcmp_cule_s + __lsx_vextrins_w
_mm_cmpord_pd__lsx_vfcmp_cor_d
_mm_cmpord_ps__lsx_vfcmp_cor_s
_mm_cmpord_sd__lsx_vfcmp_cor_d + __lsx_vextrins_d
_mm_cmpord_ss__lsx_vfcmp_cor_s + __lsx_vextrins_w
_mm_cmpunord_pd__lsx_vfcmp_cun_d
_mm_cmpunord_ps__lsx_vfcmp_cun_s
_mm_cmpunord_sd__lsx_vfcmp_cun_d + __lsx_vextrins_d
_mm_cmpunord_ss__lsx_vfcmp_cun_s + __lsx_vextrins_w
_mm_comieq_sd
_mm_comieq_ss
_mm_comige_sd
_mm_comige_ss
_mm_comigt_sd
_mm_comigt_ss
_mm_comile_sd
_mm_comile_ss
_mm_comilt_sd
_mm_comilt_ss
_mm_comineq_sd
_mm_comineq_ss
_mm_cvt_pi2ps
_mm_cvt_ps2pi
_mm_cvt_si2ss
_mm_cvt_ss2si
_mm_cvtepi16_epi32__lsx_vsllwil_w_h
_mm_cvtepi16_epi64
_mm_cvtepi32_epi64__lsx_vsllwil_d_w
_mm_cvtepi32_pd__lsx_vffintl_d_w
_mm_cvtepi32_ps__lsx_vffint_s_w
_mm_cvtepi8_epi16__lsx_vsllwil_h_b
_mm_cvtepi8_epi32
_mm_cvtepi8_epi64
_mm_cvtepu16_epi32__lsx_vsllwil_wu_hu
_mm_cvtepu16_epi64
_mm_cvtepu32_epi64__lsx_vsllwil_du_wu
_mm_cvtepu8_epi16__lsx_vsllwil_hu_bu
_mm_cvtepu8_epi32
_mm_cvtepu8_epi64
_mm_cvtpd_epi32__lsx_vftint_w_d
_mm_cvtpd_pi32
_mm_cvtpd_ps__lsx_vfcvt_s_d
_mm_cvtpi16_ps
_mm_cvtpi32_pd
_mm_cvtpi32_ps
_mm_cvtpi32x2_ps
_mm_cvtpi8_ps
_mm_cvtps_epi32__lsx_vftint_w_s
_mm_cvtps_pd__lsx_vfcvtl_d_s
_mm_cvtps_pi16
_mm_cvtps_pi32
_mm_cvtps_pi8
_mm_cvtpu16_ps
_mm_cvtpu8_ps
_mm_cvtsd_f64
_mm_cvtsd_si32
_mm_cvtsd_si64
_mm_cvtsd_si64x
_mm_cvtsd_ss
_mm_cvtsi128_si32__lsx_vpickve2gr_w
_mm_cvtsi128_si64__lsx_vpickve2gr_d
_mm_cvtsi128_si64x__lsx_vpickve2gr_d
_mm_cvtsi32_sd
_mm_cvtsi32_si128
_mm_cvtsi32_ss
_mm_cvtsi64_sd
_mm_cvtsi64_si128
_mm_cvtsi64_ss
_mm_cvtsi64x_sd
_mm_cvtsi64x_si128
_mm_cvtss_f32
_mm_cvtss_sd
_mm_cvtss_si32
_mm_cvtss_si64
_mm_cvtt_ps2pi
_mm_cvtt_ss2si
_mm_cvttpd_epi32__lsx_vftintrz_w_d
_mm_cvttpd_pi32
_mm_cvttps_epi32__lsx_vftintrz_w_s
_mm_cvttps_pi32
_mm_cvttsd_si32
_mm_cvttsd_si64
_mm_cvttsd_si64x
_mm_cvttss_si32
_mm_cvttss_si64
_mm_div_pd__lsx_vfdiv_d
_mm_div_ps__lsx_vfdiv_s
_mm_div_sd__lsx_vfdiv_d + __lsx_vextrins_d
_mm_div_ss__lsx_vfdiv_s + __lsx_vextrins_w
_mm_dp_pd
_mm_dp_ps
_mm_extract_epi16__lsx_vpickve2gr_h
_mm_extract_epi32__lsx_vpickve2gr_w
_mm_extract_epi64__lsx_vpickve2gr_d
_mm_extract_epi8__lsx_vpickve2gr_b
_mm_extract_ps__lsx_vpickve2gr_w
_mm_floor_pd__lsx_vfrintrm_d
_mm_floor_ps__lsx_vfrintrm_s
_mm_floor_sd__lsx_vfrintrm_d + __lsx_vextrins_d
_mm_floor_ss__lsx_vfrintrm_s + __lsx_vextrins_w
_mm_hadd_epi16
_mm_hadd_epi32
_mm_hadd_pd
_mm_hadd_ps
_mm_hadds_epi16
_mm_hsub_epi16
_mm_hsub_epi32
_mm_hsub_pd
_mm_hsub_ps
_mm_hsubs_epi16
_mm_insert_epi16__lsx_vinsgr2vr_h
_mm_insert_epi32__lsx_vinsgr2vr_w
_mm_insert_epi64__lsx_vinsgr2vr_d
_mm_insert_epi8__lsx_vinsgr2vr_b
_mm_insert_ps__lsx_vinsgr2vr_w
_mm_lddqu_si128
_mm_load_pd__lsx_vld
_mm_load_pd1__lsx_vldrepl_d
_mm_load_ps__lsx_vld
_mm_load_ps1__lsx_vldrepl_w
_mm_load_sd
_mm_load_si128
_mm_load_ss
_mm_load1_pd__lsx_vldrepl_d
_mm_load1_ps__lsx_vldrepl_w
_mm_loaddup_pd
_mm_loadh_pd
_mm_loadh_pi
_mm_loadl_epi64
_mm_loadl_pd
_mm_loadl_pi
_mm_loadr_pd__lsx_vld + __lsx_vshuf4i_w
_mm_loadr_ps__lsx_vld + __lsx_vshuf4i_w
_mm_loadu_pd__lsx_vld
_mm_loadu_ps__lsx_vld
_mm_loadu_si128__lsx_vld
_mm_loadu_si16
_mm_loadu_si32
_mm_loadu_si64
_mm_madd_epi16
_mm_maddubs_epi16
_mm_maskmoveu_si128
_mm_max_epi16__lsx_vmax_h
_mm_max_epi32__lsx_vmax_w
_mm_max_epi8__lsx_vmax_b
_mm_max_epu16__lsx_vmax_hu
_mm_max_epu32__lsx_vmax_wu
_mm_max_epu8__lsx_vmax_bu
_mm_max_pd__lsx_vfmax_d
_mm_max_ps__lsx_vfmax_s
_mm_max_sd__lsx_vfmax_d + __lsx_vextrins_d
_mm_max_ss__lsx_vfmax_s + __lsx_vextrins_w
_mm_min_epi16__lsx_vmin_h
_mm_min_epi32__lsx_vmin_w
_mm_min_epi8__lsx_vmin_b
_mm_min_epu16__lsx_vmin_hu
_mm_min_epu32__lsx_vmin_wu
_mm_min_epu8__lsx_vmin_bu
_mm_min_pd__lsx_vfmin_d
_mm_min_ps__lsx_vfmin_s
_mm_min_sd__lsx_vfmin_d + __lsx_vextrins_d
_mm_min_ss__lsx_vfmin_s + __lsx_vextrins_w
_mm_minpos_epu16
_mm_move_epi64
_mm_move_sd__lsx_vextrins_d
_mm_move_ss__lsx_vextrins_w
_mm_movedup_pd
_mm_movehdup_ps
_mm_movehl_ps__lsx_vilvh_d
_mm_moveldup_ps
_mm_movelh_ps__lsx_vilvl_d
_mm_movemask_epi8
_mm_movemask_pd
_mm_movemask_ps__lsx_vmskltz_w + __lsx_vpickve2gr_wu
_mm_movepi64_pi64
_mm_movpi64_epi64
_mm_mpsadbw_epu8
_mm_mul_epi32
_mm_mul_epu32
_mm_mul_pd__lsx_vfmul_d
_mm_mul_ps__lsx_vfmul_s
_mm_mul_sd
_mm_mul_ss
_mm_mulhi_epi16
_mm_mulhi_epu16
_mm_mulhrs_epi16
_mm_mullo_epi16
_mm_mullo_epi32
_mm_or_pd__lsx_vor_v
_mm_or_ps__lsx_vor_v
_mm_or_si128__lsx_vor_v
_mm_packs_epi16
_mm_packs_epi32
_mm_packus_epi16
_mm_packus_epi32
_mm_rcp_ps__lsx_vfrecip_s
_mm_rcp_ss
_mm_round_pd__lsx_vfrintr*_d
_mm_round_ps__lsx_vfrintr*_s
_mm_round_sd
_mm_round_ss
_mm_rsqrt_ps__lsx_vfrsqrt_s
_mm_rsqrt_ss
_mm_sad_epu8
_mm_set_epi16
_mm_set_epi32
_mm_set_epi64
_mm_set_epi64x
_mm_set_epi8
_mm_set_pd
_mm_set_pd1__lsx_vldrepl_d/__lsx_vreplgr2vr_d
_mm_set_ps
_mm_set_ps1__lsx_vldrepl_w/__lsx_vreplgr2vr_w
_mm_set_sd
_mm_set_ss
_mm_set1_epi16__lsx_vreplgr2vr_h
_mm_set1_epi32__lsx_vreplgr2vr_w
_mm_set1_epi64__lsx_vreplgr2vr_d
_mm_set1_epi64x
_mm_set1_epi8__lsx_vreplgr2vr_b
_mm_set1_pd
_mm_set1_ps
_mm_setr_epi16use lsxintrin.h--v8i16 to reverse construction
_mm_setr_epi32use lsxintrin.h--v4i32 to reverse construction
_mm_setr_epi64use lsxintrin.h--v2i64 to reverse construction
_mm_setr_epi8use lsxintrin.h--v16i8 to reverse construction
_mm_setr_pduse lsxintrin.h--v2f64 to reverse construction
_mm_setr_psuse lsxintrin.h--v4f32 to reverse construction
_mm_setzero_pd(__m128d)__lsx_vldi(0)
_mm_setzero_ps(__m128)__lsx_vldi(0)
_mm_setzero_si128__lsx_vldi(0)
_mm_shuffle_epi32
_mm_shuffle_epi8
_mm_shuffle_pd
_mm_shuffle_ps
_mm_shufflehi_epi16
_mm_shufflelo_epi16
_mm_sign_epi16
_mm_sign_epi32
_mm_sign_epi8
_mm_sll_epi16__lsx_vsll_h
_mm_sll_epi32__lsx_vsll_w
_mm_sll_epi64__lsx_vsll_d
_mm_slli_epi16__lsx_vslli_h
_mm_slli_epi32__lsx_vslli_w
_mm_slli_epi64__lsx_vslli_d
_mm_slli_si128
_mm_sqrt_pd__lsx_vfsqrt_d
_mm_sqrt_ps__lsx_vfsqrt_s
_mm_sqrt_sd
_mm_sqrt_ss
_mm_sra_epi16__lsx_vsra_h
_mm_sra_epi32__lsx_vsra_w
_mm_srai_epi16__lsx_vsrai_h
_mm_srai_epi32__lsx_vsrai_w
_mm_srl_epi16__lsx_vsrl_h
_mm_srl_epi32__lsx_vsrl_w
_mm_srl_epi64__lsx_vsrl_d
_mm_srli_epi16__lsx_vsrli_h
_mm_srli_epi32__lsx_vsrli_w
_mm_srli_epi64__lsx_vsrli_d
_mm_srli_si128
_mm_store_pd__lsx_vst
_mm_store_pd1
_mm_store_ps__lsx_vst
_mm_store_ps1
_mm_store_sd
_mm_store_si128
_mm_store_ss__lsx_vstelm_w
_mm_store1_pd__lsx_vreplvei_d + __lsx_vst
_mm_store1_ps__lsx_vreplvei_w + __lsx_vst
_mm_storeh_pd
_mm_storeh_pi
_mm_storel_epi64
_mm_storel_pd
_mm_storel_pi
_mm_storer_pd
_mm_storer_ps__lsx_vshuf4i_w + __lsx_vst
_mm_storeu_pd__lsx_vst
_mm_storeu_ps__lsx_vst
_mm_storeu_si128
_mm_storeu_si16
_mm_storeu_si32
_mm_storeu_si64
_mm_stream_load_si128
_mm_stream_pd
_mm_stream_ps
_mm_stream_si128
_mm_sub_epi16__lsx_vsub_h
_mm_sub_epi32__lsx_vsub_w
_mm_sub_epi64__lsx_vsub_d
_mm_sub_epi8__lsx_vsub_b
_mm_sub_pd__lsx_vfsub_d
_mm_sub_ps__lsx_vfsub_s
_mm_sub_sd
_mm_sub_ss
_mm_subs_epi16
_mm_subs_epi8
_mm_subs_epu16
_mm_subs_epu8
_mm_test_all_ones
_mm_test_all_zeros
_mm_test_mix_ones_zeros
_mm_testc_si128
_mm_testnzc_si128
_mm_testz_si128
_MM_TRANSPOSE4_PS
_mm_ucomieq_sd
_mm_ucomieq_ss
_mm_ucomige_sd
_mm_ucomige_ss
_mm_ucomigt_sd
_mm_ucomigt_ss
_mm_ucomile_sd
_mm_ucomile_ss
_mm_ucomilt_sd
_mm_ucomilt_ss
_mm_ucomineq_sd
_mm_ucomineq_ss
_mm_undefined_pd
_mm_undefined_ps
_mm_undefined_si128
_mm_unpackhi_epi16__lsx_vilvh_h
_mm_unpackhi_epi32__lsx_vilvh_w
_mm_unpackhi_epi64__lsx_vilvh_d
_mm_unpackhi_epi8__lsx_vilvh_b
_mm_unpackhi_pd__lsx_vilvh_d
_mm_unpackhi_ps__lsx_vilvh_w
_mm_unpacklo_epi16__lsx_vilvl_h
_mm_unpacklo_epi32__lsx_vilvl_w
_mm_unpacklo_epi64__lsx_vilvl_d
_mm_unpacklo_epi8__lsx_vilvl_b
_mm_unpacklo_pd__lsx_vilvl_d
_mm_unpacklo_ps__lsx_vilvl_w
_mm_xor_pd__lsx_vxor_v
_mm_xor_ps__lsx_vxor_v
_mm_xor_si128__lsx_vxor_v
+

The list of SSE intrinsics comes from the Intel Intrinsics Guide.

+ +
+
+ +
+
+ +
+ +
+ +
+ + + + « Previous + + + Next » + + +
+ + + + + + + + diff --git a/sitemap.xml b/sitemap.xml new file mode 100644 index 00000000..e226d815 --- /dev/null +++ b/sitemap.xml @@ -0,0 +1,178 @@ + + + + https://jia.je/unofficial-loongarch-intrinsics-guide/ + 2024-07-17 + daily + + + https://jia.je/unofficial-loongarch-intrinsics-guide/latency_throughput/ + 2024-07-17 + daily + + + https://jia.je/unofficial-loongarch-intrinsics-guide/migrating_avx/ + 2024-07-17 + daily + + + https://jia.je/unofficial-loongarch-intrinsics-guide/migrating_sse/ + 2024-07-17 + daily + + + https://jia.je/unofficial-loongarch-intrinsics-guide/viewer/ + 2024-07-17 + daily + + + https://jia.je/unofficial-loongarch-intrinsics-guide/lasx/bitwise_operations/ + 2024-07-17 + daily + + + https://jia.je/unofficial-loongarch-intrinsics-guide/lasx/branch/ + 2024-07-17 + daily + + + https://jia.je/unofficial-loongarch-intrinsics-guide/lasx/float_comparison/ + 2024-07-17 + daily + + + https://jia.je/unofficial-loongarch-intrinsics-guide/lasx/float_computation/ + 2024-07-17 + daily + + + https://jia.je/unofficial-loongarch-intrinsics-guide/lasx/float_conversion/ + 2024-07-17 + daily + + + https://jia.je/unofficial-loongarch-intrinsics-guide/lasx/float_misc/ + 2024-07-17 + daily + + + https://jia.je/unofficial-loongarch-intrinsics-guide/lasx/fma/ + 2024-07-17 + daily + + + https://jia.je/unofficial-loongarch-intrinsics-guide/lasx/integer_comparison/ + 2024-07-17 + daily + + + https://jia.je/unofficial-loongarch-intrinsics-guide/lasx/integer_computation/ + 2024-07-17 + daily + + + https://jia.je/unofficial-loongarch-intrinsics-guide/lasx/logical/ + 2024-07-17 + daily + + + https://jia.je/unofficial-loongarch-intrinsics-guide/lasx/memory/ + 2024-07-17 + daily + + + https://jia.je/unofficial-loongarch-intrinsics-guide/lasx/misc/ + 2024-07-17 + daily + + + https://jia.je/unofficial-loongarch-intrinsics-guide/lasx/permutation/ + 2024-07-17 + daily + + + https://jia.je/unofficial-loongarch-intrinsics-guide/lasx/shift/ + 2024-07-17 + daily + + + 
https://jia.je/unofficial-loongarch-intrinsics-guide/lasx/shuffling/ + 2024-07-17 + daily + + + https://jia.je/unofficial-loongarch-intrinsics-guide/lsx/bitwise_operations/ + 2024-07-17 + daily + + + https://jia.je/unofficial-loongarch-intrinsics-guide/lsx/branch/ + 2024-07-17 + daily + + + https://jia.je/unofficial-loongarch-intrinsics-guide/lsx/float_comparison/ + 2024-07-17 + daily + + + https://jia.je/unofficial-loongarch-intrinsics-guide/lsx/float_computation/ + 2024-07-17 + daily + + + https://jia.je/unofficial-loongarch-intrinsics-guide/lsx/float_conversion/ + 2024-07-17 + daily + + + https://jia.je/unofficial-loongarch-intrinsics-guide/lsx/float_misc/ + 2024-07-17 + daily + + + https://jia.je/unofficial-loongarch-intrinsics-guide/lsx/fma/ + 2024-07-17 + daily + + + https://jia.je/unofficial-loongarch-intrinsics-guide/lsx/integer_comparison/ + 2024-07-17 + daily + + + https://jia.je/unofficial-loongarch-intrinsics-guide/lsx/integer_computation/ + 2024-07-17 + daily + + + https://jia.je/unofficial-loongarch-intrinsics-guide/lsx/logical/ + 2024-07-17 + daily + + + https://jia.je/unofficial-loongarch-intrinsics-guide/lsx/memory/ + 2024-07-17 + daily + + + https://jia.je/unofficial-loongarch-intrinsics-guide/lsx/misc/ + 2024-07-17 + daily + + + https://jia.je/unofficial-loongarch-intrinsics-guide/lsx/permutation/ + 2024-07-17 + daily + + + https://jia.je/unofficial-loongarch-intrinsics-guide/lsx/shift/ + 2024-07-17 + daily + + + https://jia.je/unofficial-loongarch-intrinsics-guide/lsx/shuffling/ + 2024-07-17 + daily + + \ No newline at end of file diff --git a/sitemap.xml.gz b/sitemap.xml.gz new file mode 100644 index 00000000..21f13a89 Binary files /dev/null and b/sitemap.xml.gz differ diff --git a/viewer/index.html b/viewer/index.html new file mode 100644 index 00000000..a05393ea --- /dev/null +++ b/viewer/index.html @@ -0,0 +1,341 @@ + + + + + + + + Browse All Intrinsics - Unofficial LoongArch Intrinsics Guide + + + + + + + + + + + + + + +
+ + +
+ +
+
+
    +
  • + +
  • +
  • +
+
+
+
+
+ +

Browse All Intrinsics

+
+ +
+ Loading... Please wait... +

+
+ +
+ +Categories: +

+ +
+ + +
+ +

+Instruction Set Extensions: +

+ +
+ + +
+ +

+Filter by content: +

+ + + +

+

+ +Found {{intrinsics.length}} intrinsics. + +

+

+

+
+ {{ intrinsic.name }} +
+
+ +

+

+ +
+
+ + + +
+
+ +
+
+ +
+ +
+ +
+ + + + « Previous + + + Next » + + +
+ + + + + + + +