vec__int512__ppc_8h.html

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "https://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<!-- lang / xml:lang declare the document language (English) for assistive technology -->
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en">
<head>
  <!-- Document metadata -->
  <meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
  <meta http-equiv="X-UA-Compatible" content="IE=9"/>
  <meta name="generator" content="Doxygen 1.8.17"/>
  <meta name="viewport" content="width=device-width, initial-scale=1"/>
  <title>POWER Vector Library Manual: src/pveclib/vec_int512_ppc.h File Reference</title>

  <!-- Stylesheets and scripts; relative load order is kept exactly as generated -->
  <link href="tabs.css" rel="stylesheet" type="text/css"/>
  <script type="text/javascript" src="jquery.js"></script>
  <script type="text/javascript" src="dynsections.js"></script>

  <!-- Search assets -->
  <link href="search/search.css" rel="stylesheet" type="text/css"/>
  <script type="text/javascript" src="search/searchdata.js"></script>
  <script type="text/javascript" src="search/search.js"></script>

  <!-- Main stylesheet, loaded last -->
  <link href="doxygen.css" rel="stylesheet" type="text/css"/>
</head>
<body>
<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
<!-- Title area: shows the project name followed by the project number (1.0.4).
     Markup is left exactly as generated; the text inside #projectname is
     whitespace-sensitive. -->
<div id="titlearea">
<table cellspacing="0" cellpadding="0">
 <tbody>
 <tr style="height: 56px;">
  <td id="projectalign" style="padding-left: 0.5em;">
   <div id="projectname">POWER Vector Library Manual
   &#160;<span id="projectnumber">1.0.4</span>
   </div>
  </td>
 </tr>
 </tbody>
</table>
</div>
<!-- end header part -->
<!-- Generated by Doxygen 1.8.17 -->
<script type="text/javascript">
/* @license magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&amp;dn=gpl-2.0.txt GPL-v2 */
// Global search controller; it must stay a global because the inline
// handlers on #MSearchSelectWindow call searchBox.OnSearchSelect*().
var searchBox = new SearchBox(
  "searchBox",
  "search",
  false,
  'Search'
);
/* @license-end */
</script>
<script type="text/javascript" src="menudata.js"></script>
<script type="text/javascript" src="menu.js"></script>
<script type="text/javascript">
/* @license magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&amp;dn=gpl-2.0.txt GPL-v2 */
// Once the DOM is ready: call initMenu, then register init_search to run
// on document ready as well.
$(function () {
  initMenu('', true, false, 'search.php', 'Search');
  $(document).ready(function () {
    init_search();
  });
});
/* @license-end */
</script>
<div id="main-nav"></div>
<!-- Window showing the search filter options. Mouse-over, mouse-out and
     key-down events on it are forwarded to the global searchBox object's
     OnSearchSelectShow / OnSearchSelectHide / OnSearchSelectKey handlers. -->
<div id="MSearchSelectWindow"
     onmouseover="return searchBox.OnSearchSelectShow()"
     onmouseout="return searchBox.OnSearchSelectHide()"
     onkeydown="return searchBox.OnSearchSelectKey(event)">
</div>

<!-- iframe showing the search results (closed by default).
     The title attribute gives the frame an accessible name so assistive
     technology can announce what the frame contains. -->
<div id="MSearchResultsWindow">
<iframe src="javascript:void(0)" frameborder="0" title="Search results"
        name="MSearchResults" id="MSearchResults">
</iframe>
</div>

<!-- Breadcrumb trail to the directory containing this file: src, then pveclib -->
<div id="nav-path" class="navpath">
  <ul>
<li class="navelem"><a class="el" href="dir_68267d1309a1af8e8297ef4c3efbcdba.html">src</a></li><li class="navelem"><a class="el" href="dir_3653a864936a87c29f489ec2a5b8be1c.html">pveclib</a></li>  </ul>
</div>
</div><!-- top -->
<!-- Page header: in-page jump links to the Classes, Macros and Functions
     sections (separated by "|" characters), followed by the page title. -->
<div class="header">
  <div class="summary">
<a href="#nested-classes">Classes</a> &#124;
<a href="#define-members">Macros</a> &#124;
<a href="#func-members">Functions</a>  </div>
  <div class="headertitle">
<div class="title">vec_int512_ppc.h File Reference</div>  </div>
</div><!--header-->
<div class="contents">

<p>Header package containing a collection of multiple precision quadword integer computation functions implemented with 128-bit PowerISA VMX and VSX instructions.  
<a href="#details">More...</a></p>
<div class="textblock"><code>#include &lt;<a class="el" href="vec__int128__ppc_8h_source.html">pveclib/vec_int128_ppc.h</a>&gt;</code><br />
</div>
<p><a href="vec__int512__ppc_8h_source.html">Go to the source code of this file.</a></p>
<table class="memberdecls">
<!-- Classes section. After the heading row (anchor "nested-classes"), each
     struct or union is listed as three rows: memitem (kind and linked name),
     memdesc (brief description with a "More..." link) and a separator. -->
<tr class="heading"><td colspan="2"><h2 class="groupheader"><a name="nested-classes"></a>
Classes</h2></td></tr>
<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="struct____VEC__U__256.html">__VEC_U_256</a></td></tr>
<tr class="memdesc:"><td class="mdescLeft">&#160;</td><td class="mdescRight">A vector representation of a 256-bit unsigned integer.  <a href="struct____VEC__U__256.html#details">More...</a><br /></td></tr>
<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="struct____VEC__U__512.html">__VEC_U_512</a></td></tr>
<tr class="memdesc:"><td class="mdescLeft">&#160;</td><td class="mdescRight">A vector representation of a 512-bit unsigned integer.  <a href="struct____VEC__U__512.html#details">More...</a><br /></td></tr>
<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="struct____VEC__U__640.html">__VEC_U_640</a></td></tr>
<tr class="memdesc:"><td class="mdescLeft">&#160;</td><td class="mdescRight">A vector representation of a 640-bit unsigned integer.  <a href="struct____VEC__U__640.html#details">More...</a><br /></td></tr>
<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">union &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="union____VEC__U__512x1.html">__VEC_U_512x1</a></td></tr>
<tr class="memdesc:"><td class="mdescLeft">&#160;</td><td class="mdescRight">A vector representation of a 512-bit unsigned integer and a 128-bit carry-out.  <a href="union____VEC__U__512x1.html#details">More...</a><br /></td></tr>
<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="struct____VEC__U__1024.html">__VEC_U_1024</a></td></tr>
<tr class="memdesc:"><td class="mdescLeft">&#160;</td><td class="mdescRight">A vector representation of a 1024-bit unsigned integer.  <a href="struct____VEC__U__1024.html#details">More...</a><br /></td></tr>
<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="struct____VEC__U__1152.html">__VEC_U_1152</a></td></tr>
<tr class="memdesc:"><td class="mdescLeft">&#160;</td><td class="mdescRight">A vector representation of a 1152-bit unsigned integer.  <a href="struct____VEC__U__1152.html#details">More...</a><br /></td></tr>
<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="struct____VEC__U__2048.html">__VEC_U_2048</a></td></tr>
<tr class="memdesc:"><td class="mdescLeft">&#160;</td><td class="mdescRight">A vector representation of a 2048-bit unsigned integer.  <a href="struct____VEC__U__2048.html#details">More...</a><br /></td></tr>
<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">union &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="union____VEC__U__1024x512.html">__VEC_U_1024x512</a></td></tr>
<tr class="memdesc:"><td class="mdescLeft">&#160;</td><td class="mdescRight">A vector representation of a 1024-bit unsigned integer as two 512-bit fields.  <a href="union____VEC__U__1024x512.html#details">More...</a><br /></td></tr>
<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">union &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="union____VEC__U__2048x512.html">__VEC_U_2048x512</a></td></tr>
<tr class="memdesc:"><td class="mdescLeft">&#160;</td><td class="mdescRight">A vector representation of a 2048-bit unsigned integer as 4 x 512-bit integer fields.  <a href="union____VEC__U__2048x512.html#details">More...</a><br /></td></tr>
<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="struct____VEC__U__2176.html">__VEC_U_2176</a></td></tr>
<tr class="memdesc:"><td class="mdescLeft">&#160;</td><td class="mdescRight">A vector representation of a 2176-bit unsigned integer.  <a href="struct____VEC__U__2176.html#details">More...</a><br /></td></tr>
<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="struct____VEC__U__4096.html">__VEC_U_4096</a></td></tr>
<tr class="memdesc:"><td class="mdescLeft">&#160;</td><td class="mdescRight">A vector representation of a 4096-bit unsigned integer.  <a href="struct____VEC__U__4096.html#details">More...</a><br /></td></tr>
<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">union &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="union____VEC__U__4096x512.html">__VEC_U_4096x512</a></td></tr>
<tr class="memdesc:"><td class="mdescLeft">&#160;</td><td class="mdescRight">A vector representation of a 4096-bit unsigned integer as 8 x 512-bit integer fields.  <a href="union____VEC__U__4096x512.html#details">More...</a><br /></td></tr>
<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
</table><table class="memberdecls">
<!-- Macros section. After the heading row (anchor "define-members"), each
     macro is listed as a memitem row (name, arguments and expansion), a
     memdesc row (brief description) and a separator row. The hex suffix in
     each row's class matches the fragment in that macro's link. -->
<tr class="heading"><td colspan="2"><h2 class="groupheader"><a name="define-members"></a>
Macros</h2></td></tr>
<tr class="memitem:aac9d31829610c29b0f5558bfb1f18e4a"><td class="memItemLeft" align="right" valign="top">#define&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int512__ppc_8h.html#aac9d31829610c29b0f5558bfb1f18e4a">CONST_VINT512_Q</a>(__q0,  __q1,  __q2,  __q3)&#160;&#160;&#160;{__q3, __q2, __q1, __q0}</td></tr>
<tr class="memdesc:aac9d31829610c29b0f5558bfb1f18e4a"><td class="mdescLeft">&#160;</td><td class="mdescRight">Generate a 512-bit vector unsigned integer constant from 4 x quadword constants.  <a href="vec__int512__ppc_8h.html#aac9d31829610c29b0f5558bfb1f18e4a">More...</a><br /></td></tr>
<tr class="separator:aac9d31829610c29b0f5558bfb1f18e4a"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:a9376cb8baadb875605593f95422a1902"><td class="memItemLeft" align="right" valign="top">#define&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int512__ppc_8h.html#a9376cb8baadb875605593f95422a1902">COMPILE_FENCE</a>&#160;&#160;&#160;__asm (&quot;;&quot;:::)</td></tr>
<tr class="memdesc:a9376cb8baadb875605593f95422a1902"><td class="mdescLeft">&#160;</td><td class="mdescRight">A compiler fence to prevent excessive code motion.  <a href="vec__int512__ppc_8h.html#a9376cb8baadb875605593f95422a1902">More...</a><br /></td></tr>
<tr class="separator:a9376cb8baadb875605593f95422a1902"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:a77eca5d7bebe0f30894fe9669c01b7a7"><td class="memItemLeft" align="right" valign="top"><a id="a77eca5d7bebe0f30894fe9669c01b7a7"></a>
#define&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int512__ppc_8h.html#a77eca5d7bebe0f30894fe9669c01b7a7">__VEC_PWR_IMP</a>(FNAME)&#160;&#160;&#160;FNAME ## _PWR7</td></tr>
<tr class="memdesc:a77eca5d7bebe0f30894fe9669c01b7a7"><td class="mdescLeft">&#160;</td><td class="mdescRight">Macro to add platform suffix for static calls. <br /></td></tr>
<tr class="separator:a77eca5d7bebe0f30894fe9669c01b7a7"><td class="memSeparator" colspan="2">&#160;</td></tr>
</table><table class="memberdecls">
<tr class="heading"><td colspan="2"><h2 class="groupheader"><a name="func-members"></a>
Functions</h2></td></tr>
<tr class="memitem:abf330c11973fdef2cccefa199c3473b9"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="struct____VEC__U__640.html">__VEC_U_640</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int512__ppc_8h.html#abf330c11973fdef2cccefa199c3473b9">vec_add512cu</a> (<a class="el" href="struct____VEC__U__512.html">__VEC_U_512</a> a, <a class="el" href="struct____VEC__U__512.html">__VEC_U_512</a> b)</td></tr>
<tr class="memdesc:abf330c11973fdef2cccefa199c3473b9"><td class="mdescLeft">&#160;</td><td class="mdescRight">Vector Add 512-bit Unsigned Integer &amp; Write Carry.  <a href="vec__int512__ppc_8h.html#abf330c11973fdef2cccefa199c3473b9">More...</a><br /></td></tr>
<tr class="separator:abf330c11973fdef2cccefa199c3473b9"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:a65346bdb9a4a7cc51bcca3e26443936a"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="struct____VEC__U__640.html">__VEC_U_640</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int512__ppc_8h.html#a65346bdb9a4a7cc51bcca3e26443936a">vec_add512ecu</a> (<a class="el" href="struct____VEC__U__512.html">__VEC_U_512</a> a, <a class="el" href="struct____VEC__U__512.html">__VEC_U_512</a> b, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> c)</td></tr>
<tr class="memdesc:a65346bdb9a4a7cc51bcca3e26443936a"><td class="mdescLeft">&#160;</td><td class="mdescRight">Vector Add Extended 512-bit Unsigned Integer &amp; Write Carry.  <a href="vec__int512__ppc_8h.html#a65346bdb9a4a7cc51bcca3e26443936a">More...</a><br /></td></tr>
<tr class="separator:a65346bdb9a4a7cc51bcca3e26443936a"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:a386ebc9ccc979fc415b8d6d66ef2e31c"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="struct____VEC__U__512.html">__VEC_U_512</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int512__ppc_8h.html#a386ebc9ccc979fc415b8d6d66ef2e31c">vec_add512eum</a> (<a class="el" href="struct____VEC__U__512.html">__VEC_U_512</a> a, <a class="el" href="struct____VEC__U__512.html">__VEC_U_512</a> b, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> c)</td></tr>
<tr class="memdesc:a386ebc9ccc979fc415b8d6d66ef2e31c"><td class="mdescLeft">&#160;</td><td class="mdescRight">Vector Add Extended 512-bit Unsigned Integer Modulo.  <a href="vec__int512__ppc_8h.html#a386ebc9ccc979fc415b8d6d66ef2e31c">More...</a><br /></td></tr>
<tr class="separator:a386ebc9ccc979fc415b8d6d66ef2e31c"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:a7508430b3dbea4d708c1e45de93dca04"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="struct____VEC__U__512.html">__VEC_U_512</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int512__ppc_8h.html#a7508430b3dbea4d708c1e45de93dca04">vec_add512um</a> (<a class="el" href="struct____VEC__U__512.html">__VEC_U_512</a> a, <a class="el" href="struct____VEC__U__512.html">__VEC_U_512</a> b)</td></tr>
<tr class="memdesc:a7508430b3dbea4d708c1e45de93dca04"><td class="mdescLeft">&#160;</td><td class="mdescRight">Vector Add 512-bit Unsigned Integer Modulo.  <a href="vec__int512__ppc_8h.html#a7508430b3dbea4d708c1e45de93dca04">More...</a><br /></td></tr>
<tr class="separator:a7508430b3dbea4d708c1e45de93dca04"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:a07a919fc91b93cf47cd48c9dd2a79ae6"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="struct____VEC__U__512.html">__VEC_U_512</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int512__ppc_8h.html#a07a919fc91b93cf47cd48c9dd2a79ae6">vec_add512ze</a> (<a class="el" href="struct____VEC__U__512.html">__VEC_U_512</a> a, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> c)</td></tr>
<tr class="memdesc:a07a919fc91b93cf47cd48c9dd2a79ae6"><td class="mdescLeft">&#160;</td><td class="mdescRight">Vector Add 512-bit to Zero Extended Unsigned Integer Modulo.  <a href="vec__int512__ppc_8h.html#a07a919fc91b93cf47cd48c9dd2a79ae6">More...</a><br /></td></tr>
<tr class="separator:a07a919fc91b93cf47cd48c9dd2a79ae6"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:a3f54578765372cc100062d52caaba70c"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="struct____VEC__U__512.html">__VEC_U_512</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int512__ppc_8h.html#a3f54578765372cc100062d52caaba70c">vec_add512ze2</a> (<a class="el" href="struct____VEC__U__512.html">__VEC_U_512</a> a, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> c1, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> c2)</td></tr>
<tr class="memdesc:a3f54578765372cc100062d52caaba70c"><td class="mdescLeft">&#160;</td><td class="mdescRight">Vector Add 512-bit to Zero Extended2 Unsigned Integer Modulo.  <a href="vec__int512__ppc_8h.html#a3f54578765372cc100062d52caaba70c">More...</a><br /></td></tr>
<tr class="separator:a3f54578765372cc100062d52caaba70c"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:a958e029fc824ec3a73ad9550bf7ea506"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="struct____VEC__U__256.html">__VEC_U_256</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int512__ppc_8h.html#a958e029fc824ec3a73ad9550bf7ea506">vec_mul128x128_inline</a> (<a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> a, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> b)</td></tr>
<tr class="memdesc:a958e029fc824ec3a73ad9550bf7ea506"><td class="mdescLeft">&#160;</td><td class="mdescRight">Vector 128x128-bit Unsigned Integer Multiply.  <a href="vec__int512__ppc_8h.html#a958e029fc824ec3a73ad9550bf7ea506">More...</a><br /></td></tr>
<tr class="separator:a958e029fc824ec3a73ad9550bf7ea506"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:a92120de408d445766efcd709d73840d9"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="struct____VEC__U__512.html">__VEC_U_512</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int512__ppc_8h.html#a92120de408d445766efcd709d73840d9">vec_mul256x256_inline</a> (<a class="el" href="struct____VEC__U__256.html">__VEC_U_256</a> m1, <a class="el" href="struct____VEC__U__256.html">__VEC_U_256</a> m2)</td></tr>
<tr class="memdesc:a92120de408d445766efcd709d73840d9"><td class="mdescLeft">&#160;</td><td class="mdescRight">Vector 256x256-bit Unsigned Integer Multiply.  <a href="vec__int512__ppc_8h.html#a92120de408d445766efcd709d73840d9">More...</a><br /></td></tr>
<tr class="separator:a92120de408d445766efcd709d73840d9"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:ad10d62fe43f329c396b6486402f4f7af"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="struct____VEC__U__640.html">__VEC_U_640</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int512__ppc_8h.html#ad10d62fe43f329c396b6486402f4f7af">vec_mul512x128_inline</a> (<a class="el" href="struct____VEC__U__512.html">__VEC_U_512</a> m1, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> m2)</td></tr>
<tr class="memdesc:ad10d62fe43f329c396b6486402f4f7af"><td class="mdescLeft">&#160;</td><td class="mdescRight">Vector 512x128-bit Unsigned Integer Multiply.  <a href="vec__int512__ppc_8h.html#ad10d62fe43f329c396b6486402f4f7af">More...</a><br /></td></tr>
<tr class="separator:ad10d62fe43f329c396b6486402f4f7af"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:a9f2101271dd5d072a8406bbed160b9c8"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="struct____VEC__U__640.html">__VEC_U_640</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int512__ppc_8h.html#a9f2101271dd5d072a8406bbed160b9c8">vec_madd512x128a128_inline</a> (<a class="el" href="struct____VEC__U__512.html">__VEC_U_512</a> m1, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> m2, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> a1)</td></tr>
<tr class="memdesc:a9f2101271dd5d072a8406bbed160b9c8"><td class="mdescLeft">&#160;</td><td class="mdescRight">Vector 512x128-bit Multiply-Add Unsigned Integer.  <a href="vec__int512__ppc_8h.html#a9f2101271dd5d072a8406bbed160b9c8">More...</a><br /></td></tr>
<tr class="separator:a9f2101271dd5d072a8406bbed160b9c8"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:ac9f6153c72e2194f14627ffca1b26c17"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="struct____VEC__U__640.html">__VEC_U_640</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int512__ppc_8h.html#ac9f6153c72e2194f14627ffca1b26c17">vec_madd512x128a512_inline</a> (<a class="el" href="struct____VEC__U__512.html">__VEC_U_512</a> m1, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> m2, <a class="el" href="struct____VEC__U__512.html">__VEC_U_512</a> a2)</td></tr>
<tr class="memdesc:ac9f6153c72e2194f14627ffca1b26c17"><td class="mdescLeft">&#160;</td><td class="mdescRight">Vector 512x128-bit Multiply-Add Unsigned Integer.  <a href="vec__int512__ppc_8h.html#ac9f6153c72e2194f14627ffca1b26c17">More...</a><br /></td></tr>
<tr class="separator:ac9f6153c72e2194f14627ffca1b26c17"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:a68ccbd4fd74977eb3f69276c6b934c26"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="struct____VEC__U__640.html">__VEC_U_640</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int512__ppc_8h.html#a68ccbd4fd74977eb3f69276c6b934c26">vec_madd512x128a128a512_inline</a> (<a class="el" href="struct____VEC__U__512.html">__VEC_U_512</a> m1, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> m2, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> a1, <a class="el" href="struct____VEC__U__512.html">__VEC_U_512</a> a2)</td></tr>
<tr class="memdesc:a68ccbd4fd74977eb3f69276c6b934c26"><td class="mdescLeft">&#160;</td><td class="mdescRight">Vector 512x128-bit Multiply-Add Unsigned Integer.  <a href="vec__int512__ppc_8h.html#a68ccbd4fd74977eb3f69276c6b934c26">More...</a><br /></td></tr>
<tr class="separator:a68ccbd4fd74977eb3f69276c6b934c26"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:a6f917597902625a8218ba41fc6dca426"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="struct____VEC__U__1024.html">__VEC_U_1024</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int512__ppc_8h.html#a6f917597902625a8218ba41fc6dca426">vec_mul512x512_inline</a> (<a class="el" href="struct____VEC__U__512.html">__VEC_U_512</a> m1, <a class="el" href="struct____VEC__U__512.html">__VEC_U_512</a> m2)</td></tr>
<tr class="memdesc:a6f917597902625a8218ba41fc6dca426"><td class="mdescLeft">&#160;</td><td class="mdescRight">Vector 512x512-bit Unsigned Integer Multiply.  <a href="vec__int512__ppc_8h.html#a6f917597902625a8218ba41fc6dca426">More...</a><br /></td></tr>
<tr class="separator:a6f917597902625a8218ba41fc6dca426"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:a666241f67c39d7fae639235edfb8c3b5"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="struct____VEC__U__1024.html">__VEC_U_1024</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int512__ppc_8h.html#a666241f67c39d7fae639235edfb8c3b5">vec_madd512x512a512_inline</a> (<a class="el" href="struct____VEC__U__512.html">__VEC_U_512</a> m1, <a class="el" href="struct____VEC__U__512.html">__VEC_U_512</a> m2, <a class="el" href="struct____VEC__U__512.html">__VEC_U_512</a> a1)</td></tr>
<tr class="memdesc:a666241f67c39d7fae639235edfb8c3b5"><td class="mdescLeft">&#160;</td><td class="mdescRight">Vector 512-bit Unsigned Integer Multiply-Add.  <a href="vec__int512__ppc_8h.html#a666241f67c39d7fae639235edfb8c3b5">More...</a><br /></td></tr>
<tr class="separator:a666241f67c39d7fae639235edfb8c3b5"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:ab5b80fd9694cea8bf502b26e55af37f7"><td class="memItemLeft" align="right" valign="top"><a class="el" href="struct____VEC__U__256.html">__VEC_U_256</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int512__ppc_8h.html#ab5b80fd9694cea8bf502b26e55af37f7">vec_mul128x128</a> (<a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> m1, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> m2)</td></tr>
<tr class="memdesc:ab5b80fd9694cea8bf502b26e55af37f7"><td class="mdescLeft">&#160;</td><td class="mdescRight">Vector 128x128-bit Unsigned Integer Multiply.  <a href="vec__int512__ppc_8h.html#ab5b80fd9694cea8bf502b26e55af37f7">More...</a><br /></td></tr>
<tr class="separator:ab5b80fd9694cea8bf502b26e55af37f7"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:a131bdfc55718991610c886b2c77f6ae7"><td class="memItemLeft" align="right" valign="top"><a class="el" href="struct____VEC__U__512.html">__VEC_U_512</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int512__ppc_8h.html#a131bdfc55718991610c886b2c77f6ae7">vec_mul256x256</a> (<a class="el" href="struct____VEC__U__256.html">__VEC_U_256</a> m1, <a class="el" href="struct____VEC__U__256.html">__VEC_U_256</a> m2)</td></tr>
<tr class="memdesc:a131bdfc55718991610c886b2c77f6ae7"><td class="mdescLeft">&#160;</td><td class="mdescRight">Vector 256x256-bit Unsigned Integer Multiply.  <a href="vec__int512__ppc_8h.html#a131bdfc55718991610c886b2c77f6ae7">More...</a><br /></td></tr>
<tr class="separator:a131bdfc55718991610c886b2c77f6ae7"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:a0cfdc3e00f5e2c3a9a959969f684203e"><td class="memItemLeft" align="right" valign="top"><a class="el" href="struct____VEC__U__640.html">__VEC_U_640</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int512__ppc_8h.html#a0cfdc3e00f5e2c3a9a959969f684203e">vec_mul512x128</a> (<a class="el" href="struct____VEC__U__512.html">__VEC_U_512</a> m1, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> m2)</td></tr>
<tr class="memdesc:a0cfdc3e00f5e2c3a9a959969f684203e"><td class="mdescLeft">&#160;</td><td class="mdescRight">Vector 512x128-bit Unsigned Integer Multiply.  <a href="vec__int512__ppc_8h.html#a0cfdc3e00f5e2c3a9a959969f684203e">More...</a><br /></td></tr>
<tr class="separator:a0cfdc3e00f5e2c3a9a959969f684203e"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:acf5c808a77a8486a82a9ee87ff414fd2"><td class="memItemLeft" align="right" valign="top"><a class="el" href="struct____VEC__U__640.html">__VEC_U_640</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int512__ppc_8h.html#acf5c808a77a8486a82a9ee87ff414fd2">vec_madd512x128a512</a> (<a class="el" href="struct____VEC__U__512.html">__VEC_U_512</a> m1, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> m2, <a class="el" href="struct____VEC__U__512.html">__VEC_U_512</a> a2)</td></tr>
<tr class="memdesc:acf5c808a77a8486a82a9ee87ff414fd2"><td class="mdescLeft">&#160;</td><td class="mdescRight">Vector 512x128-bit Multiply-Add Unsigned Integer.  <a href="vec__int512__ppc_8h.html#acf5c808a77a8486a82a9ee87ff414fd2">More...</a><br /></td></tr>
<tr class="separator:acf5c808a77a8486a82a9ee87ff414fd2"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:a56a5da10870d9878e2ab888d3c4d2e7b"><td class="memItemLeft" align="right" valign="top"><a class="el" href="struct____VEC__U__1024.html">__VEC_U_1024</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int512__ppc_8h.html#a56a5da10870d9878e2ab888d3c4d2e7b">vec_mul512x512</a> (<a class="el" href="struct____VEC__U__512.html">__VEC_U_512</a> m1, <a class="el" href="struct____VEC__U__512.html">__VEC_U_512</a> m2)</td></tr>
<tr class="memdesc:a56a5da10870d9878e2ab888d3c4d2e7b"><td class="mdescLeft">&#160;</td><td class="mdescRight">Vector 512x512-bit Unsigned Integer Multiply.  <a href="vec__int512__ppc_8h.html#a56a5da10870d9878e2ab888d3c4d2e7b">More...</a><br /></td></tr>
<tr class="separator:a56a5da10870d9878e2ab888d3c4d2e7b"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:ad4ade47617ecf223a2f7b0325e1fc877"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int512__ppc_8h.html#ad4ade47617ecf223a2f7b0325e1fc877">vec_mul1024x1024</a> (<a class="el" href="struct____VEC__U__2048.html">__VEC_U_2048</a> *p2048, <a class="el" href="struct____VEC__U__1024.html">__VEC_U_1024</a> *m1, <a class="el" href="struct____VEC__U__1024.html">__VEC_U_1024</a> *m2)</td></tr>
<tr class="memdesc:ad4ade47617ecf223a2f7b0325e1fc877"><td class="mdescLeft">&#160;</td><td class="mdescRight">Vector 1024x1024-bit Unsigned Integer Multiply.  <a href="vec__int512__ppc_8h.html#ad4ade47617ecf223a2f7b0325e1fc877">More...</a><br /></td></tr>
<tr class="separator:ad4ade47617ecf223a2f7b0325e1fc877"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:a8287aa4483acb25ac3188a97cc23b89a"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int512__ppc_8h.html#a8287aa4483acb25ac3188a97cc23b89a">vec_mul2048x2048</a> (<a class="el" href="struct____VEC__U__4096.html">__VEC_U_4096</a> *p4096, <a class="el" href="struct____VEC__U__2048.html">__VEC_U_2048</a> *m1, <a class="el" href="struct____VEC__U__2048.html">__VEC_U_2048</a> *m2)</td></tr>
<tr class="memdesc:a8287aa4483acb25ac3188a97cc23b89a"><td class="mdescLeft">&#160;</td><td class="mdescRight">Vector 2048x2048-bit Unsigned Integer Multiply.  <a href="vec__int512__ppc_8h.html#a8287aa4483acb25ac3188a97cc23b89a">More...</a><br /></td></tr>
<tr class="separator:a8287aa4483acb25ac3188a97cc23b89a"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:a1a6652dfd5b6e5966acf4b75f0b89682"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int512__ppc_8h.html#a1a6652dfd5b6e5966acf4b75f0b89682">vec_mul128_byMN</a> (<a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> *p, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> *m1, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> *m2, unsigned long M, unsigned long N)</td></tr>
<tr class="memdesc:a1a6652dfd5b6e5966acf4b75f0b89682"><td class="mdescLeft">&#160;</td><td class="mdescRight">Vector Unsigned Integer Quadword MxN Multiply.  <a href="vec__int512__ppc_8h.html#a1a6652dfd5b6e5966acf4b75f0b89682">More...</a><br /></td></tr>
<tr class="separator:a1a6652dfd5b6e5966acf4b75f0b89682"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:a46ee75bf2ea0b7095d60f1448cf2f097"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int512__ppc_8h.html#a46ee75bf2ea0b7095d60f1448cf2f097">vec_mul512_byMN</a> (<a class="el" href="struct____VEC__U__512.html">__VEC_U_512</a> *p, <a class="el" href="struct____VEC__U__512.html">__VEC_U_512</a> *m1, <a class="el" href="struct____VEC__U__512.html">__VEC_U_512</a> *m2, unsigned long M, unsigned long N)</td></tr>
<tr class="memdesc:a46ee75bf2ea0b7095d60f1448cf2f097"><td class="mdescLeft">&#160;</td><td class="mdescRight">Vector Unsigned Integer Quadword 4xMxN Multiply.  <a href="vec__int512__ppc_8h.html#a46ee75bf2ea0b7095d60f1448cf2f097">More...</a><br /></td></tr>
<tr class="separator:a46ee75bf2ea0b7095d60f1448cf2f097"><td class="memSeparator" colspan="2">&#160;</td></tr>
</table>
<a name="details" id="details"></a><h2 class="groupheader">Detailed Description</h2>
<div class="textblock"><p>Header package containing a collection of multiple precision quadword integer computation functions implemented with 128-bit PowerISA VMX and VSX instructions. </p>
<p>PVECLIB <a class="el" href="vec__int128__ppc_8h.html" title="Header package containing a collection of 128-bit computation functions implemented with PowerISA VMX...">vec_int128_ppc.h</a> provides the 128x128-bit multiply and 128-bit add with carry/extend operations. This is most of what we need to implement multiple precision integer computation. This header builds on those operations to build 256x256, 512x128, 512x512, 1024x1024 and 2048x2048 multiplies. We also provide 512-bit add with carry/extend operations as a general aid to construct multiple quadword precision arithmetic.</p>
<p>We provide static inline implementations for up to 512x512 multiplies and 512-bit add with carry/extend. These in-line operations are provided as building blocks for coding implementations of larger multiply and sum operations. Beyond that, the in-line code expansion gets too large for normal coding, so we also provide callable (static and dynamic) library implementations (<a class="el" href="vec__int512__ppc_8h.html#i512_libary_issues_0_0">Building libraries for vec_int512_ppc</a>).</p>
<h1><a class="anchor" id="i512_security_issues_0_0"></a>
Security related implications</h1>
<p>The challenge is delivering a 2048x2048 bit multiply, producing a 4096-bit product, while minimizing cache and timing side-channel exploits. The goal is to minimize the memory visibility of intermediate products and sums and internal conditional logic (like early exit optimizations). The working theory is to use vector registers and operations and avoid storing intermediate results. This implies:</p><ul>
<li>While the final 4096-bit product is so large (32 quadwords) that it requires a memory buffer for the result, we should not use any part of this buffer to hold intermediate partial sums.</li>
<li>The 2048-bit multiplicands are also large (2 x 16 quadwords) and will be passed in memory buffers that are effectively constant.</li>
<li>All intermediate partial products and sums should be held in vector registers (VSRs) until quadwords of the final product are computed and ready to store into the result buffer.</li>
<li>Avoid conditional logic that affects function timing based on values of the inputs or results.</li>
<li>Internally the code can be organized as straight line code or loops, in-line functions or calls to carefully crafted leaf functions, as long as the above goals are met.</li>
</ul>
<p>Achieving these goals requires some knowledge of the Application Binary Interface (ABI) and foibles of the Instruction Set Architecture (PowerISA) and how they impact what the compiler can generate. The compiler itself has internal strategies (and foibles) that need to be managed as well.</p>
<h2><a class="anchor" id="i512_security_issues_0_0_0"></a>
Implications of the ABI</h2>
<p>The computation requires a number of internal temporary vectors in addition to the inputs and outputs. The Power Architecture, 64-Bit ELF V2 ABI Specification (AKA the ABI) places some generous but important restrictions on how the compiler generates code (and how compliant assembler code is written).</p><ul>
<li>Up to 20 volatile vector registers v0-v19 (VSRs vs32-vs51) of which 12 can be used for function arguments/return values.<ul>
<li>Up to 12 vector arguments are passed in vector registers v2-v13 (VSRs vs34-vs45).</li>
<li>Longer vector argument lists are forced into the caller's parameter save area (Stack pointer +32).</li>
<li>Functions can return a 128-bit vector value or a homogeneous aggregate of up to 8 vector values in vector registers v2-v9 (VSRs vs34-vs41).</li>
<li>Wider (more than 8 vectors) function return values are returned in memory via a reference pointer passed as a hidden parameter in GPR 3.</li>
</ul>
</li>
<li>Up to 12 additional non-volatile vector registers v20-v31 (vs52-vs63). Any non-volatile registers must be saved before use and restored before function return.</li>
<li>The lower half of the VSRs (vs0-vs31) is prioritized for scalar floating-point operations. If a function is using vectors but not scalar floating-point then the lower VSRs are available for vector logical and integer operations and temporary spill from vector registers.<ul>
<li>Up to 14 volatile float double (f0-f13) or vector registers (vs0-vs13).</li>
<li>Up to 18 non-volatile float double (f14-f31) or vector registers (vs14-vs31).</li>
</ul>
</li>
<li>All volatile registers are considered “clobbered” after a function call.<ul>
<li>So the calling function must hold any local vector variables in memory or non-volatile registers if the live range extends across the function call.</li>
<li>In-lining the called function allows the compiler to manage register allocation across the whole sequence. This can reduce register pressure when the called function does not actually use/modify all the volatile registers.</li>
</ul>
</li>
</ul>
<h3><a class="anchor" id="i512_security_issues_0_0_0_0"></a>
Implications for parameter passing and Product size</h3>
<p>Care is required in selecting the width (256, 512-bit etc) of parameter and return values. Parameters totaling more than 12 vector quadwords or return values totaling more than 8 vector quadwords will be spilled to the caller's parameter save area. This may expose intermediate partial products to cache side-channel attacks. A 512x128-bit multiply returning a 640-bit product and a 512x512-bit multiply returning a 1024-bit product meet these criteria (both the parameters and return values fit within the ABI limits). But a 1024x128-bit multiply returning 1152-bits is not OK because the 1152-bit return value requires 9 vector registers, which will be returned in memory.</p>
<p>Also if any of these sub-functions are used without in-lining, the generated code must be inspected to ensure it is not spilling any local variables. In my experiments with GCC 8.1 the 128x128, 256x256, and 512x128 multiplies all avoid spilling. However the stand-alone 512x512 implementation does require saving 3 non-volatile registers. This can be eliminated by in-lining the 512x512 multiply into the 2048x2048 multiply function.</p>
<dl class="section note"><dt>Note</dt><dd>GCC compilers before version 8 have an incomplete design for homogeneous aggregates of vectors and may generate sub-optimal code for these parameters.</dd></dl>
<h2><a class="anchor" id="i512_security_issues_0_0_1"></a>
Implications of the PowerISA</h2>
<p>The Power Instruction Set Architecture (PowerISA) also imposes some restriction on the registers vector instructions can access.</p><ul>
<li>The original VMX (AKA Altivec) facility has 32 vector registers and instruction encoding to access those 32 registers.<ul>
<li>This original instruction set was incorporated unchanged into the later versions of the PowerISA.</li>
<li>When the Vector Scalar Extended (VSX) facility was added, the original VMX instructions were restricted to the upper 32 VSRs (original vector registers).</li>
</ul>
</li>
<li>VSX was originally focused on vector and scalar floating-point operations, with a handful of vector logical/permute/splat operations added for completeness. These instructions were encoded to access all 64 VSRs.<ul>
<li>All vector integer arithmetic operations remained restricted to the upper 32 VSRs (the original VRs).</li>
<li>Later versions of the PowerISA (POWER8/9) added new vector integer arithmetic operations. This includes word/doubleword multiply and doubleword/quadword add/subtract. But these are also encoded to access only 32 vector registers.</li>
<li>The lower VSRs can still be used to hold temporaries and local variables for vector integer operations.</li>
</ul>
</li>
</ul>
<h2><a class="anchor" id="i512_security_issues_0_0_2"></a>
Implications for the compiler</h2>
<p>The compiler has to find a path through the ABI and ISA restrictions above while it performs:</p><ul>
<li>function in-lining</li>
<li>instruction selection</li>
<li>instruction scheduling</li>
<li>register allocation</li>
</ul>
<p>For operations defined in PVECLIB, most operations are defined in terms of AltiVec/VSX Built-in Functions. So the compiler does not get much choice for instruction selection. The PVECLIB coding style does leverage C language vector extensions to load constants and manage temporary variables. Using compiler AltiVec/VSX built-ins and vector extensions allows the compiler visibility to and control of these optimizations.</p>
<p>Internal function calls effectively <em>clobber</em> all (34 VSRs) volatile registers. As the compiler marshals parameters into ABI prescribed VRs it needs to preserve previous live content for later computation. Similarly for volatile registers not used for parameter passing as they are assumed to be clobbered by the called function. The compiler preserves local live variables before the call by copying their contents to non-volatile registers or spilling to memory. This may put more <em>register pressure</em> on the available non-volatile registers. Small to medium sized functions often require only a fraction of the available volatile registers. In this case, in-lining the function avoids the disruptive volatile register clobber and allows better overall register allocation. So there is a strong incentive to in-line local/static functions.</p>
<p>These compiler optimizations are not independent processes. For example, specific VSX instructions can access all 64 VSRs, while others are restricted to the 32 VRs (like vector integer instructions). So the compiler prioritizes VRs (the higher 32 VSRs) for allocation to vector integer computation, while the lower 32 VSRs can be used for logical/permute operations and as a <em>level 1</em> spill area for VRs. These restrictions combined with code size/complexity can increase <em>register pressure</em> to the point the compiler is forced to spill active (or live) vector registers to secondary storage. This secondary storage can be:</p><ul>
<li>other architected registers that are available for direct transfer but not usable in the computation.</li>
<li>Local variables allocated on the stack</li>
<li>Compiler temporaries allocated on the stack.</li>
</ul>
<p>Instruction scheduling can increase register pressure by moving (reordering) instructions. This is more prevalent when there are large differences in instruction latency in the code stream. For example moving independent / long latency instructions earlier and dependent / short latency instructions later. This tends to increase the distance between the instruction that sets a register result and the next instruction that uses that result in its computation. The distance between a register's set and use is called the <em>live range</em>. This also tends to increase the number of concurrently active and overlapping live ranges.</p>
<p>For this specific (multi-precision integer multiply) example, integer multiply and add/carry/extend instructions predominate. For POWER9, vector integer multiply instructions run 7 cycles, while integer add/carry/extend quadword instructions run 3 cycles. The compiler will want to move the independent multiply instructions earlier while the dependent add/carry instructions are moved later until the latency of the (multiply) instruction (on which it depends) is satisfied. Moving dependent instructions apart and moving independent instructions into the scheduling gap increases register pressure.</p>
<p>In extreme cases, this can get out of hand. At high optimization levels, the compiler can push instruction scheduling to the point that it runs out of registers. This forces the compiler to spill live register values, splitting the live range into two smaller live ranges. Any spilled values have to be reloaded later so they can be used in computation. This causes the compiler to generate more instructions that need additional register allocation and scheduling.</p>
<dl class="section note"><dt>Note</dt><dd>A 2048x2048-bit multiply is definitely an extreme case. The implementation requires 256 128x128-bit multiplies, where each 128x128-bit multiply requires 18-30 instructions. The POWER9 implementation requires 1024 vector doubleword multiplies plus 2400+ vector add/carry/extend quadword instructions. When implemented as straight line code and expanded in-line (<em>attribute (flatten)</em>) the total runs over 6000 instructions.</dd></dl>
<p>Compiler spill code usually needs registers in addition (perhaps of a different class) to the registers being spilled. This can be as simple as moving to a register of the same size but different class. For example, register moves to/from VRs and the lower 32 VSRs. But it gets more complex when spilling vector registers to memory. For example, vector register spill code needs GPRs to compute stack addresses for vector load/store instructions. Normally this is OK, unless the spill code consumes so many GPRs that it needs to spill GPRs. In that case we can see serious performance bottlenecks.</p>
<p>But remember that a primary goal (<a class="el" href="vec__int512__ppc_8h.html#i512_security_issues_0_0">Security related implications</a>) was to avoid spilling intermediate results to memory. Spilling between high and low VSRs is acceptable (no cache side-channel), but spilling to memory must be avoided. The compiler should have heuristics to back off in-lining and scheduling-driven code motions just enough to avoid negative performance impacts. But this is difficult to model and may not handle all cases with equal grace. Also this may not prevent spilling VRs to memory if the compiler scheduler's cost computation indicates that is an acceptable trade-off.</p>
<p>So we will have to directly override compiler settings and heuristics to guarantee the result we want/need. The PVECLIB implementation already marks most operations as <b>static inline</b>. But as we use these inline operations as building blocks to implement larger operations we can push the resulting code size over the compiler's default inline limits (<b>-finline-limit</b>). Then the compiler will stop in-lining for the duration of compiling the current function.</p>
<p>This may require stronger options/attributes to the compiler like (<em>attribute (always_inline)</em>), (<em>attribute (gnu_inline)</em>), or (<em>attribute (flatten)</em>). The first two are not any help unless you are compiling at a lower optimization level (<b>-O0</b> or <b>-O1</b>). <b>-O2</b> defaults to <b>-finline-small-functions</b> and <b>-O3</b> defaults to the stronger <b>-finline-functions</b>. However <em>attribute (flatten)</em> seems to do exactly what we want. Every call inside this function is in-lined unless explicitly told not to (<em>attribute (noinline)</em>). It seems that <em>attribute (flatten)</em> ignores the <b>-finline-limit</b>.</p>
<dl class="section note"><dt>Note</dt><dd>You should be compiling PVECLIB applications at <b>-O3</b> anyway.</dd></dl>
<p>Now we have a large block of code for the compiler's instruction scheduler to work on. In this case the code is very repetitive (multiply, add the column, generate carries, repeat). The instruction scheduler will have lots of opportunity for scheduling long vs short latency instructions and creating new and longer live ranges.</p>
<dl class="section note"><dt>Note</dt><dd>In fact after applying <em>attribute (flatten)</em> to vec_mul2048x2048_PWR9 we see a lot of spill code. This expands the code to over 9300 instructions with ~3300 instructions associated with spill code.</dd></dl>
<p>We need a mechanism to limit (set boundaries) on code motion while preserving optimization over smaller blocks of code. This is normally called a <em>compiler fence</em> but there are multiple definitions so we need to be careful what we use.</p>
<p>We want something that will prevent the compiler from moving instructions (in either direction) across specified <em>lines in the code</em>.</p>
<p>We don't need an atomic memory fence (like <b> __atomic_thread_fence</b> or <b>__sync_synchronize</b>) that forces the processor to order loads and stores relative to a specific synchronization point.</p>
<p>We don't need a compiler memory fence (like <b>asm ("" ::: "memory")</b>). The "memory" clobber forces GCC to assume that any memory may be arbitrarily read or written by the asm block. So any registers holding live local variables will be forced to memory before and need to be reloaded after. This prevents the compiler from reordering loads, stores, and arithmetic operations across it, but does not prevent the processor from reordering them.</p>
<dl class="section note"><dt>Note</dt><dd>POWER processors have an aggressively <em>Speculative Superscalar</em> design with out-of-order issue and execution.</dd></dl>
<p>Neither of the above are what we want for this case. We specifically want to avoid memory side effects in this computation. We only need the minimal compiler fence (like <b>asm (";" :::)</b>) that prevents the compiler from reordering any code across it but does not prevent the processor from reordering them.</p>
<p>By placing this compiler fence between multiply/sum stages of <a class="el" href="vec__int512__ppc_8h.html#ad10d62fe43f329c396b6486402f4f7af" title="Vector 512x128-bit Unsigned Integer Multiply.">vec_mul512x128_inline()</a>, <a class="el" href="vec__int512__ppc_8h.html#a6f917597902625a8218ba41fc6dca426" title="Vector 512x512-bit Unsigned Integer Multiply.">vec_mul512x512_inline()</a> and <a class="el" href="vec__int512__ppc_8h.html#a8287aa4483acb25ac3188a97cc23b89a" title="Vector 2048x2048-bit Unsigned Integer Multiply.">vec_mul2048x2048()</a> we limit instruction scheduling and code motion to smaller code blocks. This in turn reduces register pressure to the point where all 64 VSRs are in use, but no spilling to stack memory is required.</p>
<h2><a class="anchor" id="i512_security_issues_0_0_3"></a>
So what does this all mean?</h2>
<p>The 2048x2048 multiplicands and the resulting product are so large (8192-bits, 64 quadwords total) that at the outer most function the inputs and the result must be in memory and passed by reference. The implementation of a 2048x2048-bit multiply requires 256 128x128-bit multiplies. Otherwise the code can be organized into sub-functions generating intermediate partial products and sums.</p>
<p>Coding 256 128x128 products and generating column sums would be tedious. One approach builds up products into larger and larger blocks in stages. For example code a <a class="el" href="vec__int512__ppc_8h.html#ad10d62fe43f329c396b6486402f4f7af" title="Vector 512x128-bit Unsigned Integer Multiply.">vec_mul512x128_inline()</a> operation then use that in the implementation of <a class="el" href="vec__int512__ppc_8h.html#a6f917597902625a8218ba41fc6dca426" title="Vector 512x512-bit Unsigned Integer Multiply.">vec_mul512x512_inline()</a>. We also provide 512-bit add/carry/extend operations to simplify generating sums of 512-bit partial products. Then load blocks of 512-bits (4 quadwords, 64-bytes) using <a class="el" href="vec__int512__ppc_8h.html#a6f917597902625a8218ba41fc6dca426" title="Vector 512x512-bit Unsigned Integer Multiply.">vec_mul512x512_inline()</a> to produce a 1024-bit partial product (<a class="el" href="vec__int512__ppc_8h.html#i512_security_issues_0_0_0_0">Implications for parameter passing and Product size</a>).</p>
<p>Then multiply the 512-bit blocks across one 2048-bit (4 x 512-bit) multiplicand. The completion of a 2048x512-bit partial product (of 2560-bits) includes the low order 512-bits ready to store to the output operand. Repeat for each 512-bit block of the other 2048-bit multiplicand summing across the 512-bit columns. The final sum, after the final 2048x512 partial product, produces the high order 2048-bits of the 2048x2048 product ready to store to the output operand.</p>
<dl class="section note"><dt>Note</dt><dd>Security aware implementations could use masking countermeasures associated with these load/store operations. The base PVECLIB implementation does not do this. The source is available in ./src/vec_int512_runtime.c.</dd></dl>
<p>It is best if the sub-functions code can be fully in-lined into the 2048x2048-bit multiply or the sub-functions are carefully written. In this case these sub-functions should be leaf-functions (does not call other functions) and can execute without spilling register state or requiring stored (by reference) parameters.</p>
<p>All levels of implementation should avoid conditional logic based on values of inputs or partial products (For example early exits for leading or trailing zero quadwords). Doing so may expose the multiply function to timing side-channel attacks. So the best case would be one large function implemented as straight-line code.</p>
<p>We will need all 64 VSX registers for operations and local variables. So the outer function will need to allocate a stack-frame and save all of the non-volatile floating point registers (allowing the use of vs14-vs31 for local vector variables) and vector registers (v20-v31 AKA vs52-vs63) on entry. This frees up (18+12=) 30 additional quadword registers for local vector variables within the outer multiply function.</p>
<p>These saved registers reflect the state of the calling (or higher) function and may not have any crypto sensitive content. These register save areas will not be updated with internal state from the 2048x2048-bit multiply operation itself.</p>
<p>The 128x128-bit vector multiply is implemented with Vector Multiply-Sum Unsigned Doubleword Modulo for Power9 and Vector Multiply Even/Odd Unsigned Word for Power8. The timing for vector integer multiply operations are fixed at 7 cycles latency for Power8/9. The sums of partial products are implemented with Vector Add Unsigned Quadword Modulo/write-Carry/Extended. The timing of integer add quadword operations are fixed at 4 cycles for Power8 and 3 cycles for Power9. The rest of the 128x128-bit multiply operation is a combination of Vector Doubleword Permute Immediate, Vector Shift Left Double by Octet Immediate, Vector Splats, and Vector Logical Or (used as a vector register move spanning the 64 VSRs). All of these have fixed timings of 2 or 3 cycles.</p>
<p>So the overall timing of the 2048x2048-bit multiply should be consistent independent of input values. The only measurable variations would be as the processor changes Simultaneous Multithreading (SMT) modes (controlled by the virtual machine and kernel). The SMT mode (1,2,4,8) controls each hardware thread's priority to issue instructions to the core and if the instruction stream is dual or single issue (from that thread's perspective).</p>
<p>But the better news is that with some extra function attributes (always_inline and flatten) the entire 2048x2048 multiply function can be flattened into a single function of straight line code (no internal function calls or conditional branches) running ~6.3K instructions. And no spill code was generated for local variables (no register spill within the function body).</p>
<h1><a class="anchor" id="i512_Endian_issues_0_0"></a>
Endian for Multi-quadword precision operations</h1>
<p>As described in <a class="el" href="index.html#mainpage_endian_issues_1_1">General Endian Issues</a> and <a class="el" href="vec__int128__ppc_8h.html#i128_endian_issues_0_0">Endian problems with quadword implementations</a> supporting both big and little endian in a single implementation has its challenges. But I think we can leave the details of quadword operations to the <a class="el" href="vec__int128__ppc_8h.html" title="Header package containing a collection of 128-bit computation functions implemented with PowerISA VMX...">vec_int128_ppc.h</a> implementation. The decision needed for these implementations is how the quadwords of a multi-quadword integer are ordered in storage. For example given an array or structure of 16 quadwords representing a single 2048-bit binary number which quadword contains the low order bits and which the high order bits.</p>
<p>This is largely arbitrary and independent from the system endian. But we should be consistent within the API defined by this header and PVECLIB as a whole. Placing the low order bits in the first (lowest address in memory) quadword and the high order bits in the last (highest address in memory) quadword would be consistent with little endian. While placing the high order bits in the first (lowest address in memory) quadword and the low order bits in the last (highest address in memory) quadword would be consistent with big endian. Either is valid internal to the implementation where the key issue is accessing the quadwords of the multiplicands in a convenient order to generate the partial products in an order that supports efficient generation of column sums and carries.</p>
<p>It is best for the API if the order of quadwords in multi-quadword integers matches the endian of the platform. This should be helpful where we want to use the PVECLIB implementations under existing APIs using arrays of smaller integer types.</p>
<p>So on powerpc64le systems the low order quadword is the first quadword. While on older powerpc64 systems the high order quadword is the first quadword. For example we can represent a 512-bit integer with the following structure. </p><div class="fragment"><div class="line"> <span class="keyword">typedef</span> <span class="keyword">struct</span></div>
<div class="line">{</div>
<div class="line"><span class="preprocessor">#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__</span></div>
<div class="line">  <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vx0;</div>
<div class="line">  <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vx1;</div>
<div class="line">  <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vx2;</div>
<div class="line">  <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vx3;</div>
<div class="line"><span class="preprocessor">#else</span></div>
<div class="line">  <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vx3;</div>
<div class="line">  <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vx2;</div>
<div class="line">  <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vx1;</div>
<div class="line">  <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vx0;</div>
<div class="line"><span class="preprocessor">#endif</span></div>
<div class="line"><span class="preprocessor">} __VEC_U_512;</span></div>
</div><!-- fragment --><p> In this example the field <em>vx0</em> is always the low order quadword and <em>vx3</em> is always the high order quadword, independent of endian. We repeat this pattern for the range of multi-quadword integer sizes (from <a class="el" href="struct____VEC__U__256.html" title="A vector representation of a 256-bit unsigned integer.">__VEC_U_256</a> to <a class="el" href="struct____VEC__U__4096.html" title="A vector representation of a 4096-bit unsigned integer.">__VEC_U_4096</a>) supported by this header. In each case the field name <em>vx0</em> is consistently the low order quadword. The field name suffix numbering continues from low to high with the highest numbered field name being the high order quadword.</p>
<h2><a class="anchor" id="i512_Endian_issues_0_0_1"></a>
Multi-quadword Integer Constants</h2>
<p>As we have seen, initializing larger multiple precision constants can be challenging (<a class="el" href="vec__int128__ppc_8h.html#int128_const_0_0_1">Quadword Integer Constants</a>). The good news is we can continue to use aggregate initializers for structures and arrays of vector quadwords. For example:</p>
<div class="fragment"><div class="line"><span class="keyword">const</span> <a class="code" href="struct____VEC__U__512.html">__VEC_U_512</a> vec512_one =</div>
<div class="line">    {</div>
<div class="line">      (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) ((<span class="keywordtype">unsigned</span> __int128) 0x00000000),</div>
<div class="line">      (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) ((<span class="keywordtype">unsigned</span> __int128) 0x00000000),</div>
<div class="line">      (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) ((<span class="keywordtype">unsigned</span> __int128) 0x00000000),</div>
<div class="line">      (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) ((<span class="keywordtype">unsigned</span> __int128) 0x00000001)</div>
<div class="line">    };</div>
</div><!-- fragment --><p> This example is in the expected high to low order for the 512-bit constant 1. Unfortunately endian raises its ugly head again and this would be a different value on a little endian platform.</p>
<p>So PVECLIB provides another helper macro (<a class="el" href="vec__int512__ppc_8h.html#aac9d31829610c29b0f5558bfb1f18e4a" title="Generate a 512-bit vector unsigned integer constant from 4 x quadword constants.">CONST_VINT512_Q()</a>) to provide a consistent numerical order for multiple quadword constants. For example: </p><div class="fragment"><div class="line"><span class="keyword">const</span> <a class="code" href="struct____VEC__U__512.html">__VEC_U_512</a> vec512_one = <a class="code" href="vec__int512__ppc_8h.html#aac9d31829610c29b0f5558bfb1f18e4a">CONST_VINT512_Q</a></div>
<div class="line">    (</div>
<div class="line">      (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) ((<span class="keywordtype">unsigned</span> __int128) 0x00000000),</div>
<div class="line">      (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) ((<span class="keywordtype">unsigned</span> __int128) 0x00000000),</div>
<div class="line">      (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) ((<span class="keywordtype">unsigned</span> __int128) 0x00000000),</div>
<div class="line">      (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) ((<span class="keywordtype">unsigned</span> __int128) 0x00000001)</div>
<div class="line">    );</div>
</div><!-- fragment --><p> and </p><div class="fragment"><div class="line"><span class="comment">// const for 10**128</span></div>
<div class="line"><span class="keyword">const</span> <a class="code" href="struct____VEC__U__512.html">__VEC_U_512</a>  vec512_ten128th = <a class="code" href="vec__int512__ppc_8h.html#aac9d31829610c29b0f5558bfb1f18e4a">CONST_VINT512_Q</a></div>
<div class="line">    (</div>
<div class="line">      <a class="code" href="vec__int128__ppc_8h.html#a0f75e65180e68c4753f3d9c2f42d1a31">CONST_VUINT128_QxW</a> (0x00000000, 0x00000000, 0x0000024e, 0xe91f2603),</div>
<div class="line">      <a class="code" href="vec__int128__ppc_8h.html#a0f75e65180e68c4753f3d9c2f42d1a31">CONST_VUINT128_QxW</a> (0xa6337f19, 0xbccdb0da, 0xc404dc08, 0xd3cff5ec),</div>
<div class="line">      <a class="code" href="vec__int128__ppc_8h.html#a0f75e65180e68c4753f3d9c2f42d1a31">CONST_VUINT128_QxW</a> (0x2374e42f, 0x0f1538fd, 0x03df9909, 0x2e953e01),</div>
<div class="line">      <a class="code" href="vec__int128__ppc_8h.html#a0f75e65180e68c4753f3d9c2f42d1a31">CONST_VUINT128_QxW</a> (0x00000000, 0x00000000, 0x00000000, 0x00000000)</div>
<div class="line">    );</div>
</div><!-- fragment --><p> Unfortunately the compiler can not help with multi-quadword decimal constants. So we must resort to external tools like <b>bc</b> to compute large constant values and convert them to hexadecimal which are easier to break into words and doublewords. These can then be used as constants in program source to represent arbitrarily large binary values.</p>
<h1><a class="anchor" id="i512_libary_issues_0_0"></a>
Building libraries for vec_int512_ppc</h1>
<dl class="section see"><dt>See also</dt><dd><a class="el" href="index.html#main_libary_issues_0_0">Putting the Library into PVECLIB</a></dd></dl>
<p>Many of the implementations associated with 512-bit integer operations are uncomfortably large to expand as in-line code (Examples include <a class="el" href="vec__int512__ppc_8h.html#a56a5da10870d9878e2ab888d3c4d2e7b" title="Vector 512x512-bit Unsigned Integer Multiply.">vec_mul512x512()</a>, <a class="el" href="vec__int512__ppc_8h.html#ad4ade47617ecf223a2f7b0325e1fc877" title="Vector 1024x1024-bit Unsigned Integer Multiply.">vec_mul1024x1024()</a>, and <a class="el" href="vec__int512__ppc_8h.html#a8287aa4483acb25ac3188a97cc23b89a" title="Vector 2048x2048-bit Unsigned Integer Multiply.">vec_mul2048x2048()</a>). It is better to collect these large implementations in separately compiled run-time libraries. Another consideration is that most of these operations are multiple quadword multiplies and the optimum quadword multiply is processor (and PowerISA version) dependent. This is especially true for Vector integer multiplies across POWER7-POWER9.</p>
<p>This places requirements on the structure of runtime implementation codes and the library build process.</p><ul>
<li>Building a set of source implementations for multiple compile (-mcpu=) targets.</li>
<li>Providing unique function names based on the operation and the compile target.</li>
<li>Providing static (archive) and dynamic (DSO) libraries, while adjusting the compile options appropriately for each.<ul>
<li>Objects compiled for inclusion in dynamic libraries should be position independent code (i.e. compiled with -fpic or -fPIC).</li>
<li>DSOs supporting operations optimized for multiple compile (-mcpu=) targets need to export matching <a href="https://sourceware.org/glibc/wiki/GNU_IFUNC">IFUNC</a> symbols and resolver stubs.</li>
</ul>
</li>
</ul>
<p>For the first requirement we can collect the runtime implementations for vec_int512_ppc in to a single source file (vec_int512_runtime.c). The build system can then collect this and other runtime source files to compile for different targets. This can be as simple as: </p><div class="fragment"><div class="line"><span class="comment">//  \file  vec_runtime_PWR9.c</span></div>
<div class="line"><span class="preprocessor">#include &quot;vec_int512_runtime.c&quot;</span></div>
<div class="line">...</div>
</div><!-- fragment --><p> and similarly for vec_runtime_PWR7.c and vec_runtime_PWR8.c.</p>
<p>As the implementation of vec_int512_runtime.c is already leveraging _ARCH_PWR7/8/9 tuned static inline operations from <a class="el" href="vec__int512__ppc_8h.html" title="Header package containing a collection of multiple precision quadword integer computation functions i...">vec_int512_ppc.h</a>, <a class="el" href="vec__int128__ppc_8h.html" title="Header package containing a collection of 128-bit computation functions implemented with PowerISA VMX...">vec_int128_ppc.h</a>, etc, all we need to do is apply the appropriate -mcpu=power7/8/9 compile option to each (target qualified) runtime source file.</p>
<p>The second requirement is addressed by applying a target qualifying suffix to each runtime function implementation. Here we use the <a class="el" href="vec__int512__ppc_8h.html#a77eca5d7bebe0f30894fe9669c01b7a7" title="Macro to add platform suffix for static calls.">__VEC_PWR_IMP()</a> as function name wrapper macro. </p><div class="fragment"><div class="line"><span class="preprocessor">#ifdef _ARCH_PWR10</span></div>
<div class="line"><span class="preprocessor">#define __VEC_PWR_IMP(FNAME) FNAME ## _PWR10</span></div>
<div class="line"><span class="preprocessor">#else</span></div>
<div class="line"><span class="preprocessor">#ifdef _ARCH_PWR9</span></div>
<div class="line"><span class="preprocessor">#define __VEC_PWR_IMP(FNAME) FNAME ## _PWR9</span></div>
<div class="line"><span class="preprocessor">#else</span></div>
<div class="line"><span class="preprocessor">#ifdef _ARCH_PWR8</span></div>
<div class="line"><span class="preprocessor">#define __VEC_PWR_IMP(FNAME) FNAME ## _PWR8</span></div>
<div class="line"><span class="preprocessor">#else</span></div>
<div class="line"><span class="preprocessor">#define __VEC_PWR_IMP(FNAME) FNAME ## _PWR7</span></div>
<div class="line"><span class="preprocessor">#endif</span></div>
<div class="line"><span class="preprocessor">#endif</span></div>
<div class="line"><span class="preprocessor">#endif</span></div>
</div><!-- fragment --><p> We need to apply the name wrapper to both the functions extern (in <a class="el" href="vec__int512__ppc_8h.html" title="Header package containing a collection of multiple precision quadword integer computation functions i...">vec_int512_ppc.h</a>) and the function implementation (in vec_int512_runtime.c). For example: </p><div class="fragment"><div class="line"><span class="comment">//  \file  vec_int512_ppc.h</span></div>
<div class="line"> ...</div>
<div class="line">extern <a class="code" href="struct____VEC__U__256.html">__VEC_U_256</a></div>
<div class="line"><a class="code" href="vec__int512__ppc_8h.html#a77eca5d7bebe0f30894fe9669c01b7a7">__VEC_PWR_IMP</a> (<a class="code" href="vec__int512__ppc_8h.html#ab5b80fd9694cea8bf502b26e55af37f7">vec_mul128x128</a>) (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> m1l, <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> m2l);</div>
<div class="line"> ...</div>
</div><!-- fragment --> <dl class="section note"><dt>Note</dt><dd>Doxygen does not tolerate attributes or macros in function prototypes. So these externs are guarded by a @cond INTERNAL ... @endcond block. The \brief and @param descriptions are provided for the unqualified dynamic function symbol and apply to the corresponding qualified function symbols.</dd></dl>
<div class="fragment"><div class="line"><span class="comment">//  \file  vec_int512_runtime.c</span></div>
<div class="line"><span class="preprocessor">#include &lt;altivec.h&gt;</span></div>
<div class="line"><span class="preprocessor">#include &lt;<a class="code" href="vec__int128__ppc_8h.html">pveclib/vec_int128_ppc.h</a>&gt;</span></div>
<div class="line"><span class="preprocessor">#include &lt;<a class="code" href="vec__int512__ppc_8h.html">pveclib/vec_int512_ppc.h</a>&gt;</span></div>
<div class="line"> ...</div>
<div class="line"><span class="comment">// vec_mul128x128_inline is defined in vec_int512_ppc.h</span></div>
<div class="line">__VEC_U_256</div>
<div class="line"><a class="code" href="vec__int512__ppc_8h.html#a77eca5d7bebe0f30894fe9669c01b7a7">__VEC_PWR_IMP</a> (<a class="code" href="vec__int512__ppc_8h.html#ab5b80fd9694cea8bf502b26e55af37f7">vec_mul128x128</a>) (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> m1l, <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> m2l)</div>
<div class="line">{</div>
<div class="line">  <span class="keywordflow">return</span> <a class="code" href="vec__int512__ppc_8h.html#a958e029fc824ec3a73ad9550bf7ea506">vec_mul128x128_inline</a> (m1l, m2l);</div>
<div class="line">}</div>
</div><!-- fragment --><p> This ensures that target specific runtime implementations have unique function symbols. This is important to avoid linker errors (due to duplicate symbol names).</p>
<dl class="section note"><dt>Note</dt><dd>Each runtime operation will have 2 or 3 target qualified implementations. This is times 2 with separate builds for static archives and dynamic (DSO) libraries. The big endian powerpc64 platform supports 3 VSX enabled targets -mcpu=[power7|power8|power9]. The little endian powerpc64le platform currently supports 2 VSX enabled targets -mcpu=[power8|power9]. POWER7 is not supported for powerpc64le and the vec_runtime_PWR7.c source files are conditionally nulled out for powerpc64le targets. As new POWER processors are released, additional targets will be added.</dd></dl>
<h2><a class="anchor" id="i512_libary_issues_0_0_1_1"></a>
Static linkage to platform specific functions</h2>
<p>For static linkage the application is compiled for a specific platform target (via -mcpu=). So function calls should be bound to the matching platform specific implementations. The application may select the platform specific function directly by defining a <em>extern</em> and invoking the platform qualified function.</p>
<p>For applications binding to PVECLIB via static archives it is convenient to apply the <a class="el" href="vec__int512__ppc_8h.html#a77eca5d7bebe0f30894fe9669c01b7a7" title="Macro to add platform suffix for static calls.">__VEC_PWR_IMP()</a> wrapper to the function call: </p><div class="fragment"><div class="line">k = <a class="code" href="vec__int512__ppc_8h.html#a77eca5d7bebe0f30894fe9669c01b7a7">__VEC_PWR_IMP</a> (<a class="code" href="vec__int512__ppc_8h.html#ab5b80fd9694cea8bf502b26e55af37f7">vec_mul128x128</a>)(i, j);</div>
</div><!-- fragment --><p> The function call symbol picks up the target suffix based on the compile target (-mcpu=) for the application (see <a class="el" href="index.html#main_libary_issues_0_0_1_1">Static linkage to platform specific functions</a>). The linker will extract the matching implementations from the PVECLIB archive and (statically) bind them with the application. This simplifies binding the application to the matching target specific implementations.</p>
<h2><a class="anchor" id="i512_libary_issues_0_0_1_2"></a>
Dynamic linkage to platform specific functions</h2>
<p>For applications binding to dynamic libraries, the target qualified naming strategy also simplifies the implementation of IFUNC resolvers for the DSO library (see <a class="el" href="index.html#main_libary_issues_0_0_2">Building dynamic runtime libraries</a>). Here the target qualified names of the PIC implementations are known to the corresponding resolver function but are not exported from the DSO. Allowing the application to bind to the target qualified names would defeat the automatic selection of target optimized implementations.</p>
<p>Applications using dynamic linkage will call the unqualified function symbol. For example: </p><div class="fragment"><div class="line"><span class="comment">//  \file  vec_int512_ppc.h</span></div>
<div class="line"> ...</div>
<div class="line">extern <a class="code" href="struct____VEC__U__256.html">__VEC_U_256</a></div>
<div class="line"><a class="code" href="vec__int512__ppc_8h.html#ab5b80fd9694cea8bf502b26e55af37f7">vec_mul128x128</a> (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>, <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>);</div>
</div><!-- fragment --><p>This symbol's implementation has a special <b>STT_GNU_IFUNC</b> attribute recognized by the dynamic linker. This attribute associates this symbol with the corresponding runtime resolver function. So in addition to any platform specific implementations we need to provide the resolver function referenced by the <em>IFUNC</em> symbol. For example: </p><div class="fragment"><div class="line"><span class="comment">//  \file  vec_runtime_DYN.c</span></div>
<div class="line"> ...</div>
<div class="line">extern <a class="code" href="struct____VEC__U__256.html">__VEC_U_256</a></div>
<div class="line">vec_mul128x128_PWR7 (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>, <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>);</div>
<div class="line"> </div>
<div class="line"><span class="keyword">extern</span> <a class="code" href="struct____VEC__U__256.html">__VEC_U_256</a></div>
<div class="line">vec_mul128x128_PWR8 (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>, <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>);</div>
<div class="line"> </div>
<div class="line"><span class="keyword">extern</span> <a class="code" href="struct____VEC__U__256.html">__VEC_U_256</a></div>
<div class="line">vec_mul128x128_PWR9 (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>, <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>);</div>
<div class="line"> </div>
<div class="line"><span class="keyword">static</span> <a class="code" href="struct____VEC__U__256.html">__VEC_U_256</a></div>
<div class="line">(*resolve_vec_mul128x128 (<span class="keywordtype">void</span>))(<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>, <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>)</div>
<div class="line">{</div>
<div class="line"><span class="preprocessor">#ifdef  __BUILTIN_CPU_SUPPORTS__</span></div>
<div class="line">  <span class="keywordflow">if</span> (__builtin_cpu_is (<span class="stringliteral">&quot;power9&quot;</span>))</div>
<div class="line">    <span class="keywordflow">return</span> vec_mul128x128_PWR9;</div>
<div class="line">  <span class="keywordflow">else</span></div>
<div class="line">    {</div>
<div class="line">      <span class="keywordflow">if</span> (__builtin_cpu_is (<span class="stringliteral">&quot;power8&quot;</span>))</div>
<div class="line">        <span class="keywordflow">return</span> vec_mul128x128_PWR8;</div>
<div class="line">      <span class="keywordflow">else</span></div>
<div class="line">        <span class="keywordflow">return</span> vec_mul128x128_PWR7;</div>
<div class="line">    }</div>
<div class="line"><span class="preprocessor">#else // ! __BUILTIN_CPU_SUPPORTS__</span></div>
<div class="line">    <span class="keywordflow">return</span> vec_mul128x128_PWR7;</div>
<div class="line"><span class="preprocessor">#endif</span></div>
<div class="line">}</div>
<div class="line"> </div>
<div class="line"><a class="code" href="struct____VEC__U__256.html">__VEC_U_256</a></div>
<div class="line"><a class="code" href="vec__int512__ppc_8h.html#ab5b80fd9694cea8bf502b26e55af37f7">vec_mul128x128</a> (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>, <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>)</div>
<div class="line">__attribute__ ((ifunc (<span class="stringliteral">&quot;resolve_vec_mul128x128&quot;</span>)));</div>
</div><!-- fragment --><p>On the program's first call to an <em>IFUNC</em> symbol, the dynamic linker calls the resolver function associated with that symbol. The resolver function performs a runtime check to determine the platform, selects the (closest) matching platform specific function, then returns that function's address to the dynamic linker.</p>
<p>The dynamic linker stores this function address in the caller's Procedure Linkage Table (PLT) before forwarding the call to the resolved implementation. Any subsequent calls to this function symbol branch (via the PLT) directly to the appropriate platform specific implementation.</p>
<dl class="section note"><dt>Note</dt><dd>The operation <a class="el" href="vec__int512__ppc_8h.html#ab5b80fd9694cea8bf502b26e55af37f7" title="Vector 128x128bit Unsigned Integer Multiply.">vec_mul128x128()</a> has multiple implementations and names. It has a static inline implementation <a class="el" href="vec__int512__ppc_8h.html#a958e029fc824ec3a73ad9550bf7ea506" title="Vector 128x128bit Unsigned Integer Multiply.">vec_mul128x128_inline()</a>. This uses the static inline <a class="el" href="vec__int128__ppc_8h.html#aee5c5b2998ef105b4c6f39739748ffa8" title="Vector Multiply Unsigned Double Quadword.">vec_muludq()</a> from vec_int128_ppc.h but returns the 256-bit result as a single struct <a class="el" href="struct____VEC__U__256.html" title="A vector representation of a 256-bit unsigned integer.">__VEC_U_256</a>. It has a number (currently 2 or 3) of target qualified extern declarations and static implementations for static linkage. And it has an unqualified extern declaration and IFUNC attributed symbol associated with its resolver for dynamic linkage.</dd></dl>
<dl class="todo"><dt><b><a class="el" href="todo.html#_todo000002">Todo:</a></b></dt><dd>Currently the dynamic resolvers and <em>IFUNC</em> symbols for vec_int512_runtime.c are contained within vec_runtime_DYN.c. As the list of runtime operations expands to other element sizes/types, vec_runtime_DYN.c should be refactored into multiple files.</dd></dl>
</div><h2 class="groupheader">Macro Definition Documentation</h2>
<a id="a9376cb8baadb875605593f95422a1902"></a>
<h2 class="memtitle"><span class="permalink"><a href="#a9376cb8baadb875605593f95422a1902">&#9670;&nbsp;</a></span>COMPILE_FENCE</h2>

<div class="memitem">
<div class="memproto">
      <table class="memname">
        <tr>
          <td class="memname">#define COMPILE_FENCE&#160;&#160;&#160;__asm (&quot;;&quot;:::)</td>
        </tr>
      </table>
</div><div class="memdoc">

<p>A compiler fence to prevent excessive code motion. </p>
<p>We use the COMPILE_FENCE to limit instruction scheduling and code motion to smaller code blocks. This in turn reduces register pressure and avoids generating spill code. </p>

</div>
</div>
<a id="aac9d31829610c29b0f5558bfb1f18e4a"></a>
<h2 class="memtitle"><span class="permalink"><a href="#aac9d31829610c29b0f5558bfb1f18e4a">&#9670;&nbsp;</a></span>CONST_VINT512_Q</h2>

<div class="memitem">
<div class="memproto">
      <table class="memname">
        <tr>
          <td class="memname">#define CONST_VINT512_Q</td>
          <td>(</td>
          <td class="paramtype">&#160;</td>
          <td class="paramname">__q0, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">&#160;</td>
          <td class="paramname">__q1, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">&#160;</td>
          <td class="paramname">__q2, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">&#160;</td>
          <td class="paramname">__q3&#160;</td>
        </tr>
        <tr>
          <td></td>
          <td>)</td>
          <td></td><td>&#160;&#160;&#160;{__q3, __q2, __q1, __q0}</td>
        </tr>
      </table>
</div><div class="memdoc">

<p>Generate a 512-bit vector unsigned integer constant from 4 x quadword constants. </p>
<p>Combine 4 x quadword constants into a 512-bit <a class="el" href="struct____VEC__U__512.html" title="A vector representation of a 512-bit unsigned integer.">__VEC_U_512</a> constant. The 4 parameters are quadword integer constant values in high to low order. For example:</p>
<div class="fragment"><div class="line"><span class="comment">// 512-bit integer constant for 10**128</span></div>
<div class="line"><span class="keyword">const</span> <a class="code" href="struct____VEC__U__512.html">__VEC_U_512</a>  vec512_ten128th = <a class="code" href="vec__int512__ppc_8h.html#aac9d31829610c29b0f5558bfb1f18e4a">CONST_VINT512_Q</a></div>
<div class="line">    (</div>
<div class="line">      <a class="code" href="vec__int128__ppc_8h.html#a0f75e65180e68c4753f3d9c2f42d1a31">CONST_VUINT128_QxW</a> (0x00000000, 0x00000000, 0x0000024e, 0xe91f2603),</div>
<div class="line">      <a class="code" href="vec__int128__ppc_8h.html#a0f75e65180e68c4753f3d9c2f42d1a31">CONST_VUINT128_QxW</a> (0xa6337f19, 0xbccdb0da, 0xc404dc08, 0xd3cff5ec),</div>
<div class="line">      <a class="code" href="vec__int128__ppc_8h.html#a0f75e65180e68c4753f3d9c2f42d1a31">CONST_VUINT128_QxW</a> (0x2374e42f, 0x0f1538fd, 0x03df9909, 0x2e953e01),</div>
<div class="line">      <a class="code" href="vec__int128__ppc_8h.html#a0f75e65180e68c4753f3d9c2f42d1a31">CONST_VUINT128_QxW</a> (0x00000000, 0x00000000, 0x00000000, 0x00000000)</div>
<div class="line">    );</div>
</div><!-- fragment --> 
</div>
</div>
<h2 class="groupheader">Function Documentation</h2>
<a id="abf330c11973fdef2cccefa199c3473b9"></a>
<h2 class="memtitle"><span class="permalink"><a href="#abf330c11973fdef2cccefa199c3473b9">&#9670;&nbsp;</a></span>vec_add512cu()</h2>

<div class="memitem">
<div class="memproto">
<table class="mlabels">
  <tr>
  <td class="mlabels-left">
      <table class="memname">
        <tr>
          <td class="memname">static <a class="el" href="struct____VEC__U__640.html">__VEC_U_640</a> vec_add512cu </td>
          <td>(</td>
          <td class="paramtype"><a class="el" href="struct____VEC__U__512.html">__VEC_U_512</a>&#160;</td>
          <td class="paramname"><em>a</em>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype"><a class="el" href="struct____VEC__U__512.html">__VEC_U_512</a>&#160;</td>
          <td class="paramname"><em>b</em>&#160;</td>
        </tr>
        <tr>
          <td></td>
          <td>)</td>
          <td></td><td></td>
        </tr>
      </table>
  </td>
  <td class="mlabels-right">
<span class="mlabels"><span class="mlabel">inline</span><span class="mlabel">static</span></span>  </td>
  </tr>
</table>
</div><div class="memdoc">

<p>Vector Add 512-bit Unsigned Integer &amp; Write Carry. </p>
<p>Compute the 512 bit sum of two 512 bit values a, b and produce the carry. The sum (with-carry) is returned as single 640-bit integer in a homogeneous aggregate structure.</p>
<table class="markdownTable">
<tr class="markdownTableHead">
<th class="markdownTableHeadRight">processor </th><th class="markdownTableHeadCenter">Latency </th><th class="markdownTableHeadLeft">Throughput  </th></tr>
<tr class="markdownTableRowOdd">
<td class="markdownTableBodyRight">power8 </td><td class="markdownTableBodyCenter">16 </td><td class="markdownTableBodyLeft">1/cycle  </td></tr>
<tr class="markdownTableRowEven">
<td class="markdownTableBodyRight">power9 </td><td class="markdownTableBodyCenter">12 </td><td class="markdownTableBodyLeft">1/cycle  </td></tr>
</table>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">a</td><td>vector representation of a unsigned 512-bit integer. </td></tr>
    <tr><td class="paramname">b</td><td>vector representation of a unsigned 512-bit integer. </td></tr>
  </table>
  </dd>
</dl>
<dl class="section return"><dt>Returns</dt><dd>homogeneous aggregate representation of the unsigned 640-bit sum of a + b. </dd></dl>

</div>
</div>
<a id="a65346bdb9a4a7cc51bcca3e26443936a"></a>
<h2 class="memtitle"><span class="permalink"><a href="#a65346bdb9a4a7cc51bcca3e26443936a">&#9670;&nbsp;</a></span>vec_add512ecu()</h2>

<div class="memitem">
<div class="memproto">
<table class="mlabels">
  <tr>
  <td class="mlabels-left">
      <table class="memname">
        <tr>
          <td class="memname">static <a class="el" href="struct____VEC__U__640.html">__VEC_U_640</a> vec_add512ecu </td>
          <td>(</td>
          <td class="paramtype"><a class="el" href="struct____VEC__U__512.html">__VEC_U_512</a>&#160;</td>
          <td class="paramname"><em>a</em>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype"><a class="el" href="struct____VEC__U__512.html">__VEC_U_512</a>&#160;</td>
          <td class="paramname"><em>b</em>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype"><a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>&#160;</td>
          <td class="paramname"><em>c</em>&#160;</td>
        </tr>
        <tr>
          <td></td>
          <td>)</td>
          <td></td><td></td>
        </tr>
      </table>
  </td>
  <td class="mlabels-right">
<span class="mlabels"><span class="mlabel">inline</span><span class="mlabel">static</span></span>  </td>
  </tr>
</table>
</div><div class="memdoc">

<p>Vector Add Extended 512-bit Unsigned Integer &amp; Write Carry. </p>
<p>Compute the 512 bit sum of two 512 bit values a, b and the 1 bit carry-in value c. Produce the carry out of the high order bit of the sum. The sum (with-carry) is returned as a single 640-bit integer in a homogeneous aggregate structure.</p>
<table class="markdownTable">
<tr class="markdownTableHead">
<th class="markdownTableHeadRight">processor </th><th class="markdownTableHeadCenter">Latency </th><th class="markdownTableHeadLeft">Throughput  </th></tr>
<tr class="markdownTableRowOdd">
<td class="markdownTableBodyRight">power8 </td><td class="markdownTableBodyCenter">16 </td><td class="markdownTableBodyLeft">1/cycle  </td></tr>
<tr class="markdownTableRowEven">
<td class="markdownTableBodyRight">power9 </td><td class="markdownTableBodyCenter">12 </td><td class="markdownTableBodyLeft">1/cycle  </td></tr>
</table>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">a</td><td>vector representation of a unsigned 512-bit integer. </td></tr>
    <tr><td class="paramname">b</td><td>vector representation of a unsigned 512-bit integer. </td></tr>
    <tr><td class="paramname">c</td><td>vector representation of a unsigned 1-bit carry. </td></tr>
  </table>
  </dd>
</dl>
<dl class="section return"><dt>Returns</dt><dd>homogeneous aggregate representation of the unsigned 640-bit sum of a + b + c. </dd></dl>

</div>
</div>
<a id="a386ebc9ccc979fc415b8d6d66ef2e31c"></a>
<h2 class="memtitle"><span class="permalink"><a href="#a386ebc9ccc979fc415b8d6d66ef2e31c">&#9670;&nbsp;</a></span>vec_add512eum()</h2>

<div class="memitem">
<div class="memproto">
<table class="mlabels">
  <tr>
  <td class="mlabels-left">
      <table class="memname">
        <tr>
          <td class="memname">static <a class="el" href="struct____VEC__U__512.html">__VEC_U_512</a> vec_add512eum </td>
          <td>(</td>
          <td class="paramtype"><a class="el" href="struct____VEC__U__512.html">__VEC_U_512</a>&#160;</td>
          <td class="paramname"><em>a</em>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype"><a class="el" href="struct____VEC__U__512.html">__VEC_U_512</a>&#160;</td>
          <td class="paramname"><em>b</em>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype"><a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>&#160;</td>
          <td class="paramname"><em>c</em>&#160;</td>
        </tr>
        <tr>
          <td></td>
          <td>)</td>
          <td></td><td></td>
        </tr>
      </table>
  </td>
  <td class="mlabels-right">
<span class="mlabels"><span class="mlabel">inline</span><span class="mlabel">static</span></span>  </td>
  </tr>
</table>
</div><div class="memdoc">

<p>Vector Add Extended 512-bit Unsigned Integer Modulo. </p>
<p>Compute the 512 bit sum of two 512 bit values a, b and the 1 bit carry-in value c. The sum is returned as a single 512-bit integer in a homogeneous aggregate structure. Any carry-out of the high order bit of the sum is lost.</p>
<table class="markdownTable">
<tr class="markdownTableHead">
<th class="markdownTableHeadRight">processor </th><th class="markdownTableHeadCenter">Latency </th><th class="markdownTableHeadLeft">Throughput  </th></tr>
<tr class="markdownTableRowOdd">
<td class="markdownTableBodyRight">power8 </td><td class="markdownTableBodyCenter">16 </td><td class="markdownTableBodyLeft">1/cycle  </td></tr>
<tr class="markdownTableRowEven">
<td class="markdownTableBodyRight">power9 </td><td class="markdownTableBodyCenter">12 </td><td class="markdownTableBodyLeft">1/cycle  </td></tr>
</table>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">a</td><td>vector representation of a unsigned 512-bit integer. </td></tr>
    <tr><td class="paramname">b</td><td>vector representation of a unsigned 512-bit integer. </td></tr>
    <tr><td class="paramname">c</td><td>vector representation of a unsigned 1-bit carry. </td></tr>
  </table>
  </dd>
</dl>
<dl class="section return"><dt>Returns</dt><dd>homogeneous aggregate representation of the unsigned 512-bit sum of a + b + c. </dd></dl>

</div>
</div>
<a id="a7508430b3dbea4d708c1e45de93dca04"></a>
<h2 class="memtitle"><span class="permalink"><a href="#a7508430b3dbea4d708c1e45de93dca04">&#9670;&nbsp;</a></span>vec_add512um()</h2>

<div class="memitem">
<div class="memproto">
<table class="mlabels">
  <tr>
  <td class="mlabels-left">
      <table class="memname">
        <tr>
          <td class="memname">static <a class="el" href="struct____VEC__U__512.html">__VEC_U_512</a> vec_add512um </td>
          <td>(</td>
          <td class="paramtype"><a class="el" href="struct____VEC__U__512.html">__VEC_U_512</a>&#160;</td>
          <td class="paramname"><em>a</em>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype"><a class="el" href="struct____VEC__U__512.html">__VEC_U_512</a>&#160;</td>
          <td class="paramname"><em>b</em>&#160;</td>
        </tr>
        <tr>
          <td></td>
          <td>)</td>
          <td></td><td></td>
        </tr>
      </table>
  </td>
  <td class="mlabels-right">
<span class="mlabels"><span class="mlabel">inline</span><span class="mlabel">static</span></span>  </td>
  </tr>
</table>
</div><div class="memdoc">

<p>Vector Add 512-bit Unsigned Integer Modulo. </p>
<p>Compute the 512 bit sum of two 512 bit values a, b. The sum is returned as single 512-bit integer in a homogeneous aggregate structure. Any carry-out of the high order bit of the sum is lost.</p>
<table class="markdownTable">
<tr class="markdownTableHead">
<th class="markdownTableHeadRight">processor </th><th class="markdownTableHeadCenter">Latency </th><th class="markdownTableHeadLeft">Throughput  </th></tr>
<tr class="markdownTableRowOdd">
<td class="markdownTableBodyRight">power8 </td><td class="markdownTableBodyCenter">16 </td><td class="markdownTableBodyLeft">1/cycle  </td></tr>
<tr class="markdownTableRowEven">
<td class="markdownTableBodyRight">power9 </td><td class="markdownTableBodyCenter">12 </td><td class="markdownTableBodyLeft">1/cycle  </td></tr>
</table>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">a</td><td>vector representation of a unsigned 512-bit integer. </td></tr>
    <tr><td class="paramname">b</td><td>vector representation of a unsigned 512-bit integer. </td></tr>
  </table>
  </dd>
</dl>
<dl class="section return"><dt>Returns</dt><dd>homogeneous aggregate representation of the unsigned 512-bit sum of a + b. </dd></dl>

</div>
</div>
<a id="a07a919fc91b93cf47cd48c9dd2a79ae6"></a>
<h2 class="memtitle"><span class="permalink"><a href="#a07a919fc91b93cf47cd48c9dd2a79ae6">&#9670;&nbsp;</a></span>vec_add512ze()</h2>

<div class="memitem">
<div class="memproto">
<table class="mlabels">
  <tr>
  <td class="mlabels-left">
      <table class="memname">
        <tr>
          <td class="memname">static <a class="el" href="struct____VEC__U__512.html">__VEC_U_512</a> vec_add512ze </td>
          <td>(</td>
          <td class="paramtype"><a class="el" href="struct____VEC__U__512.html">__VEC_U_512</a>&#160;</td>
          <td class="paramname"><em>a</em>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype"><a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>&#160;</td>
          <td class="paramname"><em>c</em>&#160;</td>
        </tr>
        <tr>
          <td></td>
          <td>)</td>
          <td></td><td></td>
        </tr>
      </table>
  </td>
  <td class="mlabels-right">
<span class="mlabels"><span class="mlabel">inline</span><span class="mlabel">static</span></span>  </td>
  </tr>
</table>
</div><div class="memdoc">

<p>Vector Add 512-bit to Zero Extended Unsigned Integer Modulo. </p>
<p>The carry-in is zero extended to the left before computing the 512-bit sum a + c. The sum is returned as single 512-bit integer in a homogeneous aggregate structure. Any carry-out of the high order bit of the sum is lost.</p>
<table class="markdownTable">
<tr class="markdownTableHead">
<th class="markdownTableHeadRight">processor </th><th class="markdownTableHeadCenter">Latency </th><th class="markdownTableHeadLeft">Throughput  </th></tr>
<tr class="markdownTableRowOdd">
<td class="markdownTableBodyRight">power8 </td><td class="markdownTableBodyCenter">16 </td><td class="markdownTableBodyLeft">1/cycle  </td></tr>
<tr class="markdownTableRowEven">
<td class="markdownTableBodyRight">power9 </td><td class="markdownTableBodyCenter">12 </td><td class="markdownTableBodyLeft">1/cycle  </td></tr>
</table>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">a</td><td>vector representation of a unsigned 512-bit integer. </td></tr>
    <tr><td class="paramname">c</td><td>vector representation of a unsigned 1-bit carry. </td></tr>
  </table>
  </dd>
</dl>
<dl class="section return"><dt>Returns</dt><dd>homogeneous aggregate representation of the unsigned 512-bit sum of a + c. </dd></dl>

</div>
</div>
<a id="a3f54578765372cc100062d52caaba70c"></a>
<h2 class="memtitle"><span class="permalink"><a href="#a3f54578765372cc100062d52caaba70c">&#9670;&nbsp;</a></span>vec_add512ze2()</h2>

<div class="memitem">
<div class="memproto">
<table class="mlabels">
  <tr>
  <td class="mlabels-left">
      <table class="memname">
        <tr>
          <td class="memname">static <a class="el" href="struct____VEC__U__512.html">__VEC_U_512</a> vec_add512ze2 </td>
          <td>(</td>
          <td class="paramtype"><a class="el" href="struct____VEC__U__512.html">__VEC_U_512</a>&#160;</td>
          <td class="paramname"><em>a</em>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype"><a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>&#160;</td>
          <td class="paramname"><em>c1</em>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype"><a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>&#160;</td>
          <td class="paramname"><em>c2</em>&#160;</td>
        </tr>
        <tr>
          <td></td>
          <td>)</td>
          <td></td><td></td>
        </tr>
      </table>
  </td>
  <td class="mlabels-right">
<span class="mlabels"><span class="mlabel">inline</span><span class="mlabel">static</span></span>  </td>
  </tr>
</table>
</div><div class="memdoc">

<p>Vector Add 512-bit to Zero Extended2 Unsigned Integer Modulo. </p>
<p>The two carry-ins are zero extended to the left before computing the 512 bit sum a + c1 + c2. The sum is returned as a single 512-bit integer in a homogeneous aggregate structure. Any carry-out of the high order bit of the sum is lost.</p>
<table class="markdownTable">
<tr class="markdownTableHead">
<th class="markdownTableHeadRight">processor </th><th class="markdownTableHeadCenter">Latency </th><th class="markdownTableHeadLeft">Throughput  </th></tr>
<tr class="markdownTableRowOdd">
<td class="markdownTableBodyRight">power8 </td><td class="markdownTableBodyCenter">16 </td><td class="markdownTableBodyLeft">1/cycle  </td></tr>
<tr class="markdownTableRowEven">
<td class="markdownTableBodyRight">power9 </td><td class="markdownTableBodyCenter">12 </td><td class="markdownTableBodyLeft">1/cycle  </td></tr>
</table>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">a</td><td>vector representation of a unsigned 512-bit integer. </td></tr>
    <tr><td class="paramname">c1</td><td>vector representation of a unsigned 1-bit carry. </td></tr>
    <tr><td class="paramname">c2</td><td>vector representation of a unsigned 1-bit carry. </td></tr>
  </table>
  </dd>
</dl>
<dl class="section return"><dt>Returns</dt><dd>homogeneous aggregate representation of the unsigned 512-bit sum of a + c1 + c2. </dd></dl>

</div>
</div>
<a id="a9f2101271dd5d072a8406bbed160b9c8"></a>
<h2 class="memtitle"><span class="permalink"><a href="#a9f2101271dd5d072a8406bbed160b9c8">&#9670;&nbsp;</a></span>vec_madd512x128a128_inline()</h2>

<div class="memitem">
<div class="memproto">
<table class="mlabels">
  <tr>
  <td class="mlabels-left">
      <table class="memname">
        <tr>
          <td class="memname">static <a class="el" href="struct____VEC__U__640.html">__VEC_U_640</a> vec_madd512x128a128_inline </td>
          <td>(</td>
          <td class="paramtype"><a class="el" href="struct____VEC__U__512.html">__VEC_U_512</a>&#160;</td>
          <td class="paramname"><em>m1</em>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype"><a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>&#160;</td>
          <td class="paramname"><em>m2</em>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype"><a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>&#160;</td>
          <td class="paramname"><em>a1</em>&#160;</td>
        </tr>
        <tr>
          <td></td>
          <td>)</td>
          <td></td><td></td>
        </tr>
      </table>
  </td>
  <td class="mlabels-right">
<span class="mlabels"><span class="mlabel">inline</span><span class="mlabel">static</span></span>  </td>
  </tr>
</table>
</div><div class="memdoc">

<p>Vector 512x128-bit Multiply-Add Unsigned Integer. </p>
<p>Compute the 640 bit sum of the product of the 512 bit value m1 and 128-bit value m2 plus the 128-bit value a1. The sum is returned as a single 640-bit integer in a homogeneous aggregate structure.</p>
<dl class="section note"><dt>Note</dt><dd>The advantage of this form is that the final 640 bit sum can not overflow and carries between stages are eliminated. Also applying the addend early (1st multiply stage) reduces the live ranges for registers passing partial products for larger multiple precision multiplies.</dd>
<dd>
We use the COMPILE_FENCE to limit instruction scheduling and code motion to smaller code blocks. This in turn reduces register pressure and avoids generating spill code.</dd></dl>
<table class="markdownTable">
<tr class="markdownTableHead">
<th class="markdownTableHeadRight">processor </th><th class="markdownTableHeadCenter">Latency </th><th class="markdownTableHeadLeft">Throughput  </th></tr>
<tr class="markdownTableRowOdd">
<td class="markdownTableBodyRight">power8 </td><td class="markdownTableBodyCenter">224-232 </td><td class="markdownTableBodyLeft">1/cycle  </td></tr>
<tr class="markdownTableRowEven">
<td class="markdownTableBodyRight">power9 </td><td class="markdownTableBodyCenter">132-135 </td><td class="markdownTableBodyLeft">1/cycle  </td></tr>
</table>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">m1</td><td>vector representation of a unsigned 512-bit integer. </td></tr>
    <tr><td class="paramname">m2</td><td>vector representation of a unsigned 128-bit integer. </td></tr>
    <tr><td class="paramname">a1</td><td>vector representation of a unsigned 128-bit integer. </td></tr>
  </table>
  </dd>
</dl>
<dl class="section return"><dt>Returns</dt><dd>homogeneous aggregate representation of the unsigned 640-bit sum of (m1 * m2) + a1. </dd></dl>

</div>
</div>
<a id="a68ccbd4fd74977eb3f69276c6b934c26"></a>
<h2 class="memtitle"><span class="permalink"><a href="#a68ccbd4fd74977eb3f69276c6b934c26">&#9670;&nbsp;</a></span>vec_madd512x128a128a512_inline()</h2>

<div class="memitem">
<div class="memproto">
<table class="mlabels">
  <tr>
  <td class="mlabels-left">
      <table class="memname">
        <tr>
          <td class="memname">static <a class="el" href="struct____VEC__U__640.html">__VEC_U_640</a> vec_madd512x128a128a512_inline </td>
          <td>(</td>
          <td class="paramtype"><a class="el" href="struct____VEC__U__512.html">__VEC_U_512</a>&#160;</td>
          <td class="paramname"><em>m1</em>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype"><a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>&#160;</td>
          <td class="paramname"><em>m2</em>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype"><a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>&#160;</td>
          <td class="paramname"><em>a1</em>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype"><a class="el" href="struct____VEC__U__512.html">__VEC_U_512</a>&#160;</td>
          <td class="paramname"><em>a2</em>&#160;</td>
        </tr>
        <tr>
          <td></td>
          <td>)</td>
          <td></td><td></td>
        </tr>
      </table>
  </td>
  <td class="mlabels-right">
<span class="mlabels"><span class="mlabel">inline</span><span class="mlabel">static</span></span>  </td>
  </tr>
</table>
</div><div class="memdoc">

<p>Vector 512x128-bit Multiply-Add Unsigned Integer. </p>
<p>Compute the 640 bit sum of the product of the 512 bit value m1 and 128-bit value m2, plus the 128-bit value a1, plus the 512-bit value a2. The sum is returned as a single 640-bit integer in a homogeneous aggregate structure.</p>
<dl class="section note"><dt>Note</dt><dd>The advantage of this form is that the final 640 bit sum can not overflow and carries between stages are eliminated. Also applying the addend early (1st multiply stage) reduces the live ranges for registers passing partial products for larger multiple precision multiplies.</dd>
<dd>
We use the COMPILE_FENCE to limit instruction scheduling and code motion to smaller code blocks. This in turn reduces register pressure and avoids generating spill code.</dd></dl>
<table class="markdownTable">
<tr class="markdownTableHead">
<th class="markdownTableHeadRight">processor </th><th class="markdownTableHeadCenter">Latency </th><th class="markdownTableHeadLeft">Throughput  </th></tr>
<tr class="markdownTableRowOdd">
<td class="markdownTableBodyRight">power8 </td><td class="markdownTableBodyCenter">224-232 </td><td class="markdownTableBodyLeft">1/cycle  </td></tr>
<tr class="markdownTableRowEven">
<td class="markdownTableBodyRight">power9 </td><td class="markdownTableBodyCenter">132-135 </td><td class="markdownTableBodyLeft">1/cycle  </td></tr>
</table>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">m1</td><td>vector representation of an unsigned 512-bit integer. </td></tr>
    <tr><td class="paramname">m2</td><td>vector representation of an unsigned 128-bit integer. </td></tr>
    <tr><td class="paramname">a1</td><td>vector representation of an unsigned 128-bit integer. </td></tr>
    <tr><td class="paramname">a2</td><td>vector representation of an unsigned 512-bit integer. </td></tr>
  </table>
  </dd>
</dl>
<dl class="section return"><dt>Returns</dt><dd>homogeneous aggregate representation of the unsigned 640-bit sum of (m1 * m2) + a1 + a2. </dd></dl>

</div>
</div>
<a id="acf5c808a77a8486a82a9ee87ff414fd2"></a>
<h2 class="memtitle"><span class="permalink"><a href="#acf5c808a77a8486a82a9ee87ff414fd2">&#9670;&nbsp;</a></span>vec_madd512x128a512()</h2>

<div class="memitem">
<div class="memproto">
      <table class="memname">
        <tr>
          <td class="memname"><a class="el" href="struct____VEC__U__640.html">__VEC_U_640</a> vec_madd512x128a512 </td>
          <td>(</td>
          <td class="paramtype"><a class="el" href="struct____VEC__U__512.html">__VEC_U_512</a>&#160;</td>
          <td class="paramname"><em>m1</em>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype"><a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>&#160;</td>
          <td class="paramname"><em>m2</em>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype"><a class="el" href="struct____VEC__U__512.html">__VEC_U_512</a>&#160;</td>
          <td class="paramname"><em>a2</em>&#160;</td>
        </tr>
        <tr>
          <td></td>
          <td>)</td>
          <td></td><td></td>
        </tr>
      </table>
</div><div class="memdoc">

<p>Vector 512x128-bit Multiply-Add Unsigned Integer. </p>
<p>Compute the 640 bit sum of the product of the 512 bit value m1 and 128-bit value m2 plus the 512-bit value a2. The sum is returned as single 640-bit integer in a homogeneous aggregate structure.</p>
<dl class="section note"><dt>Note</dt><dd>The advantage of this form is that the final 640 bit sum can not overflow and carries between stages are eliminated. Also applying the addend early (1st multiply stage) reduces the live ranges for registers passing partial products for larger multiple precision multiplies.</dd></dl>
<table class="markdownTable">
<tr class="markdownTableHead">
<th class="markdownTableHeadRight">processor </th><th class="markdownTableHeadCenter">Latency </th><th class="markdownTableHeadLeft">Throughput  </th></tr>
<tr class="markdownTableRowOdd">
<td class="markdownTableBodyRight">power8 </td><td class="markdownTableBodyCenter">224-232 </td><td class="markdownTableBodyLeft">1/cycle  </td></tr>
<tr class="markdownTableRowEven">
<td class="markdownTableBodyRight">power9 </td><td class="markdownTableBodyCenter">132-135 </td><td class="markdownTableBodyLeft">1/cycle  </td></tr>
</table>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">m1</td><td>vector representation of an unsigned 512-bit integer. </td></tr>
    <tr><td class="paramname">m2</td><td>vector representation of an unsigned 128-bit integer. </td></tr>
    <tr><td class="paramname">a2</td><td>vector representation of an unsigned 512-bit integer. </td></tr>
  </table>
  </dd>
</dl>
<dl class="section return"><dt>Returns</dt><dd>homogeneous aggregate representation of the unsigned 640-bit sum of (m1 * m2) + a2. </dd></dl>

</div>
</div>
<a id="ac9f6153c72e2194f14627ffca1b26c17"></a>
<h2 class="memtitle"><span class="permalink"><a href="#ac9f6153c72e2194f14627ffca1b26c17">&#9670;&nbsp;</a></span>vec_madd512x128a512_inline()</h2>

<div class="memitem">
<div class="memproto">
<table class="mlabels">
  <tr>
  <td class="mlabels-left">
      <table class="memname">
        <tr>
          <td class="memname">static <a class="el" href="struct____VEC__U__640.html">__VEC_U_640</a> vec_madd512x128a512_inline </td>
          <td>(</td>
          <td class="paramtype"><a class="el" href="struct____VEC__U__512.html">__VEC_U_512</a>&#160;</td>
          <td class="paramname"><em>m1</em>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype"><a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>&#160;</td>
          <td class="paramname"><em>m2</em>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype"><a class="el" href="struct____VEC__U__512.html">__VEC_U_512</a>&#160;</td>
          <td class="paramname"><em>a2</em>&#160;</td>
        </tr>
        <tr>
          <td></td>
          <td>)</td>
          <td></td><td></td>
        </tr>
      </table>
  </td>
  <td class="mlabels-right">
<span class="mlabels"><span class="mlabel">inline</span><span class="mlabel">static</span></span>  </td>
  </tr>
</table>
</div><div class="memdoc">

<p>Vector 512x128-bit Multiply-Add Unsigned Integer. </p>
<p>Compute the 640-bit sum of the product of the 512-bit value m1 and the 128-bit value m2, plus the 512-bit value a2. The sum is returned as a single 640-bit integer in a homogeneous aggregate structure.</p>
<dl class="section note"><dt>Note</dt><dd>The advantage of this form is that the final 640 bit sum can not overflow and carries between stages are eliminated. Also applying the addend early (1st multiply stage) reduces the live ranges for registers passing partial products for larger multiple precision multiplies.</dd>
<dd>
We use the COMPILER_FENCE to limit instruction scheduling and code motion to smaller code blocks. This in turn reduces register pressure and avoids generating spill code.</dd></dl>
<table class="markdownTable">
<tr class="markdownTableHead">
<th class="markdownTableHeadRight">processor </th><th class="markdownTableHeadCenter">Latency </th><th class="markdownTableHeadLeft">Throughput  </th></tr>
<tr class="markdownTableRowOdd">
<td class="markdownTableBodyRight">power8 </td><td class="markdownTableBodyCenter">224-232 </td><td class="markdownTableBodyLeft">1/cycle  </td></tr>
<tr class="markdownTableRowEven">
<td class="markdownTableBodyRight">power9 </td><td class="markdownTableBodyCenter">132-135 </td><td class="markdownTableBodyLeft">1/cycle  </td></tr>
</table>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">m1</td><td>vector representation of an unsigned 512-bit integer. </td></tr>
    <tr><td class="paramname">m2</td><td>vector representation of an unsigned 128-bit integer. </td></tr>
    <tr><td class="paramname">a2</td><td>vector representation of an unsigned 512-bit integer. </td></tr>
  </table>
  </dd>
</dl>
<dl class="section return"><dt>Returns</dt><dd>homogeneous aggregate representation of the unsigned 640-bit sum of (m1 * m2) + a2. </dd></dl>

</div>
</div>
<a id="a666241f67c39d7fae639235edfb8c3b5"></a>
<h2 class="memtitle"><span class="permalink"><a href="#a666241f67c39d7fae639235edfb8c3b5">&#9670;&nbsp;</a></span>vec_madd512x512a512_inline()</h2>

<div class="memitem">
<div class="memproto">
<table class="mlabels">
  <tr>
  <td class="mlabels-left">
      <table class="memname">
        <tr>
          <td class="memname">static <a class="el" href="struct____VEC__U__1024.html">__VEC_U_1024</a> vec_madd512x512a512_inline </td>
          <td>(</td>
          <td class="paramtype"><a class="el" href="struct____VEC__U__512.html">__VEC_U_512</a>&#160;</td>
          <td class="paramname"><em>m1</em>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype"><a class="el" href="struct____VEC__U__512.html">__VEC_U_512</a>&#160;</td>
          <td class="paramname"><em>m2</em>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype"><a class="el" href="struct____VEC__U__512.html">__VEC_U_512</a>&#160;</td>
          <td class="paramname"><em>a1</em>&#160;</td>
        </tr>
        <tr>
          <td></td>
          <td>)</td>
          <td></td><td></td>
        </tr>
      </table>
  </td>
  <td class="mlabels-right">
<span class="mlabels"><span class="mlabel">inline</span><span class="mlabel">static</span></span>  </td>
  </tr>
</table>
</div><div class="memdoc">

<p>Vector 512-bit Unsigned Integer Multiply-Add. </p>
<p>Compute the 1024 bit sum of the product of 512 bit values m1 and m2 and 512 bit addend a1. The sum is returned as single 1024-bit integer in a homogeneous aggregate structure.</p>
<dl class="section note"><dt>Note</dt><dd>The advantage of this form is that the final 1024 bit sum can not overflow and carries between stages are eliminated. Also applying the addend early (1st multiply stage) reduces the live ranges for registers passing partial products for larger multiple precision multiplies. </dd>
<dd>
We use the COMPILER_FENCE to limit instruction scheduling and code motion to smaller code blocks. This in turn reduces register pressure and avoids generating spill code.</dd></dl>
<table class="markdownTable">
<tr class="markdownTableHead">
<th class="markdownTableHeadRight">processor </th><th class="markdownTableHeadCenter">Latency </th><th class="markdownTableHeadLeft">Throughput  </th></tr>
<tr class="markdownTableRowOdd">
<td class="markdownTableBodyRight">power8 </td><td class="markdownTableBodyCenter">~600 </td><td class="markdownTableBodyLeft">1/cycle  </td></tr>
<tr class="markdownTableRowEven">
<td class="markdownTableBodyRight">power9 </td><td class="markdownTableBodyCenter">~210 </td><td class="markdownTableBodyLeft">1/cycle  </td></tr>
</table>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">m1</td><td>vector representation of an unsigned 512-bit integer. </td></tr>
    <tr><td class="paramname">m2</td><td>vector representation of an unsigned 512-bit integer. </td></tr>
    <tr><td class="paramname">a1</td><td>vector representation of an unsigned 512-bit integer. </td></tr>
  </table>
  </dd>
</dl>
<dl class="section return"><dt>Returns</dt><dd>homogeneous aggregate representation of the unsigned 1024-bit sum of (m1 * m2) + a1. </dd></dl>

</div>
</div>
<a id="ad4ade47617ecf223a2f7b0325e1fc877"></a>
<h2 class="memtitle"><span class="permalink"><a href="#ad4ade47617ecf223a2f7b0325e1fc877">&#9670;&nbsp;</a></span>vec_mul1024x1024()</h2>

<div class="memitem">
<div class="memproto">
      <table class="memname">
        <tr>
          <td class="memname">void vec_mul1024x1024 </td>
          <td>(</td>
          <td class="paramtype"><a class="el" href="struct____VEC__U__2048.html">__VEC_U_2048</a> *&#160;</td>
          <td class="paramname"><em>p2048</em>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype"><a class="el" href="struct____VEC__U__1024.html">__VEC_U_1024</a> *&#160;</td>
          <td class="paramname"><em>m1</em>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype"><a class="el" href="struct____VEC__U__1024.html">__VEC_U_1024</a> *&#160;</td>
          <td class="paramname"><em>m2</em>&#160;</td>
        </tr>
        <tr>
          <td></td>
          <td>)</td>
          <td></td><td></td>
        </tr>
      </table>
</div><div class="memdoc">

<p>Vector 1024x1024-bit Unsigned Integer Multiply. </p>
<p>Compute the 2048-bit product of the 1024-bit values m1 and m2. The product is stored as a single 2048-bit integer in the homogeneous aggregate structure addressed by p2048.</p>
<dl class="section note"><dt>Note</dt><dd>This is the dynamic call ABI for IFUNC selection. The static implementations are vec_mul1024x1024_PWR8 and vec_mul1024x1024_PWR9. For static calls the <a class="el" href="vec__int512__ppc_8h.html#a77eca5d7bebe0f30894fe9669c01b7a7" title="Macro to add platform suffix for static calls.">__VEC_PWR_IMP()</a> macro will add appropriate suffix based on the compile -mcpu= option. </dd>
<dd>
The storage order for quadwords matches the system endian. On Little Endian systems the least significant quadword is quadword element 0. The most significant is quadword elements [M-1], [N-1], and [M+N-1]. On Big Endian systems the least significant quadword is quadword elements [M-1], [N-1], and [M+N-1]. The most significant is quadword element 0.</dd></dl>
<table class="markdownTable">
<tr class="markdownTableHead">
<th class="markdownTableHeadRight">processor </th><th class="markdownTableHeadCenter">Latency </th><th class="markdownTableHeadLeft">Throughput  </th></tr>
<tr class="markdownTableRowOdd">
<td class="markdownTableBodyRight">power8 </td><td class="markdownTableBodyCenter">~2500 </td><td class="markdownTableBodyLeft">1/cycle  </td></tr>
<tr class="markdownTableRowEven">
<td class="markdownTableBodyRight">power9 </td><td class="markdownTableBodyCenter">~810 </td><td class="markdownTableBodyLeft">1/cycle  </td></tr>
</table>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">p2048</td><td>vector result as an unsigned 2048-bit integer in storage. </td></tr>
    <tr><td class="paramname">m1</td><td>vector representation of an unsigned 1024-bit integer. </td></tr>
    <tr><td class="paramname">m2</td><td>vector representation of an unsigned 1024-bit integer. </td></tr>
  </table>
  </dd>
</dl>

</div>
</div>
<a id="a1a6652dfd5b6e5966acf4b75f0b89682"></a>
<h2 class="memtitle"><span class="permalink"><a href="#a1a6652dfd5b6e5966acf4b75f0b89682">&#9670;&nbsp;</a></span>vec_mul128_byMN()</h2>

<div class="memitem">
<div class="memproto">
      <table class="memname">
        <tr>
          <td class="memname">void vec_mul128_byMN </td>
          <td>(</td>
          <td class="paramtype"><a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> *&#160;</td>
          <td class="paramname"><em>p</em>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype"><a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> *&#160;</td>
          <td class="paramname"><em>m1</em>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype"><a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> *&#160;</td>
          <td class="paramname"><em>m2</em>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">unsigned long&#160;</td>
          <td class="paramname"><em>M</em>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">unsigned long&#160;</td>
          <td class="paramname"><em>N</em>&#160;</td>
        </tr>
        <tr>
          <td></td>
          <td>)</td>
          <td></td><td></td>
        </tr>
      </table>
</div><div class="memdoc">

<p>Vector Unsigned Integer Quadword MxN Multiply. </p>
<p>Compute the M+N quadword product of two quadword arrays m1, m2. The product is returned as M+N quadword array p.</p>
<dl class="section note"><dt>Note</dt><dd>This is the dynamic call ABI for IFUNC selection. The static implementations are vec_mul128_byMN_PWR8 and vec_mul128_byMN_PWR9. For static calls the <a class="el" href="vec__int512__ppc_8h.html#a77eca5d7bebe0f30894fe9669c01b7a7" title="Macro to add platform suffix for static calls.">__VEC_PWR_IMP()</a> macro will add appropriate suffix based on the compile -mcpu= option. </dd>
<dd>
The storage order for quadwords matches the system endian. On Little Endian systems the least significant quadword is quadword element 0. The most significant is quadword elements [M-1], [N-1], and [M+N-1]. On Big Endian systems the least significant quadword is quadword elements [M-1], [N-1], and [M+N-1]. The most significant is quadword element 0.</dd></dl>
<table class="markdownTable">
<tr class="markdownTableHead">
<th class="markdownTableHeadRight">processor </th><th class="markdownTableHeadCenter">Latency </th><th class="markdownTableHeadLeft">Throughput  </th></tr>
<tr class="markdownTableRowOdd">
<td class="markdownTableBodyRight">power8 </td><td class="markdownTableBodyCenter">??? </td><td class="markdownTableBodyLeft">1/cycle  </td></tr>
<tr class="markdownTableRowEven">
<td class="markdownTableBodyRight">power9 </td><td class="markdownTableBodyCenter">??? </td><td class="markdownTableBodyLeft">1/cycle  </td></tr>
</table>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">p</td><td>pointer to vector result as an unsigned (M+N)x128-bit integer in storage. </td></tr>
    <tr><td class="paramname">m1</td><td>pointer to vector representation of an unsigned Mx128-bit integer. </td></tr>
    <tr><td class="paramname">m2</td><td>pointer to vector representation of an unsigned Nx128-bit integer. </td></tr>
    <tr><td class="paramname">M</td><td>unsigned long specifying the number of quadwords in m1. </td></tr>
    <tr><td class="paramname">N</td><td>unsigned long specifying the number of quadwords in m2. </td></tr>
  </table>
  </dd>
</dl>

</div>
</div>
<a id="ab5b80fd9694cea8bf502b26e55af37f7"></a>
<h2 class="memtitle"><span class="permalink"><a href="#ab5b80fd9694cea8bf502b26e55af37f7">&#9670;&nbsp;</a></span>vec_mul128x128()</h2>

<div class="memitem">
<div class="memproto">
      <table class="memname">
        <tr>
          <td class="memname"><a class="el" href="struct____VEC__U__256.html">__VEC_U_256</a> vec_mul128x128 </td>
          <td>(</td>
          <td class="paramtype"><a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>&#160;</td>
          <td class="paramname"><em>m1</em>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype"><a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>&#160;</td>
          <td class="paramname"><em>m2</em>&#160;</td>
        </tr>
        <tr>
          <td></td>
          <td>)</td>
          <td></td><td></td>
        </tr>
      </table>
</div><div class="memdoc">

<p>Vector 128x128bit Unsigned Integer Multiply. </p>
<p>Compute the 256-bit product of two 128-bit values m1, m2. The product is returned as a single 256-bit integer in a homogeneous aggregate structure.</p>
<dl class="section note"><dt>Note</dt><dd>This is the dynamic call ABI for IFUNC selection. The static implementations are vec_mul128x128_PWR8 and vec_mul128x128_PWR9. For static calls the <a class="el" href="vec__int512__ppc_8h.html#a77eca5d7bebe0f30894fe9669c01b7a7" title="Macro to add platform suffix for static calls.">__VEC_PWR_IMP()</a> macro will add appropriate suffix based on the compile -mcpu= option.</dd></dl>
<table class="markdownTable">
<tr class="markdownTableHead">
<th class="markdownTableHeadRight">processor </th><th class="markdownTableHeadCenter">Latency </th><th class="markdownTableHeadLeft">Throughput  </th></tr>
<tr class="markdownTableRowOdd">
<td class="markdownTableBodyRight">power8 </td><td class="markdownTableBodyCenter">48-56 </td><td class="markdownTableBodyLeft">1/cycle  </td></tr>
<tr class="markdownTableRowEven">
<td class="markdownTableBodyRight">power9 </td><td class="markdownTableBodyCenter">16-24 </td><td class="markdownTableBodyLeft">1/cycle  </td></tr>
</table>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">m1</td><td>vector representation of an unsigned 128-bit integer. </td></tr>
    <tr><td class="paramname">m2</td><td>vector representation of an unsigned 128-bit integer. </td></tr>
  </table>
  </dd>
</dl>
<dl class="section return"><dt>Returns</dt><dd>homogeneous aggregate representation of the unsigned 256-bit product of m1 * m2. </dd></dl>

</div>
</div>
<a id="a958e029fc824ec3a73ad9550bf7ea506"></a>
<h2 class="memtitle"><span class="permalink"><a href="#a958e029fc824ec3a73ad9550bf7ea506">&#9670;&nbsp;</a></span>vec_mul128x128_inline()</h2>

<div class="memitem">
<div class="memproto">
<table class="mlabels">
  <tr>
  <td class="mlabels-left">
      <table class="memname">
        <tr>
          <td class="memname">static <a class="el" href="struct____VEC__U__256.html">__VEC_U_256</a> vec_mul128x128_inline </td>
          <td>(</td>
          <td class="paramtype"><a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>&#160;</td>
          <td class="paramname"><em>a</em>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype"><a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>&#160;</td>
          <td class="paramname"><em>b</em>&#160;</td>
        </tr>
        <tr>
          <td></td>
          <td>)</td>
          <td></td><td></td>
        </tr>
      </table>
  </td>
  <td class="mlabels-right">
<span class="mlabels"><span class="mlabel">inline</span><span class="mlabel">static</span></span>  </td>
  </tr>
</table>
</div><div class="memdoc">

<p>Vector 128x128bit Unsigned Integer Multiply. </p>
<p>Compute the 256 bit product of two 128 bit values a, b. The product is returned as single 256-bit integer in a homogeneous aggregate structure.</p>
<table class="markdownTable">
<tr class="markdownTableHead">
<th class="markdownTableHeadRight">processor </th><th class="markdownTableHeadCenter">Latency </th><th class="markdownTableHeadLeft">Throughput  </th></tr>
<tr class="markdownTableRowOdd">
<td class="markdownTableBodyRight">power8 </td><td class="markdownTableBodyCenter">56-64 </td><td class="markdownTableBodyLeft">1/cycle  </td></tr>
<tr class="markdownTableRowEven">
<td class="markdownTableBodyRight">power9 </td><td class="markdownTableBodyCenter">33-39 </td><td class="markdownTableBodyLeft">1/cycle  </td></tr>
</table>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">a</td><td>vector representation of an unsigned 128-bit integer. </td></tr>
    <tr><td class="paramname">b</td><td>vector representation of an unsigned 128-bit integer. </td></tr>
  </table>
  </dd>
</dl>
<dl class="section return"><dt>Returns</dt><dd>homogeneous aggregate representation of the unsigned 256-bit product of a * b. </dd></dl>

</div>
</div>
<a id="a8287aa4483acb25ac3188a97cc23b89a"></a>
<h2 class="memtitle"><span class="permalink"><a href="#a8287aa4483acb25ac3188a97cc23b89a">&#9670;&nbsp;</a></span>vec_mul2048x2048()</h2>

<div class="memitem">
<div class="memproto">
      <table class="memname">
        <tr>
          <td class="memname">void vec_mul2048x2048 </td>
          <td>(</td>
          <td class="paramtype"><a class="el" href="struct____VEC__U__4096.html">__VEC_U_4096</a> *&#160;</td>
          <td class="paramname"><em>p4096</em>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype"><a class="el" href="struct____VEC__U__2048.html">__VEC_U_2048</a> *&#160;</td>
          <td class="paramname"><em>m1</em>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype"><a class="el" href="struct____VEC__U__2048.html">__VEC_U_2048</a> *&#160;</td>
          <td class="paramname"><em>m2</em>&#160;</td>
        </tr>
        <tr>
          <td></td>
          <td>)</td>
          <td></td><td></td>
        </tr>
      </table>
</div><div class="memdoc">

<p>Vector 2048x2048-bit Unsigned Integer Multiply. </p>
<p>Compute the 4096-bit product of the 2048-bit values m1 and m2. The product is stored as a single 4096-bit integer in the homogeneous aggregate structure addressed by p4096.</p>
<dl class="section note"><dt>Note</dt><dd>This is the dynamic call ABI for IFUNC selection. The static implementations are vec_mul2048x2048_PWR8 and vec_mul2048x2048_PWR9. For static calls the <a class="el" href="vec__int512__ppc_8h.html#a77eca5d7bebe0f30894fe9669c01b7a7" title="Macro to add platform suffix for static calls.">__VEC_PWR_IMP()</a> macro will add appropriate suffix based on the compile -mcpu= option. </dd>
<dd>
The storage order for quadwords matches the system endian. On Little Endian systems the least significant quadword is quadword element 0. The most significant is quadword elements [M-1], [N-1], and [M+N-1]. On Big Endian systems the least significant quadword is quadword elements [M-1], [N-1], and [M+N-1]. The most significant is quadword element 0.</dd></dl>
<table class="markdownTable">
<tr class="markdownTableHead">
<th class="markdownTableHeadRight">processor </th><th class="markdownTableHeadCenter">Latency </th><th class="markdownTableHeadLeft">Throughput  </th></tr>
<tr class="markdownTableRowOdd">
<td class="markdownTableBodyRight">power8 </td><td class="markdownTableBodyCenter">~12000 </td><td class="markdownTableBodyLeft">1/cycle  </td></tr>
<tr class="markdownTableRowEven">
<td class="markdownTableBodyRight">power9 </td><td class="markdownTableBodyCenter">4770 </td><td class="markdownTableBodyLeft">1/cycle  </td></tr>
</table>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">p4096</td><td>vector result as an unsigned 4096-bit integer in storage. </td></tr>
    <tr><td class="paramname">m1</td><td>vector representation of an unsigned 2048-bit integer. </td></tr>
    <tr><td class="paramname">m2</td><td>vector representation of an unsigned 2048-bit integer. </td></tr>
  </table>
  </dd>
</dl>

</div>
</div>
<a id="a131bdfc55718991610c886b2c77f6ae7"></a>
<h2 class="memtitle"><span class="permalink"><a href="#a131bdfc55718991610c886b2c77f6ae7">&#9670;&nbsp;</a></span>vec_mul256x256()</h2>

<div class="memitem">
<div class="memproto">
      <table class="memname">
        <tr>
          <td class="memname"><a class="el" href="struct____VEC__U__512.html">__VEC_U_512</a> vec_mul256x256 </td>
          <td>(</td>
          <td class="paramtype"><a class="el" href="struct____VEC__U__256.html">__VEC_U_256</a>&#160;</td>
          <td class="paramname"><em>m1</em>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype"><a class="el" href="struct____VEC__U__256.html">__VEC_U_256</a>&#160;</td>
          <td class="paramname"><em>m2</em>&#160;</td>
        </tr>
        <tr>
          <td></td>
          <td>)</td>
          <td></td><td></td>
        </tr>
      </table>
</div><div class="memdoc">

<p>Vector 256x256-bit Unsigned Integer Multiply. </p>
<p>Compute the 512-bit product of two 256-bit values m1, m2. The product is returned as a single 512-bit integer in a homogeneous aggregate structure.</p>
<dl class="section note"><dt>Note</dt><dd>This is the dynamic call ABI for IFUNC selection. The static implementations are vec_mul256x256_PWR8 and vec_mul256x256_PWR9. For static calls the <a class="el" href="vec__int512__ppc_8h.html#a77eca5d7bebe0f30894fe9669c01b7a7" title="Macro to add platform suffix for static calls.">__VEC_PWR_IMP()</a> macro will add appropriate suffix based on the compile -mcpu= option.</dd></dl>
<table class="markdownTable">
<tr class="markdownTableHead">
<th class="markdownTableHeadRight">processor </th><th class="markdownTableHeadCenter">Latency </th><th class="markdownTableHeadLeft">Throughput  </th></tr>
<tr class="markdownTableRowOdd">
<td class="markdownTableBodyRight">power8 </td><td class="markdownTableBodyCenter">140-150 </td><td class="markdownTableBodyLeft">1/cycle  </td></tr>
<tr class="markdownTableRowEven">
<td class="markdownTableBodyRight">power9 </td><td class="markdownTableBodyCenter">46-58 </td><td class="markdownTableBodyLeft">1/cycle  </td></tr>
</table>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">m1</td><td>vector representation of an unsigned 256-bit integer. </td></tr>
    <tr><td class="paramname">m2</td><td>vector representation of an unsigned 256-bit integer. </td></tr>
  </table>
  </dd>
</dl>
<dl class="section return"><dt>Returns</dt><dd>homogeneous aggregate representation of the unsigned 512-bit product of m1 * m2. </dd></dl>

</div>
</div>
<a id="a92120de408d445766efcd709d73840d9"></a>
<h2 class="memtitle"><span class="permalink"><a href="#a92120de408d445766efcd709d73840d9">&#9670;&nbsp;</a></span>vec_mul256x256_inline()</h2>

<div class="memitem">
<div class="memproto">
<table class="mlabels">
  <tr>
  <td class="mlabels-left">
      <table class="memname">
        <tr>
          <td class="memname">static <a class="el" href="struct____VEC__U__512.html">__VEC_U_512</a> vec_mul256x256_inline </td>
          <td>(</td>
          <td class="paramtype"><a class="el" href="struct____VEC__U__256.html">__VEC_U_256</a>&#160;</td>
          <td class="paramname"><em>m1</em>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype"><a class="el" href="struct____VEC__U__256.html">__VEC_U_256</a>&#160;</td>
          <td class="paramname"><em>m2</em>&#160;</td>
        </tr>
        <tr>
          <td></td>
          <td>)</td>
          <td></td><td></td>
        </tr>
      </table>
  </td>
  <td class="mlabels-right">
<span class="mlabels"><span class="mlabel">inline</span><span class="mlabel">static</span></span>  </td>
  </tr>
</table>
</div><div class="memdoc">

<p>Vector 256x256-bit Unsigned Integer Multiply. </p>
<p>Compute the 512-bit product of two 256-bit values m1, m2. The product is returned as a single 512-bit integer in a homogeneous aggregate structure.</p>
<dl class="section note"><dt>Note</dt><dd>Using the Multiply-Add form which applies the addend early reduces the live ranges for registers passing partial products for larger multiple precision multiplies. </dd>
<dd>
We use the COMPILER_FENCE to limit instruction scheduling and code motion to smaller code blocks. This in turn reduces register pressure and avoids generating spill code.</dd></dl>
<table class="markdownTable">
<tr class="markdownTableHead">
<th class="markdownTableHeadRight">processor </th><th class="markdownTableHeadCenter">Latency </th><th class="markdownTableHeadLeft">Throughput  </th></tr>
<tr class="markdownTableRowOdd">
<td class="markdownTableBodyRight">power8 </td><td class="markdownTableBodyCenter">224-232 </td><td class="markdownTableBodyLeft">1/cycle  </td></tr>
<tr class="markdownTableRowEven">
<td class="markdownTableBodyRight">power9 </td><td class="markdownTableBodyCenter">132-135 </td><td class="markdownTableBodyLeft">1/cycle  </td></tr>
</table>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">m1</td><td>vector representation of an unsigned 256-bit integer. </td></tr>
    <tr><td class="paramname">m2</td><td>vector representation of an unsigned 256-bit integer. </td></tr>
  </table>
  </dd>
</dl>
<dl class="section return"><dt>Returns</dt><dd>homogeneous aggregate representation of the unsigned 512-bit product of m1 * m2. </dd></dl>

</div>
</div>
<a id="a46ee75bf2ea0b7095d60f1448cf2f097"></a>
<h2 class="memtitle"><span class="permalink"><a href="#a46ee75bf2ea0b7095d60f1448cf2f097">&#9670;&nbsp;</a></span>vec_mul512_byMN()</h2>

<div class="memitem">
<div class="memproto">
      <table class="memname">
        <tr>
          <td class="memname">void vec_mul512_byMN </td>
          <td>(</td>
          <td class="paramtype"><a class="el" href="struct____VEC__U__512.html">__VEC_U_512</a> *&#160;</td>
          <td class="paramname"><em>p</em>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype"><a class="el" href="struct____VEC__U__512.html">__VEC_U_512</a> *&#160;</td>
          <td class="paramname"><em>m1</em>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype"><a class="el" href="struct____VEC__U__512.html">__VEC_U_512</a> *&#160;</td>
          <td class="paramname"><em>m2</em>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">unsigned long&#160;</td>
          <td class="paramname"><em>M</em>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">unsigned long&#160;</td>
          <td class="paramname"><em>N</em>&#160;</td>
        </tr>
        <tr>
          <td></td>
          <td>)</td>
          <td></td><td></td>
        </tr>
      </table>
</div><div class="memdoc">

<p>Vector Unsigned Integer Quadword 4xMxN Multiply. </p>
<p>Compute the 4x(M+N) quadword product of two quadword arrays m1, m2. The product is returned as the 4x(M+N) quadword array p.</p>
<dl class="section note"><dt>Note</dt><dd>This is the dynamic call ABI for IFUNC selection. The static implementations are vec_mul512_byMN_PWR8 and vec_mul512_byMN_PWR9. For static calls the <a class="el" href="vec__int512__ppc_8h.html#a77eca5d7bebe0f30894fe9669c01b7a7" title="Macro to add platform suffix for static calls.">__VEC_PWR_IMP()</a> macro will add appropriate suffix based on the compile -mcpu= option. </dd>
<dd>
The storage order for quadwords matches the system endian. On Little Endian systems the least significant quadword is quadword element 0. The most significant is quadword elements [M-1], [N-1], and [M+N-1]. On Big Endian systems the least significant quadword is quadword elements [M-1], [N-1], and [M+N-1]. The most significant is quadword element 0.</dd></dl>
<table class="markdownTable">
<tr class="markdownTableHead">
<th class="markdownTableHeadRight">processor </th><th class="markdownTableHeadCenter">Latency </th><th class="markdownTableHeadLeft">Throughput  </th></tr>
<tr class="markdownTableRowOdd">
<td class="markdownTableBodyRight">power8 </td><td class="markdownTableBodyCenter">~570*(M*N) </td><td class="markdownTableBodyLeft">1/cycle  </td></tr>
<tr class="markdownTableRowEven">
<td class="markdownTableBodyRight">power9 </td><td class="markdownTableBodyCenter">~260*(M*N) </td><td class="markdownTableBodyLeft">1/cycle  </td></tr>
</table>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">p</td><td>pointer to vector result as an unsigned (M+N)x512-bit integer in storage. </td></tr>
    <tr><td class="paramname">m1</td><td>pointer to vector representation of an unsigned Mx512-bit integer. </td></tr>
    <tr><td class="paramname">m2</td><td>pointer to vector representation of an unsigned Nx512-bit integer. </td></tr>
    <tr><td class="paramname">M</td><td>unsigned long specifying the number of 4x quadwords in m1. </td></tr>
    <tr><td class="paramname">N</td><td>unsigned long specifying the number of 4x quadwords in m2. </td></tr>
  </table>
  </dd>
</dl>

</div>
</div>
<a id="a0cfdc3e00f5e2c3a9a959969f684203e"></a>
<h2 class="memtitle"><span class="permalink"><a href="#a0cfdc3e00f5e2c3a9a959969f684203e">&#9670;&nbsp;</a></span>vec_mul512x128()</h2>

<div class="memitem">
<div class="memproto">
      <table class="memname">
        <tr>
          <td class="memname"><a class="el" href="struct____VEC__U__640.html">__VEC_U_640</a> vec_mul512x128 </td>
          <td>(</td>
          <td class="paramtype"><a class="el" href="struct____VEC__U__512.html">__VEC_U_512</a>&#160;</td>
          <td class="paramname"><em>m1</em>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype"><a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>&#160;</td>
          <td class="paramname"><em>m2</em>&#160;</td>
        </tr>
        <tr>
          <td></td>
          <td>)</td>
          <td></td><td></td>
        </tr>
      </table>
</div><div class="memdoc">

<p>Vector 512x128-bit Unsigned Integer Multiply. </p>
<p>Compute the 640-bit product of 512-bit value m1 and 128-bit value m2. The product is returned as a single 640-bit integer in a homogeneous aggregate structure.</p>
<dl class="section note"><dt>Note</dt><dd>This is the dynamic call ABI for IFUNC selection. The static implementations are vec_mul512x128_PWR8 and vec_mul512x128_PWR9. For static calls the <a class="el" href="vec__int512__ppc_8h.html#a77eca5d7bebe0f30894fe9669c01b7a7" title="Macro to add platform suffix for static calls.">__VEC_PWR_IMP()</a> macro will add appropriate suffix based on the compile -mcpu= option.</dd></dl>
<table class="markdownTable">
<tr class="markdownTableHead">
<th class="markdownTableHeadRight">processor </th><th class="markdownTableHeadCenter">Latency </th><th class="markdownTableHeadLeft">Throughput  </th></tr>
<tr class="markdownTableRowOdd">
<td class="markdownTableBodyRight">power8 </td><td class="markdownTableBodyCenter">224-232 </td><td class="markdownTableBodyLeft">1/cycle  </td></tr>
<tr class="markdownTableRowEven">
<td class="markdownTableBodyRight">power9 </td><td class="markdownTableBodyCenter">132-135 </td><td class="markdownTableBodyLeft">1/cycle  </td></tr>
</table>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">m1</td><td>vector representation of an unsigned 512-bit integer. </td></tr>
    <tr><td class="paramname">m2</td><td>vector representation of an unsigned 128-bit integer. </td></tr>
  </table>
  </dd>
</dl>
<dl class="section return"><dt>Returns</dt><dd>homogeneous aggregate representation of the unsigned 640-bit product of m1 * m2. </dd></dl>

</div>
</div>
<a id="ad10d62fe43f329c396b6486402f4f7af"></a>
<h2 class="memtitle"><span class="permalink"><a href="#ad10d62fe43f329c396b6486402f4f7af">&#9670;&nbsp;</a></span>vec_mul512x128_inline()</h2>

<div class="memitem">
<div class="memproto">
<table class="mlabels">
  <tr>
  <td class="mlabels-left">
      <table class="memname">
        <tr>
          <td class="memname">static <a class="el" href="struct____VEC__U__640.html">__VEC_U_640</a> vec_mul512x128_inline </td>
          <td>(</td>
          <td class="paramtype"><a class="el" href="struct____VEC__U__512.html">__VEC_U_512</a>&#160;</td>
          <td class="paramname"><em>m1</em>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype"><a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>&#160;</td>
          <td class="paramname"><em>m2</em>&#160;</td>
        </tr>
        <tr>
          <td></td>
          <td>)</td>
          <td></td><td></td>
        </tr>
      </table>
  </td>
  <td class="mlabels-right">
<span class="mlabels"><span class="mlabel">inline</span><span class="mlabel">static</span></span>  </td>
  </tr>
</table>
</div><div class="memdoc">

<p>Vector 512x128-bit Unsigned Integer Multiply. </p>
<p>Compute the 640-bit product of 512-bit value m1 and 128-bit value m2. The product is returned as a single 640-bit integer in a homogeneous aggregate structure.</p>
<dl class="section note"><dt>Note</dt><dd>Using the Multiply-Add form which applies the addend early reduces the live ranges for registers passing partial products for larger multiple precision multiplies. </dd>
<dd>
We use the COMPILER_FENCE to limit instruction scheduling and code motion to smaller code blocks. This in turn reduces register pressure and avoids generating spill code.</dd></dl>
<table class="markdownTable">
<tr class="markdownTableHead">
<th class="markdownTableHeadRight">processor </th><th class="markdownTableHeadCenter">Latency </th><th class="markdownTableHeadLeft">Throughput  </th></tr>
<tr class="markdownTableRowOdd">
<td class="markdownTableBodyRight">power8 </td><td class="markdownTableBodyCenter">224-232 </td><td class="markdownTableBodyLeft">1/cycle  </td></tr>
<tr class="markdownTableRowEven">
<td class="markdownTableBodyRight">power9 </td><td class="markdownTableBodyCenter">132-135 </td><td class="markdownTableBodyLeft">1/cycle  </td></tr>
</table>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">m1</td><td>vector representation of an unsigned 512-bit integer. </td></tr>
    <tr><td class="paramname">m2</td><td>vector representation of an unsigned 128-bit integer. </td></tr>
  </table>
  </dd>
</dl>
<dl class="section return"><dt>Returns</dt><dd>homogeneous aggregate representation of the unsigned 640-bit product of m1 * m2. </dd></dl>

</div>
</div>
<a id="a56a5da10870d9878e2ab888d3c4d2e7b"></a>
<h2 class="memtitle"><span class="permalink"><a href="#a56a5da10870d9878e2ab888d3c4d2e7b">&#9670;&nbsp;</a></span>vec_mul512x512()</h2>

<div class="memitem">
<div class="memproto">
      <table class="memname">
        <tr>
          <td class="memname"><a class="el" href="struct____VEC__U__1024.html">__VEC_U_1024</a> vec_mul512x512 </td>
          <td>(</td>
          <td class="paramtype"><a class="el" href="struct____VEC__U__512.html">__VEC_U_512</a>&#160;</td>
          <td class="paramname"><em>m1</em>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype"><a class="el" href="struct____VEC__U__512.html">__VEC_U_512</a>&#160;</td>
          <td class="paramname"><em>m2</em>&#160;</td>
        </tr>
        <tr>
          <td></td>
          <td>)</td>
          <td></td><td></td>
        </tr>
      </table>
</div><div class="memdoc">

<p>Vector 512x512-bit Unsigned Integer Multiply. </p>
<p>Compute the 1024-bit product of 512-bit values m1 and m2. The product is returned as a single 1024-bit integer in a homogeneous aggregate structure.</p>
<dl class="section note"><dt>Note</dt><dd>This is the dynamic call ABI for IFUNC selection. The static implementations are vec_mul512x512_PWR8 and vec_mul512x512_PWR9. For static calls the <a class="el" href="vec__int512__ppc_8h.html#a77eca5d7bebe0f30894fe9669c01b7a7" title="Macro to add platform suffix for static calls.">__VEC_PWR_IMP()</a> macro will add appropriate suffix based on the compile -mcpu= option.</dd></dl>
<table class="markdownTable">
<tr class="markdownTableHead">
<th class="markdownTableHeadRight">processor </th><th class="markdownTableHeadCenter">Latency </th><th class="markdownTableHeadLeft">Throughput  </th></tr>
<tr class="markdownTableRowOdd">
<td class="markdownTableBodyRight">power8 </td><td class="markdownTableBodyCenter">~600 </td><td class="markdownTableBodyLeft">1/cycle  </td></tr>
<tr class="markdownTableRowEven">
<td class="markdownTableBodyRight">power9 </td><td class="markdownTableBodyCenter">~210 </td><td class="markdownTableBodyLeft">1/cycle  </td></tr>
</table>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">m1</td><td>vector representation of an unsigned 512-bit integer. </td></tr>
    <tr><td class="paramname">m2</td><td>vector representation of an unsigned 512-bit integer. </td></tr>
  </table>
  </dd>
</dl>
<dl class="section return"><dt>Returns</dt><dd>homogeneous aggregate representation of the unsigned 1024-bit product of m1 * m2. </dd></dl>

</div>
</div>
<a id="a6f917597902625a8218ba41fc6dca426"></a>
<h2 class="memtitle"><span class="permalink"><a href="#a6f917597902625a8218ba41fc6dca426">&#9670;&nbsp;</a></span>vec_mul512x512_inline()</h2>

<div class="memitem">
<div class="memproto">
<table class="mlabels">
  <tr>
  <td class="mlabels-left">
      <table class="memname">
        <tr>
          <td class="memname">static <a class="el" href="struct____VEC__U__1024.html">__VEC_U_1024</a> vec_mul512x512_inline </td>
          <td>(</td>
          <td class="paramtype"><a class="el" href="struct____VEC__U__512.html">__VEC_U_512</a>&#160;</td>
          <td class="paramname"><em>m1</em>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype"><a class="el" href="struct____VEC__U__512.html">__VEC_U_512</a>&#160;</td>
          <td class="paramname"><em>m2</em>&#160;</td>
        </tr>
        <tr>
          <td></td>
          <td>)</td>
          <td></td><td></td>
        </tr>
      </table>
  </td>
  <td class="mlabels-right">
<span class="mlabels"><span class="mlabel">inline</span><span class="mlabel">static</span></span>  </td>
  </tr>
</table>
</div><div class="memdoc">

<p>Vector 512x512-bit Unsigned Integer Multiply. </p>
<p>Compute the 1024-bit product of 512-bit values m1 and m2. The product is returned as a single 1024-bit integer in a homogeneous aggregate structure.</p>
<dl class="section note"><dt>Note</dt><dd>We use the COMPILER_FENCE to limit instruction scheduling and code motion to smaller code blocks. This in turn reduces register pressure and avoids generating spill code.</dd>
<dd>
Using the Multiply-Add form which applies the addend early reduces the live ranges for registers passing partial products for larger multiple precision multiplies.</dd></dl>
<table class="markdownTable">
<tr class="markdownTableHead">
<th class="markdownTableHeadRight">processor </th><th class="markdownTableHeadCenter">Latency </th><th class="markdownTableHeadLeft">Throughput  </th></tr>
<tr class="markdownTableRowOdd">
<td class="markdownTableBodyRight">power8 </td><td class="markdownTableBodyCenter">~600 </td><td class="markdownTableBodyLeft">1/cycle  </td></tr>
<tr class="markdownTableRowEven">
<td class="markdownTableBodyRight">power9 </td><td class="markdownTableBodyCenter">~210 </td><td class="markdownTableBodyLeft">1/cycle  </td></tr>
</table>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">m1</td><td>vector representation of an unsigned 512-bit integer. </td></tr>
    <tr><td class="paramname">m2</td><td>vector representation of an unsigned 512-bit integer. </td></tr>
  </table>
  </dd>
</dl>
<dl class="section return"><dt>Returns</dt><dd>homogeneous aggregate representation of the unsigned 1024-bit product of m1 * m2. </dd></dl>

</div>
</div>
</div><!-- contents -->
<div class="ttc" id="avec__int512__ppc_8h_html_a77eca5d7bebe0f30894fe9669c01b7a7"><div class="ttname"><a href="vec__int512__ppc_8h.html#a77eca5d7bebe0f30894fe9669c01b7a7">__VEC_PWR_IMP</a></div><div class="ttdeci">#define __VEC_PWR_IMP(FNAME)</div><div class="ttdoc">Macro to add platform suffix for static calls.</div><div class="ttdef"><b>Definition:</b> vec_int512_ppc.h:1357</div></div>
<div class="ttc" id="astruct____VEC__U__512_html"><div class="ttname"><a href="struct____VEC__U__512.html">__VEC_U_512</a></div><div class="ttdoc">A vector representation of a 512-bit unsigned integer.</div><div class="ttdef"><b>Definition:</b> vec_int512_ppc.h:842</div></div>
<div class="ttc" id="avec__int512__ppc_8h_html_ab5b80fd9694cea8bf502b26e55af37f7"><div class="ttname"><a href="vec__int512__ppc_8h.html#ab5b80fd9694cea8bf502b26e55af37f7">vec_mul128x128</a></div><div class="ttdeci">__VEC_U_256 vec_mul128x128(vui128_t m1, vui128_t m2)</div><div class="ttdoc">Vector 128x128bit Unsigned Integer Multiply.</div></div>
<div class="ttc" id="astruct____VEC__U__256_html"><div class="ttname"><a href="struct____VEC__U__256.html">__VEC_U_256</a></div><div class="ttdoc">A vector representation of a 256-bit unsigned integer.</div><div class="ttdef"><b>Definition:</b> vec_int512_ppc.h:823</div></div>
<div class="ttc" id="avec__int128__ppc_8h_html_a0f75e65180e68c4753f3d9c2f42d1a31"><div class="ttname"><a href="vec__int128__ppc_8h.html#a0f75e65180e68c4753f3d9c2f42d1a31">CONST_VUINT128_QxW</a></div><div class="ttdeci">#define CONST_VUINT128_QxW(__q0, __q1, __q2, __q3)</div><div class="ttdoc">Generate a vector unsigned __int128 constant from words.</div><div class="ttdef"><b>Definition:</b> vec_int128_ppc.h:2338</div></div>
<div class="ttc" id="avec__int512__ppc_8h_html_a958e029fc824ec3a73ad9550bf7ea506"><div class="ttname"><a href="vec__int512__ppc_8h.html#a958e029fc824ec3a73ad9550bf7ea506">vec_mul128x128_inline</a></div><div class="ttdeci">static __VEC_U_256 vec_mul128x128_inline(vui128_t a, vui128_t b)</div><div class="ttdoc">Vector 128x128bit Unsigned Integer Multiply.</div><div class="ttdef"><b>Definition:</b> vec_int512_ppc.h:1574</div></div>
<div class="ttc" id="avec__int128__ppc_8h_html"><div class="ttname"><a href="vec__int128__ppc_8h.html">vec_int128_ppc.h</a></div><div class="ttdoc">Header package containing a collection of 128-bit computation functions implemented with PowerISA VMX...</div></div>
<div class="ttc" id="avec__common__ppc_8h_html_aaf7a8e92d8ba681dac3d2ec3259c0820"><div class="ttname"><a href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a></div><div class="ttdeci">__vector unsigned __int128 vui128_t</div><div class="ttdoc">vector of one 128-bit unsigned __int128 element.</div><div class="ttdef"><b>Definition:</b> vec_common_ppc.h:237</div></div>
<div class="ttc" id="avec__int512__ppc_8h_html_aac9d31829610c29b0f5558bfb1f18e4a"><div class="ttname"><a href="vec__int512__ppc_8h.html#aac9d31829610c29b0f5558bfb1f18e4a">CONST_VINT512_Q</a></div><div class="ttdeci">#define CONST_VINT512_Q(__q0, __q1, __q2, __q3)</div><div class="ttdoc">Generate a 512-bit vector unsigned integer constant from 4 x quadword constants.</div><div class="ttdef"><b>Definition:</b> vec_int512_ppc.h:811</div></div>
<div class="ttc" id="avec__int512__ppc_8h_html"><div class="ttname"><a href="vec__int512__ppc_8h.html">vec_int512_ppc.h</a></div><div class="ttdoc">Header package containing a collection of multiple precision quadword integer computation functions i...</div></div>
<!-- start footer part -->
<hr class="footer"/><address class="footer"><small>
Generated on Sun Jul 10 2022 14:57:44 for POWER Vector Library Manual by &#160;<a href="http://www.doxygen.org/index.html">
<img class="footer" src="doxygen.png" alt="doxygen"/>
</a> 1.8.17
</small></address>
</body>
</html>