From 6a70eec318bc8a1a47370a1ebab802ff4cbc2814 Mon Sep 17 00:00:00 2001 From: WO Date: Tue, 17 Jul 2018 19:33:18 +0900 Subject: [PATCH] Add yespower algorithm Default algo is changed yespower 0.5 mode for "a few percent speedup" which is compatible with yescrypt. --- Makefile.am | 4 +- README.txt | 5 + cpu-miner.c | 57 +-- insecure_memzero.h | 1 + miner.h | 3 + sha256.c | 647 ++++++++++++++++++++++++ sha256.h | 121 +++++ sha256_Y.c | 2 + winbuild-cross.sh | 63 +++ yespower-opt.c | 1148 +++++++++++++++++++++++++++++++++++++++++++ yespower-platform.c | 107 ++++ yespower-ref.c | 580 ++++++++++++++++++++++ yespower.c | 68 +++ yespower.h | 130 +++++ 14 files changed, 2888 insertions(+), 48 deletions(-) create mode 100644 README.txt create mode 100644 insecure_memzero.h create mode 100644 sha256.c create mode 100644 sha256.h create mode 100755 winbuild-cross.sh create mode 100644 yespower-opt.c create mode 100644 yespower-platform.c create mode 100644 yespower-ref.c create mode 100644 yespower.c create mode 100644 yespower.h diff --git a/Makefile.am b/Makefile.am index 454e2ff2f..b314ca7d9 100644 --- a/Makefile.am +++ b/Makefile.am @@ -17,8 +17,8 @@ dist_man_MANS = minerd.1 minerd_SOURCES = elist.h miner.h compat.h \ cpu-miner.c util.c \ - sha2.c scrypt.c \ - yescrypt.c yescrypt.h + yescrypt.c yescrypt.h \ + sha2.c sha256.c yespower.c yespower.h yespower-opt.c if USE_ASM if ARCH_x86 minerd_SOURCES += sha2-x86.S scrypt-x86.S diff --git a/README.txt b/README.txt new file mode 100644 index 000000000..87d54b57a --- /dev/null +++ b/README.txt @@ -0,0 +1,5 @@ +minerd-sse2.exe Core2, Nehalem +minerd-aes-sse42.exe Westmere, Sandy-Ivybridge +minerd-avx.exe Sandy-Ivybridge +minerd-avx2.exe Haswell, Sky-Kaby-Coffeelake +minerd-avx2-sha.exe Ryzen diff --git a/cpu-miner.c b/cpu-miner.c index 4503aaf6d..b84fd0dd6 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -20,6 +20,7 @@ #include #include #ifdef WIN32 +#include #include #else #include @@ -101,15 +102,13 @@ struct workio_cmd { }; enum algos { - ALGO_SCRYPT, /* scrypt(1024,1,1) */ - ALGO_SHA256D, /* SHA-256d */ - ALGO_YESCRYPT, + ALGO_YESCRYPT, + ALGO_YESPOWER, }; static const char *algo_names[] = { - [ALGO_SCRYPT] = "scrypt", - [ALGO_SHA256D] = "sha256d", [ALGO_YESCRYPT] = "yescrypt", + [ALGO_YESPOWER] = "yespower", }; bool opt_debug = false; @@ -131,8 +130,7 @@ static int opt_fail_pause = 30; int opt_timeout = 0; static int opt_scantime = 5; static const bool opt_time = true; -static enum algos opt_algo = ALGO_YESCRYPT; -static int opt_scrypt_n = 1024; +static enum algos opt_algo = ALGO_YESPOWER; static int opt_n_threads; static int num_processors; static char *rpc_url; @@ -173,10 +171,8 @@ static char const usage[] = "\ Usage: " PROGRAM_NAME " [OPTIONS]\n\ Options:\n\ -a, --algo=ALGO specify the algorithm to use\n\ - scrypt scrypt(1024, 1, 1)\n\ - scrypt:N scrypt(N, 1, 1)\n\ - sha256d SHA-256d\n\ - yescrypt yescrypt (default)\n\ + yescrypt yescrypt\n\ + yespower yespower 0.5 (default)\n\ -o, --url=URL URL of mining server\n\ -O, --userpass=U:P username:password pair for mining server\n\ -u, --user=USERNAME username for mining server\n\ @@ -1068,7 +1064,7 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work) free(xnonce2str); } - if (opt_algo == ALGO_SCRYPT || opt_algo == ALGO_YESCRYPT) + if (opt_algo == ALGO_YESCRYPT || opt_algo == ALGO_YESPOWER) diff_to_target(work->target, sctx->job.diff / 65536.0); else diff_to_target(work->target, sctx->job.diff); @@ -1083,7 +1079,6 @@ static void *miner_thread(void *userdata) struct work work = 
{{0}}; uint32_t max_nonce; uint32_t end_nonce = 0xffffffffU / opt_n_threads * (thr_id + 1) - 0x20; - unsigned char *scratchbuf = NULL; char s[16]; int i; @@ -1104,15 +1099,6 @@ static void *miner_thread(void *userdata) affine_to_cpu(thr_id, thr_id % num_processors); } - if (opt_algo == ALGO_SCRYPT) { - scratchbuf = scrypt_buffer_alloc(opt_scrypt_n); - if (!scratchbuf) { - applog(LOG_ERR, "scrypt buffer allocation failed"); - pthread_mutex_lock(&applog_lock); - exit(1); - } - } - while (1) { unsigned long hashes_done; struct timeval tv_start, tv_end, diff; @@ -1163,15 +1149,10 @@ static void *miner_thread(void *userdata) max64 *= thr_hashrates[thr_id]; if (max64 <= 0) { switch (opt_algo) { - case ALGO_SCRYPT: - max64 = opt_scrypt_n < 16 ? 0x3ffff : 0x3fffff / opt_scrypt_n; - break; case ALGO_YESCRYPT: + case ALGO_YESPOWER: max64 = 0x3fffff; break; - case ALGO_SHA256D: - max64 = 0x1fffff; - break; } } if (work.data[19] + max64 > end_nonce) @@ -1184,21 +1165,14 @@ static void *miner_thread(void *userdata) /* scan nonces for a proof-of-work hash */ switch (opt_algo) { - case ALGO_SCRYPT: - rc = scanhash_scrypt(thr_id, work.data, scratchbuf, work.target, - max_nonce, &hashes_done, opt_scrypt_n); - break; - case ALGO_YESCRYPT: rc = scanhash_yescrypt(thr_id, work.data, work.target, max_nonce, &hashes_done); break; - - case ALGO_SHA256D: - rc = scanhash_sha256d(thr_id, work.data, work.target, + case ALGO_YESPOWER: + rc = scanhash_yespower(thr_id, work.data, work.target, max_nonce, &hashes_done); break; - default: /* should never happen */ goto out; @@ -1521,15 +1495,6 @@ static void parse_arg(int key, char *arg, char *pname) opt_algo = i; break; } - if (arg[v] == ':' && i == ALGO_SCRYPT) { - char *ep; - v = strtol(arg+v+1, &ep, 10); - if (*ep || v & (v-1) || v < 2) - continue; - opt_algo = i; - opt_scrypt_n = v; - break; - } } } if (i == ARRAY_SIZE(algo_names)) { diff --git a/insecure_memzero.h b/insecure_memzero.h new file mode 100644 index 000000000..5a0ba75c4 --- /dev/null +++ b/insecure_memzero.h @@ -0,0 +1 @@ +#define insecure_memzero(buf, len) /* empty */ diff --git a/miner.h b/miner.h index bc4b4eebe..440bc9a82 100644 --- a/miner.h +++ b/miner.h @@ -157,6 +157,9 @@ extern int scanhash_sha256d(int thr_id, uint32_t *pdata, extern int scanhash_yescrypt(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_yespower(int thr_id, uint32_t *pdata, + const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done); + extern unsigned char *scrypt_buffer_alloc(int N); extern int scanhash_scrypt(int thr_id, uint32_t *pdata, unsigned char *scratchbuf, const uint32_t *ptarget, diff --git a/sha256.c b/sha256.c new file mode 100644 index 000000000..0d878a130 --- /dev/null +++ b/sha256.c @@ -0,0 +1,647 @@ +/*- + * Copyright 2005-2016 Colin Percival + * Copyright 2016-2018 Alexander Peslyak + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
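The hunks above declare scanhash_yespower() in miner.h and make miner_thread() dispatch to it, but its definition lives in the new yespower.c further down this patch. For orientation, here is a minimal sketch of that glue. The YESPOWER_0_5, N=2048, r=8 parameters are assumed (a yescrypt-0.5-compatible choice, per the commit message) and may not match the patch exactly; the sketch also leans on the existing cpuminer helpers be32enc()/le32dec() from sysendian.h and fulltest()/work_restart[] from miner.h:

#include <stdlib.h>
#include "miner.h"
#include "sysendian.h"
#include "yespower.h"

int scanhash_yespower(int thr_id, uint32_t *pdata,
	const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done)
{
	/* Assumed yescrypt-0.5-compatible defaults; the real values are set in yespower.c. */
	static const yespower_params_t params = {
		.version = YESPOWER_0_5,
		.N = 2048,
		.r = 8,
		.pers = NULL,
		.perslen = 0
	};
	union {
		yespower_binary_t yb;	/* 256-bit yespower output */
		uint32_t u32[8];
	} hash;
	uint32_t data[20];		/* 80-byte block header */
	uint32_t n = pdata[19] - 1;
	const uint32_t Htarg = ptarget[7];
	int i;

	/* The block header is hashed in big-endian form. */
	for (i = 0; i < 19; i++)
		be32enc(&data[i], pdata[i]);

	do {
		be32enc(&data[19], ++n);	/* try the next nonce */

		if (yespower_tls((uint8_t *)data, 80, &params, &hash.yb))
			abort();		/* allocation failure is fatal */

		/* Cheap check on the most significant word before the full test. */
		if (le32dec(&hash.u32[7]) <= Htarg) {
			for (i = 0; i < 8; i++)
				hash.u32[i] = le32dec(&hash.u32[i]);
			if (fulltest(hash.u32, ptarget)) {
				*hashes_done = n - pdata[19] + 1;
				pdata[19] = n;
				return 1;
			}
		}
	} while (n < max_nonce && !work_restart[thr_id].restart);

	*hashes_done = n - pdata[19] + 1;
	pdata[19] = n;
	return 0;
}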
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include +#include + +#include "insecure_memzero.h" +#include "sysendian.h" +#include "miner.h" + +#include "sha256.h" + +#ifdef __ICC +/* Miscompile with icc 14.0.0 (at least), so don't use restrict there */ +#define restrict +#elif __STDC_VERSION__ >= 199901L +/* Have restrict */ +#elif defined(__GNUC__) +#define restrict __restrict +#else +#define restrict +#endif + +/* + * Encode a length len*2 vector of (uint32_t) into a length len*8 vector of + * (uint8_t) in big-endian form. + */ +static void +be32enc_vect(uint8_t * dst, const uint32_t * src, size_t len) +{ + + /* Encode vector, two words at a time. */ + do { + be32enc(&dst[0], src[0]); + be32enc(&dst[4], src[1]); + src += 2; + dst += 8; + } while (--len); +} + +/* + * Decode a big-endian length len*8 vector of (uint8_t) into a length + * len*2 vector of (uint32_t). + */ +static void +be32dec_vect(uint32_t * dst, const uint8_t * src, size_t len) +{ + + /* Decode vector, two words at a time. */ + do { + dst[0] = be32dec(&src[0]); + dst[1] = be32dec(&src[4]); + src += 8; + dst += 2; + } while (--len); +} + +/* SHA256 round constants. 
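   These are the first 32 bits of the fractional parts of the cube roots of the first 64 primes (2 through 311).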
*/ +static const uint32_t Krnd[64] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, + 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, + 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, + 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, + 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, + 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, + 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, + 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, + 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 +}; + +/* Elementary functions used by SHA256 */ +#define Ch(x, y, z) ((x & (y ^ z)) ^ z) +#define Maj(x, y, z) ((x & (y | z)) | (y & z)) +#define SHR(x, n) (x >> n) +#define ROTR(x, n) ((x >> n) | (x << (32 - n))) +#define S0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22)) +#define S1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25)) +#define s0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ SHR(x, 3)) +#define s1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ SHR(x, 10)) + +/* SHA256 round function */ +#define RND(a, b, c, d, e, f, g, h, k) \ + h += S1(e) + Ch(e, f, g) + k; \ + d += h; \ + h += S0(a) + Maj(a, b, c); + +/* Adjusted round function for rotating state */ +#define RNDr(S, W, i, ii) \ + RND(S[(64 - i) % 8], S[(65 - i) % 8], \ + S[(66 - i) % 8], S[(67 - i) % 8], \ + S[(68 - i) % 8], S[(69 - i) % 8], \ + S[(70 - i) % 8], S[(71 - i) % 8], \ + W[i + ii] + Krnd[i + ii]) + +/* Message schedule computation */ +#define MSCH(W, ii, i) \ + W[i + ii + 16] = s1(W[i + ii + 14]) + W[i + ii + 9] + s0(W[i + ii + 1]) + W[i + ii] + +/* + * SHA256 block compression function. The 256-bit state is transformed via + * the 512-bit input block to produce a new state. + */ +static void +SHA256_Transform(uint32_t state[static restrict 8], + const uint8_t block[static restrict 64], + uint32_t W[static restrict 64], uint32_t S[static restrict 8]) +{ + int i; + + /* 1. Prepare the first part of the message schedule W. */ + be32dec_vect(W, block, 8); + + /* 2. Initialize working variables. */ + memcpy(S, state, 32); + + /* 3. Mix. */ + for (i = 0; i < 64; i += 16) { + RNDr(S, W, 0, i); + RNDr(S, W, 1, i); + RNDr(S, W, 2, i); + RNDr(S, W, 3, i); + RNDr(S, W, 4, i); + RNDr(S, W, 5, i); + RNDr(S, W, 6, i); + RNDr(S, W, 7, i); + RNDr(S, W, 8, i); + RNDr(S, W, 9, i); + RNDr(S, W, 10, i); + RNDr(S, W, 11, i); + RNDr(S, W, 12, i); + RNDr(S, W, 13, i); + RNDr(S, W, 14, i); + RNDr(S, W, 15, i); + + if (i == 48) + break; + MSCH(W, 0, i); + MSCH(W, 1, i); + MSCH(W, 2, i); + MSCH(W, 3, i); + MSCH(W, 4, i); + MSCH(W, 5, i); + MSCH(W, 6, i); + MSCH(W, 7, i); + MSCH(W, 8, i); + MSCH(W, 9, i); + MSCH(W, 10, i); + MSCH(W, 11, i); + MSCH(W, 12, i); + MSCH(W, 13, i); + MSCH(W, 14, i); + MSCH(W, 15, i); + } + + /* 4. Mix local working variables into global state. */ + state[0] += S[0]; + state[1] += S[1]; + state[2] += S[2]; + state[3] += S[3]; + state[4] += S[4]; + state[5] += S[5]; + state[6] += S[6]; + state[7] += S[7]; +} + +static const uint8_t PAD[64] = { + 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +/* Add padding and terminating bit-count. 
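   Per FIPS 180-4: append a single 0x80 byte, add zero bytes until the length is congruent to 56 mod 64 (starting a new block if necessary), then append the original message length in bits as a 64-bit big-endian value.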
*/ +static void +SHA256_Pad(SHA256_CTX * ctx, uint32_t tmp32[static restrict 72]) +{ + size_t r; + + /* Figure out how many bytes we have buffered. */ + r = (ctx->count >> 3) & 0x3f; + + /* Pad to 56 mod 64, transforming if we finish a block en route. */ + if (r < 56) { + /* Pad to 56 mod 64. */ + memcpy(&ctx->buf[r], PAD, 56 - r); + } else { + /* Finish the current block and mix. */ + memcpy(&ctx->buf[r], PAD, 64 - r); + SHA256_Transform(ctx->state, ctx->buf, &tmp32[0], &tmp32[64]); + + /* The start of the final block is all zeroes. */ + memset(&ctx->buf[0], 0, 56); + } + + /* Add the terminating bit-count. */ + be64enc(&ctx->buf[56], ctx->count); + + /* Mix in the final block. */ + SHA256_Transform(ctx->state, ctx->buf, &tmp32[0], &tmp32[64]); +} + +/* Magic initialization constants. */ +static const uint32_t initial_state[8] = { + 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, + 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 +}; + +/** + * SHA256_Init(ctx): + * Initialize the SHA256 context ${ctx}. + */ +void +SHA256_Init(SHA256_CTX * ctx) +{ + + /* Zero bits processed so far. */ + ctx->count = 0; + + /* Initialize state. */ + memcpy(ctx->state, initial_state, sizeof(initial_state)); +} + +/** + * SHA256_Update(ctx, in, len): + * Input ${len} bytes from ${in} into the SHA256 context ${ctx}. + */ +static void +_SHA256_Update(SHA256_CTX * ctx, const void * in, size_t len, + uint32_t tmp32[static restrict 72]) +{ + uint32_t r; + const uint8_t * src = in; + + /* Return immediately if we have nothing to do. */ + if (len == 0) + return; + + /* Number of bytes left in the buffer from previous updates. */ + r = (ctx->count >> 3) & 0x3f; + + /* Update number of bits. */ + ctx->count += (uint64_t)(len) << 3; + + /* Handle the case where we don't need to perform any transforms. */ + if (len < 64 - r) { + memcpy(&ctx->buf[r], src, len); + return; + } + + /* Finish the current block. */ + memcpy(&ctx->buf[r], src, 64 - r); + SHA256_Transform(ctx->state, ctx->buf, &tmp32[0], &tmp32[64]); + src += 64 - r; + len -= 64 - r; + + /* Perform complete blocks. */ + while (len >= 64) { + SHA256_Transform(ctx->state, src, &tmp32[0], &tmp32[64]); + src += 64; + len -= 64; + } + + /* Copy left over data into buffer. */ + memcpy(ctx->buf, src, len); +} + +/* Wrapper function for intermediate-values sanitization. */ +void +SHA256_Update(SHA256_CTX * ctx, const void * in, size_t len) +{ + uint32_t tmp32[72]; + + /* Call the real function. */ + _SHA256_Update(ctx, in, len, tmp32); + + /* Clean the stack. */ + insecure_memzero(tmp32, 288); +} + +/** + * SHA256_Final(digest, ctx): + * Output the SHA256 hash of the data input to the context ${ctx} into the + * buffer ${digest}. + */ +static void +_SHA256_Final(uint8_t digest[32], SHA256_CTX * ctx, + uint32_t tmp32[static restrict 72]) +{ + + /* Add padding. */ + SHA256_Pad(ctx, tmp32); + + /* Write the hash. */ + be32enc_vect(digest, ctx->state, 4); +} + +/* Wrapper function for intermediate-values sanitization. */ +void +SHA256_Final(uint8_t digest[32], SHA256_CTX * ctx) +{ + uint32_t tmp32[72]; + + /* Call the real function. */ + _SHA256_Final(digest, ctx, tmp32); + + /* Clear the context state. */ + insecure_memzero(ctx, sizeof(SHA256_CTX)); + + /* Clean the stack. */ + insecure_memzero(tmp32, 288); +} + +/** + * SHA256_Buf(in, len, digest): + * Compute the SHA256 hash of ${len} bytes from ${in} and write it to ${digest}. 
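 * Example (FIPS 180-4 test vector): SHA256_Buf("abc", 3, digest) yields
 * ba7816bf8f01cfea414140de5dae2223b00361a396177a9cb410ff61f20015ad.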
+ */ +void +SHA256_Buf(const void * in, size_t len, uint8_t digest[32]) +{ + SHA256_CTX ctx; + uint32_t tmp32[72]; + + SHA256_Init(&ctx); + _SHA256_Update(&ctx, in, len, tmp32); + _SHA256_Final(digest, &ctx, tmp32); + + /* Clean the stack. */ + insecure_memzero(&ctx, sizeof(SHA256_CTX)); + insecure_memzero(tmp32, 288); +} + +/** + * HMAC_SHA256_Init(ctx, K, Klen): + * Initialize the HMAC-SHA256 context ${ctx} with ${Klen} bytes of key from + * ${K}. + */ +static void +_HMAC_SHA256_Init(HMAC_SHA256_CTX * ctx, const void * _K, size_t Klen, + uint32_t tmp32[static restrict 72], uint8_t pad[static restrict 64], + uint8_t khash[static restrict 32]) +{ + const uint8_t * K = _K; + size_t i; + + /* If Klen > 64, the key is really SHA256(K). */ + if (Klen > 64) { + SHA256_Init(&ctx->ictx); + _SHA256_Update(&ctx->ictx, K, Klen, tmp32); + _SHA256_Final(khash, &ctx->ictx, tmp32); + K = khash; + Klen = 32; + } + + /* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */ + SHA256_Init(&ctx->ictx); + memset(pad, 0x36, 64); + for (i = 0; i < Klen; i++) + pad[i] ^= K[i]; + _SHA256_Update(&ctx->ictx, pad, 64, tmp32); + + /* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */ + SHA256_Init(&ctx->octx); + memset(pad, 0x5c, 64); + for (i = 0; i < Klen; i++) + pad[i] ^= K[i]; + _SHA256_Update(&ctx->octx, pad, 64, tmp32); +} + +/* Wrapper function for intermediate-values sanitization. */ +void +HMAC_SHA256_Init(HMAC_SHA256_CTX * ctx, const void * _K, size_t Klen) +{ + uint32_t tmp32[72]; + uint8_t pad[64]; + uint8_t khash[32]; + + /* Call the real function. */ + _HMAC_SHA256_Init(ctx, _K, Klen, tmp32, pad, khash); + + /* Clean the stack. */ + insecure_memzero(tmp32, 288); + insecure_memzero(khash, 32); + insecure_memzero(pad, 64); +} + +/** + * HMAC_SHA256_Update(ctx, in, len): + * Input ${len} bytes from ${in} into the HMAC-SHA256 context ${ctx}. + */ +static void +_HMAC_SHA256_Update(HMAC_SHA256_CTX * ctx, const void * in, size_t len, + uint32_t tmp32[static restrict 72]) +{ + + /* Feed data to the inner SHA256 operation. */ + _SHA256_Update(&ctx->ictx, in, len, tmp32); +} + +/* Wrapper function for intermediate-values sanitization. */ +void +HMAC_SHA256_Update(HMAC_SHA256_CTX * ctx, const void * in, size_t len) +{ + uint32_t tmp32[72]; + + /* Call the real function. */ + _HMAC_SHA256_Update(ctx, in, len, tmp32); + + /* Clean the stack. */ + insecure_memzero(tmp32, 288); +} + +/** + * HMAC_SHA256_Final(digest, ctx): + * Output the HMAC-SHA256 of the data input to the context ${ctx} into the + * buffer ${digest}. + */ +static void +_HMAC_SHA256_Final(uint8_t digest[32], HMAC_SHA256_CTX * ctx, + uint32_t tmp32[static restrict 72], uint8_t ihash[static restrict 32]) +{ + + /* Finish the inner SHA256 operation. */ + _SHA256_Final(ihash, &ctx->ictx, tmp32); + + /* Feed the inner hash to the outer SHA256 operation. */ + _SHA256_Update(&ctx->octx, ihash, 32, tmp32); + + /* Finish the outer SHA256 operation. */ + _SHA256_Final(digest, &ctx->octx, tmp32); +} + +/* Wrapper function for intermediate-values sanitization. */ +void +HMAC_SHA256_Final(uint8_t digest[32], HMAC_SHA256_CTX * ctx) +{ + uint32_t tmp32[72]; + uint8_t ihash[32]; + + /* Call the real function. */ + _HMAC_SHA256_Final(digest, ctx, tmp32, ihash); + + /* Clean the stack. */ + insecure_memzero(tmp32, 288); + insecure_memzero(ihash, 32); +} + +/** + * HMAC_SHA256_Buf(K, Klen, in, len, digest): + * Compute the HMAC-SHA256 of ${len} bytes from ${in} using the key ${K} of + * length ${Klen}, and write the result to ${digest}. 
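 * Example (RFC 4231, test case 2): HMAC_SHA256_Buf("Jefe", 4,
 * "what do ya want for nothing?", 28, digest) yields
 * 5bdcc146bf60754e6a042426089575c75a003f089d2739839dec58b964ec3843.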
+ */ +void +HMAC_SHA256_Buf(const void * K, size_t Klen, const void * in, size_t len, + uint8_t digest[32]) +{ + HMAC_SHA256_CTX ctx; + uint32_t tmp32[72]; + uint8_t tmp8[96]; + + _HMAC_SHA256_Init(&ctx, K, Klen, tmp32, &tmp8[0], &tmp8[64]); + _HMAC_SHA256_Update(&ctx, in, len, tmp32); + _HMAC_SHA256_Final(digest, &ctx, tmp32, &tmp8[0]); + + /* Clean the stack. */ + insecure_memzero(&ctx, sizeof(HMAC_SHA256_CTX)); + insecure_memzero(tmp32, 288); + insecure_memzero(tmp8, 96); +} + +/* Add padding and terminating bit-count, but don't invoke Transform yet. */ +static int +SHA256_Pad_Almost(SHA256_CTX * ctx, uint8_t len[static restrict 8], + uint32_t tmp32[static restrict 72]) +{ + uint32_t r; + + r = (ctx->count >> 3) & 0x3f; + if (r >= 56) + return -1; + + /* + * Convert length to a vector of bytes -- we do this now rather + * than later because the length will change after we pad. + */ + be64enc(len, ctx->count); + + /* Add 1--56 bytes so that the resulting length is 56 mod 64. */ + _SHA256_Update(ctx, PAD, 56 - r, tmp32); + + /* Add the terminating bit-count. */ + ctx->buf[63] = len[7]; + _SHA256_Update(ctx, len, 7, tmp32); + + return 0; +} + +/** + * PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen): + * Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and + * write the output to buf. The value dkLen must be at most 32 * (2^32 - 1). + */ +void +PBKDF2_SHA256(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt, + size_t saltlen, uint64_t c, uint8_t * buf, size_t dkLen) +{ + HMAC_SHA256_CTX Phctx, PShctx, hctx; + uint32_t tmp32[72]; + union { + uint8_t tmp8[96]; + uint32_t state[8]; + } u; + size_t i; + uint8_t ivec[4]; + uint8_t U[32]; + uint8_t T[32]; + uint64_t j; + int k; + size_t clen; + + /* Sanity-check. */ + assert(dkLen <= 32 * (size_t)(UINT32_MAX)); + + if (c == 1 && (dkLen & 31) == 0 && (saltlen & 63) <= 51) { + uint32_t oldcount; + uint8_t * ivecp; + + /* Compute HMAC state after processing P and S. */ + _HMAC_SHA256_Init(&hctx, passwd, passwdlen, + tmp32, &u.tmp8[0], &u.tmp8[64]); + _HMAC_SHA256_Update(&hctx, salt, saltlen, tmp32); + + /* Prepare ictx padding. */ + oldcount = hctx.ictx.count & (0x3f << 3); + _HMAC_SHA256_Update(&hctx, "\0\0\0", 4, tmp32); + if ((hctx.ictx.count & (0x3f << 3)) < oldcount || + SHA256_Pad_Almost(&hctx.ictx, u.tmp8, tmp32)) + goto generic; /* Can't happen due to saltlen check */ + ivecp = hctx.ictx.buf + (oldcount >> 3); + + /* Prepare octx padding. */ + hctx.octx.count += 32 << 3; + SHA256_Pad_Almost(&hctx.octx, u.tmp8, tmp32); + + /* Iterate through the blocks. */ + for (i = 0; i * 32 < dkLen; i++) { + /* Generate INT(i + 1). */ + be32enc(ivecp, (uint32_t)(i + 1)); + + /* Compute U_1 = PRF(P, S || INT(i)). */ + memcpy(u.state, hctx.ictx.state, sizeof(u.state)); + SHA256_Transform(u.state, hctx.ictx.buf, + &tmp32[0], &tmp32[64]); + be32enc_vect(hctx.octx.buf, u.state, 4); + memcpy(u.state, hctx.octx.state, sizeof(u.state)); + SHA256_Transform(u.state, hctx.octx.buf, + &tmp32[0], &tmp32[64]); + be32enc_vect(&buf[i * 32], u.state, 4); + } + + goto cleanup; + } + +generic: + /* Compute HMAC state after processing P. */ + _HMAC_SHA256_Init(&Phctx, passwd, passwdlen, + tmp32, &u.tmp8[0], &u.tmp8[64]); + + /* Compute HMAC state after processing P and S. */ + memcpy(&PShctx, &Phctx, sizeof(HMAC_SHA256_CTX)); + _HMAC_SHA256_Update(&PShctx, salt, saltlen, tmp32); + + /* Iterate through the blocks. */ + for (i = 0; i * 32 < dkLen; i++) { + /* Generate INT(i + 1). 
*/ + be32enc(ivec, (uint32_t)(i + 1)); + + /* Compute U_1 = PRF(P, S || INT(i)). */ + memcpy(&hctx, &PShctx, sizeof(HMAC_SHA256_CTX)); + _HMAC_SHA256_Update(&hctx, ivec, 4, tmp32); + _HMAC_SHA256_Final(T, &hctx, tmp32, u.tmp8); + + if (c > 1) { + /* T_i = U_1 ... */ + memcpy(U, T, 32); + + for (j = 2; j <= c; j++) { + /* Compute U_j. */ + memcpy(&hctx, &Phctx, sizeof(HMAC_SHA256_CTX)); + _HMAC_SHA256_Update(&hctx, U, 32, tmp32); + _HMAC_SHA256_Final(U, &hctx, tmp32, u.tmp8); + + /* ... xor U_j ... */ + for (k = 0; k < 32; k++) + T[k] ^= U[k]; + } + } + + /* Copy as many bytes as necessary into buf. */ + clen = dkLen - i * 32; + if (clen > 32) + clen = 32; + memcpy(&buf[i * 32], T, clen); + } + + /* Clean the stack. */ + insecure_memzero(&Phctx, sizeof(HMAC_SHA256_CTX)); + insecure_memzero(&PShctx, sizeof(HMAC_SHA256_CTX)); + insecure_memzero(U, 32); + insecure_memzero(T, 32); + +cleanup: + insecure_memzero(&hctx, sizeof(HMAC_SHA256_CTX)); + insecure_memzero(tmp32, 288); + insecure_memzero(&u, sizeof(u)); +} diff --git a/sha256.h b/sha256.h new file mode 100644 index 000000000..3b8f1e5d1 --- /dev/null +++ b/sha256.h @@ -0,0 +1,121 @@ +/*- + * Copyright 2005-2016 Colin Percival + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _SHA256_H_ +#define _SHA256_H_ + +#include +#include + +/* + * Use #defines in order to avoid namespace collisions with anyone else's + * SHA256 code (e.g., the code in OpenSSL). + */ +#define SHA256_Init libcperciva_SHA256_Init +#define SHA256_Update libcperciva_SHA256_Update +#define SHA256_Final libcperciva_SHA256_Final +#define SHA256_Buf libcperciva_SHA256_Buf +#define SHA256_CTX libcperciva_SHA256_CTX +#define HMAC_SHA256_Init libcperciva_HMAC_SHA256_Init +#define HMAC_SHA256_Update libcperciva_HMAC_SHA256_Update +#define HMAC_SHA256_Final libcperciva_HMAC_SHA256_Final +#define HMAC_SHA256_Buf libcperciva_HMAC_SHA256_Buf +#define HMAC_SHA256_CTX libcperciva_HMAC_SHA256_CTX + +/* Context structure for SHA256 operations. */ +typedef struct { + uint32_t state[8]; + uint64_t count; + uint8_t buf[64]; +} SHA256_CTX; + +/** + * SHA256_Init(ctx): + * Initialize the SHA256 context ${ctx}. 
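 * Typical streaming use:
 *   SHA256_Init(&ctx); SHA256_Update(&ctx, msg, msglen); SHA256_Final(digest, &ctx);
 * or call SHA256_Buf() for one-shot hashing.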
+ */ +void SHA256_Init(SHA256_CTX *); + +/** + * SHA256_Update(ctx, in, len): + * Input ${len} bytes from ${in} into the SHA256 context ${ctx}. + */ +void SHA256_Update(SHA256_CTX *, const void *, size_t); + +/** + * SHA256_Final(digest, ctx): + * Output the SHA256 hash of the data input to the context ${ctx} into the + * buffer ${digest}. + */ +void SHA256_Final(uint8_t[32], SHA256_CTX *); + +/** + * SHA256_Buf(in, len, digest): + * Compute the SHA256 hash of ${len} bytes from ${in} and write it to ${digest}. + */ +void SHA256_Buf(const void *, size_t, uint8_t[32]); + +/* Context structure for HMAC-SHA256 operations. */ +typedef struct { + SHA256_CTX ictx; + SHA256_CTX octx; +} HMAC_SHA256_CTX; + +/** + * HMAC_SHA256_Init(ctx, K, Klen): + * Initialize the HMAC-SHA256 context ${ctx} with ${Klen} bytes of key from + * ${K}. + */ +void HMAC_SHA256_Init(HMAC_SHA256_CTX *, const void *, size_t); + +/** + * HMAC_SHA256_Update(ctx, in, len): + * Input ${len} bytes from ${in} into the HMAC-SHA256 context ${ctx}. + */ +void HMAC_SHA256_Update(HMAC_SHA256_CTX *, const void *, size_t); + +/** + * HMAC_SHA256_Final(digest, ctx): + * Output the HMAC-SHA256 of the data input to the context ${ctx} into the + * buffer ${digest}. + */ +void HMAC_SHA256_Final(uint8_t[32], HMAC_SHA256_CTX *); + +/** + * HMAC_SHA256_Buf(K, Klen, in, len, digest): + * Compute the HMAC-SHA256 of ${len} bytes from ${in} using the key ${K} of + * length ${Klen}, and write the result to ${digest}. + */ +void HMAC_SHA256_Buf(const void *, size_t, const void *, size_t, uint8_t[32]); + +/** + * PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen): + * Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and + * write the output to buf. The value dkLen must be at most 32 * (2^32 - 1). + */ +void PBKDF2_SHA256(const uint8_t *, size_t, const uint8_t *, size_t, + uint64_t, uint8_t *, size_t); + +#endif /* !_SHA256_H_ */ diff --git a/sha256_Y.c b/sha256_Y.c index 0cbb72ab4..811bff5a7 100755 --- a/sha256_Y.c +++ b/sha256_Y.c @@ -353,6 +353,7 @@ HMAC_SHA256_Final_Y(unsigned char digest[32], HMAC_SHA256_CTX_Y * ctx) memset(ihash, 0, 32); } +#if 0 /** * PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen): * Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and @@ -409,3 +410,4 @@ PBKDF2_SHA256(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt, /* Clean PShctx, since we never called _Final on it. 
*/ memset(&PShctx, 0, sizeof(HMAC_SHA256_CTX_Y)); } +#endif diff --git a/winbuild-cross.sh b/winbuild-cross.sh new file mode 100755 index 000000000..fac9c76dd --- /dev/null +++ b/winbuild-cross.sh @@ -0,0 +1,63 @@ +#!/bin/bash + +#CURLVER=7.61.0 +#tar xzf curl-$CURLVER.tar.gz +#ln -s curl-$CURLVER curl +#cd curl +#./configure --with-winssl --with-winidn --host=i686-w64-mingw32 +#make -j4 + +#LOCAL_LIB=`pwd` + +export LDFLAGS="-Lcurl/lib/.libs" + +#F="--with-libcurl=curl --host=i686-w64-mingw32" +F="--with-libcurl=curl --host=x86_64-w64-mingw32" +#CFLAGS="-DWIN32 -DPTW32_STATIC_LIB -O3 -fomit-frame-pointer" +CFLAGS="-Wall -O3 -fomit-frame-pointer" + +mkdir release +cp README.txt release/ +#cp /usr/i686-w64-mingw32/lib/libwinpthread-1.dll release/ +cp /usr/x86_64-w64-mingw32/lib/libwinpthread-1.dll release/ +cp curl/lib/.libs/libcurl-4.dll release/ + +make distclean || echo clean +rm -f config.status +./autogen.sh || echo done +CFLAGS="$CFLAGS -march=core-avx2 -msha" ./configure $F +make -j4 +strip -s minerd.exe +mv minerd.exe release/minerd-avx2-sha.exe + +make clean || echo clean +rm -f config.status +CFLAGS="$CFLAGS -march=core-avx2" ./configure $F +make -j4 +strip -s minerd.exe +mv minerd.exe release/minerd-avx2.exe + +make clean || echo clean +rm -f config.status +CFLAGS="$CFLAGS -march=corei7-avx" ./configure $F +make -j4 +strip -s minerd.exe +mv minerd.exe release/minerd-avx.exe + +# -march=westmere is supported in gcc5 +make clean || echo clean +rm -f config.status +CFLAGS="$CFLAGS -march=westmere" ./configure $F +#CFLAGS="$CFLAGS -maes -msse4.2" ./configure $F +make -j4 +strip -s minerd.exe +mv minerd.exe release/minerd-aes-sse42.exe + +make clean || echo clean +rm -f config.status +CFLAGS="$CFLAGS -msse2" ./configure $F +make -j4 +strip -s minerd.exe +mv minerd.exe release/minerd-sse2.exe +make clean || echo clean + diff --git a/yespower-opt.c b/yespower-opt.c new file mode 100644 index 000000000..09a1fdd82 --- /dev/null +++ b/yespower-opt.c @@ -0,0 +1,1148 @@ +/*- + * Copyright 2009 Colin Percival + * Copyright 2012-2018 Alexander Peslyak + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * This file was originally written by Colin Percival as part of the Tarsnap + * online backup system. 
+ * + * This is a proof-of-work focused fork of yescrypt, including optimized and + * cut-down implementation of the obsolete yescrypt 0.5 (based off its first + * submission to PHC back in 2014) and a new proof-of-work specific variation + * known as yespower 1.0. The former is intended as an upgrade for + * cryptocurrencies that already use yescrypt 0.5 and the latter may be used + * as a further upgrade (hard fork) by those and other cryptocurrencies. The + * version of algorithm to use is requested through parameters, allowing for + * both algorithms to co-exist in client and miner implementations (such as in + * preparation for a hard-fork). + */ + +#ifndef _YESPOWER_OPT_C_PASS_ +#define _YESPOWER_OPT_C_PASS_ 1 +#endif + +#if _YESPOWER_OPT_C_PASS_ == 1 +/* + * AVX and especially XOP speed up Salsa20 a lot, but needlessly result in + * extra instruction prefixes for pwxform (which we make more use of). While + * no slowdown from the prefixes is generally observed on AMD CPUs supporting + * XOP, some slowdown is sometimes observed on Intel CPUs with AVX. + */ +#ifdef __XOP__ +#warning "Note: XOP is enabled. That's great." +#elif defined(__AVX__) +#warning "Note: AVX is enabled. That's OK." +#elif defined(__SSE2__) +#warning "Note: AVX and XOP are not enabled. That's OK." +#elif defined(__x86_64__) || defined(__i386__) +#warning "SSE2 not enabled. Expect poor performance." +#else +#warning "Note: building generic code for non-x86. That's OK." +#endif + +/* + * The SSE4 code version has fewer instructions than the generic SSE2 version, + * but all of the instructions are SIMD, thereby wasting the scalar execution + * units. Thus, the generic SSE2 version below actually runs faster on some + * CPUs due to its balanced mix of SIMD and scalar instructions. + */ +#undef USE_SSE4_FOR_32BIT + +#ifdef __SSE2__ +/* + * GCC before 4.9 would by default unnecessarily use store/load (without + * SSE4.1) or (V)PEXTR (with SSE4.1 or AVX) instead of simply (V)MOV. + * This was tracked as GCC bug 54349. + * "-mtune=corei7" works around this, but is only supported for GCC 4.6+. + * We use inline asm for pre-4.6 GCC, further down this file. 
+ */ +#if __GNUC__ == 4 && __GNUC_MINOR__ >= 6 && __GNUC_MINOR__ < 9 && \ + !defined(__clang__) && !defined(__ICC) +#pragma GCC target ("tune=corei7") +#endif +#include +#ifdef __XOP__ +#include +#endif +#elif defined(__SSE__) +#include +#endif + +#include +#include +#include +#include + +#include "insecure_memzero.h" +#include "sha256.h" +#include "sysendian.h" +#include "miner.h" + +#include "yespower.h" + +#include "yespower-platform.c" + +#if __STDC_VERSION__ >= 199901L +/* Have restrict */ +#elif defined(__GNUC__) +#define restrict __restrict +#else +#define restrict +#endif + +#ifdef __GNUC__ +#undef unlikely +#define unlikely(exp) __builtin_expect(exp, 0) +#else +#define unlikely(exp) (exp) +#endif + +#ifdef __SSE__ +#define PREFETCH(x, hint) _mm_prefetch((const char *)(x), (hint)); +#else +#undef PREFETCH +#endif + +typedef union { + uint32_t w[16]; + uint64_t d[8]; +#ifdef __SSE2__ + __m128i q[4]; +#endif +} salsa20_blk_t; + +static inline void salsa20_simd_shuffle(const salsa20_blk_t *Bin, + salsa20_blk_t *Bout) +{ +#define COMBINE(out, in1, in2) \ + Bout->d[out] = Bin->w[in1 * 2] | ((uint64_t)Bin->w[in2 * 2 + 1] << 32); + COMBINE(0, 0, 2) + COMBINE(1, 5, 7) + COMBINE(2, 2, 4) + COMBINE(3, 7, 1) + COMBINE(4, 4, 6) + COMBINE(5, 1, 3) + COMBINE(6, 6, 0) + COMBINE(7, 3, 5) +#undef COMBINE +} + +static inline void salsa20_simd_unshuffle(const salsa20_blk_t *Bin, + salsa20_blk_t *Bout) +{ +#define UNCOMBINE(out, in1, in2) \ + Bout->w[out * 2] = Bin->d[in1]; \ + Bout->w[out * 2 + 1] = Bin->d[in2] >> 32; + UNCOMBINE(0, 0, 6) + UNCOMBINE(1, 5, 3) + UNCOMBINE(2, 2, 0) + UNCOMBINE(3, 7, 5) + UNCOMBINE(4, 4, 2) + UNCOMBINE(5, 1, 7) + UNCOMBINE(6, 6, 4) + UNCOMBINE(7, 3, 1) +#undef UNCOMBINE +} + +#ifdef __SSE2__ +#define DECL_X \ + __m128i X0, X1, X2, X3; +#define DECL_Y \ + __m128i Y0, Y1, Y2, Y3; +#define READ_X(in) \ + X0 = (in).q[0]; X1 = (in).q[1]; X2 = (in).q[2]; X3 = (in).q[3]; +#define WRITE_X(out) \ + (out).q[0] = X0; (out).q[1] = X1; (out).q[2] = X2; (out).q[3] = X3; + +#ifdef __XOP__ +#define ARX(out, in1, in2, s) \ + out = _mm_xor_si128(out, _mm_roti_epi32(_mm_add_epi32(in1, in2), s)); +#else +#define ARX(out, in1, in2, s) { \ + __m128i tmp = _mm_add_epi32(in1, in2); \ + out = _mm_xor_si128(out, _mm_slli_epi32(tmp, s)); \ + out = _mm_xor_si128(out, _mm_srli_epi32(tmp, 32 - s)); \ +} +#endif + +#define SALSA20_2ROUNDS \ + /* Operate on "columns" */ \ + ARX(X1, X0, X3, 7) \ + ARX(X2, X1, X0, 9) \ + ARX(X3, X2, X1, 13) \ + ARX(X0, X3, X2, 18) \ + /* Rearrange data */ \ + X1 = _mm_shuffle_epi32(X1, 0x93); \ + X2 = _mm_shuffle_epi32(X2, 0x4E); \ + X3 = _mm_shuffle_epi32(X3, 0x39); \ + /* Operate on "rows" */ \ + ARX(X3, X0, X1, 7) \ + ARX(X2, X3, X0, 9) \ + ARX(X1, X2, X3, 13) \ + ARX(X0, X1, X2, 18) \ + /* Rearrange data */ \ + X1 = _mm_shuffle_epi32(X1, 0x39); \ + X2 = _mm_shuffle_epi32(X2, 0x4E); \ + X3 = _mm_shuffle_epi32(X3, 0x93); + +/** + * Apply the Salsa20 core to the block provided in (X0 ... X3). + */ +#define SALSA20_wrapper(out, rounds) { \ + __m128i Z0 = X0, Z1 = X1, Z2 = X2, Z3 = X3; \ + rounds \ + (out).q[0] = X0 = _mm_add_epi32(X0, Z0); \ + (out).q[1] = X1 = _mm_add_epi32(X1, Z1); \ + (out).q[2] = X2 = _mm_add_epi32(X2, Z2); \ + (out).q[3] = X3 = _mm_add_epi32(X3, Z3); \ +} + +/** + * Apply the Salsa20/2 core to the block provided in X. + */ +#define SALSA20_2(out) \ + SALSA20_wrapper(out, SALSA20_2ROUNDS) + +#define SALSA20_8ROUNDS \ + SALSA20_2ROUNDS SALSA20_2ROUNDS SALSA20_2ROUNDS SALSA20_2ROUNDS + +/** + * Apply the Salsa20/8 core to the block provided in X. 
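 * (SALSA20_8ROUNDS is four SALSA20_2ROUNDS double-rounds, i.e. the reduced
 * 8-round Salsa20 variant also used by scrypt and yescrypt 0.5.)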
+ */ +#define SALSA20_8(out) \ + SALSA20_wrapper(out, SALSA20_8ROUNDS) + +#define XOR_X(in) \ + X0 = _mm_xor_si128(X0, (in).q[0]); \ + X1 = _mm_xor_si128(X1, (in).q[1]); \ + X2 = _mm_xor_si128(X2, (in).q[2]); \ + X3 = _mm_xor_si128(X3, (in).q[3]); + +#define XOR_X_2(in1, in2) \ + X0 = _mm_xor_si128((in1).q[0], (in2).q[0]); \ + X1 = _mm_xor_si128((in1).q[1], (in2).q[1]); \ + X2 = _mm_xor_si128((in1).q[2], (in2).q[2]); \ + X3 = _mm_xor_si128((in1).q[3], (in2).q[3]); + +#define XOR_X_WRITE_XOR_Y_2(out, in) \ + (out).q[0] = Y0 = _mm_xor_si128((out).q[0], (in).q[0]); \ + (out).q[1] = Y1 = _mm_xor_si128((out).q[1], (in).q[1]); \ + (out).q[2] = Y2 = _mm_xor_si128((out).q[2], (in).q[2]); \ + (out).q[3] = Y3 = _mm_xor_si128((out).q[3], (in).q[3]); \ + X0 = _mm_xor_si128(X0, Y0); \ + X1 = _mm_xor_si128(X1, Y1); \ + X2 = _mm_xor_si128(X2, Y2); \ + X3 = _mm_xor_si128(X3, Y3); + +#define INTEGERIFY _mm_cvtsi128_si32(X0) + +#else /* !defined(__SSE2__) */ + +#define DECL_X \ + salsa20_blk_t X; +#define DECL_Y \ + salsa20_blk_t Y; + +#define COPY(out, in) \ + (out).d[0] = (in).d[0]; \ + (out).d[1] = (in).d[1]; \ + (out).d[2] = (in).d[2]; \ + (out).d[3] = (in).d[3]; \ + (out).d[4] = (in).d[4]; \ + (out).d[5] = (in).d[5]; \ + (out).d[6] = (in).d[6]; \ + (out).d[7] = (in).d[7]; + +#define READ_X(in) COPY(X, in) +#define WRITE_X(out) COPY(out, X) + +/** + * salsa20(B): + * Apply the Salsa20 core to the provided block. + */ +static inline void salsa20(salsa20_blk_t *restrict B, + salsa20_blk_t *restrict Bout, uint32_t doublerounds) +{ + salsa20_blk_t X; +#define x X.w + + salsa20_simd_unshuffle(B, &X); + + do { +#define R(a,b) (((a) << (b)) | ((a) >> (32 - (b)))) + /* Operate on columns */ + x[ 4] ^= R(x[ 0]+x[12], 7); x[ 8] ^= R(x[ 4]+x[ 0], 9); + x[12] ^= R(x[ 8]+x[ 4],13); x[ 0] ^= R(x[12]+x[ 8],18); + + x[ 9] ^= R(x[ 5]+x[ 1], 7); x[13] ^= R(x[ 9]+x[ 5], 9); + x[ 1] ^= R(x[13]+x[ 9],13); x[ 5] ^= R(x[ 1]+x[13],18); + + x[14] ^= R(x[10]+x[ 6], 7); x[ 2] ^= R(x[14]+x[10], 9); + x[ 6] ^= R(x[ 2]+x[14],13); x[10] ^= R(x[ 6]+x[ 2],18); + + x[ 3] ^= R(x[15]+x[11], 7); x[ 7] ^= R(x[ 3]+x[15], 9); + x[11] ^= R(x[ 7]+x[ 3],13); x[15] ^= R(x[11]+x[ 7],18); + + /* Operate on rows */ + x[ 1] ^= R(x[ 0]+x[ 3], 7); x[ 2] ^= R(x[ 1]+x[ 0], 9); + x[ 3] ^= R(x[ 2]+x[ 1],13); x[ 0] ^= R(x[ 3]+x[ 2],18); + + x[ 6] ^= R(x[ 5]+x[ 4], 7); x[ 7] ^= R(x[ 6]+x[ 5], 9); + x[ 4] ^= R(x[ 7]+x[ 6],13); x[ 5] ^= R(x[ 4]+x[ 7],18); + + x[11] ^= R(x[10]+x[ 9], 7); x[ 8] ^= R(x[11]+x[10], 9); + x[ 9] ^= R(x[ 8]+x[11],13); x[10] ^= R(x[ 9]+x[ 8],18); + + x[12] ^= R(x[15]+x[14], 7); x[13] ^= R(x[12]+x[15], 9); + x[14] ^= R(x[13]+x[12],13); x[15] ^= R(x[14]+x[13],18); +#undef R + } while (--doublerounds); +#undef x + + { + uint32_t i; + salsa20_simd_shuffle(&X, Bout); + for (i = 0; i < 16; i += 4) { + B->w[i] = Bout->w[i] += B->w[i]; + B->w[i + 1] = Bout->w[i + 1] += B->w[i + 1]; + B->w[i + 2] = Bout->w[i + 2] += B->w[i + 2]; + B->w[i + 3] = Bout->w[i + 3] += B->w[i + 3]; + } + } +} + +/** + * Apply the Salsa20/2 core to the block provided in X. + */ +#define SALSA20_2(out) \ + salsa20(&X, &out, 1); + +/** + * Apply the Salsa20/8 core to the block provided in X. 
+ */ +#define SALSA20_8(out) \ + salsa20(&X, &out, 4); + +#define XOR(out, in1, in2) \ + (out).d[0] = (in1).d[0] ^ (in2).d[0]; \ + (out).d[1] = (in1).d[1] ^ (in2).d[1]; \ + (out).d[2] = (in1).d[2] ^ (in2).d[2]; \ + (out).d[3] = (in1).d[3] ^ (in2).d[3]; \ + (out).d[4] = (in1).d[4] ^ (in2).d[4]; \ + (out).d[5] = (in1).d[5] ^ (in2).d[5]; \ + (out).d[6] = (in1).d[6] ^ (in2).d[6]; \ + (out).d[7] = (in1).d[7] ^ (in2).d[7]; + +#define XOR_X(in) XOR(X, X, in) +#define XOR_X_2(in1, in2) XOR(X, in1, in2) +#define XOR_X_WRITE_XOR_Y_2(out, in) \ + XOR(Y, out, in) \ + COPY(out, Y) \ + XOR(X, X, Y) + +#define INTEGERIFY (uint32_t)X.d[0] +#endif + +/** + * Apply the Salsa20 core to the block provided in X ^ in. + */ +#define SALSA20_XOR_MEM(in, out) \ + XOR_X(in) \ + SALSA20(out) + +#define SALSA20 SALSA20_8 +#else /* pass 2 */ +#undef SALSA20 +#define SALSA20 SALSA20_2 +#endif + +/** + * blockmix_salsa(Bin, Bout): + * Compute Bout = BlockMix_{salsa20, 1}(Bin). The input Bin must be 128 + * bytes in length; the output Bout must also be the same size. + */ +static inline void blockmix_salsa(const salsa20_blk_t *restrict Bin, + salsa20_blk_t *restrict Bout) +{ + DECL_X + + READ_X(Bin[1]) + SALSA20_XOR_MEM(Bin[0], Bout[0]) + SALSA20_XOR_MEM(Bin[1], Bout[1]) +} + +static inline uint32_t blockmix_salsa_xor(const salsa20_blk_t *restrict Bin1, + const salsa20_blk_t *restrict Bin2, salsa20_blk_t *restrict Bout) +{ + DECL_X + + XOR_X_2(Bin1[1], Bin2[1]) + XOR_X(Bin1[0]) + SALSA20_XOR_MEM(Bin2[0], Bout[0]) + XOR_X(Bin1[1]) + SALSA20_XOR_MEM(Bin2[1], Bout[1]) + + return INTEGERIFY; +} + +#if _YESPOWER_OPT_C_PASS_ == 1 +/* This is tunable, but it is part of what defines a yespower version */ +/* Version 0.5 */ +#define Swidth_0_5 8 +/* Version 1.0 */ +#define Swidth_1_0 11 + +/* Not tunable in this implementation, hard-coded in a few places */ +#define PWXsimple 2 +#define PWXgather 4 + +/* Derived value. Not tunable on its own. */ +#define PWXbytes (PWXgather * PWXsimple * 8) + +/* (Maybe-)runtime derived values. Not tunable on their own. */ +#define Swidth_to_Sbytes1(Swidth) ((1 << (Swidth)) * PWXsimple * 8) +#define Swidth_to_Smask(Swidth) (((1 << (Swidth)) - 1) * PWXsimple * 8) +#define Smask_to_Smask2(Smask) (((uint64_t)(Smask) << 32) | (Smask)) + +/* These should be compile-time derived */ +#define Smask2_0_5 Smask_to_Smask2(Swidth_to_Smask(Swidth_0_5)) +#define Smask2_1_0 Smask_to_Smask2(Swidth_to_Smask(Swidth_1_0)) + +typedef struct { + uint8_t *S0, *S1, *S2; + size_t w; + uint32_t Sbytes; +} pwxform_ctx_t; + +#define DECL_SMASK2REG /* empty */ +#define MAYBE_MEMORY_BARRIER /* empty */ + +#ifdef __SSE2__ +/* + * (V)PSRLDQ and (V)PSHUFD have higher throughput than (V)PSRLQ on some CPUs + * starting with Sandy Bridge. Additionally, PSHUFD uses separate source and + * destination registers, whereas the shifts would require an extra move + * instruction for our code when building without AVX. Unfortunately, PSHUFD + * is much slower on Conroe (4 cycles latency vs. 1 cycle latency for PSRLQ) + * and somewhat slower on some non-Intel CPUs (luckily not including AMD + * Bulldozer and Piledriver). 
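 * Whichever instruction is chosen, HI32(X) moves the high 32-bit word of each
 * 64-bit lane into the low position, which the 32x32->64 pwxform multiply
 * (_mm_mul_epu32) below relies on.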
+ */ +#ifdef __AVX__ +#define HI32(X) \ + _mm_srli_si128((X), 4) +#elif 1 /* As an option, check for __SSE4_1__ here not to hurt Conroe */ +#define HI32(X) \ + _mm_shuffle_epi32((X), _MM_SHUFFLE(2,3,0,1)) +#else +#define HI32(X) \ + _mm_srli_epi64((X), 32) +#endif + +#if defined(__x86_64__) && \ + __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__ICC) +#ifdef __AVX__ +#define MOVQ "vmovq" +#else +/* "movq" would be more correct, but "movd" is supported by older binutils + * due to an error in AMD's spec for x86-64. */ +#define MOVQ "movd" +#endif +#define EXTRACT64(X) ({ \ + uint64_t result; \ + __asm__(MOVQ " %1, %0" : "=r" (result) : "x" (X)); \ + result; \ +}) +#elif defined(__x86_64__) && !defined(_MSC_VER) && !defined(__OPEN64__) +/* MSVC and Open64 had bugs */ +#define EXTRACT64(X) _mm_cvtsi128_si64(X) +#elif defined(__x86_64__) && defined(__SSE4_1__) +/* No known bugs for this intrinsic */ +#include +#define EXTRACT64(X) _mm_extract_epi64((X), 0) +#elif defined(USE_SSE4_FOR_32BIT) && defined(__SSE4_1__) +/* 32-bit */ +#include +#if 0 +/* This is currently unused by the code below, which instead uses these two + * intrinsics explicitly when (!defined(__x86_64__) && defined(__SSE4_1__)) */ +#define EXTRACT64(X) \ + ((uint64_t)(uint32_t)_mm_cvtsi128_si32(X) | \ + ((uint64_t)(uint32_t)_mm_extract_epi32((X), 1) << 32)) +#endif +#else +/* 32-bit or compilers with known past bugs in _mm_cvtsi128_si64() */ +#define EXTRACT64(X) \ + ((uint64_t)(uint32_t)_mm_cvtsi128_si32(X) | \ + ((uint64_t)(uint32_t)_mm_cvtsi128_si32(HI32(X)) << 32)) +#endif + +//#if defined(__x86_64__) && (defined(__AVX__) || !defined(__GNUC__)) +#if 0 /* XXX The follwoing code is slower XXX */ +/* 64-bit with AVX */ +/* Force use of 64-bit AND instead of two 32-bit ANDs */ +#undef DECL_SMASK2REG +#if defined(__GNUC__) && !defined(__ICC) +#define DECL_SMASK2REG uint64_t Smask2reg = Smask2; +/* Force use of lower-numbered registers to reduce number of prefixes, relying + * on out-of-order execution and register renaming. */ +#define FORCE_REGALLOC_1 \ + __asm__("" : "=a" (x), "+d" (Smask2reg), "+S" (S0), "+D" (S1)); +#define FORCE_REGALLOC_2 \ + __asm__("" : : "c" (lo)); +#else +static volatile uint64_t Smask2var = Smask2; +#define DECL_SMASK2REG uint64_t Smask2reg = Smask2var; +#define FORCE_REGALLOC_1 /* empty */ +#define FORCE_REGALLOC_2 /* empty */ +#endif +#define PWXFORM_SIMD(X) { \ + uint64_t x; \ + FORCE_REGALLOC_1 \ + uint32_t lo = x = EXTRACT64(X) & Smask2reg; \ + FORCE_REGALLOC_2 \ + uint32_t hi = x >> 32; \ + X = _mm_mul_epu32(HI32(X), X); \ + X = _mm_add_epi64(X, *(__m128i *)(S0 + lo)); \ + X = _mm_xor_si128(X, *(__m128i *)(S1 + hi)); \ +} +#elif defined(__x86_64__) +/* 64-bit without AVX. This relies on out-of-order execution and register + * renaming. It may actually be fastest on CPUs with AVX(2) as well - e.g., + * it runs great on Haswell. */ +#warning "Note: using x86-64 inline assembly for pwxform. That's great." 
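/*
 * For readability: the inline asm in PWXFORM_SIMD below is equivalent to the
 * generic C variants further down,
 *	x = EXTRACT64(X) & Smask2;
 *	X = _mm_mul_epu32(HI32(X), X);			// 32x32->64 multiply per lane
 *	X = _mm_add_epi64(X, *(__m128i *)(S0 + (uint32_t)x));
 *	X = _mm_xor_si128(X, *(__m128i *)(S1 + (x >> 32)));
 * it just keeps the S-box index extraction in scalar registers (rax/rcx).
 */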
+#undef MAYBE_MEMORY_BARRIER +#define MAYBE_MEMORY_BARRIER \ + __asm__("" : : : "memory"); +#define PWXFORM_SIMD(X) { \ + __m128i H; \ + __asm__( \ + "movd %0, %%rax\n\t" \ + "pshufd $0xb1, %0, %1\n\t" \ + "andq %2, %%rax\n\t" \ + "pmuludq %1, %0\n\t" \ + "movl %%eax, %%ecx\n\t" \ + "shrq $0x20, %%rax\n\t" \ + "paddq (%3,%%rcx), %0\n\t" \ + "pxor (%4,%%rax), %0\n\t" \ + : "+x" (X), "=x" (H) \ + : "d" (Smask2), "S" (S0), "D" (S1) \ + : "cc", "ax", "cx"); \ +} +#elif defined(USE_SSE4_FOR_32BIT) && defined(__SSE4_1__) +/* 32-bit with SSE4.1 */ +#define PWXFORM_SIMD(X) { \ + __m128i x = _mm_and_si128(X, _mm_set1_epi64x(Smask2)); \ + __m128i s0 = *(__m128i *)(S0 + (uint32_t)_mm_cvtsi128_si32(x)); \ + __m128i s1 = *(__m128i *)(S1 + (uint32_t)_mm_extract_epi32(x, 1)); \ + X = _mm_mul_epu32(HI32(X), X); \ + X = _mm_add_epi64(X, s0); \ + X = _mm_xor_si128(X, s1); \ +} +#else +/* 32-bit without SSE4.1 */ +#define PWXFORM_SIMD(X) { \ + uint64_t x = EXTRACT64(X) & Smask2; \ + __m128i s0 = *(__m128i *)(S0 + (uint32_t)x); \ + __m128i s1 = *(__m128i *)(S1 + (x >> 32)); \ + X = _mm_mul_epu32(HI32(X), X); \ + X = _mm_add_epi64(X, s0); \ + X = _mm_xor_si128(X, s1); \ +} +#endif + +#define PWXFORM_SIMD_WRITE(X, Sw) \ + PWXFORM_SIMD(X) \ + MAYBE_MEMORY_BARRIER \ + *(__m128i *)(Sw + w) = X; \ + MAYBE_MEMORY_BARRIER + +#define PWXFORM_ROUND \ + PWXFORM_SIMD(X0) \ + PWXFORM_SIMD(X1) \ + PWXFORM_SIMD(X2) \ + PWXFORM_SIMD(X3) + +#define PWXFORM_ROUND_WRITE4 \ + PWXFORM_SIMD_WRITE(X0, S0) \ + PWXFORM_SIMD_WRITE(X1, S1) \ + w += 16; \ + PWXFORM_SIMD_WRITE(X2, S0) \ + PWXFORM_SIMD_WRITE(X3, S1) \ + w += 16; + +#define PWXFORM_ROUND_WRITE2 \ + PWXFORM_SIMD_WRITE(X0, S0) \ + PWXFORM_SIMD_WRITE(X1, S1) \ + w += 16; \ + PWXFORM_SIMD(X2) \ + PWXFORM_SIMD(X3) + +#else /* !defined(__SSE2__) */ + +#define PWXFORM_SIMD(x0, x1) { \ + uint64_t x = x0 & Smask2; \ + uint64_t *p0 = (uint64_t *)(S0 + (uint32_t)x); \ + uint64_t *p1 = (uint64_t *)(S1 + (x >> 32)); \ + x0 = ((x0 >> 32) * (uint32_t)x0 + p0[0]) ^ p1[0]; \ + x1 = ((x1 >> 32) * (uint32_t)x1 + p0[1]) ^ p1[1]; \ +} + +#define PWXFORM_SIMD_WRITE(x0, x1, Sw) \ + PWXFORM_SIMD(x0, x1) \ + ((uint64_t *)(Sw + w))[0] = x0; \ + ((uint64_t *)(Sw + w))[1] = x1; + +#define PWXFORM_ROUND \ + PWXFORM_SIMD(X.d[0], X.d[1]) \ + PWXFORM_SIMD(X.d[2], X.d[3]) \ + PWXFORM_SIMD(X.d[4], X.d[5]) \ + PWXFORM_SIMD(X.d[6], X.d[7]) + +#define PWXFORM_ROUND_WRITE4 \ + PWXFORM_SIMD_WRITE(X.d[0], X.d[1], S0) \ + PWXFORM_SIMD_WRITE(X.d[2], X.d[3], S1) \ + w += 16; \ + PWXFORM_SIMD_WRITE(X.d[4], X.d[5], S0) \ + PWXFORM_SIMD_WRITE(X.d[6], X.d[7], S1) \ + w += 16; + +#define PWXFORM_ROUND_WRITE2 \ + PWXFORM_SIMD_WRITE(X.d[0], X.d[1], S0) \ + PWXFORM_SIMD_WRITE(X.d[2], X.d[3], S1) \ + w += 16; \ + PWXFORM_SIMD(X.d[4], X.d[5]) \ + PWXFORM_SIMD(X.d[6], X.d[7]) +#endif + +#define PWXFORM \ + PWXFORM_ROUND PWXFORM_ROUND PWXFORM_ROUND \ + PWXFORM_ROUND PWXFORM_ROUND PWXFORM_ROUND + +#define Smask2 Smask2_0_5 + +#else /* pass 2 */ + +#undef PWXFORM +#define PWXFORM \ + PWXFORM_ROUND_WRITE4 PWXFORM_ROUND_WRITE2 PWXFORM_ROUND_WRITE2 \ + w &= Smask2; \ + { \ + uint8_t *Stmp = S2; \ + S2 = S1; \ + S1 = S0; \ + S0 = Stmp; \ + } + +#undef Smask2 +#define Smask2 Smask2_1_0 + +#endif + +/** + * blockmix_pwxform(Bin, Bout, r, S): + * Compute Bout = BlockMix_pwxform{salsa20, r, S}(Bin). The input Bin must + * be 128r bytes in length; the output Bout must also be the same size. 
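 * Data flow per 64-byte sub-block: X starts as the last sub-block of Bin;
 * then for each i, X ^= Bin[i], X goes through the pwxform rounds (six for
 * yespower 0.5, three for 1.0), and the result is written to Bout[i]. Only
 * the final sub-block is additionally passed through the Salsa20 core
 * (Salsa20/8 for 0.5, Salsa20/2 for 1.0).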
+ */ +static void blockmix(const salsa20_blk_t *restrict Bin, + salsa20_blk_t *restrict Bout, size_t r, pwxform_ctx_t *restrict ctx) +{ + if (unlikely(!ctx)) { + blockmix_salsa(Bin, Bout); + return; + } + + uint8_t *S0 = ctx->S0, *S1 = ctx->S1; +#if _YESPOWER_OPT_C_PASS_ > 1 + uint8_t *S2 = ctx->S2; + size_t w = ctx->w; +#endif + size_t i; + DECL_X + + /* Convert count of 128-byte blocks to max index of 64-byte block */ + r = r * 2 - 1; + + READ_X(Bin[r]) + + DECL_SMASK2REG + + i = 0; + do { + XOR_X(Bin[i]) + PWXFORM + if (unlikely(i >= r)) + break; + WRITE_X(Bout[i]) + i++; + } while (1); + +#if _YESPOWER_OPT_C_PASS_ > 1 + ctx->S0 = S0; ctx->S1 = S1; ctx->S2 = S2; + ctx->w = w; +#endif + + SALSA20(Bout[i]) +} + +static uint32_t blockmix_xor(const salsa20_blk_t *restrict Bin1, + const salsa20_blk_t *restrict Bin2, salsa20_blk_t *restrict Bout, + size_t r, pwxform_ctx_t *restrict ctx) +{ + if (unlikely(!ctx)) + return blockmix_salsa_xor(Bin1, Bin2, Bout); + + uint8_t *S0 = ctx->S0, *S1 = ctx->S1; +#if _YESPOWER_OPT_C_PASS_ > 1 + uint8_t *S2 = ctx->S2; + size_t w = ctx->w; +#endif + size_t i; + DECL_X + + /* Convert count of 128-byte blocks to max index of 64-byte block */ + r = r * 2 - 1; + +#ifdef PREFETCH + PREFETCH(&Bin2[r], _MM_HINT_T0) + for (i = 0; i < r; i++) { + PREFETCH(&Bin2[i], _MM_HINT_T0) + } +#endif + + XOR_X_2(Bin1[r], Bin2[r]) + + DECL_SMASK2REG + + i = 0; + r--; + do { + XOR_X(Bin1[i]) + XOR_X(Bin2[i]) + PWXFORM + WRITE_X(Bout[i]) + + XOR_X(Bin1[i + 1]) + XOR_X(Bin2[i + 1]) + PWXFORM + + if (unlikely(i >= r)) + break; + + WRITE_X(Bout[i + 1]) + + i += 2; + } while (1); + i++; + +#if _YESPOWER_OPT_C_PASS_ > 1 + ctx->S0 = S0; ctx->S1 = S1; ctx->S2 = S2; + ctx->w = w; +#endif + + SALSA20(Bout[i]) + + return INTEGERIFY; +} + +static uint32_t blockmix_xor_save(salsa20_blk_t *restrict Bin1out, + salsa20_blk_t *restrict Bin2, + size_t r, pwxform_ctx_t *restrict ctx) +{ + uint8_t *S0 = ctx->S0, *S1 = ctx->S1; +#if _YESPOWER_OPT_C_PASS_ > 1 + uint8_t *S2 = ctx->S2; + size_t w = ctx->w; +#endif + size_t i; + DECL_X + DECL_Y + + /* Convert count of 128-byte blocks to max index of 64-byte block */ + r = r * 2 - 1; + +#ifdef PREFETCH + PREFETCH(&Bin2[r], _MM_HINT_T0) + for (i = 0; i < r; i++) { + PREFETCH(&Bin2[i], _MM_HINT_T0) + } +#endif + + XOR_X_2(Bin1out[r], Bin2[r]) + + DECL_SMASK2REG + + i = 0; + r--; + do { + XOR_X_WRITE_XOR_Y_2(Bin2[i], Bin1out[i]) + PWXFORM + WRITE_X(Bin1out[i]) + + XOR_X_WRITE_XOR_Y_2(Bin2[i + 1], Bin1out[i + 1]) + PWXFORM + + if (unlikely(i >= r)) + break; + + WRITE_X(Bin1out[i + 1]) + + i += 2; + } while (1); + i++; + +#if _YESPOWER_OPT_C_PASS_ > 1 + ctx->S0 = S0; ctx->S1 = S1; ctx->S2 = S2; + ctx->w = w; +#endif + + SALSA20(Bin1out[i]) + + return INTEGERIFY; +} + +#if _YESPOWER_OPT_C_PASS_ == 1 +/** + * integerify(B, r): + * Return the result of parsing B_{2r-1} as a little-endian integer. + */ +static inline uint32_t integerify(const salsa20_blk_t *B, size_t r) +{ +/* + * Our 64-bit words are in host byte order, which is why we don't just read + * w[0] here (would be wrong on big-endian). Also, our 32-bit words are + * SIMD-shuffled, but we only care about the least significant 32 bits anyway. + */ + return (uint32_t)B[2 * r - 1].d[0]; +} +#endif + +/** + * smix1(B, r, N, V, XY, S): + * Compute first loop of B = SMix_r(B, N). The input B must be 128r bytes in + * length; the temporary storage V must be 128rN bytes in length; the temporary + * storage XY must be 128r+64 bytes in length. N must be even and at least 4. 
+ * The array V must be aligned to a multiple of 64 bytes, and arrays B and XY + * to a multiple of at least 16 bytes. + */ +static void smix1(uint8_t *B, size_t r, uint32_t N, + salsa20_blk_t *V, salsa20_blk_t *XY, pwxform_ctx_t *ctx) +{ + size_t s = 2 * r; + salsa20_blk_t *X = V, *Y = &V[s], *V_j; + uint32_t i, j, n; + +#if _YESPOWER_OPT_C_PASS_ == 1 + for (i = 0; i < 2 * r; i++) { +#else + for (i = 0; i < 2; i++) { +#endif + const salsa20_blk_t *src = (salsa20_blk_t *)&B[i * 64]; + salsa20_blk_t *tmp = Y; + salsa20_blk_t *dst = &X[i]; + size_t k; + for (k = 0; k < 16; k++) + tmp->w[k] = le32dec(&src->w[k]); + salsa20_simd_shuffle(tmp, dst); + } + +#if _YESPOWER_OPT_C_PASS_ > 1 + for (i = 1; i < r; i++) + blockmix(&X[(i - 1) * 2], &X[i * 2], 1, ctx); +#endif + + blockmix(X, Y, r, ctx); + X = Y + s; + blockmix(Y, X, r, ctx); + j = integerify(X, r); + + for (n = 2; n < N; n <<= 1) { + uint32_t m = (n < N / 2) ? n : (N - 1 - n); + for (i = 1; i < m; i += 2) { + Y = X + s; + j &= n - 1; + j += i - 1; + V_j = &V[j * s]; + j = blockmix_xor(X, V_j, Y, r, ctx); + j &= n - 1; + j += i; + V_j = &V[j * s]; + X = Y + s; + j = blockmix_xor(Y, V_j, X, r, ctx); + } + } + n >>= 1; + + j &= n - 1; + j += N - 2 - n; + V_j = &V[j * s]; + Y = X + s; + j = blockmix_xor(X, V_j, Y, r, ctx); + j &= n - 1; + j += N - 1 - n; + V_j = &V[j * s]; + blockmix_xor(Y, V_j, XY, r, ctx); + + for (i = 0; i < 2 * r; i++) { + const salsa20_blk_t *src = &XY[i]; + salsa20_blk_t *tmp = &XY[s]; + salsa20_blk_t *dst = (salsa20_blk_t *)&B[i * 64]; + size_t k; + for (k = 0; k < 16; k++) + le32enc(&tmp->w[k], src->w[k]); + salsa20_simd_unshuffle(tmp, dst); + } +} + +/** + * smix2(B, r, N, Nloop, V, XY, S): + * Compute second loop of B = SMix_r(B, N). The input B must be 128r bytes in + * length; the temporary storage V must be 128rN bytes in length; the temporary + * storage XY must be 256r bytes in length. N must be a power of 2 and at + * least 2. Nloop must be even. The array V must be aligned to a multiple of + * 64 bytes, and arrays B and XY to a multiple of at least 16 bytes. + */ +static void smix2(uint8_t *B, size_t r, uint32_t N, uint32_t Nloop, + salsa20_blk_t *V, salsa20_blk_t *XY, pwxform_ctx_t *ctx) +{ + size_t s = 2 * r; + salsa20_blk_t *X = XY, *Y = &XY[s]; + uint32_t i, j; + + for (i = 0; i < 2 * r; i++) { + const salsa20_blk_t *src = (salsa20_blk_t *)&B[i * 64]; + salsa20_blk_t *tmp = Y; + salsa20_blk_t *dst = &X[i]; + size_t k; + for (k = 0; k < 16; k++) + tmp->w[k] = le32dec(&src->w[k]); + salsa20_simd_shuffle(tmp, dst); + } + + j = integerify(X, r) & (N - 1); + +#if _YESPOWER_OPT_C_PASS_ == 1 + if (Nloop > 2) { +#endif + do { + salsa20_blk_t *V_j = &V[j * s]; + j = blockmix_xor_save(X, V_j, r, ctx) & (N - 1); + V_j = &V[j * s]; + j = blockmix_xor_save(X, V_j, r, ctx) & (N - 1); + } while (Nloop -= 2); +#if _YESPOWER_OPT_C_PASS_ == 1 + } else { + const salsa20_blk_t * V_j = &V[j * s]; + j = blockmix_xor(X, V_j, Y, r, ctx) & (N - 1); + V_j = &V[j * s]; + blockmix_xor(Y, V_j, X, r, ctx); + } +#endif + + for (i = 0; i < 2 * r; i++) { + const salsa20_blk_t *src = &X[i]; + salsa20_blk_t *tmp = Y; + salsa20_blk_t *dst = (salsa20_blk_t *)&B[i * 64]; + size_t k; + for (k = 0; k < 16; k++) + le32enc(&tmp->w[k], src->w[k]); + salsa20_simd_unshuffle(tmp, dst); + } +} + +/** + * smix(B, r, N, V, XY, S): + * Compute B = SMix_r(B, N). The input B must be 128rp bytes in length; the + * temporary storage V must be 128rN bytes in length; the temporary storage + * XY must be 256r bytes in length. 
N must be a power of 2 and at least 16. + * The array V must be aligned to a multiple of 64 bytes, and arrays B and XY + * to a multiple of at least 16 bytes (aligning them to 64 bytes as well saves + * cache lines, but it might also result in cache bank conflicts). + */ +static void smix(uint8_t *B, size_t r, uint32_t N, + salsa20_blk_t *V, salsa20_blk_t *XY, pwxform_ctx_t *ctx) +{ +#if _YESPOWER_OPT_C_PASS_ == 1 + uint32_t Nloop_all = (N + 2) / 3; /* 1/3, round up */ + uint32_t Nloop_rw = Nloop_all; + + Nloop_all++; Nloop_all &= ~(uint32_t)1; /* round up to even */ + Nloop_rw &= ~(uint32_t)1; /* round down to even */ +#else + uint32_t Nloop_rw = (N + 2) / 3; /* 1/3, round up */ + Nloop_rw++; Nloop_rw &= ~(uint32_t)1; /* round up to even */ +#endif + + smix1(B, 1, ctx->Sbytes / 128, (salsa20_blk_t *)ctx->S0, XY, NULL); + smix1(B, r, N, V, XY, ctx); + smix2(B, r, N, Nloop_rw /* must be > 2 */, V, XY, ctx); +#if _YESPOWER_OPT_C_PASS_ == 1 + if (Nloop_all > Nloop_rw) + smix2(B, r, N, 2, V, XY, ctx); +#endif +} + +#if _YESPOWER_OPT_C_PASS_ == 1 +#undef _YESPOWER_OPT_C_PASS_ +#define _YESPOWER_OPT_C_PASS_ 2 +#define blockmix_salsa blockmix_salsa_1_0 +#define blockmix_salsa_xor blockmix_salsa_xor_1_0 +#define blockmix blockmix_1_0 +#define blockmix_xor blockmix_xor_1_0 +#define blockmix_xor_save blockmix_xor_save_1_0 +#define smix1 smix1_1_0 +#define smix2 smix2_1_0 +#define smix smix_1_0 +#include "yespower-opt.c" +#undef smix + +/** + * yespower(local, src, srclen, params, dst): + * Compute yespower(src[0 .. srclen - 1], N, r), to be checked for "< target". + * local is the thread-local data structure, allowing to preserve and reuse a + * memory allocation across calls, thereby reducing its overhead. + * + * Return 0 on success; or -1 on error. + */ +int yespower(yespower_local_t *local, + const uint8_t *src, size_t srclen, + const yespower_params_t *params, + yespower_binary_t *dst) +{ + yespower_version_t version = params->version; + uint32_t N = params->N; + uint32_t r = params->r; + const uint8_t *pers = params->pers; + size_t perslen = params->perslen; + uint32_t Swidth; + size_t B_size, V_size, XY_size, need; + uint8_t *B, *S; + salsa20_blk_t *V, *XY; + pwxform_ctx_t ctx; + uint8_t sha256[32]; + + /* Sanity-check parameters */ + if ((version != YESPOWER_0_5 && version != YESPOWER_1_0) || + N < 1024 || N > 512 * 1024 || r < 8 || r > 32 || + (N & (N - 1)) != 0 || + (!pers && perslen)) { + errno = EINVAL; + return -1; + } + + /* Allocate memory */ + B_size = (size_t)128 * r; + V_size = B_size * N; + if (version == YESPOWER_0_5) { + XY_size = B_size * 2; + Swidth = Swidth_0_5; + ctx.Sbytes = 2 * Swidth_to_Sbytes1(Swidth); + } else { + XY_size = B_size + 64; + Swidth = Swidth_1_0; + ctx.Sbytes = 3 * Swidth_to_Sbytes1(Swidth); + } + need = B_size + V_size + XY_size + ctx.Sbytes; + if (local->aligned_size < need) { + if (free_region(local)) + return -1; + if (!alloc_region(local, need)) + return -1; + } + B = (uint8_t *)local->aligned; + V = (salsa20_blk_t *)((uint8_t *)B + B_size); + XY = (salsa20_blk_t *)((uint8_t *)V + V_size); + S = (uint8_t *)XY + XY_size; + ctx.S0 = S; + ctx.S1 = S + Swidth_to_Sbytes1(Swidth); + + SHA256_Buf(src, srclen, sha256); + + if (version == YESPOWER_0_5) { + PBKDF2_SHA256(sha256, sizeof(sha256), src, srclen, 1, + B, B_size); + memcpy(sha256, B, sizeof(sha256)); + smix(B, r, N, V, XY, &ctx); + PBKDF2_SHA256(sha256, sizeof(sha256), B, B_size, 1, + (uint8_t *)dst, sizeof(*dst)); + + if (pers) { + HMAC_SHA256_Buf(dst, sizeof(*dst), pers, perslen, + sha256); + 
SHA256_Buf(sha256, sizeof(sha256), (uint8_t *)dst); + } + } else { + ctx.S2 = S + 2 * Swidth_to_Sbytes1(Swidth); + ctx.w = 0; + + if (pers) { + src = pers; + srclen = perslen; + } else { + srclen = 0; + } + + PBKDF2_SHA256(sha256, sizeof(sha256), src, srclen, 1, B, 128); + memcpy(sha256, B, sizeof(sha256)); + smix_1_0(B, r, N, V, XY, &ctx); + HMAC_SHA256_Buf(B + B_size - 64, 64, + sha256, sizeof(sha256), (uint8_t *)dst); + } + + /* Success! */ + return 0; +} + +/** + * yespower_tls(src, srclen, params, dst): + * Compute yespower(src[0 .. srclen - 1], N, r), to be checked for "< target". + * The memory allocation is maintained internally using thread-local storage. + * + * Return 0 on success; or -1 on error. + */ +int yespower_tls(const uint8_t *src, size_t srclen, + const yespower_params_t *params, yespower_binary_t *dst) +{ + static __thread int initialized = 0; + static __thread yespower_local_t local; + + if (!initialized) { + if (yespower_init_local(&local)) + return -1; + initialized = 1; + } + + return yespower(&local, src, srclen, params, dst); +} + +int yespower_init_local(yespower_local_t *local) +{ + init_region(local); + return 0; +} + +int yespower_free_local(yespower_local_t *local) +{ + return free_region(local); +} +#endif diff --git a/yespower-platform.c b/yespower-platform.c new file mode 100644 index 000000000..2b1a03f0f --- /dev/null +++ b/yespower-platform.c @@ -0,0 +1,107 @@ +/*- + * Copyright 2013-2018 Alexander Peslyak + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifdef __unix__ +#include +#endif + +#define HUGEPAGE_THRESHOLD (12 * 1024 * 1024) + +#ifdef __x86_64__ +#define HUGEPAGE_SIZE (2 * 1024 * 1024) +#else +#undef HUGEPAGE_SIZE +#endif + +static void *alloc_region(yespower_region_t *region, size_t size) +{ + size_t base_size = size; + uint8_t *base, *aligned; +#ifdef MAP_ANON + int flags = +#ifdef MAP_NOCORE + MAP_NOCORE | +#endif + MAP_ANON | MAP_PRIVATE; +#if defined(MAP_HUGETLB) && defined(HUGEPAGE_SIZE) + size_t new_size = size; + const size_t hugepage_mask = (size_t)HUGEPAGE_SIZE - 1; + if (size >= HUGEPAGE_THRESHOLD && size + hugepage_mask >= size) { + flags |= MAP_HUGETLB; +/* + * Linux's munmap() fails on MAP_HUGETLB mappings if size is not a multiple of + * huge page size, so let's round up to huge page size here. 
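+ * For example, with 2 MiB huge pages a 13 MiB request becomes
+ * new_size = (13 MiB + (2 MiB - 1)) & ~(2 MiB - 1) = 14 MiB.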
+ */ + new_size = size + hugepage_mask; + new_size &= ~hugepage_mask; + } + base = mmap(NULL, new_size, PROT_READ | PROT_WRITE, flags, -1, 0); + if (base != MAP_FAILED) { + base_size = new_size; + } else if (flags & MAP_HUGETLB) { + flags &= ~MAP_HUGETLB; + base = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0); + } + +#else + base = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0); +#endif + if (base == MAP_FAILED) + base = NULL; + aligned = base; +#elif defined(HAVE_POSIX_MEMALIGN) + if ((errno = posix_memalign((void **)&base, 64, size)) != 0) + base = NULL; + aligned = base; +#else + base = aligned = NULL; + if (size + 63 < size) { + errno = ENOMEM; + } else if ((base = malloc(size + 63)) != NULL) { + aligned = base + 63; + aligned -= (uintptr_t)aligned & 63; + } +#endif + region->base = base; + region->aligned = aligned; + region->base_size = base ? base_size : 0; + region->aligned_size = base ? size : 0; + return aligned; +} + +static inline void init_region(yespower_region_t *region) +{ + region->base = region->aligned = NULL; + region->base_size = region->aligned_size = 0; +} + +static int free_region(yespower_region_t *region) +{ + if (region->base) { +#ifdef MAP_ANON + if (munmap(region->base, region->base_size)) + return -1; +#else + free(region->base); +#endif + } + init_region(region); + return 0; +} diff --git a/yespower-ref.c b/yespower-ref.c new file mode 100644 index 000000000..29c03d2b4 --- /dev/null +++ b/yespower-ref.c @@ -0,0 +1,580 @@ +/*- + * Copyright 2009 Colin Percival + * Copyright 2013-2018 Alexander Peslyak + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * This file was originally written by Colin Percival as part of the Tarsnap + * online backup system. + * + * This is a proof-of-work focused fork of yescrypt, including reference and + * cut-down implementation of the obsolete yescrypt 0.5 (based off its first + * submission to PHC back in 2014) and a new proof-of-work specific variation + * known as yespower 1.0. The former is intended as an upgrade for + * cryptocurrencies that already use yescrypt 0.5 and the latter may be used + * as a further upgrade (hard fork) by those and other cryptocurrencies. 
The + * version of algorithm to use is requested through parameters, allowing for + * both algorithms to co-exist in client and miner implementations (such as in + * preparation for a hard-fork). + * + * This is the reference implementation. Its purpose is to provide a simple + * human- and machine-readable specification that implementations intended + * for actual use should be tested against. It is deliberately mostly not + * optimized, and it is not meant to be used in production. Instead, use + * yespower-opt.c. + */ + +#warning "This reference implementation is deliberately mostly not optimized. Use yespower-opt.c instead unless you're testing (against) the reference implementation on purpose." + +#include +#include +#include +#include + +#include "sha256.h" +#include "sysendian.h" + +#include "yespower.h" + +static void blkcpy(uint32_t *dst, const uint32_t *src, size_t count) +{ + do { + *dst++ = *src++; + } while (--count); +} + +static void blkxor(uint32_t *dst, const uint32_t *src, size_t count) +{ + do { + *dst++ ^= *src++; + } while (--count); +} + +/** + * salsa20(B): + * Apply the Salsa20 core to the provided block. + */ +static void salsa20(uint32_t B[16], uint32_t rounds) +{ + uint32_t x[16]; + size_t i; + + /* SIMD unshuffle */ + for (i = 0; i < 16; i++) + x[i * 5 % 16] = B[i]; + + for (i = 0; i < rounds; i += 2) { +#define R(a,b) (((a) << (b)) | ((a) >> (32 - (b)))) + /* Operate on columns */ + x[ 4] ^= R(x[ 0]+x[12], 7); x[ 8] ^= R(x[ 4]+x[ 0], 9); + x[12] ^= R(x[ 8]+x[ 4],13); x[ 0] ^= R(x[12]+x[ 8],18); + + x[ 9] ^= R(x[ 5]+x[ 1], 7); x[13] ^= R(x[ 9]+x[ 5], 9); + x[ 1] ^= R(x[13]+x[ 9],13); x[ 5] ^= R(x[ 1]+x[13],18); + + x[14] ^= R(x[10]+x[ 6], 7); x[ 2] ^= R(x[14]+x[10], 9); + x[ 6] ^= R(x[ 2]+x[14],13); x[10] ^= R(x[ 6]+x[ 2],18); + + x[ 3] ^= R(x[15]+x[11], 7); x[ 7] ^= R(x[ 3]+x[15], 9); + x[11] ^= R(x[ 7]+x[ 3],13); x[15] ^= R(x[11]+x[ 7],18); + + /* Operate on rows */ + x[ 1] ^= R(x[ 0]+x[ 3], 7); x[ 2] ^= R(x[ 1]+x[ 0], 9); + x[ 3] ^= R(x[ 2]+x[ 1],13); x[ 0] ^= R(x[ 3]+x[ 2],18); + + x[ 6] ^= R(x[ 5]+x[ 4], 7); x[ 7] ^= R(x[ 6]+x[ 5], 9); + x[ 4] ^= R(x[ 7]+x[ 6],13); x[ 5] ^= R(x[ 4]+x[ 7],18); + + x[11] ^= R(x[10]+x[ 9], 7); x[ 8] ^= R(x[11]+x[10], 9); + x[ 9] ^= R(x[ 8]+x[11],13); x[10] ^= R(x[ 9]+x[ 8],18); + + x[12] ^= R(x[15]+x[14], 7); x[13] ^= R(x[12]+x[15], 9); + x[14] ^= R(x[13]+x[12],13); x[15] ^= R(x[14]+x[13],18); +#undef R + } + + /* SIMD shuffle */ + for (i = 0; i < 16; i++) + B[i] += x[i * 5 % 16]; +} + +/** + * blockmix_salsa(B): + * Compute B = BlockMix_{salsa20, 1}(B). The input B must be 128 bytes in + * length. + */ +static void blockmix_salsa(uint32_t *B, uint32_t rounds) +{ + uint32_t X[16]; + size_t i; + + /* 1: X <-- B_{2r - 1} */ + blkcpy(X, &B[16], 16); + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < 2; i++) { + /* 3: X <-- H(X xor B_i) */ + blkxor(X, &B[i * 16], 16); + salsa20(X, rounds); + + /* 4: Y_i <-- X */ + /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ + blkcpy(&B[i * 16], X, 16); + } +} + +/* + * These are tunable, but they must meet certain constraints and are part of + * what defines a yespower version. + */ +#define PWXsimple 2 +#define PWXgather 4 +/* Version 0.5 */ +#define PWXrounds_0_5 6 +#define Swidth_0_5 8 +/* Version 1.0 */ +#define PWXrounds_1_0 3 +#define Swidth_1_0 11 + +/* Derived values. Not tunable on their own. */ +#define PWXbytes (PWXgather * PWXsimple * 8) +#define PWXwords (PWXbytes / sizeof(uint32_t)) +#define rmin ((PWXbytes + 127) / 128) + +/* Runtime derived values. 
Not tunable on their own. */ +#define Swidth_to_Sbytes1(Swidth) ((1 << Swidth) * PWXsimple * 8) +#define Swidth_to_Smask(Swidth) (((1 << Swidth) - 1) * PWXsimple * 8) + +typedef struct { + yespower_version_t version; + uint32_t salsa20_rounds; + uint32_t PWXrounds, Swidth, Sbytes, Smask; + uint32_t *S; + uint32_t (*S0)[2], (*S1)[2], (*S2)[2]; + size_t w; +} pwxform_ctx_t; + +/** + * pwxform(B): + * Transform the provided block using the provided S-boxes. + */ +static void pwxform(uint32_t *B, pwxform_ctx_t *ctx) +{ + uint32_t (*X)[PWXsimple][2] = (uint32_t (*)[PWXsimple][2])B; + uint32_t (*S0)[2] = ctx->S0, (*S1)[2] = ctx->S1, (*S2)[2] = ctx->S2; + uint32_t Smask = ctx->Smask; + size_t w = ctx->w; + size_t i, j, k; + + /* 1: for i = 0 to PWXrounds - 1 do */ + for (i = 0; i < ctx->PWXrounds; i++) { + /* 2: for j = 0 to PWXgather - 1 do */ + for (j = 0; j < PWXgather; j++) { + uint32_t xl = X[j][0][0]; + uint32_t xh = X[j][0][1]; + uint32_t (*p0)[2], (*p1)[2]; + + /* 3: p0 <-- (lo(B_{j,0}) & Smask) / (PWXsimple * 8) */ + p0 = S0 + (xl & Smask) / sizeof(*S0); + /* 4: p1 <-- (hi(B_{j,0}) & Smask) / (PWXsimple * 8) */ + p1 = S1 + (xh & Smask) / sizeof(*S1); + + /* 5: for k = 0 to PWXsimple - 1 do */ + for (k = 0; k < PWXsimple; k++) { + uint64_t x, s0, s1; + + /* 6: B_{j,k} <-- (hi(B_{j,k}) * lo(B_{j,k}) + S0_{p0,k}) xor S1_{p1,k} */ + s0 = ((uint64_t)p0[k][1] << 32) + p0[k][0]; + s1 = ((uint64_t)p1[k][1] << 32) + p1[k][0]; + + xl = X[j][k][0]; + xh = X[j][k][1]; + + x = (uint64_t)xh * xl; + x += s0; + x ^= s1; + + X[j][k][0] = x; + X[j][k][1] = x >> 32; + } + + if (ctx->version != YESPOWER_0_5 && + (i == 0 || j < PWXgather / 2)) { + if (j & 1) { + for (k = 0; k < PWXsimple; k++) { + S1[w][0] = X[j][k][0]; + S1[w][1] = X[j][k][1]; + w++; + } + } else { + for (k = 0; k < PWXsimple; k++) { + S0[w + k][0] = X[j][k][0]; + S0[w + k][1] = X[j][k][1]; + } + } + } + } + } + + if (ctx->version != YESPOWER_0_5) { + /* 14: (S0, S1, S2) <-- (S2, S0, S1) */ + ctx->S0 = S2; + ctx->S1 = S0; + ctx->S2 = S1; + /* 15: w <-- w mod 2^Swidth */ + ctx->w = w & ((1 << ctx->Swidth) * PWXsimple - 1); + } +} + +/** + * blockmix_pwxform(B, ctx, r): + * Compute B = BlockMix_pwxform{salsa20, ctx, r}(B). The input B must be + * 128r bytes in length. + */ +static void blockmix_pwxform(uint32_t *B, pwxform_ctx_t *ctx, size_t r) +{ + uint32_t X[PWXwords]; + size_t r1, i; + + /* Convert 128-byte blocks to PWXbytes blocks */ + /* 1: r_1 <-- 128r / PWXbytes */ + r1 = 128 * r / PWXbytes; + + /* 2: X <-- B'_{r_1 - 1} */ + blkcpy(X, &B[(r1 - 1) * PWXwords], PWXwords); + + /* 3: for i = 0 to r_1 - 1 do */ + for (i = 0; i < r1; i++) { + /* 4: if r_1 > 1 */ + if (r1 > 1) { + /* 5: X <-- X xor B'_i */ + blkxor(X, &B[i * PWXwords], PWXwords); + } + + /* 7: X <-- pwxform(X) */ + pwxform(X, ctx); + + /* 8: B'_i <-- X */ + blkcpy(&B[i * PWXwords], X, PWXwords); + } + + /* 10: i <-- floor((r_1 - 1) * PWXbytes / 64) */ + i = (r1 - 1) * PWXbytes / 64; + + /* 11: B_i <-- H(B_i) */ + salsa20(&B[i * 16], ctx->salsa20_rounds); + +#if 1 /* No-op with our current pwxform settings, but do it to make sure */ + /* 12: for i = i + 1 to 2r - 1 do */ + for (i++; i < 2 * r; i++) { + /* 13: B_i <-- H(B_i xor B_{i-1}) */ + blkxor(&B[i * 16], &B[(i - 1) * 16], 16); + salsa20(&B[i * 16], ctx->salsa20_rounds); + } +#endif +} + +/** + * integerify(B, r): + * Return the result of parsing B_{2r-1} as a little-endian integer. + */ +static uint32_t integerify(const uint32_t *B, size_t r) +{ +/* + * Our 32-bit words are in host byte order. 
Also, they are SIMD-shuffled, but + * we only care about the least significant 32 bits anyway. + */ + const uint32_t *X = &B[(2 * r - 1) * 16]; + return X[0]; +} + +/** + * p2floor(x): + * Largest power of 2 not greater than argument. + */ +static uint32_t p2floor(uint32_t x) +{ + uint32_t y; + while ((y = x & (x - 1))) + x = y; + return x; +} + +/** + * wrap(x, i): + * Wrap x to the range 0 to i-1. + */ +static uint32_t wrap(uint32_t x, uint32_t i) +{ + uint32_t n = p2floor(i); + return (x & (n - 1)) + (i - n); +} + +/** + * smix1(B, r, N, V, X, ctx): + * Compute first loop of B = SMix_r(B, N). The input B must be 128r bytes in + * length; the temporary storage V must be 128rN bytes in length; the temporary + * storage X must be 128r bytes in length. + */ +static void smix1(uint32_t *B, size_t r, uint32_t N, + uint32_t *V, uint32_t *X, pwxform_ctx_t *ctx) +{ + size_t s = 32 * r; + uint32_t i, j; + size_t k; + + /* 1: X <-- B */ + for (k = 0; k < 2 * r; k++) + for (i = 0; i < 16; i++) + X[k * 16 + i] = le32dec(&B[k * 16 + (i * 5 % 16)]); + + if (ctx->version != YESPOWER_0_5) { + for (k = 1; k < r; k++) { + blkcpy(&X[k * 32], &X[(k - 1) * 32], 32); + blockmix_pwxform(&X[k * 32], ctx, 1); + } + } + + /* 2: for i = 0 to N - 1 do */ + for (i = 0; i < N; i++) { + /* 3: V_i <-- X */ + blkcpy(&V[i * s], X, s); + + if (i > 1) { + /* j <-- Wrap(Integerify(X), i) */ + j = wrap(integerify(X, r), i); + + /* X <-- X xor V_j */ + blkxor(X, &V[j * s], s); + } + + /* 4: X <-- H(X) */ + if (V != ctx->S) + blockmix_pwxform(X, ctx, r); + else + blockmix_salsa(X, ctx->salsa20_rounds); + } + + /* B' <-- X */ + for (k = 0; k < 2 * r; k++) + for (i = 0; i < 16; i++) + le32enc(&B[k * 16 + (i * 5 % 16)], X[k * 16 + i]); +} + +/** + * smix2(B, r, N, Nloop, V, X, ctx): + * Compute second loop of B = SMix_r(B, N). The input B must be 128r bytes in + * length; the temporary storage V must be 128rN bytes in length; the temporary + * storage X must be 128r bytes in length. The value N must be a power of 2 + * greater than 1. + */ +static void smix2(uint32_t *B, size_t r, uint32_t N, uint32_t Nloop, + uint32_t *V, uint32_t *X, pwxform_ctx_t *ctx) +{ + size_t s = 32 * r; + uint32_t i, j; + size_t k; + + /* X <-- B */ + for (k = 0; k < 2 * r; k++) + for (i = 0; i < 16; i++) + X[k * 16 + i] = le32dec(&B[k * 16 + (i * 5 % 16)]); + + /* 6: for i = 0 to N - 1 do */ + for (i = 0; i < Nloop; i++) { + /* 7: j <-- Integerify(X) mod N */ + j = integerify(X, r) & (N - 1); + + /* 8.1: X <-- X xor V_j */ + blkxor(X, &V[j * s], s); + /* V_j <-- X */ + if (Nloop != 2) + blkcpy(&V[j * s], X, s); + + /* 8.2: X <-- H(X) */ + blockmix_pwxform(X, ctx, r); + } + + /* 10: B' <-- X */ + for (k = 0; k < 2 * r; k++) + for (i = 0; i < 16; i++) + le32enc(&B[k * 16 + (i * 5 % 16)], X[k * 16 + i]); +} + +/** + * smix(B, r, N, p, t, V, X, ctx): + * Compute B = SMix_r(B, N). The input B must be 128rp bytes in length; the + * temporary storage V must be 128rN bytes in length; the temporary storage + * X must be 128r bytes in length. The value N must be a power of 2 and at + * least 16. 
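+ * As a worked example of the loop split below: with N = 2048, Nloop_all
+ * comes out to 684 and, for yespower 0.5, Nloop_rw to 682 (roughly N/3
+ * each, adjusted to even).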
+ */ +static void smix(uint32_t *B, size_t r, uint32_t N, + uint32_t *V, uint32_t *X, pwxform_ctx_t *ctx) +{ + uint32_t Nloop_all = (N + 2) / 3; /* 1/3, round up */ + uint32_t Nloop_rw = Nloop_all; + + Nloop_all++; Nloop_all &= ~(uint32_t)1; /* round up to even */ + if (ctx->version == YESPOWER_0_5) { + Nloop_rw &= ~(uint32_t)1; /* round down to even */ + } else { + Nloop_rw++; Nloop_rw &= ~(uint32_t)1; /* round up to even */ + } + + smix1(B, 1, ctx->Sbytes / 128, ctx->S, X, ctx); + smix1(B, r, N, V, X, ctx); + smix2(B, r, N, Nloop_rw /* must be > 2 */, V, X, ctx); + smix2(B, r, N, Nloop_all - Nloop_rw /* 0 or 2 */, V, X, ctx); +} + +/** + * yespower(local, src, srclen, params, dst): + * Compute yespower(src[0 .. srclen - 1], N, r), to be checked for "< target". + * + * Return 0 on success; or -1 on error. + */ +int yespower(yespower_local_t *local, + const uint8_t *src, size_t srclen, + const yespower_params_t *params, yespower_binary_t *dst) +{ + yespower_version_t version = params->version; + uint32_t N = params->N; + uint32_t r = params->r; + const uint8_t *pers = params->pers; + size_t perslen = params->perslen; + int retval = -1; + size_t B_size, V_size; + uint32_t *B, *V, *X, *S; + pwxform_ctx_t ctx; + uint32_t sha256[8]; + + /* Sanity-check parameters */ + if ((version != YESPOWER_0_5 && version != YESPOWER_1_0) || + N < 1024 || N > 512 * 1024 || r < 8 || r > 32 || + (N & (N - 1)) != 0 || r < rmin || + (!pers && perslen)) { + errno = EINVAL; + return -1; + } + + /* Allocate memory */ + B_size = (size_t)128 * r; + V_size = B_size * N; + if ((V = malloc(V_size)) == NULL) + return -1; + if ((B = malloc(B_size)) == NULL) + goto free_V; + if ((X = malloc(B_size)) == NULL) + goto free_B; + ctx.version = version; + if (version == YESPOWER_0_5) { + ctx.salsa20_rounds = 8; + ctx.PWXrounds = PWXrounds_0_5; + ctx.Swidth = Swidth_0_5; + ctx.Sbytes = 2 * Swidth_to_Sbytes1(ctx.Swidth); + } else { + ctx.salsa20_rounds = 2; + ctx.PWXrounds = PWXrounds_1_0; + ctx.Swidth = Swidth_1_0; + ctx.Sbytes = 3 * Swidth_to_Sbytes1(ctx.Swidth); + } + if ((S = malloc(ctx.Sbytes)) == NULL) + goto free_X; + ctx.S = S; + ctx.S0 = (uint32_t (*)[2])S; + ctx.S1 = ctx.S0 + (1 << ctx.Swidth) * PWXsimple; + ctx.S2 = ctx.S1 + (1 << ctx.Swidth) * PWXsimple; + ctx.Smask = Swidth_to_Smask(ctx.Swidth); + ctx.w = 0; + + SHA256_Buf(src, srclen, (uint8_t *)sha256); + + if (version != YESPOWER_0_5) { + if (pers) { + src = pers; + srclen = perslen; + } else { + srclen = 0; + } + } + + /* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */ + PBKDF2_SHA256((uint8_t *)sha256, sizeof(sha256), + src, srclen, 1, (uint8_t *)B, B_size); + + blkcpy(sha256, B, sizeof(sha256) / sizeof(sha256[0])); + + /* 3: B_i <-- MF(B_i, N) */ + smix(B, r, N, V, X, &ctx); + + if (version == YESPOWER_0_5) { + /* 5: DK <-- PBKDF2(P, B, 1, dkLen) */ + PBKDF2_SHA256((uint8_t *)sha256, sizeof(sha256), + (uint8_t *)B, B_size, 1, (uint8_t *)dst, sizeof(*dst)); + + if (pers) { + HMAC_SHA256_Buf(dst, sizeof(*dst), pers, perslen, + (uint8_t *)sha256); + SHA256_Buf(sha256, sizeof(sha256), (uint8_t *)dst); + } + } else { + HMAC_SHA256_Buf((uint8_t *)B + B_size - 64, 64, + sha256, sizeof(sha256), (uint8_t *)dst); + } + + /* Success! 
*/ + retval = 0; + + /* Free memory */ + free(S); +free_X: + free(X); +free_B: + free(B); +free_V: + free(V); + + return retval; +} + +int yespower_tls(const uint8_t *src, size_t srclen, + const yespower_params_t *params, yespower_binary_t *dst) +{ +/* The reference implementation doesn't use thread-local storage */ + return yespower(NULL, src, srclen, params, dst); +} + +int yespower_init_local(yespower_local_t *local) +{ +/* The reference implementation doesn't use the local structure */ + local->base = local->aligned = NULL; + local->base_size = local->aligned_size = 0; + return 0; +} + +int yespower_free_local(yespower_local_t *local) +{ +/* The reference implementation frees its memory in yespower() */ + (void)local; /* unused */ + return 0; +} diff --git a/yespower.c b/yespower.c new file mode 100644 index 000000000..f0f06f43c --- /dev/null +++ b/yespower.c @@ -0,0 +1,68 @@ +/*- + * Copyright 2013-2018 Alexander Peslyak + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "cpuminer-config.h" +#include "miner.h" + +#include +#include +#include + +#include "yespower.h" + +int scanhash_yespower(int thr_id, uint32_t *pdata, const uint32_t *ptarget, + uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t n = pdata[19] - 1; + const uint32_t first_nonce = pdata[19]; + + uint32_t hash64[8] __attribute__((aligned(32))); + uint32_t endiandata[32]; + yespower_params_t params = { + .version = YESPOWER_0_5, + .N = 2048, + .r = 8, + .pers = (const uint8_t *)endiandata, + .perslen = 80 + }; + //we need bigendian data... + int kk=0; + for (; kk < 32; kk++) + { + be32enc(&endiandata[kk], ((uint32_t*)pdata)[kk]); + }; + + do { + pdata[19] = ++n; + be32enc(&endiandata[19], n); + if (yespower_tls((unsigned char *)endiandata, 80, ¶ms, (yespower_binary_t *)hash64)) { + puts("FAILED"); + return -1; + } + if ((hash64[7] < ptarget[7]) || ((hash64[7] == ptarget[7]) && (hash64[6] < ptarget[6])) && fulltest(hash64, ptarget)) { + *hashes_done = n - first_nonce + 1; + return true; + } + } while (n < max_nonce && !work_restart[thr_id].restart); + + *hashes_done = n - first_nonce + 1; + pdata[19] = n; + return 0; +} diff --git a/yespower.h b/yespower.h new file mode 100644 index 000000000..b388d7217 --- /dev/null +++ b/yespower.h @@ -0,0 +1,130 @@ +/*- + * Copyright 2009 Colin Percival + * Copyright 2013-2018 Alexander Peslyak + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * This file was originally written by Colin Percival as part of the Tarsnap + * online backup system. + */ +#ifndef _YESPOWER_H_ +#define _YESPOWER_H_ + +#include +#include /* for size_t */ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Internal type used by the memory allocator. Please do not use it directly. + * Use yespower_local_t instead. + */ +typedef struct { + void *base, *aligned; + size_t base_size, aligned_size; +} yespower_region_t; + +/** + * Type for thread-local (RAM) data structure. + */ +typedef yespower_region_t yespower_local_t; + +/* + * Type for yespower algorithm version numbers. + */ +typedef enum { YESPOWER_0_5 = 5, YESPOWER_1_0 = 10 } yespower_version_t; + +/** + * yespower parameters combined into one struct. + */ +typedef struct { + yespower_version_t version; + uint32_t N, r; + const uint8_t *pers; + size_t perslen; +} yespower_params_t; + +/** + * A 256-bit yespower hash. + */ +typedef struct { + unsigned char uc[32]; +} yespower_binary_t; + +/** + * yespower_init_local(local): + * Initialize the thread-local (RAM) data structure. Actual memory allocation + * is currently fully postponed until a call to yespower(). + * + * Return 0 on success; or -1 on error. + * + * MT-safe as long as local is local to the thread. + */ +extern int yespower_init_local(yespower_local_t *local); + +/** + * yespower_free_local(local): + * Free memory that may have been allocated for an initialized thread-local + * (RAM) data structure. + * + * Return 0 on success; or -1 on error. + * + * MT-safe as long as local is local to the thread. + */ +extern int yespower_free_local(yespower_local_t *local); + +/** + * yespower(local, src, srclen, params, dst): + * Compute yespower(src[0 .. srclen - 1], N, r), to be checked for "< target". + * local is the thread-local data structure, allowing to preserve and reuse a + * memory allocation across calls, thereby reducing processing overhead. + * + * Return 0 on success; or -1 on error. + * + * local must be initialized with yespower_init_local(). + * + * MT-safe as long as local and dst are local to the thread. + */ +extern int yespower(yespower_local_t *local, + const uint8_t *src, size_t srclen, + const yespower_params_t *params, yespower_binary_t *dst); + +/** + * yespower_tls(src, srclen, params, dst): + * Compute yespower(src[0 .. srclen - 1], N, r), to be checked for "< target". + * The memory allocation is maintained internally using thread-local storage. + * + * Return 0 on success; or -1 on error. 
+ * + * MT-safe as long as dst is local to the thread. + */ +extern int yespower_tls(const uint8_t *src, size_t srclen, + const yespower_params_t *params, yespower_binary_t *dst); + +#ifdef __cplusplus +} +#endif + +#endif /* !_YESPOWER_H_ */
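
Below is a minimal usage sketch of the public API added by this patch. It is
illustrative only: the main() harness and the all-zero header are hypothetical,
while the parameters mirror what scanhash_yespower() passes for yespower 0.5
(N = 2048, r = 8, with the 80-byte block header doubling as the personalization
string).

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#include "yespower.h"

int main(void)
{
	/* 80-byte block header; all-zero here purely for illustration */
	uint8_t header[80];
	memset(header, 0, sizeof(header));

	/* Same parameters scanhash_yespower() uses for yespower 0.5 */
	yespower_params_t params = {
		.version = YESPOWER_0_5,
		.N = 2048,
		.r = 8,
		.pers = header,
		.perslen = sizeof(header),
	};
	yespower_binary_t hash;

	/* yespower_tls() keeps its scratch memory in thread-local storage */
	if (yespower_tls(header, sizeof(header), &params, &hash)) {
		perror("yespower_tls");
		return 1;
	}

	/* A miner would compare this 256-bit value against the share target,
	 * as scanhash_yespower() in yespower.c does. */
	for (size_t i = 0; i < sizeof(hash.uc); i++)
		printf("%02x", hash.uc[i]);
	putchar('\n');
	return 0;
}

Checking the returned hash against a target, rather than against a stored
value, is what makes this a proof-of-work primitive rather than a KDF.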