Skip to content

Commit

Permalink
Upgrade to 2.10.1-hide-3.1.1
Browse files Browse the repository at this point in the history
  • Loading branch information
rapid821 committed Mar 12, 2019
1 parent 2990196 commit ef276cf
Show file tree
Hide file tree
Showing 29 changed files with 219 additions and 193 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ Besides [Monero](https://getmonero.org), following coins can be mined using this
- [Stellite](https://stellite.cash/)
- [TurtleCoin](https://turtlecoin.lol)
- [Zelerius](https://zelerius.org/)
- [X-CASH](https://x-network.io/)

Ryo currency is a way for us to implement the ideas that we were unable to in
Monero. See [here](https://github.com/fireice-uk/cryptonote-speedup-demo/) for details.
Expand All @@ -78,6 +79,7 @@ If your prefered coin is not listed, you can choose one of the following algorit
- cryptonight_v7
- cryptonight_v7_stellite
- cryptonight_v8
- cryptonight_v8_double (used by X-CASH)
- cryptonight_v8_half (used by masari and stellite)
- cryptonight_v8_reversewaltz (used by graft)
- cryptonight_v8_zelerius
Expand Down
1 change: 0 additions & 1 deletion doc/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,6 @@ The miner will automatically detect if CUDA (for NVIDIA GPUs) or OpenCL (for AMD
```
xmr-stak --noCPU
```
**CUDA** is currently not supported. I am currently try to get some performance out it.

### NVIDIA via OpenCL

Expand Down
2 changes: 1 addition & 1 deletion scripts/build_xmr-stak_docker/build_xmr-stak_docker.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ fi
if [ -d xmr-stak ]; then
git -C xmr-stak clean -fd
else
git clone https://github.com/rapid821/xmr-stak-hide.git
git clone https://github.com/fireice-uk/xmr-stak.git
fi

wget -c https://developer.nvidia.com/compute/cuda/9.0/Prod/local_installers/cuda_9.0.176_384.81_linux-run
Expand Down
11 changes: 7 additions & 4 deletions xmrstak/backend/amd/OclCryptonightR_gen.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,7 @@ static cl_program CryptonightR_build_program(
const GpuContext* ctx,
xmrstak_algo algo,
uint64_t height,
uint32_t precompile_count,
cl_kernel old_kernel,
std::string source_code,
std::string options)
Expand All @@ -151,7 +152,7 @@ static cl_program CryptonightR_build_program(
for(size_t i = 0; i < CryptonightR_cache.size();)
{
const CacheEntry& entry = CryptonightR_cache[i];
if ((entry.algo == algo) && (entry.height + 2 < height))
if ((entry.algo == algo) && (entry.height + 2 + precompile_count < height))
{
printer::inst()->print_msg(LDEBUG, "CryptonightR: program for height %llu released (old program)", entry.height);
old_programs.push_back(entry.program);
Expand Down Expand Up @@ -252,10 +253,12 @@ static cl_program CryptonightR_build_program(
return program;
}

cl_program CryptonightR_get_program(GpuContext* ctx, xmrstak_algo algo, uint64_t height, bool background, cl_kernel old_kernel)
cl_program CryptonightR_get_program(GpuContext* ctx, xmrstak_algo algo, uint64_t height, uint32_t precompile_count, bool background, cl_kernel old_kernel)
{
printer::inst()->print_msg(LDEBUG, "CryptonightR: start %llu released",height);

if (background) {
background_exec([=](){ CryptonightR_get_program(ctx, algo, height, false, old_kernel); });
background_exec([=](){ CryptonightR_get_program(ctx, algo, height, precompile_count, false, old_kernel); });
return nullptr;
}

Expand Down Expand Up @@ -347,7 +350,7 @@ cl_program CryptonightR_get_program(GpuContext* ctx, xmrstak_algo algo, uint64_t

}

return CryptonightR_build_program(ctx, algo, height, old_kernel, source, options);
return CryptonightR_build_program(ctx, algo, height, precompile_count, old_kernel, source, options);
}

} // namespace amd
Expand Down
2 changes: 1 addition & 1 deletion xmrstak/backend/amd/OclCryptonightR_gen.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ namespace amd
{

cl_program CryptonightR_get_program(GpuContext* ctx, const xmrstak_algo algo,
uint64_t height, bool background = false, cl_kernel old_kernel = nullptr);
uint64_t height, uint32_t precompile_count, bool background = false, cl_kernel old_kernel = nullptr);

} // namespace amd
} // namespace xmrstak
24 changes: 15 additions & 9 deletions xmrstak/backend/amd/amd_gpu/gpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
return ERR_OCL_API;
}

ctx->InputBuffer = clCreateBuffer(opencl_ctx, CL_MEM_READ_ONLY, 88, NULL, &ret);
ctx->InputBuffer = clCreateBuffer(opencl_ctx, CL_MEM_READ_ONLY, 128, NULL, &ret);
if(ret != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clCreateBuffer to create input buffer.", err_to_str(ret));
Expand Down Expand Up @@ -334,6 +334,12 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
*/
options += " -DOPENCL_DRIVER_MAJOR=" + std::to_string(std::stoi(openCLDriverVer.data()) / 100);

uint32_t isWindowsOs = 0;
#ifdef _WIN32
isWindowsOs = 1;
#endif
options += " -DIS_WINDOWS_OS=" + std::to_string(isWindowsOs);

if(miner_algo == cryptonight_gpu)
options += " -cl-fp32-correctly-rounded-divide-sqrt";

Expand Down Expand Up @@ -889,15 +895,15 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar

cl_int ret;

if(input_len > 84)
if(input_len > 124)
return ERR_STUPID_PARAMS;

input[input_len] = 0x01;
memset(input + input_len + 1, 0, 88 - input_len - 1);
memset(input + input_len + 1, 0, 128 - input_len - 1);

cl_uint numThreads = ctx->rawIntensity;

if((ret = clEnqueueWriteBuffer(ctx->CommandQueues, ctx->InputBuffer, CL_TRUE, 0, 88, input, 0, NULL, NULL)) != CL_SUCCESS)
if((ret = clEnqueueWriteBuffer(ctx->CommandQueues, ctx->InputBuffer, CL_TRUE, 0, 128, input, 0, NULL, NULL)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clEnqueueWriteBuffer to fill input buffer.", err_to_str(ret));
return ERR_OCL_API;
Expand Down Expand Up @@ -952,8 +958,10 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar

if ((miner_algo == cryptonight_r) || (miner_algo == cryptonight_r_wow)) {

uint32_t PRECOMPILATION_DEPTH = 4;

// Get new kernel
cl_program program = xmrstak::amd::CryptonightR_get_program(ctx, miner_algo, height);
cl_program program = xmrstak::amd::CryptonightR_get_program(ctx, miner_algo, height, PRECOMPILATION_DEPTH);

if (program != ctx->ProgramCryptonightR) {
cl_int ret;
Expand All @@ -969,12 +977,10 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar
}
ctx->ProgramCryptonightR = program;

uint32_t PRECOMPILATION_DEPTH = 4;

// Precompile next program in background
xmrstak::amd::CryptonightR_get_program(ctx, miner_algo, height + 1, true, old_kernel);
xmrstak::amd::CryptonightR_get_program(ctx, miner_algo, height + 1, PRECOMPILATION_DEPTH, true, old_kernel);
for (int i = 2; i <= PRECOMPILATION_DEPTH; ++i)
xmrstak::amd::CryptonightR_get_program(ctx, miner_algo, height + i, true, nullptr);
xmrstak::amd::CryptonightR_get_program(ctx, miner_algo, height + i, PRECOMPILATION_DEPTH, true, nullptr);

printer::inst()->print_msg(LDEBUG, "Thread #%zu updated CryptonightR", ctx->deviceIdx);
}
Expand Down
84 changes: 12 additions & 72 deletions xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
Original file line number Diff line number Diff line change
Expand Up @@ -32,69 +32,6 @@ R"===(
#define cryptonight_conceal 14
#define cryptonight_v8_reversewaltz 17

/* For Mesa clover support */
#ifdef cl_clang_storage_class_specifiers
# pragma OPENCL EXTENSION cl_clang_storage_class_specifiers : enable
#endif

#ifdef cl_amd_media_ops
#pragma OPENCL EXTENSION cl_amd_media_ops : enable
#else
/* taken from https://www.khronos.org/registry/OpenCL/extensions/amd/cl_amd_media_ops.txt
* Build-in Function
* uintn amd_bitalign (uintn src0, uintn src1, uintn src2)
* Description
* dst.s0 = (uint) (((((ulong)src0.s0) << 32) | (ulong)src1.s0) >> (src2.s0 & 31))
* similar operation applied to other components of the vectors.
*
* The implemented function is modified because the last is in our case always a scalar.
* We can ignore the bitwise AND operation.
*/
inline uint2 amd_bitalign( const uint2 src0, const uint2 src1, const uint src2)
{
uint2 result;
result.s0 = (uint) (((((ulong)src0.s0) << 32) | (ulong)src1.s0) >> (src2));
result.s1 = (uint) (((((ulong)src0.s1) << 32) | (ulong)src1.s1) >> (src2));
return result;
}
#endif

#ifdef cl_amd_media_ops2
#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable
#else
/* taken from: https://www.khronos.org/registry/OpenCL/extensions/amd/cl_amd_media_ops2.txt
* Built-in Function:
* uintn amd_bfe (uintn src0, uintn src1, uintn src2)
* Description
* NOTE: operator >> below represent logical right shift
* offset = src1.s0 & 31;
* width = src2.s0 & 31;
* if width = 0
* dst.s0 = 0;
* else if (offset + width) < 32
* dst.s0 = (src0.s0 << (32 - offset - width)) >> (32 - width);
* else
* dst.s0 = src0.s0 >> offset;
* similar operation applied to other components of the vectors
*/
inline int amd_bfe(const uint src0, const uint offset, const uint width)
{
/* casts are removed because we can implement everything as uint
* int offset = src1;
* int width = src2;
* remove check for edge case, this function is always called with
* `width==8`
* @code
* if ( width == 0 )
* return 0;
* @endcode
*/
if ( (offset + width) < 32u )
return (src0 << (32u - offset - width)) >> (32u - width);

return src0 >> offset;
}
#endif

static const __constant ulong keccakf_rndc[24] =
{
Expand Down Expand Up @@ -128,6 +65,8 @@ static const __constant uchar sbox[256] =
0x8C, 0xA1, 0x89, 0x0D, 0xBF, 0xE6, 0x42, 0x68, 0x41, 0x99, 0x2D, 0x0F, 0xB0, 0x54, 0xBB, 0x16
};

//#include "opencl/wolf-aes.cl"
XMRSTAK_INCLUDE_WOLF_AES

void keccakf1600(ulong *s)
{
Expand Down Expand Up @@ -355,8 +294,6 @@ inline uint getIdx()
XMRSTAK_INCLUDE_FAST_INT_MATH_V2
//#include "fast_div_heavy.cl"
XMRSTAK_INCLUDE_FAST_DIV_HEAVY
//#include "opencl/wolf-aes.cl"
XMRSTAK_INCLUDE_WOLF_AES
//#include "opencl/wolf-skein.cl"
XMRSTAK_INCLUDE_WOLF_SKEIN
//#include "opencl/jh.cl"
Expand Down Expand Up @@ -461,8 +398,6 @@ void CNKeccak(ulong *output, ulong *input)

static const __constant uchar rcon[8] = { 0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40 };

#define BYTE(x, y) (amd_bfe((x), (y) << 3U, 8U))

#define SubWord(inw) ((sbox[BYTE(inw, 3)] << 24) | (sbox[BYTE(inw, 2)] << 16) | (sbox[BYTE(inw, 1)] << 8) | sbox[BYTE(inw, 0)])

void AESExpandKey256(uint *keybuf)
Expand Down Expand Up @@ -539,6 +474,11 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad,
State[8] = input[8];
State[9] = input[9];
State[10] = input[10];
State[11] = input[11];
State[12] = input[12];
State[13] = input[13];
State[14] = input[14];
State[15] = input[15];

((__local uint *)State)[9] &= 0x00FFFFFFU;
((__local uint *)State)[9] |= (((uint)get_global_id(0)) & 0xFF) << 24;
Expand All @@ -550,13 +490,13 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad,
*/
((__local uint *)State)[10] |= (((uint)get_global_id(0) >> 8));

for (int i = 11; i < 25; ++i) {
State[i] = 0x00UL;
}

// Last bit of padding
State[16] = 0x8000000000000000UL;

for (int i = 17; i < 25; ++i) {
State[i] = 0x00UL;
}

keccakf1600_2(State);

#pragma unroll
Expand Down Expand Up @@ -1361,7 +1301,7 @@ __kernel void Groestl(__global ulong *states, __global uint *BranchBuf, __global
states += 25 * BranchBuf[idx];

ulong State[8] = { 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0x0001000000000000UL };
#if defined(__clang__) && !defined(__NV_CL_C_VERSION)
#if defined(__clang__) && !defined(__NV_CL_C_VERSION) && (IS_WINDOWS_OS != 1)
// on ROCM we need volatile for AMD RX5xx cards to avoid invalid shares
volatile
#endif
Expand Down
64 changes: 64 additions & 0 deletions xmrstak/backend/amd/amd_gpu/opencl/wolf-aes.cl
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,70 @@ R"===(
#ifndef WOLF_AES_CL
#define WOLF_AES_CL

/* For Mesa clover support */
#ifdef cl_clang_storage_class_specifiers
# pragma OPENCL EXTENSION cl_clang_storage_class_specifiers : enable
#endif

#ifdef cl_amd_media_ops
#pragma OPENCL EXTENSION cl_amd_media_ops : enable
#else
/* taken from https://www.khronos.org/registry/OpenCL/extensions/amd/cl_amd_media_ops.txt
* Build-in Function
* uintn amd_bitalign (uintn src0, uintn src1, uintn src2)
* Description
* dst.s0 = (uint) (((((ulong)src0.s0) << 32) | (ulong)src1.s0) >> (src2.s0 & 31))
* similar operation applied to other components of the vectors.
*
* The implemented function is modified because the last is in our case always a scalar.
* We can ignore the bitwise AND operation.
*/
inline uint2 amd_bitalign( const uint2 src0, const uint2 src1, const uint src2)
{
uint2 result;
result.s0 = (uint) (((((ulong)src0.s0) << 32) | (ulong)src1.s0) >> (src2));
result.s1 = (uint) (((((ulong)src0.s1) << 32) | (ulong)src1.s1) >> (src2));
return result;
}
#endif

#ifdef cl_amd_media_ops2
#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable
#else
/* taken from: https://www.khronos.org/registry/OpenCL/extensions/amd/cl_amd_media_ops2.txt
* Built-in Function:
* uintn amd_bfe (uintn src0, uintn src1, uintn src2)
* Description
* NOTE: operator >> below represent logical right shift
* offset = src1.s0 & 31;
* width = src2.s0 & 31;
* if width = 0
* dst.s0 = 0;
* else if (offset + width) < 32
* dst.s0 = (src0.s0 << (32 - offset - width)) >> (32 - width);
* else
* dst.s0 = src0.s0 >> offset;
* similar operation applied to other components of the vectors
*/
inline int amd_bfe(const uint src0, const uint offset, const uint width)
{
/* casts are removed because we can implement everything as uint
* int offset = src1;
* int width = src2;
* remove check for edge case, this function is always called with
* `width==8`
* @code
* if ( width == 0 )
* return 0;
* @endcode
*/
if ( (offset + width) < 32u )
return (src0 << (32u - offset - width)) >> (32u - width);

return src0 >> offset;
}
#endif

// AES table - the other three are generated on the fly

static const __constant uint AES0_C[256] =
Expand Down
4 changes: 2 additions & 2 deletions xmrstak/backend/amd/autoAdjust.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -187,8 +187,8 @@ class autoAdjust
memPerThread = std::min(memPerThread, memDoubleThread);
}

// 224byte extra memory is used per thread for meta data
size_t perThread = hashMemSize + 224u;
// 240byte extra memory is used per thread for meta data
size_t perThread = hashMemSize + 240u;
size_t maxIntensity = memPerThread / perThread;
size_t possibleIntensity = std::min( maxThreads , maxIntensity );
// map intensity to a multiple of the compute unit count, 8 is the number of threads per work group
Expand Down
2 changes: 1 addition & 1 deletion xmrstak/backend/amd/minethd.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -273,7 +273,7 @@ void minethd::work_main()

for(size_t i = 0; i < results[0xFF]; i++)
{
uint8_t bWorkBlob[112];
uint8_t bWorkBlob[128];
uint8_t bResult[32];

memcpy(bWorkBlob, oWork.bWorkBlob, oWork.iWorkSize);
Expand Down
Loading

0 comments on commit ef276cf

Please sign in to comment.