Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Sleep before ReadBuffer to resolve Nvidia's busywait issue #60

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 75 additions & 5 deletions main.c
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include <unistd.h>
#include <getopt.h>
#include <errno.h>
#include <time.h>
#include <CL/cl.h>
#include "blake.h"
#include "_kernel.h"
Expand All @@ -31,6 +32,7 @@ uint64_t nr_nonces = 1;
uint32_t do_list_devices = 0;
uint32_t gpu_to_use = 0;
uint32_t mining = 0;
double kern_avg_run_time = 0;

typedef struct debug_s
{
Expand Down Expand Up @@ -113,6 +115,24 @@ void randomize(void *p, ssize_t l)
fatal("close %s: %s\n", fname, strerror(errno));
}


#define NSEC 1e-9
double timespec_to_double(struct timespec *t)
{
return ((double)t->tv_sec) + ((double) t->tv_nsec) * NSEC;
}

void double_to_timespec(double dt, struct timespec *t)
{
t->tv_sec = (long)dt;
t->tv_nsec = (long)((dt - t->tv_sec) / NSEC);
}

void get_time(struct timespec *t)
{
clock_gettime(CLOCK_MONOTONIC, t);
}

cl_mem check_clCreateBuffer(cl_context ctx, cl_mem_flags flags, size_t size,
void *host_ptr)
{
Expand Down Expand Up @@ -776,21 +796,61 @@ uint32_t verify_sol(sols_t *sols, unsigned sol_i)
*/
uint32_t verify_sols(cl_command_queue queue, cl_mem buf_sols, uint64_t *nonce,
uint8_t *header, size_t fixed_nonce_bytes, uint8_t *target,
char *job_id, uint32_t *shares)
char *job_id, uint32_t *shares, struct timespec *target_time)
{
sols_t *sols;
uint32_t nr_valid_sols;
sols = (sols_t *)malloc(sizeof (*sols));
if (!sols)
fatal("malloc: %s\n", strerror(errno));
fatal("malloc: %s\n", strerror(errno));

cl_event readEvent;
check_clEnqueueReadBuffer(queue, buf_sols,
CL_TRUE, // cl_bool blocking_read
CL_FALSE, // cl_bool blocking_read
0, // size_t offset
sizeof (*sols), // size_t size
sols, // void *ptr
0, // cl_uint num_events_in_wait_list
NULL, // cl_event *event_wait_list
NULL); // cl_event *event
&readEvent); // cl_event *event
clFlush(queue);

struct timespec start_time;
get_time(&start_time);

double dtarget = timespec_to_double(target_time);

cl_int readStatus;
clGetEventInfo(readEvent, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int),
&readStatus, NULL);
while(readStatus != CL_COMPLETE && SLEEP_SKIP_RATIO != 1)
{
struct timespec t;
get_time(&t);
double dt = timespec_to_double(&t);
double delta = dtarget - dt;
if (delta < 0)
break;

double_to_timespec(delta * SLEEP_RECHECK_RATIO, &t);
nanosleep(&t, NULL);

clGetEventInfo(readEvent, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int),
&readStatus, NULL);
}
clWaitForEvents(1, &readEvent);

struct timespec end_time;
get_time(&end_time);

double dstart, dend, delta;
dstart = timespec_to_double(&start_time);
dend = timespec_to_double(&end_time);

delta = dend - dstart;
kern_avg_run_time = kern_avg_run_time * 6.0 / 10.0 + delta * ((4.0 / 10.0));
kern_avg_run_time *= (1 - (double)SLEEP_SKIP_RATIO);

if (sols->nr > MAX_SOLS)
{
fprintf(stderr, "%d (probably invalid) solutions were dropped!\n",
Expand Down Expand Up @@ -904,10 +964,20 @@ uint32_t solve_equihash(cl_context ctx, cl_command_queue queue,
check_clSetKernelArg(k_sols, 3, &rowCounters[0]);
check_clSetKernelArg(k_sols, 4, &rowCounters[1]);
global_ws = NR_ROWS;

struct timespec start_time, target_time;
get_time(&start_time);

double dstart, dtarget = 0;
dstart = timespec_to_double(&start_time);
dtarget = dstart + kern_avg_run_time;
double_to_timespec(dtarget, &target_time);

check_clEnqueueNDRangeKernel(queue, k_sols, 1, NULL,
&global_ws, &local_work_size, 0, NULL, NULL);
clFlush(queue);
sol_found = verify_sols(queue, buf_sols, nonce_ptr, header,
fixed_nonce_bytes, target, job_id, shares);
fixed_nonce_bytes, target, job_id, shares, &target_time);
clReleaseMemObject(buf_blake_st);
return sol_found;
}
Expand Down
7 changes: 7 additions & 0 deletions param.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,13 @@
// Number of collision items to track, per thread
#define COLL_DATA_SIZE_PER_TH (NR_SLOTS * 5)


// Ratio of time of sleeping before rechecking if task is done (0-1)
#define SLEEP_RECHECK_RATIO 0.60
// Ratio of time to busy wait for the solution (0-1)
// The higher value the higher CPU usage with Nvidia
#define SLEEP_SKIP_RATIO 0.005

// Make hash tables OVERHEAD times larger than necessary to store the average
// number of elements per row. The ideal value is as small as possible to
// reduce memory usage, but not too small or else elements are dropped from the
Expand Down