diff --git a/Makefile b/Makefile index db6ab346..5d2d3bf0 100644 --- a/Makefile +++ b/Makefile @@ -22,4 +22,11 @@ svm-scale: svm-scale.c svm.o: svm.cpp svm.h $(CXX) $(CFLAGS) -c svm.cpp clean: - rm -f *~ svm.o svm-train svm-predict svm-scale libsvm.so.$(SHVER) + rm -f *~ svm.o svm-train svm-predict svm-scale libsvm.so.$(SHVER) svm-train-cuda svm-train.o svm-cuda.o + +svm-train.o: svm-train.c + $(CXX) $(CFLAGS) -c svm-train.c +svm-cuda.o: svm.cpp + nvcc -Xcompiler -fopenmp -O3 -x cu -DSVM_CUDA -c svm.cpp -o svm-cuda.o +svm-train-cuda: svm-train.o svm-cuda.o + $(CXX) $(CFLAGS) svm-train.o svm-cuda.o -o svm-train-cuda -lm -lcusparse -lcublas -lcudart -fopenmp diff --git a/README b/README index dce2c1f1..19b4a65b 100644 --- a/README +++ b/README @@ -1,3 +1,17 @@ +How to build: +make clean; make svm-train-cuda + +How to use: +run svm-train-cuda instead of svm-train, with same command line arguments of svm-train + +tips +try add -h 0 as arguments of svm-train-cuda, it may faster than default one(-h 1) + + +/////////////////////////////////////////////////////////////////////////////////////// + + + Libsvm is a simple, easy-to-use, and efficient software for SVM classification and regression. It solves C-SVM classification, nu-SVM classification, one-class-SVM, epsilon-SVM regression, and nu-SVM diff --git a/svm.cpp b/svm.cpp index fa44818b..ba56a47b 100644 --- a/svm.cpp +++ b/svm.cpp @@ -8,6 +8,16 @@ #include #include #include "svm.h" +#ifdef SVM_CUDA +#include +#include +#include +#include +#include +#include +#include +#endif + int libsvm_version = LIBSVM_VERSION; typedef float Qfloat; typedef signed char schar; @@ -196,28 +206,36 @@ class QMatrix { virtual Qfloat *get_Q(int column, int len) const = 0; virtual double *get_QD() const = 0; virtual void swap_index(int i, int j) const = 0; + virtual void done_shrinking() const = 0; virtual ~QMatrix() {} }; -class Kernel: public QMatrix { +class KernelBase: public QMatrix { public: - Kernel(int l, svm_node * const * x, const svm_parameter& param); - virtual ~Kernel(); + KernelBase(int l, svm_node * const * x, const svm_parameter& param); + virtual ~KernelBase(); + static void batch_k_function(const svm_node *x, const svm_model* model, double* out); static double k_function(const svm_node *x, const svm_node *y, const svm_parameter& param); virtual Qfloat *get_Q(int column, int len) const = 0; virtual double *get_QD() const = 0; - virtual void swap_index(int i, int j) const // no so const... + virtual void swap_index(int i, int j) const // no so const... 
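
// The split of Kernel into KernelBase with a virtual batch_kernel_function
// hook is what lets the CUDA path fill a whole cached row of Q at once:
// SVC_Q, ONE_CLASS_Q and SVR_Q below now request the range [start, end) in a
// single call, and CudaKernel overrides the hook to compute that range on the
// GPU. A minimal sketch of the CPU fallback this reduces to (the explicit
// loop bounds here are an assumption spelled out for clarity):
virtual void batch_kernel_function(int i, Qfloat *Q, int start, int end) const
{
    for (int j = start; j < end; j++)
        Q[j] = (Qfloat)(this->*kernel_function)(i, j);
}
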
{ swap(x[i],x[j]); if(x_square) swap(x_square[i],x_square[j]); } + virtual void done_shrinking() const {} protected: - double (Kernel::*kernel_function)(int i, int j) const; + double (KernelBase::*kernel_function)(int i, int j) const; + virtual void batch_kernel_function(int i, Qfloat *Q, int start, int end) const + { + for(int j=start;j*kernel_function)(i,j); + } -private: +protected: const svm_node **x; double *x_square; @@ -250,26 +268,26 @@ class Kernel: public QMatrix { } }; -Kernel::Kernel(int l, svm_node * const * x_, const svm_parameter& param) +KernelBase::KernelBase(int l, svm_node * const * x_, const svm_parameter& param) :kernel_type(param.kernel_type), degree(param.degree), gamma(param.gamma), coef0(param.coef0) { switch(kernel_type) { case LINEAR: - kernel_function = &Kernel::kernel_linear; + kernel_function = &KernelBase::kernel_linear; break; case POLY: - kernel_function = &Kernel::kernel_poly; + kernel_function = &KernelBase::kernel_poly; break; case RBF: - kernel_function = &Kernel::kernel_rbf; + kernel_function = &KernelBase::kernel_rbf; break; case SIGMOID: - kernel_function = &Kernel::kernel_sigmoid; + kernel_function = &KernelBase::kernel_sigmoid; break; case PRECOMPUTED: - kernel_function = &Kernel::kernel_precomputed; + kernel_function = &KernelBase::kernel_precomputed; break; } @@ -285,13 +303,13 @@ Kernel::Kernel(int l, svm_node * const * x_, const svm_parameter& param) x_square = 0; } -Kernel::~Kernel() +KernelBase::~KernelBase() { delete[] x; delete[] x_square; } -double Kernel::dot(const svm_node *px, const svm_node *py) +double KernelBase::dot(const svm_node *px, const svm_node *py) { double sum = 0; while(px->index != -1 && py->index != -1) @@ -313,7 +331,16 @@ double Kernel::dot(const svm_node *px, const svm_node *py) return sum; } -double Kernel::k_function(const svm_node *x, const svm_node *y, +void KernelBase::batch_k_function(const svm_node *x, const svm_model* model, double* out) { + svm_node **SV = model->SV; + const svm_parameter& param = model->param; + int i; + for(i=0; il; i++ ) { + out[i] = k_function(x,SV[i],param); + } +} + +double KernelBase::k_function(const svm_node *x, const svm_node *y, const svm_parameter& param) { switch(param.kernel_type) @@ -360,7 +387,6 @@ double Kernel::k_function(const svm_node *x, const svm_node *y, sum += y->value * y->value; ++y; } - return exp(-param.gamma*sum); } case SIGMOID: @@ -372,6 +398,451 @@ double Kernel::k_function(const svm_node *x, const svm_node *y, } } +#ifdef SVM_CUDA +#define CUDA_CHECK(error) { \ + if (error!=cudaSuccess){ \ + std::cerr<<"CUDA ERROR "<< cudaGetErrorName(error) << " in file " << __FILE__ << " line " <<__LINE__<< std::endl; \ + exit(0); \ + } \ +} + +class CudaKernel : public KernelBase { +public: + CudaKernel(int l, svm_node * const * x, const svm_parameter& param); + virtual ~CudaKernel(); + virtual void done_shrinking() const; + static void batch_k_function(const svm_node *x, const svm_model* model, double* out); +protected: + virtual void batch_kernel_function(int i, Qfloat *Q, int start, int end) const; +private: + void x2csrMatrix(int sizeX) const; //convert svm_node** x -> csr format matrix +private: + int m_max_index; // max column of x matrix + std::vector m_csrRowPtr; // csr row ptr of x matrix + int* d_csrColIdx; // csr col index of x matrix + double* d_csrVal; // csr val vector of x matrix + double* d_squareX; // a gpu copy of x_square + int m_sizeX; // size of x + int m_nnz; // number of non-zero value of x matrix + + cusparseHandle_t m_cusparse; + cusparseMatDescr_t 
m_descr; + cudaStream_t m_stream; + double* d_vectorX; + double* d_vectorY; + int* d_csrRowPtrA; + double* h_vectorX; + double* h_vectorY; + int* h_csrRowPtrA; + char* d_defaultBuf; + static int defaultBufSize; +}; + +int CudaKernel::defaultBufSize = 10240; +CudaKernel::CudaKernel(int l, svm_node * const * x, const svm_parameter& param) : KernelBase(l, x, param), + m_max_index(0), m_csrRowPtr(l+1), + d_csrColIdx(NULL), d_csrVal(NULL), d_squareX(NULL), m_sizeX(l), m_nnz(0), + m_cusparse(NULL), m_descr(NULL), m_stream(NULL), d_vectorX(NULL), d_vectorY(NULL), + d_csrRowPtrA(NULL), h_vectorX(NULL), h_vectorY(NULL), h_csrRowPtrA(NULL), + d_defaultBuf(NULL) { + // get Num-non-zero and max_index + m_max_index = 0; + const svm_node *node; + for( int i=0; iindex != -1) { + if( node->index > m_max_index ) + m_max_index = node->index; + m_nnz++; + node++; + } + } + m_max_index++; + + CUDA_CHECK(cudaStreamCreate(&m_stream)); + cusparseStatus_t cusparseResult = cusparseCreate(&m_cusparse); + assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); + cusparseResult = cusparseCreateMatDescr(&m_descr); + assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); + cusparseSetMatIndexBase(m_descr,CUSPARSE_INDEX_BASE_ZERO); + cusparseSetMatType(m_descr, CUSPARSE_MATRIX_TYPE_GENERAL); + cusparseSetPointerMode(m_cusparse, CUSPARSE_POINTER_MODE_HOST); + + CUDA_CHECK(cudaMalloc((void**)&d_vectorX, sizeof(double)*m_max_index)); + CUDA_CHECK(cudaMalloc((void**)&d_vectorY, sizeof(double)*l)); + CUDA_CHECK(cudaMalloc((void**)&d_csrRowPtrA, sizeof(int)*(l+1))); + CUDA_CHECK(cudaMallocHost((void**)&h_vectorX, sizeof(double)*m_max_index)); + CUDA_CHECK(cudaMallocHost((void**)&h_vectorY, sizeof(double)*l)); + CUDA_CHECK(cudaMallocHost((void**)&h_csrRowPtrA, sizeof(int)*(l+1))); + CUDA_CHECK(cudaMalloc((void**)&d_defaultBuf, defaultBufSize)); + CUDA_CHECK(cudaMalloc((void**)&d_csrColIdx, sizeof(int)*m_nnz)); + CUDA_CHECK(cudaMalloc((void**)&d_csrVal, sizeof(double)*m_nnz)); + CUDA_CHECK(cudaMalloc((void**)&d_squareX, sizeof(double)*l)); + + x2csrMatrix(l); +} + +void CudaKernel::x2csrMatrix(int sizeX) const { + // alloc host memory and fill up + std::vector csrVal(m_nnz); + std::vector csrColIdx(m_nnz); + int curPos = 0; + const svm_node *node; + int* csrRowPtr = (int*)m_csrRowPtr.data(); + for( int i=0; iindex != -1) { + csrVal[curPos] = node->value; + csrColIdx[curPos] = node->index; + curPos++; + node++; + } + } + csrRowPtr[sizeX] = curPos; + + // copy csrVal, csrColIdx to device + CUDA_CHECK(cudaMemcpy(d_csrColIdx, csrColIdx.data(), sizeof(int)*m_nnz, cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_csrVal, csrVal.data(), sizeof(double)*m_nnz, cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_squareX, x_square, sizeof(double)*sizeX, cudaMemcpyHostToDevice)); +} + +CudaKernel::~CudaKernel() { + CUDA_CHECK(cudaFree(d_csrColIdx)); + CUDA_CHECK(cudaFree(d_csrVal)); + CUDA_CHECK(cudaFree(d_squareX)); + CUDA_CHECK(cudaFree(d_vectorX)); + CUDA_CHECK(cudaFree(d_vectorY)); + CUDA_CHECK(cudaFree(d_csrRowPtrA)); + CUDA_CHECK(cudaFree(d_defaultBuf)); + CUDA_CHECK(cudaFreeHost(h_vectorX)); + CUDA_CHECK(cudaFreeHost(h_vectorY)); + CUDA_CHECK(cudaFreeHost(h_csrRowPtrA)); + cusparseDestroyMatDescr(m_descr); + cusparseDestroy(m_cusparse); + CUDA_CHECK(cudaStreamDestroy(m_stream)); +} + +void CudaKernel::done_shrinking() const { + KernelBase::done_shrinking(); + x2csrMatrix(m_sizeX); +} + +__global__ void dot2kernelValue(int kernel_type, int len, double gamma, double coef0, double degree, double squareX, double* d_squareA, int start, double* 
Vector) { + int idx = blockDim.x*blockIdx.x + threadIdx.x; + if( idx >= len ) + return; + + // Vector input dot(), output kernel Value + switch(kernel_type) { + default: + case LINEAR: // do nothing + break; + case POLY: + Vector[idx] = pow(gamma*Vector[idx]+coef0,degree); + break; + case RBF: + Vector[idx] = exp(-gamma*(squareX+d_squareA[start+idx]-2*Vector[idx])); + break; + case SIGMOID: + Vector[idx] = tanh(gamma*Vector[idx]+coef0); + break; + } +} + +void CudaKernel::batch_kernel_function(int i, Qfloat *Q, int start, int end) const { + int len = end-start; + assert(len<=m_sizeX); + + // get csrValA, csrColIndA, nnz ready for cusparseCsrmvEx + double* csrValA = d_csrVal + m_csrRowPtr[start]; + int* csrColIndA = d_csrColIdx + m_csrRowPtr[start]; + int nnz = m_csrRowPtr[len+start] - m_csrRowPtr[start]; + + // get csrRowPtrA ready for cusparseCsrmvEx + for( int idx=0; idx<=len; idx++ ) { + h_csrRowPtrA[idx] = m_csrRowPtr[idx+start] - m_csrRowPtr[start]; + } + CUDA_CHECK(cudaMemcpyAsync(d_csrRowPtrA, h_csrRowPtrA, sizeof(int)*(len+1), cudaMemcpyHostToDevice, m_stream)); + + // get vector x ready for cusparseCsrmvEx + memset(h_vectorX, 0, sizeof(double)*m_max_index); + const svm_node* node = x[i]; + while(node->index != -1) { + h_vectorX[node->index] = node->value; + node++; + } + CUDA_CHECK(cudaMemcpyAsync(d_vectorX, h_vectorX, sizeof(double)*m_max_index, cudaMemcpyHostToDevice, m_stream)); + + // get temp buffer size and get it ready + // https://docs.nvidia.com/cuda/cusparse/index.html#asynchronous-execution + size_t buffer_size = 0; + static const double h_zero = 0.0; + static const double h_one = 1.0; + cusparseSetStream(m_cusparse, m_stream); + cusparseStatus_t cusparseResult = cusparseCsrmvEx_bufferSize(m_cusparse, + CUSPARSE_ALG0, + CUSPARSE_OPERATION_NON_TRANSPOSE, + len, + m_max_index, + nnz, + &h_one, CUDA_R_64F, + m_descr, + csrValA, CUDA_R_64F, + d_csrRowPtrA, + csrColIndA, + d_vectorX, CUDA_R_64F, + &h_zero, CUDA_R_64F, + d_vectorY, CUDA_R_64F, + CUDA_R_64F, + &buffer_size); + assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); + void* alloc_buffer = NULL; + void* buffer = NULL; + if( buffer_size > defaultBufSize ) { + CUDA_CHECK(cudaMalloc((void**)&alloc_buffer, buffer_size)); + buffer = alloc_buffer; + } else + buffer = d_defaultBuf; + + // call cusparseCsrmvEx + cusparseSetStream(m_cusparse, m_stream); + cusparseResult = cusparseCsrmvEx(m_cusparse, + CUSPARSE_ALG0, + CUSPARSE_OPERATION_NON_TRANSPOSE, + len, + m_max_index, + nnz, + &h_one, CUDA_R_64F, + m_descr, + csrValA, CUDA_R_64F, + d_csrRowPtrA, + csrColIndA, + d_vectorX, CUDA_R_64F, + &h_zero, CUDA_R_64F, + d_vectorY, CUDA_R_64F, + CUDA_R_64F, + buffer); + assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); + + // update out base on kernel_type + const int threadPerBlock = 256; + int blockPerGrid = (len+threadPerBlock-1)/threadPerBlock; + dot2kernelValue<<>>(kernel_type, len, gamma, coef0, + degree, x_square[i], d_squareX, start, d_vectorY); + + // copy d_vectorY back to host + CUDA_CHECK(cudaMemcpyAsync(h_vectorY, d_vectorY, sizeof(double)*len, cudaMemcpyDeviceToHost, m_stream)); + CUDA_CHECK(cudaStreamSynchronize(m_stream)); + for( int idx=0; idx Qbase(start+len, 0.0); + KernelBase::batch_kernel_function(i, Qbase.data(), start, end); + for( int idx=0; idx0.01 ) { + printf("%f <-> %f(%d, %d)\n", Q[start+idx], Qbase[start+idx], i, start+idx); + } + }*/ +} + +void svm_model::cuda_init() { + nMaxIdxSV = 0; + nnzSV = 0; + const svm_node *node; + for( int i=0; iindex != -1) { + if( node->index > nMaxIdxSV ) + nMaxIdxSV = 
node->index; + nnzSV++; + node++; + } + } + nMaxIdxSV++; + + // get csrVal csrColIdx ready + std::vector csrRowPtr(l+1); + std::vector csrVal(nnzSV); + std::vector csrValSquare(nnzSV); + std::vector csrColIdx(nnzSV); + std::vector hSVSquare(l); + int curPos = 0; + for( int i=0; iindex != -1) { + csrVal[curPos] = node->value; + csrValSquare[curPos] = csrVal[curPos]*csrVal[curPos]; + csrColIdx[curPos] = node->index; + hSVSquare[i] += node->value*node->value; + curPos++; + node++; + } + } + csrRowPtr[l] = curPos; + + // copy csrVal, csrColIdx to device + csrColIdxSV = NULL; + csrValSV = NULL; + csrValSVSquare = NULL; + csrRowPtrSV = NULL; + SVSquare = NULL; + CUDA_CHECK(cudaMalloc((void**)&csrRowPtrSV, sizeof(int)*(l+1))); + CUDA_CHECK(cudaMalloc((void**)&csrColIdxSV, sizeof(int)*nnzSV)); + CUDA_CHECK(cudaMalloc((void**)&csrValSV, sizeof(double)*nnzSV)); + CUDA_CHECK(cudaMalloc((void**)&csrValSVSquare, sizeof(double)*nnzSV)); + CUDA_CHECK(cudaMalloc((void**)&SVSquare, sizeof(double)*l)); + CUDA_CHECK(cudaMemcpy(csrRowPtrSV, csrRowPtr.data(), sizeof(int)*(l+1), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(csrColIdxSV, csrColIdx.data(), sizeof(int)*nnzSV, cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(csrValSV, csrVal.data(), sizeof(double)*nnzSV, cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(csrValSVSquare, csrValSquare.data(), sizeof(double)*nnzSV, cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(SVSquare, hSVSquare.data(), sizeof(double)*l, cudaMemcpyHostToDevice)); +} + +void svm_model::cuda_free() { + CUDA_CHECK(cudaFree(csrRowPtrSV)); + CUDA_CHECK(cudaFree(csrColIdxSV)); + CUDA_CHECK(cudaFree(csrValSV)); + CUDA_CHECK(cudaFree(csrValSVSquare)); + CUDA_CHECK(cudaFree(SVSquare)); +} + +void CudaKernel::batch_k_function(const svm_node *x, const svm_model* model, double* out) { + const svm_parameter& param = model->param; + switch(param.kernel_type) + { + case LINEAR: + case POLY: + case SIGMOID: + case RBF: + { + cusparseHandle_t cusparse = NULL; + cusparseMatDescr_t descrSV = NULL; + cudaStream_t stream = NULL; + static const double one = 1.0; + static const double zero = 0.0; + size_t buffer_size; + + CUDA_CHECK(cudaStreamCreate(&stream)); + cusparseStatus_t cusparseResult = cusparseCreate(&cusparse); + assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); + cusparseResult = cusparseCreateMatDescr(&descrSV); + assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); + cusparseSetMatIndexBase(descrSV,CUSPARSE_INDEX_BASE_ZERO); + cusparseSetMatType(descrSV, CUSPARSE_MATRIX_TYPE_GENERAL); + cusparseSetPointerMode(cusparse, CUSPARSE_POINTER_MODE_HOST); + + const svm_node *node = x; + int x_nnz = 0; + double squareX = 0.0; + while(node->index != -1) { + x_nnz++; + squareX += node->value * node->value; + node++; + } + + // get dense vector x ready + double* d_vector = NULL; + CUDA_CHECK(cudaMalloc((void**)&d_vector, sizeof(double)*(model->nMaxIdxSV+model->l))); + double* d_vectorX = d_vector; + double* d_vectorY = d_vector + model->nMaxIdxSV; + double* h_vectors = NULL; + CUDA_CHECK(cudaMallocHost((void**)&h_vectors, sizeof(double)*(model->nMaxIdxSV+model->l))); + double* vectorX = h_vectors; + double* vectorY = vectorX + model->nMaxIdxSV; + + memset(vectorX, 0, model->nMaxIdxSV); + node = x; + while(node->index != -1) { + vectorX[node->index] = node->value; + node++; + } + CUDA_CHECK(cudaMemcpyAsync(d_vectorX, vectorX, sizeof(double)*model->nMaxIdxSV, cudaMemcpyHostToDevice, stream)); + + // get buffer size + cusparseSetStream(cusparse, stream); + cusparseResult = 
cusparseCsrmvEx_bufferSize(cusparse, + CUSPARSE_ALG0, + CUSPARSE_OPERATION_NON_TRANSPOSE, + model->l, model->nMaxIdxSV, + model->nnzSV, &one, CUDA_R_64F, + descrSV, + model->csrValSV, CUDA_R_64F, + model->csrRowPtrSV, model->csrColIdxSV, + d_vectorX, CUDA_R_64F, + &zero, CUDA_R_64F, + d_vectorY, CUDA_R_64F, + CUDA_R_64F, &buffer_size); + assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); + char* buffer = NULL; + CUDA_CHECK(cudaMalloc((void**)&buffer, buffer_size)); + + // y = dot(SV, x) + cusparseSetStream(cusparse, stream); + cusparseResult = cusparseCsrmvEx(cusparse, + CUSPARSE_ALG0, + CUSPARSE_OPERATION_NON_TRANSPOSE, + model->l, model->nMaxIdxSV, + model->nnzSV, + &one, CUDA_R_64F, + descrSV, + model->csrValSV, CUDA_R_64F, + model->csrRowPtrSV, model->csrColIdxSV, + d_vectorX, CUDA_R_64F, + &zero, CUDA_R_64F, + d_vectorY, CUDA_R_64F, + CUDA_R_64F, buffer); + assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); + + // update out base on kernel_type + const int threadPerBlock = 256; + int blockPerGrid = (model->l+threadPerBlock-1)/threadPerBlock; + dot2kernelValue<<>>(param.kernel_type, model->l, param.gamma, param.coef0, + param.degree, squareX, model->SVSquare, 0, d_vectorY); + + // copy vectorY back to host + CUDA_CHECK(cudaMemcpyAsync(vectorY, d_vectorY, sizeof(double)*model->l, cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + memcpy(out, vectorY, sizeof(double)*model->l); + + CUDA_CHECK(cudaFree(d_vector)); + CUDA_CHECK(cudaFreeHost(h_vectors)); + CUDA_CHECK(cudaFree(buffer)); + cusparseDestroyMatDescr(descrSV); + cusparseDestroy(cusparse); + CUDA_CHECK(cudaStreamDestroy(stream)); + } + break; + default: + printf("unsupport kernel type\n"); + exit(0); + break; + } + + // debug, compare with base class result from cpu + /*std::vector Outbase(model->l, 0.0); + KernelBase::batch_k_function(x, model, Outbase.data()); + for( int idx=0; idxl; idx++ ) { + if( fabs(Outbase[idx]-out[idx])>0.01 ) + printf("batch_k_function %f <-> %f %d\n", Outbase[idx], out[idx], idx); + }*/ +} + +#endif + +#ifdef SVM_CUDA +typedef CudaKernel Kernel; +#else +typedef KernelBase Kernel; +#endif + // An SMO algorithm in Fan et al., JMLR 6(2005), p. 
1889--1918 // Solves: // @@ -568,7 +1039,10 @@ void Solver::Solve(int l, const QMatrix& Q, const double *p_, const schar *y_, if(--counter == 0) { counter = min(l,1000); - if(shrinking) do_shrinking(); + if(shrinking) { + do_shrinking(); + Q.done_shrinking(); + } info("."); } @@ -691,6 +1165,9 @@ void Solver::Solve(int l, const QMatrix& Q, const double *p_, const schar *y_, double delta_alpha_i = alpha[i] - old_alpha_i; double delta_alpha_j = alpha[j] - old_alpha_j; +#ifdef SVM_CUDA +#pragma omp parallel for +#endif for(int k=0;k= Gmax) + if(-G[t] >= tGmax) { - Gmax = -G[t]; - Gmax_idx = t; + tGmax = -G[t]; + tGmax_idx = t; } } else { if(!is_lower_bound(t)) - if(G[t] >= Gmax) + if(G[t] >= tGmax) + { + tGmax = G[t]; + tGmax_idx = t; + } + } +#ifdef SVM_CUDA +#pragma omp parallel + { + #pragma omp critical + { + if( tGmax>=Gmax ) + { + if( tGmax==Gmax ) + { + if( tGmax_idx>Gmax_idx ) + Gmax_idx = tGmax_idx; + } + else { - Gmax = G[t]; - Gmax_idx = t; + Gmax = tGmax; + Gmax_idx = tGmax_idx; } + } } + } +#endif int i = Gmax_idx; const Qfloat *Q_i = NULL; if(i != -1) // NULL Q_i not accessed: Gmax=-INF if i=-1 Q_i = Q->get_Q(i,active_size); + tGmax2 = -INF; + tGmin_idx = -1; + tobj_diff_min = INF; +#ifdef SVM_CUDA +#pragma omp parallel for copyin(tGmax2, tGmin_idx, tobj_diff_min) +#endif for(int j=0;j= Gmax2) - Gmax2 = G[j]; + if (G[j] >= tGmax2) + tGmax2 = G[j]; if (grad_diff > 0) { double obj_diff; @@ -840,10 +1373,10 @@ int Solver::select_working_set(int &out_i, int &out_j) else obj_diff = -(grad_diff*grad_diff)/TAU; - if (obj_diff <= obj_diff_min) + if (obj_diff <= tobj_diff_min) { - Gmin_idx=j; - obj_diff_min = obj_diff; + tGmin_idx=j; + tobj_diff_min = obj_diff; } } } @@ -853,8 +1386,8 @@ int Solver::select_working_set(int &out_i, int &out_j) if (!is_upper_bound(j)) { double grad_diff= Gmax-G[j]; - if (-G[j] >= Gmax2) - Gmax2 = -G[j]; + if (-G[j] >= tGmax2) + tGmax2 = -G[j]; if (grad_diff > 0) { double obj_diff; @@ -864,15 +1397,37 @@ int Solver::select_working_set(int &out_i, int &out_j) else obj_diff = -(grad_diff*grad_diff)/TAU; - if (obj_diff <= obj_diff_min) + if (obj_diff <= tobj_diff_min) { - Gmin_idx=j; - obj_diff_min = obj_diff; + tGmin_idx=j; + tobj_diff_min = obj_diff; } } } } } +#ifdef SVM_CUDA +#pragma omp parallel + { + #pragma omp critical + { + if(tGmax2>Gmax2 ) + Gmax2 = tGmax2; + if( tobj_diff_min <= obj_diff_min ) + { + if( tobj_diff_min Gmin_idx ) + { + Gmin_idx = tGmin_idx; + } + } + } + } +#endif if(Gmax+Gmax2 < eps || Gmin_idx == -1) return 1; @@ -1282,8 +1837,9 @@ class SVC_Q: public Kernel int start, j; if((start = cache->get_data(i,&data,len)) < len) { + batch_kernel_function(i,data,start,len); for(j=start;j*kernel_function)(i,j)); + data[j] *= (Qfloat)(y[i]*y[j]); } return data; } @@ -1328,11 +1884,10 @@ class ONE_CLASS_Q: public Kernel Qfloat *get_Q(int i, int len) const { Qfloat *data; - int start, j; + int start; if((start = cache->get_data(i,&data,len)) < len) { - for(j=start;j*kernel_function)(i,j); + batch_kernel_function(i,data,start,len); } return data; } @@ -1394,18 +1949,17 @@ class SVR_Q: public Kernel Qfloat *get_Q(int i, int len) const { Qfloat *data; - int j, real_i = index[i]; + int real_i = index[i]; if(cache->get_data(real_i,&data,l) < l) { - for(j=0;j*kernel_function)(real_i,j); + batch_kernel_function(real_i,data,0,l); } // reorder and copy Qfloat *buf = buffer[next_buffer]; next_buffer = 1 - next_buffer; schar si = sign[i]; - for(j=0;j=nr_class ) + continue; svm_problem sub_prob; int si = start[i], sj = start[j]; int ci = count[i], cj = 
count[j]; @@ -2213,8 +2769,8 @@ svm_model *svm_train(const svm_problem *prob, const svm_parameter *param) if(param->probability) svm_binary_svc_probability(&sub_prob,param,weighted_C[i],weighted_C[j],probA[p],probB[p]); - f[p] = svm_train_one(&sub_prob,param,weighted_C[i],weighted_C[j]); + for(k=0;k 0) nonzero[si+k] = true; @@ -2223,8 +2779,8 @@ svm_model *svm_train(const svm_problem *prob, const svm_parameter *param) nonzero[sj+k] = true; free(sub_prob.x); free(sub_prob.y); - ++p; } + } // build output @@ -2275,7 +2831,7 @@ svm_model *svm_train(const svm_problem *prob, const svm_parameter *param) model->l = total_sv; model->SV = Malloc(svm_node *,total_sv); model->sv_indices = Malloc(int,total_sv); - p = 0; + int p = 0; for(i=0;icuda_init(); +#endif return model; } @@ -2507,8 +3066,14 @@ double svm_predict_values(const svm_model *model, const svm_node *x, double* dec { double *sv_coef = model->sv_coef[0]; double sum = 0; + //for(i=0;il;i++) + // sum += sv_coef[i] * Kernel::k_function(x,model->SV[i],model->param); + double *kvalue = Malloc(double,model->l); + Kernel::batch_k_function(x,model,kvalue); for(i=0;il;i++) - sum += sv_coef[i] * Kernel::k_function(x,model->SV[i],model->param); + sum += sv_coef[i] * kvalue[i]; + free(kvalue); + sum -= model->rho[0]; *dec_values = sum; @@ -2523,8 +3088,9 @@ double svm_predict_values(const svm_model *model, const svm_node *x, double* dec int l = model->l; double *kvalue = Malloc(double,l); - for(i=0;iSV[i],model->param); + //for(i=0;iSV[i],model->param); + Kernel::batch_k_function(x,model,kvalue); int *start = Malloc(int,nr_class); start[0] = 0; @@ -2989,6 +3555,9 @@ svm_model *svm_load_model(const char *model_file_name) return NULL; model->free_sv = 1; // XXX +#ifdef SVM_CUDA + model->cuda_init(); +#endif return model; } @@ -3025,6 +3594,10 @@ void svm_free_model_content(svm_model* model_ptr) free(model_ptr->nSV); model_ptr->nSV = NULL; + +#ifdef SVM_CUDA + model_ptr->cuda_free(); +#endif } void svm_free_and_destroy_model(svm_model** model_ptr_ptr) diff --git a/svm.h b/svm.h index b0bf6ef0..0f546ccd 100644 --- a/svm.h +++ b/svm.h @@ -69,6 +69,18 @@ struct svm_model /* XXX */ int free_sv; /* 1 if svm_model is created by svm_load_model*/ /* 0 if svm_model is created by svm_train */ +#ifdef SVM_CUDA + void cuda_init(); + void cuda_free(); + + int nMaxIdxSV; /* max index of SV */ + int nnzSV; /* non-zero value count of SV */ + int* csrRowPtrSV; /* row index of matrix SV */ + int* csrColIdxSV; /* column index of matrix SV */ + double* csrValSV; /* values of matrix SV */ + double* csrValSVSquare; /* value^2 of matrix SV */ + double* SVSquare; /* dot(x,x) of matrix SV */ +#endif }; struct svm_model *svm_train(const struct svm_problem *prob, const struct svm_parameter *param);
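
How the CUDA path works, in outline:

Both CudaKernel::x2csrMatrix and svm_model::cuda_init flatten the libsvm sparse rows (svm_node arrays terminated by index == -1) into a CSR matrix so cuSPARSE can multiply against them on the device. A compact CPU-side sketch of that conversion, using simplified names (Csr, to_csr) that are not part of the patch:

#include <vector>

struct svm_node { int index; double value; };   // as declared in svm.h

struct Csr {
    std::vector<int>    row_ptr;   // size l+1, row i spans [row_ptr[i], row_ptr[i+1])
    std::vector<int>    col_idx;   // column index of each stored value
    std::vector<double> val;       // the stored values themselves
};

// Flatten l sparse rows (each terminated by index == -1) into CSR form,
// mirroring what x2csrMatrix / cuda_init build before copying to the GPU.
static Csr to_csr(const svm_node *const *x, int l)
{
    Csr m;
    m.row_ptr.resize(l + 1);
    for (int i = 0; i < l; i++) {
        m.row_ptr[i] = (int)m.val.size();
        for (const svm_node *p = x[i]; p->index != -1; p++) {
            m.col_idx.push_back(p->index);
            m.val.push_back(p->value);
        }
    }
    m.row_ptr[l] = (int)m.val.size();
    return m;
}
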
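With the rows in CSR form, one cusparseCsrmvEx call computes dot(x_i, x_j) for every j in the requested range as a single sparse matrix-vector product, and the dot2kernelValue kernel then turns each dot product into the kernel value in place. The RBF case relies on the same identity the CPU code uses, which is why x_square and SVSquare are precomputed: ||a - b||^2 = a.a + b.b - 2 a.b. A host-side sketch of that per-element mapping (the function name map_dot_to_kernel is illustrative, not from the patch):

#include <cmath>

enum { LINEAR, POLY, RBF, SIGMOID };   // kernel_type values in svm.h order (PRECOMPUTED omitted)

// Given dot = x.y plus the precomputed squared norms, produce K(x, y).
// This mirrors what the dot2kernelValue CUDA kernel does per element.
static double map_dot_to_kernel(int kernel_type, double dot,
                                double x_square, double y_square,
                                double gamma, double coef0, int degree)
{
    switch (kernel_type) {
    case POLY:    return pow(gamma * dot + coef0, degree);
    case RBF:     return exp(-gamma * (x_square + y_square - 2.0 * dot));
    case SIGMOID: return tanh(gamma * dot + coef0);
    case LINEAR:
    default:      return dot;   // linear kernel is the dot product itself
    }
}
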
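The working-set selection in Solver::select_working_set is parallelized with OpenMP by giving each thread its own running extrema (tGmax, tGmax_idx, tGmax2, tGmin_idx, tobj_diff_min) and merging the per-thread candidates inside a critical section afterwards; the copyin clause on the parallel loop implies those temporaries are declared threadprivate elsewhere in the patch. The same argmax-reduction shape in a self-contained form, with plain local variables instead of threadprivate globals, looks roughly like this:

// Find the index of the largest element of G, merging per-thread candidates
// in a critical section -- the same reduction shape used for Gmax/Gmax_idx.
static int argmax(const double *G, int n)
{
    double best = -1e300;
    int best_idx = -1;
    #pragma omp parallel
    {
        double tbest = -1e300;
        int tbest_idx = -1;
        #pragma omp for nowait
        for (int t = 0; t < n; t++)
            if (G[t] > tbest) { tbest = G[t]; tbest_idx = t; }
        #pragma omp critical
        if (tbest > best || (tbest == best && tbest_idx > best_idx)) {
            best = tbest;          // keep the larger value,
            best_idx = tbest_idx;  // break ties toward the larger index
        }
    }
    return best_idx;
}
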
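On the prediction side, svm_predict_values now fills all kernel values against the support vectors in one call to Kernel::batch_k_function (in the CUDA build this is again a single SpMV over the SV matrix) and then forms the usual decision value from them. A sketch of the two-class case this feeds, with batch_k_function treated as an opaque producer of kvalue[] and decision_value as an illustrative name only:

// Two-class decision value: f(x) = sum_i sv_coef[i] * K(x, SV[i]) - rho,
// where kvalue[] holds K(x, SV[i]) as produced by Kernel::batch_k_function.
static double decision_value(const double *sv_coef, const double *kvalue,
                             int l, double rho)
{
    double sum = 0.0;
    for (int i = 0; i < l; i++)
        sum += sv_coef[i] * kvalue[i];
    return sum - rho;
}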