Commit 8ffc2aa (develop)
Tyill committed Jul 26, 2019
1 parent af6a89d
Showing 14 changed files with 90 additions and 72 deletions.
2 changes: 1 addition & 1 deletion cpp/snOperator.h
@@ -89,7 +89,7 @@ namespace SN_API{
units(units_), act(act_), opt(opt_),
dropOut(dropOut_), bnorm(bnorm_), gpuDeviceId(gpuDeviceId_){};

FullyConnected(uint32_t units_, batchNormType bnorm_ = batchNormType::none) :
FullyConnected(uint32_t units_, batchNormType bnorm_) :
units(units_), bnorm(bnorm_){}

~FullyConnected(){};
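Note: a hedged reading of this header change (the reasoning is not stated in the commit). With the default argument = batchNormType::none on the two-parameter constructor, a one-argument call such as FullyConnected(10) in the mnist example below matched both constructors and was ambiguous; removing the default lets the single-argument call resolve to the act/opt constructor, while batch norm is now requested explicitly:

    sn::FullyConnected fc1(128);                           // one-argument call: resolves to the act/opt constructor
    sn::FullyConnected fc2(128, sn::batchNormType::none);  // explicit two-argument form, as used in the example below
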
12 changes: 6 additions & 6 deletions example/mnist/cpp_example.cpp
@@ -51,14 +51,14 @@ int main(int argc, char* argv[]){
sn::Net snet;

snet.addNode("Input", sn::Input(), "C1")
.addNode("C1", sn::Convolution(15, 3, 0, 1, sn::batchNormType::none, sn::active::relu), "C2")
.addNode("C2", sn::Convolution(15, 3, 0, 1, sn::batchNormType::none, sn::active::relu), "P1")
.addNode("C1", sn::Convolution(15, 3, 0, 1, sn::batchNormType::beforeActive, sn::active::relu), "C2")
.addNode("C2", sn::Convolution(15, 3, 0, 1, sn::batchNormType::beforeActive, sn::active::relu), "P1")
.addNode("P1", sn::Pooling(), "FC1")
.addNode("FC1", sn::FullyConnected(128), "FC2")
.addNode("FC1", sn::FullyConnected(128, sn::batchNormType::none), "FC2")
.addNode("FC2", sn::FullyConnected(10), "LS")
.addNode("LS", sn::LossFunction(sn::lossType::softMaxToCrossEntropy), "Output");

string imgPath = "c://cpp//other//skyNet//example//mnist//images//";
string imgPath = "c://cpp//skyNet//example//mnist//images//";

int batchSz = 100, classCnt = 10, w = 28, h = 28; float lr = 0.001F;
vector<vector<string>> imgName(classCnt);
@@ -81,7 +81,7 @@ int main(int argc, char* argv[]){
size_t sum_metric = 0;
size_t num_inst = 0;
float accuratSumm = 0;
for (int k = 0; k < 100; ++k){
for (int k = 0; k < 1000; ++k){

targetLayer.clear();

@@ -151,6 +151,6 @@ int main(int argc, char* argv[]){

snet.saveAllWeightToFile("c:\\cpp\\w.dat");

// system("pause");
system("pause");
return 0;
}
6 changes: 3 additions & 3 deletions example/resnet50/cpp_example.cpp
@@ -90,13 +90,13 @@ int main(int argc, char* argv[]){
// CMD: cd c:\cpp\other\skyNet\example\resnet50\
// CMD: python createNet.py

if (!snet.loadAllWeightFromFile("c:/cpp/other/skyNet/example/resnet50/resNet50Weights.dat")){
if (!snet.loadAllWeightFromFile("c:/cpp/skyNet/example/resnet50/resNet50Weights.dat")){
cout << "error loadAllWeightFromFile: " << snet.getLastErrorStr() << endl;
system("pause");
return -1;
}

string imgPath = "c:/cpp/other/skyNet/example/resnet50/images/elephant.jpg";
string imgPath = "c:/cpp/skyNet/example/resnet50/images/elephant.jpg";

int classCnt = 1000, w = 224, h = 224;

@@ -151,6 +151,6 @@ int main(int argc, char* argv[]){
cout << "inx " << maxInx << " accurate " << refOutput[maxInx] << snet.getLastErrorStr() << endl;
}

// system("pause");
system("pause");
return 0;
}
4 changes: 2 additions & 2 deletions src/snOperatorCPU/src/batchNormFunctions.cpp
@@ -102,8 +102,8 @@ void layerBatchNorm(bool fwBw, bool isLern, const snSize& insz, snFloat* in, snF
}
}
else{ // isLerning
if (fwBw)
batchNormForward(insz, in, out, prm);
if (fwBw)
batchNormForward(insz, in, out, prm);
else
batchNormBackward(insz, in, out, prm);
}
8 changes: 6 additions & 2 deletions src/snOperatorCUDA/src/CUDA/batchNorm.cu
@@ -63,7 +63,7 @@ __global__ void calcMeanAndVarce(SN_Base::snSize insz, snFloat* in, batchNorm pr
size_t sz = insz.w * insz.h * insz.d,
bsz = insz.n;

// gridDim.x = insz.d
// gridDim.x = insz.d
// blockDim.x <= insz.w * insz.h

in += insz.w * insz.h * blockIdx.x;
@@ -73,6 +73,8 @@
unsigned int i = threadIdx.x;
while (i < (insz.w * insz.h)){

prm.mean[i] = 0;

snFloat srq = 0.F;
for (size_t j = 0; j < bsz; ++j){

@@ -135,6 +137,8 @@ __global__ void calcDSchiftAndDScale(SN_Base::snSize insz, snFloat* gradIn, batc
unsigned int i = threadIdx.x;
while (i < (insz.w * insz.h)){

prm.dSchift[i] = 0;

snFloat dScale = 0.F;
for (size_t j = 0; j < bsz; ++j){

@@ -157,7 +161,7 @@ __global__ void calcGrOut(SN_Base::snSize insz, snFloat* gradIn, snFloat* gradOu
bsz = insz.n;

// gridDim.x = insz.d
// gridDim.x = insz.n
// gridDim.y = insz.n
// blockDim.x <= insz.w * insz.h

prm.scale += insz.w * insz.h * blockIdx.x;
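Note: a minimal standalone sketch of what the two added resets (prm.mean[i] = 0 and prm.dSchift[i] = 0) guard against; it is not the library kernel, and the layout (one block per channel, images of w * h * d values, batch of n) is an assumption taken from the surrounding code. Without the reset, the accumulator keeps whatever the previous batch left in the buffer.

    __global__ void meanPerPosition(unsigned int w, unsigned int h, unsigned int d,
                                    unsigned int n, const float* in, float* mean){

        const unsigned int plane = w * h;
        in   += plane * blockIdx.x;                  // this channel in image 0
        mean += plane * blockIdx.x;                  // per-channel slice of the mean buffer

        for (unsigned int i = threadIdx.x; i < plane; i += blockDim.x){

            mean[i] = 0.f;                           // reset before accumulating, as in the hunk above

            for (unsigned int j = 0; j < n; ++j)
                mean[i] += in[i + plane * d * j];    // stride of one full image between batch items

            mean[i] /= n;
        }
    }
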
20 changes: 11 additions & 9 deletions src/snOperatorCUDA/src/CUDA/convolution.cu
@@ -251,17 +251,16 @@ __global__ void cuFwdBias(snSize outsz, const snFloat* bias, snFloat* output){

size_t osz = outsz.w * outsz.h;

snFloat* pOut = output + osz * outsz.d * blockIdx.x;
unsigned int d = threadIdx.x;
while (d < outsz.d){
snFloat* pOut = output + osz * blockIdx.x + osz * outsz.d * blockIdx.y;

snFloat b = bias[d];
for (size_t j = 0; j < osz; ++j)
pOut[j] += b;
snFloat b = bias[blockIdx.x];

pOut += osz * blockDim.x;
unsigned int i = threadIdx.x;
while (i < osz){

pOut[i] += b;

d += blockDim.x;
i += blockDim.x;
}
}

@@ -289,7 +288,10 @@ void Convolution::forwardCUDA(const convParams& prms,
output));

// +bias
cuFwdBias << < int(insz.n), 128 >> > (outsz, weight + wStepByN, output);
dim3 dimBlock(128);
dim3 dimGrid(int(outsz.d), int(outsz.n));

cuFwdBias << < dimGrid, dimBlock >> > (outsz, weight + wStepByN, output);
}

void Convolution::backwardCUDA_GW(const convParams& prms,
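Note: a minimal standalone sketch of the new launch geometry for the bias kernel; the names and the NCHW-style layout (out[n][d][h][w], one bias value per output channel) are assumptions, not the library API. Each block now handles one (channel, batch item) pair instead of looping over all channels inside a single block per image.

    __global__ void addBiasPerChannel(unsigned int w, unsigned int h, unsigned int d,
                                      const float* bias, float* out){

        const unsigned int plane = w * h;
        float* pOut = out + plane * blockIdx.x        // channel blockIdx.x
                          + plane * d * blockIdx.y;   // of batch item blockIdx.y
        const float b = bias[blockIdx.x];

        for (unsigned int i = threadIdx.x; i < plane; i += blockDim.x)
            pOut[i] += b;
    }

    // host side, mirroring the dimGrid/dimBlock pair introduced in forwardCUDA above:
    //   dim3 dimBlock(128);
    //   dim3 dimGrid(d, n);
    //   addBiasPerChannel<<<dimGrid, dimBlock>>>(w, h, d, bias, out);
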
20 changes: 11 additions & 9 deletions src/snOperatorCUDA/src/CUDA/deconvolution.cu
@@ -266,17 +266,16 @@ __global__ void cuBwdBias(snSize insz, const snFloat* bias, snFloat* grout){

size_t isz = insz.w * insz.h;

snFloat* pGrOut = grout + isz * insz.d * blockIdx.x;
unsigned int d = threadIdx.x;
while (d < insz.d){
snFloat* pGrOut = grout + isz * blockIdx.x + isz * insz.d * blockIdx.y;

snFloat b = bias[d];
for (size_t j = 0; j < isz; ++j)
pGrOut[j] += b;
snFloat b = bias[blockIdx.x];

pGrOut += isz * blockDim.x;
unsigned int i = threadIdx.x;
while (i < isz){

pGrOut[i] += b;

d += blockDim.x;
i += blockDim.x;
}
}

@@ -325,7 +324,10 @@ void Deconvolution::backwardCUDA_GW(const deconvParams& prms,
dWeightOut + wStepByN));

// +bias
cuBwdBias << < int(insz.n), 128 >> > (insz, weight + wStepByN, gradOut);
dim3 dimBlock(128);
dim3 dimGrid(int(insz.d), int(insz.n));

cuBwdBias << < dimGrid, dimBlock >> > (insz, weight + wStepByN, gradOut);

}

9 changes: 3 additions & 6 deletions src/snOperatorCUDA/src/CUDA/fullyConnected.cu
@@ -69,9 +69,7 @@ void FullyConnected::freeParamCUDA(void* gpuPrms){
}

__global__ void cuFwdBias(size_t kernel, snSize insz, const snFloat* weight, snFloat* output){

weight += insz.w * insz.h * insz.d * kernel;


snFloat* out = output + kernel * blockIdx.x;
unsigned int k = threadIdx.x;
while (k < kernel){
@@ -110,13 +108,12 @@ void FullyConnected::forwardCUDA(size_t kernel, const snSize& insz, const snFloa
krn)); // Out, step to next Y (Y21 - Y11)

// +bias
cuFwdBias << < int(insz.n), 128 >> > (kernel, insz, weight, output);
cuFwdBias << < int(insz.n), 128 >> > (kernel, insz, weight + ida * krn, output);

}

__global__ void cuBwdBias(size_t kernel, snSize insz, const snFloat* gradIn, snFloat* dWOut){

dWOut += insz.w * insz.h * insz.d * kernel;
unsigned int k = threadIdx.x;
while (k < kernel){

@@ -159,7 +156,7 @@ void FullyConnected::backwardCUDA_GW(size_t kernel, const snFloat* weight,
krn)); // dW, step to next

// bias
cuBwdBias << < 1, 128 >> > (kernel, insz, gradIn, dWOut);
cuBwdBias << < 1, 128 >> > (kernel, insz, gradIn, dWOut + ida * krn);

//// Gradient for previous layer
//// GrOut = αGrIn * W^T + βGrOut
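Note: a hedged illustration of the weight layout this bias fix appears to assume, inferred from the removed in-kernel offset rather than from the library headers: the fully connected weight block is a row-major (ida + 1) x krn matrix, with ida = insz.w * insz.h * insz.d input weights per output unit and one trailing row of biases, so weight + ida * krn points directly at the bias row that cuFwdBias and cuBwdBias now receive.

    // minimal sketch: with the offset applied by the caller, the kernel indexes the bias row directly
    __global__ void addBias(unsigned int kernel, const float* bias, float* output){

        float* out = output + kernel * blockIdx.x;              // one block per batch item
        for (unsigned int k = threadIdx.x; k < kernel; k += blockDim.x)
            out[k] += bias[k];
    }
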
7 changes: 4 additions & 3 deletions src/snOperatorCUDA/src/CUDA/lossFunctions.cu
@@ -38,7 +38,7 @@ __global__ void softMaxACrossEntropyFwd(snSize iosz, snFloat* inout){

inout += blockIdx.x * inStepByN;

__shared__ unsigned long long int tmax;
__shared__ int tmax;
__shared__ snFloat tsumm;

tmax = 0;
@@ -47,16 +47,16 @@
__syncthreads();

unsigned int i = threadIdx.x;

while (i < inStepByN){

atomicMax(&tmax, unsigned long long int(inout[i] * 100.F));
atomicMax(&tmax, int(inout[i] * 100.F));

i += blockDim.x;
}

__syncthreads();

i = threadIdx.x;
while (i < inStepByN){

inout[i] = ((inout[i] - tmax / 100.F) > -20) ? exp(inout[i] - tmax / 100.F) : 0.1E-8F;
@@ -68,6 +68,7 @@

__syncthreads();

i = threadIdx.x;
while (i < inStepByN){

inout[i] /= tsumm;
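Note: a minimal standalone sketch of the pattern these fixes produce (one block per sample); it is not the library kernel. It shows the two corrections together: the shared maximum kept as a plain int scaled by 100, so the int overload of atomicMax applies, and the thread index re-initialised to threadIdx.x before each strided pass over the row.

    __global__ void softmaxRow(unsigned int len, float* data){

        float* row = data + blockIdx.x * len;        // one block per sample

        __shared__ int   tmax;                       // max, scaled by 100 and truncated
        __shared__ float tsum;

        if (threadIdx.x == 0){ tmax = 0; tsum = 0.f; }
        __syncthreads();

        for (unsigned int i = threadIdx.x; i < len; i += blockDim.x)   // pass 1: max
            atomicMax(&tmax, int(row[i] * 100.f));
        __syncthreads();

        for (unsigned int i = threadIdx.x; i < len; i += blockDim.x){  // pass 2: exp and sum
            row[i] = expf(row[i] - tmax / 100.f);
            atomicAdd(&tsum, row[i]);
        }
        __syncthreads();

        for (unsigned int i = threadIdx.x; i < len; i += blockDim.x)   // pass 3: normalise
            row[i] /= tsum;
    }
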
21 changes: 10 additions & 11 deletions src/snOperatorCUDA/src/CUDA/pooling.cu
@@ -147,16 +147,15 @@ void Pooling::freeParamCUDA(void* gpuPrms){

__global__ void cuFiltrNegative(snSize outsz, snFloat* out){

out += blockIdx.x * outsz.w * outsz.h * outsz.d;
out += blockIdx.x * outsz.w * outsz.h + blockIdx.y * outsz.w * outsz.h * outsz.d;

unsigned int k = threadIdx.x;
while (k < outsz.d){
unsigned int i = threadIdx.x;
while (i < (outsz.w * outsz.h)){

snFloat* pOut = out + outsz.w * outsz.h * k;
for (size_t j = 0; j < (outsz.w * outsz.h); ++j)
if (pOut[j] < 0) pOut[j] = 0.0;
if (out[i] < 0)
out[i] = 0.0;

k += blockDim.x;
i += blockDim.x;
}
}

@@ -177,10 +176,10 @@ void Pooling::forwardCUDA(const poolParams& poolPrms, const snSize& insz, const
output));

// filtrNegative
// dim3 dimBlock(256);
// dim3 dimGrid(int(outsz.n));

// cuFiltrNegative << < dimGrid, dimBlock >> >(outsz, output);
dim3 dimBlock(128);
dim3 dimGrid(int(outsz.d), int(outsz.n));
cuFiltrNegative << < dimGrid, dimBlock >> >(outsz, output);

}

19 changes: 12 additions & 7 deletions src/snOperatorCUDA/src/Operator/convolution.cpp
@@ -99,6 +99,11 @@ void Convolution::load(std::map<std::string, std::string>& prms){
}

setInternPrm(prms);

// aux array
auxGPUParams_["dWeight"] = nullptr;
auxGPUParams_["dWPrev"] = nullptr;
auxGPUParams_["dWGrad"] = nullptr;
}

bool Convolution::setInternPrm(std::map<std::string, std::string>& prms){
@@ -396,15 +401,15 @@ void Convolution::updateConfig(bool isLern, const snSize& newsz){
osz = outSz.w * outSz.h * outSz.d;

if (batchNormType_ != batchNormType::none){
baseBatchNorm_.mean = cuMemRealloc(csz, osz, baseBatchNorm_.mean, 0);
baseBatchNorm_.varce = cuMemRealloc(csz, osz, baseBatchNorm_.varce, 1);
baseBatchNorm_.scale = cuMemRealloc(csz, osz, baseBatchNorm_.scale, 1);
baseBatchNorm_.schift = cuMemRealloc(csz, osz, baseBatchNorm_.schift, 0);
baseBatchNorm_.mean = cuMemRealloc(0, osz, baseBatchNorm_.mean, 0);
baseBatchNorm_.varce = cuMemRealloc(0, osz, baseBatchNorm_.varce, 1);
baseBatchNorm_.scale = cuMemRealloc(0, osz, baseBatchNorm_.scale, 1);
baseBatchNorm_.schift = cuMemRealloc(0, osz, baseBatchNorm_.schift, 0);

if (isLern){
baseBatchNorm_.norm = cuMemRealloc(csz * outSz.n, osz * outSz.n, baseBatchNorm_.norm, 0);
baseBatchNorm_.dScale = cuMemRealloc(csz, osz, baseBatchNorm_.dScale, 0);
baseBatchNorm_.dSchift = cuMemRealloc(csz, osz, baseBatchNorm_.dSchift, 0);
baseBatchNorm_.norm = cuMemRealloc(0, osz * outSz.n, baseBatchNorm_.norm, 0);
baseBatchNorm_.dScale = cuMemRealloc(0, osz, baseBatchNorm_.dScale, 0);
baseBatchNorm_.dSchift = cuMemRealloc(0, osz, baseBatchNorm_.dSchift, 0);
}

baseBatchNorm_.sz = outSz;
5 changes: 5 additions & 0 deletions src/snOperatorCUDA/src/Operator/deconvolution.cpp
@@ -90,6 +90,11 @@ void Deconvolution::load(std::map<std::string, std::string>& prms){


setInternPrm(prms);

// aux array
auxGPUParams_["dWeight"] = nullptr;
auxGPUParams_["dWPrev"] = nullptr;
auxGPUParams_["dWGrad"] = nullptr;
}

bool Deconvolution::setInternPrm(std::map<std::string, std::string>& prms){