Commit 8ffc2aa (develop)
Tyill committed Jul 26, 2019
1 parent af6a89d
Showing 14 changed files with 90 additions and 72 deletions.
2 changes: 1 addition & 1 deletion cpp/snOperator.h
@@ -89,7 +89,7 @@ namespace SN_API{
units(units_), act(act_), opt(opt_),
dropOut(dropOut_), bnorm(bnorm_), gpuDeviceId(gpuDeviceId_){};

FullyConnected(uint32_t units_, batchNormType bnorm_ = batchNormType::none) :
FullyConnected(uint32_t units_, batchNormType bnorm_) :
units(units_), bnorm(bnorm_){}

~FullyConnected(){};
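Note: a hedged reading of this header change (the reasoning is not stated in the commit). With the default argument = batchNormType::none on the two-parameter constructor, a one-argument call such as FullyConnected(10) in the mnist example below matched both constructors and was ambiguous; removing the default lets the single-argument call resolve to the act/opt constructor, while batch norm is now requested explicitly:

    sn::FullyConnected fc1(128);                           // one-argument call: resolves to the act/opt constructor
    sn::FullyConnected fc2(128, sn::batchNormType::none);  // explicit two-argument form, as used in the example below
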
12 changes: 6 additions & 6 deletions example/mnist/cpp_example.cpp
@@ -51,14 +51,14 @@ int main(int argc, char* argv[]){
sn::Net snet;

snet.addNode("Input", sn::Input(), "C1")
.addNode("C1", sn::Convolution(15, 3, 0, 1, sn::batchNormType::none, sn::active::relu), "C2")
.addNode("C2", sn::Convolution(15, 3, 0, 1, sn::batchNormType::none, sn::active::relu), "P1")
.addNode("C1", sn::Convolution(15, 3, 0, 1, sn::batchNormType::beforeActive, sn::active::relu), "C2")
.addNode("C2", sn::Convolution(15, 3, 0, 1, sn::batchNormType::beforeActive, sn::active::relu), "P1")
.addNode("P1", sn::Pooling(), "FC1")
.addNode("FC1", sn::FullyConnected(128), "FC2")
.addNode("FC1", sn::FullyConnected(128, sn::batchNormType::none), "FC2")
.addNode("FC2", sn::FullyConnected(10), "LS")
.addNode("LS", sn::LossFunction(sn::lossType::softMaxToCrossEntropy), "Output");

string imgPath = "c://cpp//other//skyNet//example//mnist//images//";
string imgPath = "c://cpp//skyNet//example//mnist//images//";

int batchSz = 100, classCnt = 10, w = 28, h = 28; float lr = 0.001F;
vector<vector<string>> imgName(classCnt);
@@ -81,7 +81,7 @@ int main(int argc, char* argv[]){
size_t sum_metric = 0;
size_t num_inst = 0;
float accuratSumm = 0;
for (int k = 0; k < 100; ++k){
for (int k = 0; k < 1000; ++k){

targetLayer.clear();

@@ -151,6 +151,6 @@ int main(int argc, char* argv[]){

snet.saveAllWeightToFile("c:\\cpp\\w.dat");

// system("pause");
system("pause");
return 0;
}
6 changes: 3 additions & 3 deletions example/resnet50/cpp_example.cpp
@@ -90,13 +90,13 @@ int main(int argc, char* argv[]){
// CMD: cd c:\cpp\other\skyNet\example\resnet50\
// CMD: python createNet.py

if (!snet.loadAllWeightFromFile("c:/cpp/other/skyNet/example/resnet50/resNet50Weights.dat")){
if (!snet.loadAllWeightFromFile("c:/cpp/skyNet/example/resnet50/resNet50Weights.dat")){
cout << "error loadAllWeightFromFile: " << snet.getLastErrorStr() << endl;
system("pause");
return -1;
}

string imgPath = "c:/cpp/other/skyNet/example/resnet50/images/elephant.jpg";
string imgPath = "c:/cpp/skyNet/example/resnet50/images/elephant.jpg";

int classCnt = 1000, w = 224, h = 224;

@@ -151,6 +151,6 @@ int main(int argc, char* argv[]){
cout << "inx " << maxInx << " accurate " << refOutput[maxInx] << snet.getLastErrorStr() << endl;
}

// system("pause");
system("pause");
return 0;
}
4 changes: 2 additions & 2 deletions src/snOperatorCPU/src/batchNormFunctions.cpp
@@ -102,8 +102,8 @@ void layerBatchNorm(bool fwBw, bool isLern, const snSize& insz, snFloat* in, snF
}
}
else{ // isLerning
if (fwBw)
batchNormForward(insz, in, out, prm);
if (fwBw)
batchNormForward(insz, in, out, prm);
else
batchNormBackward(insz, in, out, prm);
}
8 changes: 6 additions & 2 deletions src/snOperatorCUDA/src/CUDA/batchNorm.cu
@@ -63,7 +63,7 @@ __global__ void calcMeanAndVarce(SN_Base::snSize insz, snFloat* in, batchNorm pr
size_t sz = insz.w * insz.h * insz.d,
bsz = insz.n;

// gridDim.x = insz.d
// gridDim.x = insz.d
// blockDim.x <= insz.w * insz.h

in += insz.w * insz.h * blockIdx.x;
@@ -73,6 +73,8 @@
unsigned int i = threadIdx.x;
while (i < (insz.w * insz.h)){

prm.mean[i] = 0;

snFloat srq = 0.F;
for (size_t j = 0; j < bsz; ++j){

@@ -135,6 +137,8 @@ __global__ void calcDSchiftAndDScale(SN_Base::snSize insz, snFloat* gradIn, batc
unsigned int i = threadIdx.x;
while (i < (insz.w * insz.h)){

prm.dSchift[i] = 0;

snFloat dScale = 0.F;
for (size_t j = 0; j < bsz; ++j){

@@ -157,7 +161,7 @@ __global__ void calcGrOut(SN_Base::snSize insz, snFloat* gradIn, snFloat* gradOu
bsz = insz.n;

// gridDim.x = insz.d
// gridDim.x = insz.n
// gridDim.y = insz.n
// blockDim.x <= insz.w * insz.h

prm.scale += insz.w * insz.h * blockIdx.x;
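Note: a minimal standalone sketch of what the two added resets (prm.mean[i] = 0 and prm.dSchift[i] = 0) guard against; it is not the library kernel, and the layout (one block per channel, images of w * h * d values, batch of n) is an assumption taken from the surrounding code. Without the reset, the accumulator keeps whatever the previous batch left in the buffer.

    __global__ void meanPerPosition(unsigned int w, unsigned int h, unsigned int d,
                                    unsigned int n, const float* in, float* mean){

        const unsigned int plane = w * h;
        in   += plane * blockIdx.x;                  // this channel in image 0
        mean += plane * blockIdx.x;                  // per-channel slice of the mean buffer

        for (unsigned int i = threadIdx.x; i < plane; i += blockDim.x){

            mean[i] = 0.f;                           // reset before accumulating, as in the hunk above

            for (unsigned int j = 0; j < n; ++j)
                mean[i] += in[i + plane * d * j];    // stride of one full image between batch items

            mean[i] /= n;
        }
    }
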
20 changes: 11 additions & 9 deletions src/snOperatorCUDA/src/CUDA/convolution.cu
@@ -251,17 +251,16 @@ __global__ void cuFwdBias(snSize outsz, const snFloat* bias, snFloat* output){

size_t osz = outsz.w * outsz.h;

snFloat* pOut = output + osz * outsz.d * blockIdx.x;
unsigned int d = threadIdx.x;
while (d < outsz.d){
snFloat* pOut = output + osz * blockIdx.x + osz * outsz.d * blockIdx.y;

snFloat b = bias[d];
for (size_t j = 0; j < osz; ++j)
pOut[j] += b;
snFloat b = bias[blockIdx.x];

pOut += osz * blockDim.x;
unsigned int i = threadIdx.x;
while (i < osz){

pOut[i] += b;

d += blockDim.x;
i += blockDim.x;
}
}

@@ -289,7 +288,10 @@ void Convolution::forwardCUDA(const convParams& prms,
output));

// +bias
cuFwdBias << < int(insz.n), 128 >> > (outsz, weight + wStepByN, output);
dim3 dimBlock(128);
dim3 dimGrid(int(outsz.d), int(outsz.n));

cuFwdBias << < dimGrid, dimBlock >> > (outsz, weight + wStepByN, output);
}

void Convolution::backwardCUDA_GW(const convParams& prms,
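Note: a minimal standalone sketch of the new launch geometry for the bias kernel; the names and the NCHW-style layout (out[n][d][h][w], one bias value per output channel) are assumptions, not the library API. Each block now handles one (channel, batch item) pair instead of looping over all channels inside a single block per image.

    __global__ void addBiasPerChannel(unsigned int w, unsigned int h, unsigned int d,
                                      const float* bias, float* out){

        const unsigned int plane = w * h;
        float* pOut = out + plane * blockIdx.x        // channel blockIdx.x
                          + plane * d * blockIdx.y;   // of batch item blockIdx.y
        const float b = bias[blockIdx.x];

        for (unsigned int i = threadIdx.x; i < plane; i += blockDim.x)
            pOut[i] += b;
    }

    // host side, mirroring the dimGrid/dimBlock pair introduced in forwardCUDA above:
    //   dim3 dimBlock(128);
    //   dim3 dimGrid(d, n);
    //   addBiasPerChannel<<<dimGrid, dimBlock>>>(w, h, d, bias, out);
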
20 changes: 11 additions & 9 deletions src/snOperatorCUDA/src/CUDA/deconvolution.cu
@@ -266,17 +266,16 @@ __global__ void cuBwdBias(snSize insz, const snFloat* bias, snFloat* grout){

size_t isz = insz.w * insz.h;

snFloat* pGrOut = grout + isz * insz.d * blockIdx.x;
unsigned int d = threadIdx.x;
while (d < insz.d){
snFloat* pGrOut = grout + isz * blockIdx.x + isz * insz.d * blockIdx.y;

snFloat b = bias[d];
for (size_t j = 0; j < isz; ++j)
pGrOut[j] += b;
snFloat b = bias[blockIdx.x];

pGrOut += isz * blockDim.x;
unsigned int i = threadIdx.x;
while (i < isz){

pGrOut[i] += b;

d += blockDim.x;
i += blockDim.x;
}
}

@@ -325,7 +324,10 @@ void Deconvolution::backwardCUDA_GW(const deconvParams& prms,
dWeightOut + wStepByN));

// +bias
cuBwdBias << < int(insz.n), 128 >> > (insz, weight + wStepByN, gradOut);
dim3 dimBlock(128);
dim3 dimGrid(int(insz.d), int(insz.n));

cuBwdBias << < dimGrid, dimBlock >> > (insz, weight + wStepByN, gradOut);

}

9 changes: 3 additions & 6 deletions src/snOperatorCUDA/src/CUDA/fullyConnected.cu
@@ -69,9 +69,7 @@ void FullyConnected::freeParamCUDA(void* gpuPrms){
}

__global__ void cuFwdBias(size_t kernel, snSize insz, const snFloat* weight, snFloat* output){

weight += insz.w * insz.h * insz.d * kernel;


snFloat* out = output + kernel * blockIdx.x;
unsigned int k = threadIdx.x;
while (k < kernel){
@@ -110,13 +108,12 @@ void FullyConnected::forwardCUDA(size_t kernel, const snSize& insz, const snFloa
krn)); // Out, step to next Y (Y21 - Y11)

// +bias
cuFwdBias << < int(insz.n), 128 >> > (kernel, insz, weight, output);
cuFwdBias << < int(insz.n), 128 >> > (kernel, insz, weight + ida * krn, output);

}

__global__ void cuBwdBias(size_t kernel, snSize insz, const snFloat* gradIn, snFloat* dWOut){

dWOut += insz.w * insz.h * insz.d * kernel;
unsigned int k = threadIdx.x;
while (k < kernel){

@@ -159,7 +156,7 @@ void FullyConnected::backwardCUDA_GW(size_t kernel, const snFloat* weight,
krn)); // dW, step to next

// bias
cuBwdBias << < 1, 128 >> > (kernel, insz, gradIn, dWOut);
cuBwdBias << < 1, 128 >> > (kernel, insz, gradIn, dWOut + ida * krn);

//// Gradient for previous layer
//// GrOut = αGrIn * W^T + βGrOut
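Note: a hedged illustration of the weight layout this bias fix appears to assume, inferred from the removed in-kernel offset rather than from the library headers: the fully connected weight block is a row-major (ida + 1) x krn matrix, with ida = insz.w * insz.h * insz.d input weights per output unit and one trailing row of biases, so weight + ida * krn points directly at the bias row that cuFwdBias and cuBwdBias now receive.

    // minimal sketch: with the offset applied by the caller, the kernel indexes the bias row directly
    __global__ void addBias(unsigned int kernel, const float* bias, float* output){

        float* out = output + kernel * blockIdx.x;              // one block per batch item
        for (unsigned int k = threadIdx.x; k < kernel; k += blockDim.x)
            out[k] += bias[k];
    }
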
7 changes: 4 additions & 3 deletions src/snOperatorCUDA/src/CUDA/lossFunctions.cu
@@ -38,7 +38,7 @@ __global__ void softMaxACrossEntropyFwd(snSize iosz, snFloat* inout){

inout += blockIdx.x * inStepByN;

__shared__ unsigned long long int tmax;
__shared__ int tmax;
__shared__ snFloat tsumm;

tmax = 0;
@@ -47,16 +47,16 @@
__syncthreads();

unsigned int i = threadIdx.x;

while (i < inStepByN){

atomicMax(&tmax, unsigned long long int(inout[i] * 100.F));
atomicMax(&tmax, int(inout[i] * 100.F));

i += blockDim.x;
}

__syncthreads();

i = threadIdx.x;
while (i < inStepByN){

inout[i] = ((inout[i] - tmax / 100.F) > -20) ? exp(inout[i] - tmax / 100.F) : 0.1E-8F;
@@ -68,6 +68,7 @@

__syncthreads();

i = threadIdx.x;
while (i < inStepByN){

inout[i] /= tsumm;
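Note: a minimal standalone sketch of the pattern these fixes produce (one block per sample); it is not the library kernel. It shows the two corrections together: the shared maximum kept as a plain int scaled by 100, so the int overload of atomicMax applies, and the thread index re-initialised to threadIdx.x before each strided pass over the row.

    __global__ void softmaxRow(unsigned int len, float* data){

        float* row = data + blockIdx.x * len;        // one block per sample

        __shared__ int   tmax;                       // max, scaled by 100 and truncated
        __shared__ float tsum;

        if (threadIdx.x == 0){ tmax = 0; tsum = 0.f; }
        __syncthreads();

        for (unsigned int i = threadIdx.x; i < len; i += blockDim.x)   // pass 1: max
            atomicMax(&tmax, int(row[i] * 100.f));
        __syncthreads();

        for (unsigned int i = threadIdx.x; i < len; i += blockDim.x){  // pass 2: exp and sum
            row[i] = expf(row[i] - tmax / 100.f);
            atomicAdd(&tsum, row[i]);
        }
        __syncthreads();

        for (unsigned int i = threadIdx.x; i < len; i += blockDim.x)   // pass 3: normalise
            row[i] /= tsum;
    }
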
21 changes: 10 additions & 11 deletions src/snOperatorCUDA/src/CUDA/pooling.cu
@@ -147,16 +147,15 @@ void Pooling::freeParamCUDA(void* gpuPrms){

__global__ void cuFiltrNegative(snSize outsz, snFloat* out){

out += blockIdx.x * outsz.w * outsz.h * outsz.d;
out += blockIdx.x * outsz.w * outsz.h + blockIdx.y * outsz.w * outsz.h * outsz.d;

unsigned int k = threadIdx.x;
while (k < outsz.d){
unsigned int i = threadIdx.x;
while (i < (outsz.w * outsz.h)){

snFloat* pOut = out + outsz.w * outsz.h * k;
for (size_t j = 0; j < (outsz.w * outsz.h); ++j)
if (pOut[j] < 0) pOut[j] = 0.0;
if (out[i] < 0)
out[i] = 0.0;

k += blockDim.x;
i += blockDim.x;
}
}

@@ -177,10 +176,10 @@ void Pooling::forwardCUDA(const poolParams& poolPrms, const snSize& insz, const
output));

// filtrNegative
// dim3 dimBlock(256);
// dim3 dimGrid(int(outsz.n));

// cuFiltrNegative << < dimGrid, dimBlock >> >(outsz, output);
dim3 dimBlock(128);
dim3 dimGrid(int(outsz.d), int(outsz.n));
cuFiltrNegative << < dimGrid, dimBlock >> >(outsz, output);

}

19 changes: 12 additions & 7 deletions src/snOperatorCUDA/src/Operator/convolution.cpp
@@ -99,6 +99,11 @@ void Convolution::load(std::map<std::string, std::string>& prms){
}

setInternPrm(prms);

// aux array
auxGPUParams_["dWeight"] = nullptr;
auxGPUParams_["dWPrev"] = nullptr;
auxGPUParams_["dWGrad"] = nullptr;
}

bool Convolution::setInternPrm(std::map<std::string, std::string>& prms){
@@ -396,15 +401,15 @@ void Convolution::updateConfig(bool isLern, const snSize& newsz){
osz = outSz.w * outSz.h * outSz.d;

if (batchNormType_ != batchNormType::none){
baseBatchNorm_.mean = cuMemRealloc(csz, osz, baseBatchNorm_.mean, 0);
baseBatchNorm_.varce = cuMemRealloc(csz, osz, baseBatchNorm_.varce, 1);
baseBatchNorm_.scale = cuMemRealloc(csz, osz, baseBatchNorm_.scale, 1);
baseBatchNorm_.schift = cuMemRealloc(csz, osz, baseBatchNorm_.schift, 0);
baseBatchNorm_.mean = cuMemRealloc(0, osz, baseBatchNorm_.mean, 0);
baseBatchNorm_.varce = cuMemRealloc(0, osz, baseBatchNorm_.varce, 1);
baseBatchNorm_.scale = cuMemRealloc(0, osz, baseBatchNorm_.scale, 1);
baseBatchNorm_.schift = cuMemRealloc(0, osz, baseBatchNorm_.schift, 0);

if (isLern){
baseBatchNorm_.norm = cuMemRealloc(csz * outSz.n, osz * outSz.n, baseBatchNorm_.norm, 0);
baseBatchNorm_.dScale = cuMemRealloc(csz, osz, baseBatchNorm_.dScale, 0);
baseBatchNorm_.dSchift = cuMemRealloc(csz, osz, baseBatchNorm_.dSchift, 0);
baseBatchNorm_.norm = cuMemRealloc(0, osz * outSz.n, baseBatchNorm_.norm, 0);
baseBatchNorm_.dScale = cuMemRealloc(0, osz, baseBatchNorm_.dScale, 0);
baseBatchNorm_.dSchift = cuMemRealloc(0, osz, baseBatchNorm_.dSchift, 0);
}

baseBatchNorm_.sz = outSz;
5 changes: 5 additions & 0 deletions src/snOperatorCUDA/src/Operator/deconvolution.cpp
@@ -90,6 +90,11 @@ void Deconvolution::load(std::map<std::string, std::string>& prms){


setInternPrm(prms);

// aux array
auxGPUParams_["dWeight"] = nullptr;
auxGPUParams_["dWPrev"] = nullptr;
auxGPUParams_["dWGrad"] = nullptr;
}

bool Deconvolution::setInternPrm(std::map<std::string, std::string>& prms){