From 1a18e1357d0e7f0c987e0a1ccb6965a72b336e15 Mon Sep 17 00:00:00 2001 From: SOMANSHU AGARWAL Date: Sat, 14 Sep 2019 02:42:27 -0400 Subject: [PATCH 01/56] Changes for Part2A --- Project2-Stream-Compaction/src/main.cpp | 12 +- .../stream_compaction/CMakeLists.txt | 2 +- .../stream_compaction/common.cu | 16 +- .../stream_compaction/cpu.cu | 65 +++++++- .../stream_compaction/efficient.cu | 156 +++++++++++++++++- .../stream_compaction/naive.cu | 61 ++++++- 6 files changed, 287 insertions(+), 25 deletions(-) diff --git a/Project2-Stream-Compaction/src/main.cpp b/Project2-Stream-Compaction/src/main.cpp index d016553..8fd8a3b 100644 --- a/Project2-Stream-Compaction/src/main.cpp +++ b/Project2-Stream-Compaction/src/main.cpp @@ -51,7 +51,7 @@ int main(int argc, char* argv[]) { printDesc("naive scan, power-of-two"); StreamCompaction::Naive::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan @@ -64,21 +64,21 @@ int main(int argc, char* argv[]) { printDesc("naive scan, non-power-of-two"); StreamCompaction::Naive::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, power-of-two"); StreamCompaction::Efficient::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, non-power-of-two"); StreamCompaction::Efficient::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); @@ -137,14 +137,14 @@ int main(int argc, char* argv[]) { printDesc("work-efficient compact, power-of-two"); count = StreamCompaction::Efficient::compact(SIZE, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); + printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); zeroArray(SIZE, c); printDesc("work-efficient compact, non-power-of-two"); count = StreamCompaction::Efficient::compact(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); + printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); system("pause"); // stop Win32 console from closing on exit diff --git a/Project2-Stream-Compaction/stream_compaction/CMakeLists.txt b/Project2-Stream-Compaction/stream_compaction/CMakeLists.txt index cdbef77..4bb0dc2 100644 --- a/Project2-Stream-Compaction/stream_compaction/CMakeLists.txt +++ b/Project2-Stream-Compaction/stream_compaction/CMakeLists.txt @@ -13,5 +13,5 @@ set(SOURCE_FILES cuda_add_library(stream_compaction ${SOURCE_FILES} - OPTIONS -arch=sm_20 + OPTIONS -arch=sm_61 ) diff --git a/Project2-Stream-Compaction/stream_compaction/common.cu b/Project2-Stream-Compaction/stream_compaction/common.cu index 2ed6d63..52d424e 100644 --- 
a/Project2-Stream-Compaction/stream_compaction/common.cu +++ b/Project2-Stream-Compaction/stream_compaction/common.cu @@ -23,7 +23,14 @@ namespace StreamCompaction { * which map to 0 will be removed, and elements which map to 1 will be kept. */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { - // TODO + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) + return; + + if (idata[index]) + bools[index] = 1; + else + bools[index] = 0; } /** @@ -32,7 +39,12 @@ namespace StreamCompaction { */ __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { - // TODO + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) + return; + + if (bools[index]) + odata[indices[index]] = idata[index]; } } diff --git a/Project2-Stream-Compaction/stream_compaction/cpu.cu b/Project2-Stream-Compaction/stream_compaction/cpu.cu index a2d3e6c..345c09c 100644 --- a/Project2-Stream-Compaction/stream_compaction/cpu.cu +++ b/Project2-Stream-Compaction/stream_compaction/cpu.cu @@ -18,9 +18,24 @@ namespace StreamCompaction { * (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first. */ void scan(int n, int *odata, const int *idata) { - timer().startCpuTimer(); - // TODO - timer().endCpuTimer(); + + bool stopTimer = false; + try { + timer().startCpuTimer(); + } + catch (const std::runtime_error& exception) { + stopTimer = true; + } + + if (n <= 0) + return; + + odata[0] = 0; + for (int i = 1; i < n; i++) { + odata[i] = odata[i - 1] + idata[i - 1]; + } + if (!stopTimer) + timer().endCpuTimer(); } /** @@ -29,10 +44,20 @@ namespace StreamCompaction { * @returns the number of elements remaining after compaction. */ int compactWithoutScan(int n, int *odata, const int *idata) { - timer().startCpuTimer(); - // TODO + timer().startCpuTimer(); + if (n <= 0) + return -1; + + int k = 0; + + for (int i = 0; i < n; i++) { + if (!idata[i]) + continue; + odata[k] = idata[i]; + k++; + } timer().endCpuTimer(); - return -1; + return k; } /** @@ -41,10 +66,32 @@ namespace StreamCompaction { * @returns the number of elements remaining after compaction. */ int compactWithScan(int n, int *odata, const int *idata) { - timer().startCpuTimer(); - // TODO + + timer().startCpuTimer(); + if (n <= 0) + return -1; + + int *temp = new int[n]; + int *exclusiveScan = new int[n]; + for (int i = 0; i < n; i++) { + if (idata[i]) + temp[i] = 1; + else + temp[i] = 0; + } + + scan(n, exclusiveScan, temp); + printf("\n"); + for (int i = 0; i < n; i++) { + if (temp[i] == 1) + odata[exclusiveScan[i]] = idata[i]; + } timer().endCpuTimer(); - return -1; + // Check for the length case + if (temp[n - 1] == 1) + return exclusiveScan[n - 1] + 1; + else + return exclusiveScan[n - 1]; } } } diff --git a/Project2-Stream-Compaction/stream_compaction/efficient.cu b/Project2-Stream-Compaction/stream_compaction/efficient.cu index 2db346e..83573cd 100644 --- a/Project2-Stream-Compaction/stream_compaction/efficient.cu +++ b/Project2-Stream-Compaction/stream_compaction/efficient.cu @@ -3,6 +3,8 @@ #include "common.h" #include "efficient.h" +# define blockSize 32 + namespace StreamCompaction { namespace Efficient { using StreamCompaction::Common::PerformanceTimer; @@ -12,13 +14,109 @@ namespace StreamCompaction { return timer; } + void printArray(int n, int *a, bool abridged = false) { + printf(" [ "); + for (int i = 0; i < n; i++) { + if (abridged && i + 2 == 15 && n > 16) { + i = n - 2; + printf("... 
"); + } + printf("%3d ", a[i]); + } + printf("]\n"); + } + + int *dev_arr1; + int *dev_arr2; + int *dev_bools; + int *dev_indices; + int *dev_odata; + + __global__ void kernUpSweep(int n, int valPower2D, int *data) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) + return; + + if (index % (2 * valPower2D) == 0 && (index + (2 * valPower2D) - 1 < n) && (index + valPower2D - 1 < n)) { + data[index + (2 * valPower2D) - 1] += data[index + valPower2D - 1]; + } + } + + __global__ void kernZeroPadding(int n, int N, int *data) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n && index < N) + data[index] = 0; + else + return; + } + __global__ void kernLastElement(int n, int *data) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index == n - 1) + data[index] = 0; + else + return; + } + __global__ void kernDownSweep(int n, int valPower2D, int *data) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) + return; + + if ((index % (2*valPower2D)== 0) && (index + (2 * valPower2D) - 1 < n) && (index + valPower2D - 1 < n)) { + int temp = data[index + valPower2D - 1]; + data[index + valPower2D - 1] = data[index + (2 * valPower2D) - 1]; + data[index + (2 * valPower2D) - 1] += temp; + } + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); + + bool stopTimer = false; + try { + timer().startGpuTimer(); + } + catch (const std::runtime_error& exception) { + stopTimer = true; + } + + int diff = (1 << ilog2ceil(n)) - n; + int N = n + diff; + + cudaMalloc((void**)&dev_arr1, N * sizeof(int)); + checkCUDAErrorFn("Malloc idata into arr1 failed"); + + cudaMemcpy(dev_arr1, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + checkCUDAErrorFn("Copying idata to arr1 failed"); + + dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize); + + if (diff) { + kernZeroPadding << >> (n,N,dev_arr1); + } + + for (int d = 0; d <= ilog2ceil(n) - 1; d++) { + int valPower2D = 1 << d; + kernUpSweep << > > (N,valPower2D,dev_arr1); + checkCUDAErrorFn("Kernel Up Sweep Failed"); + } + + kernLastElement << > > (N, dev_arr1); + + for (int d = ilog2ceil(n) - 1; d >= 0; d--) { + int valPower2D = 1 << d; + kernDownSweep << > > (N, valPower2D, dev_arr1); + checkCUDAErrorFn("Kernel Down Sweep Failed"); + } + + cudaMemcpy(odata, dev_arr1, sizeof(int) * n, cudaMemcpyDeviceToHost); + checkCUDAErrorFn("Copying back to Host failed"); + + cudaFree(dev_arr1); + + if(!stopTimer) + timer().endGpuTimer(); + } /** @@ -32,9 +130,57 @@ namespace StreamCompaction { */ int compact(int n, int *odata, const int *idata) { timer().startGpuTimer(); - // TODO + + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + + cudaMalloc((void**)&dev_arr2, n * sizeof(int)); + checkCUDAErrorFn("Malloc idata into arr2 failed"); + + cudaMalloc((void**)&dev_bools, n * sizeof(int)); + checkCUDAErrorFn("Malloc idata into arr3 failed"); + + cudaMalloc((void**)&dev_indices, n * sizeof(int)); + checkCUDAErrorFn("Malloc idata into indices failed"); + + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + checkCUDAErrorFn("Malloc idata into odata failed"); + + cudaMemcpy(dev_arr2, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + checkCUDAErrorFn("Copying idata to arr2 failed"); + + Common::kernMapToBoolean << > > (n,dev_bools,dev_arr2); + checkCUDAErrorFn("Kernel Map indicator failed"); + + int *indices = new int[n]; + int *bools = new int[n]; + + 
+            cudaMemcpy(bools, dev_bools, sizeof(int) * n, cudaMemcpyDeviceToHost);
+            checkCUDAErrorFn("Copying bools to host failed");
+
+            scan(n, indices, bools);
+
+            cudaMemcpy(dev_indices, indices, sizeof(int) * n, cudaMemcpyHostToDevice);
+            checkCUDAErrorFn("Copying indices to device failed");
+
+            Common::kernScatter<<<fullBlocksPerGrid, blockSize>>>(n, dev_odata, dev_arr2, dev_bools, dev_indices);
+            checkCUDAErrorFn("Kernel Scatter failed");
+
+            int length = indices[n - 1];
+
+            if (idata[n - 1])
+                length += 1;
+
+            //printf("Length is %d \n", length);
+            cudaMemcpy(odata, dev_odata, sizeof(int) * length, cudaMemcpyDeviceToHost);
+            checkCUDAErrorFn("Copying back to the host failed");
+
+            cudaFree(dev_arr2);
+            cudaFree(dev_bools);
+            cudaFree(dev_indices);
+            cudaFree(dev_odata);
+
             timer().endGpuTimer();
-            return -1;
+            return length;
         }
     }
 }
diff --git a/Project2-Stream-Compaction/stream_compaction/naive.cu b/Project2-Stream-Compaction/stream_compaction/naive.cu
index 4308876..2a14d51 100644
--- a/Project2-Stream-Compaction/stream_compaction/naive.cu
+++ b/Project2-Stream-Compaction/stream_compaction/naive.cu
@@ -3,6 +3,8 @@
 #include "common.h"
 #include "naive.h"
 
+# define blockSize 32
+
 namespace StreamCompaction {
     namespace Naive {
         using StreamCompaction::Common::PerformanceTimer;
@@ -11,14 +13,69 @@ namespace StreamCompaction {
             static PerformanceTimer timer;
             return timer;
         }
-        // TODO: __global__
+
+        int *dev_arr1;
+        int *dev_arr2;
+
+        __global__ void kernScan(int n, int pos, int *arr2, const int *arr1) {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index >= n)
+                return;
+            if (index >= pos)
+                arr2[index] = arr1[index - pos] + arr1[index];
+
+        }
+
+        __global__ void kernShiftRight(int n, int *odata, int *idata) {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index >= n)
+                return;
+
+            if (index == 0) {
+                odata[index] = 0;
+                return;
+            }
+            odata[index] = idata[index - 1];
+        }
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
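          * (Hillis-Steele sketch: pass d adds the element 2^(d-1) slots to the
          * left, e.g. [1 2 3 4] -> [1 3 5 7] -> [1 3 6 10]; the final right
          * shift turns the inclusive result into the exclusive scan [0 1 3 6].)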
         */
         void scan(int n, int *odata, const int *idata) {
             timer().startGpuTimer();
-            // TODO
+            dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize);
+
+            cudaMalloc((void**)&dev_arr1, n * sizeof(int));
+            checkCUDAErrorFn("Malloc idata into arr1 failed");
+
+            cudaMalloc((void**)&dev_arr2, n * sizeof(int));
+            checkCUDAErrorFn("Malloc odata into arr2 failed");
+
+            cudaMemcpy(dev_arr1, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
+            checkCUDAErrorFn("Copying idata to arr1 failed");
+
+            cudaMemcpy(dev_arr2, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
+            checkCUDAErrorFn("Memory copy idata to arr2 failed");
+
+            for (int d = 1; d <= ilog2ceil(n); d++) {
+                int pos = 1 << (d - 1);
+
+                kernScan<<<fullBlocksPerGrid, blockSize>>>(n, pos, dev_arr2, dev_arr1);
+                checkCUDAErrorFn("Kernel Scan failed");
+
+                // ping-pong the buffers on the device (device-to-device copy)
+                cudaMemcpy(dev_arr1, dev_arr2, sizeof(int) * n, cudaMemcpyDeviceToDevice);
+                checkCUDAErrorFn("Memory copy from arr2 to arr1 failed");
+            }
+
+            kernShiftRight<<<fullBlocksPerGrid, blockSize>>>(n, dev_arr2, dev_arr1);
+            checkCUDAErrorFn("Kernel Shift Right failed");
+
+            cudaMemcpy(odata, dev_arr2, sizeof(int) * n, cudaMemcpyDeviceToHost);
+            checkCUDAErrorFn("Copying back to Host failed");
+
+            cudaFree(dev_arr1);
+            cudaFree(dev_arr2);
+
             timer().endGpuTimer();
         }
     }
 }

From 75a8d241395b9fa5a2282796a33529d76f0446ff Mon Sep 17 00:00:00 2001
From: SOMANSHU AGARWAL
Date: Sat, 14 Sep 2019 19:51:33 -0400
Subject: [PATCH 02/56] Changes for Part 2A

---
 Project2-Stream-Compaction/src/main.cpp       |  6 +--
 .../stream_compaction/efficient.cu            | 41 ++++++++-----------
 .../stream_compaction/naive.cu                |  9 ++--
 .../stream_compaction/thrust.cu               | 12 +++++-
 4 files changed, 35 insertions(+), 33 deletions(-)

diff --git a/Project2-Stream-Compaction/src/main.cpp b/Project2-Stream-Compaction/src/main.cpp
index 8fd8a3b..7a9bad7 100644
--- a/Project2-Stream-Compaction/src/main.cpp
+++ b/Project2-Stream-Compaction/src/main.cpp
@@ -13,7 +13,7 @@
 #include 
 #include "testing_helpers.hpp"
 
-const int SIZE = 1 << 8; // feel free to change the size of array
+const int SIZE = 1 << 20; // feel free to change the size of array
 const int NPOT = SIZE - 3; // Non-Power-Of-Two
 int *a = new int[SIZE];
 int *b = new int[SIZE];
@@ -85,14 +85,14 @@ int main(int argc, char* argv[]) {
     printDesc("thrust scan, power-of-two");
     StreamCompaction::Thrust::scan(SIZE, c, a);
     printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
 
     zeroArray(SIZE, c);
     printDesc("thrust scan, non-power-of-two");
     StreamCompaction::Thrust::scan(NPOT, c, a);
     printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(NPOT, c, true);
+    printArray(NPOT, c, true);
     printCmpResult(NPOT, b, c);
 
     printf("\n");
diff --git a/Project2-Stream-Compaction/stream_compaction/efficient.cu b/Project2-Stream-Compaction/stream_compaction/efficient.cu
index 83573cd..cbf8bb6 100644
--- a/Project2-Stream-Compaction/stream_compaction/efficient.cu
+++ b/Project2-Stream-Compaction/stream_compaction/efficient.cu
@@ -3,7 +3,7 @@
 #include "common.h"
 #include "efficient.h"
 
-# define blockSize 32
+# define blockSize 128
 
 namespace StreamCompaction {
     namespace Efficient {
@@ -14,18 +14,6 @@ namespace StreamCompaction {
             return timer;
         }
 
-        void printArray(int n, int *a, bool abridged = false) {
-            printf(" [ ");
-            for (int i = 0; i < n; i++) {
-                if (abridged && i + 2 == 15 && n > 16) {
-                    i = n - 2;
-                    printf("... 
"); - } - printf("%3d ", a[i]); - } - printf("]\n"); - } - int *dev_arr1; int *dev_arr2; int *dev_bools; @@ -72,14 +60,6 @@ namespace StreamCompaction { */ void scan(int n, int *odata, const int *idata) { - bool stopTimer = false; - try { - timer().startGpuTimer(); - } - catch (const std::runtime_error& exception) { - stopTimer = true; - } - int diff = (1 << ilog2ceil(n)) - n; int N = n + diff; @@ -91,6 +71,15 @@ namespace StreamCompaction { dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize); + bool stopTimer = false; + try { + timer().startGpuTimer(); + } + catch (const std::runtime_error& exception) { + stopTimer = true; + } + + if (diff) { kernZeroPadding << >> (n,N,dev_arr1); } @@ -109,13 +98,15 @@ namespace StreamCompaction { checkCUDAErrorFn("Kernel Down Sweep Failed"); } + + if (!stopTimer) + timer().endGpuTimer(); + cudaMemcpy(odata, dev_arr1, sizeof(int) * n, cudaMemcpyDeviceToHost); checkCUDAErrorFn("Copying back to Host failed"); cudaFree(dev_arr1); - if(!stopTimer) - timer().endGpuTimer(); } @@ -165,6 +156,8 @@ namespace StreamCompaction { Common::kernScatter << > > (n,dev_odata,dev_arr2,dev_bools,dev_indices); checkCUDAErrorFn("Kernel Scatter failed"); + timer().endGpuTimer(); + int length = indices[n - 1]; if (idata[n - 1]) @@ -178,8 +171,6 @@ namespace StreamCompaction { cudaFree(dev_bools); cudaFree(dev_indices); cudaFree(dev_odata); - - timer().endGpuTimer(); return length; } } diff --git a/Project2-Stream-Compaction/stream_compaction/naive.cu b/Project2-Stream-Compaction/stream_compaction/naive.cu index 2a14d51..4988ca1 100644 --- a/Project2-Stream-Compaction/stream_compaction/naive.cu +++ b/Project2-Stream-Compaction/stream_compaction/naive.cu @@ -3,7 +3,7 @@ #include "common.h" #include "naive.h" -# define blockSize 32 +# define blockSize 128 namespace StreamCompaction { namespace Naive { @@ -42,7 +42,7 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); cudaMalloc((void**)&dev_arr1, n * sizeof(int)); @@ -57,6 +57,8 @@ namespace StreamCompaction { cudaMemcpy(dev_arr2, idata, sizeof(int) * n, cudaMemcpyHostToDevice); checkCUDAErrorFn("Memory copy idata to arr2 failed"); + timer().startGpuTimer(); + for (int d = 1; d <= ilog2ceil(n); d++) { int pos = 1 << (d - 1); @@ -70,13 +72,14 @@ namespace StreamCompaction { kernShiftRight << > > (n, dev_arr2, dev_arr1); checkCUDAErrorFn("Kernel Scan failed"); + timer().endGpuTimer(); + cudaMemcpy(odata, dev_arr2, sizeof(int) * n, cudaMemcpyDeviceToHost); checkCUDAErrorFn("Copying back to Host failed"); cudaFree(dev_arr1); cudaFree(dev_arr2); - timer().endGpuTimer(); } } } diff --git a/Project2-Stream-Compaction/stream_compaction/thrust.cu b/Project2-Stream-Compaction/stream_compaction/thrust.cu index 1def45e..618bbf4 100644 --- a/Project2-Stream-Compaction/stream_compaction/thrust.cu +++ b/Project2-Stream-Compaction/stream_compaction/thrust.cu @@ -18,11 +18,19 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
         */
         void scan(int n, int *odata, const int *idata) {
-            timer().startGpuTimer();
+
             // TODO use `thrust::exclusive_scan`
             // example: for device_vectors dv_in and dv_out:
             // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
-            timer().endGpuTimer();
+
+            thrust::device_vector<int> dv_in(idata, idata + n);
+            thrust::device_vector<int> dv_out(idata, idata + n);
+            timer().startGpuTimer();
+            thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+            timer().endGpuTimer();
+            thrust::copy(dv_out.begin(), dv_out.end(), odata);
+
+
         }
     }
 }

From 091d1ca031b945f7e0a0c1d364992114b7cbb697 Mon Sep 17 00:00:00 2001
From: SOMANSHU AGARWAL
Date: Sat, 14 Sep 2019 23:35:06 -0400
Subject: [PATCH 03/56] Changes for Part 2B

---
 Project2-Character-Recognition/CMakeLists.txt |   2 +
 .../character_recognition/CMakeLists.txt      |   2 +-
 .../character_recognition/mlp.cu              |  97 +++++++++-
 Project2-Character-Recognition/src/main.cpp   | 176 ++++--------------
 .../src/testing_helpers.hpp                   |   6 +-
 5 files changed, 143 insertions(+), 140 deletions(-)

diff --git a/Project2-Character-Recognition/CMakeLists.txt b/Project2-Character-Recognition/CMakeLists.txt
index 09e9198..1caf239 100644
--- a/Project2-Character-Recognition/CMakeLists.txt
+++ b/Project2-Character-Recognition/CMakeLists.txt
@@ -22,6 +22,7 @@ if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
 endif()
 
 include_directories(.)
+link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib/x64)
 add_subdirectory(character_recognition)
 
 cuda_add_executable(${CMAKE_PROJECT_NAME}
@@ -32,4 +33,5 @@ cuda_add_executable(${CMAKE_PROJECT_NAME}
 target_link_libraries(${CMAKE_PROJECT_NAME}
     character_recognition
     ${CORELIBS}
+    cublas
     )
diff --git a/Project2-Character-Recognition/character_recognition/CMakeLists.txt b/Project2-Character-Recognition/character_recognition/CMakeLists.txt
index 7446175..9e834c1 100644
--- a/Project2-Character-Recognition/character_recognition/CMakeLists.txt
+++ b/Project2-Character-Recognition/character_recognition/CMakeLists.txt
@@ -7,5 +7,5 @@ set(SOURCE_FILES
 
 cuda_add_library(character_recognition
     ${SOURCE_FILES}
-    OPTIONS -arch=sm_20
+    OPTIONS -arch=sm_61
     )
diff --git a/Project2-Character-Recognition/character_recognition/mlp.cu b/Project2-Character-Recognition/character_recognition/mlp.cu
index 5a3ed7f..c8d61d3 100644
--- a/Project2-Character-Recognition/character_recognition/mlp.cu
+++ b/Project2-Character-Recognition/character_recognition/mlp.cu
@@ -2,6 +2,15 @@
 #include 
 #include "common.h"
 #include "mlp.h"
+#include "cublas_v2.h"
+
+# define blockSize 128
+# define hiddenLayerLen 10
+
+static __inline__ void modify(cublasHandle_t handle, float *m, int ldm, int n, int p, int q, float alpha, float beta) {
+    cublasSscal(handle, n - q + 1, &alpha, &m[IDX2F(p, q, ldm)], ldm);
+    cublasSscal(handle, ldm - p + 1, &beta, &m[IDX2F(p, q, ldm)], 1);
+}
 
 namespace CharacterRecognition {
     using Common::PerformanceTimer;
@@ -10,7 +19,12 @@ namespace CharacterRecognition {
         static PerformanceTimer timer;
         return timer;
     }
-
+
+    float *dev_input;
+    float *dev_hiddenLayer;
+    float *dev_output;
+    float *dev_weightsIH;
+    float *dev_weightsHO;
     // TODO: __global__
 
     /**
@@ -23,5 +37,86 @@ namespace CharacterRecognition {
     }
     */
 
+    __global__ void kernMatrixMultiplication(int n, int m, int k, float *M, float *N, float *Out) {
+        int ty = blockIdx.y * blockDim.y + threadIdx.y;
+        int tx = blockIdx.x * blockDim.x + threadIdx.x;
+        int sum = 0;
+        if (col < k && )
+
+    }
+    __global__ void kernActivationFunction(int N, float* ) {
+
+    }
    // TODO: implement required elements for MLP sections 1 and 2 here
+
+    /*void 
matrixMultiplication(float *M, float *N, float *Out) {
+
+        stat = cublasCreate(&handle);
+        if (stat != CUBLAS_STATUS_SUCCESS) {
+            printf("CUBLAS initialization failed\n");
+            return EXIT_FAILURE;
+        }
+        stat = cublasSetMatrix(M, N, sizeof(*a), a, M, devPtrA, M);
+        if (stat != CUBLAS_STATUS_SUCCESS) {
+            printf("data download failed");
+            cudaFree(devPtrA);
+            cublasDestroy(handle);
+            return EXIT_FAILURE;
+        }
+        modify(handle, devPtrA, M, N, 2, 3, 16.0f, 12.0f);
+        stat = cublasGetMatrix(M, N, sizeof(*a), devPtrA, M, a, M);
+        if (stat != CUBLAS_STATUS_SUCCESS) {
+            printf("data upload failed");
+            cudaFree(devPtrA);
+            cublasDestroy(handle);
+            return EXIT_FAILURE;
+        }
+    }*/
+
+    void createNN(int n, int h, int m, const float *idata, float *hidden, float *odata, const float *weightsIH, const float *weightsHO) {
+
+        cublasStatus_t stat;
+        cublasHandle_t handle;
+
+        cudaMalloc((void**)&dev_input, n * sizeof(float));
+        checkCUDAErrorFn("Malloc idata into input failed");
+
+        cudaMemcpy(dev_input, idata, sizeof(float) * n, cudaMemcpyHostToDevice);
+        checkCUDAErrorFn("Copying idata to input failed");
+
+        cudaMalloc((void**)&dev_hiddenLayer, h * sizeof(float));
+        checkCUDAErrorFn("Malloc idata into hidden layer failed");
+
+        cudaMalloc((void**)&dev_output, m * sizeof(float));
+        checkCUDAErrorFn("Malloc idata into output failed");
+
+        cudaMalloc((void**)&dev_weightsIH, (n*h) * sizeof(float));
+        checkCUDAErrorFn("Malloc idata into weights b/w input & hidden failed");
+
+        cudaMalloc((void**)&dev_weightsHO, (h*m) * sizeof(float));
+        checkCUDAErrorFn("Malloc idata into weights b/w hidden & output failed");
+
+        cudaMemcpy(dev_weightsIH, weightsIH, sizeof(float) * (n*h), cudaMemcpyHostToDevice);
+        checkCUDAErrorFn("Copying weights array 1 failed");
+
+        cudaMemcpy(dev_weightsHO, weightsHO, sizeof(float) * (h*m), cudaMemcpyHostToDevice);
+        checkCUDAErrorFn("Copying weights array 2 failed");
+
+        dim3 fullBlocks1((n + blockSize - 1) / blockSize);
+        dim3 fullBlocks2((h + blockSize - 1) / blockSize);
+        dim3 fullBlocks3((m + blockSize - 1) / blockSize);
+        dim3 fullBlocksMult((m + blockSize - 1) / blockSize);
+
+        //kernMultiplyWeights<<<fullBlocks1, blockSize>>>(n, hiddenLayerLen, dev_input, dev_hiddenLayer, dev_weightsIH);
+
+        kernMatrixMultiplication(dev_input, dev_weightsIH, dev_hiddenLayer);
+
+        //cudaMemcpy(dev_hiddenLayer, hidden, sizeof(float) * (n*h), cudaMemcpyHostToDevice);
+        //checkCUDAErrorFn("Copying hidden layer units failed");
+
+        kernActivationFunction<<<fullBlocks2, blockSize>>>(h, dev_hiddenLayer);
+        checkCUDAErrorFn("Kernel Activation function failed");
+
+
+    }
 }
diff --git a/Project2-Character-Recognition/src/main.cpp b/Project2-Character-Recognition/src/main.cpp
index 11dd534..52ad1aa 100644
--- a/Project2-Character-Recognition/src/main.cpp
+++ b/Project2-Character-Recognition/src/main.cpp
@@ -11,142 +11,48 @@
 #include 
 #include "testing_helpers.hpp"
 
-const int SIZE = 1 << 8; // feel free to change the size of array
-const int NPOT = SIZE - 3; // Non-Power-Of-Two
-int *a = new int[SIZE];
-int *b = new int[SIZE];
-int *c = new int[SIZE];
-
-int main(int argc, char* argv[]) {
-    // Scan tests
-
-    printf("\n");
-    printf("****************\n");
-    printf("** SCAN TESTS **\n");
-    printf("****************\n");
-
-    genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case
-    a[SIZE - 1] = 0;
-    printArray(SIZE, a, true);
-
-    // initialize b using StreamCompaction::CPU::scan you implement
-    // We use b for further comparison. Make sure your StreamCompaction::CPU::scan is correct.
-    // At first all cases passed because b && c are all zeroes.
- zeroArray(SIZE, b); - printDesc("cpu scan, power-of-two"); - StreamCompaction::CPU::scan(SIZE, b, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(SIZE, b, true); - - zeroArray(SIZE, c); - printDesc("cpu scan, non-power-of-two"); - StreamCompaction::CPU::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(NPOT, b, true); - printCmpResult(NPOT, b, c); - - zeroArray(SIZE, c); - printDesc("naive scan, power-of-two"); - StreamCompaction::Naive::scan(SIZE, c, a); - printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - - /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan - onesArray(SIZE, c); - printDesc("1s array for finding bugs"); - StreamCompaction::Naive::scan(SIZE, c, a); - printArray(SIZE, c, true); */ - - zeroArray(SIZE, c); - printDesc("naive scan, non-power-of-two"); - StreamCompaction::Naive::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(NPOT, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient scan, power-of-two"); - StreamCompaction::Efficient::scan(SIZE, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient scan, non-power-of-two"); - StreamCompaction::Efficient::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); - printCmpResult(NPOT, b, c); - - zeroArray(SIZE, c); - printDesc("thrust scan, power-of-two"); - StreamCompaction::Thrust::scan(SIZE, c, a); - printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - - zeroArray(SIZE, c); - printDesc("thrust scan, non-power-of-two"); - StreamCompaction::Thrust::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); - printCmpResult(NPOT, b, c); - - printf("\n"); - printf("*****************************\n"); - printf("** STREAM COMPACTION TESTS **\n"); - printf("*****************************\n"); - - // Compaction tests - - genArray(SIZE - 1, a, 4); // Leave a 0 at the end to test that edge case - a[SIZE - 1] = 0; - printArray(SIZE, a, true); - - int count, expectedCount, expectedNPOT; - - // initialize b using StreamCompaction::CPU::compactWithoutScan you implement - // We use b for further comparison. Make sure your StreamCompaction::CPU::compactWithoutScan is correct. 
- zeroArray(SIZE, b); - printDesc("cpu compact without scan, power-of-two"); - count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - expectedCount = count; - printArray(count, b, true); - printCmpLenResult(count, expectedCount, b, b); - - zeroArray(SIZE, c); - printDesc("cpu compact without scan, non-power-of-two"); - count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - expectedNPOT = count; - printArray(count, c, true); - printCmpLenResult(count, expectedNPOT, b, c); - - zeroArray(SIZE, c); - printDesc("cpu compact with scan"); - count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(count, c, true); - printCmpLenResult(count, expectedCount, b, c); +void createInputXor(float *input) { + input[0] = 0.0; + input[1] = 0.0; + input[2] = 0.0; + input[3] = 1.0; + input[4] = 1.0; + input[5] = 0.0; + input[6] = 1.0; + input[7] = 1.0; +} - zeroArray(SIZE, c); - printDesc("work-efficient compact, power-of-two"); - count = StreamCompaction::Efficient::compact(SIZE, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); - printCmpLenResult(count, expectedCount, b, c); +void createOutputXor(float *output) { + output[0] = 0.0; + output[1] = 1.0; + output[2] = 1.0; + output[3] = 0.0; +} - zeroArray(SIZE, c); - printDesc("work-efficient compact, non-power-of-two"); - count = StreamCompaction::Efficient::compact(NPOT, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); - printCmpLenResult(count, expectedNPOT, b, c); - system("pause"); // stop Win32 console from closing on exit - delete[] a; - delete[] b; - delete[] c; +int main(int argc, char* argv[]) { + + printf("\n"); + printf("****************\n"); + printf("** CREATING THE NEURAL NETWORK **\n"); + printf("****************\n"); + + const int INPUT_SIZE = 4; // Input size + const int HIDDENLAYER_SIZE = 2; // Output size + const int OUTPUT_SIZE = 1; // Output size + const int FEATURE_SIZE = 2; + + float *input = new float[INPUT_SIZE*FEATURE_SIZE]; + float *hidden = new float[HIDDENLAYER_SIZE]; + float *output = new float[OUTPUT_SIZE]; + float *weightsIH = new float[HIDDENLAYER_SIZE*FEATURE_SIZE]; + float *weightsHO = new float[HIDDENLAYER_SIZE*OUTPUT_SIZE]; + + createInputXor(input); + genArray(FEATURE_SIZE*HIDDENLAYER_SIZE, weightsIH, 100); + genArray(HIDDENLAYER_SIZE*OUTPUT_SIZE, weightsHO, 100); + + CharacterRecognition::createNN(INPUT_SIZE, HIDDENLAYER_SIZE, OUTPUT_SIZE, input, output, weightsIH, weightsHO); + + return 0; } diff --git a/Project2-Character-Recognition/src/testing_helpers.hpp b/Project2-Character-Recognition/src/testing_helpers.hpp index b28a8d2..0503578 100644 --- a/Project2-Character-Recognition/src/testing_helpers.hpp +++ b/Project2-Character-Recognition/src/testing_helpers.hpp @@ -43,17 +43,17 @@ void zeroArray(int n, int *a) { } } -void onesArray(int n, int *a) { +void onesArray(int n, float *a) { for (int i = 0; i < n; i++) { a[i] = 1; } } -void genArray(int n, int *a, int maxval) { +void genArray(int n, float *a, int maxval) { 
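+	// (note) unlike the old integer version, rand() / maxval is not bounded by
+	// maxval; with maxval = 100 the generated weights can be as large as
+	// RAND_MAX / 100.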
srand(time(nullptr)); for (int i = 0; i < n; i++) { - a[i] = rand() % maxval; + a[i] = ((float)rand()) / (float)maxval; } } From b9a7591dac019b6cf4a03fddc8f116193c53c34a Mon Sep 17 00:00:00 2001 From: SOMANSHU AGARWAL Date: Mon, 16 Sep 2019 03:50:08 -0400 Subject: [PATCH 04/56] Changes for Part 2B --- Project2-Character-Recognition/CMakeLists.txt | 4 +- .../character_recognition/mlp.cu | 448 ++++++++++++++++-- .../character_recognition/mlp.h | 2 +- Project2-Character-Recognition/src/main.cpp | 45 +- .../src/testing_helpers.hpp | 6 +- 5 files changed, 437 insertions(+), 68 deletions(-) diff --git a/Project2-Character-Recognition/CMakeLists.txt b/Project2-Character-Recognition/CMakeLists.txt index 1caf239..a8562ca 100644 --- a/Project2-Character-Recognition/CMakeLists.txt +++ b/Project2-Character-Recognition/CMakeLists.txt @@ -22,7 +22,6 @@ if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin") endif() include_directories(.) -link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib/x64) add_subdirectory(character_recognition) cuda_add_executable(${CMAKE_PROJECT_NAME} @@ -33,5 +32,4 @@ cuda_add_executable(${CMAKE_PROJECT_NAME} target_link_libraries(${CMAKE_PROJECT_NAME} character_recognition ${CORELIBS} - cublas - ) + ) \ No newline at end of file diff --git a/Project2-Character-Recognition/character_recognition/mlp.cu b/Project2-Character-Recognition/character_recognition/mlp.cu index c8d61d3..58d947a 100644 --- a/Project2-Character-Recognition/character_recognition/mlp.cu +++ b/Project2-Character-Recognition/character_recognition/mlp.cu @@ -2,15 +2,12 @@ #include #include "common.h" #include "mlp.h" -#include "cublas_v2.h" +#include -# define blockSize 128 -# define hiddenLayerLen 10 +//#include "cublas_v2.h" -static __inline__ void modify(cublasHandle_t handle, float *m, int ldm, int n, int p, int q, float alpha, float beta) { - cublasSscal(handle, n - q + 1, &alpha, &m[IDX2F(p, q, ldm)], ldm); - cublasSscal(handle, ldm - p + 1, &beta, &m[IDX2F(p, q, ldm)], 1); -} +# define blockSize 1 +# define hiddenLayerLen 10 namespace CharacterRecognition { using Common::PerformanceTimer; @@ -25,6 +22,10 @@ namespace CharacterRecognition { float *dev_output; float *dev_weightsIH; float *dev_weightsHO; + float *dev_newWeightsIH; + float *dev_newWeightsHO; + float *dev_actualOutput; + float *dev_gradB; // TODO: __global__ /** @@ -37,71 +38,372 @@ namespace CharacterRecognition { } */ - __global__ void kernMatrixMultiplication(int n, int m, int k, float *M,float *N, float *Out) { - int ty = blockIdx.y * blockDim.y + threadIdx.y; - int tx = blockIdx.x * blockDim.x + threadIdx.x; - int sum = 0; - if (col < k && ) + // Multiply the arrays A and B on GPU and save the result in C + // C(m,n) = A(m,k) * B(k,n) + /* + void gpu_blas_mmul(const float *A, const float *B, float *C, const int m, const int k, const int n) { + int lda = m, ldb = k, ldc = m; + const float alf = 1; + const float bet = 0; + const float *alpha = &alf; + const float *beta = &bet; + + // Create a handle for CUBLAS + cublasHandle_t handle; + cublasCreate(&handle); + + // Do the actual multiplication + cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); + // Destroy the handle + cublasDestroy(handle); } - __global__ void kernActivationFunction(int N, float* ) { + */ + __global__ void kernMatrixMultiplication(float *M,float *N, float *Out,int m, int n,int k) { + int row = blockIdx.y * blockDim.y + threadIdx.y; + int col = blockIdx.x * blockDim.x + threadIdx.x; + //printf("The values of m , n and k are :%d , %d %d \n", m, n , 
k); + //printf("The values of row and col are: %d & %d \n", row, col); + float sum = 0; + if (col < k && row < m) { + for (int i = 0; i < n; i++) { + sum += M[row*n + i] * N[i*k + col]; + //printf("hello the value of Sum is : %0.3f\n",sum); + } + //printf("The values are %d & %d \n", row, col); + Out[row*k + col] = sum; + //printf("The value is: %0.2f \n", Out[row*k + col]); + } } - // TODO: implement required elements for MLP sections 1 and 2 here + + void printArray(int n, float *a, bool abridged = false) { + printf(" [ "); + for (int i = 0; i < n; i++) { + if (abridged && i + 2 == 15 && n > 16) { + i = n - 2; + printf("... "); + } + printf("%0.2f ", a[i]); + } + printf("]\n"); + } + + __global__ void kernSigmoidFunction(int N, float* A) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + if (index >= N) + return; - /*void matrixMultiplication(float *M,float *N,float *Out) { + A[index] = exp(-1*A[index]); + A[index] = 1.0 / (1.0 + A[index]); - stat = cublasCreate(&handle); - if (stat != CUBLAS_STATUS_SUCCESS) { - printf("CUBLAS initialization failed\n"); - return EXIT_FAILURE; + } + + __global__ void kernSoftMax(int N, float *A, int d) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + if (index >= N) + return; + //printf("The index values are :%d\n", index); + float sum = 0; + //printf("The values are %d and %d :\n", N, d); + for (int i = index * d; i < index*d + d; i++) { + sum += exp(A[i]); + //printf("%d \n", i); } - stat = cublasSetMatrix(M, N, sizeof(*a), a, M, devPtrA, M); - if (stat != CUBLAS_STATUS_SUCCESS) { - printf("data download failed"); - cudaFree(devPtrA); - cublasDestroy(handle); - return EXIT_FAILURE; + + for (int i = index * d; i < index*d + d; i++) { + A[i] = exp(A[i]) / sum; } - modify(handle, devPtrA, M, N, 2, 3, 16.0f, 12.0f); - stat = cublasGetMatrix(M, N, sizeof(*a), devPtrA, M, a, M); - if (stat != CUBLAS_STATUS_SUCCESS) { - printf("data upload failed"); - cudaFree(devPtrA); - cublasDestroy(handle); - return EXIT_FAILURE; + } + + // TODO: implement required elements for MLP sections 1 and 2 here + + __global__ void kernCalculateLoss(int N, float *output, float *actualOutput, float *loss,int d) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + if (index >= N) + return; + + for (int i = index * d; i < index*d + d; i++) { + if (actualOutput[i] == 1.0) + loss[index] = -log(output[i]); } - }*/ + + } + + __global__ void kernSubtraction(int N,float *A, float *B, float *C) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + if (index >= N) + return; + + C[index] = A[index] - B[index]; + } + + __global__ void kernSoftMaxGradient(int N,int d, float *A, float *B, float *C) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + if (index >= N) + return; - void createNN(int n,int h,int m, const float *idata, float *hidden, float *odata, const float *weightsIH, const float *weightsHO) { - cublasStatus_t stat; - cublasHandle_t handle; + for (int i = index * d; i < index*d + d; i++) { + if (B[i] == 1.0) + C[i] = A[i] - 1; + else + C[i] = A[i]; + C[i] /= N; - cudaMalloc((void**)&dev_input, n * sizeof(float)); - checkCUDAErrorFn("Malloc idata into input failed"); + } + } - cudaMemcpy(dev_input, idata, sizeof(float) * n, cudaMemcpyHostToDevice); - checkCUDAErrorFn("Copying idata to input failed"); + __global__ void kernSigmoidGrad(int N, float *A,float *B) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + if (index >= N) + return; + + B[index] = A[index] * (1 - A[index]); + } + + __global__ void kernDotProduct(int N,float *A, float *B,float *C) { + 
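+			// Elementwise (Hadamard) product C = A * B; used in backprop to fold
+			// the sigmoid derivative into the incoming gradient.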
+            int index = threadIdx.x + blockIdx.x * blockDim.x;
+            if (index >= N)
+                return;
+
+            C[index] = A[index] * B[index];
+            //printf("Values are for index %d is: %0.2f \n", index, C[index]);
+        }
+
+        __global__ void gpu_matrix_transpose(float* mat_in, float* mat_out, unsigned int rows, unsigned int cols)
+        {
+            unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
+            unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y;
+
+            if (idx < cols && idy < rows)
+            {
+                unsigned int pos = idy * cols + idx;
+                unsigned int trans_pos = idx * rows + idy;
+                mat_out[trans_pos] = mat_in[pos];
+            }
+        }
+
+        __global__ void kernUpdateWeights(int N, float *A, float *B, float *C, float step_size) {
+            int index = threadIdx.x + blockIdx.x * blockDim.x;
+            if (index >= N)
+                return;
+
+            B[index] = A[index] - step_size * C[index];
+
+        }
+
+        void createNN(float *input, float* hidden, float *output, float *weightsA, float *weightsB, int n, int h, int m, int d) {
+
+            dim3 fullBlocks1((n + blockSize - 1) / blockSize);
+            dim3 fullBlocks2((h + blockSize - 1) / blockSize);
+            //dim3 fullBlocks3((n + blockSize - 1) / blockSize);
+            dim3 fullBlocksMult1((h + blockSize - 1) / blockSize, (n + blockSize - 1) / blockSize);
+            dim3 fullBlocksMult2((m + blockSize - 1) / blockSize, (n + blockSize - 1) / blockSize);
+
+            kernMatrixMultiplication<<<fullBlocksMult1, blockSize>>>(input, weightsA, hidden, n, d, h);
+            checkCUDAErrorFn("Multiplication 1 failed");
+            //gpu_blas_mmul(dev_input, dev_weightsIH, dev_hiddenLayer, n, d, h);
+
+            //cudaMemcpy(dev_hiddenLayer, hidden, sizeof(float) * (n*h), cudaMemcpyHostToDevice);
+            //checkCUDAErrorFn("Copying hidden layer units failed");
+
+            kernSigmoidFunction<<<fullBlocks2, blockSize>>>(h, dev_hiddenLayer);
+            checkCUDAErrorFn("Kernel Activation function failed");
+
+            kernMatrixMultiplication<<<fullBlocksMult2, blockSize>>>(hidden, weightsB, output, n, h, m);
+            //gpu_blas_mmul(dev_hiddenLayer, dev_weightsHO, dev_output, h, d, m);
+
+            kernSoftMax<<<fullBlocks1, blockSize>>>(n, output, m);
+            checkCUDAErrorFn("Kernel Soft Max function failed");
+
+            //kernSigmoidFunction<<<fullBlocks3, blockSize>>>(m, dev_output);
+            //checkCUDAErrorFn("Kernel Activation function failed");
+        }
+
+        void trainNN(float *input, float *hidden, float *output, float *actualOutput, float *weightsA, float *weightsB,
+            float *newWeightsA, float *newWeightsB, int n, int h, int m, int d) {
+
+            float *hiddenTrans;
+            float *gradSoftMax;
+            float *weightsBTrans;
+            float *devGrad;
+            float *dev_hiddenLayerGrad;
+            float *devGrad2;
+            float *inputTrans;
+            float *dev_gradA;
+
+            cudaMalloc((void**)&dev_gradB, (h*m) * sizeof(float));
+            checkCUDAErrorFn("Malloc gradient B weights failed");
+
+            cudaMalloc((void**)&gradSoftMax, (n*m) * sizeof(float));
+            checkCUDAErrorFn("Malloc temporary arr1 failed");
+
+            cudaMalloc((void**)&weightsBTrans, (m*h) * sizeof(float));
+            checkCUDAErrorFn("Malloc temporary arr2 failed");
+
+            cudaMalloc((void**)&devGrad, (n*h) * sizeof(float));
+            checkCUDAErrorFn("Malloc temporary arr3 failed");
+
+            cudaMalloc((void**)&devGrad2, (n*h) * sizeof(float));
+            checkCUDAErrorFn("Malloc temporary arr4 failed");
+
+            cudaMalloc((void**)&hiddenTrans, (n*h) * sizeof(float));
+            checkCUDAErrorFn("Malloc temporary arr5 failed");
+
+            cudaMalloc((void**)&dev_hiddenLayerGrad, (n*h) * sizeof(float));
+            checkCUDAErrorFn("Malloc hiddenlayer gradient failed");
+
+            cudaMalloc((void**)&inputTrans, (n*d) * sizeof(float));
+            checkCUDAErrorFn("Malloc input transpose failed");
+
+            cudaMalloc((void**)&dev_gradA, (n*h) * sizeof(float));
+            checkCUDAErrorFn("Malloc gradient A failed");
+
+            // Wrote the structure for now; still needs to be checked later.
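+            // Backprop sketch, inferred from the kernels used below (softmax output
+            // with cross-entropy loss; Y is the one-hot target matrix):
+            //   gradSoftMax = (softmax(Z_out) - Y) / n            [n x m]
+            //   gradB       = Hidden^T * gradSoftMax              [h x m]
+            //   devGrad     = gradSoftMax * WeightsB^T            [n x h]
+            //   devGrad2    = devGrad .* Hidden .* (1 - Hidden)   [n x h, elementwise]
+            //   gradA       = Input^T * devGrad2                  [d x h]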
+            dim3 fullBlocksMult1((h + blockSize - 1) / blockSize, (n + blockSize - 1) / blockSize);
+            gpu_matrix_transpose<<<fullBlocksMult1, blockSize>>>(hidden, hiddenTrans, n, h);
+            checkCUDAErrorFn("Kernel transpose hidden failed");
+
+            dim3 fullBlocksMult2((m + blockSize - 1) / blockSize, (h + blockSize - 1) / blockSize);
+            dim3 fullBlocksMult3((n + blockSize - 1) / blockSize);
+
+            //kernSubtraction<<<fullBlocksMult3, blockSize>>>(n*m, output, actualOutput, tempOutput);
+
+            kernSoftMaxGradient<<<fullBlocksMult3, blockSize>>>(n, m, output, actualOutput, gradSoftMax);
+
+            kernMatrixMultiplication<<<fullBlocksMult2, blockSize>>>(hiddenTrans, gradSoftMax, dev_gradB, h, n, m);
+            checkCUDAErrorFn("Kernel Matrix Multiplication hidden and loss failed");
+
+            gpu_matrix_transpose<<<fullBlocksMult2, blockSize>>>(weightsB, weightsBTrans, h, m);
+            checkCUDAErrorFn("Kernel Transpose for weightsB failed");
+
+            kernMatrixMultiplication<<<fullBlocksMult1, blockSize>>>(gradSoftMax, weightsBTrans, devGrad, n, m, h);
+            checkCUDAErrorFn("Kernel Matrix Multiplication for Devgrad failed");
+
+            dim3 fullBlocksMult4((n*h + blockSize - 1) / blockSize);
+            kernSigmoidGrad<<<fullBlocksMult4, blockSize>>>(h*n, dev_hiddenLayer, dev_hiddenLayerGrad);
+            checkCUDAErrorFn("Kernel Sigmoid gradient failed");
+
+            kernDotProduct<<<fullBlocksMult4, blockSize>>>(n*h, dev_hiddenLayerGrad, devGrad, devGrad2);
+            checkCUDAErrorFn("Kernel Dot Product failed");
+
+            dim3 fullBlocksMult5((d + blockSize - 1) / blockSize, (n + blockSize - 1) / blockSize);
+            dim3 fullBlocksMult6((h + blockSize - 1) / blockSize, (d + blockSize - 1) / blockSize);
+
+            gpu_matrix_transpose<<<fullBlocksMult5, blockSize>>>(input, inputTrans, n, d);
+            checkCUDAErrorFn("Kernel Transpose for input failed");
+
+            kernMatrixMultiplication<<<fullBlocksMult6, blockSize>>>(inputTrans, devGrad2, dev_gradA, d, n, h);
+            checkCUDAErrorFn("Kernel Matrix Multiplication for gradA failed");
+
+            /*
+            float *check2 = new float[d*h];
+
+            cudaMemcpy(check2, dev_gradA, sizeof(float) * (d*h), cudaMemcpyDeviceToHost);
+            checkCUDAErrorFn("Copying data to output failed");
+
+            printf("Grad A \n");
+            printArray(d*h, check2, true);
+            */
+            float eta_rate = 0.2;
+
+            dim3 fullBlocksMult7((d*h + blockSize - 1) / blockSize);
+            kernUpdateWeights<<<fullBlocksMult7, blockSize>>>(d*h, weightsA, newWeightsA, dev_gradA, eta_rate);
+            checkCUDAErrorFn("kernel update weights A failed");
+
+            /*
+            float *check2 = new float[d*h];
+
+            cudaMemcpy(check2, newWeightsA, sizeof(float) * (d*h), cudaMemcpyDeviceToHost);
+            checkCUDAErrorFn("Copying data to output failed");
+
+            printf("New weights A \n");
+            printArray(d*h, check2, true);
+            */
+
+            dim3 fullBlocksMult8((h*m + blockSize - 1) / blockSize);
+            kernUpdateWeights<<<fullBlocksMult8, blockSize>>>(h*m, weightsB, newWeightsB, dev_gradB, eta_rate);
+            checkCUDAErrorFn("Kernel update weights B failed");
+
+            cudaFree(dev_gradB);
+            cudaFree(gradSoftMax);
+            cudaFree(hiddenTrans);
+            cudaFree(devGrad);
+            cudaFree(devGrad2);
+            cudaFree(dev_hiddenLayerGrad);
+            cudaFree(inputTrans);
+            cudaFree(dev_gradA);
+            cudaFree(weightsBTrans);
+
+
+            //kernMatrixMultiplication<<<fullBlocksMult2, blockSize>>>(hiddenTrans, output - actualOutput, h, n, m);
+            //checkCUDAErrorFn("Kernel Matrix Multiplication hidden and loss failed");
+
+            //gpu_blas_mmul((output - actualOutput), hidden, dev_gradB, m, d, h); //(Still to calculate mean)
+            //gpu_blas_mmul(hidden, (1 - hidden), dev_arr1, h, d, h); // Check the dimensions
+            //gpu_blas_mmul(weightsA, dev_arr1, dev_arr2, h, d, h); // Still to look on transpose
+            //gpu_blas_mmul((output - actualOutput), input, dev_arr3, m, d, n); // Look into it for transpose
+
+        }
+
+        float calculateLoss(int n, float *dev_output, float *dev_actualOutput, float *dev_loss, int m) {
+            dim3 fullBlocks1((n + blockSize - 1) / blockSize);
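+            // Loss sketch: kernCalculateLoss writes the per-sample negative
+            // log-likelihood, loss[i] = -log(p_target), which is summed on the
+            // host below; a device-side reduction would also work.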
+            kernCalculateLoss<<<fullBlocks1, blockSize>>>(n, dev_output, dev_actualOutput, dev_loss, m);
+            float *loss = new float[n];
+            cudaMemcpy(loss, dev_loss, sizeof(float) * (n), cudaMemcpyDeviceToHost);
+            checkCUDAErrorFn("Copying loss to host failed");
+
+            float totalLoss = 0;
+            for (int i = 0; i < n; i++)
+                totalLoss += loss[i];
+            return totalLoss;
+        }
+        void createAndTrainNN(int n, int h, int m, int d, float *idata, float *hidden, float *odata, float *weightsIH, float *weightsHO, float *actualOutput) {
+
+            float *dev_loss;
+
+            cudaMalloc((void**)&dev_input, (n*d) * sizeof(float));
+            checkCUDAErrorFn("Malloc idata into input failed");
+
+            cudaMalloc((void**)&dev_hiddenLayer, (n*h) * sizeof(float));
+            checkCUDAErrorFn("Malloc idata into hidden layer failed");
+
+            cudaMalloc((void**)&dev_output, (n*m) * sizeof(float));
+            checkCUDAErrorFn("Malloc idata into output failed");
+
+            cudaMalloc((void**)&dev_weightsIH, (d*h) * sizeof(float));
+            checkCUDAErrorFn("Malloc idata into weights b/w input & hidden failed");
+
+            cudaMalloc((void**)&dev_weightsHO, (h*m) * sizeof(float));
+            checkCUDAErrorFn("Malloc idata into weights b/w hidden & output failed");
+
+            cudaMalloc((void**)&dev_actualOutput, (n*m) * sizeof(float));
+            checkCUDAErrorFn("Malloc actual output memory failed");
+
+            cudaMalloc((void**)&dev_newWeightsIH, (d*h) * sizeof(float));
+            checkCUDAErrorFn("Malloc new weights IH memory failed");
+
+            cudaMalloc((void**)&dev_newWeightsHO, (h*m) * sizeof(float));
+            checkCUDAErrorFn("Malloc new weights HO memory failed");
+
+            cudaMalloc((void**)&dev_loss, (n) * sizeof(float));
+            checkCUDAErrorFn("Malloc loss memory failed");
+
+            cudaMemcpy(dev_input, idata, sizeof(float) * (n*d), cudaMemcpyHostToDevice);
+            checkCUDAErrorFn("Copying idata to input failed");
+
+            cudaMemcpy(dev_actualOutput, actualOutput, sizeof(float) * (n*m), cudaMemcpyHostToDevice);
+            checkCUDAErrorFn("Copying real output failed");
+
+            cudaMemcpy(dev_weightsIH, weightsIH, sizeof(float) * (d*h), cudaMemcpyHostToDevice);
+            checkCUDAErrorFn("Copying weights array 1 failed");
+
+            cudaMemcpy(dev_weightsHO, weightsHO, sizeof(float) * (h*m), cudaMemcpyHostToDevice);
+            checkCUDAErrorFn("Copying weights array 2 failed");
+
+            //printf("Inside the function \n");
+
+            dim3 fullBlocks1((n + blockSize - 1) / blockSize);
+            dim3 fullBlocks2((h + blockSize - 1) / blockSize);
+            dim3 fullBlocks3((m + blockSize - 1) / blockSize);
+            dim3 fullBlocksMult((m + blockSize - 1) / blockSize);
+
+            //kernMultiplyWeights<<<fullBlocks1, blockSize>>>(n, hiddenLayerLen, dev_input, dev_hiddenLayer, dev_weightsIH);
+
+            createNN(dev_input, dev_hiddenLayer, dev_output, dev_weightsIH, dev_weightsHO, n, h, m, d);
+
+            float totalLoss;
+            totalLoss = calculateLoss(n, dev_output, dev_actualOutput, dev_loss, m);
+
+            //thrust::device_pointer()
+            //thrust::inclusive_scan(dev_loss, dev_loss + n, dev_loss);
+            /* float *check = new float[n];
+
+            cudaMemcpy(check, dev_loss, sizeof(float) * (n), cudaMemcpyDeviceToHost);
+            checkCUDAErrorFn("Copying data to hidden layer failed");
+
+            printArray(n, check, true);
+            */
+            //printf("Total loss: %0.2f\n", totalLoss);
+            //float totalError = 1;
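+            // (observation) Once the training loop below assigns dev_weightsIH =
+            // dev_newWeightsIH, kernUpdateWeights reads and writes the same buffer.
+            // That is safe for an elementwise update, but the weight buffers first
+            // allocated above are never freed.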
+            //if (totalLoss > totalError) {
+
+            int iterations = 0;
+            float totalError = 0.1;
+            while (totalLoss > totalError && iterations < 10000) {
+                trainNN(dev_input, dev_hiddenLayer, dev_output, dev_actualOutput, dev_weightsIH, dev_weightsHO, dev_newWeightsIH, dev_newWeightsHO, n, h, m, d);
+                dev_weightsIH = dev_newWeightsIH;
+                dev_weightsHO = dev_newWeightsHO;
+                createNN(dev_input, dev_hiddenLayer, dev_output, dev_weightsIH, dev_weightsHO, n, h, m, d);
+                totalLoss = calculateLoss(n, dev_output, dev_actualOutput, dev_loss, m) / n;
+                iterations++;
+                printf("Iteration: %d \n", iterations);
+                printf("Total loss is :%0.3f", totalLoss);
+            }
+
+            cudaMemcpy(hidden, dev_hiddenLayer, sizeof(float) * (n*h), cudaMemcpyDeviceToHost);
+            checkCUDAErrorFn("Copying data to hidden layer failed");
+
+            cudaMemcpy(odata, dev_output, sizeof(float) * (n*m), cudaMemcpyDeviceToHost);
+            checkCUDAErrorFn("Copying data to output failed");
+
+            cudaMemcpy(weightsIH, dev_newWeightsIH, sizeof(float) * (d*h), cudaMemcpyDeviceToHost);
+            checkCUDAErrorFn("Copying weights IH back failed");
+
+            cudaMemcpy(weightsHO, dev_newWeightsHO, sizeof(float) * (h*m), cudaMemcpyDeviceToHost);
+            checkCUDAErrorFn("Copying weights HO back failed");
+
+            cudaFree(dev_input);
+            cudaFree(dev_hiddenLayer);
+            cudaFree(dev_output);
+            cudaFree(dev_weightsIH);
+            cudaFree(dev_weightsHO);
+            cudaFree(dev_newWeightsIH);
+            cudaFree(dev_newWeightsHO);
+            cudaFree(dev_actualOutput);
+            cudaFree(dev_loss);
+
+        }
 }
diff --git a/Project2-Character-Recognition/character_recognition/mlp.h b/Project2-Character-Recognition/character_recognition/mlp.h
index 2096228..1efecac 100644
--- a/Project2-Character-Recognition/character_recognition/mlp.h
+++ b/Project2-Character-Recognition/character_recognition/mlp.h
@@ -4,6 +4,6 @@
 
 namespace CharacterRecognition {
     Common::PerformanceTimer& timer();
-
+    void createAndTrainNN(int n, int h, int m, int d, float *idata, float *hidden, float *odata, float *weightsIH, float *weightsHO, float *actualOutput);
     // TODO: implement required elements for MLP sections 1 and 2 here
 }
diff --git a/Project2-Character-Recognition/src/main.cpp b/Project2-Character-Recognition/src/main.cpp
index 52ad1aa..d765b11 100644
--- a/Project2-Character-Recognition/src/main.cpp
+++ b/Project2-Character-Recognition/src/main.cpp
@@ -23,10 +23,14 @@ void createInputXor(float *input) {
 }
 
 void createOutputXor(float *output) {
-    output[0] = 0.0;
-    output[1] = 1.0;
-    output[2] = 1.0;
-    output[3] = 0.0;
+    output[0] = 1.0;
+    output[1] = 0.0;
+    output[2] = 0.0;
+    output[3] = 1.0;
+    output[4] = 0.0;
+    output[5] = 1.0;
+    output[6] = 1.0;
+    output[7] = 0.0;
 }
 
 
@@ -39,20 +43,39 @@ int main(int argc, char* argv[]) {
     printf("****************\n");
 
     const int INPUT_SIZE = 4; // Input size
     const int HIDDENLAYER_SIZE = 2; // Hidden layer size
-    const int OUTPUT_SIZE = 1; // Output size
-    const int FEATURE_SIZE = 2;
+    const int OUTPUT_SIZE = 2; // Output size
+    const int FEATURE_SIZE = 2; // Feature Size
 
     float *input = new float[INPUT_SIZE*FEATURE_SIZE];
-    float *hidden = new float[HIDDENLAYER_SIZE];
-    float *output = new float[OUTPUT_SIZE];
+    float *hidden = new float[INPUT_SIZE*HIDDENLAYER_SIZE];
+    float *output = new float[INPUT_SIZE*OUTPUT_SIZE];
     float *weightsIH = new float[HIDDENLAYER_SIZE*FEATURE_SIZE];
     float *weightsHO = new float[HIDDENLAYER_SIZE*OUTPUT_SIZE];
+    float *outputNN = new float[INPUT_SIZE*OUTPUT_SIZE];
 
     createInputXor(input);
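+    // XOR targets encoded one-hot per row: column 0 = "XOR is 0", column 1 = "XOR is 1".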
+    createOutputXor(output);
-    genArray(FEATURE_SIZE*HIDDENLAYER_SIZE, weightsIH, 100);
-    genArray(HIDDENLAYER_SIZE*OUTPUT_SIZE, weightsHO, 100);
+    genArray(FEATURE_SIZE*HIDDENLAYER_SIZE, weightsIH, 1);
+    genArray(HIDDENLAYER_SIZE*OUTPUT_SIZE, weightsHO, 1);
+
+    printf("Weights A array: \n");
+    printArray(HIDDENLAYER_SIZE*FEATURE_SIZE, weightsIH, true);
+    printf("Weights B array: \n");
+    printArray(HIDDENLAYER_SIZE*OUTPUT_SIZE, weightsHO, true);
 
-    CharacterRecognition::createNN(INPUT_SIZE, HIDDENLAYER_SIZE, OUTPUT_SIZE, input, output, weightsIH, weightsHO);
+    CharacterRecognition::createAndTrainNN(INPUT_SIZE, HIDDENLAYER_SIZE, OUTPUT_SIZE, FEATURE_SIZE, input, hidden, outputNN, weightsIH, weightsHO, output);
+    printf("After NN and Training \n");
+    printf("Input Array: \n");
+    printArray(INPUT_SIZE*FEATURE_SIZE, input, true);
+    printf("Hidden Layer Array: \n");
+    printArray(INPUT_SIZE*HIDDENLAYER_SIZE, hidden, true);
+    printf("Output Array: \n");
+    printArray(INPUT_SIZE*OUTPUT_SIZE, outputNN, true);
+    printf("Actual Output Array: \n");
+    printArray(INPUT_SIZE*OUTPUT_SIZE, output, true);
+    printf("Weights A array: \n");
+    printArray(HIDDENLAYER_SIZE*FEATURE_SIZE, weightsIH, true);
+    printf("Weights B array: \n");
+    printArray(HIDDENLAYER_SIZE*OUTPUT_SIZE, weightsHO, true);
 
     return 0;
 }
diff --git a/Project2-Character-Recognition/src/testing_helpers.hpp b/Project2-Character-Recognition/src/testing_helpers.hpp
index 0503578..0b50291 100644
--- a/Project2-Character-Recognition/src/testing_helpers.hpp
+++ b/Project2-Character-Recognition/src/testing_helpers.hpp
@@ -53,18 +53,18 @@ void genArray(int n, float *a, int maxval) {
 	srand(time(nullptr));
 
 	for (int i = 0; i < n; i++) {
-		a[i] = ((float)rand()) / (float)maxval;
+		a[i] = static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
 	}
 }
 
-void printArray(int n, int *a, bool abridged = false) {
+void printArray(int n, float *a, bool abridged = false) {
 	printf(" [ ");
 	for (int i = 0; i < n; i++) {
 		if (abridged && i + 2 == 15 && n > 16) {
 			i = n - 2;
 			printf("... 
"); } - printf("%3d ", a[i]); + printf("%0.4f ", a[i]); } printf("]\n"); } From f6ca9d818008a01bfd42106dfd58029624daa5ce Mon Sep 17 00:00:00 2001 From: SOMANSHU AGARWAL Date: Mon, 16 Sep 2019 17:39:28 -0400 Subject: [PATCH 05/56] Latest changes for part 2B --- .../character_recognition/mlp.cu | 78 +++++++++++++++---- Project2-Character-Recognition/src/main.cpp | 11 ++- .../src/testing_helpers.hpp | 18 ++++- 3 files changed, 86 insertions(+), 21 deletions(-) diff --git a/Project2-Character-Recognition/character_recognition/mlp.cu b/Project2-Character-Recognition/character_recognition/mlp.cu index 58d947a..f0bb2a1 100644 --- a/Project2-Character-Recognition/character_recognition/mlp.cu +++ b/Project2-Character-Recognition/character_recognition/mlp.cu @@ -193,10 +193,25 @@ namespace CharacterRecognition { } + __global__ void kernGetAccuracy(int N, float *A,int *B,int d) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + if (index >= N) + return; + + int max_index = 0; + float max_val = -1;; + for (int i = index * d; i < index*d + d; i++) { + if (max_val < A[i]) { + max_val = A[i]; + max_index = i; + } + } + B[index] = max_index % d; + } void createNN(float *input, float* hidden, float *output, float *weightsA, float *weightsB, int n, int h, int m, int d) { dim3 fullBlocks1((n + blockSize - 1) / blockSize); - dim3 fullBlocks2((h + blockSize - 1) / blockSize); + dim3 fullBlocks2((n*h + blockSize - 1) / blockSize); //dim3 fullBlocks3((n + blockSize - 1) / blockSize); dim3 fullBlocksMult1((h + blockSize - 1) / blockSize, (n + blockSize - 1) / blockSize); dim3 fullBlocksMult2((m + blockSize - 1) / blockSize, (n + blockSize - 1) / blockSize); @@ -208,7 +223,7 @@ namespace CharacterRecognition { //cudaMemcpy(dev_hiddenLayer, hidden, sizeof(float) * (n*h), cudaMemcpyHostToDevice); //checkCUDAErrorFn("Copying hidden layer units failed"); - kernSigmoidFunction << > > (h, dev_hiddenLayer); + kernSigmoidFunction << > > (n*h, hidden); checkCUDAErrorFn("Kernel Activation function failed"); kernMatrixMultiplication << > > (hidden, weightsB, output, n, h, m); @@ -271,9 +286,20 @@ namespace CharacterRecognition { //kernSubtraction << > > (n*m,output, actualOutput,tempOutput); kernSoftMaxGradient << > > (n,m,output,actualOutput,gradSoftMax); + /* + float *check0 = new float[d*h]; + + cudaMemcpy(check0, gradSoftMax, sizeof(float) * (n*m), cudaMemcpyDeviceToHost); + checkCUDAErrorFn("Copying data to output failed"); + + printf("Gradient Soft Max \n"); + printArray(n*m, check0, true); + */ kernMatrixMultiplication << > > (hiddenTrans, gradSoftMax ,dev_gradB,h,n,m); checkCUDAErrorFn("Kernel Matrix Multiplication hiiden and loss failed"); + + //float *check0 = new float[d*h]; gpu_matrix_transpose << > > (weightsB, weightsBTrans, h, m); checkCUDAErrorFn("Kernel Transpose for weightsB failed"); @@ -282,7 +308,7 @@ namespace CharacterRecognition { checkCUDAErrorFn("Kernel Matrix Multiplication for Devgrad failed"); dim3 fullBlocksMult4((n*h + blockSize - 1) / blockSize); - kernSigmoidGrad << > > (h*n,dev_hiddenLayer,dev_hiddenLayerGrad); + kernSigmoidGrad << > > (h*n,hidden,dev_hiddenLayerGrad); checkCUDAErrorFn("Kernel Sigmoid gradient failed"); kernDotProduct << > > (n*h,dev_hiddenLayerGrad,devGrad,devGrad2); @@ -306,21 +332,29 @@ namespace CharacterRecognition { printf("Grad A \n"); printArray(d*h, check2, true); */ - float eta_rate = 0.2; - - dim3 fullBlocksMult7((d*h + blockSize - 1) / blockSize); - kernUpdateWeights << > > (d*h,weightsA,newWeightsA,dev_gradA,eta_rate); - checkCUDAErrorFn("kernel update 
weights A failed"); + float eta_rate = 0.3; + + float *check = new float[d*h]; + cudaMemcpy(check, dev_gradA, sizeof(float) * (d*h), cudaMemcpyDeviceToHost); + checkCUDAErrorFn("Copying data to output failed"); /* - float *check2 = new float[d*h]; + printf("Grad A \n"); + printArray(d*h, check, true); - cudaMemcpy(check2, newWeightsA, sizeof(float) * (d*h), cudaMemcpyDeviceToHost); + + float *check2 = new float[h*m]; + + cudaMemcpy(check2, dev_gradB, sizeof(float) * (h*m), cudaMemcpyDeviceToHost); checkCUDAErrorFn("Copying data to output failed"); - printf("New weights A \n"); - printArray(d*h, check2, true); + printf("Grad B \n"); + printArray(h*m, check2, true); */ + dim3 fullBlocksMult7((d*h + blockSize - 1) / blockSize); + kernUpdateWeights << > > (d*h,weightsA,newWeightsA,dev_gradA,eta_rate); + checkCUDAErrorFn("kernel update weights A failed"); + dim3 fullBlocksMult8((h*m + blockSize - 1) / blockSize); kernUpdateWeights << > > (h*m, weightsB, newWeightsB,dev_gradB, eta_rate); @@ -362,6 +396,7 @@ namespace CharacterRecognition { void createAndTrainNN(int n,int h,int m,int d, float *idata, float *hidden, float *odata, float *weightsIH, float *weightsHO,float *actualOutput) { float *dev_loss; + int *dev_predict; cudaMalloc((void**)&dev_input, (n*d) * sizeof(float)); checkCUDAErrorFn("Malloc idata into input failed"); @@ -390,6 +425,9 @@ namespace CharacterRecognition { cudaMalloc((void**)&dev_loss, (n) * sizeof(float)); checkCUDAErrorFn("Malloc actual output memeory failed"); + cudaMalloc((void**)&dev_predict, (n) * sizeof(float)); + checkCUDAErrorFn("Malloc predict memeory failed"); + cudaMemcpy(dev_input, idata, sizeof(float) * (n*d), cudaMemcpyHostToDevice); checkCUDAErrorFn("Copying idata to input failed"); @@ -431,7 +469,7 @@ namespace CharacterRecognition { int iterations = 0; float totalError = 0.1; - while (totalLoss > totalError && iterations < 10000) { + while (totalLoss > totalError && iterations < 2000) { trainNN(dev_input, dev_hiddenLayer, dev_output, dev_actualOutput, dev_weightsIH, dev_weightsHO, dev_newWeightsIH, dev_newWeightsHO, n, h, m, d); dev_weightsIH = dev_newWeightsIH; dev_weightsHO = dev_newWeightsHO; @@ -439,9 +477,21 @@ namespace CharacterRecognition { totalLoss = calculateLoss(n, dev_output, dev_actualOutput, dev_loss, m) / n; iterations++; printf("Iteration: %d \n", iterations); - printf("Total loss is :%0.3f", totalLoss); + printf("Total loss is :%0.3f\n", totalLoss); } + //dim3 fullBlocks4((n + blockSize - 1) / blockSize); + kernGetAccuracy << < fullBlocks1,blockSize>> > (n*m,dev_output,dev_predict,m); + checkCUDAErrorFn("Kernel accuracy failed"); + + int *predict = new int[n]; + cudaMemcpy(predict, dev_predict, sizeof(float) * (n), cudaMemcpyDeviceToHost); + checkCUDAErrorFn("Copying predict data failed"); + + for (int i = 0; i < n; i++) { + printf("The outcome for the data point %d is: %d\n", i+1,predict[i]); + } + cudaMemcpy(hidden, dev_hiddenLayer, sizeof(float) * (n*h), cudaMemcpyDeviceToHost); checkCUDAErrorFn("Copying data to hidden layer failed"); diff --git a/Project2-Character-Recognition/src/main.cpp b/Project2-Character-Recognition/src/main.cpp index d765b11..5b89c7c 100644 --- a/Project2-Character-Recognition/src/main.cpp +++ b/Project2-Character-Recognition/src/main.cpp @@ -42,7 +42,7 @@ int main(int argc, char* argv[]) { printf("****************\n"); const int INPUT_SIZE = 4; // Input size - const int HIDDENLAYER_SIZE = 2; // Output size + const int HIDDENLAYER_SIZE = 4; // Output size const int OUTPUT_SIZE = 2; // Output size const int 
diff --git a/Project2-Character-Recognition/src/testing_helpers.hpp b/Project2-Character-Recognition/src/testing_helpers.hpp
index 0b50291..65976bc 100644
--- a/Project2-Character-Recognition/src/testing_helpers.hpp
+++ b/Project2-Character-Recognition/src/testing_helpers.hpp
@@ -37,7 +37,7 @@
 void printCmpLenResult(int n, int expN, T *a, T *b) {
 cmpArrays(n, a, b) ? "FAIL VALUE" : "passed");
 }

-void zeroArray(int n, int *a) {
+void zeroArray(int n, float *a) {
 for (int i = 0; i < n; i++) {
 a[i] = 0;
 }
@@ -49,14 +49,24 @@
 void onesArray(int n, float *a) {
 }
 }

-void genArray(int n, float *a, int maxval) {
- srand(time(nullptr));
+void genArrayA(int n, float *a, int val) {
+ //srand(time(nullptr));
+ srand(0);

 for (int i = 0; i < n; i++) {
- a[i] = static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
+ a[i] = -1.0 + (static_cast<float>(rand()) / static_cast<float>(RAND_MAX)) * 2;
 }
 }

+void genArrayB(int n, float *a, int val) {
+ //srand(time(nullptr));
+ srand(1);
+
+ for (int i = 0; i < n; i++) {
+ a[i] = -1.0 + (static_cast<float>(rand()) / static_cast<float>(RAND_MAX)) * 2;
+ }
+}
+
 void printArray(int n, float *a, bool abridged = false) {
 printf(" [ ");
 for (int i = 0; i < n; i++) {

From 007eb5dcdd152a43b6af1807d85eeba70316ca93 Mon Sep 17 00:00:00 2001
From: SOMANSHU AGARWAL
Date: Mon, 16 Sep 2019 23:16:42 -0400
Subject: [PATCH 06/56] Changes in part 2B

---
 .../character_recognition/mlp.cu            |  27 ++--
 Project2-Character-Recognition/src/main.cpp | 123 +++++++++++++++---
 .../src/testing_helpers.hpp                 |   2 +-
 3 files changed, 121 insertions(+), 31 deletions(-)

diff --git a/Project2-Character-Recognition/character_recognition/mlp.cu b/Project2-Character-Recognition/character_recognition/mlp.cu
index f0bb2a1..aef4305 100644
--- a/Project2-Character-Recognition/character_recognition/mlp.cu
+++ b/Project2-Character-Recognition/character_recognition/mlp.cu
@@ -7,7 +7,6 @@
 //#include "cublas_v2.h"

 # define blockSize 1
-# define hiddenLayerLen 10

 namespace CharacterRecognition {
 using Common::PerformanceTimer;
@@ -227,6 +226,7 @@
 checkCUDAErrorFn("Kernel Activation function failed");

 kernMatrixMultiplication <<<fullBlocksMult2, blockSize>>> (hidden, weightsB, output, n, h, m);
+ checkCUDAErrorFn("Multiplication 2 failed");
 //gpu_blas_mmul(dev_hiddenLayer, dev_weightsHO, dev_output, h, d, m);

 kernSoftMax <<<fullBlocks1, blockSize>>> (n, output, m);
@@ -252,25 +252,25 @@
 checkCUDAErrorFn("Malloc gradient B weights failed");

 cudaMalloc((void**)&gradSoftMax, (n*m) * sizeof(float));
- checkCUDAErrorFn("Malloc temporay arr1 failed");
+ checkCUDAErrorFn("Malloc Soft gradient failed");

 cudaMalloc((void**)&weightsBTrans, (m*h) * sizeof(float));
- checkCUDAErrorFn("Malloc temporary arr2 failed");
+ checkCUDAErrorFn("Malloc weightsB Transpose failed");

 cudaMalloc((void**)&devGrad, (n*h) * sizeof(float));
- checkCUDAErrorFn("Malloc temporary arr3 failed");
+ checkCUDAErrorFn("Malloc devGrad failed");

 cudaMalloc((void**)&devGrad2, (n*h) * sizeof(float));
- checkCUDAErrorFn("Malloc temporary arr3 failed");
+ checkCUDAErrorFn("Malloc devGrad2 failed");

 cudaMalloc((void**)&hiddenTrans, (n*h) * sizeof(float));
- checkCUDAErrorFn("Malloc temporary arr3 failed");
+ checkCUDAErrorFn("Malloc hiddenTrans failed");

 cudaMalloc((void**)&dev_hiddenLayerGrad, (n*h) * sizeof(float));
 checkCUDAErrorFn("Malloc hiddenlayer gradient failed");

 cudaMalloc((void**)&inputTrans, (n*d) * sizeof(float));
- checkCUDAErrorFn("Malloc hiddenlayer gradient failed");
+ checkCUDAErrorFn("Malloc input trans failed");

 cudaMalloc((void**)&dev_gradA, (d*h) * sizeof(float));
 checkCUDAErrorFn("Malloc gradient A failed");
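Each cudaMalloc above is paired with a checkCUDAErrorFn call carrying a descriptive message; the renames in this hunk just make those messages match the buffers they guard. The helper's real definition lives in common.h and is not shown in these patches; a sketch of the usual pattern such a helper follows (the body here is an assumption):

    #include <cstdio>
    #include <cstdlib>
    #include <cuda_runtime.h>

    // Report the most recent CUDA error, tagged with a caller-supplied message.
    void checkCudaSketch(const char *msg) {
        cudaError_t err = cudaGetLastError();
        if (err != cudaSuccess) {
            fprintf(stderr, "%s: %s\n", msg, cudaGetErrorString(err));
            exit(EXIT_FAILURE);
        }
    }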
checkCUDAErrorFn("Malloc temporary arr3 failed"); + checkCUDAErrorFn("Malloc devGrad2 failed"); cudaMalloc((void**)&hiddenTrans, (n*h) * sizeof(float)); - checkCUDAErrorFn("Malloc temporary arr3 failed"); + checkCUDAErrorFn("Malloc hiddenTrans failed"); cudaMalloc((void**)&dev_hiddenLayerGrad, (n*h) * sizeof(float)); checkCUDAErrorFn("Malloc hiddenlayer gradient failed"); cudaMalloc((void**)&inputTrans, (n*d) * sizeof(float)); - checkCUDAErrorFn("Malloc hiddenlayer gradient failed"); + checkCUDAErrorFn("Malloc input trans failed"); cudaMalloc((void**)&dev_gradA, (n*h) * sizeof(float)); checkCUDAErrorFn("Malloc gradient A failed"); @@ -312,7 +312,7 @@ namespace CharacterRecognition { checkCUDAErrorFn("Kernel Sigmoid gradient failed"); kernDotProduct << > > (n*h,dev_hiddenLayerGrad,devGrad,devGrad2); - checkCUDAErrorFn("Kernel Sigmoid gradient failed"); + checkCUDAErrorFn("Kernel Dot Product failed"); dim3 fullBlocksMult5((d + blockSize - 1) / blockSize, (n + blockSize - 1) / blockSize); dim3 fullBlocksMult6((h + blockSize - 1) / blockSize, (d + blockSize - 1) / blockSize); @@ -332,12 +332,14 @@ namespace CharacterRecognition { printf("Grad A \n"); printArray(d*h, check2, true); */ + /* float eta_rate = 0.3; float *check = new float[d*h]; cudaMemcpy(check, dev_gradA, sizeof(float) * (d*h), cudaMemcpyDeviceToHost); checkCUDAErrorFn("Copying data to output failed"); + */ /* printf("Grad A \n"); printArray(d*h, check, true); @@ -351,6 +353,7 @@ namespace CharacterRecognition { printf("Grad B \n"); printArray(h*m, check2, true); */ + float eta_rate = 0.3; dim3 fullBlocksMult7((d*h + blockSize - 1) / blockSize); kernUpdateWeights << > > (d*h,weightsA,newWeightsA,dev_gradA,eta_rate); checkCUDAErrorFn("kernel update weights A failed"); @@ -479,7 +482,13 @@ namespace CharacterRecognition { printf("Iteration: %d \n", iterations); printf("Total loss is :%0.3f\n", totalLoss); } - + float *check = new float[n*m]; + + cudaMemcpy(check, dev_output, sizeof(float) * (n*m), cudaMemcpyDeviceToHost); + checkCUDAErrorFn("Copying data to hidden layer failed"); + + printArray(n*m, check, true); + //dim3 fullBlocks4((n + blockSize - 1) / blockSize); kernGetAccuracy << < fullBlocks1,blockSize>> > (n*m,dev_output,dev_predict,m); checkCUDAErrorFn("Kernel accuracy failed"); diff --git a/Project2-Character-Recognition/src/main.cpp b/Project2-Character-Recognition/src/main.cpp index 5b89c7c..6691b7c 100644 --- a/Project2-Character-Recognition/src/main.cpp +++ b/Project2-Character-Recognition/src/main.cpp @@ -10,6 +10,10 @@ #include #include #include "testing_helpers.hpp" +#include +#include +#include +using namespace std; void createInputXor(float *input) { input[0] = 0.0; @@ -33,19 +37,7 @@ void createOutputXor(float *output) { output[7] = 0.0; } - -int main(int argc, char* argv[]) { - - printf("\n"); - printf("****************\n"); - printf("** CREATING THE NEURAL NETWORK **\n"); - printf("****************\n"); - - const int INPUT_SIZE = 4; // Input size - const int HIDDENLAYER_SIZE = 4; // Output size - const int OUTPUT_SIZE = 2; // Output size - const int FEATURE_SIZE = 2; // Feature Size - +void xor2x2NN(int INPUT_SIZE, int HIDDENLAYER_SIZE, int OUTPUT_SIZE, int FEATURE_SIZE) { float *input = new float[INPUT_SIZE*FEATURE_SIZE]; float *hidden = new float[INPUT_SIZE*HIDDENLAYER_SIZE]; float *output = new float[INPUT_SIZE*OUTPUT_SIZE]; @@ -53,13 +45,13 @@ int main(int argc, char* argv[]) { float *weightsHO = new float[HIDDENLAYER_SIZE*OUTPUT_SIZE]; float *outputNN = new float[INPUT_SIZE*OUTPUT_SIZE]; + 
diff --git a/Project2-Character-Recognition/src/main.cpp b/Project2-Character-Recognition/src/main.cpp
index 5b89c7c..6691b7c 100644
--- a/Project2-Character-Recognition/src/main.cpp
+++ b/Project2-Character-Recognition/src/main.cpp
@@ -10,6 +10,10 @@
 #include
 #include
 #include "testing_helpers.hpp"
+#include <fstream>
+#include <iostream>
+#include <string>
+using namespace std;

 void createInputXor(float *input) {
 input[0] = 0.0;
@@ -33,19 +37,7 @@
 output[7] = 0.0;
 }

-int main(int argc, char* argv[]) {
-
- printf("\n");
- printf("****************\n");
- printf("** CREATING THE NEURAL NETWORK **\n");
- printf("****************\n");
-
- const int INPUT_SIZE = 4; // Input size
- const int HIDDENLAYER_SIZE = 4; // Hidden layer size
- const int OUTPUT_SIZE = 2; // Output size
- const int FEATURE_SIZE = 2; // Feature Size
-
+void xor2x2NN(int INPUT_SIZE, int HIDDENLAYER_SIZE, int OUTPUT_SIZE, int FEATURE_SIZE) {
 float *input = new float[INPUT_SIZE*FEATURE_SIZE];
 float *hidden = new float[INPUT_SIZE*HIDDENLAYER_SIZE];
 float *output = new float[INPUT_SIZE*OUTPUT_SIZE];
 float *weightsIH = new float[HIDDENLAYER_SIZE*FEATURE_SIZE];
 float *weightsHO = new float[HIDDENLAYER_SIZE*OUTPUT_SIZE];
 float *outputNN = new float[INPUT_SIZE*OUTPUT_SIZE];

 createInputXor(input);
 createOutputXor(output);

 srand(10);
 genArrayA(FEATURE_SIZE*HIDDENLAYER_SIZE, weightsIH, 0);
 genArrayB(HIDDENLAYER_SIZE*OUTPUT_SIZE, weightsHO, 1);

 //zeroArray(FEATURE_SIZE*HIDDENLAYER_SIZE, weightsIH);
 //zeroArray(HIDDENLAYER_SIZE*OUTPUT_SIZE, weightsHO);

 printf("Weights A array: \n");
 printArray(HIDDENLAYER_SIZE*FEATURE_SIZE, weightsIH, true);
 printf("Weights B array: \n");
 printArray(HIDDENLAYER_SIZE*OUTPUT_SIZE, weightsHO, true);
- CharacterRecognition::createAndTrainNN(INPUT_SIZE, HIDDENLAYER_SIZE, OUTPUT_SIZE, FEATURE_SIZE, input,hidden, outputNN, weightsIH, weightsHO,output);
+ CharacterRecognition::createAndTrainNN(INPUT_SIZE, HIDDENLAYER_SIZE, OUTPUT_SIZE, FEATURE_SIZE, input, hidden, outputNN, weightsIH, weightsHO, output);
+
 printf("After NN and Training \n");
 printf("Input Array: \n");
- printArray(INPUT_SIZE*FEATURE_SIZE,input,true);
+ printArray(INPUT_SIZE*FEATURE_SIZE, input, true);
 printf("hidden Layer Array: \n");
- printArray(INPUT_SIZE*HIDDENLAYER_SIZE,hidden,true);
+ printArray(INPUT_SIZE*HIDDENLAYER_SIZE, hidden, true);
 printf("Output Array: \n");
- printArray(INPUT_SIZE*OUTPUT_SIZE,outputNN,true);
+ printArray(INPUT_SIZE*OUTPUT_SIZE, outputNN, true);
 printf("Actual Output Array: \n");
 printArray(INPUT_SIZE*OUTPUT_SIZE, output, true);
 printf("Weights A array: \n");
- printArray(HIDDENLAYER_SIZE*FEATURE_SIZE,weightsIH,true);
+ printArray(HIDDENLAYER_SIZE*FEATURE_SIZE, weightsIH, true);
 printf("Weights B array: \n");
- printArray(HIDDENLAYER_SIZE*OUTPUT_SIZE,weightsHO,true);
- return 0;
+ printArray(HIDDENLAYER_SIZE*OUTPUT_SIZE, weightsHO, true);
+}
+
+int main(int argc, char* argv[]) {
+
+ printf("\n");
+ printf("****************\n");
+ printf("** CREATING THE NEURAL NETWORK **\n");
+ printf("****************\n");
+
+ /*
+ const int INPUT_SIZE = 4; // Input size
+ const int HIDDENLAYER_SIZE = 4; // Hidden layer size
+ const int OUTPUT_SIZE = 2; // Output size
+ const int FEATURE_SIZE = 2; // Feature Size
+
+ xor2x2NN(INPUT_SIZE,HIDDENLAYER_SIZE,OUTPUT_SIZE,FEATURE_SIZE);
+ */
+
+ const int INPUT_SIZE = 52; // Input size
+ const int HIDDENLAYER_SIZE = 10; // Hidden layer size
+ const int OUTPUT_SIZE = 52; // Output size
+ const int FEATURE_SIZE = 10201; // Feature Size
+
+ float *input = new float[INPUT_SIZE*FEATURE_SIZE];
+ float *hidden = new float[INPUT_SIZE*HIDDENLAYER_SIZE];
+ float *output = new float[INPUT_SIZE*OUTPUT_SIZE];
+ float *weightsIH = new float[HIDDENLAYER_SIZE*FEATURE_SIZE];
+ float *weightsHO = new float[HIDDENLAYER_SIZE*OUTPUT_SIZE];
+ float *outputNN = new float[INPUT_SIZE*OUTPUT_SIZE];
+
+ for (int i = 1; i <= INPUT_SIZE; i++) {
+ string line;
+ string filename;
+ string temp;
+ if (i < 10)
+ filename = "C:\\Users\\somanshu\\Desktop\\Project2-Number-Algorithms\\Project2-Character-Recognition\\data-set\\0" + std::to_string(i) + "info.txt";
+ else
+ filename = "C:\\Users\\somanshu\\Desktop\\Project2-Number-Algorithms\\Project2-Character-Recognition\\data-set\\" + std::to_string(i) + "info.txt";
+ ifstream myfile(filename);
+ if (myfile.is_open())
+ {
+ int count = 0;
+ while (getline(myfile, line))
+ {
+ if (count == 2) {
+ std::size_t foundFirst = 1;
+ std::size_t foundLast = 1;
+ int counter = 0;
+ while (foundLast < line.length()) {
+ foundLast = line.find(" ", foundFirst);
+ temp = line.substr(foundFirst, foundLast - foundFirst);
+ float val = (float) std::stoi(temp);
+ //float val2 = (float)val;
+ input[(i - 1)*FEATURE_SIZE + counter] = val / 255;
+ counter++;
+ foundFirst = foundLast + 1;
+ }
+ }
+
+ count++;
+ }
+ myfile.close();
+ }
+ }
+
+ /*for (int i = 0; i < 10; i++) {
+ printf("Element is: %0.2f\n", input[i]);
+ }
+ printf("last Elements \n");
+ for (int i = (10201*52)-10; i < 10201*52; i++) {
+ printf("Element is: %0.2f\n", input[i]);
+ }
+ */
+
+ genArrayA(FEATURE_SIZE*HIDDENLAYER_SIZE, weightsIH, 0);
+ genArrayB(HIDDENLAYER_SIZE*OUTPUT_SIZE, weightsHO, 1);
+
+ zeroArray(INPUT_SIZE*OUTPUT_SIZE, output);
+ for (int i = 0; i < OUTPUT_SIZE; i++)
+ output[i*INPUT_SIZE + i] = 1.0;
+
+ //float *inputNorm = new float[INPUT_SIZE*FEATURE_SIZE];
+ CharacterRecognition::createAndTrainNN(INPUT_SIZE, HIDDENLAYER_SIZE, OUTPUT_SIZE, FEATURE_SIZE, input, hidden, outputNN, weightsIH, weightsHO, output);
+
+ /*
+ printf("Output Array: \n");
+ printArray(INPUT_SIZE*OUTPUT_SIZE, outputNN, true);
+ printf("Actual Output Array: \n");
+ printArray(INPUT_SIZE*OUTPUT_SIZE, output, true);
+ */
+}
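The find/substr loop in the new main tokenizes the third line of each info.txt by hand and normalizes every pixel by 255. The same parsing can be written more compactly with std::istringstream; this alternative is shown only for comparison and is not what the patch uses:

    #include <sstream>
    #include <string>

    // Parse one whitespace-separated line of integer pixel values into
    // floats in [0, 1]. Returns the number of values parsed.
    int parsePixelLine(const std::string &line, float *dst, int maxCount) {
        std::istringstream iss(line);
        int v, count = 0;
        while (count < maxCount && iss >> v) {
            dst[count++] = v / 255.0f;
        }
        return count;
    }

A stream extraction also sidesteps the off-by-one hazards of tracking foundFirst/foundLast indices manually.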
diff --git a/Project2-Character-Recognition/src/testing_helpers.hpp b/Project2-Character-Recognition/src/testing_helpers.hpp
index 65976bc..78ba8eb 100644
--- a/Project2-Character-Recognition/src/testing_helpers.hpp
+++ b/Project2-Character-Recognition/src/testing_helpers.hpp
@@ -39,7 +39,7 @@
 void zeroArray(int n, float *a) {
 for (int i = 0; i < n; i++) {
- a[i] = 0;
+ a[i] = 0.0;
 }
 }

From d5dc43b38c404471f4033f00abb451ecdc4b0688 Mon Sep 17 00:00:00 2001
From: SOMANSHU AGARWAL
Date: Mon, 16 Sep 2019 23:29:40 -0400
Subject: [PATCH 07/56] Changes for part 2B

---
 .../character_recognition/mlp.cu | 36 +++++++++----------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/Project2-Character-Recognition/character_recognition/mlp.cu b/Project2-Character-Recognition/character_recognition/mlp.cu
index aef4305..747a216 100644
--- a/Project2-Character-Recognition/character_recognition/mlp.cu
+++ b/Project2-Character-Recognition/character_recognition/mlp.cu
@@ -6,7 +6,8 @@
 //#include "cublas_v2.h"

-# define blockSize 1
+# define blockSize 128
+# define block 15

 namespace CharacterRecognition {
 using Common::PerformanceTimer;
@@ -209,23 +210,21 @@
 }

 void createNN(float *input, float* hidden, float *output, float *weightsA, float *weightsB, int n, int h, int m, int d) {
+ dim3 blockDim(block, block);
 dim3 fullBlocks1((n + blockSize - 1) / blockSize);
 dim3 fullBlocks2((n*h + blockSize - 1) / blockSize);
 //dim3 fullBlocks3((n + blockSize - 1) / blockSize);
- dim3 fullBlocksMult1((h + blockSize - 1) / blockSize, (n + blockSize - 1) / blockSize);
- dim3 fullBlocksMult2((m + blockSize - 1) / blockSize, (n + blockSize - 1) / blockSize);
+ dim3 fullBlocksMult1((h + blockDim.x - 1) / blockDim.x, (n + blockDim.y - 1) / blockDim.y);
+ dim3 fullBlocksMult2((m + blockDim.x - 1) / blockDim.x, (n + blockDim.y - 1) / blockDim.y);

- kernMatrixMultiplication <<<fullBlocksMult1, blockSize>>> (input, weightsA, hidden, n, d, h);
+ kernMatrixMultiplication <<<fullBlocksMult1, blockDim>>> (input, weightsA, hidden, n, d, h);
 checkCUDAErrorFn("Multiplication 1 failed");
 //gpu_blas_mmul(dev_input, dev_weightsIH, dev_hiddenLayer, n, d, h);

- //cudaMemcpy(dev_hiddenLayer, hidden, sizeof(float) * (n*h), cudaMemcpyHostToDevice);
- //checkCUDAErrorFn("Copying hidden layer units failed");
 kernSigmoidFunction <<<fullBlocks2, blockSize>>> (n*h, hidden);
 checkCUDAErrorFn("Kernel Activation function failed");

- kernMatrixMultiplication <<<fullBlocksMult2, blockSize>>> (hidden, weightsB, output, n, h, m);
+ kernMatrixMultiplication <<<fullBlocksMult2, blockDim>>> (hidden, weightsB, output, n, h, m);
 checkCUDAErrorFn("Multiplication 2 failed");
 //gpu_blas_mmul(dev_hiddenLayer, dev_weightsHO, dev_output, h, d, m);

 kernSoftMax <<<fullBlocks1, blockSize>>> (n, output, m);
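Patch 07 moves the matrix kernels from the degenerate one-thread blocks of the earlier patches (blockSize 1) to 15 x 15 thread blocks, sizing the grid with a ceiling division per axis. A sketch of how a 2-D kernel indexes under that launch shape; the kernel body here is illustrative, only the launch math mirrors the patch:

    #define BLOCK 15

    // C (rows x cols) = A (rows x inner) * B (inner x cols),
    // one thread per output element.
    __global__ void matMulSketch(const float *A, const float *B, float *C,
                                 int rows, int inner, int cols) {
        int col = blockIdx.x * blockDim.x + threadIdx.x;
        int row = blockIdx.y * blockDim.y + threadIdx.y;
        if (row >= rows || col >= cols)
            return;
        float acc = 0.0f;
        for (int k = 0; k < inner; k++)
            acc += A[row * inner + k] * B[k * cols + col];
        C[row * cols + col] = acc;
    }

    // Launch shape as in the patch: x covers columns, y covers rows.
    // dim3 blockDim(BLOCK, BLOCK);
    // dim3 grid((cols + BLOCK - 1) / BLOCK, (rows + BLOCK - 1) / BLOCK);
    // matMulSketch<<<grid, blockDim>>>(A, B, C, rows, inner, cols);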
@@ -276,11 +275,12 @@ namespace CharacterRecognition {
 checkCUDAErrorFn("Malloc gradient A failed");

 // Wrote the structure as of now, needs to check later
- dim3 fullBlocksMult1((h + blockSize - 1) / blockSize, (n + blockSize - 1) / blockSize);
- gpu_matrix_transpose <<<fullBlocksMult1, blockSize>>> (hidden, hiddenTrans, n, h);
+ dim3 blockDim(block, block);
+ dim3 fullBlocksMult1((h + blockDim.x - 1) / blockDim.x, (n + blockDim.y - 1) / blockDim.y);
+ gpu_matrix_transpose <<<fullBlocksMult1, blockDim>>> (hidden, hiddenTrans, n, h);
 checkCUDAErrorFn("Kernel transpose hidden failed");

- dim3 fullBlocksMult2((m + blockSize - 1) / blockSize, (h + blockSize - 1) / blockSize);
+ dim3 fullBlocksMult2((m + blockDim.x - 1) / blockDim.x, (h + blockDim.y - 1) / blockDim.y);
 dim3 fullBlocksMult3((n + blockSize - 1) / blockSize);

 //kernSubtraction <<<fullBlocksMult3, blockSize>>> (n*m, output, actualOutput, tempOutput);
 kernSoftMaxGradient <<<fullBlocksMult3, blockSize>>> (n, m, output, actualOutput, gradSoftMax);
@@ -296,15 +296,15 @@
 printArray(n*m, check0, true);
 */

- kernMatrixMultiplication <<<fullBlocksMult2, blockSize>>> (hiddenTrans, gradSoftMax, dev_gradB, h, n, m);
+ kernMatrixMultiplication <<<fullBlocksMult2, blockDim>>> (hiddenTrans, gradSoftMax, dev_gradB, h, n, m);
 checkCUDAErrorFn("Kernel Matrix Multiplication hidden and loss failed");

 //float *check0 = new float[d*h];

- gpu_matrix_transpose <<<fullBlocksMult2, blockSize>>> (weightsB, weightsBTrans, h, m);
+ gpu_matrix_transpose <<<fullBlocksMult2, blockDim>>> (weightsB, weightsBTrans, h, m);
 checkCUDAErrorFn("Kernel Transpose for weightsB failed");

- kernMatrixMultiplication <<<fullBlocksMult1, blockSize>>> (gradSoftMax, weightsBTrans, devGrad, n, m, h);
+ kernMatrixMultiplication <<<fullBlocksMult1, blockDim>>> (gradSoftMax, weightsBTrans, devGrad, n, m, h);
 checkCUDAErrorFn("Kernel Matrix Multiplication for Devgrad failed");

 dim3 fullBlocksMult4((n*h + blockSize - 1) / blockSize);
@@ -314,13 +314,13 @@
 kernDotProduct <<<fullBlocksMult4, blockSize>>> (n*h, dev_hiddenLayerGrad, devGrad, devGrad2);
 checkCUDAErrorFn("Kernel Dot Product failed");

- dim3 fullBlocksMult5((d + blockSize - 1) / blockSize, (n + blockSize - 1) / blockSize);
- dim3 fullBlocksMult6((h + blockSize - 1) / blockSize, (d + blockSize - 1) / blockSize);
+ dim3 fullBlocksMult5((d + blockDim.x - 1) / blockDim.x, (n + blockDim.y - 1) / blockDim.y);
+ dim3 fullBlocksMult6((h + blockDim.x - 1) / blockDim.x, (d + blockDim.y - 1) / blockDim.y);

- gpu_matrix_transpose <<<fullBlocksMult5, blockSize>>> (input, inputTrans, n, d);
+ gpu_matrix_transpose <<<fullBlocksMult5, blockDim>>> (input, inputTrans, n, d);
 checkCUDAErrorFn("Kernel Transpose for input failed");

- kernMatrixMultiplication <<<fullBlocksMult6, blockSize>>> (inputTrans, devGrad2, dev_gradA, d, n, h);
+ kernMatrixMultiplication <<<fullBlocksMult6, blockDim>>> (inputTrans, devGrad2, dev_gradA, d, n, h);
 checkCUDAErrorFn("Kernel Matrix Multiplication for gradA failed");

From 5a78fd3b80db44f321ac51a1b5dc354f14b315ed Mon Sep 17 00:00:00 2001
From: SOMANSHU AGARWAL
Date: Tue, 17 Sep 2019 02:57:01 -0400
Subject: [PATCH 08/56] Changes for Part2B, Plots and CSV files

---
 .../character_recognition/mlp.cu            |      9 +
 .../img/LossVsIterations_Character.png      |    Bin 0 -> 15482 bytes
 .../img/LossVsIterations_XOR.png            |    Bin 0 -> 21649 bytes
 .../lossCharacterTraining.csv               | 105758 +++++++++++++++
 .../lossCharacterTrainingLossValues.csv     |   2001 +
 .../lossXorTraining.csv                     |     44 +
 .../lossXorTrainingLossValues.csv           |   1713 +
 Project2-Character-Recognition/src/main.cpp |     96 +-
 8 files changed, 109592 insertions(+), 29 deletions(-)
 create mode 100644 Project2-Character-Recognition/img/LossVsIterations_Character.png
 create mode 100644 Project2-Character-Recognition/img/LossVsIterations_XOR.png
 create mode 100644 Project2-Character-Recognition/lossCharacterTraining.csv
 create mode 100644 Project2-Character-Recognition/lossCharacterTrainingLossValues.csv
 create mode 100644 Project2-Character-Recognition/lossXorTraining.csv
 create mode 100644 Project2-Character-Recognition/lossXorTrainingLossValues.csv
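gpu_matrix_transpose is launched with the same 2-D configuration throughout trainNN, but its body never appears in these patches. A minimal sketch consistent with calls such as gpu_matrix_transpose(hidden, hiddenTrans, n, h), where the input is an n x h row-major matrix; the body is an assumption:

    // out (cols x rows) = transpose of in (rows x cols),
    // one thread per element.
    __global__ void gpu_matrix_transpose(const float *in, float *out,
                                         int rows, int cols) {
        int col = blockIdx.x * blockDim.x + threadIdx.x;
        int row = blockIdx.y * blockDim.y + threadIdx.y;
        if (row >= rows || col >= cols)
            return;
        out[col * rows + row] = in[row * cols + col];
    }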
diff --git a/Project2-Character-Recognition/character_recognition/mlp.cu b/Project2-Character-Recognition/character_recognition/mlp.cu
index 747a216..84654ba 100644
--- a/Project2-Character-Recognition/character_recognition/mlp.cu
+++ b/Project2-Character-Recognition/character_recognition/mlp.cu
@@ -3,6 +3,9 @@
 #include "common.h"
 #include "mlp.h"
 #include
+#include <fstream>
+#include <iostream>
+using namespace std;
 //#include "cublas_v2.h"
@@ -472,6 +475,10 @@
 int iterations = 0;
 float totalError = 0.1;
+
+ ofstream outputFile;
+ outputFile.open("lossCharacterTrainingLossValues.csv");
+ outputFile << "Average Total Loss vs Iterations for Training" << endl;
 while (totalLoss > totalError && iterations < 2000) {
 trainNN(dev_input, dev_hiddenLayer, dev_output, dev_actualOutput, dev_weightsIH, dev_weightsHO, dev_newWeightsIH, dev_newWeightsHO, n, h, m, d);
 dev_weightsIH = dev_newWeightsIH;
 dev_weightsHO = dev_newWeightsHO;
@@ -481,7 +488,9 @@
 iterations++;
 printf("Iteration: %d \n", iterations);
 printf("Total loss is :%0.3f\n", totalLoss);
+ outputFile << totalLoss << endl;
 }
+ outputFile.close();
 float *check = new float[n*m];

 cudaMemcpy(check, dev_output, sizeof(float) * (n*m), cudaMemcpyDeviceToHost);
 checkCUDAErrorFn("Copying output data failed");

diff --git a/Project2-Character-Recognition/img/LossVsIterations_Character.png b/Project2-Character-Recognition/img/LossVsIterations_Character.png
new file mode 100644
index 0000000000000000000000000000000000000000..d561167a24c63fb9f77bafdbd9f0a1a50df01bc6
GIT binary patch
[Encoded binary payload omitted: it covers the two loss-vs-iterations plots (LossVsIterations_Character.png, 15482 bytes, and LossVsIterations_XOR.png, 21649 bytes) and carries no human-readable content; this excerpt ends inside that data.]