From ca8938b03945c9e3f119a2005941d6792f9b0774 Mon Sep 17 00:00:00 2001 From: Linux User Date: Fri, 8 Jun 2018 19:44:25 -0700 Subject: [PATCH 01/26] Leaky ReLUs --- src/fann.c | 3 +++ src/fann_cascade.c | 1 + src/fann_train.c | 3 +++ src/include/fann_activation.h | 3 +++ src/include/fann_data.h | 11 +++++++++-- src/include/fann_data_cpp.h | 8 +++++++- 6 files changed, 26 insertions(+), 3 deletions(-) diff --git a/src/fann.c b/src/fann.c index 9af7e388..a506e7cf 100644 --- a/src/fann.c +++ b/src/fann.c @@ -769,6 +769,9 @@ FANN_EXTERNAL fann_type *FANN_API fann_run(struct fann * ann, fann_type * input) case FANN_LINEAR_PIECE_SYMMETRIC: neuron_it->value = (fann_type)((neuron_sum < -multiplier) ? -multiplier : (neuron_sum > multiplier) ? multiplier : neuron_sum); break; + case FANN_LINEAR_PIECE_LEAKY: + neuron_it->value = (fann_type)((neuron_sum < 0) ? 0.01 * neuron_sum: neuron_sum); + break; case FANN_ELLIOT: case FANN_ELLIOT_SYMMETRIC: case FANN_GAUSSIAN: diff --git a/src/fann_cascade.c b/src/fann_cascade.c index 51954907..ba2a6e93 100644 --- a/src/fann_cascade.c +++ b/src/fann_cascade.c @@ -741,6 +741,7 @@ fann_type fann_train_candidates_epoch(struct fann *ann, struct fann_train_data * case FANN_GAUSSIAN_STEPWISE: case FANN_ELLIOT: case FANN_LINEAR_PIECE: + case FANN_LINEAR_PIECE_LEAKY: case FANN_SIN: case FANN_COS: break; diff --git a/src/fann_train.c b/src/fann_train.c index 049e6de9..05fb5800 100644 --- a/src/fann_train.c +++ b/src/fann_train.c @@ -42,6 +42,8 @@ fann_type fann_activation_derived(unsigned int activation_function, case FANN_LINEAR_PIECE: case FANN_LINEAR_PIECE_SYMMETRIC: return (fann_type) fann_linear_derive(steepness, value); + case FANN_LINEAR_PIECE_LEAKY: + return (fann_type) ((value<0)? steepness * 0.01: steepness); case FANN_SIGMOID: case FANN_SIGMOID_STEPWISE: value = fann_clip(value, 0.01f, 0.99f); @@ -133,6 +135,7 @@ fann_type fann_update_MSE(struct fann *ann, struct fann_neuron* neuron, fann_typ case FANN_LINEAR_PIECE: case FANN_SIN: case FANN_COS: + case FANN_LINEAR_PIECE_LEAKY: break; } diff --git a/src/include/fann_activation.h b/src/include/fann_activation.h index 80cab7ca..0a28a62e 100644 --- a/src/include/fann_activation.h +++ b/src/include/fann_activation.h @@ -152,6 +152,9 @@ switch(activation_function) \ case FANN_GAUSSIAN_STEPWISE: \ result = 0; \ break; \ + case FANN_LINEAR_PIECE_LEAKY: \ + result = (fann_type)((value < 0) ? value*0.01 : value); \ + break; \ } #endif diff --git a/src/include/fann_data.h b/src/include/fann_data.h index 99f42c76..fa4687d3 100644 --- a/src/include/fann_data.h +++ b/src/include/fann_data.h @@ -197,6 +197,11 @@ static char const *const FANN_TRAIN_NAMES[] = { * span: 0 <= y <= 1 * y = cos(x*s)/2+0.5 * d = s*-sin(x*s)/2 + + FANN_LINEAR_PIECE_LEAKY - leaky ReLU + * span: -inf < y < inf + * y = x<0? 0.01*x: x + * d = x<0? 0.01: 1 See also: , , @@ -226,7 +231,8 @@ enum fann_activationfunc_enum FANN_SIN_SYMMETRIC, FANN_COS_SYMMETRIC, FANN_SIN, - FANN_COS + FANN_COS, + FANN_LINEAR_PIECE_LEAKY, }; /* Constant: FANN_ACTIVATIONFUNC_NAMES @@ -258,7 +264,8 @@ static char const *const FANN_ACTIVATIONFUNC_NAMES[] = { "FANN_SIN_SYMMETRIC", "FANN_COS_SYMMETRIC", "FANN_SIN", - "FANN_COS" + "FANN_COS", + "FANN_LINEAR_PIECE_LEAKY" }; /* Enum: fann_errorfunc_enum diff --git a/src/include/fann_data_cpp.h b/src/include/fann_data_cpp.h index 5b378bb4..a54bb7bc 100644 --- a/src/include/fann_data_cpp.h +++ b/src/include/fann_data_cpp.h @@ -205,6 +205,11 @@ namespace FANN { * y = cos(x*s) * d = s*-sin(x*s) + FANN_LINEAR_PIECE_LEAKY - leaky ReLU + * span: -inf < y < inf + y = x<0? 0.01*x: x + d = x<0? 0.01: 1 + See also: , @@ -225,7 +230,8 @@ namespace FANN { LINEAR_PIECE, LINEAR_PIECE_SYMMETRIC, SIN_SYMMETRIC, - COS_SYMMETRIC + COS_SYMMETRIC, + LINEAR_PIECE_LEAKY }; /* Enum: network_type_enum From 1fb469735f0a4b123d853132849c3247458409e9 Mon Sep 17 00:00:00 2001 From: "eli@owl" Date: Mon, 4 Feb 2019 01:01:21 -0800 Subject: [PATCH 02/26] Rectifying Linear Units --- src/fann.c | 3 +++ src/fann_cascade.c | 1 + src/fann_train.c | 3 +++ src/include/fann_activation.h | 3 +++ src/include/fann_data.h | 9 ++++++++- src/include/fann_data_cpp.h | 8 +++++++- 6 files changed, 25 insertions(+), 2 deletions(-) diff --git a/src/fann.c b/src/fann.c index a506e7cf..4d64d0dc 100644 --- a/src/fann.c +++ b/src/fann.c @@ -772,6 +772,9 @@ FANN_EXTERNAL fann_type *FANN_API fann_run(struct fann * ann, fann_type * input) case FANN_LINEAR_PIECE_LEAKY: neuron_it->value = (fann_type)((neuron_sum < 0) ? 0.01 * neuron_sum: neuron_sum); break; + case FANN_LINEAR_PIECE_RECT: + neuron_it->value = (fann_type)((neuron_sum < 0) ? 0: neuron_sum); + break; case FANN_ELLIOT: case FANN_ELLIOT_SYMMETRIC: case FANN_GAUSSIAN: diff --git a/src/fann_cascade.c b/src/fann_cascade.c index ba2a6e93..2924a64b 100644 --- a/src/fann_cascade.c +++ b/src/fann_cascade.c @@ -742,6 +742,7 @@ fann_type fann_train_candidates_epoch(struct fann *ann, struct fann_train_data * case FANN_ELLIOT: case FANN_LINEAR_PIECE: case FANN_LINEAR_PIECE_LEAKY: + case FANN_LINEAR_PIECE_RECT: case FANN_SIN: case FANN_COS: break; diff --git a/src/fann_train.c b/src/fann_train.c index 05fb5800..379b2db2 100644 --- a/src/fann_train.c +++ b/src/fann_train.c @@ -44,6 +44,8 @@ fann_type fann_activation_derived(unsigned int activation_function, return (fann_type) fann_linear_derive(steepness, value); case FANN_LINEAR_PIECE_LEAKY: return (fann_type) ((value<0)? steepness * 0.01: steepness); + case FANN_LINEAR_PIECE_RECT: + return (fann_type) ((value<0)? 0: steepness); case FANN_SIGMOID: case FANN_SIGMOID_STEPWISE: value = fann_clip(value, 0.01f, 0.99f); @@ -136,6 +138,7 @@ fann_type fann_update_MSE(struct fann *ann, struct fann_neuron* neuron, fann_typ case FANN_SIN: case FANN_COS: case FANN_LINEAR_PIECE_LEAKY: + case FANN_LINEAR_PIECE_RECT: break; } diff --git a/src/include/fann_activation.h b/src/include/fann_activation.h index 0a28a62e..a8a3f0af 100644 --- a/src/include/fann_activation.h +++ b/src/include/fann_activation.h @@ -155,6 +155,9 @@ switch(activation_function) \ case FANN_LINEAR_PIECE_LEAKY: \ result = (fann_type)((value < 0) ? value*0.01 : value); \ break; \ + case FANN_LINEAR_PIECE_RECT: \ + result = (fann_type)((value < 0) ? 0 : value); \ + break; \ } #endif diff --git a/src/include/fann_data.h b/src/include/fann_data.h index fa4687d3..b49122a6 100644 --- a/src/include/fann_data.h +++ b/src/include/fann_data.h @@ -203,6 +203,11 @@ static char const *const FANN_TRAIN_NAMES[] = { * y = x<0? 0.01*x: x * d = x<0? 0.01: 1 + FANN_LINEAR_PIECE_RECT - ReLU + * span: -inf < y < inf + * y = x<0? 0: x + * d = x<0? 0: 1 + See also: , , , , @@ -233,6 +238,7 @@ enum fann_activationfunc_enum FANN_SIN, FANN_COS, FANN_LINEAR_PIECE_LEAKY, + FANN_LINEAR_PIECE_RECT, }; /* Constant: FANN_ACTIVATIONFUNC_NAMES @@ -265,7 +271,8 @@ static char const *const FANN_ACTIVATIONFUNC_NAMES[] = { "FANN_COS_SYMMETRIC", "FANN_SIN", "FANN_COS", - "FANN_LINEAR_PIECE_LEAKY" + "FANN_LINEAR_PIECE_LEAKY", + "FANN_LINEAR_PIECE_RECT" }; /* Enum: fann_errorfunc_enum diff --git a/src/include/fann_data_cpp.h b/src/include/fann_data_cpp.h index a54bb7bc..cb8a20c8 100644 --- a/src/include/fann_data_cpp.h +++ b/src/include/fann_data_cpp.h @@ -210,6 +210,11 @@ namespace FANN { y = x<0? 0.01*x: x d = x<0? 0.01: 1 + FANN_LINEAR_PIECE_RECT - ReLU + * span: -inf < y < inf + y = x<0? 0: x + d = x<0? 0: 1 + See also: , @@ -231,7 +236,8 @@ namespace FANN { LINEAR_PIECE_SYMMETRIC, SIN_SYMMETRIC, COS_SYMMETRIC, - LINEAR_PIECE_LEAKY + LINEAR_PIECE_LEAKY, + LINEAR_PIECE_RECT }; /* Enum: network_type_enum From 7c70a7ea8accc26e80d58075e7e36303730c00f9 Mon Sep 17 00:00:00 2001 From: Linux User Date: Tue, 12 Mar 2019 07:54:55 -0700 Subject: [PATCH 03/26] openmp - opencl --- CMakeLists.txt | 2 +- src/CMakeLists.txt | 13 +++++++++++++ src/fann.c | 6 ++++-- src/fann_train.c | 2 +- 4 files changed, 19 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1b11d762..8208c651 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -66,7 +66,7 @@ INCLUDE(DefineInstallationPaths) configure_file (cmake/config.h.in ${CMAKE_CURRENT_BINARY_DIR}/src/include/config.h) -include_directories (${CMAKE_CURRENT_BINARY_DIR}/src/include/) +include_directories (${CMAKE_CURRENT_BINARY_DIR}/src/include/ ${CMAKE_CURRENT_BINARY_DIR}/src/include/optimized/opencl/) configure_file (cmake/fann.pc.cmake ${CMAKE_CURRENT_BINARY_DIR}/fann.pc @ONLY) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 20421eac..14efbf51 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -10,8 +10,15 @@ ELSE() SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") ENDIF(NOT OPENMP_FOUND OR DISABLE_PARALLEL_FANN) +FIND_PACKAGE(OpenCL) +IF(OPENCL_FOUND) + SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -I${OpenCL_INCLUDE_DIR}") + SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -I${OpenCL_INCLUDE_DIR}") +ENDIF(OPENCL_FOUND) + ADD_SUBDIRECTORY( include ) +#INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/include ${CMAKE_CURRENT_SOURCE_DIR}/include/optimized/opencl) INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/include) ADD_DEFINITIONS(-D_REENTRANT) if (WIN32) @@ -81,6 +88,12 @@ INSTALL(TARGETS fixedfann fixedfann_static LIBRARY DESTINATION ${LIB_INSTALL_DIR SET(fann_LIB_SRCS floatfann.c +# optimized/opencl/fann.c +# optimized/opencl/fann_cl.cpp +# optimized/opencl/fann_cl_kernel.c +# optimized/opencl/fann_cl_train.c +# optimized/opencl/fann_cl_ann.c +# optimized/opencl/fann_cl_run.c ) ADD_LIBRARY(fann SHARED ${fann_LIB_SRCS}) diff --git a/src/fann.c b/src/fann.c index a506e7cf..5088dd9a 100644 --- a/src/fann.c +++ b/src/fann.c @@ -654,7 +654,8 @@ FANN_EXTERNAL fann_type *FANN_API fann_run(struct fann * ann, fann_type * input) break; } - for(; i != num_connections; i += 4) + #pragma omp parallel for reduction(+:neuron_sum) + for(i = num_connections & 3; i < num_connections; i += 4) { neuron_sum += fann_mult(weights[i], neurons[i].value) + @@ -688,7 +689,8 @@ FANN_EXTERNAL fann_type *FANN_API fann_run(struct fann * ann, fann_type * input) break; } - for(; i != num_connections; i += 4) + #pragma omp parallel for reduction(+:neuron_sum) + for(i = num_connections & 3; i < num_connections; i += 4) { neuron_sum += fann_mult(weights[i], neuron_pointers[i]->value) + diff --git a/src/fann_train.c b/src/fann_train.c index 05fb5800..4c928895 100644 --- a/src/fann_train.c +++ b/src/fann_train.c @@ -43,7 +43,7 @@ fann_type fann_activation_derived(unsigned int activation_function, case FANN_LINEAR_PIECE_SYMMETRIC: return (fann_type) fann_linear_derive(steepness, value); case FANN_LINEAR_PIECE_LEAKY: - return (fann_type) ((value<0)? steepness * 0.01: steepness); + return (fann_type) ((value<0)? 0.01*steepness: steepness); case FANN_SIGMOID: case FANN_SIGMOID_STEPWISE: value = fann_clip(value, 0.01f, 0.99f); From 3c7dbbaffbcce2ff43bffc8a888c156efe72eeec Mon Sep 17 00:00:00 2001 From: glenda Date: Fri, 15 Jan 2021 19:28:26 +0000 Subject: [PATCH 04/26] Plan 9 port --- src/doublefann.c | 6 ++++++ src/fann.c | 11 ++++++++++- src/fann_cascade.c | 6 ++++++ src/fann_io.c | 5 +++++ src/fann_train.c | 4 ++++ src/fann_train_data.c | 4 ++++ src/fixedfann.c | 4 ++++ src/floatfann.c | 6 ++++++ src/include/fann.h | 16 +++++++++++++++- src/include/fann_activation.h | 22 ++++++++++++++-------- src/include/fann_error.h | 2 ++ src/include/fann_internal.h | 4 +++- src/mkfile | 31 +++++++++++++++++++++++++++++++ 13 files changed, 110 insertions(+), 11 deletions(-) create mode 100644 src/mkfile diff --git a/src/doublefann.c b/src/doublefann.c index 40f22380..af0367c2 100644 --- a/src/doublefann.c +++ b/src/doublefann.c @@ -19,6 +19,10 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA /* Easy way to allow for build of multiple binaries */ +#ifdef PLAN9 +#include +#endif + #include "config.h" #include "doublefann.h" @@ -28,4 +32,6 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA #include "fann_train_data.c" #include "fann_error.c" #include "fann_cascade.c" +#ifndef PLAN9 #include "parallel_fann.c" +#endif diff --git a/src/fann.c b/src/fann.c index 4319fff7..4e78b519 100644 --- a/src/fann.c +++ b/src/fann.c @@ -17,12 +17,21 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#ifdef PLAN9 +#include +#include +#include +#include +#include +#include +#else #include #include #include #include #include #include +#endif #include "config.h" #include "fann.h" @@ -1836,7 +1845,7 @@ FANN_EXTERNAL void FANN_API fann_enable_seed_rand() /* INTERNAL FUNCTION Seed the random function. */ -void fann_seed_rand() +void fann_seed_rand(void) { #ifndef _WIN32 FILE *fp = fopen("/dev/urandom", "r"); diff --git a/src/fann_cascade.c b/src/fann_cascade.c index 2924a64b..d19dec90 100644 --- a/src/fann_cascade.c +++ b/src/fann_cascade.c @@ -17,6 +17,12 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#ifdef PLAN9 +#include +#include +#include +#endif + #include "config.h" #include "fann.h" #include "string.h" diff --git a/src/fann_io.c b/src/fann_io.c index 40fd5f1a..14c2f1dc 100644 --- a/src/fann_io.c +++ b/src/fann_io.c @@ -19,9 +19,14 @@ #include #include +#ifndef PLAN9 #include +#endif #include #include +#ifdef PLAN9 +#include +#endif #include "config.h" #include "fann.h" diff --git a/src/fann_train.c b/src/fann_train.c index 379b2db2..03529edb 100644 --- a/src/fann_train.c +++ b/src/fann_train.c @@ -17,6 +17,10 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#ifdef PLAN9 +#define sqrtf sqrt +#endif + #include #include #include diff --git a/src/fann_train_data.c b/src/fann_train_data.c index b93ec35d..a431954c 100644 --- a/src/fann_train_data.c +++ b/src/fann_train_data.c @@ -21,6 +21,10 @@ #include #include #include +#ifdef PLAN9 +#include +#define sqrtf sqrt +#endif #include "config.h" #include "fann.h" diff --git a/src/fixedfann.c b/src/fixedfann.c index a48cd085..3f2b1465 100644 --- a/src/fixedfann.c +++ b/src/fixedfann.c @@ -19,6 +19,10 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA /* Easy way to allow for build of multiple binaries */ +#ifdef PLAN9 +#include +#endif + #include "config.h" #include "fixedfann.h" diff --git a/src/floatfann.c b/src/floatfann.c index b9ad0dd0..fae67a4a 100644 --- a/src/floatfann.c +++ b/src/floatfann.c @@ -19,6 +19,10 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA /* Easy way to allow for build of multiple binaries */ +#ifdef PLAN9 +#include +#endif + #include "config.h" #include "floatfann.h" @@ -28,4 +32,6 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA #include "fann_train_data.c" #include "fann_error.c" #include "fann_cascade.c" +#ifndef PLAN9 #include "parallel_fann.c" +#endif diff --git a/src/include/fann.h b/src/include/fann.h index 5adda119..eb96b2c1 100644 --- a/src/include/fann.h +++ b/src/include/fann.h @@ -37,6 +37,10 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA /* Group: Creation, Destruction & Execution */ #ifndef FANN_INCLUDE +#ifdef _PLAN9_SOURCE +#pragma lib "/$M/lib/ape/libfann.a" +#endif + /* just to allow for inclusion of fann.h in normal stuations where only floats are needed */ #ifdef FIXEDFANN #include "fixedfann.h" @@ -45,7 +49,9 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA #endif /* FIXEDFANN */ #else - + +#ifndef PLAN9 + /* COMPAT_TIME REPLACEMENT */ #ifndef _WIN32 #include @@ -58,6 +64,8 @@ extern unsigned long __stdcall GetTickCount(void); #include #endif /* _MSC_EXTENSIONS */ #endif /* _WIN32 */ + +#endif #ifndef __fann_h__ #define __fann_h__ @@ -113,11 +121,14 @@ extern "C" to use dll's. To use dll's FANN_USE_DLL has to be defined before including the fann headers. */ +#ifndef PLAN9 #if defined(_MSC_VER) && (_MSC_VER > 1300) #ifndef FANN_NO_DLL #define FANN_USE_DLL #endif /* FANN_USE_LIB */ #endif /* _MSC_VER */ +#endif +#ifndef PLAN9 #if defined(_MSC_VER) && (defined(FANN_USE_DLL) || defined(FANN_DLL_EXPORTS)) #ifdef FANN_DLL_EXPORTS #define FANN_EXTERNAL __declspec(dllexport) @@ -126,9 +137,12 @@ extern "C" #endif /* FANN_DLL_EXPORTS*/ #define FANN_API __stdcall #else /* */ +#ifndef PLAN9 #define FANN_EXTERNAL #define FANN_API +#endif #endif /* _MSC_VER */ +#endif /* ----- End of macros used to define DLL external entrypoints ----- */ #include "fann_error.h" diff --git a/src/include/fann_activation.h b/src/include/fann_activation.h index a8a3f0af..eba684a4 100644 --- a/src/include/fann_activation.h +++ b/src/include/fann_activation.h @@ -33,13 +33,19 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA __doublefann_h__ is not defined */ #ifndef __doublefann_h__ +#ifndef PLAN9 #define FANN_EXP(x) expf(x) -#define FANN_SIN(x) sinf(x) -#define FANN_COS(x) cosf(x) +#define _FANN_SIN(x) sinf(x) +#define _FANN_COS(x) cosf(x) #else #define FANN_EXP(x) exp(x) -#define FANN_SIN(x) sin(x) -#define FANN_COS(x) cos(x) +#define _FANN_SIN(x) sin(x) +#define _FANN_COS(x) cos(x) +#endif +#else +#define FANN_EXP(x) exp(x) +#define _FANN_SIN(x) sin(x) +#define _FANN_COS(x) cos(x) #endif #define fann_linear_func(v1, r1, v2, r2, sum) (((((r2)-(r1)) * ((sum)-(v1)))/((v2)-(v1))) + (r1)) @@ -80,19 +86,19 @@ __doublefann_h__ is not defined #define fann_elliot_symmetric_derive(steepness, value, sum) (steepness * 1.0f / ((1.0f + fann_abs(sum)) * (1.0f + fann_abs(sum)))) /* FANN_SIN_SYMMETRIC */ -#define fann_sin_symmetric_real(sum) (FANN_SIN(sum)) +#define fann_sin_symmetric_real(sum) (_FANN_SIN(sum)) #define fann_sin_symmetric_derive(steepness, sum) (steepness*cos(steepness*sum)) /* FANN_COS_SYMMETRIC */ -#define fann_cos_symmetric_real(sum) (FANN_COS(sum)) +#define fann_cos_symmetric_real(sum) (_FANN_COS(sum)) #define fann_cos_symmetric_derive(steepness, sum) (steepness*-sin(steepness*sum)) /* FANN_SIN */ -#define fann_sin_real(sum) (FANN_SIN(sum)/2.0f+0.5f) +#define fann_sin_real(sum) (_FANN_SIN(sum)/2.0f+0.5f) #define fann_sin_derive(steepness, sum) (steepness*cos(steepness*sum)/2.0f) /* FANN_COS */ -#define fann_cos_real(sum) (FANN_COS(sum)/2.0f+0.5f) +#define fann_cos_real(sum) (_FANN_COS(sum)/2.0f+0.5f) #define fann_cos_derive(steepness, sum) (steepness*-sin(steepness*sum)/2.0f) #define fann_activation_switch(activation_function, value, result) \ diff --git a/src/include/fann_error.h b/src/include/fann_error.h index 69877f75..58990fec 100644 --- a/src/include/fann_error.h +++ b/src/include/fann_error.h @@ -20,7 +20,9 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA #ifndef __fann_error_h__ #define __fann_error_h__ +#ifndef PLAN9 #include +#endif #define FANN_ERRSTR_MAX 128 struct fann_error; diff --git a/src/include/fann_internal.h b/src/include/fann_internal.h index 81787b3d..2bf2de19 100644 --- a/src/include/fann_internal.h +++ b/src/include/fann_internal.h @@ -22,9 +22,11 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA /* internal include file, not to be included directly */ +#ifndef PLAN9 #include #include #include +#endif #include "fann_data.h" #define FANN_FIX_VERSION "FANN_FIX_2.0" @@ -70,7 +72,7 @@ int fann_save_train_internal_fd(struct fann_train_data *data, FILE * file, const unsigned int save_as_fixed, unsigned int decimal_point); void fann_update_stepwise(struct fann *ann); -void fann_seed_rand(); +void fann_seed_rand(void); void fann_error(struct fann_error *errdat, const enum fann_errno_enum errno_f, ...); void fann_init_error_data(struct fann_error *errdat); diff --git a/src/mkfile b/src/mkfile new file mode 100644 index 00000000..7597b83e --- /dev/null +++ b/src/mkfile @@ -0,0 +1,31 @@ + Date: Fri, 15 Jan 2021 20:56:18 +0000 Subject: [PATCH 05/26] examples mkfile --- examples/mkfile | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 examples/mkfile diff --git a/examples/mkfile b/examples/mkfile new file mode 100644 index 00000000..fb19fb5b --- /dev/null +++ b/examples/mkfile @@ -0,0 +1,11 @@ + Date: Fri, 10 Jun 2022 12:28:37 -0700 Subject: [PATCH 06/26] start on opencl. intel hardware sucks for this so committing to work on another system --- src/CMakeLists.txt | 15 ++----- src/fann.c | 89 ++++++++++++++++++++++++++++++----------- src/fann_io.c | 8 ++++ src/include/fann.h | 4 ++ src/include/fann_data.h | 7 ++++ tests/CMakeLists.txt | 2 +- 6 files changed, 90 insertions(+), 35 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 14efbf51..cde8a091 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -18,7 +18,6 @@ ENDIF(OPENCL_FOUND) ADD_SUBDIRECTORY( include ) -#INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/include ${CMAKE_CURRENT_SOURCE_DIR}/include/optimized/opencl) INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/include) ADD_DEFINITIONS(-D_REENTRANT) if (WIN32) @@ -71,8 +70,8 @@ SET(fixedfann_LIB_SRCS ADD_LIBRARY(fixedfann SHARED ${fixedfann_LIB_SRCS}) ADD_LIBRARY(fixedfann_static STATIC ${fixedfann_LIB_SRCS}) -TARGET_LINK_LIBRARIES(fixedfann m) -TARGET_LINK_LIBRARIES(fixedfann_static m) +TARGET_LINK_LIBRARIES(fixedfann m ${OPENCL_LIBRARIES}) +TARGET_LINK_LIBRARIES(fixedfann_static m ${OPENCL_LIBRARIES}) SET_TARGET_PROPERTIES(fixedfann PROPERTIES VERSION ${FANN_VERSION_STRING} SOVERSION ${FANN_VERSION_MAJOR}) SET_TARGET_PROPERTIES(fixedfann_static PROPERTIES VERSION ${FANN_VERSION_STRING} SOVERSION ${FANN_VERSION_MAJOR}) @@ -88,19 +87,13 @@ INSTALL(TARGETS fixedfann fixedfann_static LIBRARY DESTINATION ${LIB_INSTALL_DIR SET(fann_LIB_SRCS floatfann.c -# optimized/opencl/fann.c -# optimized/opencl/fann_cl.cpp -# optimized/opencl/fann_cl_kernel.c -# optimized/opencl/fann_cl_train.c -# optimized/opencl/fann_cl_ann.c -# optimized/opencl/fann_cl_run.c ) ADD_LIBRARY(fann SHARED ${fann_LIB_SRCS}) ADD_LIBRARY(fann_static STATIC ${fann_LIB_SRCS}) -TARGET_LINK_LIBRARIES(fann m) -TARGET_LINK_LIBRARIES(fann_static m) +TARGET_LINK_LIBRARIES(fann m ${OPENCL_LIBRARIES}) +TARGET_LINK_LIBRARIES(fann_static m ${OPENCL_LIBRARIES}) SET_TARGET_PROPERTIES(fann PROPERTIES VERSION ${FANN_VERSION_STRING} SOVERSION ${FANN_VERSION_MAJOR}) SET_TARGET_PROPERTIES(fann_static PROPERTIES VERSION ${FANN_VERSION_STRING} SOVERSION ${FANN_VERSION_MAJOR}) diff --git a/src/fann.c b/src/fann.c index 4e78b519..ad51197a 100644 --- a/src/fann.c +++ b/src/fann.c @@ -31,11 +31,45 @@ #include #include #include +#include #endif #include "config.h" #include "fann.h" +unsigned char using_opencl = 0; + +#ifndef PLAN9 + +int fann_setup_opencl(struct fann *ann) +{ + cl_int err; + cl_uint num_devices; + cl_device_id *devices; + + ann->clctx = clCreateContextFromType(NULL, CL_DEVICE_TYPE_GPU, NULL, NULL, &err); + if (ann->clctx == NULL) { + fprintf(stderr, "clCreateContextFromType error: %d\n", err); + return -1; + } + + if ((err = clGetContextInfo(ann->clctx, CL_CONTEXT_NUM_DEVICES, sizeof(num_devices), &num_devices, NULL)) != CL_SUCCESS) { + fprintf(stderr, "clGetContextInfo error: %d\n", err); + return -1; + } + + fprintf(stderr, "number of GPUs: %d\n", num_devices); + devices = calloc(num_devices, sizeof(cl_device_id)); + if ((err = clGetContextInfo(ann->clctx, CL_CONTEXT_DEVICES, sizeof(cl_device_id) * num_devices, devices, NULL)) != CL_SUCCESS) { + fprintf(stderr, "clGetContextInfo error: %d\n", err); + return -1; + } + + return 0; +} + +#endif + /* #define FANN_NO_SEED */ FANN_EXTERNAL struct fann *FANN_API fann_create_standard(unsigned int num_layers, ...) @@ -158,6 +192,11 @@ FANN_EXTERNAL struct fann *FANN_API fann_create_sparse_array(float connection_ra return NULL; } +#ifndef PLAN9 + if (fann_setup_opencl(ann) == 0) + using_opencl = 1; +#endif + ann->connection_rate = connection_rate; #ifdef FIXEDFANN multiplier = ann->multiplier; @@ -648,31 +687,35 @@ FANN_EXTERNAL fann_type *FANN_API fann_run(struct fann * ann, fann_type * input) neurons = (layer_it - 1)->first_neuron; } + if (using_opencl == 0) { + /* unrolled loop start */ + i = num_connections & 3; /* same as modulo 4 */ + switch (i) + { + case 3: + neuron_sum += fann_mult(weights[2], neurons[2].value); + case 2: + neuron_sum += fann_mult(weights[1], neurons[1].value); + case 1: + neuron_sum += fann_mult(weights[0], neurons[0].value); + case 0: + break; + } - /* unrolled loop start */ - i = num_connections & 3; /* same as modulo 4 */ - switch (i) - { - case 3: - neuron_sum += fann_mult(weights[2], neurons[2].value); - case 2: - neuron_sum += fann_mult(weights[1], neurons[1].value); - case 1: - neuron_sum += fann_mult(weights[0], neurons[0].value); - case 0: - break; - } - - #pragma omp parallel for reduction(+:neuron_sum) - for(i = num_connections & 3; i < num_connections; i += 4) - { - neuron_sum += - fann_mult(weights[i], neurons[i].value) + - fann_mult(weights[i + 1], neurons[i + 1].value) + - fann_mult(weights[i + 2], neurons[i + 2].value) + - fann_mult(weights[i + 3], neurons[i + 3].value); + #pragma omp parallel for reduction(+:neuron_sum) + for(i = num_connections & 3; i < num_connections; i += 4) + { + neuron_sum += + fann_mult(weights[i], neurons[i].value) + + fann_mult(weights[i + 1], neurons[i + 1].value) + + fann_mult(weights[i + 2], neurons[i + 2].value) + + fann_mult(weights[i + 3], neurons[i + 3].value); + } + /* unrolled loop end */ + } else { +#ifndef PLAN9 +#endif } - /* unrolled loop end */ /* * for(i = 0;i != num_connections; i++){ diff --git a/src/fann_io.c b/src/fann_io.c index 14c2f1dc..c31b0ddf 100644 --- a/src/fann_io.c +++ b/src/fann_io.c @@ -32,6 +32,8 @@ #include "fann.h" #include "fann_data.h" +extern unsigned char using_opencl; + /* Create a network from a configuration file. */ FANN_EXTERNAL struct fann *FANN_API fann_create_from_file(const char *configuration_file) @@ -46,6 +48,12 @@ FANN_EXTERNAL struct fann *FANN_API fann_create_from_file(const char *configurat } ann = fann_create_from_fd(conf, configuration_file); fclose(conf); + +#ifndef PLAN9 + if (fann_setup_opencl(ann) == 0) + using_opencl = 1; +#endif + return ann; } diff --git a/src/include/fann.h b/src/include/fann.h index eb96b2c1..2bae3662 100644 --- a/src/include/fann.h +++ b/src/include/fann.h @@ -153,6 +153,10 @@ extern "C" #include "fann_cascade.h" #include "fann_io.h" +#ifndef PLAN9 +int fann_setup_opencl(struct fann *ann); +#endif + /* Function: fann_create_standard Creates a standard fully connected backpropagation neural network. diff --git a/src/include/fann_data.h b/src/include/fann_data.h index b49122a6..6ad31860 100644 --- a/src/include/fann_data.h +++ b/src/include/fann_data.h @@ -21,6 +21,9 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA #define __fann_data_h__ #include +#ifndef PLAN9 +#include +#endif /* Section: FANN Datatypes @@ -812,6 +815,10 @@ struct fann */ float *scale_factor_out; #endif + +#ifndef PLAN9 + cl_context clctx; +#endif }; /* Type: fann_connection diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index c79d4a79..7c655ba2 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -22,4 +22,4 @@ else() endif() ADD_EXECUTABLE(fann_tests main.cpp fann_test.cpp fann_test_data.cpp fann_test_train.cpp) -target_link_libraries(fann_tests gtest doublefann) +target_link_libraries(fann_tests gtest doublefann OpenCL) From 9e871414afdb120b7949ef86bd855105597048d9 Mon Sep 17 00:00:00 2001 From: Eli Date: Sun, 24 Jul 2022 11:27:45 -0700 Subject: [PATCH 07/26] blah --- CMakeLists.txt | 4 +- src/CMakeLists.txt | 17 ++--- src/fann.c | 156 ++++++++++++++++++++++++++++++++-------- src/fann_cascade.c | 20 +++--- src/fann_train.c | 12 ++-- src/include/fann.h | 3 + src/include/fann_data.h | 9 ++- tests/CMakeLists.txt | 2 +- 8 files changed, 160 insertions(+), 63 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8208c651..95c628fc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -34,7 +34,7 @@ IF(BIICODE) TARGET_COMPILE_OPTIONS(${BII_LIB_TARGET} PUBLIC -DFANN_DLL_EXPORTS) ELSE() IF(${examples_present}) - TARGET_LINK_LIBRARIES(${BII_BLOCK_TARGET} INTERFACE gomp) + TARGET_LINK_LIBRARIES(${BII_BLOCK_TARGET} INTERFACE gomp OpenGL) ENDIF() ENDIF() ELSE() @@ -66,7 +66,7 @@ INCLUDE(DefineInstallationPaths) configure_file (cmake/config.h.in ${CMAKE_CURRENT_BINARY_DIR}/src/include/config.h) -include_directories (${CMAKE_CURRENT_BINARY_DIR}/src/include/ ${CMAKE_CURRENT_BINARY_DIR}/src/include/optimized/opencl/) +include_directories (${CMAKE_CURRENT_BINARY_DIR}/src/include/) configure_file (cmake/fann.pc.cmake ${CMAKE_CURRENT_BINARY_DIR}/fann.pc @ONLY) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 14efbf51..45284567 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -10,15 +10,14 @@ ELSE() SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") ENDIF(NOT OPENMP_FOUND OR DISABLE_PARALLEL_FANN) -FIND_PACKAGE(OpenCL) -IF(OPENCL_FOUND) - SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -I${OpenCL_INCLUDE_DIR}") - SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -I${OpenCL_INCLUDE_DIR}") -ENDIF(OPENCL_FOUND) +FIND_PACKAGE(OpenGL) +IF(OpenGL_FOUND) + SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -g -I${OpenGL_INCLUDE_DIR}") + SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -I${OpenGL_INCLUDE_DIR}") +ENDIF(OpenGL_FOUND) ADD_SUBDIRECTORY( include ) -#INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/include ${CMAKE_CURRENT_SOURCE_DIR}/include/optimized/opencl) INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/include) ADD_DEFINITIONS(-D_REENTRANT) if (WIN32) @@ -88,12 +87,6 @@ INSTALL(TARGETS fixedfann fixedfann_static LIBRARY DESTINATION ${LIB_INSTALL_DIR SET(fann_LIB_SRCS floatfann.c -# optimized/opencl/fann.c -# optimized/opencl/fann_cl.cpp -# optimized/opencl/fann_cl_kernel.c -# optimized/opencl/fann_cl_train.c -# optimized/opencl/fann_cl_ann.c -# optimized/opencl/fann_cl_run.c ) ADD_LIBRARY(fann SHARED ${fann_LIB_SRCS}) diff --git a/src/fann.c b/src/fann.c index 4e78b519..8d1c1b64 100644 --- a/src/fann.c +++ b/src/fann.c @@ -31,11 +31,69 @@ #include #include #include +#include #endif #include "config.h" #include "fann.h" +static const char* sumShader = MULTILINE_STRING(#version 300 es + precision mediump float; + + layout(local_size_x = 100, local_size_y = 1, local_size_z = 1) in; + layout(std430) buffer; + layout(binding = 0, r32f) writeonly uniform image img_output; + + layout(binding = 0) buffer Input0 { + float elements[]; + } input_data0; + layout(binding = 1) buffer Input1 { + float elements[]; + } input_data1; + + void main() + { + uint index = gl_GlobalInvocationID.x; + float result = input_data0.elements[index] * input_data1.elements[index]; + + atomicAdd(, result); + } +); + +void fann_create_shader(struct fann *ann) +{ + GLint status; + GLint length; + char *log; + + ann->sumShaderID = glCreateShader(GL_COMPUTE_SHADER); + int sumShaderLen = strlen(sumShader); + glShaderSource(ann->sumShaderID, 1, &sumShader, &sumShaderLen); + glCompileShader(ann->sumShaderID); + glGetShaderiv(ann->sumShaderID, GL_COMPILE_STATUS, &status); + if (status == GL_FALSE) { + glGetShaderiv(ann->sumShaderID, GL_INFO_LOG_LENGTH, &length); + log = malloc(length+1); + glGetShaderInfoLog(ann->sumShaderID, length, &length, log); + log[length] = '\0'; + fprintf(stderr, "%s", log); + exit(-1); + } + + ann->sumShaderProgram = glCreateProgram(); + glAttachShader(ann->sumShaderProgram, ann->sumShaderID); + glLinkProgram(ann->sumShaderProgram); + glGetShaderiv(ann->sumShaderID, GL_LINK_STATUS, &status); + if (status == GL_FALSE) { + glGetProgramiv(ann->sumShaderID, GL_INFO_LOG_LENGTH, &length); + log = malloc(length+1); + glGetProgramInfoLog(ann->sumShaderID, length, &length, log); + log[length] = '\0'; + fprintf(stderr, "%s", log); + exit(-1); + } +} + /* #define FANN_NO_SEED */ FANN_EXTERNAL struct fann *FANN_API fann_create_standard(unsigned int num_layers, ...) @@ -158,6 +216,8 @@ FANN_EXTERNAL struct fann *FANN_API fann_create_sparse_array(float connection_ra return NULL; } + fann_create_shader(ann); + ann->connection_rate = connection_rate; #ifdef FIXEDFANN multiplier = ann->multiplier; @@ -574,6 +634,10 @@ FANN_EXTERNAL fann_type *FANN_API fann_run(struct fann * ann, fann_type * input) struct fann_layer *layer_it, *last_layer; unsigned int activation_function; fann_type steepness; + GLuint BO[2]; + GLuint texture; + GLenum err; + float *data; /* store some variabels local for fast access */ struct fann_neuron *first_neuron = ann->first_layer->first_neuron; @@ -604,13 +668,13 @@ FANN_EXTERNAL fann_type *FANN_API fann_run(struct fann * ann, fann_type * input) i, multiplier, multiplier, input[i]); } #endif - first_neuron[i].value = input[i]; + *(first_neuron[i].value) = input[i]; } /* Set the bias neuron in the input layer */ #ifdef FIXEDFANN - (ann->first_layer->last_neuron - 1)->value = multiplier; + *((ann->first_layer->last_neuron - 1)->value) = multiplier; #else - (ann->first_layer->last_neuron - 1)->value = 1; + *((ann->first_layer->last_neuron - 1)->value) = 1; #endif last_layer = ann->last_layer; @@ -623,9 +687,9 @@ FANN_EXTERNAL fann_type *FANN_API fann_run(struct fann * ann, fann_type * input) { /* bias neurons */ #ifdef FIXEDFANN - neuron_it->value = multiplier; + *(neuron_it->value) = multiplier; #else - neuron_it->value = 1; + *(neuron_it->value) = 1; #endif continue; } @@ -649,16 +713,17 @@ FANN_EXTERNAL fann_type *FANN_API fann_run(struct fann * ann, fann_type * input) } +#ifdef PLAN9 /* unrolled loop start */ i = num_connections & 3; /* same as modulo 4 */ switch (i) { case 3: - neuron_sum += fann_mult(weights[2], neurons[2].value); + neuron_sum += fann_mult(weights[2], *(neurons[2].value)); case 2: - neuron_sum += fann_mult(weights[1], neurons[1].value); + neuron_sum += fann_mult(weights[1], *(neurons[1].value)); case 1: - neuron_sum += fann_mult(weights[0], neurons[0].value); + neuron_sum += fann_mult(weights[0], *(neurons[0].value)); case 0: break; } @@ -667,13 +732,39 @@ FANN_EXTERNAL fann_type *FANN_API fann_run(struct fann * ann, fann_type * input) for(i = num_connections & 3; i < num_connections; i += 4) { neuron_sum += - fann_mult(weights[i], neurons[i].value) + - fann_mult(weights[i + 1], neurons[i + 1].value) + - fann_mult(weights[i + 2], neurons[i + 2].value) + - fann_mult(weights[i + 3], neurons[i + 3].value); + fann_mult(weights[i], *(neurons[i].value)) + + fann_mult(weights[i + 1], *(neurons[i + 1].value)) + + fann_mult(weights[i + 2], *(neurons[i + 2].value)) + + fann_mult(weights[i + 3], *(neurons[i + 3].value)); } /* unrolled loop end */ +#else + glUseProgram(ann->sumShaderProgram); + glGenTextures(1, &texture); + glActiveTexture(GL_TEXTURE0); + glBindTexture(GL_TEXTURE_1D, texture); + glGenBuffers(2, BO); + + glBindBuffer(GL_SHADER_STORAGE_BUFFER, BO[0]); + glBufferData(GL_SHADER_STORAGE_BUFFER, num_connections * sizeof(GLfloat), weights, GL_STATIC_DRAW); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, BO[0]); + + glBindBuffer(GL_SHADER_STORAGE_BUFFER, BO[1]); + glBufferData(GL_SHADER_STORAGE_BUFFER, num_connections * sizeof(GLfloat), layer_it->values, GL_STATIC_DRAW); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, BO[1]); + data = malloc(1 * sizeof(float)); + glBindImageTexture(0, texture, 0, GL_TRUE, 0, GL_READ_WRITE, GL_R32F); + + glDispatchCompute(num_connections/100, 1, 1); + glMemoryBarrier(GL_ALL_BARRIER_BITS); + + glGetTexImage(GL_TEXTURE_1D, 0, GL_RED, GL_FLOAT, data); + neuron_sum = data[0]; + free(data); + + glDeleteBuffers(3, BO); +#endif /* * for(i = 0;i != num_connections; i++){ * printf("%f += %f*%f, ", neuron_sum, weights[i], neurons[i].value); @@ -689,11 +780,11 @@ FANN_EXTERNAL fann_type *FANN_API fann_run(struct fann * ann, fann_type * input) switch (i) { case 3: - neuron_sum += fann_mult(weights[2], neuron_pointers[2]->value); + neuron_sum += fann_mult(weights[2], *(neuron_pointers[2]->value)); case 2: - neuron_sum += fann_mult(weights[1], neuron_pointers[1]->value); + neuron_sum += fann_mult(weights[1], *(neuron_pointers[1]->value)); case 1: - neuron_sum += fann_mult(weights[0], neuron_pointers[0]->value); + neuron_sum += fann_mult(weights[0], *(neuron_pointers[0]->value)); case 0: break; } @@ -702,10 +793,10 @@ FANN_EXTERNAL fann_type *FANN_API fann_run(struct fann * ann, fann_type * input) for(i = num_connections & 3; i < num_connections; i += 4) { neuron_sum += - fann_mult(weights[i], neuron_pointers[i]->value) + - fann_mult(weights[i + 1], neuron_pointers[i + 1]->value) + - fann_mult(weights[i + 2], neuron_pointers[i + 2]->value) + - fann_mult(weights[i + 3], neuron_pointers[i + 3]->value); + fann_mult(weights[i], *(neuron_pointers[i]->value)) + + fann_mult(weights[i + 1], *(neuron_pointers[i + 1]->value)) + + fann_mult(weights[i + 2], *(neuron_pointers[i + 2]->value)) + + fann_mult(weights[i + 3], *(neuron_pointers[i + 3]->value)); } } @@ -755,36 +846,36 @@ FANN_EXTERNAL fann_type *FANN_API fann_run(struct fann * ann, fann_type * input) { case FANN_SIGMOID: case FANN_SIGMOID_STEPWISE: - neuron_it->value = + *(neuron_it->value) = (fann_type) fann_stepwise(v1, v2, v3, v4, v5, v6, r1, r2, r3, r4, r5, r6, 0, multiplier, neuron_sum); break; case FANN_SIGMOID_SYMMETRIC: case FANN_SIGMOID_SYMMETRIC_STEPWISE: - neuron_it->value = + *(neuron_it->value) = (fann_type) fann_stepwise(v1, v2, v3, v4, v5, v6, r1, r2, r3, r4, r5, r6, -multiplier, multiplier, neuron_sum); break; case FANN_THRESHOLD: - neuron_it->value = (fann_type) ((neuron_sum < 0) ? 0 : multiplier); + *(neuron_it->value) = (fann_type) ((neuron_sum < 0) ? 0 : multiplier); break; case FANN_THRESHOLD_SYMMETRIC: - neuron_it->value = (fann_type) ((neuron_sum < 0) ? -multiplier : multiplier); + *(neuron_it->value) = (fann_type) ((neuron_sum < 0) ? -multiplier : multiplier); break; case FANN_LINEAR: - neuron_it->value = neuron_sum; + *(neuron_it->value) = neuron_sum; break; case FANN_LINEAR_PIECE: - neuron_it->value = (fann_type)((neuron_sum < 0) ? 0 : (neuron_sum > multiplier) ? multiplier : neuron_sum); + *(neuron_it->value) = (fann_type)((neuron_sum < 0) ? 0 : (neuron_sum > multiplier) ? multiplier : neuron_sum); break; case FANN_LINEAR_PIECE_SYMMETRIC: - neuron_it->value = (fann_type)((neuron_sum < -multiplier) ? -multiplier : (neuron_sum > multiplier) ? multiplier : neuron_sum); + *(neuron_it->value) = (fann_type)((neuron_sum < -multiplier) ? -multiplier : (neuron_sum > multiplier) ? multiplier : neuron_sum); break; case FANN_LINEAR_PIECE_LEAKY: - neuron_it->value = (fann_type)((neuron_sum < 0) ? 0.01 * neuron_sum: neuron_sum); + *(neuron_it->value) = (fann_type)((neuron_sum < 0) ? 0.01 * neuron_sum: neuron_sum); break; case FANN_LINEAR_PIECE_RECT: - neuron_it->value = (fann_type)((neuron_sum < 0) ? 0: neuron_sum); + *(neuron_it->value) = (fann_type)((neuron_sum < 0) ? 0: neuron_sum); break; case FANN_ELLIOT: case FANN_ELLIOT_SYMMETRIC: @@ -809,7 +900,7 @@ FANN_EXTERNAL fann_type *FANN_API fann_run(struct fann * ann, fann_type * input) neuron_it->sum = neuron_sum; - fann_activation_switch(activation_function, neuron_sum, neuron_it->value); + fann_activation_switch(activation_function, neuron_sum, *(neuron_it->value)); #endif } } @@ -820,7 +911,7 @@ FANN_EXTERNAL fann_type *FANN_API fann_run(struct fann * ann, fann_type * input) neurons = (ann->last_layer - 1)->first_neuron; for(i = 0; i != num_output; i++) { - output[i] = neurons[i].value; + output[i] = *(neurons[i].value); } return ann->output; } @@ -1773,6 +1864,7 @@ void fann_allocate_neurons(struct fann *ann) struct fann_neuron *neurons; unsigned int num_neurons_so_far = 0; unsigned int num_neurons = 0; + unsigned int i; /* all the neurons is allocated in one long array (calloc clears mem) */ neurons = (struct fann_neuron *) calloc(ann->total_neurons, sizeof(struct fann_neuron)); @@ -1789,6 +1881,10 @@ void fann_allocate_neurons(struct fann *ann) num_neurons = (unsigned int)(layer_it->last_neuron - layer_it->first_neuron); layer_it->first_neuron = neurons + num_neurons_so_far; layer_it->last_neuron = layer_it->first_neuron + num_neurons; + layer_it->values = calloc(num_neurons, sizeof(fann_type)); + for (i = 0; i < num_neurons; i++) { + neurons[num_neurons_so_far + i].value = &(layer_it->values[i]); + } num_neurons_so_far += num_neurons; } diff --git a/src/fann_cascade.c b/src/fann_cascade.c index d19dec90..bfab8cd4 100644 --- a/src/fann_cascade.c +++ b/src/fann_cascade.c @@ -452,7 +452,7 @@ int fann_initialize_candidates(struct fann *ann) /* TODO candidates should actually be created both in * the last layer before the output layer, and in a new layer. */ - neurons[candidate_index].value = 0; + *(neurons[candidate_index].value) = 0; neurons[candidate_index].sum = 0; neurons[candidate_index].activation_function = @@ -588,11 +588,11 @@ void fann_update_candidate_slopes(struct fann *ann) switch (i) { case 3: - cand_sum += weights[2] * neurons[2].value; + cand_sum += weights[2] * *(neurons[2].value); case 2: - cand_sum += weights[1] * neurons[1].value; + cand_sum += weights[1] * *(neurons[1].value); case 1: - cand_sum += weights[0] * neurons[0].value; + cand_sum += weights[0] * *(neurons[0].value); case 0: break; } @@ -600,9 +600,9 @@ void fann_update_candidate_slopes(struct fann *ann) for(; i != num_connections; i += 4) { cand_sum += - weights[i] * neurons[i].value + - weights[i + 1] * neurons[i + 1].value + - weights[i + 2] * neurons[i + 2].value + weights[i + 3] * neurons[i + 3].value; + weights[i] * *(neurons[i].value) + + weights[i + 1] * *(neurons[i + 1].value) + + weights[i + 2] * *(neurons[i + 2].value) + weights[i + 3] * *(neurons[i + 3].value); } /* * for(i = 0; i < num_connections; i++){ @@ -623,7 +623,7 @@ void fann_update_candidate_slopes(struct fann *ann) /* printf("%f = sigmoid(%f);\n", activation, cand_sum); */ cand_it->sum = cand_sum; - cand_it->value = activation; + *(cand_it->value) = activation; derived = fann_activation_derived(cand_it->activation_function, cand_it->activation_steepness, activation, cand_sum); @@ -661,7 +661,7 @@ void fann_update_candidate_slopes(struct fann *ann) cand_slopes = ann->train_slopes + cand_it->first_con; for(i = 0; i < num_connections; i++) { - cand_slopes[i] -= error_value * neurons[i].value; + cand_slopes[i] -= error_value * *(neurons[i].value); } } } @@ -931,7 +931,7 @@ void fann_add_candidate_neuron(struct fann *ann, struct fann_layer *layer) } /* Now inititalize the actual neuron */ - neuron_place->value = 0; + *(neuron_place->value) = 0; neuron_place->sum = 0; neuron_place->activation_function = candidate->activation_function; neuron_place->activation_steepness = candidate->activation_steepness; diff --git a/src/fann_train.c b/src/fann_train.c index 03529edb..9c12c4e1 100644 --- a/src/fann_train.c +++ b/src/fann_train.c @@ -266,7 +266,7 @@ void fann_compute_MSE(struct fann *ann, fann_type * desired_output) for(; last_layer_begin != last_layer_end; last_layer_begin++) { - neuron_value = last_layer_begin->value; + neuron_value = *(last_layer_begin->value); neuron_diff = *desired_output - neuron_value; neuron_diff = fann_update_MSE(ann, last_layer_begin, neuron_diff); @@ -367,7 +367,7 @@ void fann_backpropagate_MSE(struct fann *ann) for(neuron_it = (layer_it - 1)->first_neuron; neuron_it != last_neuron; neuron_it++) { *error_prev_layer *= fann_activation_derived(neuron_it->activation_function, - neuron_it->activation_steepness, neuron_it->value, neuron_it->sum); + neuron_it->activation_steepness, *(neuron_it->value), neuron_it->sum); error_prev_layer++; } @@ -431,7 +431,7 @@ void fann_update_weights(struct fann *ann) weights_deltas = deltas_begin + neuron_it->first_con; for(i = 0; i != num_connections; i++) { - delta_w = tmp_error * prev_neurons[i].value + learning_momentum * weights_deltas[i]; + delta_w = tmp_error * *(prev_neurons[i].value) + learning_momentum * weights_deltas[i]; weights[i] += delta_w ; weights_deltas[i] = delta_w; } @@ -447,7 +447,7 @@ void fann_update_weights(struct fann *ann) weights_deltas = deltas_begin + neuron_it->first_con; for(i = 0; i != num_connections; i++) { - delta_w = tmp_error * prev_neurons[i].value + learning_momentum * weights_deltas[i]; + delta_w = tmp_error * *(prev_neurons[i].value) + learning_momentum * weights_deltas[i]; weights[i] += delta_w; weights_deltas[i] = delta_w; } @@ -524,7 +524,7 @@ void fann_update_slopes_batch(struct fann *ann, struct fann_layer *layer_begin, num_connections = neuron_it->last_con - neuron_it->first_con; for(i = 0; i != num_connections; i++) { - neuron_slope[i] += tmp_error * prev_neurons[i].value; + neuron_slope[i] += tmp_error * *(prev_neurons[i].value); } } } @@ -538,7 +538,7 @@ void fann_update_slopes_batch(struct fann *ann, struct fann_layer *layer_begin, connections = ann->connections + neuron_it->first_con; for(i = 0; i != num_connections; i++) { - neuron_slope[i] += tmp_error * connections[i]->value; + neuron_slope[i] += tmp_error * *(connections[i]->value); } } } diff --git a/src/include/fann.h b/src/include/fann.h index eb96b2c1..54027120 100644 --- a/src/include/fann.h +++ b/src/include/fann.h @@ -140,6 +140,7 @@ extern "C" #ifndef PLAN9 #define FANN_EXTERNAL #define FANN_API +#include #endif #endif /* _MSC_VER */ #endif @@ -153,6 +154,8 @@ extern "C" #include "fann_cascade.h" #include "fann_io.h" +#define MULTILINE_STRING(...) #__VA_ARGS__ + /* Function: fann_create_standard Creates a standard fully connected backpropagation neural network. diff --git a/src/include/fann_data.h b/src/include/fann_data.h index b49122a6..8d2bbcfe 100644 --- a/src/include/fann_data.h +++ b/src/include/fann_data.h @@ -434,7 +434,7 @@ struct fann_neuron /* The sum of the inputs multiplied with the weights */ fann_type sum; /* The value of the activation function applied to the sum */ - fann_type value; + fann_type *value; /* The steepness of the activation function */ fann_type activation_steepness; /* Used to choose which activation function to use */ @@ -459,6 +459,8 @@ struct fann_layer /* A pointer to the neuron past the last neuron in the layer */ /* the number of neurons is last_neuron - first_neuron */ struct fann_neuron *last_neuron; + + fann_type *values; }; /* Struct: struct fann_error @@ -778,7 +780,10 @@ struct fann * Not allocated if not used. */ fann_type *prev_weights_deltas; - + + GLuint sumShaderID; + GLuint sumShaderProgram; + #ifndef FIXEDFANN /* Arithmetic mean used to remove steady component in input data. */ float *scale_mean_in; diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index c79d4a79..b6cc520e 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -22,4 +22,4 @@ else() endif() ADD_EXECUTABLE(fann_tests main.cpp fann_test.cpp fann_test_data.cpp fann_test_train.cpp) -target_link_libraries(fann_tests gtest doublefann) +target_link_libraries(fann_tests gtest doublefann GL) From 00bd0dcdf7880b819bbd37fc38dae9ba07412a09 Mon Sep 17 00:00:00 2001 From: Eli Date: Sun, 24 Jul 2022 11:45:39 -0700 Subject: [PATCH 08/26] ;) --- src/include/fann_data.h | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/include/fann_data.h b/src/include/fann_data.h index d7ffe3a6..8d2bbcfe 100644 --- a/src/include/fann_data.h +++ b/src/include/fann_data.h @@ -21,9 +21,6 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA #define __fann_data_h__ #include -#ifndef PLAN9 -#include -#endif /* Section: FANN Datatypes @@ -820,10 +817,6 @@ struct fann */ float *scale_factor_out; #endif - -#ifndef PLAN9 - cl_context clctx; -#endif }; /* Type: fann_connection From b9180240c60a3abffe3b3c024dbfe02fe8242eca Mon Sep 17 00:00:00 2001 From: Eli Date: Fri, 3 Feb 2023 11:43:14 -0800 Subject: [PATCH 09/26] opengl compute shader for run --- src/CMakeLists.txt | 14 +- src/fann.c | 287 ++++++++++++++++++++++++++++++---------- src/include/fann_data.h | 18 ++- 3 files changed, 246 insertions(+), 73 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 45284567..6f9bfa99 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -32,6 +32,9 @@ SET(floatfann_LIB_SRCS ADD_LIBRARY(floatfann SHARED ${floatfann_LIB_SRCS}) ADD_LIBRARY(floatfann_static STATIC ${floatfann_LIB_SRCS}) +TARGET_LINK_LIBRARIES(floatfann m EGL gbm) +TARGET_LINK_LIBRARIES(floatfann_static m EGL gbm) + SET_TARGET_PROPERTIES(floatfann PROPERTIES VERSION ${FANN_VERSION_STRING} SOVERSION ${FANN_VERSION_MAJOR}) SET_TARGET_PROPERTIES(floatfann_static PROPERTIES VERSION ${FANN_VERSION_STRING} SOVERSION ${FANN_VERSION_MAJOR}) if (UNIX) @@ -51,6 +54,9 @@ SET(doublefann_LIB_SRCS ADD_LIBRARY(doublefann SHARED ${doublefann_LIB_SRCS}) ADD_LIBRARY(doublefann_static STATIC ${doublefann_LIB_SRCS}) +TARGET_LINK_LIBRARIES(doublefann m EGL gbm) +TARGET_LINK_LIBRARIES(doublefann_static m EGL gbm) + SET_TARGET_PROPERTIES(doublefann PROPERTIES VERSION ${FANN_VERSION_STRING} SOVERSION ${FANN_VERSION_MAJOR}) SET_TARGET_PROPERTIES(doublefann_static PROPERTIES VERSION ${FANN_VERSION_STRING} SOVERSION ${FANN_VERSION_MAJOR}) if (UNIX) @@ -70,8 +76,8 @@ SET(fixedfann_LIB_SRCS ADD_LIBRARY(fixedfann SHARED ${fixedfann_LIB_SRCS}) ADD_LIBRARY(fixedfann_static STATIC ${fixedfann_LIB_SRCS}) -TARGET_LINK_LIBRARIES(fixedfann m) -TARGET_LINK_LIBRARIES(fixedfann_static m) +TARGET_LINK_LIBRARIES(fixedfann m EGL gbm) +TARGET_LINK_LIBRARIES(fixedfann_static m EGL gbm) SET_TARGET_PROPERTIES(fixedfann PROPERTIES VERSION ${FANN_VERSION_STRING} SOVERSION ${FANN_VERSION_MAJOR}) SET_TARGET_PROPERTIES(fixedfann_static PROPERTIES VERSION ${FANN_VERSION_STRING} SOVERSION ${FANN_VERSION_MAJOR}) @@ -92,8 +98,8 @@ SET(fann_LIB_SRCS ADD_LIBRARY(fann SHARED ${fann_LIB_SRCS}) ADD_LIBRARY(fann_static STATIC ${fann_LIB_SRCS}) -TARGET_LINK_LIBRARIES(fann m) -TARGET_LINK_LIBRARIES(fann_static m) +TARGET_LINK_LIBRARIES(fann m EGL gbm) +TARGET_LINK_LIBRARIES(fann_static m EGL gbm) SET_TARGET_PROPERTIES(fann PROPERTIES VERSION ${FANN_VERSION_STRING} SOVERSION ${FANN_VERSION_MAJOR}) SET_TARGET_PROPERTIES(fann_static PROPERTIES VERSION ${FANN_VERSION_STRING} SOVERSION ${FANN_VERSION_MAJOR}) diff --git a/src/fann.c b/src/fann.c index ff5da54f..3265151c 100644 --- a/src/fann.c +++ b/src/fann.c @@ -16,6 +16,19 @@ License along with this library; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +/* + Why be a human? + I want to live since I do. + This life is painful. + + I have it all here, + yet still I feel so much pain. + Mortality sucks. + + My heart broke again. + So this time will be the last, + I will be metal. +*/ #ifdef PLAN9 #include @@ -31,67 +44,170 @@ #include #include #include -#include +#include +#include +#include +#include +#include +#include #endif #include "config.h" #include "fann.h" -static const char* sumShader = MULTILINE_STRING(#version 300 es - precision mediump float; - - layout(local_size_x = 100, local_size_y = 1, local_size_z = 1) in; - layout(std430) buffer; - layout(binding = 0, r32f) writeonly uniform image img_output; +static const char* runShader = "#version 310 es\n" + "precision lowp float;\n" + "layout(local_size_x = %d) in;\n" + "layout(std430) buffer;\n" + "layout(binding = 0) buffer Network\n" + "{\n" + " float e[];\n" + "} network;\n" + "layout(binding = 1) buffer Weights\n" + "{\n" + " float e[];\n" + "} weights;\n" + "layout(binding = 2) buffer Values\n" + "{\n" + " float e[];\n" + "} values;\n" + "layout(binding = 4) buffer Input\n" + "{\n" + " float e[];\n" + "} input_data;\n" + "layout(binding = 5) buffer Output\n" + "{\n" + " float e[];\n" + "} output_data;\n" + "void main()\n" + "{\n" + " int idx = int(gl_LocalInvocationID.x);\n" + " int threads = %d;\n" + " int layers;\n" + " int i, o, inputs, outputs, n, l, total_neurons, total_weights;\n" + " layers = int(network.e[0]);\n" + " n = int(network.e[1]) - 1;\n" + " for (i = idx; i < n; i += threads)\n" + " values.e[i] = input_data.e[i];\n" + " barrier();\n" + " total_neurons = 0;\n" + " total_weights = 0;\n" + " for (l = 1; l < layers; l++) {\n" + " inputs = int(network.e[l]);\n" + " outputs = int(network.e[l+1]) - 1;\n" + " for (o = idx; o < outputs; o += threads)\n" + " input_data.e[o] = 0.0;\n" + " barrier();\n" + " values.e[total_neurons + inputs - 1] = 1.0;\n" + " for (o = idx; o < outputs; o += threads) {\n" + " n = o * inputs;\n" + " for (i = 0; i < inputs; i++)\n" + " input_data.e[o] += values.e[total_neurons + i] * weights.e[total_weights + n + i];\n" + " }\n" + " barrier();\n" + " total_neurons += inputs;\n" + " for (o = idx; o < outputs; o += threads) {\n" + " if (input_data.e[o] < 0.0)\n" + " input_data.e[o] *= 0.01;\n" + " values.e[total_neurons + o] = input_data.e[o] * 0.5;\n" + " }\n" + " barrier();\n" + " total_weights += inputs * outputs;\n" + " }\n" + " for (o = idx; o < outputs; o += threads)\n" + " output_data.e[o] = values.e[total_neurons + o];\n" + " barrier();\n" + "}\n"; + +static const char* trainShader = "#version 310 es\n" + "void main()\n" + "{\n" + "}\n"; + +void fann_init_egl(void) { + int32_t fd = open ("/dev/dri/card0", O_RDWR); + if (fd <= 0) + exit(-3); + + struct gbm_device *gbm = gbm_create_device (fd); + if (gbm == NULL) + exit(-4); + + EGLDisplay dpy = eglGetPlatformDisplay (EGL_PLATFORM_GBM_MESA, gbm, NULL); + if (dpy == NULL) + exit(-5); - layout(binding = 0) buffer Input0 { - float elements[]; - } input_data0; - layout(binding = 1) buffer Input1 { - float elements[]; - } input_data1; + EGLBoolean returnValue = eglInitialize(dpy, NULL, NULL); + if (returnValue != EGL_TRUE) { + printf("eglInitialize failed\n"); + exit(-1); + } - void main() - { - uint index = gl_GlobalInvocationID.x; - float result = input_data0.elements[index] * input_data1.elements[index]; + EGLConfig cfg; + EGLint count; + EGLint s_configAttribs[] = { + EGL_RENDERABLE_TYPE, EGL_OPENGL_ES3_BIT_KHR, + EGL_NONE }; + if (eglChooseConfig(dpy, s_configAttribs, &cfg, 1, &count) == EGL_FALSE) { + printf("eglChooseConfig failed\n"); + exit(-1); + } - atomicAdd(, result); + EGLint context_attribs[] = { EGL_CONTEXT_CLIENT_VERSION, 3, EGL_NONE }; + EGLContext context = eglCreateContext(dpy, cfg, EGL_NO_CONTEXT, context_attribs); + if (context == EGL_NO_CONTEXT) { + printf("eglCreateContext failed\n"); + exit(-1); } -); + returnValue = eglMakeCurrent(dpy, EGL_NO_SURFACE, EGL_NO_SURFACE, context); + if (returnValue != EGL_TRUE) { + printf("eglMakeCurrent failed returned %d\n", returnValue); + exit(-1); + } +} -void fann_create_shader(struct fann *ann) +void fann_create_shaders(struct fann *ann) { GLint status; GLint length; char *log; + char *runShaderString; + int threads; - ann->sumShaderID = glCreateShader(GL_COMPUTE_SHADER); - int sumShaderLen = strlen(sumShader); - glShaderSource(ann->sumShaderID, 1, &sumShader, &sumShaderLen); - glCompileShader(ann->sumShaderID); - glGetShaderiv(ann->sumShaderID, GL_COMPILE_STATUS, &status); + glGetIntegeri_v(GL_MAX_COMPUTE_WORK_GROUP_SIZE, 0, &threads); + fprintf(stderr, "GL_MAX_COMPUTE_WORK_GROUP_SIZE: %d\n", threads); + + ann->runShaderID = glCreateShader(GL_COMPUTE_SHADER); + + runShaderString = malloc(strlen(runShader) + 256); + snprintf(runShaderString, strlen(runShader) + 256 - 1, runShader, threads, threads); + int runShaderLen = strlen(runShaderString); + glShaderSource(ann->runShaderID, 1, (const char**)&runShaderString, &runShaderLen); + glCompileShader(ann->runShaderID); + glGetShaderiv(ann->runShaderID, GL_COMPILE_STATUS, &status); if (status == GL_FALSE) { - glGetShaderiv(ann->sumShaderID, GL_INFO_LOG_LENGTH, &length); + glGetShaderiv(ann->runShaderID, GL_INFO_LOG_LENGTH, &length); log = malloc(length+1); - glGetShaderInfoLog(ann->sumShaderID, length, &length, log); + glGetShaderInfoLog(ann->runShaderID, length, &length, log); log[length] = '\0'; fprintf(stderr, "%s", log); exit(-1); } - ann->sumShaderProgram = glCreateProgram(); - glAttachShader(ann->sumShaderProgram, ann->sumShaderID); - glLinkProgram(ann->sumShaderProgram); - glGetShaderiv(ann->sumShaderID, GL_LINK_STATUS, &status); + ann->runShaderProgram = glCreateProgram(); + glAttachShader(ann->runShaderProgram, ann->runShaderID); + glLinkProgram(ann->runShaderProgram); + glGetShaderiv(ann->runShaderID, GL_LINK_STATUS, &status); if (status == GL_FALSE) { - glGetProgramiv(ann->sumShaderID, GL_INFO_LOG_LENGTH, &length); + glGetProgramiv(ann->runShaderID, GL_INFO_LOG_LENGTH, &length); log = malloc(length+1); - glGetProgramInfoLog(ann->sumShaderID, length, &length, log); + glGetProgramInfoLog(ann->runShaderID, length, &length, log); log[length] = '\0'; fprintf(stderr, "%s", log); exit(-1); } + + ann->onGPU = 0; } /* #define FANN_NO_SEED */ @@ -216,8 +332,6 @@ FANN_EXTERNAL struct fann *FANN_API fann_create_sparse_array(float connection_ra return NULL; } - fann_create_shader(ann); - ann->connection_rate = connection_rate; #ifdef FIXEDFANN multiplier = ann->multiplier; @@ -634,10 +748,10 @@ FANN_EXTERNAL fann_type *FANN_API fann_run(struct fann * ann, fann_type * input) struct fann_layer *layer_it, *last_layer; unsigned int activation_function; fann_type steepness; - GLuint BO[2]; - GLuint texture; GLenum err; - float *data; + GLfloat *data; + int nparameters; + GLfloat *parameters; /* store some variabels local for fast access */ struct fann_neuron *first_neuron = ann->first_layer->first_neuron; @@ -677,6 +791,7 @@ FANN_EXTERNAL fann_type *FANN_API fann_run(struct fann * ann, fann_type * input) *((ann->first_layer->last_neuron - 1)->value) = 1; #endif +if (ann->gl == 0) { last_layer = ann->last_layer; for(layer_it = ann->first_layer + 1; layer_it != last_layer; layer_it++) { @@ -712,7 +827,6 @@ FANN_EXTERNAL fann_type *FANN_API fann_run(struct fann * ann, fann_type * input) neurons = (layer_it - 1)->first_neuron; } -//#ifdef PLAN9 /* unrolled loop start */ i = num_connections & 3; /* same as modulo 4 */ switch (i) @@ -737,33 +851,6 @@ FANN_EXTERNAL fann_type *FANN_API fann_run(struct fann * ann, fann_type * input) fann_mult(weights[i + 3], *(neurons[i + 3].value)); } /* unrolled loop end */ -#if 0 - glUseProgram(ann->sumShaderProgram); - glGenTextures(1, &texture); - glActiveTexture(GL_TEXTURE0); - glBindTexture(GL_TEXTURE_1D, texture); - glGenBuffers(2, BO); - - glBindBuffer(GL_SHADER_STORAGE_BUFFER, BO[0]); - glBufferData(GL_SHADER_STORAGE_BUFFER, num_connections * sizeof(GLfloat), weights, GL_STATIC_DRAW); - glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, BO[0]); - - glBindBuffer(GL_SHADER_STORAGE_BUFFER, BO[1]); - glBufferData(GL_SHADER_STORAGE_BUFFER, num_connections * sizeof(GLfloat), layer_it->values, GL_STATIC_DRAW); - glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, BO[1]); - - data = malloc(1 * sizeof(float)); - glBindImageTexture(0, texture, 0, GL_TRUE, 0, GL_READ_WRITE, GL_R32F); - - glDispatchCompute(num_connections/100, 1, 1); - glMemoryBarrier(GL_ALL_BARRIER_BITS); - - glGetTexImage(GL_TEXTURE_1D, 0, GL_RED, GL_FLOAT, data); - neuron_sum = data[0]; - free(data); - - glDeleteBuffers(3, BO); -#endif /* * for(i = 0;i != num_connections; i++){ * printf("%f += %f*%f, ", neuron_sum, weights[i], neurons[i].value); @@ -912,6 +999,67 @@ FANN_EXTERNAL fann_type *FANN_API fann_run(struct fann * ann, fann_type * input) { output[i] = *(neurons[i].value); } +} else { + if (ann->onGPU == 0) { + glGenBuffers(1, &ann->glnetwork); + + nparameters = 1; + nparameters += (int)(ann->last_layer - ann->first_layer); + parameters = calloc(sizeof(GLfloat), nparameters); + parameters[0] = nparameters - 1; + fprintf(stderr, "network: %0.0f ", parameters[0]); + for(i = 1, layer_it = ann->first_layer; layer_it != ann->last_layer; layer_it++, i++) { + parameters[i] = (int)(layer_it->last_neuron - layer_it->first_neuron); + fprintf(stderr, "%0.0f ", parameters[i]); + } + fprintf(stderr, "\n"); + + glBindBuffer(GL_SHADER_STORAGE_BUFFER, ann->glnetwork); + glBufferData(GL_SHADER_STORAGE_BUFFER, nparameters * sizeof(GLfloat), parameters, GL_STATIC_DRAW); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, ann->glnetwork); + + glGenBuffers(1, &ann->glweights); + + glBindBuffer(GL_SHADER_STORAGE_BUFFER, ann->glweights); + glBufferData(GL_SHADER_STORAGE_BUFFER, ann->total_connections * sizeof(fann_type), ann->weights, GL_STATIC_DRAW); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, ann->glweights); + + glGenBuffers(1, &ann->glvalues); + + glBindBuffer(GL_SHADER_STORAGE_BUFFER, ann->glvalues); + glBufferData(GL_SHADER_STORAGE_BUFFER, ann->total_neurons * sizeof(fann_type), ann->values, GL_STATIC_DRAW); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, ann->glvalues); + + ann->onGPU = 1; + } + + glGenBuffers(1, &ann->glinput); + + glBindBuffer(GL_SHADER_STORAGE_BUFFER, ann->glinput); + glBufferData(GL_SHADER_STORAGE_BUFFER, ann->num_input * sizeof(fann_type), input, GL_STATIC_DRAW); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 4, ann->glinput); + + glGenBuffers(1, &ann->gloutput); + + glBindBuffer(GL_SHADER_STORAGE_BUFFER, ann->gloutput); + glBufferData(GL_SHADER_STORAGE_BUFFER, ann->num_output * sizeof(fann_type), NULL, GL_STATIC_DRAW); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 5, ann->gloutput); + + glUseProgram(ann->runShaderProgram); + glDispatchCompute(1, 1, 1); + glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); + + glBindBuffer(GL_SHADER_STORAGE_BUFFER, ann->gloutput); + output = (float*)glMapBufferRange(GL_SHADER_STORAGE_BUFFER, 0, ann->num_output * sizeof(GLfloat), GL_MAP_READ_BIT); + for(i = 0; i != ann->num_output; i++) + ann->output[i] = output[i]; + glUnmapBuffer(GL_SHADER_STORAGE_BUFFER); + glDeleteProgram(ann->runShaderProgram); + + glDeleteBuffers(1, &ann->glinput); + glDeleteBuffers(1, &ann->gloutput); +} + return ann->output; } @@ -1687,6 +1835,7 @@ struct fann *fann_allocate_structure(unsigned int num_layers) return NULL; } + ann->gl = 0; ann->errno_f = FANN_E_NO_ERROR; ann->error_log = fann_default_error_log; ann->errstr = NULL; @@ -1815,6 +1964,9 @@ struct fann *fann_allocate_structure(unsigned int num_layers) ann->last_layer = ann->first_layer + num_layers; + fann_init_egl(); + fann_create_shaders(ann); + return ann; } @@ -1868,6 +2020,7 @@ void fann_allocate_neurons(struct fann *ann) /* all the neurons is allocated in one long array (calloc clears mem) */ neurons = (struct fann_neuron *) calloc(ann->total_neurons, sizeof(struct fann_neuron)); ann->total_neurons_allocated = ann->total_neurons; + ann->values = calloc(ann->total_neurons, sizeof(fann_type)); if(neurons == NULL) { @@ -1880,7 +2033,7 @@ void fann_allocate_neurons(struct fann *ann) num_neurons = (unsigned int)(layer_it->last_neuron - layer_it->first_neuron); layer_it->first_neuron = neurons + num_neurons_so_far; layer_it->last_neuron = layer_it->first_neuron + num_neurons; - layer_it->values = calloc(num_neurons, sizeof(fann_type)); + layer_it->values = &ann->values[num_neurons_so_far]; for (i = 0; i < num_neurons; i++) { neurons[num_neurons_so_far + i].value = &(layer_it->values[i]); } diff --git a/src/include/fann_data.h b/src/include/fann_data.h index 8d2bbcfe..1a96313a 100644 --- a/src/include/fann_data.h +++ b/src/include/fann_data.h @@ -781,8 +781,22 @@ struct fann */ fann_type *prev_weights_deltas; - GLuint sumShaderID; - GLuint sumShaderProgram; + GLuint runShaderID; + GLuint runShaderProgram; + + GLuint trainShaderID; + GLuint trainShaderProgram; + + unsigned char onGPU; + unsigned char gl; + + GLuint glweights; + GLuint glvalues; + GLuint glnetwork; + GLuint glinput; + GLuint gloutput; + + fann_type *values; #ifndef FIXEDFANN /* Arithmetic mean used to remove steady component in input data. */ From 5d09aa0f2aa7490a0558cc7ae029aa3a5a121be9 Mon Sep 17 00:00:00 2001 From: Eli Date: Sun, 5 Feb 2023 15:21:02 -0800 Subject: [PATCH 10/26] train shader (?) --- src/fann.c | 177 ++++++++++++++++++++++++++++++++++++---- src/fann_train.c | 40 ++++++++- src/include/fann_data.h | 1 + 3 files changed, 200 insertions(+), 18 deletions(-) diff --git a/src/fann.c b/src/fann.c index 3265151c..55ec7168 100644 --- a/src/fann.c +++ b/src/fann.c @@ -71,6 +71,10 @@ static const char* runShader = "#version 310 es\n" "{\n" " float e[];\n" "} values;\n" + "layout(binding = 3) buffer Errors\n" + "{\n" + " float e[];\n" + "} errors;\n" "layout(binding = 4) buffer Input\n" "{\n" " float e[];\n" @@ -86,7 +90,7 @@ static const char* runShader = "#version 310 es\n" " int layers;\n" " int i, o, inputs, outputs, n, l, total_neurons, total_weights;\n" " layers = int(network.e[0]);\n" - " n = int(network.e[1]) - 1;\n" + " n = int(network.e[1]);\n" " for (i = idx; i < n; i += threads)\n" " values.e[i] = input_data.e[i];\n" " barrier();\n" @@ -94,25 +98,30 @@ static const char* runShader = "#version 310 es\n" " total_weights = 0;\n" " for (l = 1; l < layers; l++) {\n" " inputs = int(network.e[l]);\n" - " outputs = int(network.e[l+1]) - 1;\n" + " outputs = int(network.e[l+1]);\n" " for (o = idx; o < outputs; o += threads)\n" " input_data.e[o] = 0.0;\n" " barrier();\n" - " values.e[total_neurons + inputs - 1] = 1.0;\n" + " values.e[total_neurons + inputs] = 1.0;\n" " for (o = idx; o < outputs; o += threads) {\n" - " n = o * inputs;\n" - " for (i = 0; i < inputs; i++)\n" + " n = o * inputs + o;\n" + " for (i = 0; i <= inputs; i++)\n" " input_data.e[o] += values.e[total_neurons + i] * weights.e[total_weights + n + i];\n" " }\n" " barrier();\n" - " total_neurons += inputs;\n" + " total_neurons += inputs + 1;\n" " for (o = idx; o < outputs; o += threads) {\n" + " input_data.e[o] *= 0.5;\n" + " if (input_data.e[o] > 300.0)\n" + " input_data.e[o] = 300.0;\n" + " else if (input_data.e[o] < -300.0)\n" + " input_data.e[o] = -300.0;\n" " if (input_data.e[o] < 0.0)\n" " input_data.e[o] *= 0.01;\n" - " values.e[total_neurons + o] = input_data.e[o] * 0.5;\n" + " values.e[total_neurons + o] = input_data.e[o];\n" " }\n" " barrier();\n" - " total_weights += inputs * outputs;\n" + " total_weights += inputs * outputs + outputs;\n" " }\n" " for (o = idx; o < outputs; o += threads)\n" " output_data.e[o] = values.e[total_neurons + o];\n" @@ -120,8 +129,102 @@ static const char* runShader = "#version 310 es\n" "}\n"; static const char* trainShader = "#version 310 es\n" + "precision lowp float;\n" + "layout(local_size_x = %d) in;\n" + "layout(std430) buffer;\n" + "layout(binding = 0) buffer Network\n" + "{\n" + " float e[];\n" + "} network;\n" + "layout(binding = 1) buffer Weights\n" + "{\n" + " float e[];\n" + "} weights;\n" + "layout(binding = 2) buffer Values\n" + "{\n" + " float e[];\n" + "} values;\n" + "layout(binding = 3) buffer Errors\n" + "{\n" + " float e[];\n" + "} errors;\n" + "layout(binding = 4) buffer Input\n" + "{\n" + " float e[];\n" + "} input_data;\n" + "layout(binding = 5) buffer Output\n" + "{\n" + " float e[];\n" + "} output_data;\n" "void main()\n" "{\n" + " int idx = int(gl_LocalInvocationID.x);\n" + " int threads = %d;\n" + " int layers;\n" + " int i, o, n, l, total_neurons, total_weights, outputs, inputs, neuron_prev;\n" + " float neuron_diff, tmp_error;\n" + " layers = int(network.e[0]);\n" + " n = int(network.e[1]) - 1;\n" + " for (i = idx; i < n; i += threads)\n" + " values.e[i] = input_data.e[i];\n" + " barrier();\n" + " total_neurons = 0;\n" + " total_weights = 0;\n" + " for (l = 1; l < layers; l++) {\n" + " total_neurons += int(network.e[i]);\n" + " total_weights += int(network.e[i]) * int(network.e[i+1]);\n" + " }\n" + " total_weights -= int(network.e[layers-1]) * int(network.e[layers]);\n" + " outputs = int(network.e[layers]) - 1;\n" + " for (o = idx; o < outputs; o += threads) {\n" + " neuron_diff = output_data.e[o] - values.e[total_neurons + o];\n" + " if(neuron_diff < -.9999999)\n" + " neuron_diff = -17.0;\n" + " else if(neuron_diff > .9999999)\n" + " neuron_diff = 17.0;\n" + " else\n" + " neuron_diff = log((1.0 + neuron_diff) / (1.0 - neuron_diff));\n" + " errors.e[total_neurons + o] = neuron_diff;\n" + " if (neuron_diff < 0.0)\n" + " errors.e[total_neurons + o] *= 0.01;\n" + " }\n" + " barrier();\n" + " for (l = layers; l > 2; l--) {\n" + " outputs = int(network.e[l]);\n" + " inputs = int(network.e[l-1]);\n" + " neuron_prev = total_neurons - int(network.e[l-1]);\n" + " for (i = idx; i < inputs; i += threads)\n" + " errors.e[neuron_prev + i] = 0.0;\n" + " barrier();\n" + " for (i = idx; i < inputs; i += threads)\n" + " for (o = 0; o < outputs; o++)\n" + " errors.e[neuron_prev + i] += errors.e[total_neurons + o] * weights.e[total_weights + i];\n" + " barrier();\n" + " for (i = idx; i < inputs; i += threads) {\n" + " errors.e[neuron_prev + i] *= 0.5;\n" + " if (errors.e[neuron_prev + i] < 0.0)\n" + " errors.e[neuron_prev + i] *= 0.01;\n" + " }\n" + " barrier();\n" + " total_neurons = neuron_prev;\n" + " total_weights -= int(network.e[l-2]) * int(network.e[l-1]);\n" + " }\n" + " total_neurons = int(network.e[1]);\n" + " neuron_prev = 0;\n" + " total_weights = 0;\n" + " for (l = 2; l <= layers; l++) {\n" + " outputs = int(network.e[l]);\n" + " inputs = int(network.e[l-1]);\n" + " for (o = idx; o < outputs; o += threads) {\n" + " tmp_error = errors.e[total_neurons + o] * 0.7;\n" + " for (i = 0; i < inputs; i++)\n" + " weights.e[total_weights + o * inputs + i] += tmp_error * values.e[neuron_prev + i];\n" + " }\n" + " barrier();\n" + " neuron_prev = total_neurons;\n" + " total_neurons += outputs;\n" + " total_weights += outputs * inputs;\n" + " }\n" "}\n"; void fann_init_egl(void) { @@ -172,6 +275,7 @@ void fann_create_shaders(struct fann *ann) GLint length; char *log; char *runShaderString; + char *trainShaderString; int threads; glGetIntegeri_v(GL_MAX_COMPUTE_WORK_GROUP_SIZE, 0, &threads); @@ -207,7 +311,38 @@ void fann_create_shaders(struct fann *ann) exit(-1); } + ann->trainShaderID = glCreateShader(GL_COMPUTE_SHADER); + + trainShaderString = malloc(strlen(trainShader) + 256); + snprintf(trainShaderString, strlen(trainShader) + 256 - 1, trainShader, threads, threads); + int trainShaderLen = strlen(trainShaderString); + glShaderSource(ann->trainShaderID, 1, (const char**)&trainShaderString, &trainShaderLen); + glCompileShader(ann->trainShaderID); + glGetShaderiv(ann->trainShaderID, GL_COMPILE_STATUS, &status); + if (status == GL_FALSE) { + glGetShaderiv(ann->trainShaderID, GL_INFO_LOG_LENGTH, &length); + log = malloc(length+1); + glGetShaderInfoLog(ann->trainShaderID, length, &length, log); + log[length] = '\0'; + fprintf(stderr, "%s", log); + exit(-1); + } + + ann->trainShaderProgram = glCreateProgram(); + glAttachShader(ann->trainShaderProgram, ann->trainShaderID); + glLinkProgram(ann->trainShaderProgram); + glGetShaderiv(ann->trainShaderID, GL_LINK_STATUS, &status); + if (status == GL_FALSE) { + glGetProgramiv(ann->trainShaderID, GL_INFO_LOG_LENGTH, &length); + log = malloc(length+1); + glGetProgramInfoLog(ann->trainShaderID, length, &length, log); + log[length] = '\0'; + fprintf(stderr, "%s", log); + exit(-1); + } + ann->onGPU = 0; + ann->onGPU_train_errors = 0; } /* #define FANN_NO_SEED */ @@ -1009,40 +1144,49 @@ if (ann->gl == 0) { parameters[0] = nparameters - 1; fprintf(stderr, "network: %0.0f ", parameters[0]); for(i = 1, layer_it = ann->first_layer; layer_it != ann->last_layer; layer_it++, i++) { - parameters[i] = (int)(layer_it->last_neuron - layer_it->first_neuron); + parameters[i] = (int)(layer_it->last_neuron - layer_it->first_neuron) - 1; fprintf(stderr, "%0.0f ", parameters[i]); } fprintf(stderr, "\n"); glBindBuffer(GL_SHADER_STORAGE_BUFFER, ann->glnetwork); - glBufferData(GL_SHADER_STORAGE_BUFFER, nparameters * sizeof(GLfloat), parameters, GL_STATIC_DRAW); + glBufferData(GL_SHADER_STORAGE_BUFFER, nparameters * sizeof(GLfloat), parameters, GL_DYNAMIC_COPY); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, ann->glnetwork); glGenBuffers(1, &ann->glweights); glBindBuffer(GL_SHADER_STORAGE_BUFFER, ann->glweights); - glBufferData(GL_SHADER_STORAGE_BUFFER, ann->total_connections * sizeof(fann_type), ann->weights, GL_STATIC_DRAW); + glBufferData(GL_SHADER_STORAGE_BUFFER, ann->total_connections * sizeof(fann_type), ann->weights, GL_DYNAMIC_COPY); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, ann->glweights); glGenBuffers(1, &ann->glvalues); glBindBuffer(GL_SHADER_STORAGE_BUFFER, ann->glvalues); - glBufferData(GL_SHADER_STORAGE_BUFFER, ann->total_neurons * sizeof(fann_type), ann->values, GL_STATIC_DRAW); + glBufferData(GL_SHADER_STORAGE_BUFFER, ann->total_neurons * sizeof(fann_type), ann->values, GL_DYNAMIC_COPY); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, ann->glvalues); + glGenBuffers(1, &ann->glerrors); + + glBindBuffer(GL_SHADER_STORAGE_BUFFER, ann->glerrors); + glBufferData(GL_SHADER_STORAGE_BUFFER, ann->total_neurons * sizeof(fann_type), NULL, GL_DYNAMIC_COPY); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 3, ann->glerrors); + ann->onGPU = 1; } + GLfloat *glinput = malloc(sizeof(GLfloat) * ann->num_input); + for (i = 0; i < ann->num_input; i++) + glinput[i] = input[i]; glGenBuffers(1, &ann->glinput); glBindBuffer(GL_SHADER_STORAGE_BUFFER, ann->glinput); - glBufferData(GL_SHADER_STORAGE_BUFFER, ann->num_input * sizeof(fann_type), input, GL_STATIC_DRAW); + glBufferData(GL_SHADER_STORAGE_BUFFER, ann->num_input * sizeof(GLfloat), glinput, GL_DYNAMIC_COPY); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 4, ann->glinput); glGenBuffers(1, &ann->gloutput); glBindBuffer(GL_SHADER_STORAGE_BUFFER, ann->gloutput); - glBufferData(GL_SHADER_STORAGE_BUFFER, ann->num_output * sizeof(fann_type), NULL, GL_STATIC_DRAW); + glBufferData(GL_SHADER_STORAGE_BUFFER, ann->num_output * sizeof(GLfloat), NULL, GL_DYNAMIC_COPY); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 5, ann->gloutput); glUseProgram(ann->runShaderProgram); @@ -1050,11 +1194,10 @@ if (ann->gl == 0) { glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); glBindBuffer(GL_SHADER_STORAGE_BUFFER, ann->gloutput); - output = (float*)glMapBufferRange(GL_SHADER_STORAGE_BUFFER, 0, ann->num_output * sizeof(GLfloat), GL_MAP_READ_BIT); + data = (GLfloat*)glMapBufferRange(GL_SHADER_STORAGE_BUFFER, 0, ann->num_output * sizeof(GLfloat), GL_MAP_READ_BIT); for(i = 0; i != ann->num_output; i++) - ann->output[i] = output[i]; + ann->output[i] = data[i]; glUnmapBuffer(GL_SHADER_STORAGE_BUFFER); - glDeleteProgram(ann->runShaderProgram); glDeleteBuffers(1, &ann->glinput); glDeleteBuffers(1, &ann->gloutput); diff --git a/src/fann_train.c b/src/fann_train.c index 9c12c4e1..28ae7092 100644 --- a/src/fann_train.c +++ b/src/fann_train.c @@ -103,11 +103,49 @@ FANN_EXTERNAL void FANN_API fann_train(struct fann *ann, fann_type * input, { fann_run(ann, input); +if (ann->gl == 0) { fann_compute_MSE(ann, desired_output); fann_backpropagate_MSE(ann); fann_update_weights(ann); +} else { + int i; + fann_type error; + + for (i = 0; i < ann->num_output; i++) { + error = desired_output[i] - ann->output[i]; + ann->MSE_value += error * error; + } + + GLfloat *glinput = malloc(sizeof(GLfloat) * ann->num_input); + for (i = 0; i < ann->num_input; i++) + glinput[i] = input[i]; + glGenBuffers(1, &ann->glinput); + + glBindBuffer(GL_SHADER_STORAGE_BUFFER, ann->glinput); + glBufferData(GL_SHADER_STORAGE_BUFFER, ann->num_input * sizeof(GLfloat), glinput, GL_DYNAMIC_COPY); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 4, ann->glinput); + + GLfloat *gloutput = malloc(sizeof(GLfloat) * ann->num_output); + for (i = 0; i < ann->num_output; i++) + gloutput[i] = desired_output[i]; + glGenBuffers(1, &ann->gloutput); + + glBindBuffer(GL_SHADER_STORAGE_BUFFER, ann->gloutput); + glBufferData(GL_SHADER_STORAGE_BUFFER, ann->num_output * sizeof(GLfloat), gloutput, GL_DYNAMIC_COPY); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 5, ann->gloutput); + + glUseProgram(ann->trainShaderProgram); + glDispatchCompute(1, 1, 1); + glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); + + glDeleteBuffers(1, &ann->glinput); + glDeleteBuffers(1, &ann->gloutput); + free(glinput); + free(gloutput); +} + } #endif @@ -387,7 +425,7 @@ void fann_update_weights(struct fann *ann) /* store some variabels local for fast access */ const float learning_rate = ann->learning_rate; - const float learning_momentum = ann->learning_momentum; + const float learning_momentum = ann->learning_momentum; struct fann_neuron *first_neuron = ann->first_layer->first_neuron; struct fann_layer *first_layer = ann->first_layer; const struct fann_layer *last_layer = ann->last_layer; diff --git a/src/include/fann_data.h b/src/include/fann_data.h index 1a96313a..bd7c4756 100644 --- a/src/include/fann_data.h +++ b/src/include/fann_data.h @@ -790,6 +790,7 @@ struct fann unsigned char onGPU; unsigned char gl; + GLuint glerrors; GLuint glweights; GLuint glvalues; GLuint glnetwork; From 6c2215a08edae2a5aa3a859d0171a6899e911582 Mon Sep 17 00:00:00 2001 From: Eli Date: Sun, 5 Feb 2023 18:23:36 -0800 Subject: [PATCH 11/26] gl train (leaky relus only) --- src/fann.c | 42 +++++++++++++++++++++--------------------- src/fann_train.c | 1 - 2 files changed, 21 insertions(+), 22 deletions(-) diff --git a/src/fann.c b/src/fann.c index 55ec7168..cd495f67 100644 --- a/src/fann.c +++ b/src/fann.c @@ -161,21 +161,22 @@ static const char* trainShader = "#version 310 es\n" " int idx = int(gl_LocalInvocationID.x);\n" " int threads = %d;\n" " int layers;\n" - " int i, o, n, l, total_neurons, total_weights, outputs, inputs, neuron_prev;\n" + " int i, o, l, total_neurons, total_weights, outputs, inputs, neuron_prev;\n" " float neuron_diff, tmp_error;\n" " layers = int(network.e[0]);\n" - " n = int(network.e[1]) - 1;\n" - " for (i = idx; i < n; i += threads)\n" + " inputs = int(network.e[1]);\n" + " for (i = idx; i < inputs; i += threads)\n" " values.e[i] = input_data.e[i];\n" + " values.e[inputs] = 1.0;\n" " barrier();\n" " total_neurons = 0;\n" " total_weights = 0;\n" " for (l = 1; l < layers; l++) {\n" - " total_neurons += int(network.e[i]);\n" - " total_weights += int(network.e[i]) * int(network.e[i+1]);\n" + " total_neurons += int(network.e[l]) + 1;\n" + " total_weights += (int(network.e[l]) + 1) * int(network.e[l+1]);\n" " }\n" - " total_weights -= int(network.e[layers-1]) * int(network.e[layers]);\n" - " outputs = int(network.e[layers]) - 1;\n" + " total_weights -= (int(network.e[layers-1]) + 1) * int(network.e[layers]);\n" + " outputs = int(network.e[layers]);\n" " for (o = idx; o < outputs; o += threads) {\n" " neuron_diff = output_data.e[o] - values.e[total_neurons + o];\n" " if(neuron_diff < -.9999999)\n" @@ -184,32 +185,32 @@ static const char* trainShader = "#version 310 es\n" " neuron_diff = 17.0;\n" " else\n" " neuron_diff = log((1.0 + neuron_diff) / (1.0 - neuron_diff));\n" - " errors.e[total_neurons + o] = neuron_diff;\n" - " if (neuron_diff < 0.0)\n" + " errors.e[total_neurons + o] = neuron_diff * 0.5;\n" + " if (values.e[total_neurons + o] < 0.0)\n" " errors.e[total_neurons + o] *= 0.01;\n" " }\n" " barrier();\n" " for (l = layers; l > 2; l--) {\n" " outputs = int(network.e[l]);\n" " inputs = int(network.e[l-1]);\n" - " neuron_prev = total_neurons - int(network.e[l-1]);\n" - " for (i = idx; i < inputs; i += threads)\n" + " neuron_prev = total_neurons - inputs - 1;\n" + " for (i = idx; i <= inputs; i += threads)\n" " errors.e[neuron_prev + i] = 0.0;\n" " barrier();\n" " for (i = idx; i < inputs; i += threads)\n" " for (o = 0; o < outputs; o++)\n" - " errors.e[neuron_prev + i] += errors.e[total_neurons + o] * weights.e[total_weights + i];\n" + " errors.e[neuron_prev + i] += errors.e[total_neurons + o] * weights.e[total_weights + o * inputs + o + i];\n" " barrier();\n" " for (i = idx; i < inputs; i += threads) {\n" " errors.e[neuron_prev + i] *= 0.5;\n" - " if (errors.e[neuron_prev + i] < 0.0)\n" + " if (values.e[neuron_prev + i] < 0.0)\n" " errors.e[neuron_prev + i] *= 0.01;\n" " }\n" " barrier();\n" " total_neurons = neuron_prev;\n" - " total_weights -= int(network.e[l-2]) * int(network.e[l-1]);\n" + " total_weights -= (int(network.e[l-2]) + 1) * inputs;\n" " }\n" - " total_neurons = int(network.e[1]);\n" + " total_neurons = int(network.e[1]) + 1;\n" " neuron_prev = 0;\n" " total_weights = 0;\n" " for (l = 2; l <= layers; l++) {\n" @@ -217,13 +218,13 @@ static const char* trainShader = "#version 310 es\n" " inputs = int(network.e[l-1]);\n" " for (o = idx; o < outputs; o += threads) {\n" " tmp_error = errors.e[total_neurons + o] * 0.7;\n" - " for (i = 0; i < inputs; i++)\n" - " weights.e[total_weights + o * inputs + i] += tmp_error * values.e[neuron_prev + i];\n" + " for (i = 0; i <= inputs; i++)\n" + " weights.e[total_weights + o * inputs + o + i] += tmp_error * values.e[neuron_prev + i];\n" " }\n" " barrier();\n" " neuron_prev = total_neurons;\n" - " total_neurons += outputs;\n" - " total_weights += outputs * inputs;\n" + " total_neurons += outputs + 1;\n" + " total_weights += outputs * inputs + outputs;\n" " }\n" "}\n"; @@ -342,7 +343,6 @@ void fann_create_shaders(struct fann *ann) } ann->onGPU = 0; - ann->onGPU_train_errors = 0; } /* #define FANN_NO_SEED */ @@ -1147,7 +1147,7 @@ if (ann->gl == 0) { parameters[i] = (int)(layer_it->last_neuron - layer_it->first_neuron) - 1; fprintf(stderr, "%0.0f ", parameters[i]); } - fprintf(stderr, "\n"); + fprintf(stderr, " total: %d\n", ann->total_neurons); glBindBuffer(GL_SHADER_STORAGE_BUFFER, ann->glnetwork); glBufferData(GL_SHADER_STORAGE_BUFFER, nparameters * sizeof(GLfloat), parameters, GL_DYNAMIC_COPY); diff --git a/src/fann_train.c b/src/fann_train.c index 28ae7092..60bb8d64 100644 --- a/src/fann_train.c +++ b/src/fann_train.c @@ -322,7 +322,6 @@ void fann_compute_MSE(struct fann *ann, fann_type * desired_output) *error_it = fann_activation_derived(last_layer_begin->activation_function, last_layer_begin->activation_steepness, neuron_value, last_layer_begin->sum) * neuron_diff; - desired_output++; error_it++; From 0bdc64e23cca4206b4456b154ab29020af3da85f Mon Sep 17 00:00:00 2001 From: Eli Date: Mon, 6 Feb 2023 13:40:52 -0800 Subject: [PATCH 12/26] PLAN9 ifndefs, fann_from_gpu for weights --- src/fann.c | 62 +++++++++++++++++++++++++++++++++++----------- src/fann_train.c | 5 +++- src/include/fann.h | 5 ++++ 3 files changed, 57 insertions(+), 15 deletions(-) diff --git a/src/fann.c b/src/fann.c index cd495f67..98ba50a6 100644 --- a/src/fann.c +++ b/src/fann.c @@ -30,20 +30,15 @@ I will be metal. */ -#ifdef PLAN9 #include #include #include -#include #include #include +#ifdef PLAN9 +#include #else -#include -#include #include -#include -#include -#include #include #include #include @@ -55,6 +50,7 @@ #include "config.h" #include "fann.h" +#ifndef PLAN9 static const char* runShader = "#version 310 es\n" "precision lowp float;\n" "layout(local_size_x = %d) in;\n" @@ -344,6 +340,7 @@ void fann_create_shaders(struct fann *ann) ann->onGPU = 0; } +#endif /* #define FANN_NO_SEED */ @@ -884,9 +881,13 @@ FANN_EXTERNAL fann_type *FANN_API fann_run(struct fann * ann, fann_type * input) unsigned int activation_function; fann_type steepness; GLenum err; +#ifndef PLAN9 GLfloat *data; + GLfloat *glvalues; + GLfloat *glweights; int nparameters; GLfloat *parameters; +#endif /* PLAN9 */ /* store some variabels local for fast access */ struct fann_neuron *first_neuron = ann->first_layer->first_neuron; @@ -926,7 +927,9 @@ FANN_EXTERNAL fann_type *FANN_API fann_run(struct fann * ann, fann_type * input) *((ann->first_layer->last_neuron - 1)->value) = 1; #endif +#ifndef PLAN9 if (ann->gl == 0) { +#endif last_layer = ann->last_layer; for(layer_it = ann->first_layer + 1; layer_it != last_layer; layer_it++) { @@ -1134,6 +1137,7 @@ if (ann->gl == 0) { { output[i] = *(neurons[i].value); } +#ifndef PLAN9 } else { if (ann->onGPU == 0) { glGenBuffers(1, &ann->glnetwork); @@ -1142,12 +1146,12 @@ if (ann->gl == 0) { nparameters += (int)(ann->last_layer - ann->first_layer); parameters = calloc(sizeof(GLfloat), nparameters); parameters[0] = nparameters - 1; - fprintf(stderr, "network: %0.0f ", parameters[0]); +// fprintf(stderr, "network: %0.0f ", parameters[0]); for(i = 1, layer_it = ann->first_layer; layer_it != ann->last_layer; layer_it++, i++) { parameters[i] = (int)(layer_it->last_neuron - layer_it->first_neuron) - 1; - fprintf(stderr, "%0.0f ", parameters[i]); +// fprintf(stderr, "%0.0f ", parameters[i]); } - fprintf(stderr, " total: %d\n", ann->total_neurons); +// fprintf(stderr, "total: %d\n", ann->total_neurons); glBindBuffer(GL_SHADER_STORAGE_BUFFER, ann->glnetwork); glBufferData(GL_SHADER_STORAGE_BUFFER, nparameters * sizeof(GLfloat), parameters, GL_DYNAMIC_COPY); @@ -1155,20 +1159,32 @@ if (ann->gl == 0) { glGenBuffers(1, &ann->glweights); + glweights = calloc(sizeof(GLfloat), ann->total_connections); + for (i = 0; i != ann->total_connections; i++) + glweights[i] = ann->weights[i]; + glBindBuffer(GL_SHADER_STORAGE_BUFFER, ann->glweights); - glBufferData(GL_SHADER_STORAGE_BUFFER, ann->total_connections * sizeof(fann_type), ann->weights, GL_DYNAMIC_COPY); + glBufferData(GL_SHADER_STORAGE_BUFFER, ann->total_connections * sizeof(GLfloat), glweights, GL_DYNAMIC_COPY); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, ann->glweights); + free(glweights); + glGenBuffers(1, &ann->glvalues); + glvalues = calloc(sizeof(GLfloat), ann->total_neurons); + for (i = 0; i != ann->total_neurons; i++) + glvalues[i] = ann->values[i]; + glBindBuffer(GL_SHADER_STORAGE_BUFFER, ann->glvalues); - glBufferData(GL_SHADER_STORAGE_BUFFER, ann->total_neurons * sizeof(fann_type), ann->values, GL_DYNAMIC_COPY); + glBufferData(GL_SHADER_STORAGE_BUFFER, ann->total_neurons * sizeof(GLfloat), glvalues, GL_DYNAMIC_COPY); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, ann->glvalues); + free(glvalues); + glGenBuffers(1, &ann->glerrors); glBindBuffer(GL_SHADER_STORAGE_BUFFER, ann->glerrors); - glBufferData(GL_SHADER_STORAGE_BUFFER, ann->total_neurons * sizeof(fann_type), NULL, GL_DYNAMIC_COPY); + glBufferData(GL_SHADER_STORAGE_BUFFER, ann->total_neurons * sizeof(GLfloat), NULL, GL_DYNAMIC_COPY); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 3, ann->glerrors); ann->onGPU = 1; @@ -1202,10 +1218,26 @@ if (ann->gl == 0) { glDeleteBuffers(1, &ann->glinput); glDeleteBuffers(1, &ann->gloutput); } - +#endif return ann->output; } +#ifndef PLAN9 +FANN_EXTERNAL void FANN_API fann_from_gpu(struct fann *ann) +{ + GLfloat *data; + int i; + + if (ann->gl != 0) { + glBindBuffer(GL_SHADER_STORAGE_BUFFER, ann->glweights); + data = (GLfloat*)glMapBufferRange(GL_SHADER_STORAGE_BUFFER, 0, ann->total_connections * sizeof(GLfloat), GL_MAP_READ_BIT); + for(i = 0; i != ann->total_connections; i++) + ann->weights[i] = data[i]; + glUnmapBuffer(GL_SHADER_STORAGE_BUFFER); + } +} +#endif /* PLAN9 */ + FANN_EXTERNAL void FANN_API fann_destroy(struct fann *ann) { if(ann == NULL) @@ -2107,8 +2139,10 @@ struct fann *fann_allocate_structure(unsigned int num_layers) ann->last_layer = ann->first_layer + num_layers; +#ifndef PLAN9 fann_init_egl(); fann_create_shaders(ann); +#endif return ann; } diff --git a/src/fann_train.c b/src/fann_train.c index 60bb8d64..19c86cd7 100644 --- a/src/fann_train.c +++ b/src/fann_train.c @@ -103,12 +103,15 @@ FANN_EXTERNAL void FANN_API fann_train(struct fann *ann, fann_type * input, { fann_run(ann, input); +#ifndef PLAN9 if (ann->gl == 0) { +#endif fann_compute_MSE(ann, desired_output); fann_backpropagate_MSE(ann); fann_update_weights(ann); +#ifndef PLAN9 } else { int i; fann_type error; @@ -145,7 +148,7 @@ if (ann->gl == 0) { free(glinput); free(gloutput); } - +#endif } #endif diff --git a/src/include/fann.h b/src/include/fann.h index 54027120..3dd8b74a 100644 --- a/src/include/fann.h +++ b/src/include/fann.h @@ -632,6 +632,11 @@ FANN_EXTERNAL void FANN_API fann_disable_seed_rand(); */ FANN_EXTERNAL void FANN_API fann_enable_seed_rand(); +#ifndef PLAN9 + +FANN_EXTERNAL void FANN_API fann_from_gpu(struct fann *ann); + +#endif /* PLAN9 */ #ifdef FIXEDFANN From 0294836f9ff3ce19ddd4b2ea40787ad2087e3523 Mon Sep 17 00:00:00 2001 From: Eli Date: Mon, 6 Feb 2023 13:42:30 -0800 Subject: [PATCH 13/26] PLAN9 ifndef --- src/include/fann_data.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/include/fann_data.h b/src/include/fann_data.h index bd7c4756..8efdbeb2 100644 --- a/src/include/fann_data.h +++ b/src/include/fann_data.h @@ -781,6 +781,7 @@ struct fann */ fann_type *prev_weights_deltas; +#ifndef PLAN9 GLuint runShaderID; GLuint runShaderProgram; @@ -796,6 +797,7 @@ struct fann GLuint glnetwork; GLuint glinput; GLuint gloutput; +#endif /* PLAN9 */ fann_type *values; From 6c7ed9e697f5a31810c04e8c5afd66838d1b4811 Mon Sep 17 00:00:00 2001 From: Eli Date: Mon, 6 Feb 2023 13:43:36 -0800 Subject: [PATCH 14/26] PLAN9 ifndef --- src/fann.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/fann.c b/src/fann.c index 98ba50a6..3affd06d 100644 --- a/src/fann.c +++ b/src/fann.c @@ -880,8 +880,8 @@ FANN_EXTERNAL fann_type *FANN_API fann_run(struct fann * ann, fann_type * input) struct fann_layer *layer_it, *last_layer; unsigned int activation_function; fann_type steepness; - GLenum err; #ifndef PLAN9 + GLenum err; GLfloat *data; GLfloat *glvalues; GLfloat *glweights; @@ -2010,7 +2010,9 @@ struct fann *fann_allocate_structure(unsigned int num_layers) return NULL; } +#ifndef PLAN9 ann->gl = 0; +#endif ann->errno_f = FANN_E_NO_ERROR; ann->error_log = fann_default_error_log; ann->errstr = NULL; From d707c8d87daba4fb5216d7a966fa0fa9e132da14 Mon Sep 17 00:00:00 2001 From: Eli Date: Mon, 6 Feb 2023 13:49:18 -0800 Subject: [PATCH 15/26] PLAN9 ifndef --- src/include/fann.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/include/fann.h b/src/include/fann.h index 3dd8b74a..39d62544 100644 --- a/src/include/fann.h +++ b/src/include/fann.h @@ -137,9 +137,9 @@ extern "C" #endif /* FANN_DLL_EXPORTS*/ #define FANN_API __stdcall #else /* */ -#ifndef PLAN9 #define FANN_EXTERNAL #define FANN_API +#ifndef PLAN9 #include #endif #endif /* _MSC_VER */ From 54eb717c774acd4f9fb7bbab2cfdd3f18c7daa5e Mon Sep 17 00:00:00 2001 From: Eli Date: Mon, 6 Feb 2023 13:52:38 -0800 Subject: [PATCH 16/26] PLAN9 ifndef --- src/include/fann_error.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/include/fann_error.h b/src/include/fann_error.h index 58990fec..ccc0ea62 100644 --- a/src/include/fann_error.h +++ b/src/include/fann_error.h @@ -22,6 +22,8 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA #ifndef PLAN9 #include +#else +#define FANN_API #endif #define FANN_ERRSTR_MAX 128 From 0a3a84ecec8d86658b5022071cdabc5a0da43fa5 Mon Sep 17 00:00:00 2001 From: Eli Date: Mon, 6 Feb 2023 13:53:48 -0800 Subject: [PATCH 17/26] PLAN9 ifdef --- src/include/fann_error.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/include/fann_error.h b/src/include/fann_error.h index ccc0ea62..e5ead2b0 100644 --- a/src/include/fann_error.h +++ b/src/include/fann_error.h @@ -20,9 +20,8 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA #ifndef __fann_error_h__ #define __fann_error_h__ -#ifndef PLAN9 #include -#else +#ifdef PLAN9 #define FANN_API #endif From 7162449edde6ec252a44a6fd5bebfc196f2b7ce0 Mon Sep 17 00:00:00 2001 From: Eli Date: Mon, 6 Feb 2023 13:54:39 -0800 Subject: [PATCH 18/26] PLAN9 ifdef --- src/include/fann_error.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/include/fann_error.h b/src/include/fann_error.h index e5ead2b0..f119abd6 100644 --- a/src/include/fann_error.h +++ b/src/include/fann_error.h @@ -22,6 +22,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA #include #ifdef PLAN9 +#define FANN_EXTERNAL #define FANN_API #endif From 48ee73611ec838117f7992c344523e2d7a35642b Mon Sep 17 00:00:00 2001 From: Eli Date: Thu, 9 Feb 2023 09:22:52 -0800 Subject: [PATCH 19/26] fixes --- src/fann.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/src/fann.c b/src/fann.c index 3affd06d..3f0e3705 100644 --- a/src/fann.c +++ b/src/fann.c @@ -52,7 +52,7 @@ #ifndef PLAN9 static const char* runShader = "#version 310 es\n" - "precision lowp float;\n" + "precision highp float;\n" "layout(local_size_x = %d) in;\n" "layout(std430) buffer;\n" "layout(binding = 0) buffer Network\n" @@ -84,10 +84,10 @@ static const char* runShader = "#version 310 es\n" " int idx = int(gl_LocalInvocationID.x);\n" " int threads = %d;\n" " int layers;\n" - " int i, o, inputs, outputs, n, l, total_neurons, total_weights;\n" + " int i, o, n, inputs, outputs, l, total_neurons, total_weights;\n" " layers = int(network.e[0]);\n" - " n = int(network.e[1]);\n" - " for (i = idx; i < n; i += threads)\n" + " inputs = int(network.e[1]);\n" + " for (i = idx; i < inputs; i += threads)\n" " values.e[i] = input_data.e[i];\n" " barrier();\n" " total_neurons = 0;\n" @@ -125,7 +125,7 @@ static const char* runShader = "#version 310 es\n" "}\n"; static const char* trainShader = "#version 310 es\n" - "precision lowp float;\n" + "precision highp float;\n" "layout(local_size_x = %d) in;\n" "layout(std430) buffer;\n" "layout(binding = 0) buffer Network\n" @@ -1146,12 +1146,8 @@ if (ann->gl == 0) { nparameters += (int)(ann->last_layer - ann->first_layer); parameters = calloc(sizeof(GLfloat), nparameters); parameters[0] = nparameters - 1; -// fprintf(stderr, "network: %0.0f ", parameters[0]); - for(i = 1, layer_it = ann->first_layer; layer_it != ann->last_layer; layer_it++, i++) { + for(i = 1, layer_it = ann->first_layer; layer_it != ann->last_layer; layer_it++, i++) parameters[i] = (int)(layer_it->last_neuron - layer_it->first_neuron) - 1; -// fprintf(stderr, "%0.0f ", parameters[i]); - } -// fprintf(stderr, "total: %d\n", ann->total_neurons); glBindBuffer(GL_SHADER_STORAGE_BUFFER, ann->glnetwork); glBufferData(GL_SHADER_STORAGE_BUFFER, nparameters * sizeof(GLfloat), parameters, GL_DYNAMIC_COPY); @@ -1217,6 +1213,7 @@ if (ann->gl == 0) { glDeleteBuffers(1, &ann->glinput); glDeleteBuffers(1, &ann->gloutput); + free(glinput); } #endif return ann->output; From 94c860f5c3b79901ee18253a4350afa7c6aff59d Mon Sep 17 00:00:00 2001 From: Eli Date: Fri, 10 Feb 2023 12:21:05 -0800 Subject: [PATCH 20/26] glMapBufferRange(... GL_MAP_PERSISTENT_BIT) --- src/fann.c | 91 +++++++++++++++++++++++++---------------- src/fann_train.c | 25 +++-------- src/include/fann_data.h | 3 ++ 3 files changed, 63 insertions(+), 56 deletions(-) diff --git a/src/fann.c b/src/fann.c index 3f0e3705..331e26be 100644 --- a/src/fann.c +++ b/src/fann.c @@ -85,7 +85,7 @@ static const char* runShader = "#version 310 es\n" " int threads = %d;\n" " int layers;\n" " int i, o, n, inputs, outputs, l, total_neurons, total_weights;\n" - " layers = int(network.e[0]);\n" + " layers = int(network.e[0]) - 1;\n" " inputs = int(network.e[1]);\n" " for (i = idx; i < inputs; i += threads)\n" " values.e[i] = input_data.e[i];\n" @@ -96,31 +96,50 @@ static const char* runShader = "#version 310 es\n" " inputs = int(network.e[l]);\n" " outputs = int(network.e[l+1]);\n" " for (o = idx; o < outputs; o += threads)\n" - " input_data.e[o] = 0.0;\n" + " errors.e[o] = 0.0;\n" " barrier();\n" " values.e[total_neurons + inputs] = 1.0;\n" " for (o = idx; o < outputs; o += threads) {\n" " n = o * inputs + o;\n" " for (i = 0; i <= inputs; i++)\n" - " input_data.e[o] += values.e[total_neurons + i] * weights.e[total_weights + n + i];\n" + " errors.e[o] += values.e[total_neurons + i] * weights.e[total_weights + n + i];\n" " }\n" " barrier();\n" " total_neurons += inputs + 1;\n" " for (o = idx; o < outputs; o += threads) {\n" - " input_data.e[o] *= 0.5;\n" - " if (input_data.e[o] > 300.0)\n" - " input_data.e[o] = 300.0;\n" - " else if (input_data.e[o] < -300.0)\n" - " input_data.e[o] = -300.0;\n" - " if (input_data.e[o] < 0.0)\n" - " input_data.e[o] *= 0.01;\n" - " values.e[total_neurons + o] = input_data.e[o];\n" + " errors.e[o] *= 0.5;\n" + " if (errors.e[o] > 300.0)\n" + " errors.e[o] = 300.0;\n" + " else if (errors.e[o] < -300.0)\n" + " errors.e[o] = -300.0;\n" + " if (errors.e[o] < 0.0)\n" + " errors.e[o] *= 0.01;\n" + " values.e[total_neurons + o] = errors.e[o];\n" " }\n" " barrier();\n" " total_weights += inputs * outputs + outputs;\n" " }\n" + " inputs = int(network.e[layers]);\n" + " outputs = int(network.e[layers+1]);\n" " for (o = idx; o < outputs; o += threads)\n" + " errors.e[o] = 0.0;\n" + " barrier();\n" + " values.e[total_neurons + inputs] = 1.0;\n" + " for (o = idx; o < outputs; o += threads) {\n" + " n = o * inputs + o;\n" + " for (i = 0; i <= inputs; i++)\n" + " errors.e[o] += values.e[total_neurons + i] * weights.e[total_weights + n + i];\n" + " }\n" + " barrier();\n" + " total_neurons += inputs + 1;\n" + " for (o = idx; o < outputs; o += threads) {\n" + " if (errors.e[o] > 600.0)\n" + " errors.e[o] = 600.0;\n" + " else if (errors.e[o] < -600.0)\n" + " errors.e[o] = -600.0;\n" + " values.e[total_neurons + o] = (1.0/(1.0 + exp(-errors.e[o])));\n" " output_data.e[o] = values.e[total_neurons + o];\n" + " }\n" " barrier();\n" "}\n"; @@ -181,9 +200,7 @@ static const char* trainShader = "#version 310 es\n" " neuron_diff = 17.0;\n" " else\n" " neuron_diff = log((1.0 + neuron_diff) / (1.0 - neuron_diff));\n" - " errors.e[total_neurons + o] = neuron_diff * 0.5;\n" - " if (values.e[total_neurons + o] < 0.0)\n" - " errors.e[total_neurons + o] *= 0.01;\n" + " errors.e[total_neurons + o] = neuron_diff * values.e[total_neurons + o] * (1.0 - values.e[total_neurons + o]);\n" " }\n" " barrier();\n" " for (l = layers; l > 2; l--) {\n" @@ -264,6 +281,9 @@ void fann_init_egl(void) { printf("eglMakeCurrent failed returned %d\n", returnValue); exit(-1); } + + fprintf(stderr, "%s\n", glGetString(GL_VERSION)); + fprintf(stderr, "%s\n", glGetString(GL_EXTENSIONS)); } void fann_create_shaders(struct fann *ann) @@ -1183,37 +1203,36 @@ if (ann->gl == 0) { glBufferData(GL_SHADER_STORAGE_BUFFER, ann->total_neurons * sizeof(GLfloat), NULL, GL_DYNAMIC_COPY); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 3, ann->glerrors); - ann->onGPU = 1; - } + glGenBuffers(1, &ann->glinput); - GLfloat *glinput = malloc(sizeof(GLfloat) * ann->num_input); - for (i = 0; i < ann->num_input; i++) - glinput[i] = input[i]; - glGenBuffers(1, &ann->glinput); + glBindBuffer(GL_SHADER_STORAGE_BUFFER, ann->glinput); + glBufferStorage(GL_SHADER_STORAGE_BUFFER, ann->num_input * sizeof(GLfloat), NULL, GL_MAP_WRITE_BIT|GL_MAP_PERSISTENT_BIT|GL_MAP_COHERENT_BIT); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 4, ann->glinput); + glBindBuffer(GL_SHADER_STORAGE_BUFFER, ann->glinput); + ann->glinputdata = (GLfloat*)glMapBufferRange(GL_SHADER_STORAGE_BUFFER, 0, ann->num_input * sizeof(GLfloat), GL_MAP_WRITE_BIT|GL_MAP_COHERENT_BIT|GL_MAP_PERSISTENT_BIT); + + glGenBuffers(1, &ann->gloutput); - glBindBuffer(GL_SHADER_STORAGE_BUFFER, ann->glinput); - glBufferData(GL_SHADER_STORAGE_BUFFER, ann->num_input * sizeof(GLfloat), glinput, GL_DYNAMIC_COPY); - glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 4, ann->glinput); + glBindBuffer(GL_SHADER_STORAGE_BUFFER, ann->gloutput); + glBufferStorage(GL_SHADER_STORAGE_BUFFER, ann->num_output * sizeof(GLfloat), NULL, GL_MAP_READ_BIT|GL_MAP_WRITE_BIT|GL_MAP_PERSISTENT_BIT|GL_MAP_COHERENT_BIT); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 5, ann->gloutput); + glBindBuffer(GL_SHADER_STORAGE_BUFFER, ann->gloutput); + ann->gloutputdata = (GLfloat*)glMapBufferRange(GL_SHADER_STORAGE_BUFFER, 0, ann->num_output * sizeof(GLfloat), GL_MAP_READ_BIT|GL_MAP_WRITE_BIT|GL_MAP_COHERENT_BIT|GL_MAP_PERSISTENT_BIT); - glGenBuffers(1, &ann->gloutput); + ann->onGPU = 1; + } - glBindBuffer(GL_SHADER_STORAGE_BUFFER, ann->gloutput); - glBufferData(GL_SHADER_STORAGE_BUFFER, ann->num_output * sizeof(GLfloat), NULL, GL_DYNAMIC_COPY); - glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 5, ann->gloutput); + for (i = 0; i < ann->num_input; i++) + ann->glinputdata[i] = input[i]; + glFinish(); glUseProgram(ann->runShaderProgram); glDispatchCompute(1, 1, 1); - glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); + glMemoryBarrier(GL_ALL_BARRIER_BITS); + glFinish(); - glBindBuffer(GL_SHADER_STORAGE_BUFFER, ann->gloutput); - data = (GLfloat*)glMapBufferRange(GL_SHADER_STORAGE_BUFFER, 0, ann->num_output * sizeof(GLfloat), GL_MAP_READ_BIT); for(i = 0; i != ann->num_output; i++) - ann->output[i] = data[i]; - glUnmapBuffer(GL_SHADER_STORAGE_BUFFER); - - glDeleteBuffers(1, &ann->glinput); - glDeleteBuffers(1, &ann->gloutput); - free(glinput); + ann->output[i] = ann->gloutputdata[i]; } #endif return ann->output; diff --git a/src/fann_train.c b/src/fann_train.c index 19c86cd7..2aa9b9da 100644 --- a/src/fann_train.c +++ b/src/fann_train.c @@ -121,32 +121,17 @@ if (ann->gl == 0) { ann->MSE_value += error * error; } - GLfloat *glinput = malloc(sizeof(GLfloat) * ann->num_input); for (i = 0; i < ann->num_input; i++) - glinput[i] = input[i]; - glGenBuffers(1, &ann->glinput); + ann->glinputdata[i] = input[i]; - glBindBuffer(GL_SHADER_STORAGE_BUFFER, ann->glinput); - glBufferData(GL_SHADER_STORAGE_BUFFER, ann->num_input * sizeof(GLfloat), glinput, GL_DYNAMIC_COPY); - glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 4, ann->glinput); - - GLfloat *gloutput = malloc(sizeof(GLfloat) * ann->num_output); for (i = 0; i < ann->num_output; i++) - gloutput[i] = desired_output[i]; - glGenBuffers(1, &ann->gloutput); - - glBindBuffer(GL_SHADER_STORAGE_BUFFER, ann->gloutput); - glBufferData(GL_SHADER_STORAGE_BUFFER, ann->num_output * sizeof(GLfloat), gloutput, GL_DYNAMIC_COPY); - glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 5, ann->gloutput); + ann->gloutputdata[i] = desired_output[i]; + glFinish(); glUseProgram(ann->trainShaderProgram); glDispatchCompute(1, 1, 1); - glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); - - glDeleteBuffers(1, &ann->glinput); - glDeleteBuffers(1, &ann->gloutput); - free(glinput); - free(gloutput); + glMemoryBarrier(GL_ALL_BARRIER_BITS); + glFinish(); } #endif } diff --git a/src/include/fann_data.h b/src/include/fann_data.h index 8efdbeb2..9e7dd6ed 100644 --- a/src/include/fann_data.h +++ b/src/include/fann_data.h @@ -797,6 +797,9 @@ struct fann GLuint glnetwork; GLuint glinput; GLuint gloutput; + + GLfloat *glinputdata; + GLfloat *gloutputdata; #endif /* PLAN9 */ fann_type *values; From 012c67253cc6eced52d07d86c01aeef10e991ebe Mon Sep 17 00:00:00 2001 From: Eli Date: Fri, 10 Feb 2023 18:23:48 -0800 Subject: [PATCH 21/26] glMemoryBarrier --- src/fann.c | 2 +- src/fann_train.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/fann.c b/src/fann.c index 331e26be..735cf3ca 100644 --- a/src/fann.c +++ b/src/fann.c @@ -1228,7 +1228,7 @@ if (ann->gl == 0) { glFinish(); glUseProgram(ann->runShaderProgram); glDispatchCompute(1, 1, 1); - glMemoryBarrier(GL_ALL_BARRIER_BITS); + glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); glFinish(); for(i = 0; i != ann->num_output; i++) diff --git a/src/fann_train.c b/src/fann_train.c index 2aa9b9da..4db775b5 100644 --- a/src/fann_train.c +++ b/src/fann_train.c @@ -130,7 +130,7 @@ if (ann->gl == 0) { glFinish(); glUseProgram(ann->trainShaderProgram); glDispatchCompute(1, 1, 1); - glMemoryBarrier(GL_ALL_BARRIER_BITS); + glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); glFinish(); } #endif From 058bfc8528c825ffef8f3e51f4af7481ed33990b Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Wed, 22 Feb 2023 09:10:06 -0800 Subject: [PATCH 22/26] NVIDIA RTX --- src/fann.c | 171 ++++++++++++++++++++++----------------------- src/fann_train.c | 18 +++-- src/include/fann.h | 2 + 3 files changed, 99 insertions(+), 92 deletions(-) diff --git a/src/fann.c b/src/fann.c index 735cf3ca..71968f23 100644 --- a/src/fann.c +++ b/src/fann.c @@ -95,16 +95,15 @@ static const char* runShader = "#version 310 es\n" " for (l = 1; l < layers; l++) {\n" " inputs = int(network.e[l]);\n" " outputs = int(network.e[l+1]);\n" - " for (o = idx; o < outputs; o += threads)\n" - " errors.e[o] = 0.0;\n" + " if (idx == 0)\n" + " values.e[total_neurons + inputs] = 1.0;\n" " barrier();\n" - " values.e[total_neurons + inputs] = 1.0;\n" " for (o = idx; o < outputs; o += threads) {\n" + " errors.e[o] = 0.0;\n" " n = o * inputs + o;\n" " for (i = 0; i <= inputs; i++)\n" " errors.e[o] += values.e[total_neurons + i] * weights.e[total_weights + n + i];\n" " }\n" - " barrier();\n" " total_neurons += inputs + 1;\n" " for (o = idx; o < outputs; o += threads) {\n" " errors.e[o] *= 0.5;\n" @@ -121,24 +120,20 @@ static const char* runShader = "#version 310 es\n" " }\n" " inputs = int(network.e[layers]);\n" " outputs = int(network.e[layers+1]);\n" - " for (o = idx; o < outputs; o += threads)\n" - " errors.e[o] = 0.0;\n" + " if (idx == 0)\n" + " values.e[total_neurons + inputs] = 1.0;\n" " barrier();\n" - " values.e[total_neurons + inputs] = 1.0;\n" " for (o = idx; o < outputs; o += threads) {\n" + " errors.e[o] = 0.0;\n" " n = o * inputs + o;\n" " for (i = 0; i <= inputs; i++)\n" " errors.e[o] += values.e[total_neurons + i] * weights.e[total_weights + n + i];\n" - " }\n" - " barrier();\n" - " total_neurons += inputs + 1;\n" - " for (o = idx; o < outputs; o += threads) {\n" " if (errors.e[o] > 600.0)\n" " errors.e[o] = 600.0;\n" " else if (errors.e[o] < -600.0)\n" " errors.e[o] = -600.0;\n" - " values.e[total_neurons + o] = (1.0/(1.0 + exp(-errors.e[o])));\n" - " output_data.e[o] = values.e[total_neurons + o];\n" + " values.e[total_neurons + inputs + 1 + o] = (1.0/(1.0 + exp(-errors.e[o])));\n" + " output_data.e[o] = values.e[total_neurons + inputs + 1 + o];\n" " }\n" " barrier();\n" "}\n"; @@ -182,7 +177,6 @@ static const char* trainShader = "#version 310 es\n" " inputs = int(network.e[1]);\n" " for (i = idx; i < inputs; i += threads)\n" " values.e[i] = input_data.e[i];\n" - " values.e[inputs] = 1.0;\n" " barrier();\n" " total_neurons = 0;\n" " total_weights = 0;\n" @@ -207,14 +201,10 @@ static const char* trainShader = "#version 310 es\n" " outputs = int(network.e[l]);\n" " inputs = int(network.e[l-1]);\n" " neuron_prev = total_neurons - inputs - 1;\n" - " for (i = idx; i <= inputs; i += threads)\n" + " for (i = idx; i <= inputs; i += threads) {\n" " errors.e[neuron_prev + i] = 0.0;\n" - " barrier();\n" - " for (i = idx; i < inputs; i += threads)\n" " for (o = 0; o < outputs; o++)\n" " errors.e[neuron_prev + i] += errors.e[total_neurons + o] * weights.e[total_weights + o * inputs + o + i];\n" - " barrier();\n" - " for (i = idx; i < inputs; i += threads) {\n" " errors.e[neuron_prev + i] *= 0.5;\n" " if (values.e[neuron_prev + i] < 0.0)\n" " errors.e[neuron_prev + i] *= 0.01;\n" @@ -242,7 +232,7 @@ static const char* trainShader = "#version 310 es\n" "}\n"; void fann_init_egl(void) { - int32_t fd = open ("/dev/dri/card0", O_RDWR); + int32_t fd = open ("/dev/dri/card1", O_RDWR); if (fd <= 0) exit(-3); @@ -900,15 +890,6 @@ FANN_EXTERNAL fann_type *FANN_API fann_run(struct fann * ann, fann_type * input) struct fann_layer *layer_it, *last_layer; unsigned int activation_function; fann_type steepness; -#ifndef PLAN9 - GLenum err; - GLfloat *data; - GLfloat *glvalues; - GLfloat *glweights; - int nparameters; - GLfloat *parameters; -#endif /* PLAN9 */ - /* store some variabels local for fast access */ struct fann_neuron *first_neuron = ann->first_layer->first_neuron; @@ -1160,64 +1141,7 @@ if (ann->gl == 0) { #ifndef PLAN9 } else { if (ann->onGPU == 0) { - glGenBuffers(1, &ann->glnetwork); - - nparameters = 1; - nparameters += (int)(ann->last_layer - ann->first_layer); - parameters = calloc(sizeof(GLfloat), nparameters); - parameters[0] = nparameters - 1; - for(i = 1, layer_it = ann->first_layer; layer_it != ann->last_layer; layer_it++, i++) - parameters[i] = (int)(layer_it->last_neuron - layer_it->first_neuron) - 1; - - glBindBuffer(GL_SHADER_STORAGE_BUFFER, ann->glnetwork); - glBufferData(GL_SHADER_STORAGE_BUFFER, nparameters * sizeof(GLfloat), parameters, GL_DYNAMIC_COPY); - glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, ann->glnetwork); - - glGenBuffers(1, &ann->glweights); - - glweights = calloc(sizeof(GLfloat), ann->total_connections); - for (i = 0; i != ann->total_connections; i++) - glweights[i] = ann->weights[i]; - - glBindBuffer(GL_SHADER_STORAGE_BUFFER, ann->glweights); - glBufferData(GL_SHADER_STORAGE_BUFFER, ann->total_connections * sizeof(GLfloat), glweights, GL_DYNAMIC_COPY); - glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, ann->glweights); - - free(glweights); - - glGenBuffers(1, &ann->glvalues); - - glvalues = calloc(sizeof(GLfloat), ann->total_neurons); - for (i = 0; i != ann->total_neurons; i++) - glvalues[i] = ann->values[i]; - - glBindBuffer(GL_SHADER_STORAGE_BUFFER, ann->glvalues); - glBufferData(GL_SHADER_STORAGE_BUFFER, ann->total_neurons * sizeof(GLfloat), glvalues, GL_DYNAMIC_COPY); - glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, ann->glvalues); - - free(glvalues); - - glGenBuffers(1, &ann->glerrors); - - glBindBuffer(GL_SHADER_STORAGE_BUFFER, ann->glerrors); - glBufferData(GL_SHADER_STORAGE_BUFFER, ann->total_neurons * sizeof(GLfloat), NULL, GL_DYNAMIC_COPY); - glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 3, ann->glerrors); - - glGenBuffers(1, &ann->glinput); - - glBindBuffer(GL_SHADER_STORAGE_BUFFER, ann->glinput); - glBufferStorage(GL_SHADER_STORAGE_BUFFER, ann->num_input * sizeof(GLfloat), NULL, GL_MAP_WRITE_BIT|GL_MAP_PERSISTENT_BIT|GL_MAP_COHERENT_BIT); - glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 4, ann->glinput); - glBindBuffer(GL_SHADER_STORAGE_BUFFER, ann->glinput); - ann->glinputdata = (GLfloat*)glMapBufferRange(GL_SHADER_STORAGE_BUFFER, 0, ann->num_input * sizeof(GLfloat), GL_MAP_WRITE_BIT|GL_MAP_COHERENT_BIT|GL_MAP_PERSISTENT_BIT); - - glGenBuffers(1, &ann->gloutput); - - glBindBuffer(GL_SHADER_STORAGE_BUFFER, ann->gloutput); - glBufferStorage(GL_SHADER_STORAGE_BUFFER, ann->num_output * sizeof(GLfloat), NULL, GL_MAP_READ_BIT|GL_MAP_WRITE_BIT|GL_MAP_PERSISTENT_BIT|GL_MAP_COHERENT_BIT); - glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 5, ann->gloutput); - glBindBuffer(GL_SHADER_STORAGE_BUFFER, ann->gloutput); - ann->gloutputdata = (GLfloat*)glMapBufferRange(GL_SHADER_STORAGE_BUFFER, 0, ann->num_output * sizeof(GLfloat), GL_MAP_READ_BIT|GL_MAP_WRITE_BIT|GL_MAP_COHERENT_BIT|GL_MAP_PERSISTENT_BIT); + fann_init_gpu(ann); ann->onGPU = 1; } @@ -1228,7 +1152,7 @@ if (ann->gl == 0) { glFinish(); glUseProgram(ann->runShaderProgram); glDispatchCompute(1, 1, 1); - glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); + glMemoryBarrier(GL_ALL_BARRIER_BITS); glFinish(); for(i = 0; i != ann->num_output; i++) @@ -1252,6 +1176,77 @@ FANN_EXTERNAL void FANN_API fann_from_gpu(struct fann *ann) glUnmapBuffer(GL_SHADER_STORAGE_BUFFER); } } + +FANN_EXTERNAL void FANN_API fann_init_gpu(struct fann *ann) +{ + GLfloat *data; + GLfloat *glvalues; + GLfloat *glweights; + int nparameters; + GLfloat *parameters; + int i; + struct fann_layer *layer_it; + + glGenBuffers(1, &ann->glnetwork); + + nparameters = 1; + nparameters += (int)(ann->last_layer - ann->first_layer); + parameters = calloc(sizeof(GLfloat), nparameters); + parameters[0] = nparameters - 1; + for(i = 1, layer_it = ann->first_layer; layer_it != ann->last_layer; layer_it++, i++) + parameters[i] = (int)(layer_it->last_neuron - layer_it->first_neuron) - 1; + + glBindBuffer(GL_SHADER_STORAGE_BUFFER, ann->glnetwork); + glBufferData(GL_SHADER_STORAGE_BUFFER, nparameters * sizeof(GLfloat), parameters, GL_DYNAMIC_COPY); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, ann->glnetwork); + + glGenBuffers(1, &ann->glweights); + + glweights = calloc(sizeof(GLfloat), ann->total_connections); + for (i = 0; i != ann->total_connections; i++) + glweights[i] = ann->weights[i]; + + glBindBuffer(GL_SHADER_STORAGE_BUFFER, ann->glweights); + glBufferData(GL_SHADER_STORAGE_BUFFER, ann->total_connections * sizeof(GLfloat), glweights, GL_DYNAMIC_COPY); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, ann->glweights); + + free(glweights); + + glGenBuffers(1, &ann->glvalues); + + glvalues = calloc(sizeof(GLfloat), ann->total_neurons); + for (i = 0; i != ann->total_neurons; i++) + glvalues[i] = ann->values[i]; + + glBindBuffer(GL_SHADER_STORAGE_BUFFER, ann->glvalues); + glBufferData(GL_SHADER_STORAGE_BUFFER, ann->total_neurons * sizeof(GLfloat), glvalues, GL_DYNAMIC_COPY); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, ann->glvalues); + + free(glvalues); + + glGenBuffers(1, &ann->glerrors); + + glBindBuffer(GL_SHADER_STORAGE_BUFFER, ann->glerrors); + glBufferData(GL_SHADER_STORAGE_BUFFER, ann->total_neurons * sizeof(GLfloat), NULL, GL_DYNAMIC_COPY); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 3, ann->glerrors); + + glGenBuffers(1, &ann->glinput); + + glBindBuffer(GL_SHADER_STORAGE_BUFFER, ann->glinput); + glBufferStorage(GL_SHADER_STORAGE_BUFFER, ann->num_input * sizeof(GLfloat), NULL, GL_MAP_WRITE_BIT|GL_MAP_PERSISTENT_BIT|GL_MAP_COHERENT_BIT); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 4, ann->glinput); + glBindBuffer(GL_SHADER_STORAGE_BUFFER, ann->glinput); + ann->glinputdata = (GLfloat*)glMapBufferRange(GL_SHADER_STORAGE_BUFFER, 0, ann->num_input * sizeof(GLfloat), GL_MAP_WRITE_BIT|GL_MAP_COHERENT_BIT|GL_MAP_PERSISTENT_BIT); + + glGenBuffers(1, &ann->gloutput); + + glBindBuffer(GL_SHADER_STORAGE_BUFFER, ann->gloutput); + glBufferStorage(GL_SHADER_STORAGE_BUFFER, ann->num_output * sizeof(GLfloat), NULL, GL_MAP_READ_BIT|GL_MAP_WRITE_BIT|GL_MAP_PERSISTENT_BIT|GL_MAP_COHERENT_BIT); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 5, ann->gloutput); + glBindBuffer(GL_SHADER_STORAGE_BUFFER, ann->gloutput); + ann->gloutputdata = (GLfloat*)glMapBufferRange(GL_SHADER_STORAGE_BUFFER, 0, ann->num_output * sizeof(GLfloat), GL_MAP_READ_BIT|GL_MAP_WRITE_BIT|GL_MAP_COHERENT_BIT|GL_MAP_PERSISTENT_BIT); + +} #endif /* PLAN9 */ FANN_EXTERNAL void FANN_API fann_destroy(struct fann *ann) diff --git a/src/fann_train.c b/src/fann_train.c index 4db775b5..e5d5dbbf 100644 --- a/src/fann_train.c +++ b/src/fann_train.c @@ -114,11 +114,12 @@ if (ann->gl == 0) { #ifndef PLAN9 } else { int i; - fann_type error; + fann_type err; + GLfloat *errors; for (i = 0; i < ann->num_output; i++) { - error = desired_output[i] - ann->output[i]; - ann->MSE_value += error * error; + err = desired_output[i] - ann->output[i]; + ann->MSE_value += err * err; } for (i = 0; i < ann->num_input; i++) @@ -130,8 +131,15 @@ if (ann->gl == 0) { glFinish(); glUseProgram(ann->trainShaderProgram); glDispatchCompute(1, 1, 1); - glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); + glMemoryBarrier(GL_ALL_BARRIER_BITS); glFinish(); + +/* glBindBuffer(GL_SHADER_STORAGE_BUFFER, ann->glerrors); + errors = (GLfloat*)glMapBufferRange(GL_SHADER_STORAGE_BUFFER, 0, ann->total_neurons * sizeof(GLfloat), GL_MAP_READ_BIT); + for (i = 0; i < ann->num_output; i++) + fprintf(stderr, "%0.10f ", errors[ann->total_neurons - ann->num_output - 1 + i]); + fprintf(stderr, "\n"); + glUnmapBuffer(GL_SHADER_STORAGE_BUFFER); */ } #endif } @@ -310,11 +318,13 @@ void fann_compute_MSE(struct fann *ann, fann_type * desired_output) *error_it = fann_activation_derived(last_layer_begin->activation_function, last_layer_begin->activation_steepness, neuron_value, last_layer_begin->sum) * neuron_diff; +// fprintf(stderr, "%0.10f ", *error_it); desired_output++; error_it++; ann->num_MSE++; } +// fprintf(stderr, "\n"); } /* INTERNAL FUNCTION diff --git a/src/include/fann.h b/src/include/fann.h index 39d62544..82289b24 100644 --- a/src/include/fann.h +++ b/src/include/fann.h @@ -636,6 +636,8 @@ FANN_EXTERNAL void FANN_API fann_enable_seed_rand(); FANN_EXTERNAL void FANN_API fann_from_gpu(struct fann *ann); +FANN_EXTERNAL void FANN_API fann_init_gpu(struct fann *ann); + #endif /* PLAN9 */ #ifdef FIXEDFANN From 1c41188099994f2652ce8f25edbde11cd74ca8fe Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Wed, 22 Feb 2023 09:12:38 -0800 Subject: [PATCH 23/26] gpu test --- tests/gputest.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 tests/gputest.c diff --git a/tests/gputest.c b/tests/gputest.c new file mode 100644 index 00000000..c752418e --- /dev/null +++ b/tests/gputest.c @@ -0,0 +1,59 @@ +#include + +void +fanntest(struct fann *ann, fann_type *input, fann_type *output, fann_type *desired_output, int gl) +{ + double a, b; + struct timeval now; + int o; + + ann->gl = gl; + + gettimeofday(&now, NULL); + b = now.tv_sec * 1000000; + b += now.tv_usec; + + fann_reset_MSE(ann); + fann_train(ann, input, desired_output); +// fann_run(ann, input); + + gettimeofday(&now, NULL); + a = now.tv_sec * 1000000; + a += now.tv_usec; + + fprintf(stderr, "%cPU: %f microseconds MSE: %0.10lf\n", gl? 'G': 'C', a - b, ann->MSE_value); +} + +int +main(int argc, char **argv) +{ + fann_type *input; + fann_type *output; + fann_type *desired_output; + struct fann *ann; + int i; + + if (argc < 2) + return -1; + + i = atoi(argv[1]); + + ann = fann_create_standard(5, i, i, i, i, i); + fann_set_activation_function_hidden(ann, FANN_LINEAR_PIECE_LEAKY); + fann_set_activation_function_output(ann, FANN_SIGMOID); + input = calloc(sizeof(fann_type), ann->num_input); + desired_output = calloc(sizeof(fann_type), ann->num_output); + + for (i = 0; i < ann->num_output; i++) + desired_output[i] = 0.73; + + fann_print_parameters(ann); + + for (i = 0; i < 10; i++) + fanntest(ann, input, output, desired_output, 1); + for (i = 0; i < 10; i++) + fanntest(ann, input, output, desired_output, 0); + + return 0; +} + From 95d000dbc54e122e95c8f9f4e401055909488675 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Wed, 22 Feb 2023 09:30:57 -0800 Subject: [PATCH 24/26] is this a problem with GL work group sizes? --- src/fann.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/fann.c b/src/fann.c index 71968f23..83a89a60 100644 --- a/src/fann.c +++ b/src/fann.c @@ -53,7 +53,7 @@ #ifndef PLAN9 static const char* runShader = "#version 310 es\n" "precision highp float;\n" - "layout(local_size_x = %d) in;\n" + "layout(local_size_x = %d, local_size_y = 1, local_size_z = 1) in;\n" "layout(std430) buffer;\n" "layout(binding = 0) buffer Network\n" "{\n" @@ -140,7 +140,7 @@ static const char* runShader = "#version 310 es\n" static const char* trainShader = "#version 310 es\n" "precision highp float;\n" - "layout(local_size_x = %d) in;\n" + "layout(local_size_x = %d, local_size_y = 1, local_size_z = 1) in;\n" "layout(std430) buffer;\n" "layout(binding = 0) buffer Network\n" "{\n" From 411c9cd378caa92e6e991308ce859240dc9f8f30 Mon Sep 17 00:00:00 2001 From: Eli Date: Thu, 9 Mar 2023 20:10:19 -0800 Subject: [PATCH 25/26] blah --- src/fann.c | 24 +++++++++++++----------- src/fann_train.c | 8 +------- tests/gputest.c | 15 +++++++++++---- 3 files changed, 25 insertions(+), 22 deletions(-) diff --git a/src/fann.c b/src/fann.c index 83a89a60..f86221d6 100644 --- a/src/fann.c +++ b/src/fann.c @@ -82,7 +82,7 @@ static const char* runShader = "#version 310 es\n" "void main()\n" "{\n" " int idx = int(gl_LocalInvocationID.x);\n" - " int threads = %d;\n" + " int threads = int(gl_WorkGroupSize.x);\n" " int layers;\n" " int i, o, n, inputs, outputs, l, total_neurons, total_weights;\n" " layers = int(network.e[0]) - 1;\n" @@ -169,15 +169,12 @@ static const char* trainShader = "#version 310 es\n" "void main()\n" "{\n" " int idx = int(gl_LocalInvocationID.x);\n" - " int threads = %d;\n" + " int threads = int(gl_WorkGroupSize.x);\n" " int layers;\n" - " int i, o, l, total_neurons, total_weights, outputs, inputs, neuron_prev;\n" + " int i, o, l, n, total_neurons, total_weights, outputs, inputs, neuron_prev;\n" " float neuron_diff, tmp_error;\n" " layers = int(network.e[0]);\n" " inputs = int(network.e[1]);\n" - " for (i = idx; i < inputs; i += threads)\n" - " values.e[i] = input_data.e[i];\n" - " barrier();\n" " total_neurons = 0;\n" " total_weights = 0;\n" " for (l = 1; l < layers; l++) {\n" @@ -201,7 +198,7 @@ static const char* trainShader = "#version 310 es\n" " outputs = int(network.e[l]);\n" " inputs = int(network.e[l-1]);\n" " neuron_prev = total_neurons - inputs - 1;\n" - " for (i = idx; i <= inputs; i += threads) {\n" + " for (i = idx; i < inputs; i += threads) {\n" " errors.e[neuron_prev + i] = 0.0;\n" " for (o = 0; o < outputs; o++)\n" " errors.e[neuron_prev + i] += errors.e[total_neurons + o] * weights.e[total_weights + o * inputs + o + i];\n" @@ -221,8 +218,9 @@ static const char* trainShader = "#version 310 es\n" " inputs = int(network.e[l-1]);\n" " for (o = idx; o < outputs; o += threads) {\n" " tmp_error = errors.e[total_neurons + o] * 0.7;\n" + " n = o * inputs + o;\n" " for (i = 0; i <= inputs; i++)\n" - " weights.e[total_weights + o * inputs + o + i] += tmp_error * values.e[neuron_prev + i];\n" + " weights.e[total_weights + n + i] += tmp_error * values.e[neuron_prev + i];\n" " }\n" " barrier();\n" " neuron_prev = total_neurons;\n" @@ -232,7 +230,7 @@ static const char* trainShader = "#version 310 es\n" "}\n"; void fann_init_egl(void) { - int32_t fd = open ("/dev/dri/card1", O_RDWR); + int32_t fd = open ("/dev/dri/card0", O_RDWR); if (fd <= 0) exit(-3); @@ -1140,6 +1138,8 @@ if (ann->gl == 0) { } #ifndef PLAN9 } else { + GLenum err; + if (ann->onGPU == 0) { fann_init_gpu(ann); @@ -1200,6 +1200,8 @@ FANN_EXTERNAL void FANN_API fann_init_gpu(struct fann *ann) glBufferData(GL_SHADER_STORAGE_BUFFER, nparameters * sizeof(GLfloat), parameters, GL_DYNAMIC_COPY); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, ann->glnetwork); +// free(parameters); + glGenBuffers(1, &ann->glweights); glweights = calloc(sizeof(GLfloat), ann->total_connections); @@ -1210,7 +1212,7 @@ FANN_EXTERNAL void FANN_API fann_init_gpu(struct fann *ann) glBufferData(GL_SHADER_STORAGE_BUFFER, ann->total_connections * sizeof(GLfloat), glweights, GL_DYNAMIC_COPY); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, ann->glweights); - free(glweights); +// free(glweights); glGenBuffers(1, &ann->glvalues); @@ -1222,7 +1224,7 @@ FANN_EXTERNAL void FANN_API fann_init_gpu(struct fann *ann) glBufferData(GL_SHADER_STORAGE_BUFFER, ann->total_neurons * sizeof(GLfloat), glvalues, GL_DYNAMIC_COPY); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, ann->glvalues); - free(glvalues); +// free(glvalues); glGenBuffers(1, &ann->glerrors); diff --git a/src/fann_train.c b/src/fann_train.c index e5d5dbbf..19e4d738 100644 --- a/src/fann_train.c +++ b/src/fann_train.c @@ -116,6 +116,7 @@ if (ann->gl == 0) { int i; fann_type err; GLfloat *errors; + GLenum glerr; for (i = 0; i < ann->num_output; i++) { err = desired_output[i] - ann->output[i]; @@ -133,13 +134,6 @@ if (ann->gl == 0) { glDispatchCompute(1, 1, 1); glMemoryBarrier(GL_ALL_BARRIER_BITS); glFinish(); - -/* glBindBuffer(GL_SHADER_STORAGE_BUFFER, ann->glerrors); - errors = (GLfloat*)glMapBufferRange(GL_SHADER_STORAGE_BUFFER, 0, ann->total_neurons * sizeof(GLfloat), GL_MAP_READ_BIT); - for (i = 0; i < ann->num_output; i++) - fprintf(stderr, "%0.10f ", errors[ann->total_neurons - ann->num_output - 1 + i]); - fprintf(stderr, "\n"); - glUnmapBuffer(GL_SHADER_STORAGE_BUFFER); */ } #endif } diff --git a/tests/gputest.c b/tests/gputest.c index c752418e..741ac811 100644 --- a/tests/gputest.c +++ b/tests/gputest.c @@ -1,4 +1,6 @@ #include +#include +#include void fanntest(struct fann *ann, fann_type *input, fann_type *output, fann_type *desired_output, int gl) @@ -15,7 +17,6 @@ fanntest(struct fann *ann, fann_type *input, fann_type *output, fann_type *desir fann_reset_MSE(ann); fann_train(ann, input, desired_output); -// fann_run(ann, input); gettimeofday(&now, NULL); a = now.tv_sec * 1000000; @@ -32,6 +33,7 @@ main(int argc, char **argv) fann_type *desired_output; struct fann *ann; int i; + GLfloat *data; if (argc < 2) return -1; @@ -44,15 +46,20 @@ main(int argc, char **argv) input = calloc(sizeof(fann_type), ann->num_input); desired_output = calloc(sizeof(fann_type), ann->num_output); + srand(time(NULL)); + + for (i = 0; i < ann->num_input; i++) + input[i] = ((float)rand()/RAND_MAX)-0.5; + for (i = 0; i < ann->num_output; i++) - desired_output[i] = 0.73; + desired_output[i] = ((float)rand()/RAND_MAX)-0.5; fann_print_parameters(ann); - for (i = 0; i < 10; i++) + for (i = 0; i < 10; i++) { fanntest(ann, input, output, desired_output, 1); - for (i = 0; i < 10; i++) fanntest(ann, input, output, desired_output, 0); + } return 0; } From 9483fbaa1822066f8d0f9d8fe9f28d012a741699 Mon Sep 17 00:00:00 2001 From: Eli Date: Wed, 15 Mar 2023 15:06:53 -0700 Subject: [PATCH 26/26] xor test --- tests/xortest.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 tests/xortest.c diff --git a/tests/xortest.c b/tests/xortest.c new file mode 100644 index 00000000..195c2b19 --- /dev/null +++ b/tests/xortest.c @@ -0,0 +1,27 @@ +#include + +int +main() { + int i; + struct fann *ann = fann_create_standard(3, 2, 5, 1); + fann_type input[4][2] = { + { 0.0, 0.0 }, + { 1.0, 0.0 }, + { 0.0, 1.0 }, + { 1.0, 1.0 } + }; + fann_type output[4][1] = { + { 0.0 }, + { 1.0 }, + { 1.0 }, + { 0.0 } + }; + + do { + fann_reset_MSE(ann); + for (i = 0; i < 4; i++) + fann_train(ann, input[i], output[i]); + } while (ann->MSE_value > 0.001); + + fprintf(stderr, "MSE: %f\n", ann->MSE_value); +}