From 4834086d1c7e282613220ce46d1ca8018ab9927f Mon Sep 17 00:00:00 2001
From: Marek Otahal <markotahal@gmail.com>
Date: Thu, 11 Jul 2019 22:18:24 +0200
Subject: [PATCH 1/6] Parallel: link with TBB when compiling with g++ >= 9

---
 CommonCompilerConfig.cmake | 1 +
 1 file changed, 1 insertion(+)
diff --git a/CommonCompilerConfig.cmake b/CommonCompilerConfig.cmake
index 39a2813f50..41af4c570e 100644
--- a/CommonCompilerConfig.cmake
+++ b/CommonCompilerConfig.cmake
@@ -100,6 +100,7 @@ set(boost_required ON)
 if(NOT FORCE_CPP11)
   if(${CMAKE_CXX_COMPILER_ID} STREQUAL "GNU")
     if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL "9")
+	 set(extra_lib_for_filesystem "tbb") #TODO: extra_lib_for_filesystem is hijacked here to link TBB, which g++-9 needs for the C++17 parallel algorithms (<execution>)
          set(CMAKE_CXX_STANDARD 17)
 	 set(boost_required OFF)
     elseif(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL "8")

From 2e0760b6c5be6e052debb971209369033f632161 Mon Sep 17 00:00:00 2001
From: Marek Otahal <markotahal@gmail.com>
Date: Thu, 11 Jul 2019 22:18:50 +0200
Subject: [PATCH 2/6] Parallel: WIP parallel demo on MNIST

---
 src/examples/mnist/MNIST_SP.cpp | 43 +++++++++++++++++++++++++++------
 1 file changed, 35 insertions(+), 8 deletions(-)

diff --git a/src/examples/mnist/MNIST_SP.cpp b/src/examples/mnist/MNIST_SP.cpp
index 35c46fae1b..0144bd38eb 100644
--- a/src/examples/mnist/MNIST_SP.cpp
+++ b/src/examples/mnist/MNIST_SP.cpp
@@ -36,6 +36,9 @@
 #include <mnist/mnist_reader.hpp> // MNIST data itself + read methods, namespace mnist::
 #include <mnist/mnist_utils.hpp>  // mnist::binarize_dataset
 
+#include <execution>
+#include <tbb/parallel_for.h>
+#include <mutex>
 
 using namespace std;
 using namespace htm;
@@ -67,12 +70,12 @@ class MNIST {
 
   public:
     UInt verbosity = 1;
-    const UInt train_dataset_iterations = 2u; //epochs somewhat help, at linear time
+    const UInt train_dataset_iterations = 5u; //epochs somewhat help, at linear time
 
 
 void setup() {
 
-  input.initialize({28, 28,1}); 
+  input.initialize({28, 28, 1}); 
   columns.initialize({28, 28, 8}); //1D vs 2D no big difference, 2D seems more natural for the problem. Speed-----, Results+++++++++; #columns HIGHEST impact. 
   sp.initialize(
     /* inputDimensions */             input.dimensions,
@@ -83,12 +86,12 @@ void setup() {
     /* localAreaDensity */            0.1f,  // % active bits
     /* numActiveColumnsPerInhArea */  -1,
     /* stimulusThreshold */           6u,
-    /* synPermInactiveDec */          0.002f, //FIXME inactive decay permanence plays NO role, investigate! (slightly better w/o it)
+    /* synPermInactiveDec */          0.002f, //very low values better for MNIST
     /* synPermActiveInc */            0.14f, //takes upto 5x steps to get dis/connected
     /* synPermConnected */            0.5f, //no difference, let's leave at 0.5 in the middle
     /* minPctOverlapDutyCycles */     0.2f, //speed of re-learning?
     /* dutyCyclePeriod */             1402,
-    /* boostStrength */               2.0f, // Boosting does help, but entropy is high, on MNIST it does not matter, for learning with TM prefer boosting off (=0.0), or "neutral"=1.0
+    /* boostStrength */               12.0f, // Boosting does help, but entropy is high, on MNIST it does not matter, for learning with TM prefer boosting off (BOOSTING_DISABLED), or "neutral"=1.0
     /* seed */                        4u,
     /* spVerbosity */                 1u,
     /* wrapAround */                  true); // does not matter (helps slightly)
@@ -126,17 +129,41 @@ void train() {
     }
     Random().shuffle( index.begin(), index.end() );
 
-    for(const auto idx : index) { // index = order of label (shuffeled)
+
+
+    //parallel loop with TBB
+    std::mutex m;
+
+    tbb::parallel_for( tbb::blocked_range<size_t>(0, index.size()),
+                       [&](tbb::blocked_range<size_t> r) {
+//    for(size_t i=0; i< index.size(); i++) { // index = order of label (shuffeled)
+      for(auto i = r.begin(); i < r.end(); ++i) {
+
+      const auto idx = index[i];
       // Get the input & label
       const auto image = dataset.training_images.at(idx);
       const UInt label  = dataset.training_labels.at(idx);
 
       // Compute & Train
-      input.setDense( image );
-      sp.compute(input, true, columns);
-      clsr.learn( columns, {label} );
+      SDR Pinput(input.dimensions);
+      Pinput.setDense( image );
+
+      SDR Pcolumns({28,28,8});
+      sp.compute(Pinput, true, Pcolumns); //TODO change to return output?
+      //TODO make compute() const for parallelization? 
+      
+      // sync this
+      m.lock();
+      clsr.learn( Pcolumns, {label} );
+      m.unlock();
+
       if( verbosity && (++i % 1000 == 0) ) cout << "." << flush;
     }
+    }); // !end of lambda
+
+
+
+
     if( verbosity ) cout << endl;
   
   cout << "epoch ended" << endl;

From b27b40c07d4775a8995f65c36d6f9cfcd0e9e95e Mon Sep 17 00:00:00 2001
From: Marek Otahal <markotahal@gmail.com>
Date: Fri, 12 Jul 2019 14:51:45 +0200
Subject: [PATCH 3/6] MNIST: provide both single, parallel version

---
 src/CMakeLists.txt                       |  23 +++
 src/examples/mnist/MNIST_SP.cpp          |  41 +----
 src/examples/mnist/MNIST_SP_parallel.cpp | 225 +++++++++++++++++++++++
 3 files changed, 252 insertions(+), 37 deletions(-)
 create mode 100644 src/examples/mnist/MNIST_SP_parallel.cpp

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 8f589ab5bf..5671397d23 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -393,6 +393,29 @@ add_custom_target(mnist
                   COMMENT "Executing ${src_executable_mnistsp}"
                   VERBATIM)
 
+#########################################################
+## MNIST Spatial Pooler (Parallel Example)
+#
+set(src_executable_mnistsp_par mnist_sp_par)
+add_executable(${src_executable_mnistsp_par} examples/mnist/MNIST_SP_parallel.cpp)
+target_link_libraries(${src_executable_mnistsp_par}
+  ${INTERNAL_LINKER_FLAGS} #par uses tbb
+  ${core_library}
+  ${COMMON_OS_LIBS}
+)
+target_compile_options(    ${src_executable_mnistsp_par} PUBLIC ${INTERNAL_CXX_FLAGS})
+target_compile_definitions(${src_executable_mnistsp_par} PRIVATE ${COMMON_COMPILER_DEFINITIONS})
+# Pass MNIST data directory to main.cpp
+target_compile_definitions(${src_executable_mnistsp_par} PRIVATE MNIST_DATA_LOCATION=${mnist_SOURCE_DIR})
+target_include_directories(${src_executable_mnistsp_par} PRIVATE
+  ${CORE_LIB_INCLUDES}
+  ${EXTERNAL_INCLUDES}
+)
+add_custom_target(mnist_parallel
+  COMMAND ${src_executable_mnistsp_par}
+  DEPENDS ${src_executable_mnistsp_par}
+  COMMENT "Executing ${src_executable_mnistsp_par}"
+  VERBATIM)
 
 ############ INSTALL ######################################
 #
diff --git a/src/examples/mnist/MNIST_SP.cpp b/src/examples/mnist/MNIST_SP.cpp
index bd34f6ddfd..5d2196c259 100644
--- a/src/examples/mnist/MNIST_SP.cpp
+++ b/src/examples/mnist/MNIST_SP.cpp
@@ -36,9 +36,6 @@
 #include <mnist/mnist_reader.hpp> // MNIST data itself + read methods, namespace mnist::
 #include <mnist/mnist_utils.hpp>  // mnist::binarize_dataset
 
-#include <execution>
-#include <tbb/parallel_for.h>
-#include <mutex>
 
 using namespace std;
 using namespace htm;
@@ -78,7 +75,7 @@ class MNIST {
 
 void setup() {
 
-  input.initialize({28, 28, 1}); 
+  input.initialize({28, 28,1}); 
   columns.initialize({28, 28, 8}); //1D vs 2D no big difference, 2D seems more natural for the problem. Speed-----, Results+++++++++; #columns HIGHEST impact. 
   sp.initialize(
     /* inputDimensions */             input.dimensions,
@@ -89,12 +86,12 @@ void setup() {
     /* localAreaDensity */            0.1f,  // % active bits
     /* numActiveColumnsPerInhArea */  -1,
     /* stimulusThreshold */           6u,
-    /* synPermInactiveDec */          0.002f, //very low values better for MNIST
+    /* synPermInactiveDec */          0.002f, //FIXME inactive decay permanence plays NO role, investigate! (slightly better w/o it)
     /* synPermActiveInc */            0.14f, //takes upto 5x steps to get dis/connected
     /* synPermConnected */            0.5f, //no difference, let's leave at 0.5 in the middle
     /* minPctOverlapDutyCycles */     0.2f, //speed of re-learning?
     /* dutyCyclePeriod */             1402,
-    /* boostStrength */               12.0f, // Boosting does help, but entropy is high, on MNIST it does not matter, for learning with TM prefer boosting off (BOOSTING_DISABLED), or "neutral"=1.0
+    /* boostStrength */               2.0f, // Boosting does help, but entropy is high, on MNIST it does not matter, for learning with TM prefer boosting off (=0.0), or "neutral"=1.0
     /* seed */                        4u,
     /* spVerbosity */                 1u,
     /* wrapAround */                  true); // does not matter (helps slightly)
@@ -138,48 +135,18 @@ void train(const bool skipSP=false) {
     }
     Random().shuffle( index.begin(), index.end() );
 
-
-
-    //parallel loop with TBB
-    std::mutex m;
-
-    tbb::parallel_for( tbb::blocked_range<size_t>(0, index.size()),
-                       [&](tbb::blocked_range<size_t> r) {
-//    for(size_t i=0; i< index.size(); i++) { // index = order of label (shuffeled)
-      for(auto i = r.begin(); i < r.end(); ++i) {
-
-      const auto idx = index[i];
+    for(const auto idx : index) { // index = order of label (shuffeled)
       // Get the input & label
       const auto image = dataset.training_images.at(idx);
       const UInt label  = dataset.training_labels.at(idx);
 
       // Compute & Train
-<<<<<<< HEAD
-      SDR Pinput(input.dimensions);
-      Pinput.setDense( image );
-
-      SDR Pcolumns({28,28,8});
-      sp.compute(Pinput, true, Pcolumns); //TODO change to return output?
-      //TODO make compute() const for parallelization? 
-      
-      // sync this
-      m.lock();
-      clsr.learn( Pcolumns, {label} );
-      m.unlock();
-
-=======
       input.setDense( image );
       if(not skipSP) 
         sp.compute(input, true, columns);
       clsr.learn( skipSP ? input : columns, {label} );
->>>>>>> master_community
       if( verbosity && (++i % 1000 == 0) ) cout << "." << flush;
     }
-    }); // !end of lambda
-
-
-
-
     if( verbosity ) cout << endl;
   
   cout << "epoch ended" << endl;
diff --git a/src/examples/mnist/MNIST_SP_parallel.cpp b/src/examples/mnist/MNIST_SP_parallel.cpp
new file mode 100644
index 0000000000..3c20361af6
--- /dev/null
+++ b/src/examples/mnist/MNIST_SP_parallel.cpp
@@ -0,0 +1,225 @@
+/* ---------------------------------------------------------------------
+ * Copyright (C) 2018-2019, David McDougall, @breznak
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero Public License version 3 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU Affero Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero Public License
+ * along with this program.  If not, see http://www.gnu.org/licenses.
+ * ----------------------------------------------------------------------
+ */
+
+/**
+ * Solving the MNIST dataset with Spatial Pooler. Parallel demonstartion using c++17 TS Parallel (TBB)
+ * Requirements:
+ *   - c++17 codebase
+ *   - compiler: MSVC 2017+, g++-9
+ *   - link with TBB (The Building Blocks)
+ *
+ *
+ * Note 1: the example is more ugly, because we parallelize for-loop, compared to std::algorithms `sort(execution::policy::par, a.begin(), a.end());`
+ * Note 2: Running SP.compute() in parallel is useless for sequences, but works for MNIST and the likes. 
+ *
+ * This consists of a simple black & white image encoder, a spatial pool, and an
+ * SDR classifier.  The task is to recognise images of hand written numbers 0-9.
+ * This should score at least 95%.
+ */
+
+#include <cstdint> //uint8_t
+#include <iostream>
+#include <fstream>      // std::ofstream
+#include <vector>
+
+#include <htm/algorithms/SpatialPooler.hpp>
+#include <htm/algorithms/SDRClassifier.hpp>
+#include <htm/utils/SdrMetrics.hpp>
+#include <htm/os/Timer.hpp>
+
+#include <mnist/mnist_reader.hpp> // MNIST data itself + read methods, namespace mnist::
+#include <mnist/mnist_utils.hpp>  // mnist::binarize_dataset
+
+//includes for TS Parallel
+#include <execution>
+#include <tbb/parallel_for.h>
+#include <mutex>
+
+using namespace std;
+using namespace htm;
+
+class MNIST {
+/**
+ * RESULTS: Store results in the MNIST_SP.cpp file only, this parallel is just for experimenting with parallelization.
+ */
+
+  private:
+    SpatialPooler sp;
+    SDR input;
+    SDR columns;
+    Classifier clsr;
+    mnist::MNIST_dataset<std::vector, std::vector<uint8_t>, uint8_t> dataset;
+
+  public:
+    UInt verbosity = 1;
+    const UInt train_dataset_iterations = 20u; //epochs somewhat help, at linear time
+
+
+void setup() {
+
+  input.initialize({28, 28, 1}); 
+  columns.initialize({28, 28, 8}); //1D vs 2D no big difference, 2D seems more natural for the problem. Speed-----, Results+++++++++; #columns HIGHEST impact. 
+  sp.initialize(
+    /* inputDimensions */             input.dimensions,
+    /* columnDimensions */            columns.dimensions,
+    /* potentialRadius */             7, // with 2D, 7 results in 15x15 area, which is cca 25% for the input area. Slightly improves than 99999 aka "no topology, all to all connections"
+    /* potentialPct */                0.1f, //we have only 10 classes, and << #columns. So we want to force each col to specialize. Cca 0.3 w "7" above, or very small (0.1) for "no topology". Cannot be too small due to internal checks. Speed++
+    /* globalInhibition */            true, //Speed+++++++; SDR quality-- (global does have active nearby cols, which we want to avoid (local)); Results+-0
+    /* localAreaDensity */            0.1f,  // % active bits
+    /* numActiveColumnsPerInhArea */  -1,
+    /* stimulusThreshold */           6u,
+    /* synPermInactiveDec */          0.002f, //very low values better for MNIST
+    /* synPermActiveInc */            0.14f, //takes upto 5x steps to get dis/connected
+    /* synPermConnected */            0.5f, //no difference, let's leave at 0.5 in the middle
+    /* minPctOverlapDutyCycles */     0.2f, //speed of re-learning?
+    /* dutyCyclePeriod */             1402,
+    /* boostStrength */               12.0f, // Boosting does help, but entropy is high, on MNIST it does not matter, for learning with TM prefer boosting off (BOOSTING_DISABLED), or "neutral"=1.0
+    /* seed */                        4u,
+    /* spVerbosity */                 1u,
+    /* wrapAround */                  true); // does not matter (helps slightly)
+
+  // Save the connections to file for postmortem analysis.
+  ofstream dump("mnist_sp_initial.connections", ofstream::binary | ofstream::trunc | ofstream::out);
+  sp.connections.save( dump );
+  dump.close();
+
+  clsr.initialize( /* alpha */ 0.001f);
+
+  dataset = mnist::read_dataset<std::vector, std::vector, uint8_t, uint8_t>(string("../ThirdParty/mnist_data/mnist-src/")); //from CMake
+  mnist::binarize_dataset(dataset);
+}
+
+
+/**
+ *  train the SP on the training set. 
+ *  @param skipSP bool (default false) if set, output directly the input to the classifier.
+ *  This is used for a baseline benchmark (Classifier directly learns on input images)
+ */
+void train(const bool skipSP=false) {
+  // Train
+
+  if(verbosity)
+    cout << "Training for " << (train_dataset_iterations * dataset.training_labels.size())
+         << " cycles ..." << endl;
+  size_t i = 0;
+
+  Metrics inputStats(input,    1402);
+  Metrics columnStats(columns, 1402);
+
+  Timer tTrain(true);
+
+  for(auto epoch = 0u; epoch < train_dataset_iterations; epoch++) {
+    NTA_INFO << "epoch " << epoch;
+    // Shuffle the training data.
+    vector<UInt> index( dataset.training_labels.size() );
+    for (UInt i=0; i<dataset.training_labels.size(); i++) {
+      index.push_back(i);
+    }
+    Random().shuffle( index.begin(), index.end() );
+
+
+
+    //parallel loop with TBB
+    std::mutex m;
+    tbb::parallel_for( tbb::blocked_range<size_t>(0, index.size()),
+                       [&](tbb::blocked_range<size_t> r) {
+//    for(size_t i=0; i< index.size(); i++) { // index = order of label (shuffeled)
+      for(auto i = r.begin(); i < r.end(); ++i) {
+
+      const auto idx = index[i];
+      // Get the input & label
+      const auto image = dataset.training_images.at(idx);
+      const UInt label  = dataset.training_labels.at(idx);
+
+      // Compute & Train
+      SDR Pinput(input.dimensions);
+      Pinput.setDense( image );
+
+      SDR Pcolumns({28,28,8});
+      if(not skipSP)
+        sp.compute(Pinput, true, Pcolumns); //TODO change to return output?
+      //TODO make compute() const for parallelization? 
+      
+      // sync this 
+      {
+        m.lock(); //TODO use better locks than just mutex, unique_lock etc
+        clsr.learn( Pcolumns, {label} );
+        clsr.learn( skipSP ? Pinput : Pcolumns, {label} );
+        m.unlock(); 
+      }
+      if( verbosity && (++i % 1000 == 0) ) cout << "." << flush;
+    }
+    }); // !end of lambda
+
+
+    if( verbosity ) cout << endl;
+  
+  cout << "epoch ended" << endl;
+  cout << "inputStats "  << inputStats << endl;
+  cout << "columnStats " << columnStats << endl;
+  cout << sp << endl;
+  }
+  
+  tTrain.stop();
+  cout << "MNIST train time: " << tTrain.getElapsed() << endl; 
+
+  // Save the connections to file for postmortem analysis.
+  ofstream dump("mnist_sp_learned.connections", ofstream::binary | ofstream::trunc | ofstream::out);
+  sp.connections.save( dump );
+  dump.close();
+}
+
+void test(const bool skipSP=false) {
+  // Test
+  Real score = 0;
+  UInt n_samples = 0;
+  if(verbosity)
+    cout << "Testing for " << dataset.test_labels.size() << " cycles ..." << endl;
+  for(UInt i = 0; i < dataset.test_labels.size(); i++) {
+    // Get the input & label
+    const auto image  = dataset.test_images.at(i);
+    const UInt label  = dataset.test_labels.at(i);
+
+    // Compute
+    input.setDense( image );
+    if(not skipSP) 
+      sp.compute(input, false, columns);
+
+    // Check results
+    if( argmax( clsr.infer( skipSP ? input : columns ) ) == label)
+        score += 1;
+    n_samples += 1;
+    if( verbosity && i % 1000 == 0 ) cout << "." << flush;
+  }
+  if( verbosity ) cout << endl;
+  cout << "===========RESULTs=================" << endl;
+  cout << "Score: " << 100.0 * score / n_samples << "% ("<< (n_samples - score) << " / " << n_samples << " wrong). "   << endl;
+  cout << "SDR example: " << columns << endl;
+}
+
+};  // End class MNIST
+
+int main(int argc, char **argv) {
+  MNIST m;
+  cout << "=========== Spatial Pooler (parallel) =====================" << endl;
+  m.setup();
+  m.train();
+  m.test();
+
+  return 0;
+}
+

From a4e0113859d03bce330da7825c956236c3674ba0 Mon Sep 17 00:00:00 2001
From: Marek Otahal <markotahal@gmail.com>
Date: Wed, 17 Jul 2019 11:51:41 +0200
Subject: [PATCH 4/6] add Parallelizable header

---
 src/CMakeLists.txt                       |  1 +
 src/examples/mnist/MNIST_SP_parallel.cpp |  7 ++----
 src/htm/types/Parallelizable.hpp         | 29 ++++++++++++++++++++++++
 3 files changed, 32 insertions(+), 5 deletions(-)
 create mode 100644 src/htm/types/Parallelizable.hpp

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 5671397d23..4a16a37c71 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -168,6 +168,7 @@ set(regions_files
 
 set(types_files
     htm/types/Exception.hpp
+    htm/types/Parallelizable.hpp
     htm/types/Types.hpp
     htm/types/Serializable.hpp
     htm/types/Sdr.hpp
diff --git a/src/examples/mnist/MNIST_SP_parallel.cpp b/src/examples/mnist/MNIST_SP_parallel.cpp
index d97b72d136..93d93b3fa9 100644
--- a/src/examples/mnist/MNIST_SP_parallel.cpp
+++ b/src/examples/mnist/MNIST_SP_parallel.cpp
@@ -41,14 +41,11 @@
 #include <htm/utils/SdrMetrics.hpp>
 #include <htm/os/Timer.hpp>
 
+#include <htm/types/Parallelizable.hpp>
+
 #include <mnist/mnist_reader.hpp> // MNIST data itself + read methods, namespace mnist::
 #include <mnist/mnist_utils.hpp>  // mnist::binarize_dataset
 
-//includes for TS Parallel
-#include <execution>
-#include <tbb/parallel_for.h>
-#include <mutex>
-
 using namespace std;
 using namespace htm;
 
diff --git a/src/htm/types/Parallelizable.hpp b/src/htm/types/Parallelizable.hpp
new file mode 100644
index 0000000000..d5ac97a5f9
--- /dev/null
+++ b/src/htm/types/Parallelizable.hpp
@@ -0,0 +1,29 @@
+/**
+ * Parallelizable.hpp
+ *
+ * Include this header in files where you want to run blocks of code in parallel.
+ * We use the C++17 parallel algorithms (execution policies), which originated in the
+ * [Parallelism TS](https://en.cppreference.com/w/cpp/experimental/parallelism).
+ *
+ * Requirements:
+ *  - C++17
+ *  - Threading Building Blocks (TBB) linked to the library (needed by g++-9; see CommonCompilerConfig.cmake)
+ *  - [supported compiler](https://en.cppreference.com/w/cpp/compiler_support#cpp17): currently GCC-9+, MSVC 2019
+ * //TODO: switch to c++17 by default, or implement `transform()` temporarily as a custom method?
+ *
+ * Functionality:
+ *  - includes the headers needed for the execution policies
+ *  - defines htm::parallel::mode, the execution policy to pass to std algorithms
+ *  - TODO (not implemented yet): honor `NUM_PARALLEL=n`; NUM_PARALLEL=1 should select sequential execution
+ */
+#pragma once // guard: htm::parallel::mode must be defined only once per translation unit
+
+//includes for the C++17 parallel algorithms
+#include <execution> //std::execution::par, seq, par_unseq
+#include <mutex>
+
+namespace htm {
+namespace parallel {
+  const constexpr auto mode = std::execution::par; //TODO ifdef NUM_PARALLEL=1 -> seq
+}
+}

From 230905c2b54342f058c0bd0e18faabf32d45a045 Mon Sep 17 00:00:00 2001
From: Marek Otahal <markotahal@gmail.com>
Date: Wed, 17 Jul 2019 13:27:42 +0200
Subject: [PATCH 5/6] SP: global inhibition: parallel

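Try the execution policy htm::parallel::mode (now std::execution::par_unseq)
for std::nth_element in SpatialPooler::inhibitColumnsGlobal_, and add timers
(tNth, tSort, tWhile) around the individual inhibition steps.
Result: it takes much longer than the sequential version.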
---
 src/examples/mnist/MNIST_SP.cpp      | 6 +++++-
 src/htm/algorithms/SpatialPooler.cpp | 9 ++++++++-
 src/htm/algorithms/SpatialPooler.hpp | 3 ++-
 src/htm/types/Parallelizable.hpp     | 2 +-
 4 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/src/examples/mnist/MNIST_SP.cpp b/src/examples/mnist/MNIST_SP.cpp
index 8bd2d4677b..9925e32daf 100644
--- a/src/examples/mnist/MNIST_SP.cpp
+++ b/src/examples/mnist/MNIST_SP.cpp
@@ -76,7 +76,7 @@ class MNIST {
 void setup() {
 
   input.initialize({28, 28,1}); 
-  columns.initialize({28, 28, 8}); //1D vs 2D no big difference, 2D seems more natural for the problem. Speed-----, Results+++++++++; #columns HIGHEST impact. 
+  columns.initialize({28, 28, 24}); //1D vs 2D no big difference, 2D seems more natural for the problem. Speed-----, Results+++++++++; #columns HIGHEST impact. 
   sp.initialize(
     /* inputDimensions */             input.dimensions,
     /* columnDimensions */            columns.dimensions,
@@ -156,6 +156,10 @@ void train(const bool skipSP=false) {
   
   tTrain.stop();
   cout << "MNIST train time: " << tTrain.getElapsed() << endl; 
+  cout << "SP nth \t" << sp.tNth.getElapsed() << endl;
+  cout << "SP sort \t" << sp.tSort.getElapsed() << endl;
+  cout << "SP while\t" << sp.tWhile.getElapsed() << endl;
+
 
   // Save the connections to file for postmortem analysis.
   ofstream dump("mnist_sp_learned.connections", ofstream::binary | ofstream::trunc | ofstream::out);
diff --git a/src/htm/algorithms/SpatialPooler.cpp b/src/htm/algorithms/SpatialPooler.cpp
index ff170b4227..c2d659017e 100644
--- a/src/htm/algorithms/SpatialPooler.cpp
+++ b/src/htm/algorithms/SpatialPooler.cpp
@@ -27,6 +27,7 @@
 #include <htm/algorithms/SpatialPooler.hpp>
 #include <htm/utils/Topology.hpp>
 #include <htm/utils/VectorHelpers.hpp>
+#include <htm/types/Parallelizable.hpp>
 
 using namespace std;
 using namespace htm;
@@ -844,19 +845,25 @@ void SpatialPooler::inhibitColumnsGlobal_(const vector<Real> &overlaps,
   // faster than a regular sort because it stops after it partitions the
   // elements about the Nth element, with all elements on their correct side of
   // the Nth element.
-  std::nth_element(
+  tNth.start();
+  std::nth_element(htm::parallel::mode,
     activeColumns.begin(),
     activeColumns.begin() + numDesired,
     activeColumns.end(),
     compare);
   // Remove the columns which lost the competition.
   activeColumns.resize(numDesired);
+  tNth.stop();
   // Finish sorting the winner columns by their overlap.
+  tSort.start();
   std::sort(activeColumns.begin(), activeColumns.end(), compare);
+  tSort.stop();
   // Remove sub-threshold winners
+  tWhile.start();
   while( !activeColumns.empty() &&
          overlaps[activeColumns.back()] < stimulusThreshold_)
       activeColumns.pop_back();
+  tWhile.stop();
 }
 
 
diff --git a/src/htm/algorithms/SpatialPooler.hpp b/src/htm/algorithms/SpatialPooler.hpp
index 93832aab90..b6e9a53a4a 100644
--- a/src/htm/algorithms/SpatialPooler.hpp
+++ b/src/htm/algorithms/SpatialPooler.hpp
@@ -29,7 +29,7 @@
 #include <htm/types/Types.hpp>
 #include <htm/types/Serializable.hpp>
 #include <htm/types/Sdr.hpp>
-
+#include <htm/os/Timer.hpp>
 
 namespace htm {
 
@@ -1210,6 +1210,7 @@ class SpatialPooler : public Serializable
 
 public:
   const Connections &connections = connections_;
+  mutable Timer tSort, tNth, tWhile;
 };
 
 std::ostream & operator<<(std::ostream & out, const SpatialPooler &sp);
diff --git a/src/htm/types/Parallelizable.hpp b/src/htm/types/Parallelizable.hpp
index d5ac97a5f9..6957c85913 100644
--- a/src/htm/types/Parallelizable.hpp
+++ b/src/htm/types/Parallelizable.hpp
@@ -24,6 +24,6 @@
 
 namespace htm {
 namespace parallel {
-  const constexpr auto mode = std::execution::par; //TODO ifdef NUM_PARALLEL=1 -> seq
+  const constexpr auto mode = std::execution::par_unseq; //TODO ifdef NUM_PARALLEL=1 -> seq
 }
 }

From ab572cd2db60937d387294007374b4bb2618dbd7 Mon Sep 17 00:00:00 2001
From: Marek Otahal <markotahal@gmail.com>
Date: Wed, 17 Jul 2019 13:42:34 +0200
Subject: [PATCH 6/6] Revert "MNIST: provide both single, parallel version"

This reverts commit b27b40c07d4775a8995f65c36d6f9cfcd0e9e95e.
---
 src/CMakeLists.txt                       |  23 ---
 src/examples/mnist/MNIST_SP.cpp          |   6 +-
 src/examples/mnist/MNIST_SP_parallel.cpp | 221 -----------------------
 3 files changed, 1 insertion(+), 249 deletions(-)
 delete mode 100644 src/examples/mnist/MNIST_SP_parallel.cpp

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 4a16a37c71..3f080cebc7 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -394,29 +394,6 @@ add_custom_target(mnist
                   COMMENT "Executing ${src_executable_mnistsp}"
                   VERBATIM)
 
-#########################################################
-## MNIST Spatial Pooler (Parallel Example)
-#
-set(src_executable_mnistsp_par mnist_sp_par)
-add_executable(${src_executable_mnistsp_par} examples/mnist/MNIST_SP_parallel.cpp)
-target_link_libraries(${src_executable_mnistsp_par}
-  ${INTERNAL_LINKER_FLAGS} #par uses tbb
-  ${core_library}
-  ${COMMON_OS_LIBS}
-)
-target_compile_options(    ${src_executable_mnistsp_par} PUBLIC ${INTERNAL_CXX_FLAGS})
-target_compile_definitions(${src_executable_mnistsp_par} PRIVATE ${COMMON_COMPILER_DEFINITIONS})
-# Pass MNIST data directory to main.cpp
-target_compile_definitions(${src_executable_mnistsp_par} PRIVATE MNIST_DATA_LOCATION=${mnist_SOURCE_DIR})
-target_include_directories(${src_executable_mnistsp_par} PRIVATE
-  ${CORE_LIB_INCLUDES}
-  ${EXTERNAL_INCLUDES}
-)
-add_custom_target(mnist_parallel
-  COMMAND ${src_executable_mnistsp_par}
-  DEPENDS ${src_executable_mnistsp_par}
-  COMMENT "Executing ${src_executable_mnistsp_par}"
-  VERBATIM)
 
 ############ INSTALL ######################################
 #
diff --git a/src/examples/mnist/MNIST_SP.cpp b/src/examples/mnist/MNIST_SP.cpp
index 9925e32daf..8bd2d4677b 100644
--- a/src/examples/mnist/MNIST_SP.cpp
+++ b/src/examples/mnist/MNIST_SP.cpp
@@ -76,7 +76,7 @@ class MNIST {
 void setup() {
 
   input.initialize({28, 28,1}); 
-  columns.initialize({28, 28, 24}); //1D vs 2D no big difference, 2D seems more natural for the problem. Speed-----, Results+++++++++; #columns HIGHEST impact. 
+  columns.initialize({28, 28, 8}); //1D vs 2D no big difference, 2D seems more natural for the problem. Speed-----, Results+++++++++; #columns HIGHEST impact. 
   sp.initialize(
     /* inputDimensions */             input.dimensions,
     /* columnDimensions */            columns.dimensions,
@@ -156,10 +156,6 @@ void train(const bool skipSP=false) {
   
   tTrain.stop();
   cout << "MNIST train time: " << tTrain.getElapsed() << endl; 
-  cout << "SP nth \t" << sp.tNth.getElapsed() << endl;
-  cout << "SP sort \t" << sp.tSort.getElapsed() << endl;
-  cout << "SP while\t" << sp.tWhile.getElapsed() << endl;
-
 
   // Save the connections to file for postmortem analysis.
   ofstream dump("mnist_sp_learned.connections", ofstream::binary | ofstream::trunc | ofstream::out);
diff --git a/src/examples/mnist/MNIST_SP_parallel.cpp b/src/examples/mnist/MNIST_SP_parallel.cpp
deleted file mode 100644
index 93d93b3fa9..0000000000
--- a/src/examples/mnist/MNIST_SP_parallel.cpp
+++ /dev/null
@@ -1,221 +0,0 @@
-/* ---------------------------------------------------------------------
- * Copyright (C) 2018-2019, David McDougall, @breznak
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero Public License version 3 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- * See the GNU Affero Public License for more details.
- *
- * You should have received a copy of the GNU Affero Public License
- * along with this program.  If not, see http://www.gnu.org/licenses.
- * ----------------------------------------------------------------------
- */
-
-/**
- * Solving the MNIST dataset with Spatial Pooler. Parallel demonstartion using c++17 TS Parallel (TBB)
- * Requirements:
- *   - c++17 codebase
- *   - compiler: MSVC 2017+, g++-9
- *   - link with TBB (The Building Blocks)
- *
- *
- * Note 1: the example is more ugly, because we parallelize for-loop, compared to std::algorithms `sort(execution::policy::par, a.begin(), a.end());`
- * Note 2: Running SP.compute() in parallel is useless for sequences, but works for MNIST and the likes. 
- *
- * This consists of a simple black & white image encoder, a spatial pool, and an
- * SDR classifier.  The task is to recognise images of hand written numbers 0-9.
- * This should score at least 95%.
- */
-
-#include <cstdint> //uint8_t
-#include <iostream>
-#include <fstream>      // std::ofstream
-#include <vector>
-
-#include <htm/algorithms/SpatialPooler.hpp>
-#include <htm/algorithms/SDRClassifier.hpp>
-#include <htm/utils/SdrMetrics.hpp>
-#include <htm/os/Timer.hpp>
-
-#include <htm/types/Parallelizable.hpp>
-
-#include <mnist/mnist_reader.hpp> // MNIST data itself + read methods, namespace mnist::
-#include <mnist/mnist_utils.hpp>  // mnist::binarize_dataset
-
-using namespace std;
-using namespace htm;
-
-class MNIST {
-/**
- * RESULTS: Store results in the MNIST_SP.cpp file only, this parallel is just for experimenting with parallelization.
- */
-
-  private:
-    SpatialPooler sp;
-    SDR input;
-    SDR columns;
-    Classifier clsr;
-    mnist::MNIST_dataset<std::vector, std::vector<uint8_t>, uint8_t> dataset;
-
-  public:
-    UInt verbosity = 1;
-    const UInt train_dataset_iterations = 20u; //epochs somewhat help, at linear time
-
-
-void setup() {
-
-  input.initialize({28, 28, 1}); 
-  columns.initialize({28, 28, 8}); //1D vs 2D no big difference, 2D seems more natural for the problem. Speed-----, Results+++++++++; #columns HIGHEST impact. 
-  sp.initialize(
-    /* inputDimensions */             input.dimensions,
-    /* columnDimensions */            columns.dimensions,
-    /* potentialRadius */             7, // with 2D, 7 results in 15x15 area, which is cca 25% for the input area. Slightly improves than 99999 aka "no topology, all to all connections"
-    /* potentialPct */                0.1f, //we have only 10 classes, and << #columns. So we want to force each col to specialize. Cca 0.3 w "7" above, or very small (0.1) for "no topology". Cannot be too small due to internal checks. Speed++
-    /* globalInhibition */            true, //Speed+++++++; SDR quality-- (global does have active nearby cols, which we want to avoid (local)); Results+-0
-    /* localAreaDensity */            0.1f,  // % active bits
-    /* stimulusThreshold */           6u,
-    /* synPermInactiveDec */          0.002f, //very low values better for MNIST
-    /* synPermActiveInc */            0.14f, //takes upto 5x steps to get dis/connected
-    /* synPermConnected */            0.5f, //no difference, let's leave at 0.5 in the middle
-    /* minPctOverlapDutyCycles */     0.2f, //speed of re-learning?
-    /* dutyCyclePeriod */             1402,
-    /* boostStrength */               12.0f, // Boosting does help, but entropy is high, on MNIST it does not matter, for learning with TM prefer boosting off (BOOSTING_DISABLED), or "neutral"=1.0
-    /* seed */                        4u,
-    /* spVerbosity */                 1u,
-    /* wrapAround */                  true); // does not matter (helps slightly)
-
-  // Save the connections to file for postmortem analysis.
-  ofstream dump("mnist_sp_initial.connections", ofstream::binary | ofstream::trunc | ofstream::out);
-  sp.connections.save( dump );
-  dump.close();
-
-  clsr.initialize( /* alpha */ 0.001f);
-
-  dataset = mnist::read_dataset<std::vector, std::vector, uint8_t, uint8_t>(string("../ThirdParty/mnist_data/mnist-src/")); //from CMake
-  mnist::binarize_dataset(dataset);
-}
-
-
-/**
- *  train the SP on the training set. 
- *  @param skipSP bool (default false) if set, output directly the input to the classifier.
- *  This is used for a baseline benchmark (Classifier directly learns on input images)
- */
-void train(const bool skipSP=false) {
-  // Train
-
-  if(verbosity)
-    cout << "Training for " << (train_dataset_iterations * dataset.training_labels.size())
-         << " cycles ..." << endl;
-  size_t i = 0;
-
-  Metrics inputStats(input,    1402);
-  Metrics columnStats(columns, 1402);
-
-  Timer tTrain(true);
-
-  for(auto epoch = 0u; epoch < train_dataset_iterations; epoch++) {
-    NTA_INFO << "epoch " << epoch;
-    // Shuffle the training data.
-    vector<UInt> index( dataset.training_labels.size() );
-    for (UInt i=0; i<dataset.training_labels.size(); i++) {
-      index.push_back(i);
-    }
-    Random().shuffle( index.begin(), index.end() );
-
-
-
-    //parallel loop with TBB
-    std::mutex m;
-    tbb::parallel_for( tbb::blocked_range<size_t>(0, index.size()),
-                       [&](tbb::blocked_range<size_t> r) {
-//    for(size_t i=0; i< index.size(); i++) { // index = order of label (shuffeled)
-      for(auto i = r.begin(); i < r.end(); ++i) {
-
-      const auto idx = index[i];
-      // Get the input & label
-      const auto image = dataset.training_images.at(idx);
-      const UInt label  = dataset.training_labels.at(idx);
-
-      // Compute & Train
-      SDR Pinput(input.dimensions);
-      Pinput.setDense( image );
-
-      SDR Pcolumns({28,28,8});
-      if(not skipSP)
-        sp.compute(Pinput, true, Pcolumns); //TODO change to return output?
-      //TODO make compute() const for parallelization? 
-      
-      // sync this 
-      {
-        m.lock(); //TODO use better locks than just mutex, unique_lock etc
-        clsr.learn( Pcolumns, {label} );
-        clsr.learn( skipSP ? Pinput : Pcolumns, {label} );
-        m.unlock(); 
-      }
-      if( verbosity && (++i % 1000 == 0) ) cout << "." << flush;
-    }
-    }); // !end of lambda
-
-
-    if( verbosity ) cout << endl;
-  
-  cout << "epoch ended" << endl;
-  cout << "inputStats "  << inputStats << endl;
-  cout << "columnStats " << columnStats << endl;
-  cout << sp << endl;
-  }
-  
-  tTrain.stop();
-  cout << "MNIST train time: " << tTrain.getElapsed() << endl; 
-
-  // Save the connections to file for postmortem analysis.
-  ofstream dump("mnist_sp_learned.connections", ofstream::binary | ofstream::trunc | ofstream::out);
-  sp.connections.save( dump );
-  dump.close();
-}
-
-void test(const bool skipSP=false) {
-  // Test
-  Real score = 0;
-  UInt n_samples = 0;
-  if(verbosity)
-    cout << "Testing for " << dataset.test_labels.size() << " cycles ..." << endl;
-  for(UInt i = 0; i < dataset.test_labels.size(); i++) {
-    // Get the input & label
-    const auto image  = dataset.test_images.at(i);
-    const UInt label  = dataset.test_labels.at(i);
-
-    // Compute
-    input.setDense( image );
-    if(not skipSP) 
-      sp.compute(input, false, columns);
-
-    // Check results
-    if( argmax( clsr.infer( skipSP ? input : columns ) ) == label)
-        score += 1;
-    n_samples += 1;
-    if( verbosity && i % 1000 == 0 ) cout << "." << flush;
-  }
-  if( verbosity ) cout << endl;
-  cout << "===========RESULTs=================" << endl;
-  cout << "Score: " << 100.0 * score / n_samples << "% ("<< (n_samples - score) << " / " << n_samples << " wrong). "   << endl;
-  cout << "SDR example: " << columns << endl;
-}
-
-};  // End class MNIST
-
-int main(int argc, char **argv) {
-  MNIST m;
-  cout << "=========== Spatial Pooler (parallel) =====================" << endl;
-  m.setup();
-  m.train();
-  m.test();
-
-  return 0;
-}
-