From 3dc19d6fdf4085a8d63ef0d60f2be90d1e41d947 Mon Sep 17 00:00:00 2001
From: Andreas Singraber <andreas.singraber@gmx.at>
Date: Sun, 6 Sep 2020 00:42:01 +0200
Subject: [PATCH 1/9] Add keyword to select decoupling type.

---
 src/libnnp/Settings.cpp      |  1 +
 src/libnnptrain/Training.cpp | 42 ++++++++++++++++++++++++++++++++++++
 src/libnnptrain/Training.h   | 17 +++++++++++++++
 3 files changed, 60 insertions(+)

diff --git a/src/libnnp/Settings.cpp b/src/libnnp/Settings.cpp
index 68c0638d8..7f6a2c2ca 100644
--- a/src/libnnp/Settings.cpp
+++ b/src/libnnp/Settings.cpp
@@ -73,6 +73,7 @@ map<string, string> const createKnownKeywordsMap()
     m["jacobian_mode"                 ] = "";
     m["update_strategy"               ] = "";
     m["selection_mode"                ] = "";
+    m["decoupling_type"               ] = "";
     m["task_batch_size_energy"        ] = "";
     m["task_batch_size_force"         ] = "";
     m["gradient_type"                 ] = "";
diff --git a/src/libnnptrain/Training.cpp b/src/libnnptrain/Training.cpp
index 5cd08a80e..94a5acc92 100644
--- a/src/libnnptrain/Training.cpp
+++ b/src/libnnptrain/Training.cpp
@@ -40,6 +40,7 @@ Training::Training() : Dataset(),
                        jacobianMode               (JM_SUM         ),
                        updateStrategy             (US_COMBINED    ),
                        selectionMode              (SM_RANDOM      ),
+                       decouplingType             (DT_GLOBAL      ),
                        hasUpdaters                (false          ),
                        hasStructures              (false          ),
                        useForces                  (false          ),
@@ -991,6 +992,47 @@ void Training::setupTraining()
     {
         kalmanType = (KalmanFilter::KalmanType)
                      atoi(settings["kalman_type"].c_str());
+        decouplingType = (DecouplingType)
+                         atoi(settings["decoupling_type"].c_str());
+        if (decouplingType == DT_GLOBAL)
+        {
+            log << strpr("No decoupling (GEKF) selected: DT_GLOBAL (%d).\n",
+                         decouplingType);
+        }
+        else if (decouplingType == DT_ELEMENT)
+        {
+            log << strpr("Per-element decoupling (ED-GEKF) selected: "
+                         "DT_ELEMENT (%d).\n", decouplingType);
+        }
+        else if (decouplingType == DT_LAYER)
+        {
+            log << strpr("Per-layer decoupling selected: "
+                         "DT_LAYER (%d).\n", decouplingType);
+        }
+        else if (decouplingType == DT_NODE)
+        {
+            log << strpr("Per-node decoupling (NDEKF) selected: "
+                         "DT_NODE (%d).\n", decouplingType);
+        }
+        else if (decouplingType == DT_FULL)
+        {
+            log << strpr("Full (per-weight) decoupling selected: "
+                         "DT_FULL (%d).\n", decouplingType);
+        }
+        else
+        {
+            throw runtime_error("ERROR: Unknown Kalman filter decoupling "
+                                "type.\n");
+        }
+        if (decouplingType != DT_GLOBAL &&
+            (parallelMode != PM_TRAIN_ALL || updateStrategy != US_COMBINED))
+        {
+            throw runtime_error(strpr("ERROR: Kalman filter decoupling works "
+                                      "only in conjunction with"
+                                      "\"parallel_mode %d\" and "
+                                      "\"update_strategy %d\".\n",
+                                      PM_TRAIN_ALL, US_COMBINED));
+        }
     }
 
     for (size_t i = 0; i < numUpdaters; ++i)
diff --git a/src/libnnptrain/Training.h b/src/libnnptrain/Training.h
index 1ae678255..93022ebc8 100644
--- a/src/libnnptrain/Training.h
+++ b/src/libnnptrain/Training.h
@@ -110,6 +110,21 @@ class Training : public Dataset
         SM_THRESHOLD
     };
 
+    /// Kalman filter decoupling type (requires UT_KF and US_COMBINED).
+    enum DecouplingType
+    {
+        /// No decoupling, global extended Kalman filter (GEKF).
+        DT_GLOBAL,
+        /// Per-element decoupling, ED-GEKF (doi.org/10.1021/acs.jctc.5b00211).
+        DT_ELEMENT,
+        /// Layer decoupling.
+        DT_LAYER,
+        /// Node decoupling, NDEKF.
+        DT_NODE,
+        /// Full decoupling, FDEKF.
+        DT_FULL
+    };
+
     /// Constructor.
     Training();
     /// Destructor, updater vector needs to be cleaned.
@@ -309,6 +324,8 @@ class Training : public Dataset
     UpdateStrategy                updateStrategy;
     /// Selection mode for update candidates.
     SelectionMode                 selectionMode;
+    /// Kalman filter decoupling type if KF training selected.
+    DecouplingType                decouplingType;
     /// If this rank performs weight updates.
     bool                          hasUpdaters;
     /// If this rank holds structure information.

From dce6dd9130c7f9bb74e489945e03e62e6f5c1fad Mon Sep 17 00:00:00 2001
From: Andreas Singraber <andreas.singraber@gmx.at>
Date: Sun, 6 Sep 2020 00:55:31 +0200
Subject: [PATCH 2/9] Fix examples with new keyword.

---
 examples/input.nn.recommended           | 1 +
 examples/nnp-train/Cu2S_PBE/input.nn    | 1 +
 examples/nnp-train/H2O_RPBE-D3/input.nn | 1 +
 examples/nnp-train/LJ/input.nn          | 1 +
 4 files changed, 4 insertions(+)

diff --git a/examples/input.nn.recommended b/examples/input.nn.recommended
index e53934e0f..a0030bf52 100644
--- a/examples/input.nn.recommended
+++ b/examples/input.nn.recommended
@@ -35,6 +35,7 @@ parallel_mode                   0              # Training parallelization used (
 jacobian_mode                   1              # Jacobian computation mode (0 = Summation to single gradient, 1 = Per-task summed gradient, 2 = Full Jacobian).
 update_strategy                 0              # Update strategy (0 = Combined, 1 = Per-element).
 selection_mode                  2              # Update candidate selection mode (0 = Random, 1 = Sort, 2 = Threshold).
+decoupling_type                 0              # Kalman filter decoupling type (0 = GEKF, 1 = ED-GEKF, 2 = Layer, 3 = NDEKF, 4 = FDEKF).
 task_batch_size_energy          1              # Number of energy update candidates prepared per task for each update (0 = Entire training set).
 task_batch_size_force           1              # Number of force update candidates prepared per task for each update (0 = Entire training set).
 memorize_symfunc_results                       # Keep symmetry function results in memory.
diff --git a/examples/nnp-train/Cu2S_PBE/input.nn b/examples/nnp-train/Cu2S_PBE/input.nn
index b4aa9333e..23eccf6db 100644
--- a/examples/nnp-train/Cu2S_PBE/input.nn
+++ b/examples/nnp-train/Cu2S_PBE/input.nn
@@ -44,6 +44,7 @@ parallel_mode                   0              # Training parallelization used (
 jacobian_mode                   1              # Jacobian computation mode (0 = Summation to single gradient, 1 = Per-task summed gradient, 2 = Full Jacobian).
 update_strategy                 0              # Update strategy (0 = Combined, 1 = Per-element).
 selection_mode                  2              # Update candidate selection mode (0 = Random, 1 = Sort, 2 = Threshold).
+decoupling_type                 0              # Kalman filter decoupling type (0 = GEKF, 1 = ED-GEKF, 2 = Layer, 3 = NDEKF, 4 = FDEKF).
 task_batch_size_energy          1              # Number of energy update candidates prepared per task for each update (0 = Entire training set).
 task_batch_size_force           1              # Number of force update candidates prepared per task for each update (0 = Entire training set).
 memorize_symfunc_results                       # Keep symmetry function results in memory.
diff --git a/examples/nnp-train/H2O_RPBE-D3/input.nn b/examples/nnp-train/H2O_RPBE-D3/input.nn
index b2666eecd..05b3fcacb 100644
--- a/examples/nnp-train/H2O_RPBE-D3/input.nn
+++ b/examples/nnp-train/H2O_RPBE-D3/input.nn
@@ -44,6 +44,7 @@ parallel_mode                   0              # Training parallelization used (
 jacobian_mode                   1              # Jacobian computation mode (0 = Summation to single gradient, 1 = Per-task summed gradient, 2 = Full Jacobian).
 update_strategy                 0              # Update strategy (0 = Combined, 1 = Per-element).
 selection_mode                  2              # Update candidate selection mode (0 = Random, 1 = Sort, 2 = Threshold).
+decoupling_type                 0              # Kalman filter decoupling type (0 = GEKF, 1 = ED-GEKF, 2 = Layer, 3 = NDEKF, 4 = FDEKF).
 task_batch_size_energy          1              # Number of energy update candidates prepared per task for each update (0 = Entire training set).
 task_batch_size_force           1              # Number of force update candidates prepared per task for each update (0 = Entire training set).
 memorize_symfunc_results                       # Keep symmetry function results in memory.
diff --git a/examples/nnp-train/LJ/input.nn b/examples/nnp-train/LJ/input.nn
index d42118f4b..2a1812341 100644
--- a/examples/nnp-train/LJ/input.nn
+++ b/examples/nnp-train/LJ/input.nn
@@ -34,6 +34,7 @@ parallel_mode                   0              # Training parallelization used (
 jacobian_mode                   1              # Jacobian computation mode (0 = Summation to single gradient, 1 = Per-task summed gradient, 2 = Full Jacobian).
 update_strategy                 0              # Update strategy (0 = Combined, 1 = Per-element).
 selection_mode                  2              # Update candidate selection mode (0 = Random, 1 = Sort, 2 = Threshold).
+decoupling_type                 0              # Kalman filter decoupling type (0 = GEKF, 1 = ED-GEKF, 2 = Layer, 3 = NDEKF, 4 = FDEKF).
 task_batch_size_energy          1              # Number of energy update candidates prepared per task for each update (0 = Entire training set).
 task_batch_size_force           1              # Number of force update candidates prepared per task for each update (0 = Entire training set).
 memorize_symfunc_results                       # Keep symmetry function results in memory.

From d0a6af3bc6585718085362717342e42959f5bf52 Mon Sep 17 00:00:00 2001
From: Andreas Singraber <andreas.singraber@gmx.at>
Date: Tue, 8 Sep 2020 00:07:21 +0200
Subject: [PATCH 3/9] Added MPI communicator and decoupling group mask

---
 src/libnnptrain/KalmanFilter.cpp | 50 ++++++++++++++++++++++++++++++--
 src/libnnptrain/KalmanFilter.h   | 20 +++++++++++++
 src/libnnptrain/Training.cpp     | 24 +++++++++++++++
 3 files changed, 92 insertions(+), 2 deletions(-)

diff --git a/src/libnnptrain/KalmanFilter.cpp b/src/libnnptrain/KalmanFilter.cpp
index ab55a8419..48c7f4101 100644
--- a/src/libnnptrain/KalmanFilter.cpp
+++ b/src/libnnptrain/KalmanFilter.cpp
@@ -19,6 +19,7 @@
 #include <Eigen/LU>
 #include <iostream>
 #include <stdexcept>
+#include <map>       // std::map
 
 using namespace Eigen;
 using namespace std;
@@ -27,6 +28,8 @@ using namespace nnp;
 KalmanFilter::KalmanFilter(size_t const sizeState,
                            KalmanType const type) :
     Updater(sizeState),
+    myRank         (0   ),
+    numProcs       (0   ),
     sizeObservation(0   ),
     numUpdates     (0   ),
     epsilon        (0.0 ),
@@ -67,12 +70,47 @@ KalmanFilter::KalmanFilter(size_t const sizeState,
     // Prevent problems with unallocated K when log starts.
     K.resize(sizeState, sizeObservation);
     K.setZero();
+
+    // Set default decoupling group mask.
+    groupMask = new Map<VectorXi const>(0, sizeState);
 }
 
 KalmanFilter::~KalmanFilter()
 {
 }
 
+void KalmanFilter::setupMPI(MPI_Comm* communicator)
+{
+    MPI_Comm_dup(*communicator, &comm);
+    MPI_Comm_rank(comm, &myRank);
+    MPI_Comm_size(comm, &numProcs);
+
+    return;
+}
+
+void KalmanFilter::setupDecoupling(int const* const mask)
+{
+    new (groupMask) Map<VectorXi const>(mask, sizeState);
+
+    for (long i = 0; i < groupMask->size(); ++i)
+    {
+        groupMap[(*groupMask)(i)].push_back(i);
+    }
+
+    // Check consistency of group map.
+    for (size_t i = 0; i < groupMap.size(); ++i)
+    {
+        if (groupMap.find(i) == groupMap.end())
+        {
+            throw runtime_error(strpr("ERROR: Kalman filter decoupling group "
+                                      "mask is inconsistent, group %zu not "
+                                      "present.\n", i));
+        }
+    }
+
+    return;
+}
+
 void KalmanFilter::setSizeObservation(size_t const size)
 {
     sizeObservation = size;
@@ -317,8 +355,8 @@ vector<string> KalmanFilter::info() const
     if (type == KT_STANDARD)
     {
         v.push_back(strpr("KalmanType::KT_STANDARD (%d)\n", type));
-        v.push_back(strpr("sizeState       = %zu\n", sizeState));
-        v.push_back(strpr("sizeObservation = %zu\n", sizeObservation));
+        v.push_back(strpr("myRank          = %d\n", myRank));
+        v.push_back(strpr("numProcs        = %d\n", numProcs));
         v.push_back(strpr("epsilon         = %12.4E\n", epsilon));
         v.push_back(strpr("q0              = %12.4E\n", q0     ));
         v.push_back(strpr("qtau            = %12.4E\n", qtau   ));
@@ -339,7 +377,15 @@ vector<string> KalmanFilter::info() const
         v.push_back(strpr("lambda          = %12.4E\n", lambda));
         v.push_back(strpr("nu              = %12.4E\n", nu    ));
     }
+    v.push_back(strpr("sizeState       = %zu\n", sizeState));
+    v.push_back(strpr("sizeObservation = %zu\n", sizeObservation));
     v.push_back(strpr("OpenMP threads used: %d\n", nbThreads()));
+    v.push_back(strpr("Number of decoupling groups: %zu\n", groupMap.size()));
+    for (size_t i = 0; i < groupMap.size(); ++i)
+    {
+        v.push_back(strpr(" - group %5zu size: %zu\n",
+                          i, groupMap.at(i).size()));
+    }
 
     return v;
 }
diff --git a/src/libnnptrain/KalmanFilter.h b/src/libnnptrain/KalmanFilter.h
index 356cee949..003445649 100644
--- a/src/libnnptrain/KalmanFilter.h
+++ b/src/libnnptrain/KalmanFilter.h
@@ -50,6 +50,16 @@ class KalmanFilter : public Updater
     /** Destructor.
      */
     virtual ~KalmanFilter();
+    /** Basic MPI setup.
+     *
+     * @param[in] communicator Input communicator of calling program.
+     */
+    void                     setupMPI(MPI_Comm* communicator);
+    /** Set decoupling group mask.
+     *
+     * @param[in] mask Input group mask provided via array of integers.
+     */
+    void                     setupDecoupling(int const* const mask);
     /** Set observation vector size.
      *
      * @param[in] size Size of the observation vector.
@@ -194,6 +204,10 @@ class KalmanFilter : public Updater
 private:
     /// Kalman filter type.
     KalmanType                         type;
+    /// My process ID.
+    int                                myRank;
+    /// Total number of MPI processors.
+    int                                numProcs;
     /// Size of observation (measurement) vector.
     std::size_t                        sizeObservation;
     /// Total number of updates performed.
@@ -222,6 +236,10 @@ class KalmanFilter : public Updater
     double                             nu;
     /// Forgetting gain factor gamma for fading memory Kalman filter.
     double                             gamma;
+    /// Decoupling group map.
+    std::map<int, std::vector<int>>    groupMap;
+    /// Decoupling group mask vector.
+    Eigen::Map<Eigen::VectorXi const>* groupMask;
     /// State vector.
     Eigen::Map<Eigen::VectorXd>*       w;
     /// Error vector.
@@ -234,6 +252,8 @@ class KalmanFilter : public Updater
     Eigen::MatrixXd                    K;
     /// Intermediate result X = P . H.
     Eigen::MatrixXd                    X;
+    /// Global MPI communicator.
+    MPI_Comm                           comm;
 };
 
 //////////////////////////////////
diff --git a/src/libnnptrain/Training.cpp b/src/libnnptrain/Training.cpp
index 94a5acc92..106a9bb55 100644
--- a/src/libnnptrain/Training.cpp
+++ b/src/libnnptrain/Training.cpp
@@ -1050,6 +1050,30 @@ void Training::setupTraining()
                 updaters.push_back(
                     (Updater*)new KalmanFilter(numWeightsPerUpdater.at(i),
                                                kalmanType));
+                KalmanFilter* u = dynamic_cast<KalmanFilter*>(updaters.back());
+                u->setupMPI(&comm);
+                vector<int> groupMask(weights.at(i).size(), 0);
+                if (decouplingType == DT_ELEMENT)
+                {
+                    for (size_t i = 0; i < numElements; ++i)
+                    {
+                        fill(groupMask.begin() + weightsOffset.at(i),
+                             groupMask.begin() + weightsOffset.at(i)
+                             + elements.at(i).
+                               neuralNetwork->getNumConnections(),
+                             i);
+                    }
+                }
+                else if (decouplingType == DT_LAYER)
+                {
+                }
+                else if (decouplingType == DT_NODE)
+                {
+                }
+                else if (decouplingType == DT_FULL)
+                {
+                }
+                u->setupDecoupling(groupMask.data());
             }
             updaters.back()->setState(&(weights.at(i).front()));
         }

From 31bcd583732bce1fd48d1fce227bfa62baa2d72e Mon Sep 17 00:00:00 2001
From: Andreas Singraber <andreas.singraber@gmx.at>
Date: Tue, 8 Sep 2020 17:35:13 +0200
Subject: [PATCH 4/9] Started with group mask for layer decoupling, WIP!

---
 src/libnnp/NeuralNetwork.cpp | 15 +++++++++++++++
 src/libnnp/NeuralNetwork.h   | 11 +++++++++++
 src/libnnptrain/Training.cpp | 20 ++++++++++++++++++++
 3 files changed, 46 insertions(+)

diff --git a/src/libnnp/NeuralNetwork.cpp b/src/libnnp/NeuralNetwork.cpp
index e7b7808c6..be44be604 100644
--- a/src/libnnp/NeuralNetwork.cpp
+++ b/src/libnnp/NeuralNetwork.cpp
@@ -921,6 +921,21 @@ void NeuralNetwork::getNeuronStatistics(long*   count,
     return;
 }
 
+vector<pair<size_t, size_t>> NeuralNetwork::getLayerBoundaries() const
+{
+    vector<pair<size_t, size_t>> boundaries;
+
+    for (int i = 0; i < numLayers - 1; ++i)
+    {
+        boundaries.push_back(make_pair(weightOffset[i],
+                                       weightOffset[i+1] - 1));
+    }
+    boundaries.push_back(make_pair(weightOffset[numLayers - 1],
+                                   numConnections - 1));
+
+    return boundaries;
+}
+
 /*
 void NeuralNetwork::writeStatus(int element, int epoch)
 {
diff --git a/src/libnnp/NeuralNetwork.h b/src/libnnp/NeuralNetwork.h
index c56c58e9f..c9c499734 100644
--- a/src/libnnp/NeuralNetwork.h
+++ b/src/libnnp/NeuralNetwork.h
@@ -17,8 +17,10 @@
 #ifndef NEURALNETWORK_H
 #define NEURALNETWORK_H
 
+#include <cstddef>   // std::size_t
 #include <fstream>   // std::ofstream
 #include <string>    // std::string
+#include <utility>   // std::pair
 #include <vector>    // std::vector
 
 namespace nnp
@@ -333,6 +335,15 @@ class NeuralNetwork
      * Counters and summation variables for neuron statistics are reset.
      */
     void                     resetNeuronStatistics();
+    /** Get layer boundaries in combined weight vector (for Kalman filter
+     * decoupling setup).
+     *
+     * @return Vector with layer boundaries as pair (begin, end),
+     * (numLayers + 1) points.
+     */
+    std::vector<std::pair<
+    std::size_t,
+    std::size_t>>            getLayerBoundaries() const;
     //void   writeStatus(int, int);
     long                     getMemoryUsage();
     /** Print neural network architecture.
diff --git a/src/libnnptrain/Training.cpp b/src/libnnptrain/Training.cpp
index 106a9bb55..396465336 100644
--- a/src/libnnptrain/Training.cpp
+++ b/src/libnnptrain/Training.cpp
@@ -1066,6 +1066,26 @@ void Training::setupTraining()
                 }
                 else if (decouplingType == DT_LAYER)
                 {
+                    size_t index = 0;
+                    for (size_t i = 0; i < numElements; ++i)
+                    {
+                        for (auto b : elements.at(i).neuralNetwork
+                                      ->getLayerBoundaries())
+                        {
+                            log << strpr("%zu %zu\n", b.first, b.second);
+                            fill(groupMask.begin()
+                                    + weightsOffset.at(i) + b.first,
+                                 groupMask.begin()
+                                    + weightsOffset.at(i) + b.second,
+                                 index);
+                            index++;
+                        }
+                    }
+                    index = 0;
+                    for (auto m : groupMask)
+                    {
+                        log << strpr("%zu %zu\n", index, m);
+                    }
                 }
                 else if (decouplingType == DT_NODE)
                 {

From 33719c4dfa0625050dab1483924f5e4cb3e9f0d7 Mon Sep 17 00:00:00 2001
From: Andreas Singraber <andreas.singraber@gmx.at>
Date: Wed, 9 Sep 2020 00:30:30 +0200
Subject: [PATCH 5/9] All decoupling group masks look correct

---
 src/libnnp/NeuralNetwork.cpp | 34 ++++++++++++++++++++++++++---
 src/libnnp/NeuralNetwork.h   |  8 +++++++
 src/libnnptrain/Training.cpp | 42 +++++++++++++++++++++++++++++-------
 3 files changed, 73 insertions(+), 11 deletions(-)

diff --git a/src/libnnp/NeuralNetwork.cpp b/src/libnnp/NeuralNetwork.cpp
index be44be604..42d4ac390 100644
--- a/src/libnnp/NeuralNetwork.cpp
+++ b/src/libnnp/NeuralNetwork.cpp
@@ -21,6 +21,8 @@
 #include <cstdio>    // fprintf, stderr
 #include <cstdlib>   // exit, EXIT_FAILURE, rand, srand
 #include <limits>    // std::numeric_limits
+#include <numeric>   // std::iota
+#include <valarray>  // std::valarray, std::slice
 
 #define EXP_LIMIT 35.0
 
@@ -925,17 +927,43 @@ vector<pair<size_t, size_t>> NeuralNetwork::getLayerBoundaries() const
 {
     vector<pair<size_t, size_t>> boundaries;
 
-    for (int i = 0; i < numLayers - 1; ++i)
+    for (int i = 0; i < numLayers - 2; ++i)
     {
         boundaries.push_back(make_pair(weightOffset[i],
-                                       weightOffset[i+1] - 1));
+                                       weightOffset[i + 1] - 1));
     }
-    boundaries.push_back(make_pair(weightOffset[numLayers - 1],
+    boundaries.push_back(make_pair(weightOffset[numLayers - 2],
                                    numConnections - 1));
 
     return boundaries;
 }
 
+vector<vector<size_t>> NeuralNetwork::getNodeWeightIndices() const
+{
+    vector<vector<size_t>> indices;
+    valarray<size_t> wlist(numConnections);
+
+    // Fill array with increasing numbers from 0 to numConnections - 1.
+    iota(begin(wlist), end(wlist), 0);
+
+    for (int i = 1; i < numLayers; i++)
+    {
+        for (int j = 0; j < layers[i].numNeurons; j++)
+        {
+            indices.push_back(vector<size_t>());
+            // Use slice function to get the right weight indices.
+            valarray<size_t> v = wlist[slice(weightOffset[i - 1] + j,
+                                             layers[i].numNeuronsPrevLayer,
+                                             layers[i].numNeurons)];
+            indices.back().assign(begin(v), end(v));
+            // Add bias weight.
+            indices.back().push_back(biasOffset[i - 1] + j);
+        }
+    }
+
+    return indices;
+}
+
 /*
 void NeuralNetwork::writeStatus(int element, int epoch)
 {
diff --git a/src/libnnp/NeuralNetwork.h b/src/libnnp/NeuralNetwork.h
index c9c499734..61f30ce09 100644
--- a/src/libnnp/NeuralNetwork.h
+++ b/src/libnnp/NeuralNetwork.h
@@ -344,6 +344,14 @@ class NeuralNetwork
     std::vector<std::pair<
     std::size_t,
     std::size_t>>            getLayerBoundaries() const;
+    /** Get indices of weights associated with nodes (for Kalman filter
+     * decoupling setup).
+     *
+     * @return Vector of each nodes weight indices.
+     */
+    std::vector<
+    std::vector<
+    std::size_t>>            getNodeWeightIndices() const;
     //void   writeStatus(int, int);
     long                     getMemoryUsage();
     /** Print neural network architecture.
diff --git a/src/libnnptrain/Training.cpp b/src/libnnptrain/Training.cpp
index 396465336..2ed36e190 100644
--- a/src/libnnptrain/Training.cpp
+++ b/src/libnnptrain/Training.cpp
@@ -1052,8 +1052,13 @@ void Training::setupTraining()
                                                kalmanType));
                 KalmanFilter* u = dynamic_cast<KalmanFilter*>(updaters.back());
                 u->setupMPI(&comm);
-                vector<int> groupMask(weights.at(i).size(), 0);
-                if (decouplingType == DT_ELEMENT)
+                vector<int> groupMask(weights.at(i).size(),
+                                      numeric_limits<int>::max());
+                if (decouplingType == DT_GLOBAL)
+                {
+                    fill(groupMask.begin(), groupMask.end(), 0);
+                }
+                else if (decouplingType == DT_ELEMENT)
                 {
                     for (size_t i = 0; i < numElements; ++i)
                     {
@@ -1076,23 +1081,44 @@ void Training::setupTraining()
                             fill(groupMask.begin()
                                     + weightsOffset.at(i) + b.first,
                                  groupMask.begin()
-                                    + weightsOffset.at(i) + b.second,
+                                    + weightsOffset.at(i) + b.second + 1,
                                  index);
                             index++;
                         }
                     }
-                    index = 0;
-                    for (auto m : groupMask)
-                    {
-                        log << strpr("%zu %zu\n", index, m);
-                    }
                 }
                 else if (decouplingType == DT_NODE)
                 {
+                    size_t index = 0;
+                    for (size_t i = 0; i < numElements; ++i)
+                    {
+                        vector<vector<size_t>> v = elements.at(i)
+                            .neuralNetwork->getNodeWeightIndices();
+                        for (auto n : v)
+                        {
+                            for (auto w : n)
+                            {
+                                groupMask.at(weightsOffset.at(i) + w) = index;
+                            }
+                            index++;
+                        }
+                    }
                 }
                 else if (decouplingType == DT_FULL)
                 {
+                    size_t index = 0;
+                    for (auto& m : groupMask)
+                    {
+                        m = index;
+                        index++;
+                    }
                 }
+                //size_t index = 0;
+                //for (auto m : groupMask)
+                //{
+                //    log << strpr("%zu %zu\n", index, m);
+                //    index++;
+                //}
                 u->setupDecoupling(groupMask.data());
             }
             updaters.back()->setState(&(weights.at(i).front()));

From 5c540dba27cb1006bd87f2c286ff65bfbe49e812 Mon Sep 17 00:00:00 2001
From: Andreas Singraber <andreas.singraber@gmx.at>
Date: Tue, 15 Sep 2020 23:03:17 +0200
Subject: [PATCH 6/9] Introduced new NN weight memory layout

- Weights and biases grouped by neuron.
- Simplifies node decoupling implementation => block submatrices.
- Old weight ordering available via compile flag
(-DALTERNATIVE_WEIGHT_ORDERING)
- Weight file output remains unchanged.
---
 src/libnnp/Mode.cpp              |   4 +-
 src/libnnp/NeuralNetwork.cpp     | 275 +++++++++++++++++++++++++++++--
 src/libnnp/NeuralNetwork.h       |  98 +++++++++--
 src/libnnptrain/KalmanFilter.cpp |  27 +++
 src/libnnptrain/KalmanFilter.h   |   6 +
 src/libnnptrain/Training.cpp     |  73 ++++++--
 src/makefile.gnu                 |   6 +-
 src/makefile.intel               |   3 +
 src/makefile.llvm                |   3 +
 9 files changed, 443 insertions(+), 52 deletions(-)

diff --git a/src/libnnp/Mode.cpp b/src/libnnp/Mode.cpp
index 475ec8ff3..bd7359734 100644
--- a/src/libnnp/Mode.cpp
+++ b/src/libnnp/Mode.cpp
@@ -884,7 +884,9 @@ void Mode::setupNeuralNetworkWeights(string const& fileNameFormat)
                 weights.push_back(atof(splitLine.at(0).c_str()));
             }
         }
-        it->neuralNetwork->setConnections(&(weights.front()));
+        // Attention: need alternative (old) ordering scheme here for
+        // backward compatibility!
+        it->neuralNetwork->setConnectionsAO(&(weights.front()));
         file.close();
     }
 
diff --git a/src/libnnp/NeuralNetwork.cpp b/src/libnnp/NeuralNetwork.cpp
index 42d4ac390..748835b4f 100644
--- a/src/libnnp/NeuralNetwork.cpp
+++ b/src/libnnp/NeuralNetwork.cpp
@@ -82,12 +82,26 @@ NeuralNetwork(int                              numLayers,
         weightOffset[i] = weightOffset[i-1] +
                           (layers[i-1].numNeurons + 1) * layers[i].numNeurons;
     }
+#ifndef ALTERNATIVE_WEIGHT_ORDERING
+    biasOffset = new int*[numLayers-1];
+    for (int i = 0; i < numLayers-1; i++)
+    {
+        biasOffset[i] = new int[layers[i+1].numNeurons];
+        for (int j = 0; j < layers[i+1].numNeurons; j++)
+        {
+            biasOffset[i][j] = weightOffset[i]
+                             + (layers[i+1].numNeuronsPrevLayer + 1)
+                             * (j + 1) - 1;
+        }
+    }
+#else
     biasOffset = new int[numLayers-1];
     for (int i = 0; i < numLayers-1; i++)
     {
         biasOffset[i] = weightOffset[i] +
                         layers[i+1].numNeurons * layers[i].numNeurons;
     }
+#endif
     biasOnlyOffset = new int[numLayers-1];
     biasOnlyOffset[0] = 0;
     for (int i = 1; i < numLayers-1; i++)
@@ -108,7 +122,15 @@ NeuralNetwork::~NeuralNetwork()
     }
     delete[] layers;
     delete[] weightOffset;
+#ifndef ALTERNATIVE_WEIGHT_ORDERING
+    for (int i = 0; i < numLayers - 1; i++)
+    {
+        delete[] biasOffset[i];
+    }
+    delete[] biasOffset;
+#else
     delete[] biasOffset;
+#endif
     delete[] biasOnlyOffset;
 }
 
@@ -150,6 +172,27 @@ void NeuralNetwork::setConnections(double const* const& connections)
 {
     int count = 0;
 
+    for (int i = 1; i < numLayers; i++)
+    {
+        for (int j = 0; j < layers[i].numNeurons; j++)
+        {
+            for (int k = 0; k < layers[i].numNeuronsPrevLayer; k++)
+            {
+                layers[i].neurons[j].weights[k] = connections[count];
+                count++;
+            }
+            layers[i].neurons[j].bias = connections[count];
+            count++;
+        }
+    }
+
+    return;
+}
+
+void NeuralNetwork::setConnectionsAO(double const* const& connections)
+{
+    int count = 0;
+
     for (int i = 1; i < numLayers; i++)
     {
         for (int j = 0; j < layers[i].numNeuronsPrevLayer; j++)
@@ -174,6 +217,27 @@ void NeuralNetwork::getConnections(double* connections) const
 {
     int count = 0;
 
+    for (int i = 1; i < numLayers; i++)
+    {
+        for (int j = 0; j < layers[i].numNeurons; j++)
+        {
+            for (int k = 0; k < layers[i].numNeuronsPrevLayer; k++)
+            {
+                connections[count] = layers[i].neurons[j].weights[k] ;
+                count++;
+            }
+            connections[count] = layers[i].neurons[j].bias;
+            count++;
+        }
+    }
+
+    return;
+}
+
+void NeuralNetwork::getConnectionsAO(double* connections) const
+{
+    int count = 0;
+
     for (int i = 1; i < numLayers; i++)
     {
         for (int j = 0; j < layers[i].numNeuronsPrevLayer; j++)
@@ -430,6 +494,52 @@ void NeuralNetwork::calculateDEdG(double *dEdG) const
 
 void NeuralNetwork::calculateDEdc(double* dEdc) const
 {
+#ifndef ALTERNATIVE_WEIGHT_ORDERING
+    int count = 0;
+
+    for (int i = 0; i < numConnections; i++)
+    {
+        dEdc[i] = 0.0;
+    }
+
+    for (int i = 0; i < outputLayer->numNeurons; i++)
+    {
+        dEdc[biasOffset[numLayers-2][i]] = outputLayer->neurons[i].dfdx;
+        if (normalizeNeurons)
+        {
+            dEdc[biasOffset[numLayers-2][i]] /=
+                outputLayer->numNeuronsPrevLayer;
+        }
+    }
+
+    for (int i = numLayers - 2; i >= 0; i--)
+    {
+        count = 0;
+        for (int j = 0; j < layers[i+1].numNeurons; j++)
+        {
+            for (int k = 0; k < layers[i].numNeurons; k++)
+            {
+                dEdc[weightOffset[i]+count] = dEdc[biasOffset[i][j]]
+                                            * layers[i].neurons[k].value;
+                count++;
+                if (i >= 1)
+                {
+                    dEdc[biasOffset[i-1][k]] += dEdc[biasOffset[i][j]]
+                        * layers[i+1].neurons[j].weights[k]
+                        * layers[i].neurons[k].dfdx;
+                }
+            }
+            count++;
+        }
+        if (normalizeNeurons && i >= 1)
+        {
+            for (int k = 0; k < layers[i].numNeurons; k++)
+            {
+                dEdc[biasOffset[i-1][k]] /= layers[i].numNeuronsPrevLayer;
+            }
+        }
+    }
+#else
     int count = 0;
 
     for (int i = 0; i < numConnections; i++)
@@ -470,6 +580,7 @@ void NeuralNetwork::calculateDEdc(double* dEdc) const
             }
         }
     }
+#endif
 
     return;
 }
@@ -514,6 +625,67 @@ void NeuralNetwork::calculateDFdc(double*              dFdc,
 }
 
 void NeuralNetwork::writeConnections(std::ofstream& file) const
+{
+    // File header.
+    vector<string> title;
+    vector<string> colName;
+    vector<string> colInfo;
+    vector<size_t> colSize;
+    title.push_back("Neural network connection values (weights and biases).");
+    colSize.push_back(24);
+    colName.push_back("connection");
+    colInfo.push_back("Neural network connection value.");
+    colSize.push_back(1);
+    colName.push_back("t");
+    colInfo.push_back("Connection type (a = weight, b = bias).");
+    colSize.push_back(9);
+    colName.push_back("index");
+    colInfo.push_back("Index enumerating weights.");
+    colSize.push_back(5);
+    colName.push_back("l_s");
+    colInfo.push_back("Starting point layer (end point layer for biases).");
+    colSize.push_back(5);
+    colName.push_back("n_s");
+    colInfo.push_back("Starting point neuron in starting layer (end point "
+                      "neuron for biases).");
+    colSize.push_back(5);
+    colName.push_back("l_e");
+    colInfo.push_back("End point layer.");
+    colSize.push_back(5);
+    colName.push_back("n_e");
+    colInfo.push_back("End point neuron in end layer.");
+    appendLinesToFile(file,
+                      createFileHeader(title, colSize, colName, colInfo));
+
+    int count = 0;
+    for (int i = 1; i < numLayers; i++)
+    {
+        for (int j = 0; j < layers[i].numNeurons; j++)
+        {
+            for (int k = 0; k < layers[i].numNeuronsPrevLayer; k++)
+            {
+                count++;
+                file << strpr("%24.16E a %9d %5d %5d %5d %5d\n",
+                              layers[i].neurons[j].weights[k],
+                              count,
+                              i - 1,
+                              k + 1,
+                              i,
+                              j + 1);
+            }
+            count++;
+            file << strpr("%24.16E b %9d %5d %5d\n",
+                          layers[i].neurons[j].bias,
+                          count,
+                          i,
+                          j + 1);
+        }
+    }
+
+    return;
+}
+
+void NeuralNetwork::writeConnectionsAO(std::ofstream& file) const
 {
     // File header.
     vector<string> title;
@@ -647,6 +819,67 @@ void NeuralNetwork::calculateD2EdGdc(int                  index,
                                      double const* const& dEdb,
                                      double*              d2EdGdc) const
 {
+#ifndef ALTERNATIVE_WEIGHT_ORDERING
+    int count = 0;
+
+    for (int i = 0; i < outputLayer->numNeurons; i++)
+    {
+        d2EdGdc[biasOffset[numLayers-2][i]] = outputLayer->neurons[i].d2fdx2
+                                            * outputLayer->neurons[i].dxdG;
+        if (normalizeNeurons)
+        {
+            d2EdGdc[biasOffset[numLayers-2][i]] /=
+                outputLayer->numNeuronsPrevLayer;
+        }
+    }
+
+    for (int i = numLayers-2; i >= 0; i--)
+    {
+        count = 0;
+        for (int j = 0; j < layers[i+1].numNeurons; j++)
+        {
+            for (int k = 0; k < layers[i].numNeurons; k++)
+            {
+                if (i == 0)
+                {
+                    d2EdGdc[weightOffset[i]+count] = d2EdGdc[biasOffset[i][j]]
+                        * layers[i].neurons[k].value;
+                    if (k == index)
+                    {
+                        d2EdGdc[weightOffset[i]+count] +=
+                            dEdb[biasOnlyOffset[i]+j];
+                    }
+                }
+                else
+                {
+                    d2EdGdc[weightOffset[i]+count] = d2EdGdc[biasOffset[i][j]]
+                        * layers[i].neurons[k].value
+                        + dEdb[biasOnlyOffset[i]+j] * layers[i].neurons[k].dfdx
+                        * layers[i].neurons[k].dxdG;
+                }
+                count++;
+                if (i >= 1)
+                {
+                    d2EdGdc[biasOffset[i-1][k]] +=
+                        layers[i+1].neurons[j].weights[k]
+                        * (d2EdGdc[biasOffset[i][j]]
+                        * layers[i].neurons[k].dfdx
+                        + dEdb[biasOnlyOffset[i]+j]
+                        * layers[i].neurons[k].d2fdx2
+                        * layers[i].neurons[k].dxdG);
+                }
+            }
+            count++;
+        }
+        for (int k = 0; k < layers[i].numNeurons; k++)
+        {
+            if (normalizeNeurons && i >= 1)
+            {
+                d2EdGdc[biasOffset[i-1][k]] /= layers[i].numNeuronsPrevLayer;
+            }
+        }
+    }
+#else
     int count = 0;
 
     for (int i = 0; i < outputLayer->numNeurons; i++)
@@ -701,6 +934,7 @@ void NeuralNetwork::calculateD2EdGdc(int                  index,
             }
         }
     }
+#endif
 
     return;
 }
@@ -938,30 +1172,29 @@ vector<pair<size_t, size_t>> NeuralNetwork::getLayerBoundaries() const
     return boundaries;
 }
 
-vector<vector<size_t>> NeuralNetwork::getNodeWeightIndices() const
+vector<pair<size_t, size_t>> NeuralNetwork::getNeuronBoundaries() const
 {
-    vector<vector<size_t>> indices;
-    valarray<size_t> wlist(numConnections);
-
-    // Fill array with increasing numbers from 0 to numConnections - 1.
-    iota(begin(wlist), end(wlist), 0);
+#ifndef ALTERNATIVE_WEIGHT_ORDERING
+    vector<pair<size_t, size_t>> boundaries;
 
-    for (int i = 1; i < numLayers; i++)
+    for (int i = 0; i < numLayers - 1; ++i)
     {
-        for (int j = 0; j < layers[i].numNeurons; j++)
+        boundaries.push_back(make_pair(weightOffset[i],
+                                       biasOffset[i][0]));
+        for (int j = 0; j < layers[i+1].numNeurons - 1; ++j)
         {
-            indices.push_back(vector<size_t>());
-            // Use slice function to get the right weight indices.
-            valarray<size_t> v = wlist[slice(weightOffset[i - 1] + j,
-                                             layers[i].numNeuronsPrevLayer,
-                                             layers[i].numNeurons)];
-            indices.back().assign(begin(v), end(v));
-            // Add bias weight.
-            indices.back().push_back(biasOffset[i - 1] + j);
+            boundaries.push_back(make_pair(biasOffset[i][j] + 1,
+                                           biasOffset[i][j+1]));
+
         }
     }
+#else
+    throw runtime_error("ERROR: Neuron boundaries not available with "
+                        "alternative (old) weight memory layout, recompile "
+                        "without -DALTERNATIVE_WEIGHT_ORDERING.\n");
+#endif
 
-    return indices;
+    return boundaries;
 }
 
 /*
@@ -1012,7 +1245,15 @@ long NeuralNetwork::getMemoryUsage()
     int  numNeurons = getNumNeurons();
 
     mem += (numLayers - 1) * sizeof(int); // weightOffset
+#ifndef ALTERNATIVE_WEIGHT_ORDERING
+    mem += (numLayers - 1) * sizeof(int*); // biasOffset
+    for (int i = 0; i < numLayers - 1; i++)
+    {
+        mem += layers[i].numNeurons * sizeof(int);
+    }
+#else
     mem += (numLayers - 1) * sizeof(int); // biasOffset
+#endif
     mem += (numLayers - 1) * sizeof(int); // biasOnlyOffset
     mem += numLayers  * sizeof(Layer);    // layers
     mem += numNeurons * sizeof(Neuron);   // neurons
diff --git a/src/libnnp/NeuralNetwork.h b/src/libnnp/NeuralNetwork.h
index 61f30ce09..1e5284e3d 100644
--- a/src/libnnp/NeuralNetwork.h
+++ b/src/libnnp/NeuralNetwork.h
@@ -154,29 +154,70 @@ class NeuralNetwork
      *            @f[
      *              \underbrace{
      *                \overbrace{
-     *                  a^{01}_{00}, \ldots, a^{01}_{0m_1},
-     *                  a^{01}_{10}, \ldots, a^{01}_{1m_1},
+     *                  a^{01}_{00}, \ldots, a^{01}_{n_00}, b^{1}_{0}
+     *                }^{\text{Neuron}^{1}_{0}}
+     *                \overbrace{
+     *                  a^{01}_{01}, \ldots, a^{01}_{n_01}, b^{1}_{1}
+     *                }^{\text{Neuron}^{1}_{1}}
      *                  \ldots,
-     *                  a^{01}_{n_00}, \ldots, a^{01}_{n_0m_1},
+     *                \overbrace{
+     *                  a^{01}_{0n_1}, \ldots, a^{01}_{n_0n_1}, b^{1}_{n_1}
+     *                }^{\text{Neuron}^{1}_{n_1}}
+     *              }_{\text{Layer } 0 \rightarrow 1},
+     *              \underbrace{
+     *                a^{12}_{00}, \ldots, b^{2}_{n_2}
+     *              }_{\text{Layer } 1 \rightarrow 2},
+     *              \ldots,
+     *              \underbrace{
+     *                a^{p-1,p}_{00}, \ldots, b^{p}_{n_p}
+     *              }_{\text{Layer } p-1 \rightarrow p}
+     *            @f]
+     *            where @f$a^{i-1, i}_{jk}@f$ is the weight connecting neuron
+     *            @f$j@f$ in layer @f$i-1@f$ to neuron @f$k@f$ in layer
+     *            @f$i@f$ and @f$b^{i}_{k}@f$ is the bias assigned to neuron
+     *            @f$k@f$ in layer @f$i@f$.
+     *
+     *  This ordering scheme is used internally to store weights in memory.
+     *  Because all weights and biases connected to a neuron are aligned in a
+     *  continuous block this memory layout simplifies the implementation of
+     *  node-decoupled Kalman filter training (NDEKF). However, for backward
+     *  compatibility reasons this layout is NOT used for storing weights on
+     *  disk (see setConnectionsAO()).
+     */
+    void                     setConnections(double const* const& connections);
+    /** Set neural network weights and biases (alternative ordering)
+     *
+     * @param[in] connections One-dimensional array with neural network
+     *                        connections in the following order:
+     *            @f[
+     *              \underbrace{
+     *                \overbrace{
+     *                  a^{01}_{00}, \ldots, a^{01}_{0n_1},
+     *                  a^{01}_{10}, \ldots, a^{01}_{1n_1},
+     *                  \ldots,
+     *                  a^{01}_{n_00}, \ldots, a^{01}_{n_0n_1},
      *                }^{\text{Weights}}
      *                \overbrace{
-     *                  b^{1}_{0}, \ldots, b^{1}_{m_1}
+     *                  b^{1}_{0}, \ldots, b^{1}_{n_1}
      *                }^{\text{Biases}}
      *              }_{\text{Layer } 0 \rightarrow 1},
      *              \underbrace{
-     *                a^{12}_{00}, \ldots, b^{2}_{m_2}
+     *                a^{12}_{00}, \ldots, b^{2}_{n_2}
      *              }_{\text{Layer } 1 \rightarrow 2},
      *              \ldots,
      *              \underbrace{
-     *                a^{p-1,p}_{00}, \ldots, b^{p}_{m_p}
+     *                a^{p-1,p}_{00}, \ldots, b^{p}_{n_p}
      *              }_{\text{Layer } p-1 \rightarrow p}
      *            @f]
      *            where @f$a^{i-1, i}_{jk}@f$ is the weight connecting neuron
      *            @f$j@f$ in layer @f$i-1@f$ to neuron @f$k@f$ in layer
      *            @f$i@f$ and @f$b^{i}_{k}@f$ is the bias assigned to neuron
      *            @f$k@f$ in layer @f$i@f$.
+     *
+     *  This memory layout is used when storing weights on disk.
      */
-    void                     setConnections(double const* const& connections);
+    void                     setConnectionsAO(
+                                             double const* const& connections);
     /** Get neural network weights and biases.
      *
      * @param[out] connections One-dimensional array with neural network
@@ -184,6 +225,13 @@ class NeuralNetwork
      *                         #setConnections())
      */
     void                     getConnections(double* connections) const;
+    /** Get neural network weights and biases (alternative ordering).
+     *
+     * @param[out] connections One-dimensional array with neural network
+     *                         connections (same order as described in
+     *                         #setConnectionsAO())
+     */
+    void                     getConnectionsAO(double* connections) const;
     /** Initialize connections with random numbers.
      *
      * @param[in] seed Random number generator seed.
@@ -213,14 +261,14 @@ class NeuralNetwork
                                          ModificationScheme modificationScheme,
                                          double             parameter1,
                                          double             parameter2);
-    /** Set neural network input layer node values.
+    /** Set neural network input layer neuron values.
      *
-     * @param[in] input Input layer node values.
+     * @param[in] input Input layer neuron values.
      */
     void                     setInput(double const* const& input) const;
-    /** Get neural network output layer node values.
+    /** Get neural network output layer neuron values.
      *
-     * @param[out] output Output layer node values.
+     * @param[out] output Output layer neuron values.
      */
     void                     getOutput(double* output) const;
     /** Propagate input information through all layers.
@@ -297,10 +345,20 @@ class NeuralNetwork
     void                     calculateDFdc(double*              dFdc,
                                            double const* const& dGdxyz) const;
     /** Write connections to file.
+     *
+     * __CAUTION__: For compatibility reasons this format is NOT used for
+     * storing NN weights to disk!
      *
      * @param[in,out] file File stream to write to.
      */
     void                     writeConnections(std::ofstream& file) const;
+    /** Write connections to file (alternative ordering).
+     *
+     * This NN weights ordering layout is used when writing weights to disk.
+     *
+     * @param[in,out] file File stream to write to.
+     */
+    void                     writeConnectionsAO(std::ofstream& file) const;
     /** Return gathered neuron statistics.
      *
      * @param[out] count Number of neuron output value calculations.
@@ -339,19 +397,20 @@ class NeuralNetwork
      * decoupling setup).
      *
      * @return Vector with layer boundaries as pair (begin, end),
-     * (numLayers + 1) points.
+     * (numLayers - 1) points.
      */
     std::vector<std::pair<
     std::size_t,
     std::size_t>>            getLayerBoundaries() const;
-    /** Get indices of weights associated with nodes (for Kalman filter
+    /** Get neuron boundaries in combined weight vector (for Kalman filter
      * decoupling setup).
      *
-     * @return Vector of each nodes weight indices.
+     * @return Vector with neuron boundaries as pair (begin, end),
+     * (numNeurons - inputLayer->numNeurons) points.
      */
-    std::vector<
-    std::vector<
-    std::size_t>>            getNodeWeightIndices() const;
+    std::vector<std::pair<
+    std::size_t,
+    std::size_t>>            getNeuronBoundaries() const;
     //void   writeStatus(int, int);
     long                     getMemoryUsage();
     /** Print neural network architecture.
@@ -423,8 +482,13 @@ class NeuralNetwork
     int    numHiddenLayers;
     /// Offset adress of weights per layer in combined weights+bias array.
     int*   weightOffset;
+#ifndef ALTERNATIVE_WEIGHT_ORDERING
+    /// Offset adress of each neuron per layer in combined weights+bias array.
+    int**  biasOffset;
+#else
     /// Offset adress of biases per layer in combined weights+bias array.
     int*   biasOffset;
+#endif
     /// Offset adress of biases per layer in bias only array.
     int*   biasOnlyOffset;
     /// Pointer to input layer.
diff --git a/src/libnnptrain/KalmanFilter.cpp b/src/libnnptrain/KalmanFilter.cpp
index 48c7f4101..345116072 100644
--- a/src/libnnptrain/KalmanFilter.cpp
+++ b/src/libnnptrain/KalmanFilter.cpp
@@ -108,6 +108,26 @@ void KalmanFilter::setupDecoupling(int const* const mask)
         }
     }
 
+    numGroupsPerProc.resize(numProcs, 0);
+    groupOffsetPerProc.resize(numProcs, 0);
+    int quotient = groupMap.size() / numProcs;
+    int remainder = groupMap.size() % numProcs;
+    int groupSum = 0;
+    for (int i = 0; i < numProcs; i++)
+    {
+        numGroupsPerProc.at(i) = quotient;
+        if (remainder > 0 && i < remainder)
+        {
+            numGroupsPerProc.at(i)++;
+        }
+        groupOffsetPerProc.at(i) = groupSum;
+        groupSum += numGroupsPerProc.at(i);
+    }
+    for (size_t i = 0; i < numGroupsPerProc.at(myRank); ++i)
+    {
+        myGroups.push_back(groupOffsetPerProc.at(myRank) + i);
+    }
+
     return;
 }
 
@@ -386,6 +406,13 @@ vector<string> KalmanFilter::info() const
         v.push_back(strpr(" - group %5zu size: %zu\n",
                           i, groupMap.at(i).size()));
     }
+    v.push_back(strpr("Number of decoupling groups of proc %d: %zu\n",
+                      myRank, myGroups.size()));
+    for (size_t i = 0; i < myGroups.size(); ++i)
+    {
+        v.push_back(strpr(" - group %5zu size: %zu\n",
+                          i, groupMap.at(myGroups.at(i)).size()));
+    }
 
     return v;
 }
diff --git a/src/libnnptrain/KalmanFilter.h b/src/libnnptrain/KalmanFilter.h
index 003445649..e1c1f2723 100644
--- a/src/libnnptrain/KalmanFilter.h
+++ b/src/libnnptrain/KalmanFilter.h
@@ -236,6 +236,12 @@ class KalmanFilter : public Updater
     double                             nu;
     /// Forgetting gain factor gamma for fading memory Kalman filter.
     double                             gamma;
+    /// List of group indices of this processor.
+    std::vector<int>                   myGroups;
+    /// Number of groups for each processor.
+    std::vector<std::size_t>           numGroupsPerProc;
+    /// Offset in terms of groups for each processor.
+    std::vector<std::size_t>           groupOffsetPerProc;
     /// Decoupling group map.
     std::map<int, std::vector<int>>    groupMap;
     /// Decoupling group mask vector.
diff --git a/src/libnnptrain/Training.cpp b/src/libnnptrain/Training.cpp
index 2ed36e190..80feeb754 100644
--- a/src/libnnptrain/Training.cpp
+++ b/src/libnnptrain/Training.cpp
@@ -301,7 +301,12 @@ void Training::initializeWeights()
                 w.at(i).at(j) = minWeights + gsl_rng_uniform(rngGlobal)
                               * (maxWeights - minWeights);
             }
-            elements.at(i).neuralNetwork->setConnections(&(w.at(i).front()));
+#ifndef ALTERNATIVE_WEIGHT_ORDERING
+            // FIX!!!
+            elements.at(i).neuralNetwork->setConnectionsAO(&(w.at(i).front()));
+#else
+            elements.at(i).neuralNetwork->setConnectionsAO(&(w.at(i).front()));
+#endif
         }
         if (settings.keywordExists("nguyen_widrow_weights_short"))
         {
@@ -1077,7 +1082,6 @@ void Training::setupTraining()
                         for (auto b : elements.at(i).neuralNetwork
                                       ->getLayerBoundaries())
                         {
-                            log << strpr("%zu %zu\n", b.first, b.second);
                             fill(groupMask.begin()
                                     + weightsOffset.at(i) + b.first,
                                  groupMask.begin()
@@ -1089,20 +1093,29 @@ void Training::setupTraining()
                 }
                 else if (decouplingType == DT_NODE)
                 {
+#ifndef ALTERNATIVE_WEIGHT_ORDERING
                     size_t index = 0;
                     for (size_t i = 0; i < numElements; ++i)
                     {
-                        vector<vector<size_t>> v = elements.at(i)
-                            .neuralNetwork->getNodeWeightIndices();
-                        for (auto n : v)
+                        for (auto b : elements.at(i).neuralNetwork
+                                      ->getNeuronBoundaries())
                         {
-                            for (auto w : n)
-                            {
-                                groupMask.at(weightsOffset.at(i) + w) = index;
-                            }
+                            log << strpr("%zu %zu %zu\n", i, b.first, b.second);
+                            fill(groupMask.begin()
+                                    + weightsOffset.at(i) + b.first,
+                                 groupMask.begin()
+                                    + weightsOffset.at(i) + b.second + 1,
+                                 index);
                             index++;
                         }
                     }
+#else
+                    throw runtime_error("ERROR: Node-decoupled Kalman filter "
+                                        "(NDEFK) training not possible with "
+                                        "alternative (old) weight memory "
+                                        "layout, recompile without "
+                                        "-DALTERNATIVE_WEIGHT_ORDERING.\n");
+#endif
                 }
                 else if (decouplingType == DT_FULL)
                 {
@@ -1113,12 +1126,6 @@ void Training::setupTraining()
                         index++;
                     }
                 }
-                //size_t index = 0;
-                //for (auto m : groupMask)
-                //{
-                //    log << strpr("%zu %zu\n", index, m);
-                //    index++;
-                //}
                 u->setupDecoupling(groupMask.data());
             }
             updaters.back()->setState(&(weights.at(i).front()));
@@ -1564,7 +1571,9 @@ void Training::writeWeights(string const fileNameFormat) const
         string fileName = strpr(fileNameFormat.c_str(),
                                 elements.at(i).getAtomicNumber());
         file.open(fileName.c_str());
-        elements.at(i).neuralNetwork->writeConnections(file);
+        // Attention: need alternative (old) ordering scheme here for
+        // backward compatibility!
+        elements.at(i).neuralNetwork->writeConnectionsAO(file);
         file.close();
     }
 
@@ -2392,6 +2401,22 @@ void Training::update(bool force)
             else
             {
                 nn->calculateDEdc(&(dXdc.at(i).front()));
+                //vector<double> temp(dXdc.at(i).size(), 0.0);
+                //vector<double> tempAO(dXdc.at(i).size(), 0.0);
+                //nn->calculateDEdc(&(temp.front()));
+                //ofstream tf("dedc.out");
+                //size_t count;
+                //count = 0;
+                //sort(temp.begin(), temp.end());
+                //for (auto i : temp) tf << strpr("%zu %16.8E\n", count++, i);
+                //tf.close();
+                //nn->calculateDEdcAO(&(tempAO.front()));
+                //tf.open("dedc.out.ao");
+                //count = 0;
+                //sort(tempAO.begin(), tempAO.end());
+                //for (auto i : tempAO) tf << strpr("%zu %16.8E\n", count++, i);
+                //tf.close();
+                //throw runtime_error("END HERE");
             }
             // Finally sum up Jacobian.
             if (updateStrategy == US_ELEMENT) iu = i;
@@ -2827,7 +2852,11 @@ void Training::getWeights()
         for (size_t i = 0; i < numElements; ++i)
         {
             NeuralNetwork const* const& nn = elements.at(i).neuralNetwork;
+#ifndef ALTERNATIVE_WEIGHT_ORDERING
             nn->getConnections(&(weights.at(0).at(pos)));
+#else
+            nn->getConnectionsAO(&(weights.at(0).at(pos)));
+#endif
             pos += nn->getNumConnections();
         }
     }
@@ -2836,7 +2865,11 @@ void Training::getWeights()
         for (size_t i = 0; i < numElements; ++i)
         {
             NeuralNetwork const* const& nn = elements.at(i).neuralNetwork;
+#ifndef ALTERNATIVE_WEIGHT_ORDERING
             nn->getConnections(&(weights.at(i).front()));
+#else
+            nn->getConnectionsAO(&(weights.at(i).front()));
+#endif
         }
     }
 
@@ -2851,7 +2884,11 @@ void Training::setWeights()
         for (size_t i = 0; i < numElements; ++i)
         {
             NeuralNetwork* const& nn = elements.at(i).neuralNetwork;
+#ifndef ALTERNATIVE_WEIGHT_ORDERING
             nn->setConnections(&(weights.at(0).at(pos)));
+#else
+            nn->setConnectionsAO(&(weights.at(0).at(pos)));
+#endif
             pos += nn->getNumConnections();
         }
     }
@@ -2860,7 +2897,11 @@ void Training::setWeights()
         for (size_t i = 0; i < numElements; ++i)
         {
             NeuralNetwork* const& nn = elements.at(i).neuralNetwork;
+#ifndef ALTERNATIVE_WEIGHT_ORDERING
             nn->setConnections(&(weights.at(i).front()));
+#else
+            nn->setConnectionsAO(&(weights.at(i).front()));
+#endif
         }
     }
 
diff --git a/src/makefile.gnu b/src/makefile.gnu
index 223a03485..f35aeb0b6 100644
--- a/src/makefile.gnu
+++ b/src/makefile.gnu
@@ -14,7 +14,8 @@ PROJECT_EIGEN=/usr/include/eigen3/
 PROJECT_CC=g++
 PROJECT_MPICC=mpic++
 # OpenMP parallelization is disabled by default, add flag "-fopenmp" to enable.
-PROJECT_CFLAGS=-O3 -march=native -std=c++11
+PROJECT_CFLAGS=-O0
+#3 -march=native -std=c++11
 PROJECT_CFLAGS_MPI=-Wno-long-long
 PROJECT_DEBUG=-g -pedantic-errors -Wall -Wextra
 PROJECT_TEST=--coverage -fno-default-inline -fno-inline -fno-inline-small-functions -fno-elide-constructors
@@ -56,3 +57,6 @@ PROJECT_OPTIONS+= -DEIGEN_DONT_PARALLELIZE
 
 # Use improved memory layout for symmetry function derivatives.
 PROJECT_OPTIONS+= -DIMPROVED_SFD_MEMORY
+
+# Use alternative (old) ordering scheme of weights (NDEKF not possible).
+#PROJECT_OPTIONS+= -DALTERNATIVE_WEIGHT_ORDERING
diff --git a/src/makefile.intel b/src/makefile.intel
index 17e61b7a0..4a9a0e94c 100644
--- a/src/makefile.intel
+++ b/src/makefile.intel
@@ -57,3 +57,6 @@ PROJECT_OPTIONS+= -DEIGEN_DONT_PARALLELIZE
 
 # Use improved memory layout for symmetry function derivatives.
 PROJECT_OPTIONS+= -DIMPROVED_SFD_MEMORY
+
+# Use alternative (old) ordering scheme of weights (NDEKF not possible).
+#PROJECT_OPTIONS+= -DALTERNATIVE_WEIGHT_ORDERING
diff --git a/src/makefile.llvm b/src/makefile.llvm
index b748ce08b..30deb9a03 100644
--- a/src/makefile.llvm
+++ b/src/makefile.llvm
@@ -58,3 +58,6 @@ PROJECT_OPTIONS+= -DEIGEN_DONT_PARALLELIZE
 
 # Use improved memory layout for symmetry function derivatives.
 PROJECT_OPTIONS+= -DIMPROVED_SFD_MEMORY
+
+# Use alternative (old) ordering scheme of weights (NDEKF not possible).
+#PROJECT_OPTIONS+= -DALTERNATIVE_WEIGHT_ORDERING

From 0235ce50e739559d283168302a0cbcebe3d47891 Mon Sep 17 00:00:00 2001
From: Andreas Singraber <andreas.singraber@gmx.at>
Date: Wed, 16 Sep 2020 17:11:35 +0200
Subject: [PATCH 7/9] Changed decoupling setup

- Works now with limits instead of a group mask.
- Groups are assumed to be contiguous.
- Implemented rank 0 only update via new communicator.
---
 src/libnnp/NeuralNetwork.cpp     |  23 ++---
 src/libnnp/NeuralNetwork.h       |  12 +--
 src/libnnptrain/KalmanFilter.cpp | 159 ++++++++++++++++++++++---------
 src/libnnptrain/KalmanFilter.h   |  32 ++++---
 src/libnnptrain/Training.cpp     |  95 +++++++++---------
 test/cpp/Example_nnp_train.h     |  14 +--
 6 files changed, 205 insertions(+), 130 deletions(-)

diff --git a/src/libnnp/NeuralNetwork.cpp b/src/libnnp/NeuralNetwork.cpp
index 748835b4f..ea812a336 100644
--- a/src/libnnp/NeuralNetwork.cpp
+++ b/src/libnnp/NeuralNetwork.cpp
@@ -21,8 +21,6 @@
 #include <cstdio>    // fprintf, stderr
 #include <cstdlib>   // exit, EXIT_FAILURE, rand, srand
 #include <limits>    // std::numeric_limits
-#include <numeric>   // std::iota
-#include <valarray>  // std::valarray, std::slice
 
 #define EXP_LIMIT 35.0
 
@@ -1157,33 +1155,32 @@ void NeuralNetwork::getNeuronStatistics(long*   count,
     return;
 }
 
-vector<pair<size_t, size_t>> NeuralNetwork::getLayerBoundaries() const
+vector<pair<size_t, size_t>> NeuralNetwork::getLayerLimits() const
 {
-    vector<pair<size_t, size_t>> boundaries;
+    vector<pair<size_t, size_t>> limits;
 
     for (int i = 0; i < numLayers - 2; ++i)
     {
-        boundaries.push_back(make_pair(weightOffset[i],
+        limits.push_back(make_pair(weightOffset[i],
                                        weightOffset[i + 1] - 1));
     }
-    boundaries.push_back(make_pair(weightOffset[numLayers - 2],
+    limits.push_back(make_pair(weightOffset[numLayers - 2],
                                    numConnections - 1));
 
-    return boundaries;
+    return limits;
 }
 
-vector<pair<size_t, size_t>> NeuralNetwork::getNeuronBoundaries() const
+vector<pair<size_t, size_t>> NeuralNetwork::getNeuronLimits() const
 {
+    vector<pair<size_t, size_t>> limits;
 #ifndef ALTERNATIVE_WEIGHT_ORDERING
-    vector<pair<size_t, size_t>> boundaries;
-
     for (int i = 0; i < numLayers - 1; ++i)
     {
-        boundaries.push_back(make_pair(weightOffset[i],
+        limits.push_back(make_pair(weightOffset[i],
                                        biasOffset[i][0]));
         for (int j = 0; j < layers[i+1].numNeurons - 1; ++j)
         {
-            boundaries.push_back(make_pair(biasOffset[i][j] + 1,
+            limits.push_back(make_pair(biasOffset[i][j] + 1,
                                            biasOffset[i][j+1]));
 
         }
@@ -1194,7 +1191,7 @@ vector<pair<size_t, size_t>> NeuralNetwork::getNeuronBoundaries() const
                         "without -DALTERNATIVE_WEIGHT_ORDERING.\n");
 #endif
 
-    return boundaries;
+    return limits;
 }
 
 /*
diff --git a/src/libnnp/NeuralNetwork.h b/src/libnnp/NeuralNetwork.h
index 1e5284e3d..8cbef0cfd 100644
--- a/src/libnnp/NeuralNetwork.h
+++ b/src/libnnp/NeuralNetwork.h
@@ -393,24 +393,24 @@ class NeuralNetwork
      * Counters and summation variables for neuron statistics are reset.
      */
     void                     resetNeuronStatistics();
-    /** Get layer boundaries in combined weight vector (for Kalman filter
+    /** Get layer limits in combined weight vector (for Kalman filter
      * decoupling setup).
      *
-     * @return Vector with layer boundaries as pair (begin, end),
+     * @return Vector with layer limits as pair (begin, end),
      * (numLayers - 1) points.
      */
     std::vector<std::pair<
     std::size_t,
-    std::size_t>>            getLayerBoundaries() const;
-    /** Get neuron boundaries in combined weight vector (for Kalman filter
+    std::size_t>>            getLayerLimits() const;
+    /** Get neuron limits in combined weight vector (for Kalman filter
      * decoupling setup).
      *
-     * @return Vector with neuron boundaries as pair (begin, end),
+     * @return Vector with neuron limits as pair (begin, end),
      * (numNeurons - inputLayer->numNeurons) points.
      */
     std::vector<std::pair<
     std::size_t,
-    std::size_t>>            getNeuronBoundaries() const;
+    std::size_t>>            getNeuronLimits() const;
     //void   writeStatus(int, int);
     long                     getMemoryUsage();
     /** Print neural network architecture.
diff --git a/src/libnnptrain/KalmanFilter.cpp b/src/libnnptrain/KalmanFilter.cpp
index 345116072..221f82806 100644
--- a/src/libnnptrain/KalmanFilter.cpp
+++ b/src/libnnptrain/KalmanFilter.cpp
@@ -16,6 +16,7 @@
 
 #include "KalmanFilter.h"
 #include "utility.h"
+#include "mpi-extra.h"
 #include <Eigen/LU>
 #include <iostream>
 #include <stdexcept>
@@ -32,6 +33,7 @@ KalmanFilter::KalmanFilter(size_t const sizeState,
     numProcs       (0   ),
     sizeObservation(0   ),
     numUpdates     (0   ),
+    sizeP          (0   ),
     epsilon        (0.0 ),
     q              (0.0 ),
     q0             (0.0 ),
@@ -65,14 +67,6 @@ KalmanFilter::KalmanFilter(size_t const sizeState,
     w  = new Map<VectorXd      >(0, sizeState);
     xi = new Map<VectorXd const>(0, sizeObservation);
     H  = new Map<MatrixXd const>(0, sizeState, sizeObservation);
-    P.resize(sizeState, sizeState);
-    P.setIdentity();
-    // Prevent problems with unallocated K when log starts.
-    K.resize(sizeState, sizeObservation);
-    K.setZero();
-
-    // Set default decoupling group mask.
-    groupMask = new Map<VectorXi const>(0, sizeState);
 }
 
 KalmanFilter::~KalmanFilter()
@@ -88,30 +82,59 @@ void KalmanFilter::setupMPI(MPI_Comm* communicator)
     return;
 }
 
-void KalmanFilter::setupDecoupling(int const* const mask)
+void KalmanFilter::setupDecoupling(vector<pair<size_t, size_t>> groupLimits)
 {
-    new (groupMask) Map<VectorXi const>(mask, sizeState);
+    this->groupLimits = groupLimits;
 
-    for (long i = 0; i < groupMask->size(); ++i)
+    // Check consistency of decoupling group limits.
+    if (groupLimits.at(0).first != 0)
     {
-        groupMap[(*groupMask)(i)].push_back(i);
-    }
+        auto const& l = groupLimits.at(0);
+        throw runtime_error(strpr("ERROR: Inconsistent decoupling group "
+                                  "limits, first group must start with index "
+                                  "0 (is %zu).\n",
+                                  l.first));
 
-    // Check consistency of group map.
-    for (size_t i = 0; i < groupMap.size(); ++i)
+    }
+    for (size_t i = 0; i < groupLimits.size(); ++i)
     {
-        if (groupMap.find(i) == groupMap.end())
+        auto const& l = groupLimits.at(i);
+        if (l.first > l.second)
         {
-            throw runtime_error(strpr("ERROR: Kalman filter decoupling group "
-                                      "mask is inconsistent, group %zu not "
-                                      "present.\n", i));
+            throw runtime_error(
+                strpr("ERROR: Inconsistent decoupling group limits, "
+                      "group %zu: start %zu > end %zu.\n",
+                      i, l.first, l.second));
         }
     }
+    for (size_t i = 0; i < groupLimits.size() - 1; ++i)
+    {
+        auto const& l = groupLimits.at(i);
+        auto const& r = groupLimits.at(i + 1);
+        if (l.second + 1 != r.first)
+        {
+            throw runtime_error(
+                strpr("ERROR: Inconsistent decoupling group limits, "
+                      "group %zu end %zu + 1 != group %zu start %zu.\n",
+                      i, l.second, i + 1, r.first));
+        }
+    }
+    if (groupLimits.back().second != sizeState - 1)
+    {
+        auto const& l = groupLimits.back();
+        throw runtime_error(strpr("ERROR: Inconsistent decoupling group "
+                                  "limits, last group must end with index "
+                                  "%zu (is %zu).\n",
+                                  sizeState - 1,
+                                  l.second));
+
+    }
 
+    // Distribute groups to MPI procs.
     numGroupsPerProc.resize(numProcs, 0);
     groupOffsetPerProc.resize(numProcs, 0);
-    int quotient = groupMap.size() / numProcs;
-    int remainder = groupMap.size() % numProcs;
+    int quotient = groupLimits.size() / numProcs;
+    int remainder = groupLimits.size() % numProcs;
     int groupSum = 0;
     for (int i = 0; i < numProcs; i++)
     {
@@ -125,9 +148,29 @@ void KalmanFilter::setupDecoupling(int const* const mask)
     }
     for (size_t i = 0; i < numGroupsPerProc.at(myRank); ++i)
     {
-        myGroups.push_back(groupOffsetPerProc.at(myRank) + i);
+        size_t index = groupOffsetPerProc.at(myRank) + i;
+        size_t size = groupLimits.at(index).second
+                    - groupLimits.at(index).first + 1;
+        myGroups.push_back(make_pair(index, size));
     }
 
+    // Allocate per-processor matrices for all groups.
+    for (auto g : myGroups)
+    {
+        P.push_back(Eigen::MatrixXd(g.second, g.second));
+        P.back().setIdentity();
+
+        X.push_back(Eigen::MatrixXd(g.second, sizeObservation));
+
+        // Prevent problems with unallocated K when log starts.
+        K.push_back(Eigen::MatrixXd(g.second, sizeObservation));
+        K.back().setZero();
+
+        sizeP += g.second * g.second;
+    }
+
+    MPI_Allreduce(MPI_IN_PLACE, &sizeP, 1, MPI_SIZE_T, MPI_SUM, comm);
+
     return;
 }
 
@@ -183,15 +226,15 @@ void KalmanFilter::update()
 
 void KalmanFilter::update(size_t const sizeObservation)
 {
-    X.resize(sizeState, sizeObservation);
+    X.at(0).resize(sizeState, sizeObservation);
 
     // Calculate temporary result.
     // X = P . H
-    X = P.selfadjointView<Lower>() * (*H);
+    X.at(0) = P.at(0).selfadjointView<Lower>() * (*H);
 
     // Calculate scaling matrix.
     // A = H^T . X
-    MatrixXd A = H->transpose() * X;
+    MatrixXd A = H->transpose() * X.at(0);
 
     // Increase learning rate.
     // eta(n) = eta(0) * exp(n * tau)
@@ -210,25 +253,25 @@ void KalmanFilter::update(size_t const sizeObservation)
 
     // Calculate Kalman gain matrix.
     // K = X . A^-1
-    K.resize(sizeState, sizeObservation);
-    K = X * A.inverse();
+    K.at(0).resize(sizeState, sizeObservation);
+    K.at(0) = X.at(0) * A.inverse();
 
     // Update error covariance matrix.
     // P = P - K . X^T
-    P.noalias() -= K * X.transpose();
+    P.at(0).noalias() -= K.at(0) * X.at(0).transpose();
 
     // Apply forgetting factor.
     if (type == KT_FADINGMEMORY)
     {
-        P *= 1.0 / lambda;
+        P.at(0) *= 1.0 / lambda;
     }
     // Add process noise.
     // P = P + Q
-    P.diagonal() += VectorXd::Constant(sizeState, q);
+    P.at(0).diagonal() += VectorXd::Constant(sizeState, q);
 
     // Update state vector.
     // w =  w + K . xi
-    (*w) += K * (*xi);
+    (*w) += K.at(0) * (*xi);
 
     // Anneal process noise.
     // q(n) = q(0) * exp(-n * tau)
@@ -264,7 +307,10 @@ void KalmanFilter::setParametersStandard(double const epsilon,
 
     q = q0;
     eta = eta0;
-    P /= epsilon;
+    for (auto& p : P)
+    {
+        p /= epsilon;
+    }
 
     return;
 }
@@ -284,7 +330,10 @@ void KalmanFilter::setParametersFadingMemory(double const epsilon,
     this->nu      = nu     ;
 
     q = q0;
-    P /= epsilon;
+    for (auto& p : P)
+    {
+        p /= epsilon;
+    }
     gamma = 1.0;
 
     return;
@@ -293,12 +342,30 @@ void KalmanFilter::setParametersFadingMemory(double const epsilon,
 string KalmanFilter::status(size_t epoch) const
 {
 
-    double Pasym = 0.5 * (P - P.transpose()).array().abs().mean();
-    double Pdiag = P.diagonal().array().abs().sum(); 
-    double Poffdiag = (P.array().abs().sum() - Pdiag)
-                   / (sizeState * (sizeState - 1));
+    double Pasym    = 0.0;
+    double Pdiag    = 0.0;
+    double Poffdiag = 0.0;
+    double Kmean    = 0.0;
+
+    for (size_t i = 0; i < myGroups.size(); ++i)
+    {
+        auto const& p = P.at(i);
+        auto const& k = K.at(i);
+        Pasym += 0.5 * (p - p.transpose()).array().abs().sum();
+        double pdiag = p.diagonal().array().abs().sum(); 
+        Pdiag += pdiag; 
+        Poffdiag += p.array().abs().sum() - pdiag;
+        Kmean += k.array().abs().sum();
+    }
+
+    MPI_Allreduce(MPI_IN_PLACE, &Pasym   , 1, MPI_DOUBLE, MPI_SUM, comm);
+    MPI_Allreduce(MPI_IN_PLACE, &Pdiag   , 1, MPI_DOUBLE, MPI_SUM, comm);
+    MPI_Allreduce(MPI_IN_PLACE, &Poffdiag, 1, MPI_DOUBLE, MPI_SUM, comm);
+    MPI_Allreduce(MPI_IN_PLACE, &Kmean   , 1, MPI_DOUBLE, MPI_SUM, comm);
+    Pasym /= (sizeP - sizeState);
     Pdiag /= sizeState;
-    double Kmean = K.array().abs().mean();
+    Poffdiag /= (sizeP - sizeState);
+    Kmean /= (sizeState * sizeObservation);
 
     string s = strpr("%10zu %10zu %16.8E %16.8E %16.8E %16.8E %16.8E",
                      epoch, numUpdates, Pdiag, Poffdiag, Pasym, Kmean, q);
@@ -400,18 +467,22 @@ vector<string> KalmanFilter::info() const
     v.push_back(strpr("sizeState       = %zu\n", sizeState));
     v.push_back(strpr("sizeObservation = %zu\n", sizeObservation));
     v.push_back(strpr("OpenMP threads used: %d\n", nbThreads()));
-    v.push_back(strpr("Number of decoupling groups: %zu\n", groupMap.size()));
-    for (size_t i = 0; i < groupMap.size(); ++i)
+    v.push_back(strpr("Number of decoupling groups: %zu\n",
+                      groupLimits.size()));
+    v.push_back(strpr("P matrix size ratio (compared to GEKF): %6.2f\n",
+                      100.0 * sizeP / (sizeState * sizeState)));
+    for (size_t i = 0; i < groupLimits.size(); ++i)
     {
         v.push_back(strpr(" - group %5zu size: %zu\n",
-                          i, groupMap.at(i).size()));
+                          i,
+                          groupLimits.at(i).second
+                          - groupLimits.at(i).first + 1));
     }
     v.push_back(strpr("Number of decoupling groups of proc %d: %zu\n",
                       myRank, myGroups.size()));
-    for (size_t i = 0; i < myGroups.size(); ++i)
+    for (auto g : myGroups)
     {
-        v.push_back(strpr(" - group %5zu size: %zu\n",
-                          i, groupMap.at(myGroups.at(i)).size()));
+        v.push_back(strpr(" - group %5zu size: %zu\n", g.first, g.second));
     }
 
     return v;
diff --git a/src/libnnptrain/KalmanFilter.h b/src/libnnptrain/KalmanFilter.h
index e1c1f2723..75e926983 100644
--- a/src/libnnptrain/KalmanFilter.h
+++ b/src/libnnptrain/KalmanFilter.h
@@ -22,6 +22,7 @@
 #include <Eigen/Core>
 #include <cstddef>
 #include <string>
+#include <utility>
 #include <vector>
 
 namespace nnp
@@ -55,11 +56,14 @@ class KalmanFilter : public Updater
      * @param[in] communicator Input communicator of calling program.
      */
     void                     setupMPI(MPI_Comm* communicator);
-    /** Set decoupling group mask.
+    /** Set up Kalman filter decoupling.
      *
-     * @param[in] mask Input group mask provided via array of integers.
+     * @param[in] groupLimits Vector containing starting and ending indices of
+     *                        each decoupling group.
      */
-    void                     setupDecoupling(int const* const mask);
+    void                     setupDecoupling(
+                                 std::vector<std::pair< std::size_t,
+                                     std::size_t>> groupLimits);
     /** Set observation vector size.
      *
      * @param[in] size Size of the observation vector.
@@ -212,6 +216,8 @@ class KalmanFilter : public Updater
     std::size_t                        sizeObservation;
     /// Total number of updates performed.
     std::size_t                        numUpdates;
+    /// Number of covariance matrix entries in memory (sum over all groups).
+    std::size_t                        sizeP;
     /// Error covariance initialization parameter @f$\epsilon@f$.
     double                             epsilon;
     /// Process noise @f$q@f$.
@@ -236,28 +242,28 @@ class KalmanFilter : public Updater
     double                             nu;
     /// Forgetting gain factor gamma for fading memory Kalman filter.
     double                             gamma;
-    /// List of group indices of this processor.
-    std::vector<int>                   myGroups;
+    /// List of (group indices, group sizes) of this processor.
+    std::vector<std::pair<
+    std::size_t, std::size_t>>         myGroups;
     /// Number of groups for each processor.
     std::vector<std::size_t>           numGroupsPerProc;
     /// Offset in terms of groups for each processor.
     std::vector<std::size_t>           groupOffsetPerProc;
-    /// Decoupling group map.
-    std::map<int, std::vector<int>>    groupMap;
-    /// Decoupling group mask vector.
-    Eigen::Map<Eigen::VectorXi const>* groupMask;
+    /// Decoupling group limits, i.e. starting end ending indices.
+    std::vector<std::pair<
+    std::size_t, std::size_t>>         groupLimits;
     /// State vector.
     Eigen::Map<Eigen::VectorXd>*       w;
     /// Error vector.
     Eigen::Map<Eigen::VectorXd const>* xi;
     /// Derivative matrix.
     Eigen::Map<Eigen::MatrixXd const>* H;
-    /// Error covariance matrix.
-    Eigen::MatrixXd                    P;
+    /// Error covariance matrices for all groups.
+    std::vector<Eigen::MatrixXd>       P;
     /// Kalman gain matrix.
-    Eigen::MatrixXd                    K;
+    std::vector<Eigen::MatrixXd>       K;
     /// Intermediate result X = P . H.
-    Eigen::MatrixXd                    X;
+    std::vector<Eigen::MatrixXd>       X;
     /// Global MPI communicator.
     MPI_Comm                           comm;
 };
diff --git a/src/libnnptrain/Training.cpp b/src/libnnptrain/Training.cpp
index 80feeb754..c6a780889 100644
--- a/src/libnnptrain/Training.cpp
+++ b/src/libnnptrain/Training.cpp
@@ -302,7 +302,8 @@ void Training::initializeWeights()
                               * (maxWeights - minWeights);
             }
 #ifndef ALTERNATIVE_WEIGHT_ORDERING
-            // FIX!!!
+            // To be consistent this should actually be the non-AO version,
+            // but this way results stay comparable.
             elements.at(i).neuralNetwork->setConnectionsAO(&(w.at(i).front()));
 #else
             elements.at(i).neuralNetwork->setConnectionsAO(&(w.at(i).front()));
@@ -1029,14 +1030,11 @@ void Training::setupTraining()
             throw runtime_error("ERROR: Unknown Kalman filter decoupling "
                                 "type.\n");
         }
-        if (decouplingType != DT_GLOBAL &&
-            (parallelMode != PM_TRAIN_ALL || updateStrategy != US_COMBINED))
+        if (decouplingType != DT_GLOBAL && updateStrategy != US_COMBINED)
         {
             throw runtime_error(strpr("ERROR: Kalman filter decoupling works "
-                                      "only in conjunction with"
-                                      "\"parallel_mode %d\" and "
-                                      "\"update_strategy %d\".\n",
-                                      PM_TRAIN_ALL, US_COMBINED));
+                                      "only in conjunction with "
+                                      "update_strategy %d\".\n", US_COMBINED));
         }
     }
 
@@ -1056,57 +1054,62 @@ void Training::setupTraining()
                     (Updater*)new KalmanFilter(numWeightsPerUpdater.at(i),
                                                kalmanType));
                 KalmanFilter* u = dynamic_cast<KalmanFilter*>(updaters.back());
-                u->setupMPI(&comm);
-                vector<int> groupMask(weights.at(i).size(),
-                                      numeric_limits<int>::max());
+                if (parallelMode == PM_TRAIN_ALL)
+                {
+                    u->setupMPI(&comm);
+                }
+                else
+                {
+                    MPI_Group groupWorld;
+                    MPI_Comm_group(comm, &groupWorld);
+                    int const size0 = 1;
+                    int const rank0[1] = {0};
+                    MPI_Group groupRank0;
+                    MPI_Group_incl(groupWorld, size0, rank0, &groupRank0);
+                    MPI_Comm commRank0;
+                    MPI_Comm_create_group(comm, groupRank0, 0, &commRank0);
+                    u->setupMPI(&commRank0);
+                }
+                vector<pair<size_t, size_t>> limits;
                 if (decouplingType == DT_GLOBAL)
                 {
-                    fill(groupMask.begin(), groupMask.end(), 0);
+                    limits.push_back(
+                        make_pair(0, numWeightsPerUpdater.at(i) - 1));
                 }
                 else if (decouplingType == DT_ELEMENT)
                 {
-                    for (size_t i = 0; i < numElements; ++i)
+                    for (size_t j = 0; j < numElements; ++j)
                     {
-                        fill(groupMask.begin() + weightsOffset.at(i),
-                             groupMask.begin() + weightsOffset.at(i)
-                             + elements.at(i).
-                               neuralNetwork->getNumConnections(),
-                             i);
+                        limits.push_back(make_pair(
+                            weightsOffset.at(j),
+                            weightsOffset.at(j) + elements.at(j).
+                                neuralNetwork->getNumConnections() - 1));
                     }
                 }
                 else if (decouplingType == DT_LAYER)
                 {
-                    size_t index = 0;
-                    for (size_t i = 0; i < numElements; ++i)
+                    for (size_t j = 0; j < numElements; ++j)
                     {
-                        for (auto b : elements.at(i).neuralNetwork
-                                      ->getLayerBoundaries())
+                        for (auto l : elements.at(j).neuralNetwork
+                                      ->getLayerLimits())
                         {
-                            fill(groupMask.begin()
-                                    + weightsOffset.at(i) + b.first,
-                                 groupMask.begin()
-                                    + weightsOffset.at(i) + b.second + 1,
-                                 index);
-                            index++;
+                            limits.push_back(make_pair(
+                                weightsOffset.at(j) + l.first,
+                                weightsOffset.at(j) + l.second));
                         }
                     }
                 }
                 else if (decouplingType == DT_NODE)
                 {
 #ifndef ALTERNATIVE_WEIGHT_ORDERING
-                    size_t index = 0;
-                    for (size_t i = 0; i < numElements; ++i)
+                    for (size_t j = 0; j < numElements; ++j)
                     {
-                        for (auto b : elements.at(i).neuralNetwork
-                                      ->getNeuronBoundaries())
+                        for (auto l : elements.at(j).neuralNetwork
+                                      ->getNeuronLimits())
                         {
-                            log << strpr("%zu %zu %zu\n", i, b.first, b.second);
-                            fill(groupMask.begin()
-                                    + weightsOffset.at(i) + b.first,
-                                 groupMask.begin()
-                                    + weightsOffset.at(i) + b.second + 1,
-                                 index);
-                            index++;
+                            limits.push_back(make_pair(
+                                weightsOffset.at(j) + l.first,
+                                weightsOffset.at(j) + l.second));
                         }
                     }
 #else
@@ -1119,14 +1122,12 @@ void Training::setupTraining()
                 }
                 else if (decouplingType == DT_FULL)
                 {
-                    size_t index = 0;
-                    for (auto& m : groupMask)
+                    for (size_t j = 0; j < numWeightsPerUpdater.at(i); ++j)
                     {
-                        m = index;
-                        index++;
+                        limits.push_back(make_pair(j, j));
                     }
                 }
-                u->setupDecoupling(groupMask.data());
+                u->setupDecoupling(limits);
             }
             updaters.back()->setState(&(weights.at(i).front()));
         }
@@ -1551,7 +1552,7 @@ void Training::calculateErrorEpoch()
         fileNameForcesTest = strpr("testforces.%06zu.out", epoch);
     }
 
-    // Calculate RMSE and write comparison files.
+    // Calculate error and write comparison files.
     calculateError(true,
                    identifier,
                    fileNameEnergiesTrain,
@@ -1750,7 +1751,7 @@ void Training::writeNeuronStatistics(string const fileName) const
         vector<string> colInfo;
         vector<size_t> colSize;
         title.push_back("Statistics for individual neurons gathered during "
-                        "RMSE calculation.");
+                        "error calculation.");
         colSize.push_back(10);
         colName.push_back("element");
         colInfo.push_back("Element index.");
@@ -2005,7 +2006,7 @@ void Training::loop()
     else if (errorMetricForces == 1) metric = "MAE";
     if (errorMetricEnergies % 2 == 0) peratom = "per atom ";
 
-    log << "The training loop output covers different RMSEs, update and\n";
+    log << "The training loop output covers different errors, update and\n";
     log << "timing information. The following quantities are organized\n";
     log << "according to the matrix scheme below:\n";
     log << "-------------------------------------------------------------------\n";
@@ -2032,7 +2033,7 @@ void Training::loop()
     log << "Abbreviations:\n";
     log << "  p. u. = physical units.\n";
     log << "  i. u. = internal units.\n";
-    log << "Note: RMSEs in internal units (columns 5 + 6) are only present \n";
+    log << "Note: Errors in internal units (columns 5 + 6) are only present \n";
     log << "      if data set normalization is used.\n";
     log << "-------------------------------------------------------------------\n";
     log << "     1    2             3             4             5             6\n";
diff --git a/test/cpp/Example_nnp_train.h b/test/cpp/Example_nnp_train.h
index fe4aa3d22..88ed24a76 100644
--- a/test/cpp/Example_nnp_train.h
+++ b/test/cpp/Example_nnp_train.h
@@ -28,13 +28,13 @@ void BoostDataContainer<Example_nnp_train>::setup()
     e->rmseForcesTrain = 3.77302629E-02;
     e->rmseForcesTest  = 2.25135617E-01;
 
-    //examples.push_back(Example_nnp_train("H2O_RPBE-D3"));
-    //e = &(examples.back());
-    //e->lastEpoch = 10;
-    //e->rmseEnergyTrain = 1.02312914E-04;
-    //e->rmseEnergyTest  = 5.28837597E-04;
-    //e->rmseForcesTrain = 1.67069652E-02;
-    //e->rmseForcesTest  = 1.07708383E-02;
+    examples.push_back(Example_nnp_train("H2O_RPBE-D3"));
+    e = &(examples.back());
+    e->lastEpoch = 5;
+    e->rmseEnergyTrain = 9.19395999E-04;
+    e->rmseEnergyTest  = 7.42520683E-04;
+    e->rmseForcesTrain = 2.40682415E-02;
+    e->rmseForcesTest  = 1.44776060E-02;
 
     return;
 }

From 0a95ce8882b929ab67a911ac63fcd0254c1474c3 Mon Sep 17 00:00:00 2001
From: Andreas Singraber <andreas.singraber@gmx.at>
Date: Wed, 16 Sep 2020 18:33:28 +0200
Subject: [PATCH 8/9] Decoupling on single processor works

---
 src/libnnptrain/KalmanFilter.cpp | 97 +++++++++++++++++++++-----------
 1 file changed, 64 insertions(+), 33 deletions(-)

diff --git a/src/libnnptrain/KalmanFilter.cpp b/src/libnnptrain/KalmanFilter.cpp
index 221f82806..fd4d18224 100644
--- a/src/libnnptrain/KalmanFilter.cpp
+++ b/src/libnnptrain/KalmanFilter.cpp
@@ -226,15 +226,32 @@ void KalmanFilter::update()
 
 void KalmanFilter::update(size_t const sizeObservation)
 {
-    X.at(0).resize(sizeState, sizeObservation);
+    MatrixXd A(sizeObservation, sizeObservation);
+    A.setZero();
 
-    // Calculate temporary result.
-    // X = P . H
-    X.at(0) = P.at(0).selfadjointView<Lower>() * (*H);
+    for (size_t i = 0; i < myGroups.size(); ++i)
+    {
+        size_t const& j = myGroups.at(i).first;     // group index
+        size_t const& n = myGroups.at(i).second;    // group size
+        size_t const& s = groupLimits.at(j).first;  // group weight start index
+
+        auto& x = X.at(i);
+        auto& p = P.at(i);
+
+        x.resize(n, sizeObservation);
+
+        // Calculate temporary result.
+        // X = P . H
+        x = p.selfadjointView<Lower>() * H->block(s, 0, n, sizeObservation);
+ 
+        // Calculate scaling matrix.
+        // A = H^T . X
+        // For decoupling summarize scaling matrix here (locally).
+        A += H->block(s, 0, n, sizeObservation).transpose() * x;
+    }
 
-    // Calculate scaling matrix.
-    // A = H^T . X
-    MatrixXd A = H->transpose() * X.at(0);
+    // Summarize scaling matrix globally.
+    MPI_Allreduce(MPI_IN_PLACE, A.data(), A.size(), MPI_DOUBLE, MPI_SUM, comm);
 
     // Increase learning rate.
     // eta(n) = eta(0) * exp(n * tau)
@@ -251,39 +268,53 @@ void KalmanFilter::update(size_t const sizeObservation)
         A.diagonal() += VectorXd::Constant(sizeObservation, lambda);
     }
 
-    // Calculate Kalman gain matrix.
-    // K = X . A^-1
-    K.at(0).resize(sizeState, sizeObservation);
-    K.at(0) = X.at(0) * A.inverse();
+    for (size_t i = 0; i < myGroups.size(); ++i)
+    {
+        size_t const& j = myGroups.at(i).first;     // group index
+        size_t const& n = myGroups.at(i).second;    // group size
+        size_t const& s = groupLimits.at(j).first;  // group weight start index
 
-    // Update error covariance matrix.
-    // P = P - K . X^T
-    P.at(0).noalias() -= K.at(0) * X.at(0).transpose();
+        auto& x = X.at(i);
+        auto& p = P.at(i);
+        auto& k = K.at(i);
 
-    // Apply forgetting factor.
-    if (type == KT_FADINGMEMORY)
-    {
-        P.at(0) *= 1.0 / lambda;
-    }
-    // Add process noise.
-    // P = P + Q
-    P.at(0).diagonal() += VectorXd::Constant(sizeState, q);
+        // Calculate Kalman gain matrix.
+        // K = X . A^-1
+        k.resize(n, sizeObservation);
+        k = x * A.inverse();
 
-    // Update state vector.
-    // w =  w + K . xi
-    (*w) += K.at(0) * (*xi);
+        // Update error covariance matrix.
+        // P = P - K . X^T
+        p.noalias() -= k * x.transpose();
 
-    // Anneal process noise.
-    // q(n) = q(0) * exp(-n * tau)
-    if (q > qmin) q *= exp(-qtau);
+        // Apply forgetting factor.
+        if (type == KT_FADINGMEMORY)
+        {
+            p *= 1.0 / lambda;
+        }
+        // Add process noise.
+        // P = P + Q
+        p.diagonal() += VectorXd::Constant(n, q);
 
-    // Update forgetting factor.
-    if (type == KT_FADINGMEMORY)
-    {
-        lambda = nu * lambda + 1.0 - nu;
-        gamma = 1.0 / (1.0 + lambda / gamma);
+        // Update state vector.
+        // w =  w + K . xi
+        w->segment(s, n) += k * (*xi);
+
+        // Anneal process noise.
+        // q(n) = q(0) * exp(-n * tau)
+        if (q > qmin) q *= exp(-qtau);
+
+        // Update forgetting factor.
+        if (type == KT_FADINGMEMORY)
+        {
+            lambda = nu * lambda + 1.0 - nu;
+            gamma = 1.0 / (1.0 + lambda / gamma);
+        }
     }
 
+    // Communicate new weight segments.
+    // ????
+
     numUpdates++;
 
     return;

From 11dc11641080a07fae1d98f79a4387fec60d3b1f Mon Sep 17 00:00:00 2001
From: Andreas Singraber <andreas.singraber@gmx.at>
Date: Thu, 17 Sep 2020 14:10:20 +0200
Subject: [PATCH 9/9] Communicate weight segments correctly

---
 src/libnnptrain/KalmanFilter.cpp | 71 ++++++++++++++++++++------------
 src/libnnptrain/KalmanFilter.h   |  4 ++
 src/libnnptrain/Training.cpp     | 42 +++++++++++--------
 src/makefile.gnu                 |  3 +-
 4 files changed, 75 insertions(+), 45 deletions(-)

diff --git a/src/libnnptrain/KalmanFilter.cpp b/src/libnnptrain/KalmanFilter.cpp
index fd4d18224..5d02b44c2 100644
--- a/src/libnnptrain/KalmanFilter.cpp
+++ b/src/libnnptrain/KalmanFilter.cpp
@@ -136,7 +136,7 @@ void KalmanFilter::setupDecoupling(vector<pair<size_t, size_t>> groupLimits)
     int quotient = groupLimits.size() / numProcs;
     int remainder = groupLimits.size() % numProcs;
     int groupSum = 0;
-    for (int i = 0; i < numProcs; i++)
+    for (int i = 0; i < numProcs; ++i)
     {
         numGroupsPerProc.at(i) = quotient;
         if (remainder > 0 && i < remainder)
@@ -146,6 +146,23 @@ void KalmanFilter::setupDecoupling(vector<pair<size_t, size_t>> groupLimits)
         groupOffsetPerProc.at(i) = groupSum;
         groupSum += numGroupsPerProc.at(i);
     }
+
+    // Compute state vector segment length and offset for each processor.
+    for (int i = 0; i < numProcs - 1; ++i)
+    {
+        sizeStatePerProc.push_back(
+            groupLimits.at(groupOffsetPerProc.at(i+1)).first -
+            groupLimits.at(groupOffsetPerProc.at(i)).first);
+        offsetStatePerProc.push_back(
+            groupLimits.at(groupOffsetPerProc.at(i)).first);
+    }
+    sizeStatePerProc.push_back(
+        groupLimits.back().second + 1 -
+        groupLimits.at(groupOffsetPerProc.at(numProcs - 1)).first);
+    offsetStatePerProc.push_back(
+        groupLimits.at(groupOffsetPerProc.at(numProcs - 1)).first);
+
+    // Store group index and segment length for this processor.
     for (size_t i = 0; i < numGroupsPerProc.at(myRank); ++i)
     {
         size_t index = groupOffsetPerProc.at(myRank) + i;
@@ -313,7 +330,7 @@ void KalmanFilter::update(size_t const sizeObservation)
     }
 
     // Communicate new weight segments.
-    // ????
+    MPI_Allgatherv(MPI_IN_PLACE, 0, MPI_DOUBLE, w->data(), sizeStatePerProc.data(), offsetStatePerProc.data(), MPI_DOUBLE, comm);
 
     numUpdates++;
 
@@ -470,11 +487,24 @@ vector<string> KalmanFilter::info() const
 {
     vector<string> v;
 
+    v.push_back("Extended Kalman Filter (EKF) optimizer:\n");
+    v.push_back(strpr("sizeState       = %zu\n", sizeState));
+    v.push_back(strpr("sizeObservation = %zu\n", sizeObservation));
+    v.push_back(strpr("myRank          = %d\n", myRank));
+    v.push_back(strpr("numProcs        = %d\n", numProcs));
+    v.push_back(strpr("OpenMP threads used                    : %d\n",
+                      nbThreads()));
+    v.push_back(strpr("Number of decoupling groups            : %zu\n",
+                      groupLimits.size()));
+    v.push_back(strpr("Number of decoupling groups of proc %3d: %zu\n",
+                      myRank, myGroups.size()));
+    v.push_back(strpr("P matrix size ratio (compared to GEKF) : %6.2f %%\n",
+                      100.0 * sizeP / (sizeState * sizeState)));
+    v.push_back("-----------------------------------------"
+                "--------------------------------------\n");
     if (type == KT_STANDARD)
     {
         v.push_back(strpr("KalmanType::KT_STANDARD (%d)\n", type));
-        v.push_back(strpr("myRank          = %d\n", myRank));
-        v.push_back(strpr("numProcs        = %d\n", numProcs));
         v.push_back(strpr("epsilon         = %12.4E\n", epsilon));
         v.push_back(strpr("q0              = %12.4E\n", q0     ));
         v.push_back(strpr("qtau            = %12.4E\n", qtau   ));
@@ -486,8 +516,6 @@ vector<string> KalmanFilter::info() const
     else if (type == KT_FADINGMEMORY)
     {
         v.push_back(strpr("KalmanType::KT_FADINGMEMORY (%d)\n", type));
-        v.push_back(strpr("sizeState       = %zu\n", sizeState));
-        v.push_back(strpr("sizeObservation = %zu\n", sizeObservation));
         v.push_back(strpr("epsilon         = %12.4E\n", epsilon));
         v.push_back(strpr("q0              = %12.4E\n", q0     ));
         v.push_back(strpr("qtau            = %12.4E\n", qtau   ));
@@ -495,26 +523,17 @@ vector<string> KalmanFilter::info() const
         v.push_back(strpr("lambda          = %12.4E\n", lambda));
         v.push_back(strpr("nu              = %12.4E\n", nu    ));
     }
-    v.push_back(strpr("sizeState       = %zu\n", sizeState));
-    v.push_back(strpr("sizeObservation = %zu\n", sizeObservation));
-    v.push_back(strpr("OpenMP threads used: %d\n", nbThreads()));
-    v.push_back(strpr("Number of decoupling groups: %zu\n",
-                      groupLimits.size()));
-    v.push_back(strpr("P matrix size ratio (compared to GEKF): %6.2f\n",
-                      100.0 * sizeP / (sizeState * sizeState)));
-    for (size_t i = 0; i < groupLimits.size(); ++i)
-    {
-        v.push_back(strpr(" - group %5zu size: %zu\n",
-                          i,
-                          groupLimits.at(i).second
-                          - groupLimits.at(i).first + 1));
-    }
-    v.push_back(strpr("Number of decoupling groups of proc %d: %zu\n",
-                      myRank, myGroups.size()));
-    for (auto g : myGroups)
-    {
-        v.push_back(strpr(" - group %5zu size: %zu\n", g.first, g.second));
-    }
+    //for (size_t i = 0; i < groupLimits.size(); ++i)
+    //{
+    //    v.push_back(strpr(" - group %5zu size: %zu\n",
+    //                      i,
+    //                      groupLimits.at(i).second
+    //                      - groupLimits.at(i).first + 1));
+    //}
+    //for (auto g : myGroups)
+    //{
+    //    v.push_back(strpr(" - group %5zu size: %zu\n", g.first, g.second));
+    //}
 
     return v;
 }
diff --git a/src/libnnptrain/KalmanFilter.h b/src/libnnptrain/KalmanFilter.h
index 75e926983..0d409ac1c 100644
--- a/src/libnnptrain/KalmanFilter.h
+++ b/src/libnnptrain/KalmanFilter.h
@@ -242,6 +242,10 @@ class KalmanFilter : public Updater
     double                             nu;
     /// Forgetting gain factor gamma for fading memory Kalman filter.
     double                             gamma;
+    /// Size of state vector segment handled by each processor.
+    std::vector<int>                   sizeStatePerProc;
+    /// Offset of state vector segment handled by each processor.
+    std::vector<int>                   offsetStatePerProc;
     /// List of (group indices, group sizes) of this processor.
     std::vector<std::pair<
     std::size_t, std::size_t>>         myGroups;
diff --git a/src/libnnptrain/Training.cpp b/src/libnnptrain/Training.cpp
index c6a780889..990754450 100644
--- a/src/libnnptrain/Training.cpp
+++ b/src/libnnptrain/Training.cpp
@@ -1865,24 +1865,31 @@ void Training::writeUpdaterStatus(bool         append,
 
     for (size_t i = 0; i < numUpdaters; ++i)
     {
-        string fileName;
-        if (updateStrategy == US_COMBINED)
-        {
-            fileName = strpr(fileNameFormat.c_str(), 0);
-        }
-        else if (updateStrategy == US_ELEMENT)
+        if (myRank == 0)
         {
-            fileName = strpr(fileNameFormat.c_str(),
-                             elementMap.atomicNumber(i));
+            string fileName;
+            if (updateStrategy == US_COMBINED)
+            {
+                fileName = strpr(fileNameFormat.c_str(), 0);
+            }
+            else if (updateStrategy == US_ELEMENT)
+            {
+                fileName = strpr(fileNameFormat.c_str(),
+                                 elementMap.atomicNumber(i));
+            }
+            if (append) file.open(fileName.c_str(), ofstream::app);
+            else
+            {
+                file.open(fileName.c_str());
+                appendLinesToFile(file, updaters.at(i)->statusHeader());
+            }
+            file << updaters.at(i)->status(epoch);
+            file.close();
         }
-        if (append) file.open(fileName.c_str(), ofstream::app);
-        else 
+        else if (parallelMode == PM_TRAIN_ALL)
         {
-            file.open(fileName.c_str());
-            appendLinesToFile(file, updaters.at(i)->statusHeader());
+            updaters.at(i)->status(epoch);
         }
-        file << updaters.at(i)->status(epoch);
-        file.close();
     }
 
     return;
@@ -2062,8 +2069,9 @@ void Training::loop()
     // Write learning curve.
     if (myRank == 0) writeLearningCurve(false);
 
-    // Write updater status to file.
-    if (myRank == 0) writeUpdaterStatus(false);
+    // Write updater status to file (decoupled KF requires all tasks to execute
+    // the status() function.
+    writeUpdaterStatus(false);
 
     // Write neuron statistics.
     writeNeuronStatisticsEpoch();
@@ -2135,7 +2143,7 @@ void Training::loop()
         if (myRank == 0) writeLearningCurve(true);
 
         // Write updater status to file.
-        if (myRank == 0) writeUpdaterStatus(true);
+        writeUpdaterStatus(true);
 
         // Write neuron statistics.
         writeNeuronStatisticsEpoch();
diff --git a/src/makefile.gnu b/src/makefile.gnu
index f35aeb0b6..ece0147ce 100644
--- a/src/makefile.gnu
+++ b/src/makefile.gnu
@@ -14,8 +14,7 @@ PROJECT_EIGEN=/usr/include/eigen3/
 PROJECT_CC=g++
 PROJECT_MPICC=mpic++
 # OpenMP parallelization is disabled by default, add flag "-fopenmp" to enable.
-PROJECT_CFLAGS=-O0
-#3 -march=native -std=c++11
+PROJECT_CFLAGS=-O3 -march=native -std=c++11
 PROJECT_CFLAGS_MPI=-Wno-long-long
 PROJECT_DEBUG=-g -pedantic-errors -Wall -Wextra
 PROJECT_TEST=--coverage -fno-default-inline -fno-inline -fno-inline-small-functions -fno-elide-constructors