diff --git a/Makefile b/Makefile index 74e85769..b2e74919 100644 --- a/Makefile +++ b/Makefile @@ -28,6 +28,7 @@ unittest: python3 -m unittest discover -s tests -v -p "*_tests.py" || (echo "Error in standard unit tests."; exit 1) @# remove temporary files created during testing @if test -f net.pth; then rm net.pth; fi + @if test -f net_with_discard_wrapper.pth; then rm net_with_discard_wrapper.pth; fi @if test -f scaler.pkl; then rm scaler.pkl; fi @if test -f tmp.jnl; then rm tmp.jnl; fi @if test -f journal_tests_testfile.pkl; then rm journal_tests_testfile.pkl; fi @@ -40,6 +41,10 @@ unittest_mpi: exampletest: $(MAKEDIRS) @echo "Testing standard examples.." python3 -m unittest -v tests/test_examples.py || (echo "Error in example tests."; exit 1) + @if test -f scaler.pkl; then rm scaler.pkl; fi + @if test -f seminn_net.pth; then rm seminn_net.pth; fi + @if test -f triplet_net.pth; then rm triplet_net.pth; fi + @if test -f tmp.jnl; then rm tmp.jnl; fi exampletest_mpi: @echo "Testing MPI backend examples.." diff --git a/README.md b/README.md index e1ff1c55..f79f4b4a 100644 --- a/README.md +++ b/README.md @@ -44,6 +44,7 @@ Additional **features** are: * several methods for summary selection: * [Semi-automatic summary selection (with Neural networks)](http://proceedings.mlr.press/v97/wiqvist19a/wiqvist19a.pdf) * [summary selection using distance learning (with Neural networks)](https://link.springer.com/article/10.1007/s13571-019-00208-8) + * [Sufficient statistics of exponential family approximating the likelihood (with Neural networks)](https://arxiv.org/abs/2012.10903) * [Random Forest Model Selection Scheme](https://academic.oup.com/bioinformatics/article/32/6/859/1744513) @@ -130,9 +131,9 @@ Publications in which ABCpy was applied: * R. Dutta, K. Zouaoui-Boudjeltia, C. Kotsalos, A. Rousseau, D. Ribeiro de Sousa, J. M. Desmet, A. Van Meerhaeghe, A. Mira, and B. Chopard. "Interpretable pathological test for Cardio-vascular disease: Approximate Bayesian computation with distance learning.", 2020, arXiv:2010.06465. -* R. Dutta, S. Gomes, D. Kalise, L. Pacchiardi. "Using mobility data in the design of optimal lockdown strategies for the COVID-19 pandemic in England.", 2020, arXiv:2006.16059. +* R. Dutta, S. Gomes, D. Kalise, L. Pacchiardi. "Using mobility data in the design of optimal lockdown strategies for the COVID-19 pandemic in England.", 2021, PLOS Computational Biology, 17(8), e1009236. -* L. Pacchiardi, P. Künzli, M. Schöngens, B. Chopard, R. Dutta, "Distance-Learning for Approximate Bayesian Computation to Model a Volcanic Eruption", 2020, Sankhya B, 1-30. +* L. Pacchiardi, P. Künzli, M. Schöngens, B. Chopard, R. Dutta, "Distance-Learning for Approximate Bayesian Computation to Model a Volcanic Eruption", 2021, Sankhya B, 83(1), 288-317. * R. Dutta, J. P. Onnela, A. Mira, "Bayesian Inference of Spreading Processes on Networks", 2018, Proceedings of Royal Society A, 474(2215), 20180129. 
diff --git a/VERSION b/VERSION index b1d7abc0..844f6a91 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.6.2 \ No newline at end of file +0.6.3 diff --git a/abcpy/NN_utilities/algorithms.py b/abcpy/NN_utilities/algorithms.py index 4a17b0e5..14ded15a 100644 --- a/abcpy/NN_utilities/algorithms.py +++ b/abcpy/NN_utilities/algorithms.py @@ -81,13 +81,13 @@ def contrastive_training(samples, similarity_set, embedding_net, cuda, batch_siz scheduler = scheduler(optimizer, **scheduler_kwargs) # now train: - fit(pairs_train_loader, model_contrastive, loss_fn, optimizer, scheduler, n_epochs, cuda, + train_losses, test_losses = fit(pairs_train_loader, model_contrastive, loss_fn, optimizer, scheduler, n_epochs, cuda, val_loader=pairs_train_loader_val, early_stopping=early_stopping, start_epoch_early_stopping=start_epoch_early_stopping, epochs_early_stopping_interval=epochs_early_stopping_interval, start_epoch_training=start_epoch_training, use_tqdm=use_tqdm) - return embedding_net + return embedding_net, train_losses, test_losses def triplet_training(samples, similarity_set, embedding_net, cuda, batch_size=16, n_epochs=400, @@ -155,12 +155,12 @@ def triplet_training(samples, similarity_set, embedding_net, cuda, batch_size=16 scheduler = scheduler(optimizer, **scheduler_kwargs) # now train: - fit(triplets_train_loader, model_triplet, loss_fn, optimizer, scheduler, n_epochs, cuda, + train_losses, test_losses = fit(triplets_train_loader, model_triplet, loss_fn, optimizer, scheduler, n_epochs, cuda, val_loader=triplets_train_loader_val, early_stopping=early_stopping, start_epoch_early_stopping=start_epoch_early_stopping, epochs_early_stopping_interval=epochs_early_stopping_interval, start_epoch_training=start_epoch_training, use_tqdm=use_tqdm) - return embedding_net + return embedding_net, train_losses, test_losses def FP_nn_training(samples, target, embedding_net, cuda, batch_size=1, n_epochs=50, samples_val=None, target_val=None, @@ -222,9 +222,9 @@ def FP_nn_training(samples, target, embedding_net, cuda, batch_size=1, n_epochs= scheduler = scheduler(optimizer, **scheduler_kwargs) # now train: - fit(data_loader_FP_nn, embedding_net, loss_fn, optimizer, scheduler, n_epochs, cuda, + train_losses, test_losses = fit(data_loader_FP_nn, embedding_net, loss_fn, optimizer, scheduler, n_epochs, cuda, val_loader=data_loader_FP_nn_val, early_stopping=early_stopping, start_epoch_early_stopping=start_epoch_early_stopping, epochs_early_stopping_interval=epochs_early_stopping_interval, start_epoch_training=start_epoch_training, use_tqdm=use_tqdm) - return embedding_net + return embedding_net, train_losses, test_losses diff --git a/abcpy/NN_utilities/losses.py b/abcpy/NN_utilities/losses.py index 4dc28cb2..22a15c37 100644 --- a/abcpy/NN_utilities/losses.py +++ b/abcpy/NN_utilities/losses.py @@ -1,3 +1,4 @@ +import torch import torch.nn as nn import torch.nn.functional as F @@ -37,3 +38,24 @@ def forward(self, anchor, positive, negative, size_average=True): distance_negative = (anchor - negative).pow(2).sum(1) # .pow(.5) losses = F.relu(distance_positive - distance_negative + self.margin) return losses.mean() if size_average else losses.sum() + + +def Fisher_divergence_loss(first_der_t, second_der_t, eta, lam=0): + """lam is the regularization parameter of the Kingma & LeCun (2010) regularization""" + inner_prod_second_der_eta = torch.bmm(second_der_t, eta.unsqueeze(-1)) # this is used twice + + if lam == 0: + return sum( + (0.5 * torch.bmm(first_der_t, eta.unsqueeze(-1)) ** 2 + inner_prod_second_der_eta).view(-1)) + 
else: + return sum( + (0.5 * torch.bmm(first_der_t, eta.unsqueeze(-1)) ** 2 + + inner_prod_second_der_eta + lam * inner_prod_second_der_eta ** 2).view(-1)) + + +def Fisher_divergence_loss_with_c_x(first_der_t, second_der_t, eta, lam=0): + # this enables using the term c(x) in the approximating family, ie a term that depends only on x and not on theta. + new_eta = torch.cat((eta, torch.ones(eta.shape[0], 1).to(eta)), + dim=1) # the ones tensor needs to be on the same device as eta. + # then call the other loss function with this new_eta: + return Fisher_divergence_loss(first_der_t, second_der_t, new_eta, lam=lam) diff --git a/abcpy/NN_utilities/networks.py b/abcpy/NN_utilities/networks.py index bd411dd8..7dbaf1e5 100644 --- a/abcpy/NN_utilities/networks.py +++ b/abcpy/NN_utilities/networks.py @@ -1,6 +1,7 @@ import torch import torch.nn as nn import torch.nn.functional as F +from torch.autograd import grad class SiameseNet(nn.Module): @@ -42,13 +43,61 @@ def get_embedding(self, x): return self.embedding_net(x) -def createDefaultNN(input_size, output_size, hidden_sizes=None, nonlinearity=None): +class ScalerAndNet(nn.Module): + """Defines a nn.Module class that wraps a scaler and a neural network, and applies the scaler before passing the + data through the neural network.""" + + def __init__(self, net, scaler): + """""" + super().__init__() + self.net = net + self.scaler = scaler + + def forward(self, x): + """""" + x = torch.tensor(self.scaler.transform(x), dtype=torch.float32).to(next(self.net.parameters()).device) + return self.net(x) + + +class DiscardLastOutputNet(nn.Module): + """Defines a nn.Module class that wraps a neural network and, in the forward pass, discards the last component + of the network output.""" + + def __init__(self, net): + super().__init__() + self.net = net + + def forward(self, x): + x = self.net(x) + if len(x.shape) == 1: + return x[0:-1] + if len(x.shape) == 2: + return x[:, 0:-1] + if len(x.shape) == 3: + return x[:, :, 0:-1] + + +def createDefaultNN(input_size, output_size, hidden_sizes=None, nonlinearity=None, batch_norm_last_layer=False, + batch_norm_last_layer_momentum=0.1): + """Function returning a fully connected neural network class with a given input and output size, and optionally + given hidden layer sizes (if these are not given, they are determined from the input and output size in a heuristic + way, see below). + + In order to instantiate the network, you need to write: + + >>> createDefaultNN(input_size, output_size)() + + as the function returns a class, and () is needed to instantiate an object.
+ + If hidden_sizes is None, three hidden layers are used with the following sizes: + ``[int(input_size * 1.5), int(input_size * 0.75 + output_size * 3), int(output_size * 5)]`` + + Note that the nonlinearity here is passed as an object or a functional, not a class, eg: + nonlinearity = nn.Softplus() + or: + nonlinearity = nn.functional.softplus - In order to instantiate the network, you need to write: createDefaultNN(input_size, output_size)() as the function - returns a class, and () is needed to instantiate an object.""" + """ class DefaultNN(nn.Module): """Neural network class with sizes determined by the upper level variables.""" @@ -78,36 +127,197 @@ def __init__(self): self.fc_hidden.append(nn.Linear(hidden_sizes_list[i], hidden_sizes_list[i + 1])) self.fc_out = nn.Linear(hidden_sizes_list[-1], output_size) + # define the batch_norm: + if batch_norm_last_layer: + self.bn_out = nn.BatchNorm1d(output_size, affine=False, momentum=batch_norm_last_layer_momentum) + def forward(self, x): + + if nonlinearity is None: + nonlinearity_fcn = F.relu + else: + nonlinearity_fcn = nonlinearity + if not hasattr(self, "fc_hidden"): # it means that hidden sizes was provided and the length of the list was 0 return self.fc_in(x) - if nonlinearity is None: - x = F.relu(self.fc_in(x)) - for i in range(len(self.fc_hidden)): - x = F.relu(self.fc_hidden[i](x)) - else: - x = nonlinearity(self.fc_in(x)) - for i in range(len(self.fc_hidden)): - x = nonlinearity(self.fc_hidden[i](x)) + x = nonlinearity_fcn(self.fc_in(x)) + for i in range(len(self.fc_hidden)): + x = nonlinearity_fcn(self.fc_hidden[i](x)) x = self.fc_out(x) + if batch_norm_last_layer: + x = self.bn_out(x) + return x return DefaultNN -class ScalerAndNet(nn.Module): - """Defines a nn.Module class that wraps a scaler and a neural network, and applies the scaler before passing the - data through the neural network.""" +def createDefaultNNWithDerivatives(input_size, output_size, hidden_sizes=None, nonlinearity=None, + first_derivative_only=False): + """Function returning a fully connected neural network class with a given input and output size, and optionally + given hidden layer sizes (if these are not given, they are determined from the input and output size in a heuristic + way, see below). This neural network is capable of computing the first and second derivatives of output with respect to + input along with the forward pass. - def __init__(self, net, scaler): - super().__init__() - self.net = net - self.scaler = scaler + All layers in this neural network are linear. - def forward(self, x): - x = torch.tensor(self.scaler.transform(x), dtype=torch.float32).to(next(self.net.parameters()).device) - return self.net(x) + + In order to instantiate the network, you need to write: + + >>> createDefaultNNWithDerivatives(input_size, output_size)() + + as the function returns a class, and () is needed to instantiate an object.
+ + If hidden_sizes is None, three hidden layers are used with the following sizes: + ``[int(input_size * 1.5), int(input_size * 0.75 + output_size * 3), int(output_size * 5)]`` + + Note that the nonlinearity here is passed as a class, not an object, eg: + nonlinearity = nn.Softplus + """ + + if nonlinearity in [torch.nn.Softsign, torch.nn.Tanhshrink]: + raise RuntimeError("The implementation of forward derivatives does not work with Tanhshrink and " + "Softsign nonlinearities.") + + class DefaultNNWithDerivatives(nn.Module): + """Neural network class with sizes determined by the upper level variables.""" + + def __init__(self): + super(DefaultNNWithDerivatives, self).__init__() + # put some fully connected layers: + + if nonlinearity is None: # default nonlinearity + non_linearity = nn.ReLU + else: + non_linearity = nonlinearity # need to change name otherwise it gives Error + + if hidden_sizes is not None and len(hidden_sizes) == 0: + # it is effectively a linear network + self.fc_in = nn.Linear(input_size, output_size) + + else: + if hidden_sizes is None: + # then set some default values for the hidden layers sizes; is this parametrization reasonable? + hidden_sizes_list = [int(input_size * 1.5), int(input_size * 0.75 + output_size * 3), + int(output_size * 5)] + + else: + hidden_sizes_list = hidden_sizes + + self.fc_in = nn.Linear(input_size, hidden_sizes_list[0]) + self.nonlinearity_in = non_linearity() + + # define now the hidden layers + self.fc_hidden = nn.ModuleList() + self.nonlinearities_hidden = nn.ModuleList() + for i in range(len(hidden_sizes_list) - 1): + self.fc_hidden.append(nn.Linear(hidden_sizes_list[i], hidden_sizes_list[i + 1])) + self.nonlinearities_hidden.append(non_linearity()) + self.fc_out = nn.Linear(hidden_sizes_list[-1], output_size) + + def forward(self, x): + + if not hasattr(self, + "fc_hidden"): # it means that hidden sizes was provided and the length of the list was 0, ie the + return self.fc_in(x) + + x = self.fc_in(x) + x1 = self.nonlinearity_in(x) + + for i in range(len(self.fc_hidden)): + x = self.fc_hidden[i](x1) + x1 = self.nonlinearities_hidden[i](x) + + x = self.fc_out(x1) + + return x + + def forward_and_derivatives(self, x): + + # initialize the derivatives: + f = self.fc_in.weight.unsqueeze(0).repeat(x.shape[0], 1, 1).transpose(2, 1).transpose(0, + 1) # one for each element of the batch + if not first_derivative_only: + s = torch.zeros_like(f) + + if not hasattr(self, "fc_hidden"): + # it means that hidden sizes was provided and the length of the list was 0, ie the net is a single layer. 
+ if first_derivative_only: + return self.fc_in(x), f.transpose(0, 1) + else: + return self.fc_in(x), f.transpose(0, 1), s.transpose(0, 1) + + x = self.fc_in(x) + x1 = self.nonlinearity_in(x) + + for i in range(len(self.fc_hidden)): + z = x1.grad_fn(torch.ones_like(x1)) # here we repeat some computation from the above line + # z = grad(x1, x, torch.ones_like(x1), create_graph=True)[0] # here we repeat some computation from the above line + # you need to update first the second derivative, as you need the first derivative at previous layer + if not first_derivative_only: + s = z * s + grad(z, x, torch.ones_like(z), retain_graph=True)[0] * f ** 2 + f = z * f + f = F.linear(f, self.fc_hidden[i].weight) + if not first_derivative_only: + s = F.linear(s, self.fc_hidden[i].weight) + + x = self.fc_hidden[i](x1) + x1 = self.nonlinearities_hidden[i](x) + + z = x1.grad_fn(torch.ones_like(x1)) # here we repeat some computation from the above line + # z = grad(x1, x, torch.ones_like(x1), create_graph=True)[0] # here we repeat some computation from the above line + # you need to update first the second derivative, as you need the first derivative at previous layer + if not first_derivative_only: + s = z * s + grad(z, x, torch.ones_like(z), retain_graph=True)[0] * f ** 2 + f = z * f + f = F.linear(f, self.fc_out.weight) + if not first_derivative_only: + s = F.linear(s, self.fc_out.weight) + + x = self.fc_out(x1) + + if first_derivative_only: + return x, f.transpose(0, 1) + else: + return x, f.transpose(0, 1), s.transpose(0, 1) + + def forward_and_full_derivatives(self, x): + """This computes jacobian and full Hessian matrix""" + + # initialize the derivatives (one for each element of the batch) + f = self.fc_in.weight.unsqueeze(0).repeat(x.shape[0], 1, 1).transpose(2, 1).transpose(0, 1) + H = torch.zeros((f.shape[0], *f.shape)).to(f) # hessian has an additional dimension wrt f + + if not hasattr(self, "fc_hidden"): + # it means that hidden sizes was provided and the length of the list was 0, ie the net is a single layer + return self.fc_in(x), f.transpose(0, 1), H.transpose(0, 2) + + x = self.fc_in(x) + x1 = self.nonlinearity_in(x) + + for i in range(len(self.fc_hidden)): + z = x1.grad_fn(torch.ones_like(x1)) # here we repeat some computation from the above line + # print("H", H.shape, "z", z.shape, "z'", grad(z, x, torch.ones_like(z), retain_graph=True)[0].shape, "f", f.shape) + # z = grad(x1, x, torch.ones_like(x1), create_graph=True)[0] # here we repeat some computation from the above line + # you need to update first the second derivative, as you need the first derivative at previous layer + H = z * H + grad(z, x, torch.ones_like(z), retain_graph=True)[0] * torch.einsum('ibo,jbo->ijbo', f, f) + f = z * f + f = F.linear(f, self.fc_hidden[i].weight) + H = F.linear(H, self.fc_hidden[i].weight) + + x = self.fc_hidden[i](x1) + x1 = self.nonlinearities_hidden[i](x) + + z = x1.grad_fn(torch.ones_like(x1)) # here we repeat some computation from the above line + # z = grad(x1, x, torch.ones_like(x1), create_graph=True)[0] # here we repeat some computation from the above line + # you need to update first the second derivative, as you need the first derivative at previous layer + H = z * H + grad(z, x, torch.ones_like(z), retain_graph=True)[0] * torch.einsum('ibo,jbo->ijbo', f, f) + f = z * f + f = F.linear(f, self.fc_out.weight) + H = F.linear(H, self.fc_out.weight) + x = self.fc_out(x1) + + return x, f.transpose(0, 1), H.transpose(0, 2) + + return DefaultNNWithDerivatives diff --git 
a/abcpy/NN_utilities/trainer.py b/abcpy/NN_utilities/trainer.py index 12382efe..18fd3376 100644 --- a/abcpy/NN_utilities/trainer.py +++ b/abcpy/NN_utilities/trainer.py @@ -17,8 +17,13 @@ def fit(train_loader, model, loss_fn, optimizer, scheduler, n_epochs, cuda, val_ """ logger = logging.getLogger("NN Trainer") - if early_stopping and val_loader is not None: - validation_loss_list = [] + train_loss_list = [] + if val_loader is not None: + test_loss_list = [] + if early_stopping: + early_stopping_loss_list = [] # list of losses used for early stopping + else: + test_loss_list = None if early_stopping and val_loader is None: raise RuntimeError("You cannot perform early stopping if a validation loader is not provided to the training " "routine") @@ -29,22 +34,24 @@ def fit(train_loader, model, loss_fn, optimizer, scheduler, n_epochs, cuda, val_ for epoch in tqdm(range(start_epoch_training, n_epochs), disable=not use_tqdm): # Train stage train_loss = train_epoch(train_loader, model, loss_fn, optimizer, cuda) + train_loss_list.append(train_loss) logger.debug('Epoch: {}/{}. Train set: Average loss: {:.4f}'.format(epoch + 1, n_epochs, train_loss)) # Validation stage if val_loader is not None: val_loss = test_epoch(val_loader, model, loss_fn, cuda) + test_loss_list.append(val_loss) logger.debug('Epoch: {}/{}. Validation set: Average loss: {:.4f}'.format(epoch + 1, n_epochs, val_loss)) # early stopping: if early_stopping and (epoch + 1) % epochs_early_stopping_interval == 0: - validation_loss_list.append(val_loss) # save the previous validation loss. It is actually + early_stopping_loss_list.append(val_loss) # save the previous validation loss. It is actually # we need to have at least two saved test losses for performing early stopping (in which case we know # we have saved the previous state_dict as well). - if epoch + 1 >= start_epoch_early_stopping and len(validation_loss_list) > 1: - if validation_loss_list[-1] > validation_loss_list[-2]: + if epoch + 1 >= start_epoch_early_stopping and len(early_stopping_loss_list) > 1: + if early_stopping_loss_list[-1] > early_stopping_loss_list[-2]: logger.info("Training has been early stopped at epoch {}.".format(epoch + 1)) # reload the previous state dict: model.load_state_dict(net_state_dict) @@ -54,6 +61,7 @@ def fit(train_loader, model, loss_fn, optimizer, scheduler, n_epochs, cuda, val_ scheduler.step() + return train_loss_list, test_loss_list def train_epoch(train_loader, model, loss_fn, optimizer, cuda): """Function implementing the training in one epoch. diff --git a/abcpy/NN_utilities/utilities.py b/abcpy/NN_utilities/utilities.py index 61a07d9b..c50c4e70 100644 --- a/abcpy/NN_utilities/utilities.py +++ b/abcpy/NN_utilities/utilities.py @@ -6,6 +6,8 @@ has_torch = True import logging +from functools import reduce +from operator import mul import numpy as np @@ -54,3 +56,112 @@ def load_net(path, network_class, *network_args, **network_kwargs): net = network_class(*network_args, **network_kwargs) net.load_state_dict(torch.load(path)) return net.eval() # call the network to eval model. Needed with batch normalization and dropout layers. + + +def jacobian(input, output, diffable=True): + ''' + Returns the Jacobian matrix (batch x in_size x out_size) of the function that produced the output evaluated at the + input + + From https://github.com/mwcvitkovic/MASS-Learning/blob/master/models/utils.py + + Important: need to use diffable=True in order for the training routines based on these to work! 
+ + ''' + assert len(output.shape) == 2 + assert input.shape[0] == output.shape[0] + in_size = reduce(mul, list(input.shape[1:]), 1) + if (input.sum() + output.sum()).item() in [np.nan, np.inf]: + raise ValueError + J = torch.zeros(list(output.shape) + list(input.shape[1:])).to(input) + # they are able here to do the gradient computation one batch at a time, of course still considering only one output coordinate at a time + for i in range(output.shape[1]): + g = torch.zeros(output.shape).to(input) + g[:, i] = 1 + if diffable: + J[:, i] = torch.autograd.grad(output, input, g, only_inputs=True, retain_graph=True, create_graph=True)[0] + else: + J[:, i] = torch.autograd.grad(output, input, g, only_inputs=True, retain_graph=True)[0] + J = J.reshape(output.shape[0], output.shape[1], in_size) + return J.transpose(2, 1) + + +def jacobian_second_order(input, output, diffable=True): + ''' + Returns the Jacobian matrix (batch x in_size x out_size) of the function that produced the output evaluated at the + input, as well as + the matrix of second derivatives of outputs with respect to inputs (batch x in_size x out_size) + + Adapted from https://github.com/mwcvitkovic/MASS-Learning/blob/master/models/utils.py + + Important: need to use diffable=True in order for the training routines based on these to work! + ''' + assert len(output.shape) == 2 + assert input.shape[0] == output.shape[0] + in_size = reduce(mul, list(input.shape[1:]), 1) + if (input.sum() + output.sum()).item() in [np.nan, np.inf]: + raise ValueError + J = torch.zeros(list(output.shape) + list(input.shape[1:])).to(input) + J2 = torch.zeros(list(output.shape) + list(input.shape[1:])).to(input) + + for i in range(output.shape[1]): + g = torch.zeros(output.shape).to(input) + g[:, i] = 1 + J[:, i] = torch.autograd.grad(output, input, g, only_inputs=True, retain_graph=True, create_graph=True)[0] + J = J.reshape(output.shape[0], output.shape[1], in_size) + + for i in range(output.shape[1]): + for j in range(input.shape[1]): + g = torch.zeros(J.shape).to(input) + g[:, i, j] = 1 + if diffable: + J2[:, i, j] = torch.autograd.grad(J, input, g, only_inputs=True, retain_graph=True, create_graph=True)[ + 0][:, j] + else: + J2[:, i, j] = torch.autograd.grad(J, input, g, only_inputs=True, retain_graph=True)[0][:, j] + + J2 = J2.reshape(output.shape[0], output.shape[1], in_size) + + return J.transpose(2, 1), J2.transpose(2, 1) + + +def jacobian_hessian(input, output, diffable=True): + ''' + Returns the Jacobian matrix (batch x in_size x out_size) of the function that produced the output evaluated at the + input, as well as the Hessian matrix (batch x in_size x in_size x out_size). + + This takes slightly more than the jacobian_second_order routine. + + Adapted from https://github.com/mwcvitkovic/MASS-Learning/blob/master/models/utils.py + + Important: need to use diffable=True in order for the training routines based on these to work! 
+ ''' + assert len(output.shape) == 2 + assert input.shape[0] == output.shape[0] + in_size = reduce(mul, list(input.shape[1:]), 1) + if (input.sum() + output.sum()).item() in [np.nan, np.inf]: + raise ValueError + J = torch.zeros(list(output.shape) + list(input.shape[1:])).to(input) + H = torch.zeros(list(output.shape) + list(input.shape[1:]) + list(input.shape[1:])).to(input) + + for i in range(output.shape[1]): + g = torch.zeros(output.shape).to(input) + g[:, i] = 1 + J[:, i] = torch.autograd.grad(output, input, g, only_inputs=True, retain_graph=True, create_graph=True)[0] + J = J.reshape(output.shape[0], output.shape[1], in_size) + + for i in range(output.shape[1]): + for j in range(input.shape[1]): + g = torch.zeros(J.shape).to(input) + g[:, i, j] = 1 + if diffable: + H[:, i, j] = torch.autograd.grad(J, input, g, only_inputs=True, retain_graph=True, create_graph=True)[0] + else: + H[:, i, j] = torch.autograd.grad(J, input, g, only_inputs=True, retain_graph=True)[0] + + return J.transpose(2, 1), H.transpose(3, 1) + + +def set_requires_grad(net, value): + for param in net.parameters(): + param.requires_grad = value diff --git a/abcpy/continuousmodels.py b/abcpy/continuousmodels.py index b1b6e2bb..baa9e98a 100644 --- a/abcpy/continuousmodels.py +++ b/abcpy/continuousmodels.py @@ -690,8 +690,9 @@ def __init__(self, parameters, name='Exponential'): ---------- parameters: list Contains the probabilistic models and hyperparameters from which the model derives. - The list has one entry: the rate $\lambda$ of the exponential distribution, that has therefore pdf: - f(x; \lambda) = \lambda \exp(-\lambda x ), + The list has one entry: the rate :math:`\lambda` of the exponential distribution, that has therefore pdf: + :math:`f(x; \lambda) = \lambda \exp(-\lambda x )` + name: string The name that should be given to the probabilistic model in the journal file. """ diff --git a/abcpy/distances.py b/abcpy/distances.py index 9d3b25a5..14a43907 100644 --- a/abcpy/distances.py +++ b/abcpy/distances.py @@ -482,7 +482,16 @@ def _estimate_always_positive(self): @staticmethod def get_random_projections(n_projections, d, seed=None): r""" + Taken from + + https://github.com/PythonOT/POT/blob/78b44af2434f494c8f9e4c8c91003fbc0e1d4415/ot/sliced.py + + Author: Adrien Corenflos + + License: MIT License + Generates n_projections samples from the uniform on the unit sphere of dimension d-1: :math:`\mathcal{U}(\mathcal{S}^{d-1})` + Parameters ---------- n_projections : int @@ -491,6 +500,7 @@ def get_random_projections(n_projections, d, seed=None): dimension of the space seed: int or RandomState, optional Seed used for numpy random number generator + Returns ------- out: ndarray, shape (n_projections, d) @@ -516,11 +526,19 @@ def get_random_projections(n_projections, d, seed=None): def sliced_wasserstein_distance(self, X_s, X_t, a=None, b=None, n_projections=50, seed=None, log=False): r""" + Taken from + + https://github.com/PythonOT/POT/blob/78b44af2434f494c8f9e4c8c91003fbc0e1d4415/ot/sliced.py + + Author: Adrien Corenflos + + License: MIT License + Computes a Monte-Carlo approximation of the 2-Sliced Wasserstein distance - .. 
math:: - \mathcal{SWD}_2(\mu, \nu) = \underset{\theta \sim \mathcal{U}(\mathbb{S}^{d-1})}{\mathbb{E}}[\mathcal{W}_2^2(\theta_\# \mu, \theta_\# \nu)]^{\frac{1}{2}} - where : - - :math:`\theta_\# \mu` stands for the pushforwars of the projection :math:`\mathbb{R}^d \ni X \mapsto \langle \theta, X \rangle` + :math:`\mathcal{SWD}_2(\mu, \nu) = \underset{\theta \sim \mathcal{U}(\mathbb{S}^{d-1})}{\mathbb{E}}[\mathcal{W}_2^2(\theta_\# \mu, \theta_\# \nu)]^{\frac{1}{2}}` + where + :math:`\theta_\# \mu` stands for the pushforwars of the projection :math:`\mathbb{R}^d \ni X \mapsto \langle \theta, X \rangle` + Parameters ---------- X_s : ndarray, shape (n_samples_a, dim) @@ -552,7 +570,7 @@ def sliced_wasserstein_distance(self, X_s, X_t, a=None, b=None, n_projections=50 0.0 References ---------- - .. [31] Bonneel, Nicolas, et al. "Sliced and radon wasserstein barycenters of measures." Journal of Mathematical Imaging and Vision 51.1 (2015): 22-45 + Bonneel, Nicolas, et al. "Sliced and radon wasserstein barycenters of measures." Journal of Mathematical Imaging and Vision 51.1 (2015): 22-45 """ from ot.lp import emd2_1d diff --git a/abcpy/inferences.py b/abcpy/inferences.py index 922b1ca0..71504796 100644 --- a/abcpy/inferences.py +++ b/abcpy/inferences.py @@ -151,6 +151,279 @@ def distance(self): raise NotImplementedError +class DrawFromPrior(InferenceMethod): + """Helper class to obtain samples from the prior for a model. + + The `sample` method follows similar API to the other InferenceMethod's (returning a journal file), while the + `sample_par_sim_pairs` method generates (parameter, simulation) pairs, which can be used for instance as a training + dataset for the automatic learning of summary statistics with the StatisticsLearning classes. + + When generating large datasets with MPI backend, pickling may give overflow error; for this reason, the methods + split the generation in "chunks" of the specified size on which the parallelization is used. + + Parameters + ---------- + root_models: list + A list of the Probabilistic models corresponding to the observed datasets + backend: abcpy.backends.Backend + Backend object defining the backend to be used. + seed: integer, optional + Optional initial seed for the random number generator. The default value is generated randomly. + max_chunk_size: integer, optional + Maximum size of chunks in which to split the data generation. Defaults to 10**4 + discard_too_large_values: boolean + If set to True, the simulation is discarded (and repeated) if at least one element of it is too large + to fit in float32, which therefore may be converted to infinite value in numpy. Defaults to False. + """ + + model = None + rng = None + n_samples = None + backend = None + + n_samples_per_param = None # this needs to be there otherwise it does not instantiate correctly + + def __init__(self, root_models, backend, seed=None, max_chunk_size=10 ** 4, discard_too_large_values=False): + self.model = root_models + self.backend = backend + self.rng = np.random.RandomState(seed) + self.max_chunk_size = max_chunk_size + self.discard_too_large_values = discard_too_large_values + # An object managing the bds objects + self.accepted_parameters_manager = AcceptedParametersManager(self.model) + self.logger = logging.getLogger(__name__) + + def sample(self, n_samples, path_to_save_journal=None): + """ + Samples model parameters from the prior distribution. 
+ + Parameters + ---------- + n_samples: integer + Number of samples to generate + path_to_save_journal: str, optional + If provided, save the journal after inference at the provided path. + + Returns + ------- + abcpy.output.Journal + a journal containing results and metadata. + """ + + journal = Journal(1) + journal.configuration["type_model"] = [type(model).__name__ for model in self.model] + journal.configuration["n_samples"] = n_samples + + # we split sampling in chunks to avoid error in case MPI is used + parameters = [] + samples_to_sample = n_samples + while samples_to_sample > 0: + parameters_part = self._sample(min(samples_to_sample, self.max_chunk_size)) + samples_to_sample -= self.max_chunk_size + parameters += parameters_part + + journal.add_accepted_parameters(copy.deepcopy(parameters)) + journal.add_weights(np.ones((n_samples, 1))) + journal.add_ESS_estimate(np.ones((n_samples, 1))) + self.accepted_parameters_manager.update_broadcast(self.backend, accepted_parameters=parameters) + names_and_parameters = self._get_names_and_parameters() + journal.add_user_parameters(names_and_parameters) + journal.number_of_simulations.append(0) + + if path_to_save_journal is not None: # save journal + journal.save(path_to_save_journal) + + return journal + + def sample_par_sim_pairs(self, n_samples, n_samples_per_param): + """ + Samples (parameter, simulation) pairs from the prior and the model. Specifically, + parameter values are sampled from the prior and used to generate the specified number of simulations per + parameter value. This returns arrays. + + Parameters + ---------- + n_samples: integer + Number of samples to generate + n_samples_per_param: integer + Number of data points in each simulated data set. + + Returns + ------- + tuple + A tuple of numpy.ndarray's containing parameter and simulation values. The first element of the tuple is an + array with shape (n_samples, d_theta), where d_theta is the dimension of the parameters. The second element + of the tuple is an array with shape (n_samples, n_samples_per_param, d_x), where d_x is the dimension of + each simulation. + """ + + # we split sampling in chunks to avoid error in case MPI is used + parameters_list = [] + simulations_list = [] + samples_to_sample = n_samples + while samples_to_sample > 0: + parameters_part, simulations_part = self._sample_par_sim_pairs(min(samples_to_sample, self.max_chunk_size), + n_samples_per_param) + samples_to_sample -= self.max_chunk_size + parameters_list.append(parameters_part) + simulations_list.append(simulations_part) + parameters = np.concatenate(parameters_list) + simulations = np.concatenate(simulations_list) + return parameters, simulations + + def _sample(self, n_samples): + """ + Not for end use; please use `sample`. + + Samples model parameters from the prior distribution. This is a helper function called by the main `sample` one + in order to split drawing from the prior in chunks to avoid parallelization issues with MPI. + + Parameters + ---------- + n_samples: integer + Number of samples to generate + + Returns + ------- + list + List containing sampled parameter values. + """ + # the following lines are similar to the RejectionABC code but only sample from the prior.
+ + # generate the rng_pds + rng_pds = self._generate_rng_pds(n_samples) + + parameters_pds = self.backend.map(self._sample_parameter_only, rng_pds) + parameters = self.backend.collect(parameters_pds) + return parameters + + def _sample_par_sim_pairs(self, n_samples, n_samples_per_param): + """ + Not for end use; please use `sample_par_sim_pairs`. + + Samples (parameter, simulation) pairs from the prior and the model. Specifically, + parameter values are sampled from the prior and used to generate the specified number of simulations per + parameter value. This returns arrays. + + This is a helper function called by the main `sample_par_sim_pairs` one + in order to split drawing from the prior in chunks to avoid parallelization issues with MPI. + + Parameters + ---------- + n_samples: integer + Number of samples to generate + n_samples_per_param: integer + Number of data points in each simulated data set. + + Returns + ------- + tuple + A tuple of numpy.ndarray's containing parameter and simulation values. The first element of the tuple is an + array with shape (n_samples, d_theta), where d_theta is the dimension of the parameters. The second element + of the tuple is an array with shape (n_samples, n_samples_per_param, d_x), where d_x is the dimension of + each simulation. + """ + self.n_samples_per_param = n_samples_per_param + self.accepted_parameters_manager.broadcast(self.backend, 1) + + # generate the rng_pds + rng_pds = self._generate_rng_pds(n_samples) + + parameters_simulations_pds = self.backend.map(self._sample_parameter_simulation, rng_pds) + parameters_simulations = self.backend.collect(parameters_simulations_pds) + parameters, simulations = [list(t) for t in zip(*parameters_simulations)] + + parameters = np.array(parameters) + simulations = np.array(simulations) + + parameters = parameters.reshape((parameters.shape[0], parameters.shape[1])) + simulations = simulations.reshape((simulations.shape[0], simulations.shape[2], simulations.shape[3],)) + + return parameters, simulations + + def _generate_rng_pds(self, n_samples): + """Helper function to generate the random seeds which are used in sampling from prior and simulating from the + model in the parallel setup. + + Parameters + ---------- + n_samples: integer + Number of random seeds (corresponding to number of prior samples) to generate + + Returns + ------- + list + A (possibly distributed according to the used backend) list containing the random seeds to be assigned to + each worker. + """ + # now generate an array of seeds that need to be different one from the other. One way to do it is the + # following. + # Moreover, you cannot use int64 as seeds need to be < 2**32 - 1. How to fix this? + # Note that this is not perfect; you still have a small possibility of having some seeds that are equal. Is there + # a better way?
This would likely not change the performance much + # An idea would be to use rng.choice but that is too expensive + seed_arr = self.rng.randint(0, np.iinfo(np.uint32).max, size=n_samples, dtype=np.uint32) + # check how many equal seeds there are and remove them: + sorted_seed_arr = np.sort(seed_arr) + indices = sorted_seed_arr[:-1] == sorted_seed_arr[1:] + if np.sum(indices) > 0: + # the following removes the equal seeds in case there are some + sorted_seed_arr[:-1][indices] = sorted_seed_arr[:-1][indices] + 1 + rng_arr = np.array([np.random.RandomState(seed) for seed in sorted_seed_arr]) + rng_pds = self.backend.parallelize(rng_arr) + return rng_pds + + def _sample_parameter_simulation(self, rng, npc=None): + """ + Samples a single model parameter and simulates from it. + + Parameters + ---------- + rng: random number generator + The random number generator to be used. + Returns + ------- + Tuple + The first entry of the tuple is the parameter. + The second entry is the simulation drawn from it. + """ + + ok_flag = False + + while not ok_flag: + self.sample_from_prior(rng=rng) + theta = self.get_parameters(self.model) + y_sim = self.simulate(self.n_samples_per_param, rng=rng, npc=npc) + + # if there are no potential infinities there (or if we do not check for those). + # For instance, Lorenz model may give too large values sometimes (quite rarely). + if self.discard_too_large_values and np.sum(np.isinf(np.array(y_sim).astype("float32"))) > 0: + self.logger.warning("y_sim contained too large values for float32; simulating again.") + else: + ok_flag = True + + return theta, y_sim + + def _sample_parameter_only(self, rng, npc=None): + """ + Samples a single model parameter from the prior. + + Parameters + ---------- + rng: random number generator + The random number generator to be used. + Returns + ------- + list + The sampled parameter values + """ + + self.sample_from_prior(rng=rng) + theta = self.get_parameters(self.model) + + return theta + + class RejectionABC(InferenceMethod): """This class implements the rejection algorithm based inference scheme [1] for Approximate Bayesian Computation. @@ -167,7 +440,7 @@ class RejectionABC(InferenceMethod): each model. backend: abcpy.backends.Backend Backend object defining the backend to be used. - seed: integer, optionaldistance + seed: integer, optional Optional initial seed for the random number generator. The default value is generated randomly.
""" @@ -411,8 +684,8 @@ def _sample_parameter(self, rng, npc=None): distance = self.distance.dist_max() if distance < self.epsilon and self.logger: - self.logger.warn("initial epsilon {:e} is larger than dist_max {:e}" - .format(float(self.epsilon), distance)) + self.logger.warning("initial epsilon {:e} is larger than dist_max {:e}" + .format(float(self.epsilon), distance)) counter = 0 @@ -1891,7 +2164,7 @@ def sample(self, observations, steps, n_samples=10000, n_samples_per_param=1, ch if aStep == 0 and journal_file is not None: accepted_parameters = journal.get_accepted_parameters(-1) accepted_weights = journal.get_weights(-1) - accepted_cov_mats = journal.opt_values[-1] + accepted_cov_mats = journal.get_accepted_cov_mats() # main ABCsubsim algorithm self.logger.info("Initialization of ABCsubsim") @@ -1990,7 +2263,7 @@ def sample(self, observations, steps, n_samples=10000, n_samples_per_param=1, ch journal.add_distances(copy.deepcopy(distances)) journal.add_weights(copy.deepcopy(accepted_weights)) journal.add_ESS_estimate(accepted_weights) - journal.add_opt_values(accepted_cov_mats) + journal.add_accepted_cov_mats(accepted_cov_mats) self.accepted_parameters_manager.update_broadcast(self.backend, accepted_parameters=accepted_parameters, accepted_weights=accepted_weights) names_and_parameters = self._get_names_and_parameters() @@ -2015,7 +2288,7 @@ def sample(self, observations, steps, n_samples=10000, n_samples_per_param=1, ch journal.add_distances(copy.deepcopy(distances)) journal.add_weights(copy.deepcopy(accepted_weights)) journal.add_ESS_estimate(accepted_weights) - journal.add_opt_values(accepted_cov_mats) + journal.add_accepted_cov_mats(accepted_cov_mats) self.accepted_parameters_manager.update_broadcast(self.backend, accepted_parameters=accepted_parameters, accepted_weights=accepted_weights) names_and_parameters = self._get_names_and_parameters() @@ -2998,8 +3271,8 @@ def __init__(self, root_models, distances, backend, kernel=None, version="DelMor self.simulation_counter = 0 def sample(self, observations, steps, n_samples=10000, n_samples_per_param=1, epsilon_final=0.1, alpha=None, - covFactor=2, resample=None, full_output=0, which_mcmc_kernel=None, r=None, journal_file=None, - path_to_save_journal=None): + covFactor=2, resample=None, full_output=0, which_mcmc_kernel=None, r=None, + store_simulations_in_journal=True, journal_file=None, path_to_save_journal=None): """Samples from the posterior distribution of the model parameter given the observed data observations. @@ -3045,6 +3318,13 @@ def sample(self, observations, steps, n_samples=10000, n_samples_per_param=1, ep Specifies the value of 'r' (the number of wanted hits) in the r-hits kernels. It is therefore ignored if 'which_mcmc_kernel==0'. If no value is provided, the first version of r-hit kernel uses r=3, while the second uses r=2. The default value is None. + store_simulations_in_journal : boolean, optional + Every step of the SMCABC algorithm uses the accepted simulations from previous step. Therefore, the accepted + simulations at the final step are stored in the Journal file to allow restarting the inference + correctly. If each simulation is large, however, that means that the accepted Journal will be large in + memory. If you want to *not* save the simulations in the journal, set this to False; however, you will not + be able to restart the inference from the returned Journal. The default value is True, meaning simulations + are stored in the Journal. 
journal_file: str, optional Filename of a journal file to read an already saved journal file, from which the first iteration will start. The default value is None. @@ -3092,6 +3372,7 @@ def sample(self, observations, steps, n_samples=10000, n_samples_per_param=1, ep accepted_weights = None accepted_cov_mats = None accepted_y_sim = None + distances = None # Define the resample parameter if resample is None: @@ -3128,7 +3409,11 @@ def sample(self, observations, steps, n_samples=10000, n_samples_per_param=1, ep if aStep == 0 and journal_file is not None: accepted_parameters = journal.get_accepted_parameters(-1) accepted_weights = journal.get_weights(-1) - accepted_y_sim = journal.opt_values[-1] + accepted_y_sim = journal.get_accepted_simulations() + if accepted_y_sim is None: + raise RuntimeError("You cannot restart the inference from this Journal file as you did not store " + "the simulations in it. In order to do that, the inference scheme needs to be" + "called with `store_simulations_in_journal=True`.") distances = journal.get_distances(-1) epsilon = journal.configuration["epsilon_arr"] @@ -3154,7 +3439,7 @@ def sample(self, observations, steps, n_samples=10000, n_samples_per_param=1, ep break # 0: Compute the Epsilon - if accepted_y_sim != None: + if distances is not None: self.logger.info( "Compute epsilon, might take a while; previous epsilon value: {:.4f}".format(epsilon[-1])) if self.bernton: @@ -3186,7 +3471,7 @@ def sample(self, observations, steps, n_samples=10000, n_samples_per_param=1, ep # 1: calculate weights for new parameters self.logger.info("Calculating weights") - if accepted_y_sim is not None: + if distances is not None: if self.bernton: new_weights = (current_distance_matrix < epsilon[-1]) * 1 else: @@ -3205,7 +3490,7 @@ def sample(self, observations, steps, n_samples=10000, n_samples_per_param=1, ep # 2: Resample; we resample always when using the Bernton et al. 
algorithm, as in that case weights # can only be proportional to 1 or 0; if we use the Del Moral version, instead, the # weights can have fractional values -> use the # resample threshold - if accepted_y_sim is not None and (self.bernton or pow(sum(pow(new_weights, 2)), -1) < resample): + if distances is not None and (self.bernton or pow(sum(pow(new_weights, 2)), -1) < resample): self.logger.info("Resampling") # Weighted resampling: index_resampled = self.rng.choice(n_samples, n_samples, replace=True, p=new_weights) @@ -3221,7 +3506,7 @@ def sample(self, observations, steps, n_samples=10000, n_samples_per_param=1, ep self.accepted_parameters_manager.update_broadcast(self.backend, accepted_parameters=accepted_parameters, accepted_weights=accepted_weights) - if accepted_y_sim is not None: + if distances is not None: kernel_parameters = [] for kernel in self.kernel.kernels: kernel_parameters.append( @@ -3275,7 +3560,8 @@ def sample(self, observations, steps, n_samples=10000, n_samples_per_param=1, ep journal.add_distances(copy.deepcopy(distances)) journal.add_weights(copy.deepcopy(accepted_weights)) journal.add_ESS_estimate(accepted_weights) - journal.add_opt_values(copy.deepcopy(accepted_y_sim)) + if store_simulations_in_journal: + journal.add_accepted_simulations(copy.deepcopy(accepted_y_sim)) names_and_parameters = self._get_names_and_parameters() journal.add_user_parameters(names_and_parameters) @@ -3998,6 +4284,10 @@ def sample(self, observations, n_samples, n_samples_per_param=100, burnin=1000, accepted_parameter = self.get_parameters() else: accepted_parameter = iniPoint + if isinstance(accepted_parameter, np.ndarray) and len(accepted_parameter.shape) == 1 or isinstance( + accepted_parameter, list) and not hasattr(accepted_parameter[0], "__len__"): + # reshape whether we pass a 1d array or list. 
+ accepted_parameter = [np.array([x]) for x in accepted_parameter] # give correct shape for later if burnin == 0: accepted_parameters_burnin.append(accepted_parameter) self.logger.info("Calculate approximate loglikelihood") diff --git a/abcpy/output.py b/abcpy/output.py index 13e05b71..a2f8ccc4 100644 --- a/abcpy/output.py +++ b/abcpy/output.py @@ -1,3 +1,4 @@ +import copy import pickle import warnings @@ -5,6 +6,8 @@ import numpy as np from scipy.stats import gaussian_kde +from abcpy.acceptedparametersmanager import AcceptedParametersManager +from abcpy.graphtools import GraphTools from abcpy.utils import wass_dist @@ -15,15 +18,24 @@ class Journal: Attributes ---------- - parameters : numpy.array - a nxpxt matrix - weights : numpy.array - a nxt matrix - opt_value : numpy.array - nxp matrix containing for each parameter the evaluated objective function for every time step + accepted_parameters : list + List of lists containing posterior samples + names_and_parameters : list + List of dictionaries containing posterior samples with parameter names as keys + accepted_simulations : list + List of lists containing simulations corresponding to posterior samples (this could be empty if the sampling + routine does not store those) + accepted_cov_mats : list + List of lists containing covariance matrices from accepted posterior samples (this could be empty if + the sampling routine does not store those) + weights : list + List containing posterior weights + ESS : list + List containing the Effective Sample Size (ESS) at each iteration + distances : list + List containing the ABC distance at each iteration configuration : Python dictionary dictionary containing the schemes configuration parameters - """ def __init__(self, type): @@ -39,10 +51,11 @@ def __init__(self, type): self.accepted_parameters = [] self.names_and_parameters = [] + self.accepted_simulations = [] + self.accepted_cov_mats = [] self.weights = [] self.ESS = [] self.distances = [] - self.opt_values = [] self.configuration = {} if type not in [0, 1]: @@ -114,6 +127,38 @@ def add_accepted_parameters(self, accepted_parameters): if self._type == 1: self.accepted_parameters.append(accepted_parameters) + def add_accepted_simulations(self, accepted_simulations): + """ + Saves provided accepted simulations by appending them to the journal. If type==0, old accepted simulations get + overwritten. + + Parameters + ---------- + accepted_simulations: list + """ + + if self._type == 0: + self.accepted_simulations = [accepted_simulations] + + if self._type == 1: + self.accepted_simulations.append(accepted_simulations) + + def add_accepted_cov_mats(self, accepted_cov_mats): + """ + Saves provided accepted cov_mats by appending them to the journal. If type==0, old accepted cov_mats get + overwritten. + + Parameters + ---------- + accepted_cov_mats: list + """ + + if self._type == 0: + self.accepted_cov_mats = [accepted_cov_mats] + + if self._type == 1: + self.accepted_cov_mats.append(accepted_cov_mats) + def add_weights(self, weights): """ Saves provided weights by appending them to the journal. If type==0, old weights get overwritten. @@ -146,23 +191,6 @@ def add_distances(self, distances): if self._type == 1: self.distances.append(distances) - def add_opt_values(self, opt_values): - """ - Saves provided values of the evaluation of the schemes objective function. 
If type==0, old values get - overwritten - - Parameters - ---------- - opt_value: numpy.array - vector containing n evaluations of the schemes objective function - """ - - if self._type == 0: - self.opt_values = [opt_values] - - if self._type == 1: - self.opt_values.append(opt_values) - def add_ESS_estimate(self, weights): """ Computes and saves Effective Sample Size (ESS) estimate starting from provided weights; ESS is estimated as sum @@ -237,9 +265,8 @@ def get_accepted_parameters(self, iteration=None): Returns ------- - accepted_parameters: dictionary - Samples from the specified iteration (last, if not specified) returned as a disctionary with names of the - random variables + accepted_parameters: list + List containing samples from the specified iteration (last, if not specified) """ if iteration is None: @@ -248,6 +275,60 @@ def get_accepted_parameters(self, iteration=None): else: return self.accepted_parameters[iteration] + def get_accepted_simulations(self, iteration=None): + """ + Returns the accepted simulations from a sampling scheme. Notice not all sampling schemes store those in the + Journal, so this may return None. + + For intermediate results, pass the iteration. + + Parameters + ---------- + iteration: int + specify the iteration for which to return accepted simulations + + Returns + ------- + accepted_simulations: list + List containing simulations corresponding to accepted samples from the specified + iteration (last, if not specified) + """ + + if iteration is None: + if len(self.accepted_simulations) == 0: + return None + return self.accepted_simulations[-1] + + else: + return self.accepted_simulations[iteration] + + def get_accepted_cov_mats(self, iteration=None): + """ + Returns the accepted cov_mats used in a sampling scheme. Notice not all sampling schemes store those in the + Journal, so this may return None. + + For intermediate results, pass the iteration. + + Parameters + ---------- + iteration: int + specify the iteration for which to return accepted cov_mats + + Returns + ------- + accepted_cov_mats: list + List containing accepted cov_mats from the specified + iteration (last, if not specified) + """ + + if iteration is None: + if len(self.accepted_cov_mats) == 0: + return None + return self.accepted_cov_mats[-1] + + else: + return self.accepted_cov_mats[iteration] + def get_weights(self, iteration=None): """ Returns the weights from a sampling scheme. @@ -837,17 +918,18 @@ def Wass_convergence_plot(self, num_iter_max=1e8, **kwargs): "sequential algorithm for one iteration only or to using non-sequential algorithms (as" "RejectionABC). Wasserstein distance convergence test requires at least samples from at " "least 2 iterations.") - if self.get_accepted_parameters().dtype == "object": + last_params = np.array(self.get_accepted_parameters()) + if last_params.dtype == "object": raise RuntimeError("This error was probably raised due to the parameters in your model having different " - "dimenions (and specifically not being univariate). For now, Wasserstein distance" + "dimensions (and specifically not being univariate). 
For now, Wasserstein distance" " convergence test is available only if the different parameters have the same " "dimension.") wass_dist_lists = [None] * (len(self.weights) - 1) for i in range(len(self.weights) - 1): - params_1 = self.get_accepted_parameters(i) - params_2 = self.get_accepted_parameters(i + 1) + params_1 = np.array(self.get_accepted_parameters(i)) + params_2 = np.array(self.get_accepted_parameters(i + 1)) weights_1 = self.get_weights(i) weights_2 = self.get_weights(i + 1) if len(params_1.shape) == 1: # we assume that the dimension of parameters is 1 @@ -928,3 +1010,251 @@ def traceplot(self, parameters_to_show=None, iteration=None, **kwargs): ax[i].set_xlabel("MCMC step") return fig, ax + + def resample(self, n_samples=None, replace=True, path_to_save_journal=None, seed=None): + """ + Helper method to resample (by bootstrapping or subsampling) the posterior samples stored in the Journal. + This can be used for instance to obtain an unweighted set of + posterior samples from a weighted one (via bootstrapping) or + to subsample a given number of posterior samples from a larger set. The new set of (unweighted) + samples are stored in a new journal which is returned by the method. + + In order to bootstrap/subsample, the ``np.random.choice`` method is used, with the posterior sample + weights used as + probabilities (p) for resampling each sample. ``np.random.choice`` performs resampling with or without + replacement according to whether ``replace=True`` or ``replace=False``. Moreover, the parameter + ``n_samples`` specifies the number of resampled samples + (the ``size`` argument of ``np.random.choice``) and is set by + default to the number of samples in the journal. Therefore, different combinations of these + two parameters can be used to bootstrap or to subsample a set of posterior samples (see the examples below); + the default parameter values perform bootstrap. + + Parameters + ---------- + n_samples: integer, optional + The number of posterior samples which you want to resample. Defaults to the number of posterior samples + currently stored in the Journal. + replace: boolean, optional + If True, sampling with replacement is performed; if False, sampling without replacement. Defaults to True. + path_to_save_journal: str, optional + If provided, save the journal with the resampled posterior samples at the provided path. + seed: integer, optional + Optional initial seed for the random number generator. The default value is generated randomly. + + Returns + ------- + abcpy.output.Journal + a journal containing the resampled posterior samples + + Examples + -------- + If ``journal`` contains a weighted set of posterior samples, the following returns an unweighted bootstrapped + set of posterior samples, stored in ``new_journal``: + + >>> new_journal = journal.resample() + + The above of course also works when the original posterior samples are unweighted. + + If ``journal`` contains a large number of posterior samples, you can subsample (without replacement) + a smaller number of them (say 100) with the following line (and store them in ``new_journal``): + + >>> new_journal = journal.resample(n_samples=100, replace=False) + + Notice that the above takes into account the weights in the original ``journal``.
+ + """ + + # instantiate the random number generator + rng = np.random.RandomState(seed) + + # this extracts the parameters from the journal + accepted_parameters = self.get_accepted_parameters(-1) + accepted_weights = self.get_weights(-1) + n_samples_old = self.configuration["n_samples"] + normalized_weights = accepted_weights.reshape(-1) / np.sum(accepted_weights) + + n_samples = n_samples_old if n_samples is None else n_samples + + if n_samples > n_samples_old and not replace: + raise RuntimeError("You cannot draw without replacement a larger number of samples than the posterior " + "samples currently stored in the journal.") + + # here you just need to bootstrap (or subsample): + bootstrapped_parameter_indices = rng.choice(np.arange(n_samples_old), size=n_samples, replace=replace, + p=normalized_weights) + bootstrapped_parameters = [accepted_parameters[index] for index in bootstrapped_parameter_indices] + + # define new journal + journal_new = Journal(0) + journal_new.configuration["type_model"] = self.configuration["type_model"] + journal_new.configuration["n_samples"] = n_samples + + # store bootstrapped parameters in new journal + journal_new.add_accepted_parameters(copy.deepcopy(bootstrapped_parameters)) + journal_new.add_weights(np.ones((n_samples, 1))) + journal_new.add_ESS_estimate(np.ones((n_samples, 1))) + + # the next piece of code build the list to be passed to add_user_parameter in order to build the dictionary; + # this mimics the behavior of what is done in the InferenceMethod's using the lines: + # `self.accepted_parameters_manager.update_broadcast(self.backend, accepted_parameters=accepted_parameters)` + # `names_and_parameters = self._get_names_and_parameters()` + + names_par_dicts = self.get_parameters() + par_names = list(names_par_dicts.keys()) + new_names_par_list = [] + start_index = 0 + for name in par_names: + parameter_size = len(names_par_dicts[name][0]) # the size of that parameter + name_par_tuple = ( + name, [bootstrapped_parameters[i][start_index:start_index + parameter_size] for i in range(n_samples)]) + new_names_par_list.append(name_par_tuple) + start_index += parameter_size + journal_new.add_user_parameters(new_names_par_list) + journal_new.number_of_simulations.append(0) + + if path_to_save_journal is not None: # save journal + journal_new.save(path_to_save_journal) + + return journal_new + + +class GenerateFromJournal(GraphTools): + """Helper class to generate simulations from a model starting from the parameter values stored in a Journal file. + + Parameters + ---------- + root_models: list + A list of the Probabilistic models corresponding to the observed datasets + backend: abcpy.backends.Backend + Backend object defining the backend to be used. + seed: integer, optional + Optional initial seed for the random number generator. The default value is generated randomly. + discard_too_large_values: boolean + If set to True, the simulation is discarded (and repeated) if at least one element of it is too large + to fit in float32, which therefore may be converted to infinite value in numpy. Defaults to False. + + Examples + -------- + Simplest possible usage is: + + >>> generate_from_journal = GenerateFromJournal([model], backend=backend) + >>> parameters, simulations, normalized_weights = generate_from_journal.generate(journal) + + which takes the parameter values stored in journal and generated simulations from them. 
Notice how the method + returns (in this order) the parameter values used for the simulations, the simulations themselves and the + posterior weights associated to the parameters. All of these three objects are numpy arrays. + + """ + + def __init__(self, root_models, backend, seed=None, discard_too_large_values=False): + self.model = root_models + self.backend = backend + self.rng = np.random.RandomState(seed) + self.discard_too_large_values = discard_too_large_values + # An object managing the bds objects + self.accepted_parameters_manager = AcceptedParametersManager(self.model) + + def generate(self, journal, n_samples_per_param=1, iteration=None): + """ + Method to generate simulations using parameter values stored in the provided Journal. + + Parameters + ---------- + journal: abcpy.output.Journal + the Journal containing the parameter values from which to generate simulations from the model. + n_samples_per_param: integer, optional + Number of simulations for each parameter value. Defaults to 1. + iteration: integer, optional + specifies the iteration from which the parameter samples in the Journal are taken to generate simulations. + If None (default), it uses the last iteration. + + Returns + ------- + tuple + A tuple of numpy ndarray's containing the parameter values (first element, with shape n_samples x d_theta), + the generated + simulations (second element, with shape n_samples x n_samples_per_param x d_x, where d_x is the dimension of + each simulation) and the normalized weights attributed to each parameter value + (third element, with shape n_samples). + + Examples + -------- + Simplest possible usage is: + + >>> generate_from_journal = GenerateFromJournal([model], backend=backend) + >>> parameters, simulations, normalized_weights = generate_from_journal.generate(journal) + + which takes the parameter values stored in journal and generated simulations from them. Notice how the method + returns (in this order) the parameter values used for the simulations, the simulations themselves and the + posterior weights associated to the parameters. All of these three objects are numpy arrays. 
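As a follow-up sketch of how the three returned arrays could be combined for a simple posterior predictive summary (only NumPy is used; `generate_from_journal` and `journal` are assumed to exist as in the example above, and the shapes follow the description of the return values):

    import numpy as np

    # parameters: (n_samples, d_theta), simulations: (n_samples, n_samples_per_param, d_x),
    # normalized_weights: (n_samples,)
    parameters, simulations, normalized_weights = generate_from_journal.generate(journal, n_samples_per_param=10)

    # posterior predictive mean, weighting each parameter value by its posterior weight
    predictive_mean = np.average(simulations.mean(axis=1), axis=0, weights=normalized_weights)

    # central 95% predictive band, ignoring the weights for simplicity
    lower, upper = np.percentile(simulations.reshape(-1, simulations.shape[-1]), [2.5, 97.5], axis=0)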
+ + """ + # check whether the model corresponds to the one for which the journal was generated + if journal.configuration["type_model"] != [type(model).__name__ for model in self.model]: + raise RuntimeError("You are not using the same model as the one with which the journal was generated.") + + self.n_samples_per_param = n_samples_per_param + + accepted_parameters = journal.get_accepted_parameters(iteration) + accepted_weights = journal.get_weights(iteration) + normalized_weights = accepted_weights.reshape(-1) / np.sum(accepted_weights) + n_samples = len(normalized_weights) + + self.accepted_parameters_manager.broadcast(self.backend, [None]) + # Broadcast Accepted parameters + self.accepted_parameters_manager.update_broadcast(self.backend, accepted_parameters=accepted_parameters) + + seed_arr = self.rng.randint(0, np.iinfo(np.uint32).max, size=n_samples, dtype=np.uint32) + # no need to check if the seeds are the same here as they are assigned to different parameter values + rng_arr = np.array([np.random.RandomState(seed) for seed in seed_arr]) + index_arr = np.arange(0, n_samples, 1) + data_arr = [] + for i in range(len(rng_arr)): + data_arr.append([rng_arr[i], index_arr[i]]) + data_pds = self.backend.parallelize(data_arr) + + simulations_pds = self.backend.map(self._sample_parameter, data_pds) + simulations = self.backend.collect(simulations_pds) + + parameters = np.array(accepted_parameters) + simulations = np.array(simulations) + + parameters = parameters.reshape((parameters.shape[0], parameters.shape[1])) + simulations = simulations.reshape((simulations.shape[0], simulations.shape[2], simulations.shape[3],)) + + return parameters, simulations, normalized_weights + + def _sample_parameter(self, data, npc=None): + """ + Simulates from a single model parameter. + + Parameters + ---------- + data: list + A list containing a random numpy state and a parameter index, e.g. [rng, index] + + Returns + ------- + numpy.ndarray + The simulated dataset. + """ + + if isinstance(data, np.ndarray): + data = data.tolist() + rng = data[0] + index = data[1] + + parameter = self.accepted_parameters_manager.accepted_parameters_bds.value()[index] + ok_flag = False + + while not ok_flag: + self.set_parameters(parameter) + y_sim = self.simulate(n_samples_per_param=self.n_samples_per_param, rng=rng, npc=npc) + # if there are no potential infinities there (or if we do not check for those). + # For instance, Lorenz model may give too large values sometimes (quite rarely). + if self.discard_too_large_values and np.sum(np.isinf(np.array(y_sim).astype("float32"))) > 0: + self.logger.warning("y_sim contained too large values for float32; simulating again.") + else: + ok_flag = True + + return y_sim diff --git a/abcpy/statistics.py b/abcpy/statistics.py index fdde773e..55999c89 100644 --- a/abcpy/statistics.py +++ b/abcpy/statistics.py @@ -10,7 +10,7 @@ else: has_torch = True from abcpy.NN_utilities.utilities import load_net, save_net - from abcpy.NN_utilities.networks import createDefaultNN, ScalerAndNet + from abcpy.NN_utilities.networks import createDefaultNN, ScalerAndNet, DiscardLastOutputNet class Statistics(metaclass=ABCMeta): @@ -19,18 +19,26 @@ class Statistics(metaclass=ABCMeta): The base class also implements a polynomial expansion with cross-product terms that can be used to get desired polynomial expansion of the calculated statistics. - """ - @abstractmethod - def __init__(self, degree=1, cross=False, previous_statistics=None): - """Constructor that must be overwritten by the sub-class. 
- - The constructor of a sub-class must accept arguments for the polynomial - expansion after extraction of the summary statistics, one has to define - the degree of polynomial expansion and cross, indicating whether cross-prodcut - terms are included. - + def __init__(self, degree=1, cross=False, reference_simulations=None, previous_statistics=None): + """ + Initialization of the parent class. All sub-classes must call this at the end of their __init__, + as it takes care of initializing the correct attributes to self for the other methods to work. + + `degree` and `cross` specify the polynomial expansion you want to apply to the statistics. + + If `reference_simulations` are provided, the standard deviation of the different statistics on the set + of reference simulations is computed and stored; these will then be used to rescale + the statistics for each new simulation or observation. + If no set of reference simulations are provided, then this is not done. + + `previous_statistics` allows different Statistics object to be pipelined. Specifically, if the final + statistic to be used is determined by the + composition of two Statistics, you can pass the first here; then, whenever the final statistic is needed, it + is sufficient to call the `statistics` method of the second one, and that will automatically apply both + transformations. + Parameters ---------- degree: integer, optional @@ -38,14 +46,27 @@ def __init__(self, degree=1, cross=False, previous_statistics=None): cross: boolean, optional Defines whether to include the cross-product terms. The default value is True, meaning the cross product term is included. - previous_statistics: Statistics class, optional + reference_simulations: array, optional + A numpy array with shape (n_samples, output_size) containing a set of reference simulations. If provided, + statistics are computed at initialization for all reference simulations, and the standard deviation of the + different statistics is extracted. The standard deviation is then used to standardize the summary + statistics each time they are compute on a new observation or simulation. Defaults to None, in which case + standardization is not applied. + previous_statistics: abcpy.statistics.Statistics, optional It allows pipelining of Statistics. Specifically, if the final statistic to be used is determined by the composition of two Statistics, you can pass the first here; then, whenever the final statistic is needed, it is sufficient to call the `statistics` method of the second one, and that will automatically apply both transformations. """ - raise NotImplementedError + self.degree = degree + self.cross = cross + self.previous_statistics = previous_statistics + if reference_simulations is not None: + training_statistics = self.statistics( + [reference_simulations[i] for i in range(reference_simulations.shape[0])]) + self.std_statistics = np.std(training_statistics, axis=0) + # we store this and use it to rescale the statistics @abstractmethod def statistics(self, data: object) -> object: @@ -53,6 +74,24 @@ def statistics(self, data: object) -> object: data set data. It is assumed that data is a list of n same type elements(eg., The data can be a list containing n timeseries, n graphs or n np.ndarray). + All statistics implementation should follow this structure: + + >>> # need to call this first which takes care of calling the + >>> # previous statistics if that is defined and of properly + >>> # formatting data + >>> data = self._preprocess(data) + >>> + >>> # !!! 
here do all the processing on the statistics (data) !!! + >>> + >>> # Expand the data with polynomial expansion + >>> result = self._polynomial_expansion(data) + >>> + >>> # now call the _rescale function which automatically rescales + >>> # the different statistics using the standard + >>> # deviation of them on the training set provided at initialization. + >>> result = self._rescale(result) + + Parameters ---------- data: python list @@ -68,7 +107,9 @@ def statistics(self, data: object) -> object: def _polynomial_expansion(self, summary_statistics): """Helper function that does the polynomial expansion and includes cross-product - terms of summary_statistics, already calculated summary statistics. + terms of summary_statistics, already calculated summary statistics. It is tipically called in the `statistics` + method of a `Statistics` class, after the statistics have been computed from data but before the statistics + are (optionally) rescaled. Parameters ---------- @@ -93,15 +134,78 @@ def _polynomial_expansion(self, summary_statistics): result = np.column_stack((result, np.power(summary_statistics, ind))) # Include the cross-product term - if self.cross == True and summary_statistics.shape[1] > 1: + if self.cross and summary_statistics.shape[1] > 1: # Convert to a matrix for ind1 in range(0, summary_statistics.shape[1]): for ind2 in range(ind1 + 1, summary_statistics.shape[1]): result = np.column_stack((result, summary_statistics[:, ind1] * summary_statistics[:, ind2])) return result - def _check_and_transform_input(self, data): + def _rescale(self, result): + """Rescales the final summary statistics using the standard deviations computed at initialization on the set of + reference simulations. If that was not done, no rescaling is done. + + Parameters + ---------- + result: numpy.ndarray + Final summary statistics (after polynomial expansion) + + Returns + ------- + numpy.ndarray + Rescaled summary statistics, with the same shape as the input. + """ + if hasattr(self, "std_statistics"): + if result.shape[-1] != self.std_statistics.shape[-1]: + raise RuntimeError("The size of the statistics is not the same as the stored standard deviations for " + "rescaling! Please check that you initialized the statistics with the correct set " + "of reference samples.") + + result = result / self.std_statistics + + return result + + def _preprocess(self, data): + """Utility which needs to be called at the beginning of the `statistics` method for all `Statistics` classes. + It takes care of calling the `previous_statistics` if that is available (pipelining) + and of correctly formatting the data. + + Parameters + ---------- + data: python list + Contains n data sets with length p. + + Returns + ------- + numpy.ndarray + Formatted statistics after pipelining. """ + + # pipeline: first call the previous statistics: + if self.previous_statistics is not None: + data = self.previous_statistics.statistics(data) + # the first of the statistics need to take list as input, in order to match the API. Then actually the + # transformations work on np.arrays. In fact the first statistic transforms the list to array. Therefore, the + # following code needs to be called only if the self statistic is the first, i.e. it does not have a + # previous_statistic element. 
+ else: + data = self._check_and_transform_input(data) + + return data + + def _check_and_transform_input(self, data): + """ Formats the input in the correct way for computing summary statistics; specifically takes as input a + list and returns a numpy.ndarray. + + Parameters + ---------- + data: python list + Contains n data sets with length p. + + Returns + ------- + numpy.ndarray + Formatted statistics after pipelining. """ if isinstance(data, list): if np.array(data).shape == (len(data),): @@ -124,26 +228,6 @@ class Identity(Statistics): expansion term and cross*nchoosek(p,2) many cross-product terms are calculated. """ - def __init__(self, degree=1, cross=False, previous_statistics=None): - """ - - Parameters - ---------- - degree : integer, optional - Of polynomial expansion. The default value is 2 meaning second order polynomial expansion. - cross : boolean, optional - Defines whether to include the cross-product terms. The default value is True, meaning the cross product term - is included. - previous_statistics : Statistics class, optional - It allows pipelining of Statistics. Specifically, if the final statistic to be used is determined by the - composition of two Statistics, you can pass the first here; then, whenever the final statistic is needed, it - is sufficient to call the `statistics` method of the second one, and that will automatically apply both - transformations. - """ - self.degree = degree - self.cross = cross - self.previous_statistics = previous_statistics - def statistics(self, data): """ Parameters @@ -157,19 +241,17 @@ def statistics(self, data): (p+degree*p+cross*nchoosek(p,2)) statistics are calculated. """ - # pipeline: first call the previous statistics: - if self.previous_statistics is not None: - data = self.previous_statistics.statistics(data) - # the first of the statistics need to take list as input, in order to match the API. Then actually the - # transformations work on np.arrays. In fact the first statistic transforms the list to array. Therefore, the - # following code needs to be called only if the self statistic is the first, i.e. it does not have a - # previous_statistic element. - else: - data = self._check_and_transform_input(data) + # need to call this first which takes care of calling the previous statistics if that is defined and of properly + # formatting data + data = self._preprocess(data) # Expand the data with polynomial expansion result = self._polynomial_expansion(data) + # now call the _rescale function which automatically rescales the different statistics using the standard + # deviation of them on the training set provided at initialization. + result = self._rescale(result) + return result @@ -178,8 +260,21 @@ class LinearTransformation(Statistics): an additional polynomial expansion step. """ - def __init__(self, coefficients, degree=1, cross=False, previous_statistics=None): + def __init__(self, coefficients, degree=1, cross=False, reference_simulations=None, previous_statistics=None): """ + `degree` and `cross` specify the polynomial expansion you want to apply to the statistics. + + If `reference_simulations` are provided, the standard deviation of the different statistics on the set + of reference simulations is computed and stored; these will then be used to rescale + the statistics for each new simulation or observation. + If no set of reference simulations are provided, then this is not done. + + `previous_statistics` allows different Statistics object to be pipelined. 
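A hedged usage sketch of the new `reference_simulations` standardization with the `Identity` statistics (the `ref_sims` array and the observation are illustrative):

    import numpy as np
    from abcpy.statistics import Identity

    ref_sims = np.random.RandomState(0).normal(size=(100, 3))  # illustrative reference simulations

    # second-order polynomial expansion with cross terms, standardized by the statistics'
    # standard deviation computed on ref_sims at initialization
    stat_calc = Identity(degree=2, cross=True, reference_simulations=ref_sims)

    x = [np.array([0.5, -1.0, 2.0])]     # one observation, as a list of arrays
    summaries = stat_calc.statistics(x)  # expected shape (1, 3 + 3 + 3) = (1, 9)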
Specifically, if the final + statistic to be used is determined by the + composition of two Statistics, you can pass the first here; then, whenever the final statistic is needed, it + is sufficient to call the `statistics` method of the second one, and that will automatically apply both + transformations. + Parameters ---------- coefficients: coefficients is a matrix with size d x p, where d is the dimension of the summary statistic that @@ -190,16 +285,21 @@ def __init__(self, coefficients, degree=1, cross=False, previous_statistics=None cross : boolean, optional Defines whether to include the cross-product terms. The default value is True, meaning the cross product term is included. - previous_statistics : Statistics class, optional + reference_simulations: array, optional + A numpy array with shape (n_samples, output_size) containing a set of reference simulations. If provided, + statistics are computed at initialization for all reference simulations, and the standard deviation of the + different statistics is extracted. The standard deviation is then used to standardize the summary + statistics each time they are compute on a new observation or simulation. Defaults to None, in which case + standardization is not applied. + previous_statistics : abcpy.statistics.Statistics, optional It allows pipelining of Statistics. Specifically, if the final statistic to be used is determined by the composition of two Statistics, you can pass the first here; then, whenever the final statistic is needed, it is sufficient to call the `statistics` method of the second one, and that will automatically apply both transformations. """ self.coefficients = coefficients - self.degree = degree - self.cross = cross - self.previous_statistics = previous_statistics + + super(LinearTransformation, self).__init__(degree, cross, reference_simulations, previous_statistics) def statistics(self, data): """ @@ -215,23 +315,21 @@ def statistics(self, data): calculated. """ - # pipeline: first call the previous statistics: - if self.previous_statistics is not None: - data = self.previous_statistics.statistics(data) - # the first of the statistics need to take list as input, in order to match the API. Then actually the - # transformations work on np.arrays. In fact the first statistic transforms the list to array. Therefore, the - # following code needs to be called only if the self statistic is the first, i.e. it does not have a - # previous_statistic element. - else: - data = self._check_and_transform_input(data) + # need to call this first which takes care of calling the previous statistics if that is defined and of properly + # formatting data + data = self._preprocess(data) # Apply now the linear transformation if not data.shape[1] == self.coefficients.shape[0]: raise ValueError('Mismatch in dimension of summary statistics and coefficients') - result = np.dot(data, self.coefficients) + data = np.dot(data, self.coefficients) # Expand the data with polynomial expansion - result = self._polynomial_expansion(result) + result = self._polynomial_expansion(data) + + # now call the _rescale function which automatically rescales the different statistics using the standard + # deviation of them on the training set provided at initialization. + result = self._rescale(result) return result @@ -244,14 +342,39 @@ class NeuralEmbedding(Statistics): Pytorch is required for this part to work. """ - def __init__(self, net, previous_statistics=None): # are these default values OK? 
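    # Illustration of pipelining: a fixed linear projection applied after Identity via
    # previous_statistics (the coefficient values below are made up).
    import numpy as np
    from abcpy.statistics import Identity, LinearTransformation

    coefficients = np.random.RandomState(1).normal(size=(10, 2))  # project 10-dim data to 2 summaries
    stat_identity = Identity(degree=1, cross=False)
    stat_linear = LinearTransformation(coefficients, degree=1, cross=False,
                                       previous_statistics=stat_identity)

    x = [np.random.RandomState(2).normal(size=10)]
    summaries = stat_linear.statistics(x)  # Identity is applied first, then the projection; shape (1, 2)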
+ def __init__(self, net, degree=1, cross=False, reference_simulations=None, previous_statistics=None): + """ + `degree` and `cross` specify the polynomial expansion you want to apply to the statistics. + + If `reference_simulations` are provided, the standard deviation of the different statistics on the set + of reference simulations is computed and stored; these will then be used to rescale + the statistics for each new simulation or observation. + If no set of reference simulations are provided, then this is not done. + + `previous_statistics` allows different Statistics object to be pipelined. Specifically, if the final + statistic to be used is determined by the + composition of two Statistics, you can pass the first here; then, whenever the final statistic is needed, it + is sufficient to call the `statistics` method of the second one, and that will automatically apply both + transformations. + Parameters ---------- net : torch.nn object the embedding neural network. The input size of the neural network must coincide with the size of each of the datapoints. - previous_statistics : Statistics class, optional + degree: integer, optional + Of polynomial expansion. The default value is 2 meaning second order polynomial expansion. + cross: boolean, optional + Defines whether to include the cross-product terms. The default value is True, meaning the cross product term + is included. + reference_simulations: array, optional + A numpy array with shape (n_samples, output_size) containing a set of reference simulations. If provided, + statistics are computed at initialization for all reference simulations, and the standard deviation of the + different statistics is extracted. The standard deviation is then used to standardize the summary + statistics each time they are compute on a new observation or simulation. Defaults to None, in which case + standardization is not applied. + previous_statistics: abcpy.statistics.Statistics, optional It allows pipelining of Statistics. Specifically, if the final statistic to be used is determined by the composition of two Statistics, you can pass the first here; then, whenever the final statistic is needed, it is sufficient to call the `statistics` method of the second one, and that will automatically apply both @@ -263,11 +386,14 @@ def __init__(self, net, previous_statistics=None): # are these default values O "neural networks. Please install it. ".format(self.__class__.__name__)) self.net = net - self.previous_statistics = previous_statistics + + # init of super class + super(NeuralEmbedding, self).__init__(degree, cross, reference_simulations, previous_statistics) @classmethod def fromFile(cls, path_to_net_state_dict, network_class=None, path_to_scaler=None, input_size=None, - output_size=None, hidden_sizes=None, previous_statistics=None): + output_size=None, hidden_sizes=None, degree=1, cross=False, reference_simulations=None, + previous_statistics=None): """If the neural network state_dict was saved to the disk, this method can be used to instantiate a NeuralEmbedding object with that neural network. @@ -311,7 +437,18 @@ def fromFile(cls, path_to_net_state_dict, network_class=None, path_to_scaler=Non [5,7,5] denotes 3 hidden layers with correspondingly 5,7,5 neurons). In case this parameter is not provided, the hidden sizes are determined from the input and output sizes as determined in abcpy.NN_utilities.networks.DefaultNN. Note that this must not be provided together with `network_class`. 
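For completeness, a hedged sketch of loading a saved embedding network with the extended `fromFile` signature, letting ABCpy rebuild the default architecture from the input/output sizes; the file names and sizes are placeholders:

    from abcpy.statistics import NeuralEmbedding

    stat_calc = NeuralEmbedding.fromFile("net.pth", input_size=10, output_size=2,
                                         path_to_scaler="scaler.pkl", degree=1, cross=False)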
- previous_statistics : Statistics class, optional + degree: integer, optional + Of polynomial expansion. The default value is 2 meaning second order polynomial expansion. + cross: boolean, optional + Defines whether to include the cross-product terms. The default value is True, meaning the cross product term + is included. + reference_simulations: array, optional + A numpy array with shape (n_samples, output_size) containing a set of reference simulations. If provided, + statistics are computed at initialization for all reference simulations, and the standard deviation of the + different statistics is extracted. The standard deviation is then used to standardize the summary + statistics each time they are compute on a new observation or simulation. Defaults to None, in which case + standardization is not applied. + previous_statistics : abcpy.statistics.Statistics, optional It allows pipelining of Statistics. Specifically, if the final statistic to be used is determined by the composition of two Statistics, you can pass the first here; then, whenever the final statistic is needed, it is sufficient to call the `statistics` method of the second one, and that will automatically apply both @@ -335,11 +472,17 @@ def fromFile(cls, path_to_net_state_dict, network_class=None, path_to_scaler=Non raise RuntimeError("You passed hidden_sizes as an argument, but that may be passed only if you are passing " "input_size and input_size as well, and you are not passing network_class.") - if network_class is not None: # user explicitly passed the NN class + if network_class is None: + network_class = createDefaultNN(input_size=input_size, output_size=output_size, + hidden_sizes=hidden_sizes) + + # the stored state_dict could be either a simple network or a network wrapped with DiscardLastOutput (in case + # the statistics was learned with the Exponential Family method); while instead the network class refers only to + # the actual net. Therefore need to try and load in both ways + try: net = load_net(path_to_net_state_dict, network_class) - else: # the user passed the input_size, output_size and (maybe) the hidden_sizes - net = load_net(path_to_net_state_dict, createDefaultNN(input_size=input_size, output_size=output_size, - hidden_sizes=hidden_sizes)) + except RuntimeError: + net = load_net(path_to_net_state_dict, DiscardLastOutputNet, network_class()) if path_to_scaler is not None: f = open(path_to_scaler, 'rb') @@ -347,7 +490,8 @@ def fromFile(cls, path_to_net_state_dict, network_class=None, path_to_scaler=Non f.close() net = ScalerAndNet(net, scaler) - statistic_object = cls(net, previous_statistics=previous_statistics) + statistic_object = cls(net, degree=degree, cross=cross, reference_simulations=reference_simulations, + previous_statistics=previous_statistics) return statistic_object @@ -384,21 +528,16 @@ def statistics(self, data): ---------- data: python list Contains n data sets with length p. + Returns ------- numpy.ndarray the statistics computed by applying the neural network. """ - # pipeline: first call the previous statistics: - if self.previous_statistics is not None: - data = self.previous_statistics.statistics(data) - # the first of the statistics need to take list as input, in order to match the API. Then actually the - # transformations work on np.arrays. In fact the first statistic transforms the list to array. Therefore, the - # following code needs to be called only if the self statistic is the first, i.e. it does not have a - # previous_statistic element. 
- else: - data = self._check_and_transform_input(data) + # need to call this first which takes care of calling the previous statistics if that is defined and of properly + # formatting data + data = self._preprocess(data) data = torch.from_numpy(data.astype("float32")) @@ -407,6 +546,18 @@ def statistics(self, data): data = data.cuda() # simply apply the network transformation. - result = self.net(data).cpu().detach().numpy() + try: + data = self.net(data).cpu().detach().numpy() + except (IndexError, RuntimeError, ValueError) as e: + raise RuntimeError("There was an error in passing the data through the network, likely due to the data not " + "being of the right size.") + data = np.array(data) + + # Expand the data with polynomial expansion + result = self._polynomial_expansion(data) - return np.array(result) + # now call the _rescale function which automatically rescales the different statistics using the standard + # deviation of them on the training set provided at initialization. + result = self._rescale(result) + + return result diff --git a/abcpy/statisticslearning.py b/abcpy/statisticslearning.py index f667253a..31951b1e 100644 --- a/abcpy/statisticslearning.py +++ b/abcpy/statisticslearning.py @@ -1,14 +1,16 @@ import logging from abc import ABCMeta, abstractmethod -import numpy as np +import matplotlib.pyplot as plt from sklearn import linear_model from sklearn.preprocessing import MinMaxScaler +from tqdm import tqdm from abcpy.acceptedparametersmanager import * from abcpy.graphtools import GraphTools # import dataset and networks definition: from abcpy.statistics import LinearTransformation +from abcpy.transformers import BoundedVarScaler # Different torch components try: @@ -17,8 +19,13 @@ has_torch = False else: has_torch = True - from abcpy.NN_utilities.networks import createDefaultNN, ScalerAndNet + from abcpy.NN_utilities.networks import createDefaultNN, ScalerAndNet, createDefaultNNWithDerivatives, \ + DiscardLastOutputNet from abcpy.statistics import NeuralEmbedding + from torch.optim import Adam, lr_scheduler + import torch.autograd as autograd + from abcpy.NN_utilities.utilities import jacobian_second_order, set_requires_grad + from abcpy.NN_utilities.losses import Fisher_divergence_loss_with_c_x from abcpy.NN_utilities.algorithms import FP_nn_training, triplet_training, contrastive_training from abcpy.NN_utilities.utilities import compute_similarity_matrix @@ -47,7 +54,7 @@ def __init__(self, model, statistics_calc, backend, n_samples=1000, n_samples_va ---------- model: abcpy.models.Model Model object that conforms to the Model class. - statistics_cal: abcpy.statistics.Statistics + statistics_calc: abcpy.statistics.Statistics Statistics object that conforms to the Statistics class. backend: abcpy.backends.Backend Backend object that conforms to the Backend class. @@ -69,7 +76,8 @@ def __init__(self, model, statistics_calc, backend, n_samples=1000, n_samples_va simulations: array, optional A numpy array with shape (n_samples, output_size) that is used, together with `parameters` to fit the summary selection learning algorithm. It has to be provided together with `parameters`, in which case no - other simulations are performed to generate the training data. Default value is None. + other simulations are performed to generate the training data. These are transformed by the + `statistics_calc` statistics before the learning step is done. Default value is None. 
parameters_val: array, optional A numpy array with shape (n_samples_val, n_parameters) that is used, together with `simulations_val` as a validation set in the summary selection learning algorithm. It has to be provided together with @@ -78,7 +86,8 @@ def __init__(self, model, statistics_calc, backend, n_samples=1000, n_samples_va simulations_val: array, optional A numpy array with shape (n_samples_val, output_size) that is used, together with `parameters_val` as a validation set in the summary selection learning algorithm. It has to be provided together with - `parameters_val`, in which case no other simulations are performed to generate the validation set. Default + `parameters_val`, in which case no other simulations are performed to generate the validation set. + These are transformed by the `statistics_calc` statistics before the learning step is done. Default value is None. seed: integer, optional Optional initial seed for the random number generator. The default value is generated randomly. @@ -212,6 +221,48 @@ def _sample_parameter_statistics(self, rng=np.random.RandomState()): return parameter, statistics +class StatisticsLearningWithLosses(StatisticsLearning, metaclass=ABCMeta): + """This abstract base class subclasses the above and includes a utility method to plot losses. + """ + + def plot_losses(self, which_losses="both"): + """ + Plot losses vs training epochs after the NN have been trained. + + Parameters + ---------- + which_losses: string, optional + Specifies which set of losses to display (between training and test loss). + Can be 'train', 'test' or 'both'. Notice that the test loss could be unavailable (in case no test set was + used for training), in which case the test loss is not shown even if requested. Defaults to 'both'. + + Returns + ------- + + """ + + if which_losses not in ["both", "train", "test"]: + raise NotImplementedError("'which_losses' should be 'both', 'train' or 'test'") + + fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(6, 4)) + if which_losses in ["both", "train"]: + ax.plot(np.arange(len(self.train_losses)) + 1, self.train_losses, label="Train loss", color="C0") + if which_losses in ["both", "test"]: + if self.test_losses is not None: + if len(self.test_losses) != len(self.train_losses): + raise RuntimeError("Length of train and test losses list should be the same.") + ax.plot(np.arange(len(self.train_losses)) + 1, self.test_losses, label="Test loss", color="C1") + else: + self.logger.warning("You requested to plot test losses, but these are unavailable (probably due to no " + "test set been used during NN training.") + + ax.set_xlabel("Training epoch") + ax.set_ylabel("Loss") + ax.legend() + + return fig, ax + + class Semiautomatic(StatisticsLearning, GraphTools): """This class implements the semi automatic summary statistics learning technique described in Fearnhead and Prangle [1]. @@ -227,8 +278,8 @@ def __init__(self, model, statistics_calc, backend, n_samples=1000, n_samples_pe ---------- model: abcpy.models.Model Model object that conforms to the Model class. - statistics_cal: abcpy.statistics.Statistics - Statistics object that conforms to the Statistics class. + statistics_calc: abcpy.statistics.Statistics + Statistics object that conforms to the Statistics class, applied before learning the transformation. backend: abcpy.backends.Backend Backend object that conforms to the Backend class. 
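To illustrate the loss-plotting utility above, a minimal hedged usage sketch (assuming `statistics_learning` is any already-trained NN-based statistics learning object from this module):

    import matplotlib.pyplot as plt

    # after training, e.g. statistics_learning = SemiautomaticNN([model], statistics_calc, backend, ...)
    fig, ax = statistics_learning.plot_losses(which_losses="both")
    fig.savefig("losses.png")
    plt.close(fig)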
n_samples: int, optional @@ -245,7 +296,8 @@ def __init__(self, model, statistics_calc, backend, n_samples=1000, n_samples_pe simulations: array, optional A numpy array with shape (n_samples, output_size) that is used, together with `parameters` to fit the summary selection learning algorithm. It has to be provided together with `parameters`, in which case no - other simulations are performed. Default value is None. + other simulations are performed. These are transformed by the + `statistics_calc` statistics before the learning step is done. Default value is None. seed: integer, optional Optional initial seed for the random number generator. The default value is generated randomly. """ @@ -276,11 +328,14 @@ def get_statistics(self): return LinearTransformation(np.transpose(self.coefficients_learnt), previous_statistics=self.statistics_calc) -class StatisticsLearningNN(StatisticsLearning, GraphTools): +class StatisticsLearningNN(StatisticsLearningWithLosses, GraphTools): """This is the base class for all the statistics learning techniques involving neural networks. In most cases, you should not instantiate this directly. The actual classes instantiate this with the right arguments. - In order to use this technique, Pytorch is required to handle the neural networks. + In order to use this technique, Pytorch is required to handle the neural networks. However, the user does not need + to be fluent in Pytorch as some default neural networks are instantiated if the user does not provide them + (experienced users can of course provide the neural networks they would like to use). GPUs can be used to + train the neural networks if required. """ def __init__(self, model, statistics_calc, backend, training_routine, distance_learning, embedding_net=None, @@ -292,8 +347,8 @@ def __init__(self, model, statistics_calc, backend, training_routine, distance_l ---------- model: abcpy.models.Model Model object that conforms to the Model class. - statistics_cal: abcpy.statistics.Statistics - Statistics object that conforms to the Statistics class. + statistics_calc: abcpy.statistics.Statistics + Statistics object that conforms to the Statistics class, applied before learning the transformation. backend: abcpy.backends.Backend Backend object that conforms to the Backend class. training_routine: function @@ -304,11 +359,17 @@ def __init__(self, model, statistics_calc, backend, training_routine, distance_l this has to be True if the statistics learning technique is based on distance learning, in which case the __init__ computes the similarity matrix. embedding_net: torch.nn object or list - it can be a torch.nn object with input size corresponding to size of model output, alternatively, a list + it can be a torch.nn object with input size corresponding to size of model output + (after being transformed by `statistics_calc`), alternatively, a list with integer numbers denoting the width of the hidden layers, from which a fully connected network with - that structure is created, having the input and output size corresponding to size of model output and - number of parameters. In case this is None, the depth of the network and the width of the hidden layers is - determined from the input and output size as specified in abcpy.NN_utilities.networks.DefaultNN. + that structure is created, having the input and output size corresponding to size of model output + (after being transformed by `statistics_calc`) and + number of parameters. 
In case this is None, a fully connected neural network with three hidden layers is + used; the width of the hidden layers is given by + ``[int(input_size * 1.5), int(input_size * 0.75 + output_size * 3), int(output_size * 5)]``, + where `input_size` is the size of the data after being transformed by `statistics_calc`, while `output_size` + is the number of parameters in the model. For further details check + :func:`abcpy.NN_utilities.networks.createDefaultNN` n_samples: int, optional The number of (parameter, simulated data) tuple to be generated to learn the summary statistics in pilot step. The default value is 1000. @@ -327,12 +388,14 @@ def __init__(self, model, statistics_calc, backend, training_routine, distance_l simulations: array, optional A numpy array with shape (n_samples, output_size) that is used, together with `parameters` to fit the summary selection learning algorithm. It has to be provided together with `parameters`, in which case no - other simulations are performed to generate the training data. Default value is None. + other simulations are performed to generate the training data. These are transformed by the + `statistics_calc` statistics before the learning step is done. Default value is None. parameters_val: array, optional A numpy array with shape (n_samples_val, n_parameters) that is used, together with `simulations_val` as a validation set in the summary selection learning algorithm. It has to be provided together with - `simulations_val`, in which case no other simulations are performed to generate the validation set. Default - value is None. + `simulations_val`, in which case no other simulations are performed to generate the validation set. + These are transformed by the + `statistics_calc` statistics before the learning step is done. Default value is None. simulations_val: array, optional A numpy array with shape (n_samples_val, output_size) that is used, together with `parameters_val` as a validation set in the summary selection learning algorithm. It has to be provided together with @@ -346,9 +409,8 @@ def __init__(self, model, statistics_calc, backend, training_routine, distance_l If True, a scaler of the class `sklearn.preprocessing.MinMaxScaler` will be fit on the training data before neural network training, and training and validation data simulations data will be rescaled. When calling the `get_statistics` method, - a network of the class `ScalerAndNet` will be used in instantiating the statistics; this network is a - wrapper of a neural network and a scaler and transforms the data with the scaler before applying the neural - network. + the network will be wrapped by :class:`abcpy.NN_utilities.networks.ScalerAndNet`; this automatically + takes care of transforming the data with the scaler before applying the neural network. It is highly recommended to use a scaler, as neural networks are sensitive to the range of input data. A case in which you may not want to use a scaler is timeseries data, as the scaler works independently on each feature of the data. 
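To make the pieces above concrete, a hedged end-to-end sketch of learning a neural summary statistic with the default network and a dummy backend; the toy Normal model and its Uniform prior follow the usual ABCpy pattern but are placeholders here:

    from abcpy.backends import BackendDummy
    from abcpy.continuousmodels import Uniform, Normal
    from abcpy.statistics import Identity
    from abcpy.statisticslearning import SemiautomaticNN

    backend = BackendDummy()
    mu = Uniform([[-5], [5]], name="mu")
    model = Normal([mu, 1.0])

    statistics_calc = Identity(degree=1, cross=False)
    statistics_learning = SemiautomaticNN([model], statistics_calc, backend,
                                          n_samples=1000, n_samples_val=100,
                                          n_samples_per_param=10, use_tqdm=False)

    learned_statistics = statistics_learning.get_statistics()  # a NeuralEmbedding object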
@@ -433,6 +495,8 @@ def __init__(self, model, statistics_calc, backend, training_routine, distance_l self.embedding_net = createDefaultNN(input_size=simulations.shape[1], output_size=target.shape[1], hidden_sizes=embedding_net)() self.logger.debug('We generate a default neural network') + else: + raise RuntimeError("'embedding_net' needs to be either a torch.nn.Module, or a list, or None.") if cuda: self.embedding_net.cuda() @@ -440,13 +504,13 @@ def __init__(self, model, statistics_calc, backend, training_routine, distance_l self.logger.debug('We now run the training routine') if distance_learning: - self.embedding_net = training_routine(simulations, similarity_set, embedding_net=self.embedding_net, - cuda=cuda, samples_val=simulations_val, use_tqdm=use_tqdm, - similarity_set_val=similarity_set_val, **training_routine_kwargs) + self.embedding_net, self.train_losses, self.test_losses = training_routine( + simulations, similarity_set, embedding_net=self.embedding_net, cuda=cuda, samples_val=simulations_val, + use_tqdm=use_tqdm, similarity_set_val=similarity_set_val, **training_routine_kwargs) else: - self.embedding_net = training_routine(simulations, target, embedding_net=self.embedding_net, - cuda=cuda, samples_val=simulations_val, target_val=target_val, - use_tqdm=use_tqdm, **training_routine_kwargs) + self.embedding_net, self.train_losses, self.test_losses = training_routine( + simulations, target, embedding_net=self.embedding_net, cuda=cuda, samples_val=simulations_val, + target_val=target_val, use_tqdm=use_tqdm, **training_routine_kwargs) self.logger.info("Finished learning the transformation.") @@ -454,11 +518,11 @@ def __init__(self, model, statistics_calc, backend, training_routine, distance_l def get_statistics(self): """ - Returns a NeuralEmbedding Statistics implementing the learned transformation. + Returns a :class:`abcpy.statistics.NeuralEmbedding` Statistics implementing the learned transformation. - If a scaler was used, the `net` attribute of the returned object is of the class `ScalerAndNet`, which is a - nn.Module object wrapping the scaler and the learned neural network and applies the scaler before the data is - fed through the neural network. + If a scaler was used, the `net` attribute of the returned object is of the class + :class:`abcpy.NN_utilities.networks.ScalerAndNet`, which is a nn.Module object wrapping the scaler and the + learned neural network and applies the scaler before the data is fed through the neural network. Returns ------- @@ -472,13 +536,16 @@ def get_statistics(self): return NeuralEmbedding(net=self.embedding_net, previous_statistics=self.statistics_calc) -# the following classes subclass the base class StatisticsLearningNN with different training routines +# the following three classes subclass the base class StatisticsLearningNN with different training routines class SemiautomaticNN(StatisticsLearningNN): """This class implements the semi automatic summary statistics learning technique as described in Jiang et al. 2017 [1]. - In order to use this technique, Pytorch is required to handle the neural networks. + In order to use this technique, Pytorch is required to handle the neural networks. However, the user does not need + to be fluent in Pytorch as some default neural networks are instantiated if the user does not provide them + (experienced users can of course provide the neural networks they would like to use). GPUs can be used to + train the neural networks if required. [1] Jiang, B., Wu, T.Y., Zheng, C. and Wong, W.H., 2017. 
Learning summary statistic for approximate Bayesian computation via deep neural network. Statistica Sinica, pp.1595-1618. @@ -495,17 +562,22 @@ def __init__(self, model, statistics_calc, backend, embedding_net=None, n_sample ---------- model: abcpy.models.Model Model object that conforms to the Model class. - statistics_cal: abcpy.statistics.Statistics - Statistics object that conforms to the Statistics class. + statistics_calc: abcpy.statistics.Statistics + Statistics object that conforms to the Statistics class, applied before learning the transformation. backend: abcpy.backends.Backend Backend object that conforms to the Backend class. embedding_net: torch.nn object or list - it can be a torch.nn object with input size corresponding to size of model output and output size - corresponding to the number of parameters or, alternatively, a list with integer numbers denoting the width - of the hidden layers, from which a fully connected network with that structure is created, having the input - and output size corresponding to size of model output and number of parameters. In case this is None, the - depth of the network and the width of the hidden layers is determined from the input and output size as - specified in abcpy.NN_utilities.networks.DefaultNN. + it can be a torch.nn object with input size corresponding to size of model output + (after being transformed by `statistics_calc`), alternatively, a list + with integer numbers denoting the width of the hidden layers, from which a fully connected network with + that structure is created, having the input and output size corresponding to size of model output + (after being transformed by `statistics_calc`) and + number of parameters. In case this is None, a fully connected neural network with three hidden layers is + used; the width of the hidden layers is given by + ``[int(input_size * 1.5), int(input_size * 0.75 + output_size * 3), int(output_size * 5)]``, + where `input_size` is the size of the data after being transformed by `statistics_calc`, while `output_size` + is the number of parameters in the model. For further details check + :func:`abcpy.NN_utilities.networks.createDefaultNN` n_samples: int, optional The number of (parameter, simulated data) tuple to be generated to learn the summary statistics in pilot step. The default value is 1000. @@ -524,7 +596,8 @@ def __init__(self, model, statistics_calc, backend, embedding_net=None, n_sample simulations: array, optional A numpy array with shape (n_samples, output_size) that is used, together with `parameters` to fit the summary selection learning algorithm. It has to be provided together with `parameters`, in which case no - other simulations are performed to generate the training data. Default value is None. + other simulations are performed to generate the training data. These are transformed by the + `statistics_calc` statistics before the learning step is done. Default value is None. parameters_val: array, optional A numpy array with shape (n_samples_val, n_parameters) that is used, together with `simulations_val` as a validation set in the summary selection learning algorithm. It has to be provided together with @@ -533,8 +606,9 @@ def __init__(self, model, statistics_calc, backend, embedding_net=None, n_sample simulations_val: array, optional A numpy array with shape (n_samples_val, output_size) that is used, together with `parameters_val` as a validation set in the summary selection learning algorithm. 
It has to be provided together with - `parameters_val`, in which case no other simulations are performed to generate the validation set. Default - value is None. + `parameters_val`, in which case no other simulations are performed to generate the validation set. + These are transformed by the `statistics_calc` statistics before the learning step is done. + Default value is None. early_stopping: boolean, optional If True, the validation set (which needs to be either provided through the arguments `parameters_val` and `simulations_val` or generated by setting `n_samples_val` to a value larger than 0) is used to early stop @@ -542,8 +616,8 @@ def __init__(self, model, statistics_calc, backend, embedding_net=None, n_sample value is False. epochs_early_stopping_interval: integer, optional The frequency at which the validation error is compared in order to decide whether to early stop the - training or not. Namely, if `epochs_early_stopping_interval=10`, early stopping can happen only at epochs multiple of - 10. Defaul value is 1. + training or not. Namely, if `epochs_early_stopping_interval=10`, early stopping can happen only at epochs + multiple of 10. Default value is 1. start_epoch_early_stopping: integer, optional The epoch after which early stopping can happen; in fact, as soon as training starts, there may be a transient period in which the loss increases. Default value is 10. @@ -555,9 +629,8 @@ def __init__(self, model, statistics_calc, backend, embedding_net=None, n_sample If True, a scaler of the class `sklearn.preprocessing.MinMaxScaler` will be fit on the training data before neural network training, and training and validation data simulations data will be rescaled. When calling the `get_statistics` method, - a network of the class `ScalerAndNet` will be used in instantiating the statistics; this network is a - wrapper of a neural network and a scaler and transforms the data with the scaler before applying the neural - network. + the network will be wrapped by :class:`abcpy.NN_utilities.networks.ScalerAndNet`; this automatically + takes care of transforming the data with the scaler before applying the neural network. It is highly recommended to use a scaler, as neural networks are sensitive to the range of input data. A case in which you may not want to use a scaler is timeseries data, as the scaler works independently on each feature of the data. @@ -583,8 +656,6 @@ def __init__(self, model, statistics_calc, backend, embedding_net=None, n_sample If a scheduler is provided, for the first `start_epoch_training` epochs the scheduler is applied to modify the learning rate without training the network. From then on, the training proceeds normally, applying both the scheduler and the optimizer at each epoch. Default to 0. - verbose: boolean, optional - if True, prints more information from the training routine. Default to False. use_tqdm : boolean, optional Whether using tqdm or not to display progress. Defaults to True. optimizer_kwargs: Python dictionary, optional @@ -613,15 +684,18 @@ def __init__(self, model, statistics_calc, backend, embedding_net=None, n_sample class TripletDistanceLearning(StatisticsLearningNN): """This class implements the statistics learning technique by using the triplet loss [1] for distance learning as - described in Pacchiardi et al. 2019 [2]. + described in Pacchiardi et al. 2021 [2]. - In order to use this technique, Pytorch is required to handle the neural networks. + In order to use this technique, Pytorch is required to handle the neural networks. 
However, the user does not need + to be fluent in Pytorch as some default neural networks are instantiated if the user does not provide them + (experienced users can of course provide the neural networks they would like to use). GPUs can be used to + train the neural networks if required. [1] Schroff, F., Kalenichenko, D. and Philbin, J., 2015. Facenet: A unified embedding for face recognition and clustering. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 815-823). - [2] Pacchiardi, L., Kunzli, P., Schoengens, M., Chopard, B. and Dutta, R., 2019. Distance-learning For Approximate - Bayesian Computation To Model a Volcanic Eruption. arXiv preprint arXiv:1909.13118. + [2] Pacchiardi, L., Kunzli, P., Schöngens, M., Chopard, B., and Dutta, R., 2021. Distance-learning for + approximate bayesian computation to model a volcanic eruption. Sankhya B, 83(1), 288-317. """ def __init__(self, model, statistics_calc, backend, embedding_net=None, n_samples=1000, n_samples_val=0, @@ -636,17 +710,22 @@ def __init__(self, model, statistics_calc, backend, embedding_net=None, n_sample ---------- model: abcpy.models.Model Model object that conforms to the Model class. - statistics_cal: abcpy.statistics.Statistics - Statistics object that conforms to the Statistics class. + statistics_calc: abcpy.statistics.Statistics + Statistics object that conforms to the Statistics class, applied before learning the transformation. backend: abcpy.backends.Backend Backend object that conforms to the Backend class. embedding_net: torch.nn object or list - it can be a torch.nn object with input size corresponding to size of model output (output size can be any); - alternatively, a list with integer numbers denoting the width of the hidden layers, from which a fully - connected network with that structure is created, having the input and output size corresponding to size of - model output and number of parameters. In case this is None, the depth of the network and the width of the - hidden layers is determined from the input and output size as specified in - abcpy.NN_utilities.networks.DefaultNN. + it can be a torch.nn object with input size corresponding to size of model output + (after being transformed by `statistics_calc`), alternatively, a list + with integer numbers denoting the width of the hidden layers, from which a fully connected network with + that structure is created, having the input and output size corresponding to size of model output + (after being transformed by `statistics_calc`) and + number of parameters. In case this is None, a fully connected neural network with three hidden layers is + used; the width of the hidden layers is given by + ``[int(input_size * 1.5), int(input_size * 0.75 + output_size * 3), int(output_size * 5)]``, + where `input_size` is the size of the data after being transformed by `statistics_calc`, while `output_size` + is the number of parameters in the model. For further details check + :func:`abcpy.NN_utilities.networks.createDefaultNN` n_samples: int, optional The number of (parameter, simulated data) tuple to be generated to learn the summary statistics in pilot step. The default value is 1000. @@ -665,7 +744,8 @@ def __init__(self, model, statistics_calc, backend, embedding_net=None, n_sample simulations: array, optional A numpy array with shape (n_samples, output_size) that is used, together with `parameters` to fit the summary selection learning algorithm. 
It has to be provided together with `parameters`, in which case no - other simulations are performed to generate the training data. Default value is None. + other simulations are performed to generate the training data. These are transformed by the + `statistics_calc` statistics before the learning step is done. Default value is None. parameters_val: array, optional A numpy array with shape (n_samples_val, n_parameters) that is used, together with `simulations_val` as a validation set in the summary selection learning algorithm. It has to be provided together with @@ -674,8 +754,9 @@ def __init__(self, model, statistics_calc, backend, embedding_net=None, n_sample simulations_val: array, optional A numpy array with shape (n_samples_val, output_size) that is used, together with `parameters_val` as a validation set in the summary selection learning algorithm. It has to be provided together with - `parameters_val`, in which case no other simulations are performed to generate the validation set. Default - value is None. + `parameters_val`, in which case no other simulations are performed to generate the validation set. + These are transformed by the `statistics_calc` statistics before the learning step is done. + Default value is None. early_stopping: boolean, optional If True, the validation set (which needs to be either provided through the arguments `parameters_val` and `simulations_val` or generated by setting `n_samples_val` to a value larger than 0) is used to early stop @@ -684,7 +765,7 @@ def __init__(self, model, statistics_calc, backend, embedding_net=None, n_sample epochs_early_stopping_interval: integer, optional The frequency at which the validation error is compared in order to decide whether to early stop the training or not. Namely, if `epochs_early_stopping_interval=10`, early stopping can happen only at epochs - multiple of 10. Defaul value is 1. + multiple of 10. Default value is 1. start_epoch_early_stopping: integer, optional The epoch after which early stopping can happen; in fact, as soon as training starts, there may be a transient period in which the loss increases. Default value is 10. @@ -696,9 +777,8 @@ def __init__(self, model, statistics_calc, backend, embedding_net=None, n_sample If True, a scaler of the class `sklearn.preprocessing.MinMaxScaler` will be fit on the training data before neural network training, and training and validation data simulations data will be rescaled. When calling the `get_statistics` method, - a network of the class `ScalerAndNet` will be used in instantiating the statistics; this network is a - wrapper of a neural network and a scaler and transforms the data with the scaler before applying the neural - network. + the network will be wrapped by :class:`abcpy.NN_utilities.networks.ScalerAndNet`; this automatically + takes care of transforming the data with the scaler before applying the neural network. It is highly recommended to use a scaler, as neural networks are sensitive to the range of input data. A case in which you may not want to use a scaler is timeseries data, as the scaler works independently on each feature of the data. @@ -729,8 +809,6 @@ def __init__(self, model, statistics_calc, backend, embedding_net=None, n_sample If a scheduler is provided, for the first `start_epoch_training` epochs the scheduler is applied to modify the learning rate without training the network. From then on, the training proceeds normally, applying both the scheduler and the optimizer at each epoch. Default to 0. 
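As a hedged illustration of the optimizer/scheduler hooks described above, assuming the constructor exposes `optimizer`, `scheduler`, `optimizer_kwargs`, `scheduler_kwargs` and `start_epoch_training` keywords as the surrounding docstring suggests (the choice of Adam and StepLR is only an example; `model`, `statistics_calc` and `backend` are assumed to be defined):

    from torch.optim import Adam, lr_scheduler
    from abcpy.statisticslearning import TripletDistanceLearning

    statistics_learning = TripletDistanceLearning(
        [model], statistics_calc, backend,
        n_samples=1000, n_samples_per_param=10,
        optimizer=Adam, optimizer_kwargs={"lr": 1e-3},
        scheduler=lr_scheduler.StepLR, scheduler_kwargs={"step_size": 50, "gamma": 0.5},
        start_epoch_training=0)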
- verbose: boolean, optional - if True, prints more information from the training routine. Default to False. use_tqdm : boolean, optional Whether using tqdm or not to display progress. Defaults to True. optimizer_kwargs: Python dictionary, optional @@ -763,16 +841,19 @@ def __init__(self, model, statistics_calc, backend, embedding_net=None, n_sample class ContrastiveDistanceLearning(StatisticsLearningNN): """This class implements the statistics learning technique by using the contrastive loss [1] for distance learning - as described in Pacchiardi et al. 2019 [2]. + as described in Pacchiardi et al. 2021 [2]. - In order to use this technique, Pytorch is required to handle the neural networks. + In order to use this technique, Pytorch is required to handle the neural networks. However, the user does not need + to be fluent in Pytorch as some default neural networks are instantiated if the user does not provide them + (experienced users can of course provide the neural networks they would like to use). GPUs can be used to + train the neural networks if required. [1] Hadsell, R., Chopra, S. and LeCun, Y., 2006, June. Dimensionality reduction by learning an invariant mapping. In 2006 IEEE Computer Society Conference on Computer Vision and Pattern Recognition (CVPR'06) (Vol. 2, pp. 1735-1742). IEEE. - [2] Pacchiardi, L., Kunzli, P., Schoengens, M., Chopard, B. and Dutta, R., 2019. Distance-learning For Approximate - Bayesian Computation To Model a Volcanic Eruption. arXiv preprint arXiv:1909.13118. + [2] Pacchiardi, L., Kunzli, P., Schöngens, M., Chopard, B., and Dutta, R., 2021. Distance-learning for + approximate bayesian computation to model a volcanic eruption. Sankhya B, 83(1), 288-317. """ def __init__(self, model, statistics_calc, backend, embedding_net=None, n_samples=1000, n_samples_val=0, @@ -786,21 +867,30 @@ def __init__(self, model, statistics_calc, backend, embedding_net=None, n_sample ---------- model: abcpy.models.Model Model object that conforms to the Model class. - statistics_cal: abcpy.statistics.Statistics - Statistics object that conforms to the Statistics class. + statistics_calc: abcpy.statistics.Statistics + Statistics object that conforms to the Statistics class, applied before learning the transformation. backend: abcpy.backends.Backend Backend object that conforms to the Backend class. embedding_net: torch.nn object or list - it can be a torch.nn object with input size corresponding to size of model output (output size can be any); - alternatively, a list with integer numbers denoting the width of the hidden layers, from which a fully - connected network with that structure is created, having the input and output size corresponding to size of - model output and number of parameters. In case this is None, the depth of the network and the width of the - hidden layers is determined from the input and output size as specified in - abcpy.NN_utilities.networks.DefaultNN. + it can be a torch.nn object with input size corresponding to size of model output + (after being transformed by `statistics_calc`), alternatively, a list + with integer numbers denoting the width of the hidden layers, from which a fully connected network with + that structure is created, having the input and output size corresponding to size of model output + (after being transformed by `statistics_calc`) and + number of parameters. 
In case this is None, a fully connected neural network with three hidden layers is + used; the width of the hidden layers is given by + ``[int(input_size * 1.5), int(input_size * 0.75 + output_size * 3), int(output_size * 5)]``, + where `input_size` is the size of the data after being transformed by `statistics_calc`, while `output_size` + is the number of parameters in the model. For further details check + :func:`abcpy.NN_utilities.networks.createDefaultNN` n_samples: int, optional The number of (parameter, simulated data) tuple to be generated to learn the summary statistics in pilot step. The default value is 1000. This is ignored if `simulations` and `parameters` are provided. + n_samples_val: int, optional + The number of (parameter, simulated data) tuple to be generated to be used as a validation set in the pilot + step. The default value is 0, which means no validation set is used. + This is ignored if `simulations_val` and `parameters_val` are provided. n_samples_per_param: int, optional Number of data points in each simulated data set. This is ignored if `simulations` and `parameters` are provided. Default to 1. @@ -811,7 +901,31 @@ def __init__(self, model, statistics_calc, backend, embedding_net=None, n_sample simulations: array, optional A numpy array with shape (n_samples, output_size) that is used, together with `parameters` to fit the summary selection learning algorithm. It has to be provided together with `parameters`, in which case no - other simulations are performed. Default value is None. + other simulations are performed. These are transformed by the + `statistics_calc` statistics before the learning step is done. Default value is None. + parameters_val: array, optional + A numpy array with shape (n_samples_val, n_parameters) that is used, together with `simulations_val` as a + validation set in the summary selection learning algorithm. It has to be provided together with + `simulations_val`, in which case no other simulations are performed to generate the validation set. Default + value is None. + simulations_val: array, optional + A numpy array with shape (n_samples_val, output_size) that is used, together with `parameters_val` as a + validation set in the summary selection learning algorithm. It has to be provided together with + `parameters_val`, in which case no other simulations are performed to generate the validation set. + These are transformed by the `statistics_calc` statistics before the learning step is done. + Default value is None. + early_stopping: boolean, optional + If True, the validation set (which needs to be either provided through the arguments `parameters_val` and + `simulations_val` or generated by setting `n_samples_val` to a value larger than 0) is used to early stop + the training of the neural network as soon as the loss on the validation set starts to increase. Default + value is False. + epochs_early_stopping_interval: integer, optional + The frequency at which the validation error is compared in order to decide whether to early stop the + training or not. Namely, if `epochs_early_stopping_interval=10`, early stopping can happen only at epochs + multiple of 10. Default value is 1. + start_epoch_early_stopping: integer, optional + The epoch after which early stopping can happen; in fact, as soon as training starts, there may be a + transient period in which the loss increases. Default value is 10. seed: integer, optional Optional initial seed for the random number generator. The default value is generated randomly. 
cuda: boolean, optional @@ -820,9 +934,8 @@ def __init__(self, model, statistics_calc, backend, embedding_net=None, n_sample If True, a scaler of the class `sklearn.preprocessing.MinMaxScaler` will be fit on the training data before neural network training, and training and validation data simulations data will be rescaled. When calling the `get_statistics` method, - a network of the class `ScalerAndNet` will be used in instantiating the statistics; this network is a - wrapper of a neural network and a scaler and transforms the data with the scaler before applying the neural - network. + the network will be wrapped by :class:`abcpy.NN_utilities.networks.ScalerAndNet`; this automatically + takes care of transforming the data with the scaler before applying the neural network. It is highly recommended to use a scaler, as neural networks are sensitive to the range of input data. A case in which you may not want to use a scaler is timeseries data, as the scaler works independently on each feature of the data. @@ -857,8 +970,6 @@ def __init__(self, model, statistics_calc, backend, embedding_net=None, n_sample If a scheduler is provided, for the first `start_epoch_training` epochs the scheduler is applied to modify the learning rate without training the network. From then on, the training proceeds normally, applying both the scheduler and the optimizer at each epoch. Default to 0. - verbose: boolean, optional - if True, prints more information from the training routine. Default to False. use_tqdm : boolean, optional Whether using tqdm or not to display progress. Defaults to True. optimizer_kwargs: Python dictionary, optional @@ -890,3 +1001,814 @@ def __init__(self, model, statistics_calc, backend, embedding_net=None, n_sample optimizer_kwargs=optimizer_kwargs, scheduler_kwargs=scheduler_kwargs, loader_kwargs=loader_kwargs) + + +class ExponentialFamilyScoreMatching(StatisticsLearningWithLosses, GraphTools): + """This class implements the statistics learning technique using exponential family as described in Pacchiardi et + al. 2020 [1]. Specifically, the idea is to fit an exponential family to a set of parameter-simulation pairs from + the model, with two neural networks representing the summary statistics and the natural parameters of the + exponential family. Once the neural networks have been fit, the one representing summary statistics can be used + in ABC. + + In order to fit the exponential family (which has an intractable normalization constant), the class relies on + Score Matching [2] or on Sliced Score Matching [3], which is a faster stochastic version of the former. + + As Score Matching and its sliced version work on an unbounded simulation space, in case the simulations from the + model are bounded, they need to be transformed to an unbounded space. This class takes care of this step, provided + that the user specifies the simulation bounds in the `lower_bound_simulations` and `upper_bound_simulations` + arguments. + + In order to use this technique, Pytorch is required to handle the neural networks. However, the user does not need + to be fluent in Pytorch as some default neural networks are instantiated if the user does not provide them + (experienced users can of course provide the neural networks they would like to use). GPUs can be used to + train the neural networks if required. + + [1] Pacchiardi, L., and Dutta, R., 2020. Score Matched Conditional Exponential Families for + Likelihood-Free Inference. arXiv preprint arXiv:2012.10903.
+ + [2] Hyvärinen, A., and Dayan, P., 2005. Estimation of non-normalized statistical models by score matching. + Journal of Machine Learning Research, 6(4). + + [3] Song, Y., Garg, S., Shi, J. and Ermon, S., 2019. Sliced score matching: A scalable approach to + density and score estimation. arXiv preprint arXiv:1905.07088, 2019 + """ + + def __init__(self, model, statistics_calc, backend, simulations_net=None, parameters_net=None, + embedding_dimension=None, + n_samples=1000, n_samples_val=0, parameters=None, simulations=None, + parameters_val=None, simulations_val=None, + lower_bound_simulations=None, upper_bound_simulations=None, + sliced=True, noise_type='radermacher', variance_reduction=False, + n_epochs=100, batch_size=16, + scale_samples=True, scale_parameters=False, + early_stopping=False, epochs_early_stopping_interval=1, start_epoch_early_stopping=10, + cuda=None, load_all_data_GPU=False, seed=None, + nonlinearity_simulations=None, nonlinearity_parameters=None, + batch_norm=True, batch_norm_momentum=0.1, batch_norm_update_before_test=False, + lr_simulations=1e-3, lr_parameters=1e-3, lam=0.0, + optimizer_simulations=None, optimizer_parameters=None, + scheduler_simulations=None, scheduler_parameters=None, start_epoch_training=0, + optimizer_simulations_kwargs={}, optimizer_parameters_kwargs={}, scheduler_simulations_kwargs={}, + scheduler_parameters_kwargs={}, + use_tqdm=True): + """ + Parameters + ---------- + model: abcpy.models.Model + Model object that conforms to the Model class. + statistics_calc: abcpy.statistics.Statistics + Statistics object that conforms to the Statistics class, applied before learning the transformation. + backend: abcpy.backends.Backend + Backend object that conforms to the Backend class. + simulations_net: torch.nn object or list, optional + The neural network which transforms the simulations to the summary statistics of the exponential family. + At the end of the training routine, the output of `simulations_net` (except for the last component) + will be the learned summary statistics. + It can be a torch.nn object with input size corresponding to size of model output + (after being transformed by `statistics_calc`) or, alternatively, a list + with integer numbers denoting the width of the hidden layers, from which a fully connected network with + that structure is created, having the input size corresponding to size of model output + (after being transformed by `statistics_calc`) and the output size determined by + `embedding_dimension` (see below). Importantly, the output size of `simulations_net` needs to be equal to + the output size of `parameters_net` increased by one, as the two are used together in the code. If both nets + are left to their default values, this is done automatically. + In case this is None, a fully connected neural network with three + hidden layers is used; the width of the hidden layers is given by + ``[int(input_size * 1.5), int(input_size * 0.75 + output_size * 3), int(output_size * 5)]``, + where `input_size` is the size of the data after being transformed by `statistics_calc`, while + `output_size` is determined by `embedding_dimension`. For further details check + :func:`abcpy.NN_utilities.networks.createDefaultNN`. By default, this is None. + parameters_net: torch.nn object or list, optional + The neural network which maps the parameters to the natural parametrization form of the exponential family. 
+ It can be a torch.nn object with input size corresponding to the number of parameters + or, alternatively, a list + with integer numbers denoting the width of the hidden layers, from which a fully connected network with + that structure is created, having the input size corresponding to the number of parameters + and the output size determined by + `embedding_dimension` (see below). Importantly, the output size of `parameters_net` needs to be equal to + the output size of `simulations_net` decreased by one, as the two are used together in the code. + If both nets are left to their default values, this is done automatically. + In case this is None, a fully connected neural network with three + hidden layers is used; the width of the hidden layers is given by + ``[int(input_size * 1.5), int(input_size * 0.75 + output_size * 3), int(output_size * 5)]``, + where `input_size` is the number of parameters, while `output_size` is determined by `embedding_dimension`. + For further details check + :func:`abcpy.NN_utilities.networks.createDefaultNN`. By default, this is None. + embedding_dimension: integer, optional + Size of the learned summary statistics if `simulations_net` is None or a list. + Specifically, in these cases + `simulations_net` is automatically created having output size `embedding_dimension + 1`, of which all but + the latter components will represent the learned summaries (the latter instead is a learned base measure). + If also `parameters_net` is None or a list, it will be automatically created with output size equal to + `embedding_dimension`. By default `embedding_dimension` is None, in which case it is fixed to the number + of parameters in the model. + n_samples: int, optional + The number of (parameter, simulated data) tuple to be generated to learn the summary statistics in pilot + step. The default value is 1000. + This is ignored if `simulations` and `parameters` are provided. + n_samples_val: int, optional + The number of (parameter, simulated data) tuple to be generated to be used as a validation set in the pilot + step. The default value is 0, which means no validation set is used. + This is ignored if `simulations_val` and `parameters_val` are provided. + parameters: array, optional + A numpy array with shape (n_samples, n_parameters) that is used, together with `simulations` to fit the + summary selection learning algorithm. It has to be provided together with `simulations`, in which case no + other simulations are performed to generate the training data. Default value is None. + simulations: array, optional + A numpy array with shape (n_samples, output_size) that is used, together with `parameters` to fit the + summary selection learning algorithm. It has to be provided together with `parameters`, in which case no + other simulations are performed to generate the training data. These are transformed by the + `statistics_calc` statistics before the learning step is done. Default value is None. + parameters_val: array, optional + A numpy array with shape (n_samples_val, n_parameters) that is used, together with `simulations_val` as a + validation set in the summary selection learning algorithm. It has to be provided together with + `simulations_val`, in which case no other simulations are performed to generate the validation set. Default + value is None. + simulations_val: array, optional + A numpy array with shape (n_samples_val, output_size) that is used, together with `parameters_val` as a + validation set in the summary selection learning algorithm. 
It has to be provided together with + `parameters_val`, in which case no other simulations are performed to generate the validation set. Default + value is None. + lower_bound_simulations: np.ndarray, optional + Array of the same length of the simulations on which the statistics will be learned (therefore, after + `statistics_calc` has been applied). It contains the lower bounds of the simulations, with each entry + being either None or a number. It works together with `upper_bound_simulations` to determine the + nonlinear transformation mapping the bounded space to an unbounded one: if both upper and lower + bounds for a given entry are None, no transformation is applied to that entry. If both of them are numbers, + a transformation mapping a compact domain to an unbounded one is applied. If instead the lower bound is a + number and the upper one is None, a transformation for lower bounded variables is applied. More details on + the transformations can be found at :class:`abcpy.transformers.BoundedVarTransformer`. By default, + `lower_bound_simulations` is None, in which case all variables are assumed to not be lower bounded. + upper_bound_simulations: np.ndarray, optional + Array of the same length of the simulations on which the statistics will be learned (therefore, after + `statistics_calc` has been applied). It contains the upper bounds of the simulations, with each entry + being either None or a number. It works together with `lower_bound_simulations` to determine the + nonlinear transformation mapping the bounded space to an unbounded one: if both upper and lower + bounds for a given entry are None, no transformation is applied to that entry. If both of them are numbers, + a transformation mapping a compact domain to an unbounded one is applied. If instead the lower bound is a + number and the upper one is None, a transformation for lower bounded variables is applied. More details on + the transformations can be found at :class:`abcpy.transformers.BoundedVarTransformer`. By default, + `upper_bound_simulations` is None, in which case all variables are assumed to not be upper bounded. + sliced: boolean, optional + If True, the exponential family is fit with the sliced Score Matching approach, which is a faster + (stochastic) version of Score Matching. If False, the full Score Matching approach is used. Default is True. + noise_type: basestring, optional + Denotes the noise type used in the sliced Score Matching version. It can be 'radermacher', 'gaussian' or + 'sphere', with 'radermacher' being the default one. Ignored if `sliced=False`. + variance_reduction: boolean, optional + If True, use the variance reduction version of Sliced Score Matching (when that is used), which replaces a + term with its exact expectation over the noise distribution. Cannot be used when `noise=sphere`. + Default is False, ignored if `sliced=False`. + n_epochs: integer, optional + the number of epochs used for training the neural network. Default is 100 + batch_size: integer, optional + the batch size used for training the neural network. Default is 16 + scale_samples: boolean, optional + If True, the simulations are scaled to the (0,1) range before the transformation is learned (i.e., before + being fed to the neural network). This happens after the simulations have been transformed with + `statistics_calc` and after the (optional) nonlinear transformation governed by `lower_bound_simulations` + and `upper_bound_simulations` is applied. This relies on a wrapping of `sklearn.preprocessing.MinMaxScaler`. 
+ The validation set will also be rescaled in the same fashion. + When calling the `get_statistics` and the `get_simulations_network` methods, + the network will be wrapped by :class:`abcpy.NN_utilities.networks.ScalerAndNet`; this automatically + takes care of transforming the data with the scaler before applying the neural network. + It is highly recommended to use a scaler, as neural networks are sensitive to the range of input data. A + case in which you may not want to use a scaler is timeseries data, as the scaler works independently on each + feature of the data. + Default value is True. + scale_parameters: boolean, optional + If True, the parameters are scaled to the (0,1) range before the natural parameters transformation + is learned (i.e., before being fed to the neural network). + This relies on a wrapping of `sklearn.preprocessing.MinMaxScaler`. + The validation set will also be rescaled in the same fashion. + When calling the `get_statistics` and the `get_parameters_network` methods, + the network will be wrapped by :class:`abcpy.NN_utilities.networks.ScalerAndNet`; this automatically + takes care of transforming the data with the scaler before applying the neural network. + For parameters, the scaler is not as critical as for simulations, as parameters usually have smaller ranges. + If however the different parameters differ by orders of magnitude, using a scaler is recommended. + Default value is False. + early_stopping: boolean, optional + If True, the validation set (which needs to be either provided through the arguments `parameters_val` and + `simulations_val` or generated by setting `n_samples_val` to a value larger than 0) is used to early stop + the training of the neural network as soon as the loss on the validation set starts to increase. Default + value is False. + epochs_early_stopping_interval: integer, optional + The frequency at which the validation error is compared in order to decide whether to early stop the + training or not. Namely, if `epochs_early_stopping_interval=10`, early stopping can happen only at epochs + multiple of 10. Default value is 1. + start_epoch_early_stopping: integer, optional + The epoch after which early stopping can happen; in fact, as soon as training starts, there may be a + transient period in which the loss increases. Default value is 10. + cuda: boolean, optional + If cuda=None, it will select GPU if it is available. Alternatively, you can specify True to use GPU or False + to use CPU. + load_all_data_GPU: boolean, optional + If True and a GPU is used, the whole dataset is loaded on the GPU before training begins; this may + speed up training as it avoids transfers between CPU and GPU, but it is not guaranteed to do so. Note that + setting this to True will cause a crash if the dataset is too large to fit in GPU memory. + Default to False; you should not rely too much on this. + seed: integer, optional + Optional initial seed for the random number generator. The default value is generated randomly. + nonlinearity_simulations: torch.nn class, optional + If the neural network for the simulations is built automatically (i.e. when `simulations_net` is either a + list or None), then this nonlinearity is used. Default is `torch.nn.Softplus`. This is because the Score + Matching routine (when `sliced=False`) needs the output of the simulations net to have a non-zero second + derivative with respect to the data, which does not happen when using the common ReLU nonlinearity.
+ nonlinearity_parameters: torch.nn class, optional + If the neural network for the parameters is built automatically (i.e. when `parameters_net` is either a + list or None), then this nonlinearity is used. Default is `torch.nn.ReLU`. + batch_norm: boolean, optional + If True, a batch normalization layer is put on top of the parameters net when that is built automatically. + This improves the performance of the method as it reduces the degeneracy of the + (summary statistics) * (natural parameters) product. Default is True. + batch_norm_momentum: float, optional + Momentum value with which the batch estimates in the batch norm layer are updated at each batch; see + `torch.nn.BatchNorm1d` for more information. Default is 0.1. Ignored if `batch_norm` is False, or if + an actual `parameters_net` is provided. + batch_norm_update_before_test: boolean, optional + When using a batch norm layer on the test set, the resulting test loss evaluation can be noisy as the + batch norm estimates change during the train phase. To reduce this issue, it is enough to perform a simple + forward pass of the full train set (without backprop or loss evaluation) before the testing phase is + started. Set `batch_norm_update_before_test=True` to do that. Default is False. + Ignored if `batch_norm` is False, if an actual `parameters_net` is provided, as well as if no test set + is present. + lr_simulations: float, optional + The learning rate to be used in the iterative training scheme for the simulations neural network. + Default to 1e-3. + lr_parameters: float, optional + The learning rate to be used in the iterative training scheme for the parameters neural network. + Default to 1e-3. + lam: float, optional + If the full Score Matching approach is used (i.e. `sliced=False`) this denotes the amount of + second derivative regularization added to the Score Matching loss in the way proposed in Kingma & LeCun + (2010). Default is 0, corresponding to no regularization. + optimizer_simulations: torch Optimizer class, optional + A torch Optimizer class, for instance `SGD` or `Adam`, to be used for the simulations network. + Default to `Adam`. Additional parameters may be passed through the `optimizer_simulations_kwargs` argument. + optimizer_parameters: torch Optimizer class, optional + A torch Optimizer class, for instance `SGD` or `Adam`, to be used for the parameters network. + Default to `Adam`. Additional parameters may be passed through the `optimizer_parameters_kwargs` argument. + scheduler_simulations: torch _LRScheduler class, optional + A torch _LRScheduler class, used to modify the learning rate across epochs for the simulations net. + By default, a :class:`torch.optim.lr_scheduler.ExponentialLR` scheduler with `gamma=0.99` is used. + Additional arguments may be passed through the `scheduler_simulations_kwargs` parameter. + scheduler_parameters: torch _LRScheduler class, optional + A torch _LRScheduler class, used to modify the learning rate across epochs for the parameters net. + By default, a :class:`torch.optim.lr_scheduler.ExponentialLR` scheduler with `gamma=0.99` is used. + Additional arguments may be passed through the `scheduler_parameters_kwargs` parameter. + start_epoch_training: integer, optional + If a scheduler is used, for the first `start_epoch_training` epochs the scheduler is applied to modify + the learning rate without training the network. From then on, the training proceeds normally, applying both + the scheduler and the optimizer at each epoch. Default to 0.
+ optimizer_simulations_kwargs: Python dictionary, optional + dictionary containing optional keyword arguments for the optimizer used for the simulations network. + optimizer_parameters_kwargs: Python dictionary, optional + dictionary containing optional keyword arguments for the optimizer used for the parameters network. + scheduler_simulations_kwargs: Python dictionary, optional + dictionary containing optional keyword arguments for the simulations scheduler. + scheduler_parameters_kwargs: Python dictionary, optional + dictionary containing optional keyword arguments for the parameters scheduler. + use_tqdm : boolean, optional + Whether using tqdm or not to display progress. Defaults to True. + """ + self.logger = logging.getLogger(__name__) + self.scale_samples = scale_samples + self.scale_parameters = scale_parameters + self.sliced = sliced + + if lower_bound_simulations is not None and not hasattr(lower_bound_simulations, "shape"): + raise RuntimeError("Provided lower bounds need to be a numpy array.") + if upper_bound_simulations is not None and not hasattr(upper_bound_simulations, "shape"): + raise RuntimeError("Provided upper bounds need to be a numpy array.") + if upper_bound_simulations is not None and lower_bound_simulations is not None and \ + lower_bound_simulations.shape != upper_bound_simulations.shape: + raise RuntimeError("Provided lower and upper bounds need to have same shape.") + + # Define device + if not has_torch: + raise ImportError( + "Pytorch is required to instantiate an element of the {} class, in order to handle " + "neural networks. Please install it. ".format(self.__class__.__name__)) + + # set random seed for torch as well: + if seed is not None: + torch.manual_seed(seed) + + if cuda is None: + cuda = torch.cuda.is_available() + elif cuda and not torch.cuda.is_available(): + # if the user requested to use GPU but no GPU is there + cuda = False + self.logger.warning( + "You requested to use GPU but no GPU is available! The computation will proceed on CPU.") + + self.device = "cuda" if cuda and torch.cuda.is_available() else "cpu" + if self.device == "cuda": + self.logger.debug("We are using GPU to train the network.") + else: + self.logger.debug("We are using CPU to train the network.") + + # this handles generation of the data (or its formatting in case the data is provided to the Semiautomatic + # class) + super(ExponentialFamilyScoreMatching, self).__init__(model, statistics_calc, backend, n_samples, n_samples_val, + 1, parameters, simulations, seed=seed, + parameters_val=parameters_val, simulations_val=simulations_val) + + # we have a validation set if it has the following attribute with size larger than 0 + self.has_val_set = hasattr(self, "sample_parameters_val") and len(self.sample_parameters_val) > 0 + + self.logger.info('Learning of the transformation...') + # Define Data + parameters, simulations = self.sample_parameters, self.sample_statistics + if self.has_val_set: + parameters_val, simulations_val = self.sample_parameters_val, self.sample_statistics_val + else: + parameters_val, simulations_val = None, None + + # define the scaler for the simulations by transforming them to an unbounded domain (if needed, according to how + # the bounds are passed) and then rescaling to the [0,1] interval.
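# Note on the step below: for a lower-bounded entry with bound l the scaler applies x -> log(x - l), for an
# entry bounded in (l, u) it applies the (rescaled) logit x -> log((x - l) / (u - x)), and entries whose bounds
# are both None are left untouched; when `rescale_transformed_vars=True` the transformed values are further
# mapped to (0, 1) with a MinMax rescaling fit on the training simulations.
# Hypothetical example (bounds chosen purely for illustration): with
#     lower_bound_simulations = np.array([0, None], dtype=object)
#     upper_bound_simulations = np.array([1, None], dtype=object)
# the first simulation component is logit-transformed to an unbounded scale, while the second passes through
# unchanged before the optional (0, 1) rescaling.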
+ if lower_bound_simulations is None and upper_bound_simulations is None and not scale_samples: + # in this case we do not use any scaler for the simulations + self.has_scaler_for_simulations = False + else: + self.has_scaler_for_simulations = True + + if lower_bound_simulations is None: + lower_bound_simulations = np.array([None] * simulations.shape[1]) + if upper_bound_simulations is None: + upper_bound_simulations = np.array([None] * simulations.shape[1]) + # if both bounds are None this scaler will just behave as a MinMaxScaler. It may be slightly efficient to + # use that, but it does not matter for now. + self.scaler_simulations = BoundedVarScaler(lower_bound_simulations, upper_bound_simulations, + rescale_transformed_vars=self.scale_samples).fit(simulations) + simulations = self.scaler_simulations.transform(simulations) + if self.has_val_set: + simulations_val = self.scaler_simulations.transform(simulations_val) + + # now scale the parameters + if self.scale_parameters: + self.scaler_parameters = MinMaxScaler().fit(parameters) + parameters = self.scaler_parameters.transform(parameters) + if self.has_val_set: + parameters_val = self.scaler_parameters.transform(parameters_val) + + # torch.tensor(scaler.transform(samples.reshape(-1, samples.shape[-1])).astype("float32"), + # requires_grad=requires_grad).reshape(samples.shape) + + # transform to torch tensors: + simulations = torch.tensor(simulations.astype("float32"), requires_grad=True) + parameters = torch.tensor(parameters.astype("float32"), requires_grad=False) + if self.has_val_set: + simulations_val = torch.tensor(simulations_val.astype("float32"), requires_grad=True) + parameters_val = torch.tensor(parameters_val.astype("float32"), requires_grad=False) + + # now setup the default neural network or not + + if embedding_dimension is None: + embedding_dimension = parameters.shape[1] + + if isinstance(simulations_net, torch.nn.Module): + self.simulations_net = simulations_net + self.logger.debug('We use the provided neural network for the summary statistics') + + elif isinstance(simulations_net, list) or simulations_net is None: + # therefore we need to generate the neural network given the list. The following function returns a class + # of NN with given input size, output size and hidden sizes; then, need () to instantiate the network + self.simulations_net = createDefaultNNWithDerivatives( + input_size=simulations.shape[1], output_size=embedding_dimension + 1, hidden_sizes=simulations_net, + nonlinearity=torch.nn.Softplus if nonlinearity_simulations is None else nonlinearity_simulations)() + self.logger.debug('We generate a default neural network for the summary statistics') + else: + raise RuntimeError("'simulations_net' needs to be either a torch.nn.Module, or a list, or None.") + + if isinstance(parameters_net, torch.nn.Module): + self.parameters_net = parameters_net + self.logger.debug('We use the provided neural network for the parameters') + + elif isinstance(parameters_net, list) or parameters_net is None: + # therefore we need to generate the neural network given the list. 
The following function returns a class + # of NN with given input size, output size and hidden sizes; then, need () to instantiate the network + self.parameters_net = createDefaultNN( + input_size=parameters.shape[1], output_size=embedding_dimension, hidden_sizes=parameters_net, + nonlinearity=torch.nn.ReLU() if nonlinearity_parameters is None else nonlinearity_parameters(), + batch_norm_last_layer=batch_norm, batch_norm_last_layer_momentum=batch_norm_momentum)() + self.logger.debug('We generate a default neural network for the parameters') + else: + raise RuntimeError("'parameters_net' needs to be either a torch.nn.Module, or a list, or None.") + + if cuda: + self.simulations_net.cuda() + self.parameters_net.cuda() + + self.logger.debug('We now run the training routine') + self.train_losses, self.test_losses = self._train(simulations, parameters, simulations_val, parameters_val, + n_epochs=n_epochs, use_tqdm=use_tqdm, + early_stopping=early_stopping, + load_all_data_GPU=load_all_data_GPU, + lr_simulations=lr_simulations, lr_parameters=lr_parameters, + batch_size=batch_size, + epochs_test_interval=epochs_early_stopping_interval, + epochs_before_early_stopping=start_epoch_early_stopping, + batch_norm_update_before_test=batch_norm_update_before_test, + noise_type=noise_type, variance_reduction=variance_reduction, + optimizer_simulations=optimizer_simulations, + optimizer_parameters=optimizer_parameters, + scheduler_simulations=scheduler_simulations, + scheduler_parameters=scheduler_parameters, + optimizer_simulations_kwargs=optimizer_simulations_kwargs, + optimizer_parameters_kwargs=optimizer_parameters_kwargs, + scheduler_simulations_kwargs=scheduler_simulations_kwargs, + scheduler_parameters_kwargs=scheduler_parameters_kwargs, + lam=lam, start_epoch_training=start_epoch_training) + self.logger.info("Finished learning the transformation.") + + # move back the nets to CPU. + self.simulations_net.cpu() + self.parameters_net.cpu() + + def get_statistics(self, rescale_statistics=True): + """ + Returns a :class:`abcpy.statistics.NeuralEmbedding` Statistics implementing the learned transformation. + + If a scaler was used, the `net` attribute of the returned object is of the class + :class:`abcpy.NN_utilities.networks.ScalerAndNet`, which is a nn.Module object wrapping the scaler and the + learned neural network and applies the scaler before the data is fed through the neural network. + + Additionally, as the learned summary statistics are given by the output of the simulations network excluding + the last component, we wrap the neural network in the :class:`abcpy.NN_utilities.networks.DiscardLastOutputNet`, + which automatically takes care of discarding the last output anytime the summary statistics are required. + + Finally, notice that the summary statistics learned with the Exponential Family are only defined up to a + linear transformation. As such, it may happen that they have very different magnitudes. For this reason, we + provide an option by which the summary statistics are rescaled by their standard deviation on the training + (or test, whenever that was used) set. That is handled automatically by the + :class:`abcpy.statistics.NeuralEmbedding` class. + + Parameters + ---------- + rescale_statistics : boolean, optional + If this is set to True (default), then the returned statistics will be standardized, such that the + different components of the statistics have the same scale.
The standardization works by dividing each + statistic by the standard deviation achieved on a reference set of simulations. Here, the used set is + either the validation set used in training (if available) or the training set. If False, no + standardization is used. Defaults to True. + + Returns + ------- + :class:`abcpy.statistics.NeuralEmbedding` + a statistics object that implements the learned transformation. + """ + if self.has_scaler_for_simulations: + net = ScalerAndNet(net=DiscardLastOutputNet(self.simulations_net), scaler=self.scaler_simulations) + else: + net = DiscardLastOutputNet(self.simulations_net) + + if rescale_statistics: + # We need a hacky way here. In fact sample_statistics is the one you obtain after the first + # statistics_calc is applied; if you initialize Neural embedding with the statistics_calc directly, + # that is applied again, which is not correct. + # Then we first instantiate the NeuralEmbedding without statistics_calc, and then add it later. In this way + # the standard deviation is computed correctly, but the behavior of the Statistics in new data will also be + # correct. + statistics_calc_new = NeuralEmbedding(net=net, reference_simulations=self.sample_statistics_val + if self.has_val_set else self.sample_statistics) + statistics_calc_new.previous_statistics = self.statistics_calc + else: + statistics_calc_new = NeuralEmbedding(net=net, previous_statistics=self.statistics_calc) + return statistics_calc_new + + def get_simulations_network(self): + """ + Returns the learned neural network for the simulations, representing the learned summary statistics; + if a scaler was used, the neural net is of the class + :class:`abcpy.NN_utilities.networks.ScalerAndNet`, which is a nn.Module object wrapping the scaler and the + learned neural network and automatically applies the scaler before the data is fed through the neural network. + The original neural network is contained in the `net` attribute of that. + + Returns + ------- + torch.nn object + the learned simulations neural network + """ + return ScalerAndNet(self.simulations_net, + self.scaler_simulations) if self.has_scaler_for_simulations else self.simulations_net + + def get_parameters_network(self): + """ + Returns the learned neural network for the parameters, representing the learned natural parameters in the + exponential family; if a scaler was used, the neural net is of the class + :class:`abcpy.NN_utilities.networks.ScalerAndNet`, which is a nn.Module object wrapping the scaler and the + learned neural network and automatically applies the scaler before the data is fed through the neural network. + The original neural network is contained in the `net` attribute of that. + + Returns + ------- + torch.nn object + the learned parameters neural network + """ + return ScalerAndNet(self.parameters_net, self.scaler_parameters) if self.scale_parameters else \ + self.parameters_net + + def get_simulations_scaler(self): + """ + Returns the scaler used for transforming the simulations before feeding them through the neural network. + Specifically, it can be a :class:`abcpy.transformers.BoundedVarScaler` if the simulations are transformed from + a bounded to an unbounded domain with a nonlinear transformation, or a + :class:`sklearn.preprocessing.MinMaxScaler` if the simulations were rescaled to (0,1) without non linear + transformation. It returns `None` if no scaler was used. 
The :class:`abcpy.transformers.BoundedVarScaler` + conforms to the same API as standard sklearn scalers, so it can be used in the same fashion. + + Returns + ------- + abcpy.transformers.BoundedVarScaler, sklearn.preprocessing.MinMaxScaler or None + the scaler applied to simulations before the neural network (if present). + """ + return self.scaler_simulations if self.has_scaler_for_simulations else None + + def get_parameters_scaler(self): + """ + Returns the scaler used for transforming the parameters before feeding them through the neural network. + It is an instance of :class:`sklearn.preprocessing.MinMaxScaler`, which rescales the parameters to (0,1). + It returns `None` if no scaler was used. + + Returns + ------- + sklearn.preprocessing.MinMaxScaler or None + the scaler applied to parameters before the neural network (if present). + """ + return self.scaler_parameters if self.scale_parameters else None + + def _train(self, samples_matrix, theta_vect, + samples_matrix_test=None, theta_vect_test=None, epochs_test_interval=10, + epochs_before_early_stopping=100, + early_stopping=False, + n_epochs=100, batch_size=None, lr_simulations=0.001, lr_parameters=0.001, + load_all_data_GPU=False, + batch_norm_update_before_test=False, + noise_type='radermacher', variance_reduction=False, + optimizer_simulations=None, optimizer_parameters=None, + scheduler_simulations=None, scheduler_parameters=None, + optimizer_simulations_kwargs={}, optimizer_parameters_kwargs={}, + scheduler_simulations_kwargs={}, scheduler_parameters_kwargs={}, + start_epoch_training=0, + lam=0.0, use_tqdm=False): + """This assumes samples matrix to be a 2d tensor with size (n_theta, size_sample) and theta_vect a 2d tensor with + size (n_theta, p). + """ + if self.sliced: + batch_steps = lambda samples, etas: self._single_sliced_score_matching(samples, etas, noise_type=noise_type, + variance_reduction=variance_reduction) + else: + batch_steps = lambda samples, etas: self._batch_Fisher_div_with_c_x(samples, etas, lam=lam) + + if load_all_data_GPU: + # we move all data to the gpu; it needs to be small enough + samples_matrix = samples_matrix.to(self.device) + if samples_matrix_test is not None: + samples_matrix_test = samples_matrix_test.to(self.device) + theta_vect = theta_vect.to(self.device) + if theta_vect_test is not None: + theta_vect_test = theta_vect_test.to(self.device) + + compute_test_loss = False + if theta_vect_test is not None and samples_matrix_test is not None: + test_loss_list = [] + compute_test_loss = True + n_theta_test = theta_vect_test.shape[0] + + if optimizer_simulations is None: + optimizer_simulations = Adam(self.simulations_net.parameters(), lr=lr_simulations, + **optimizer_simulations_kwargs) + else: + optimizer_simulations = optimizer_simulations(self.simulations_net.parameters(), lr=lr_simulations, + **optimizer_simulations_kwargs) + + if optimizer_parameters is None: + optimizer_parameters = Adam(self.parameters_net.parameters(), lr=lr_parameters, + **optimizer_parameters_kwargs) + else: + optimizer_parameters = optimizer_parameters(self.parameters_net.parameters(), lr=lr_parameters, + **optimizer_parameters_kwargs) + + if batch_size is None: # in this case use full batch + batch_size = theta_vect.shape[0] + + n_theta = theta_vect.shape[0] + + loss_list = [] + + # define now the LR schedulers: + enable_scheduler_simulations = True + enable_scheduler_parameters = True + + if scheduler_simulations is False: + enable_scheduler_simulations = False + else: + if scheduler_simulations is None: + # 
default scheduler + scheduler_simulations = lr_scheduler.ExponentialLR + if len(scheduler_simulations_kwargs) == 0: # no arguments provided + scheduler_simulations_kwargs = dict(gamma=0.99) + + # instantiate the scheduler + scheduler_simulations = scheduler_simulations(optimizer_simulations, **scheduler_simulations_kwargs) + + if scheduler_parameters is False: + enable_scheduler_parameters = False + else: + if scheduler_parameters is None: + # default scheduler + scheduler_parameters = lr_scheduler.ExponentialLR + if len(scheduler_parameters_kwargs) == 0: # no arguments provided + scheduler_parameters_kwargs = dict(gamma=0.99) + + # instantiate the scheduler + scheduler_parameters = scheduler_parameters(optimizer_parameters, **scheduler_parameters_kwargs) + + # initialize the state_dict variables: + net_state_dict = None + net_state_dict_theta = None + + for epoch in range(0, start_epoch_training): + if enable_scheduler_simulations: + scheduler_simulations.step() + if enable_scheduler_parameters: + scheduler_parameters.step() + + for epoch in tqdm(range(start_epoch_training, n_epochs), disable=not use_tqdm): + # print("epoch", epoch) + # set nets to train mode (needed as there may be a batchnorm layer there): + self.simulations_net.train() + self.parameters_net.train() + + indeces = self.rng.permutation(n_theta) # this may be a bottleneck computationally? + batch_index = 0 + total_train_loss_epoch = 0 + + # loop over batches + while batch_size * batch_index < n_theta: + # print(batch_index) + optimizer_simulations.zero_grad() + optimizer_parameters.zero_grad() + + # by writing in this way, if we go above the number of elements in the vector, you don't care + batch_indeces = indeces[batch_size * batch_index:batch_size * (batch_index + 1)] + + thetas_batch = theta_vect[batch_indeces].to(self.device) + + # compute the transformed parameter values for the batch: + etas = self.parameters_net(thetas_batch) + + samples_batch = samples_matrix[batch_indeces].to(self.device) + # now call the batch routine that takes care of forward step of simulations as well + batch_loss = batch_steps(samples_batch, etas) + + total_train_loss_epoch += batch_loss.item() + + # set requires_grad to False to save computation + if lr_simulations == 0: + set_requires_grad(self.simulations_net, False) + if lr_parameters == 0: + set_requires_grad(self.parameters_net, False) + + batch_loss.backward() + + # reset it + if lr_simulations == 0: + set_requires_grad(self.simulations_net, True) + if lr_parameters == 0: + set_requires_grad(self.parameters_net, True) + + optimizer_simulations.step() + optimizer_parameters.step() + + batch_index += 1 + + loss_list.append(total_train_loss_epoch / (batch_index + 1)) + + # at each epoch we compute the test loss; we need to use batches as well here, otherwise it may not fit + # to GPU memory + if compute_test_loss: + # first, we do forward pass of all the training data in order to update the batchnorm running means + # (if a batch norm layer is there): + if batch_norm_update_before_test: + with torch.no_grad(): + batch_index = 0 + while batch_size * batch_index < n_theta: + # the batchnorm is usually after the net; then, it is enough to feedforward the data there: + thetas_batch = theta_vect[batch_size * batch_index:batch_size * (batch_index + 1)].to( + self.device) + _ = self.parameters_net(thetas_batch) + batch_index += 1 + + self.simulations_net.eval() + self.parameters_net.eval() + + batch_index = 0 + total_test_loss_epoch = 0 + while batch_size * batch_index < n_theta_test: + # no 
need to shuffle the test data: + thetas_batch = theta_vect_test[batch_size * batch_index:batch_size * (batch_index + 1)].to( + self.device) + samples_batch = samples_matrix_test[batch_size * batch_index:batch_size * (batch_index + 1)].to( + self.device) + + # compute the transformed parameter values for the batch: + etas_test = self.parameters_net(thetas_batch) + + total_test_loss_epoch += batch_steps(samples_batch, etas_test).item() + + batch_index += 1 + + test_loss_list.append(total_test_loss_epoch / (batch_index + 1)) + + # the test loss on last step is larger than the training_dataset_index before, stop training + if early_stopping and (epoch + 1) % epochs_test_interval == 0: + # after `epochs_before_early_stopping` epochs, we can stop only if we saved a state_dict before + # (ie if at least epochs_test_interval epochs have passed). + if epoch + 1 > epochs_before_early_stopping and net_state_dict is not None: + if test_loss_list[-1] > test_loss_list[- 1 - epochs_test_interval]: + self.logger.info("Training has been early stopped at epoch {}.".format(epoch + 1)) + # reload the previous state dict: + self.simulations_net.load_state_dict(net_state_dict) + self.parameters_net.load_state_dict(net_state_dict_theta) + break # stop training + # if we did not stop: update the state dict + net_state_dict = self.simulations_net.state_dict() + net_state_dict_theta = self.parameters_net.state_dict() + + if enable_scheduler_simulations: + scheduler_simulations.step() + if enable_scheduler_parameters: + scheduler_parameters.step() + + # after training, return to eval mode: + self.simulations_net.eval() + self.parameters_net.eval() + + if compute_test_loss: + return loss_list, test_loss_list + else: + return loss_list, None + + def _batch_Fisher_div_with_c_x(self, samples, etas, lam=0): + # do the forward pass at once here: + if hasattr(self.simulations_net, "forward_and_derivatives"): + transformed_samples, f, s = self.simulations_net.forward_and_derivatives(samples) + else: + transformed_samples = self.simulations_net(samples) + f, s = jacobian_second_order(samples, transformed_samples, diffable=True) + + f = f.reshape(-1, f.shape[1], f.shape[2]) + s = s.reshape(-1, s.shape[1], s.shape[2]) + + return Fisher_divergence_loss_with_c_x(f, s, etas, lam=lam) / (samples.shape[0]) + + def _single_sliced_score_matching(self, samples, etas, noise=None, detach=False, noise_type='radermacher', + variance_reduction=False): + """Can either receive noise as an input or generate it. etas have been pre-transformed by + parameters net + This was modified from: + https://github.com/ermongroup/sliced_score_matching/blob/master/losses/sliced_sm.py + This function takes care of generating the projection samples and computing the loss by taking the grad. + + Here, only one projection is used for each sample; however the projection changes at each epoch. 
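In formulas: writing t(x) for the output of the simulations net and eta for the natural parameters with a constant 1 appended (so that the last component of t(x) plays the role of a log base measure), the unnormalized log density is eta . t(x), and the per-sample quantity computed below for a projection vector v is 0.5 * (v . grad_x [eta . t(x)])^2 + v . grad_x (v . grad_x [eta . t(x)]); with `variance_reduction=True` the first term is replaced by 0.5 * ||grad_x [eta . t(x)]||^2. The returned loss is the mean of this quantity over the batch.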
+ """ + etas = etas.view(-1, etas.shape[-1]) + etas = torch.cat((etas, torch.ones(etas.shape[0], 1).to(etas)), dim=1) # append a 1 + reshaped_samples = samples.view(-1, samples.shape[-1]) + reshaped_samples.requires_grad_(True) + + if noise is None: + vectors = torch.randn_like(reshaped_samples).to(reshaped_samples) + if noise_type == 'radermacher': + vectors = vectors.sign() + elif noise_type == 'sphere': + if variance_reduction: + raise RuntimeError("Noise of type 'sphere' can't be used with variance reduction.") + else: + vectors = vectors / torch.norm(vectors, dim=-1, keepdim=True) * np.sqrt(vectors.shape[-1]) + elif noise_type == 'gaussian': + pass + else: + raise RuntimeError("Noise type not implemented") + else: + vectors = noise + + transformed_samples = self.simulations_net(reshaped_samples) + logp = torch.bmm(etas.unsqueeze(1), transformed_samples.unsqueeze(2)) # way to do batch dot products + logp = logp.sum() + grad1 = autograd.grad(logp, reshaped_samples, create_graph=True)[0] + gradv = torch.sum(grad1 * vectors) + if variance_reduction: + loss1 = torch.norm(grad1, dim=-1) ** 2 * 0.5 # this is the only difference + else: + loss1 = torch.sum(grad1 * vectors, dim=-1) ** 2 * 0.5 + if detach: + loss1 = loss1.detach() + grad2 = autograd.grad(gradv, reshaped_samples, create_graph=True)[0] + loss2 = torch.sum(vectors * grad2, dim=-1) + if detach: + loss2 = loss2.detach() + + loss = (loss1 + loss2).mean() + return loss diff --git a/abcpy/transformers.py b/abcpy/transformers.py index 95ba4675..ef3ea6b3 100644 --- a/abcpy/transformers.py +++ b/abcpy/transformers.py @@ -1,18 +1,43 @@ import numpy as np +from sklearn.preprocessing import MinMaxScaler +try: + import torch +except ImportError: + has_torch = False +else: + has_torch = True -# these transformers are used in the MCMC inference scheme, in order to run MCMC of an unbounded transformed space in -# case the original space is bounded. It therefore also implements the jacobian terms which appear in the acceptance -# rate. + +# The first two transformers are used in the MCMC inference scheme, in order to run MCMC of an unbounded transformed +# space in case the original space is bounded. It therefore also implements the jacobian terms which appear in +# the acceptance rate. BoundedVarScaler is instead used in the training routine for the Exponential Family summary +# statistics learning. class BoundedVarTransformer: """ - This scaler implements both lower bounded and two sided bounded transformations according to the provided bounds; + This scaler implements both lower bounded and two sided bounded transformations according to the provided bounds. + It works on 1d vectors. You need to specify separately the lower and upper bounds in two arrays with the same length + of the objects on which the transformations will be applied (likely the parameters on which MCMC is conducted for + this function). + + If the bounds for a given variable are both None, it is assumed to be unbounded; if instead the + lower bound is given and the upper bound is None, it is assumed to be lower bounded. Finally, if both bounds are + given, it is assumed to be bounded on both sides. """ def __init__(self, lower_bound, upper_bound): - - # upper and lower bounds can be both scalar or array-like with size the size of the variable + """ + Parameters + ---------- + lower_bound : np.ndarray + Array of the same length of the variable to which the transformation will be applied, containing lower + bounds of the variable. 
Each entry of the array can be either None or a number (see above). + upper_bound + Array of the same length of the variable to which the transformation will be applied, containing upper + bounds of the variable. Each entry of the array can be either None or a number (see above). + """ + # upper and lower bounds need to be numpy arrays with size equal to the size of the variable self.lower_bound = lower_bound self.upper_bound = upper_bound @@ -37,21 +62,69 @@ def __init__(self, lower_bound, upper_bound): def logit(x): return np.log(x) - np.log(1 - x) - def _check_data_in_bounds(self, x): - if np.any(x[self.lower_bounded_vars] <= self.lower_bound_lower_bounded): + def _check_data_in_bounds(self, X): + # Takes as input 1d or 2d arrays + X = np.atleast_2d(X) # convert to 2d if needed + if np.any(X[:, self.lower_bounded_vars] <= self.lower_bound_lower_bounded): raise RuntimeError("The provided data are out of the bounds.") - if (x[self.two_sided_bounded_vars] <= self.lower_bound[self.two_sided_bounded_vars]).any() or ( - x[self.two_sided_bounded_vars] >= self.upper_bound_two_sided).any(): + if (X[:, self.two_sided_bounded_vars] <= self.lower_bound[self.two_sided_bounded_vars]).any() or ( + X[:, self.two_sided_bounded_vars] >= self.upper_bound_two_sided).any(): + raise RuntimeError("The provided data is out of the bounds.") - def _apply_nonlinear_transf(self, x): - # apply the different scalers to the different kind of variables: - x_transf = x.copy() - x_transf[self.lower_bounded_vars] = np.log(x[self.lower_bounded_vars] - self.lower_bound_lower_bounded) - x_transf[self.two_sided_bounded_vars] = self.logit( - (x[self.two_sided_bounded_vars] - self.lower_bound_two_sided) / ( + def _apply_nonlinear_transf(self, X): + # apply the different transformations to the different kind of variables. Takes as input 1d or 2d arrays + squeeze = len(X.shape) == 1 + X = np.atleast_2d(X) + X_transf = X.copy() + X_transf[:, self.lower_bounded_vars] = np.log(X[:, self.lower_bounded_vars] - self.lower_bound_lower_bounded) + X_transf[:, self.two_sided_bounded_vars] = self.logit( + (X[:, self.two_sided_bounded_vars] - self.lower_bound_two_sided) / ( self.upper_bound_two_sided - self.lower_bound_two_sided)) - return x_transf + return X_transf.squeeze() if squeeze else X_transf + + def _apply_inverse_nonlinear_transf(self, X): + # inverse transformation. Different transformations are applied to the different kind of variables. + # Takes as input 1d or 2d arrays + squeeze = len(X.shape) == 1 + X = np.atleast_2d(X) + inv_X = X.copy() + inv_X[:, self.two_sided_bounded_vars] = (self.upper_bound_two_sided - self.lower_bound_two_sided) * np.exp( + X[:, self.two_sided_bounded_vars]) / (1 + np.exp( + X[:, self.two_sided_bounded_vars])) + self.lower_bound_two_sided + inv_X[:, self.lower_bounded_vars] = np.exp(X[:, self.lower_bounded_vars]) + self.lower_bound_lower_bounded + return inv_X.squeeze() if squeeze else inv_X + + def _jac_log_det(self, x): + # computes the jacobian log determinant. Takes as input arrays.
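# Derivation of the expression below (for reference): lower-bounded entries are transformed as x -> log(x - l)
# and two-sided entries as x -> log((x - l) / (u - x)); their derivatives are 1 / (x - l) and
# (u - l) / ((x - l) * (u - x)) respectively. The log absolute determinant of the (componentwise) Jacobian is
# therefore the sum of -log(x - l) over the lower-bounded entries plus the sum of
# log((u - l) / ((x - l) * (u - x))) over the two-sided entries, which is what is accumulated in `results`.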
+ results = np.zeros_like(x) + results[self.two_sided_bounded_vars] = np.log( + (self.upper_bound_two_sided - self.lower_bound_two_sided).astype("float64") / ( + (x[self.two_sided_bounded_vars] - self.lower_bound_two_sided) * ( + self.upper_bound_two_sided - x[self.two_sided_bounded_vars]))) + results[self.lower_bounded_vars] = - np.log(x[self.lower_bounded_vars] - self.lower_bound_lower_bounded) + return np.sum(results) + + def _jac_log_det_inverse_transform(self, x): + # computes the log determinant of jacobian evaluated in the inverse transformation. Takes as input arrays. + results = np.zeros_like(x) + results[self.lower_bounded_vars] = - x[self.lower_bounded_vars] + # two sided: need some tricks to avoid numerical issues: + results[self.two_sided_bounded_vars] = - np.log( + self.upper_bound_two_sided - self.lower_bound_two_sided) + + indices = x[self.two_sided_bounded_vars] < 100 # for avoiding numerical overflow + res_b = np.copy(x)[self.two_sided_bounded_vars] + res_b[indices] = np.log(1 + np.exp(x[self.two_sided_bounded_vars][indices])) + results[self.two_sided_bounded_vars] += res_b + + indices = x[self.two_sided_bounded_vars] > - 100 # for avoiding numerical overflow + res_c = np.copy(- x)[self.two_sided_bounded_vars] + res_c[indices] = np.log(1 + np.exp(- x[self.two_sided_bounded_vars][indices])) + results[self.two_sided_bounded_vars] += res_c + + # res = res_b + res_c - res_a + + return np.sum(results) @staticmethod def _array_from_list(x): @@ -70,10 +143,12 @@ def _list_from_array(x_arr, x): def transform(self, x): """Scale features of x according to feature_range. + Parameters ---------- - x : list of len n_parameters + x : list of length n_parameters Input data that will be transformed. + Returns ------- Xt : array-like of shape (n_samples, n_features) @@ -95,10 +170,12 @@ def transform(self, x): def inverse_transform(self, x): """Undo the scaling of x according to feature_range. + Parameters ---------- x : list of len n_parameters Input data that will be transformed. It cannot be sparse. + Returns ------- Xt : array-like of shape (n_samples, n_features) @@ -107,11 +184,7 @@ def inverse_transform(self, x): # now apply the inverse transform x_arr = self._array_from_list(x) - inv_x = x_arr.copy() - inv_x[self.two_sided_bounded_vars] = (self.upper_bound_two_sided - self.lower_bound_two_sided) * np.exp( - x_arr[self.two_sided_bounded_vars]) / (1 + np.exp( - x_arr[self.two_sided_bounded_vars])) + self.lower_bound_two_sided - inv_x[self.lower_bounded_vars] = np.exp(x_arr[self.lower_bounded_vars]) + self.lower_bound_lower_bounded + inv_x = self._apply_inverse_nonlinear_transf(x_arr) # convert back to the list structure: inv_x = self._list_from_array(inv_x, x) @@ -119,12 +192,12 @@ def inverse_transform(self, x): return inv_x def jac_log_det(self, x): - """Returns the log determinant of the Jacobian: log |J_t(x)|. + """Returns the log determinant of the Jacobian: :math:`\log |J_t(x)|`. Parameters ---------- x : list of len n_parameters - Input data, living in the original space (with lower bound constraints). + Input data, living in the original space (with optional bounds). 
Returns ------- res : float @@ -133,49 +206,211 @@ def jac_log_det(self, x): x = self._array_from_list(x) self._check_data_in_bounds(x) - results = np.zeros_like(x) - results[self.two_sided_bounded_vars] = np.log( - (self.upper_bound_two_sided - self.lower_bound_two_sided).astype("float64") / ( - (x[self.two_sided_bounded_vars] - self.lower_bound_two_sided) * ( - self.upper_bound_two_sided - x[self.two_sided_bounded_vars]))) - results[self.lower_bounded_vars] = - np.log(x[self.lower_bounded_vars] - self.lower_bound_lower_bounded) - - return np.sum(results) + return self._jac_log_det(x) def jac_log_det_inverse_transform(self, x): """Returns the log determinant of the Jacobian evaluated in the inverse transform: - log |J_t(t^{-1}(x))| = - log |J_{t^{-1}}(x)| + :math:`\log |J_t(t^{-1}(x))| = - \log |J_{t^{-1}}(x)|` Parameters ---------- x : list of len n_parameters - Input data, living in the transformed space (spanning the whole R^d). + Input data, living in the transformed space (spanning the whole :math:`R^d`). Returns ------- res : float - log determinant of the jacobian evaluated in t^{-1}(x) + log determinant of the jacobian evaluated in :math:`t^{-1}(x)` """ x = self._array_from_list(x) - results = np.zeros_like(x) - results[self.lower_bounded_vars] = - x[self.lower_bounded_vars] - # two sided: need some tricks to avoid numerical issues: - results[self.two_sided_bounded_vars] = - np.log( - self.upper_bound_two_sided - self.lower_bound_two_sided) + return self._jac_log_det_inverse_transform(x) - indices = x[self.two_sided_bounded_vars] < 100 # for avoiding numerical overflow - res_b = np.copy(x)[self.two_sided_bounded_vars] - res_b[indices] = np.log(1 + np.exp(x[self.two_sided_bounded_vars][indices])) - results[self.two_sided_bounded_vars] += res_b - indices = x[self.two_sided_bounded_vars] > - 100 # for avoiding numerical overflow - res_c = np.copy(- x)[self.two_sided_bounded_vars] - res_c[indices] = np.log(1 + np.exp(- x[self.two_sided_bounded_vars][indices])) - results[self.two_sided_bounded_vars] += res_c +class BoundedVarScaler(MinMaxScaler, BoundedVarTransformer): + """ + This scaler implements both lower bounded and two sided bounded transformations according to the provided bounds. + After the nonlinear transformation is applied, we optionally rescale the transformed variables to the (0,1) + range (default for this is True). - # res = res_b + res_c - res_a + It works on 2d vectors. You need to specify separately the lower and upper bounds in two arrays with the same length + of the objects on which the transformations will be applied (likely the simulations used to learn the + exponential family summaries for this one). - return np.sum(results) + If the bounds for a given variable are both None, it is assumed to be unbounded; if instead the + lower bound is given and the upper bound is None, it is assumed to be lower bounded. Finally, if both bounds are + given, it is assumed to be bounded on both sides. + + Practically, this inherits from BoundedVarTransformer, which provides the transformations, and from sklearn + MinMaxScaler, which provides the rescaling capabilities. This class has the same API as sklearn scalers, + implementing fit and transform methods. + """ + + def __init__(self, lower_bound, upper_bound, feature_range=(0, 1), copy=True, rescale_transformed_vars=True): + """ + Parameters + ---------- + lower_bound : np.ndarray + Array of the same length of the variable to which the transformation will be applied, containing lower + bounds of the variable. 
Each entry of the array can be either None or a number (see above). + upper_bound + Array of the same length of the variable to which the transformation will be applied, containing upper + bounds of the variable. Each entry of the array can be either None or a number (see above). + feature_range : tuple (min, max), optional + Desired range of transformed data (obtained with the MinMaxScaler after the + nonlinear transformation is computed). Default=(0, 1) + copy : bool, optional + Set to False to perform inplace row normalization and avoid a + copy in the MinMaxScaler (if the input is already a numpy array). Defaults to True. + rescale_transformed_vars : bool, optional + Whether to apply the MinMaxScaler after the nonlinear transformation. Defaults to True. + """ + BoundedVarTransformer.__init__(self, lower_bound, upper_bound) + + MinMaxScaler.__init__(self, feature_range=feature_range, copy=copy) + self.rescale_transformed_vars = rescale_transformed_vars + + @staticmethod + def _check_reshape_single_sample(x): + """""" + if len(x.shape) == 1: + pass + elif len(x.shape) == 2 and x.shape[0] == 1: + x = x.reshape(-1) + else: + raise RuntimeError("This can be computed for one sample at a time.") + return x + + def fit(self, X, y=None): + """Compute the minimum and maximum to be used for later scaling. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data used to compute the per-feature minimum and maximum + used for later scaling along the features axis. + y : None + Ignored. + + Returns + ------- + self : object + Fitted scaler. + """ + # need to check if we can apply the log first: + if has_torch and isinstance(X, torch.Tensor): + X = X.detach().numpy() + self._check_data_in_bounds(X) + + # we first transform the data with the log transformation and then apply the scaler (optionally): + X = self._apply_nonlinear_transf(X) + + if self.rescale_transformed_vars: + return MinMaxScaler.fit(self, X) + else: + return self + + def transform(self, X): + """Scale features of X according to feature_range. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Input data that will be transformed. + + Returns + ------- + Xt : array-like of shape (n_samples, n_features) + Transformed data. + """ + # need to check if we can apply the log first: + if has_torch and isinstance(X, torch.Tensor): + X = X.detach().numpy() + self._check_data_in_bounds(X) + + # we first transform the data with the log transformation and then apply the scaler (optionally): + X = self._apply_nonlinear_transf(X) + + if self.rescale_transformed_vars: + return MinMaxScaler.transform(self, X) + else: + return X + + def inverse_transform(self, X): + """Undo the scaling of X according to feature_range. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Input data that will be transformed. It cannot be sparse. + + Returns + ------- + Xt : array-like of shape (n_samples, n_features) + Transformed data. + """ + if self.rescale_transformed_vars: + X = MinMaxScaler.inverse_transform(self, X) + + # now apply the inverse transform + inv_X = BoundedVarTransformer._apply_inverse_nonlinear_transf(self, X) + + return inv_X + + def jac_log_det(self, x): + """Returns the log determinant of the Jacobian: :math:`\log |J_t(x)|`. + + Note that this considers only the Jacobian arising from the non-linear transformation, neglecting the scaling + term arising from the subsequent linear rescaling. In fact, the latter does not play any role in MCMC acceptance + rate. 
+ + Parameters + ---------- + x : array-like of shape (n_features) + Input data, living in the original space (with optional bounds). + Returns + ------- + res : float + log determinant of the jacobian + """ + if has_torch and isinstance(x, torch.Tensor): + x = x.detach().numpy() + x = self._check_reshape_single_sample(x) + self._check_data_in_bounds(x.reshape(1, -1)) + + return BoundedVarTransformer._jac_log_det(self, x) + + def jac_log_det_inverse_transform(self, x): + """Returns the log determinant of the Jacobian evaluated in the inverse transform: + :math:`\log |J_t(t^{-1}(x))| = - \log |J_{t^{-1}}(x)|` + + Note that this considers only the Jacobian arising from the non-linear transformation, neglecting the scaling + term arising from the subsequent linear rescaling. In fact, the latter does not play any role in MCMC acceptance + rate. + + Parameters + ---------- + x : array-like of shape (n_features) + Input data, living in the transformed space (spanning the whole :math:`R^d`). It needs to be the value + obtained after the optional linear rescaling is applied. + Returns + ------- + res : float + log determinant of the jacobian evaluated in :math:`t^{-1}(x)` + """ + if has_torch and isinstance(x, torch.Tensor): + x = x.detach().numpy() + + if self.rescale_transformed_vars: + # you still need to apply the inverse linear transformation in the transformed space to compute the jacobian + # for the right value of t^-1(x) (even if the jacobian itself does not take into account the linear + # transformation). Otherwise this function does computes the jacobian in the point obtained by applying the + # nonlinear inverse transformation to the input x, which is not correct if x was rescaled in the (0,1) + # range + x = MinMaxScaler.inverse_transform(self, np.atleast_2d(x)) + + x = self._check_reshape_single_sample(x) + + return BoundedVarTransformer._jac_log_det_inverse_transform(self, x) class DummyTransformer: diff --git a/doc/source/class-diagram.png b/doc/source/class-diagram.png index 33517d0c..dbdf5bd4 100644 Binary files a/doc/source/class-diagram.png and b/doc/source/class-diagram.png differ diff --git a/doc/source/class-diagram.svg b/doc/source/class-diagram.svg index 997e28ef..8f0fb03d 100644 --- a/doc/source/class-diagram.svg +++ b/doc/source/class-diagram.svg @@ -1,23 +1,23 @@ + inkscape:export-ydpi="200" + xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" + xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" + xmlns:xlink="http://www.w3.org/1999/xlink" + xmlns="http://www.w3.org/2000/svg" + xmlns:svg="http://www.w3.org/2000/svg" + xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" + xmlns:cc="http://creativecommons.org/ns#" + xmlns:dc="http://purl.org/dc/elements/1.1/"> @@ -43,9 +43,9 @@ inkscape:window-height="1016" id="namedview2244" showgrid="false" - inkscape:zoom="2.114055" - inkscape:cx="224.26564" - inkscape:cy="473.59898" + inkscape:zoom="1.4142136" + inkscape:cx="375.12015" + inkscape:cy="513.71308" inkscape:window-x="0" inkscape:window-y="27" inkscape:window-maximized="1" @@ -55,7 +55,9 @@ inkscape:snap-bbox-edge-midpoints="true" inkscape:bbox-paths="true" inkscape:bbox-nodes="true" - inkscape:snap-intersection-paths="true" /> + inkscape:snap-intersection-paths="true" + inkscape:pagecheckerboard="0" + inkscape:document-units="pt" /> - - - - - - - - - - - - - - - - - - - - - - + transform="translate(7.8100894e-4,27.000779)"> - - - + transform="translate(7.8100894e-4,43.500779)" /> + d="m 336.2,354.89305 -16,0.0447 0,-151.23772 h -80 v 16" + 
style="fill:none;stroke:#000000;stroke-width:0.4;stroke-linecap:butt;stroke-linejoin:round;stroke-miterlimit:10;stroke-opacity:1" + sodipodi:nodetypes="ccccc" /> - - - ContrastiveDistLearn SemiautomaticNN - TripletDistLearn + transform="translate(7.8100894e-4,-132.99922)"> @@ -6562,7 +6412,7 @@ clip-path="url(#clip46)" id="g4328" style="clip-rule:nonzero" - transform="translate(7.8100894e-4,-76.494221)"> + transform="translate(7.8100894e-4,-92.994221)"> - StatisticsLearning + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + StatisticsLearning + get statistics() + + + + + + + + + ExpFamScoreMatching + + + + ContrastiveDistLearn + + ContrastiveDistLearn + + + + TripletDistLearn + + TripletDistLearn + + + + diff --git a/doc/source/getting_started.rst b/doc/source/getting_started.rst index 392e5a1a..5f2ccf81 100644 --- a/doc/source/getting_started.rst +++ b/doc/source/getting_started.rst @@ -131,7 +131,7 @@ Note that the model and the observations are given as a list. This is due to the have hierarchical models, building relationships between co-occurring groups of datasets. To learn more, see the `Hierarchical Model`_ section. -The full source can be found in `examples/extensions/models/gaussian_python/pmcabc_gaussian_model_simple.py`. To +The full source can be found in `examples/extensions/models/gaussian_python/pmcabc_gaussian_model_simple.py `_. To execute the code you only need to run :: @@ -249,7 +249,7 @@ different observed data sets respectively. Also notice now we provide two differ different root models and their observed datasets. Presently ABCpy combines the distances by a linear combination, however customized combination strategies can be implemented by the user. -The full source code can be found in `examples/hierarchicalmodels/pmcabc_inference_on_multiple_sets_of_obs.py`. +The full source code can be found in `examples/hierarchicalmodels/pmcabc_inference_on_multiple_sets_of_obs.py `_. Composite Perturbation Kernels @@ -280,7 +280,7 @@ you would like to implement your own perturbation kernel, please check :ref:`Imp `. Please keep in mind that you can only perturb parameters. **You cannot use the access operator to perturb one component of a multi-dimensional random variable differently than another component of the same variable.** -The source code to this section can be found in `examples/extensions/perturbationkernels/pmcabc_perturbation_kernels.py` +The source code to this section can be found in `examples/extensions/perturbationkernels/pmcabc_perturbation_kernels.py `_. Inference Schemes ~~~~~~~~~~~~~~~~~ @@ -316,6 +316,8 @@ the following methods methods: * Semiparametric Synthetic likelihood :py:class:`abcpy.approx_lhd.SemiParametricSynLikelihood`, and another method using * penalized logistic regression :py:class:`abcpy.approx_lhd.PenLogReg`. +Finally, we also include a utility class with a similar API as the previous inference schemes which allows to sample from the prior: :py:class:`abcpy.inferences.DrawFromPrior`. + Next we explain how we can use PMC algorithm using approximation of the likelihood functions. As we are now considering two observed datasets corresponding to two root models, we need to define an approximation of @@ -341,9 +343,9 @@ different observed data sets respectively. Also notice we now provide two differ different root models and their observed datasets. Presently ABCpy combines the distances by a linear combination. Further possibilities of combination will be made available in later versions of ABCpy. 
-The source code can be found in `examples/approx_lhd/pmc_hierarchical_models.py`. +The source code can be found in `examples/approx_lhd/pmc_hierarchical_models.py `_. -For an example using MCMC instead of PMC, check out `examples/approx_lhd/mcmc_hierarchical_models.py`. +For an example using MCMC instead of PMC, check out `examples/approx_lhd/mcmc_hierarchical_models.py `_. Statistics Learning ~~~~~~~~~~~~~~~~~~~ @@ -360,11 +362,18 @@ step. The following techniques are available: * SemiautomaticNN :py:class:`abcpy.statisticslearning.SemiautomaticNN`, * ContrastiveDistanceLearning :py:class:`abcpy.statisticslearning.ContrastiveDistanceLearning`, * TripletDistanceLearning :py:class:`abcpy.statisticslearning.TripletDistanceLearning`. +* ExponentialFamilyScoreMatching :py:class:`abcpy.statisticslearning.ExponentialFamilyScoreMatching`. The first two build a transformation that approximates the parameter that generated the corresponding observation, the -first one by using a linear regression approach and the second one by using a neural network embedding. The other two +first one by using a linear regression approach and the second one by using a neural network embedding. The two distance +learning approaches use instead neural networks to learn an embedding of the data so that the distance between the embeddings is close to -the distance between the parameter values that generated the data. +the distance between the parameter values that generated the data. Finally, the last one fits an exponential family +approximation to the likelihood using the generated data, and uses as summary statistics the sufficient statistics of +the approximating family. Two neural networks are used here in the training phase, one to learn the summary +statistics and one to transform +the parameters to the natural parametrization of the learned exponential family (but only the second neural network will +be used when the statistics are used in inference). We remark that the techniques using neural networks require `Pytorch `_ to be installed. As this is an optional feature, however, Pytorch is not in the list of dependencies of ABCpy. Rather, when one of the neural network based routines is @@ -403,10 +412,28 @@ We remark that the minimal amount of coding needed for using the neural network :lines: 64-72 :dedent: 4 -And similarly for the other two approaches. +And similarly for the other approaches. + +We remark how :py:class:`abcpy.statisticslearning.SemiautomaticNN` (as well as the other NN-based statistics learning approaches) allow to specify a neural network through the optional `embedding_net` parameter (in :py:class:`abcpy.statisticslearning.ExponentialFamilyScoreMatching`, you analogously have `simulations_net` and `parameters_net`). According to the value given to `embedding_net`, different NNs are used: + +* a torch.nn object can be passed to `embedding_net` to be used as the NN to learn summary statistics. +* Alternatively, a list with some integer numbers denoting the width of the hidden layers of a fully connected NN can be specified (with the length of the list corresponding to the number of hidden layers). In this case, the input and output sizes are determined so that things work correctly: input size correspond to the data size after the provided `statistics_calculator` has been applied, while output size corresponds to the number of parameters in the model. The function taking care of instantiating the NN is :py:func:`abcpy.NN_utilities.networks.createDefaultNN`. 
+* If `embedding_net` is not specified, the behavior is similar to the latter bullet point, but with the number of hidden sizes fixed to 3 and their width determined as: ``[int(input_size * 1.5), int(input_size * 0.75 + output_size * 3), int(output_size * 5)]``. + +The parameters `simulations_net` and `parameters_net` of :py:class:`abcpy.statisticslearning.ExponentialFamilyScoreMatching` have a similar behavior, with the former network still taking as input the data after `statistics_calculator` has been applied, while the latter taking as input the parameters; additionally, here the embedding size can be chosen arbitrarily through the argument `embedding_dimension`, for which the default value is the number of parameters in the model. You can find more information in the docstring for :py:class:`abcpy.statisticslearning.ExponentialFamilyScoreMatching`, or in the example in `examples/statisticslearning/gaussian_statistics_learning_exponential_family.py `_. We can then perform the inference as before, but the distances will be computed on the newly learned summary statistics. +The above summary statistics learning routines can also be initialized with previously generated parameter-observation pairs (this can be useful for instance when different statistics learning techniques need to be tested with the same training dataset). The :py:class:`abcpy.inferences.DrawFromPrior` class can be used to generate such training data. We provide an example showcasing this in `examples/statisticslearning/gaussian_statistics_learning_DrawFromPrior_reload_NNs.py `_. + +Finally, when we use a neural network-based summary statistics learning approach, the neural network can be stored to disk and loaded later. This is done in the following chunk of code (taken from an example shipped with code); notice the use of the :py:class:`abcpy.statistics.NeuralEmbedding` Statistics class: + +.. literalinclude:: ../../examples/statisticslearning/gaussian_statistics_learning_DrawFromPrior_reload_NNs.py + :language: python + :lines: 83-84, 86-88, 91-97 + :dedent: 4 + + Model Selection ~~~~~~~~~~~~~~~ diff --git a/doc/source/postanalysis.rst b/doc/source/postanalysis.rst index 260cea05..3f6dcd55 100644 --- a/doc/source/postanalysis.rst +++ b/doc/source/postanalysis.rst @@ -4,31 +4,33 @@ ================ The output of an inference scheme is a Journal -(:py:class:``abcpy.output.Journal``) which holds all the necessary results and +(:py:class:`abcpy.output.Journal`) which holds all the necessary results and convenient methods to do the post analysis. -For example, one can easily access the sampled parameters and corresponding -weights using: +Basis Analysis +~~~~~~~~~~~~~~ + +One can easily access the sampled parameters and corresponding weights using: .. literalinclude:: ../../examples/backends/dummy/pmcabc_gaussian.py :language: python :lines: 77-78 :dedent: 4 -The output of ``get_parameters()`` is a Python dictionary. The keys for this dictionary are the names you specified for the parameters. The corresponding values are the marginal posterior samples of that parameter. Here is a short example of what you would specify, and what would be the output in the end: +The output of :py:meth:`get_parameters()` is a Python dictionary. The keys for this dictionary are the names you specified for the parameters. The corresponding values are the marginal posterior samples of that parameter. Here is a short example of what you would specify, and what would be the output in the end: .. 
code-block:: python a = Normal([[1],[0.1]], name='parameter_1') b = MultivariateNormal([[1,1],[[0.1,0],[0,0.1]]], name='parameter_2') -If one defined a model with these two parameters as inputs and ``n_sample=2``, the following would be the output of ``journal.get_parameters()``: +If one defined a model with these two parameters as inputs and ``n_sample=2``, the following would be the output of :py:meth:`journal.get_parameters()`: .. code-block:: python {'parameter_1' : [[0.95],[0.97]], 'parameter_2': [[0.98,1.03],[1.06,0.92]]} -These are samples at the final step of ABC algorithm. If you want samples from the earlier steps you can get a Python dictionary for that step by using: +These are samples at the final step of the ABC algorithm. If you want samples from the earlier steps of a sequential algorithm you can get a Python dictionary for that step by using: .. code-block:: python @@ -56,7 +58,18 @@ algorithm that created it: :lines: 85 :dedent: 4 -Finally, you can plot the inferred posterior distribution of the parameters in the following way: +And certainly, a journal can easily be saved to and loaded from disk: + +.. literalinclude:: ../../examples/backends/dummy/pmcabc_gaussian.py + :language: python + :lines: 91, 94 + :dedent: 4 + +Posterior plots and diagnostics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + +You can plot the inferred posterior distribution of the parameters in the following way: .. literalinclude:: ../../examples/backends/dummy/pmcabc_gaussian.py :language: python @@ -83,9 +96,47 @@ distributions defined by the samples and weights at subsequent iterations: journal.Wass_convergence_plot() -And certainly, a journal can easily be saved to and loaded from disk: +Instead, for journals generated by MCMC, we provide a way to plot the traceplot for the required parameters: + +.. code-block:: python + + journal.traceplot() + + + +Posterior resampling and predictive check +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In some cases, you may want to resample (for instance, bootstrapping or subsampling) the posterior samples stored in a +Journal, by taking into account the posterior weights. +This can be done using the :py:meth:`resample()` method. Behind the scenes, +this uses the numpy.random.choice method, and it inherits arguments from it. It allows to do different things, +for instance: + + +* if the set of posterior samples (weighted or unweighted) is too large, you can obtain a subsampled (without replacement) set by doing: + +.. code-block:: python + + new_journal = journal.resample(n_samples=100, replace=False) + + +* Alternatively, if the used algorithm returns weighted posterior samples, you may want instead an unweighted set of samples obtained by sampling with replacement (commonly called bootstrapping); this can be done with the following line (where the number of required bootstrapped samples in the new journal is unspecified and therefore corresponds to the number of samples in the old ``journal``): + +.. code-block:: python + + new_journal = journal.resample() + + +Finally, in some cases you may want to generate simulations from the model for parameter values sampled from the posterior, +for instance in order to check similarity with the original observation (predictive check). ABCpy provides the +:py:class:`output.GenerateFromJournal` to do that.
This class needs to be instanstiated +by providing to it the model and the backend which you want to use for the simulation; then, you can pass a Journal as argument to the +:py:meth:`generate()` method in order to generate simulations from the +posterior samples contained there: + +.. code-block:: python + + generate_from_journal = GenerateFromJournal([model], backend=backend) + parameters, simulations, normalized_weights = generate_from_journal.generate(journal) -.. literalinclude:: ../../examples/backends/dummy/pmcabc_gaussian.py - :language: python - :lines: 91, 94 - :dedent: 4 diff --git a/doc/source/user_customization.rst b/doc/source/user_customization.rst index 20636321..827cfa32 100644 --- a/doc/source/user_customization.rst +++ b/doc/source/user_customization.rst @@ -407,7 +407,7 @@ calculator should be provided. The following header conforms to this idea: .. literalinclude:: ../../abcpy/distances.py :language: python - :lines: 15,27-33 + :lines: 16,28-34 :dedent: 4 Then, we need to define how the distance is calculated. We need first to compute the summary statistics from the datasets and after compute the distance between the summary statistics. Notice that we use the private method :py:meth:`Distance._calculate_summary_stat ` to compute the statistics from the dataset; internally, this saves the first dataset and the corresponding summary statistics while computing the summary statistics. In fact, we always pass the observed dataset first to the @@ -416,14 +416,14 @@ compute it once and store it internally. At each call of the ``distance`` method .. literalinclude:: ../../abcpy/distances.py :language: python - :lines: 152-176 + :lines: 169-193 :dedent: 4 Finally, we need to define the maximal distance that can be obtained from this distance measure. .. literalinclude:: ../../abcpy/distances.py :language: python - :lines: 178-185 + :lines: 195-202 :dedent: 4 The newly defined distance class can be used in the same way as the already existing once. The complete example for this @@ -494,7 +494,7 @@ Let us now look at the implementation of the method: .. literalinclude:: ../../abcpy/perturbationkernel.py :language: python - :lines: 247-279 + :lines: 246-278 :dedent: 4 Some of the implemented inference algorithms weigh different sets of parameters differently. Therefore, if such weights @@ -523,7 +523,7 @@ Here the implementation for our kernel: .. literalinclude:: ../../abcpy/perturbationkernel.py :language: python - :lines: 281-329 + :lines: 280-328 :dedent: 4 The first line shows how you obtain the values of the parameters that your kernel should perturb. These values are @@ -540,7 +540,7 @@ This method is implemented as follows for the multivariate normal: .. literalinclude:: ../../abcpy/perturbationkernel.py :language: python - :lines: 331-358 + :lines: 330-357 :dedent: 4 We simply obtain the parameter values and covariance matrix for this kernel and calculate the probability density diff --git a/examples/statisticslearning/gaussian_statistics_learning_DrawFromPrior_reload_NNs.py b/examples/statisticslearning/gaussian_statistics_learning_DrawFromPrior_reload_NNs.py new file mode 100644 index 00000000..3363ade6 --- /dev/null +++ b/examples/statisticslearning/gaussian_statistics_learning_DrawFromPrior_reload_NNs.py @@ -0,0 +1,154 @@ +import logging + +import numpy as np + + +def infer_parameters(steps=2, n_sample=50, n_samples_per_param=1, logging_level=logging.WARN): + """Perform inference for this example. 
+ + Parameters + ---------- + steps : integer, optional + Number of iterations in the sequential PMCABC algorithm ("generations"). The default value is 3 + n_samples : integer, optional + Number of posterior samples to generate. The default value is 250. + n_samples_per_param : integer, optional + Number of data points in each simulated data set. The default value is 10. + + Returns + ------- + abcpy.output.Journal + A journal containing simulation results, metadata and optionally intermediate results. + """ + logging.basicConfig(level=logging_level) + # define backend + # Note, the dummy backend does not parallelize the code! + from abcpy.backends import BackendDummy as Backend + backend = Backend() + + # define observation for true parameters mean=170, std=15 + height_obs = [160.82499176, 167.24266737, 185.71695756, 153.7045709, 163.40568812, 140.70658699, 169.59102084, + 172.81041696, 187.38782738, 179.66358934, 176.63417241, 189.16082803, 181.98288443, 170.18565017, + 183.78493886, 166.58387299, 161.9521899, 155.69213073, 156.17867343, 144.51580379, 170.29847515, + 197.96767899, 153.36646527, 162.22710198, 158.70012047, 178.53470703, 170.77697743, 164.31392633, + 165.88595994, 177.38083686, 146.67058471763457, 179.41946565658628, 238.02751620619537, + 206.22458790620766, 220.89530574344568, 221.04082532837026, 142.25301427453394, 261.37656571434275, + 171.63761180867033, 210.28121820385866, 237.29130237612236, 175.75558340169619, 224.54340549862235, + 197.42448680731226, 165.88273684581381, 166.55094082844519, 229.54308602661584, 222.99844054358519, + 185.30223966014586, 152.69149367593846, 206.94372818527413, 256.35498655339154, 165.43140916577741, + 250.19273595481803, 148.87781549665536, 223.05547559193792, 230.03418198709608, 146.13611923127021, + 138.24716809523139, 179.26755740864527, 141.21704876815426, 170.89587081800852, 222.96391329259626, + 188.27229523693822, 202.67075179617672, 211.75963110985992, 217.45423324370509] + + # define prior + from abcpy.continuousmodels import Uniform + mu = Uniform([[150], [200]], name="mu") + sigma = Uniform([[5], [25]], name="sigma") + + # define the model + from abcpy.continuousmodels import Normal + height = Normal([mu, sigma], ) + + # 1) generate simulations from prior + from abcpy.inferences import DrawFromPrior + draw_from_prior = DrawFromPrior([height], backend=backend) + + # notice the use of the `.sample_par_sim_pairs` method rather than `.sample` to obtain data suitably formatted + # for the summary statistics learning routines + parameters, simulations = draw_from_prior.sample_par_sim_pairs(100, n_samples_per_param=1) + # if you want to use the test loss to do early stopping in the training: + parameters_val, simulations_val = draw_from_prior.sample_par_sim_pairs(100, n_samples_per_param=1) + # discard the mid dimension (n_samples_per_param, as the StatisticsLearning classes use that =1) + simulations = simulations.reshape(simulations.shape[0], simulations.shape[2]) + simulations_val = simulations_val.reshape(simulations_val.shape[0], simulations_val.shape[2]) + + # 2) now train the NNs with the different methods with the generated data + from abcpy.statistics import Identity + identity = Identity() # to apply before computing the statistics + + logging.info("semiNN") + from abcpy.statisticslearning import SemiautomaticNN, TripletDistanceLearning + semiNN = SemiautomaticNN([height], identity, backend=backend, parameters=parameters, + simulations=simulations, parameters_val=parameters_val, simulations_val=simulations_val, + 
early_stopping=True, # early stopping + seed=1, n_epochs=10, scale_samples=False, use_tqdm=False) + logging.info("triplet") + triplet = TripletDistanceLearning([height], identity, backend=backend, parameters=parameters, + simulations=simulations, parameters_val=parameters_val, + simulations_val=simulations_val, + early_stopping=True, # early stopping + seed=1, n_epochs=10, scale_samples=True, use_tqdm=False) + + # 3) save and re-load NNs: + # get the statistics from the already fit StatisticsLearning object 'semiNN': + learned_seminn_stat = semiNN.get_statistics() + learned_triplet_stat = triplet.get_statistics() + + # this has a save net method: + learned_seminn_stat.save_net("seminn_net.pth") + # if you used `scale_samples=True` in learning the NNs, need to provide a path where pickle stores the scaler too: + learned_triplet_stat.save_net("triplet_net.pth", path_to_scaler="scaler.pkl") + + # to reload: need to use the Neural Embedding statistics fromFile; this needs to know which kind of NN it is using; + # need therefore to pass either the input/output size (it data size and number parameters) or the network class if + # that was specified explicitly in the StatisticsLearning class. Check the docstring for NeuralEmbedding.fromFile + # for more details. + from abcpy.statistics import NeuralEmbedding + learned_seminn_stat_loaded = NeuralEmbedding.fromFile("seminn_net.pth", input_size=1, output_size=2) + learned_triplet_stat_loaded = NeuralEmbedding.fromFile("triplet_net.pth", input_size=1, output_size=2, + path_to_scaler="scaler.pkl") + + # 4) you can optionally rescale the different summary statistics be their standard deviation on a reference dataset + # of simulations. To do this, it is enough to pass at initialization the reference dataset, and the rescaling will + # be applied every time the statistics is computed on some simulation or observation. + learned_triplet_stat_loaded = NeuralEmbedding.fromFile("triplet_net.pth", input_size=1, output_size=2, + path_to_scaler="scaler.pkl", + reference_simulations=simulations_val) + + # 5) perform inference + # define distance + from abcpy.distances import Euclidean + distance_calculator = Euclidean(learned_seminn_stat_loaded) + + # define kernel + from abcpy.perturbationkernel import DefaultKernel + kernel = DefaultKernel([mu, sigma]) + + # define sampling scheme + from abcpy.inferences import PMCABC + sampler = PMCABC([height], [distance_calculator], backend, kernel, seed=1) + + eps_arr = np.array([500]) # starting value of epsilon; the smaller, the slower the algorithm. 
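    # (editor's note: 500 is presumably chosen large enough that the first generation accepts
    # essentially every simulation; the thresholds for later generations are then tightened
    # adaptively as described in the comment below)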
+ # at each iteration, take as epsilon the epsilon_percentile of the distances obtained by simulations at previous + # iteration from the observation + epsilon_percentile = 10 + journal = sampler.sample([height_obs], steps, eps_arr, n_sample, n_samples_per_param, epsilon_percentile) + + return journal + + +def analyse_journal(journal): + # output parameters and weights + print(journal.opt_values) + print(journal.get_weights()) + + # do post analysis + print(journal.posterior_mean()) + print(journal.posterior_cov()) + + # print configuration + print(journal.configuration) + + # plot posterior + journal.plot_posterior_distr(path_to_save="posterior.png") + + # save and load journal + journal.save("experiments.jnl") + + from abcpy.output import Journal + new_journal = Journal.fromFile('experiments.jnl') + + +if __name__ == "__main__": + journal = infer_parameters(logging_level=logging.INFO) + analyse_journal(journal) diff --git a/examples/statisticslearning/gaussian_statistics_learning_exponential_family.py b/examples/statisticslearning/gaussian_statistics_learning_exponential_family.py new file mode 100644 index 00000000..c722589a --- /dev/null +++ b/examples/statisticslearning/gaussian_statistics_learning_exponential_family.py @@ -0,0 +1,154 @@ +import logging + +import numpy as np + + +def infer_parameters(steps=2, n_sample=50, n_samples_per_param=1, logging_level=logging.WARN): + """Perform inference for this example. + + Parameters + ---------- + steps : integer, optional + Number of iterations in the sequential PMCABC algorithm ("generations"). The default value is 3 + n_samples : integer, optional + Number of posterior samples to generate. The default value is 250. + n_samples_per_param : integer, optional + Number of data points in each simulated data set. The default value is 10. + + Returns + ------- + abcpy.output.Journal + A journal containing simulation results, metadata and optionally intermediate results. + """ + logging.basicConfig(level=logging_level) + # define backend + # Note, the dummy backend does not parallelize the code! 
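    # (editor's note: for larger runs one would typically use a parallel backend here instead,
    # e.g. the MPI or Spark backends shipped with ABCpy, provided the corresponding dependencies
    # are installed; this remark is an editorial addition, not part of the original example)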
+ from abcpy.backends import BackendDummy as Backend + backend = Backend() + + # define observation for true parameters mean=170, std=15 + height_obs = [160.82499176, 167.24266737, 185.71695756, 153.7045709, 163.40568812, 140.70658699, 169.59102084, + 172.81041696, 187.38782738, 179.66358934, 176.63417241, 189.16082803, 181.98288443, 170.18565017, + 183.78493886, 166.58387299, 161.9521899, 155.69213073, 156.17867343, 144.51580379, 170.29847515, + 197.96767899, 153.36646527, 162.22710198, 158.70012047, 178.53470703, 170.77697743, 164.31392633, + 165.88595994, 177.38083686, 146.67058471763457, 179.41946565658628, 238.02751620619537, + 206.22458790620766, 220.89530574344568, 221.04082532837026, 142.25301427453394, 261.37656571434275, + 171.63761180867033, 210.28121820385866, 237.29130237612236, 175.75558340169619, 224.54340549862235, + 197.42448680731226, 165.88273684581381, 166.55094082844519, 229.54308602661584, 222.99844054358519, + 185.30223966014586, 152.69149367593846, 206.94372818527413, 256.35498655339154, 165.43140916577741, + 250.19273595481803, 148.87781549665536, 223.05547559193792, 230.03418198709608, 146.13611923127021, + 138.24716809523139, 179.26755740864527, 141.21704876815426, 170.89587081800852, 222.96391329259626, + 188.27229523693822, 202.67075179617672, 211.75963110985992, 217.45423324370509] + + # define prior + from abcpy.continuousmodels import Uniform + mu = Uniform([[150], [200]], name="mu") + sigma = Uniform([[5], [25]], name="sigma") + + # define the model + from abcpy.continuousmodels import Normal + height = Normal([mu, sigma], ) + + # 1) generate simulations from prior + from abcpy.inferences import DrawFromPrior + draw_from_prior = DrawFromPrior([height], backend=backend) + + # notice the use of the `.sample_par_sim_pairs` method rather than `.sample` to obtain data suitably formatted + # for the summary statistics learning routines + parameters, simulations = draw_from_prior.sample_par_sim_pairs(100, n_samples_per_param=1) + # if you want to use the test loss to do early stopping in the training: + parameters_val, simulations_val = draw_from_prior.sample_par_sim_pairs(100, n_samples_per_param=1) + # discard the mid dimension (n_samples_per_param, as the StatisticsLearning classes use that =1) + simulations = simulations.reshape(simulations.shape[0], simulations.shape[2]) + simulations_val = simulations_val.reshape(simulations_val.shape[0], simulations_val.shape[2]) + + # 2) now train the NNs with the different methods with the generated data + from abcpy.statistics import Identity + identity = Identity() # to apply before computing the statistics + + logging.info("Exponential family statistics") + from abcpy.statisticslearning import ExponentialFamilyScoreMatching + exp_fam_stats = ExponentialFamilyScoreMatching( + [height], identity, backend=backend, parameters=parameters, + simulations=simulations, parameters_val=parameters_val, + simulations_val=simulations_val, + early_stopping=True, # early stopping + seed=1, n_epochs=10, batch_size=10, + scale_samples=False, # whether to rescale the samples to (0,1) before NN + scale_parameters=True, # whether to rescale the parameters to (0,1) before NN + embedding_dimension=3, # embedding dimension of both simulations and parameter networks (equal to # statistics) + sliced=True, # quicker version of score matching + use_tqdm=False # do not use tqdm to display progress + ) + + # 3) save and re-load NNs: + # get the statistics from the already fit StatisticsLearning object 'exp_fam_stats'; if rescale_statistics=True, + 
# the training (or validation, if available) set is used to compute the standard deviation of the different + # statistics, and the statistics on each new observation/simulation are then rescaled by that. This is useful as the + # summary statistics obtained with this method can vary much in magnitude. + learned_stat = exp_fam_stats.get_statistics(rescale_statistics=True) + + # this has a save net method: + # if you used `scale_samples=True` in learning the NNs (or if the simulations were bounded, which is not the case + # here), you need to provide a path where pickle stores the scaler too: + learned_stat.save_net("exp_fam_stats.pth", path_to_scaler="scaler.pkl") + + # to reload: need to use the Neural Embedding statistics fromFile; this needs to know which kind of NN it is using; + # need therefore to pass either the input/output size (it data size and number parameters) or the network class if + # that was specified explicitly in the StatisticsLearning class. Check the docstring for NeuralEmbedding.fromFile + # for more details. + # Output size has to be here the embedding_dimension used in learning the summaries plus 1. The last component is + # a base measure which is automatically discarded when the summary statistics are used. + # Additionally, if you want to rescale the statistics by their standard deviation as discussed above, you need here + # to explicitly provide the set used for estimating the standard deviation in the argument `reference_simulations` + from abcpy.statistics import NeuralEmbedding + learned_stat_loaded = NeuralEmbedding.fromFile("exp_fam_stats.pth", input_size=1, output_size=4, + reference_simulations=simulations_val) + + # 4) perform inference + # define distance + from abcpy.distances import Euclidean + distance_calculator = Euclidean(learned_stat_loaded) + + # define kernel + from abcpy.perturbationkernel import DefaultKernel + kernel = DefaultKernel([mu, sigma]) + + # define sampling scheme + from abcpy.inferences import PMCABC + sampler = PMCABC([height], [distance_calculator], backend, kernel, seed=1) + + eps_arr = np.array([500]) # starting value of epsilon; the smaller, the slower the algorithm. 
+ # at each iteration, take as epsilon the epsilon_percentile of the distances obtained by simulations at previous + # iteration from the observation + epsilon_percentile = 10 + journal = sampler.sample([height_obs], steps, eps_arr, n_sample, n_samples_per_param, epsilon_percentile) + + return journal + + +def analyse_journal(journal): + # output parameters and weights + print(journal.opt_values) + print(journal.get_weights()) + + # do post analysis + print(journal.posterior_mean()) + print(journal.posterior_cov()) + + # print configuration + print(journal.configuration) + + # plot posterior + journal.plot_posterior_distr(path_to_save="posterior.png") + + # save and load journal + journal.save("experiments.jnl") + + from abcpy.output import Journal + new_journal = Journal.fromFile('experiments.jnl') + + +if __name__ == "__main__": + journal = infer_parameters(logging_level=logging.INFO) + analyse_journal(journal) diff --git a/examples/statisticslearning/pmcabc_gaussian_statistics_learning.py b/examples/statisticslearning/pmcabc_gaussian_statistics_learning.py index 387f7207..abbcc29b 100644 --- a/examples/statisticslearning/pmcabc_gaussian_statistics_learning.py +++ b/examples/statisticslearning/pmcabc_gaussian_statistics_learning.py @@ -65,7 +65,7 @@ def infer_parameters(steps=3, n_sample=250, n_samples_per_param=10, logging_leve # we use 200 samples as a validation set for early stopping: from abcpy.statisticslearning import SemiautomaticNN statistics_learning = SemiautomaticNN([height], statistics_calculator, backend, - n_samples=1000, n_samples_val=200, + n_samples=1000, n_samples_val=200, n_epochs=20, use_tqdm=False, n_samples_per_param=1, seed=1, early_stopping=True) # Redefine the statistics function diff --git a/tests/NN_utilities_networks_tests.py b/tests/NN_utilities_networks_tests.py new file mode 100644 index 00000000..0f6a7733 --- /dev/null +++ b/tests/NN_utilities_networks_tests.py @@ -0,0 +1,82 @@ +import unittest + +try: + import torch +except ImportError: + has_torch = False +else: + has_torch = True + from abcpy.NN_utilities.networks import createDefaultNNWithDerivatives, createDefaultNN, DiscardLastOutputNet + from abcpy.NN_utilities.utilities import jacobian_second_order, jacobian, jacobian_hessian + + +class test_default_NN_with_derivatives(unittest.TestCase): + + def setUp(self): + if has_torch: + self.net = createDefaultNNWithDerivatives(5, 2, nonlinearity=torch.nn.Softplus)() + self.net_first_der_only = createDefaultNNWithDerivatives(5, 2, nonlinearity=torch.nn.Softplus, + first_derivative_only=True)() + self.tensor = torch.randn((10, 5), requires_grad=True) + + def test_first_der(self): + if has_torch: + # compute derivative with forward pass + y, f1 = self.net_first_der_only.forward_and_derivatives(self.tensor) + f2 = jacobian(self.tensor, y) + + assert torch.allclose(f1, f2) + + def test_first_and_second_der(self): + if has_torch: + # compute derivative with forward pass + y, f1, s1 = self.net.forward_and_derivatives(self.tensor) + f2, s2 = jacobian_second_order(self.tensor, y) + + assert torch.allclose(f1, f2) + assert torch.allclose(s1, s2) + + def test_first_der_and_hessian(self): + if has_torch: + # compute derivative with forward pass + y, f1, H1 = self.net.forward_and_full_derivatives(self.tensor) + f2, H2 = jacobian_hessian(self.tensor, y) + + assert torch.allclose(f1, f2) + assert torch.allclose(H1, H2) + + def test_error(self): + if has_torch: + with self.assertRaises(RuntimeError): + self.net = createDefaultNNWithDerivatives(5, 2, 
nonlinearity=torch.nn.Softsign)() + + +class test_discard_last_output_wrapper(unittest.TestCase): + + def setUp(self): + if has_torch: + self.net = createDefaultNN(2, 3)() + self.net_with_discard_wrapper = DiscardLastOutputNet(self.net) + # reference input and output + torch.random.manual_seed(1) + self.tensor_1 = torch.randn(2) + self.tensor_2 = torch.randn(1, 2) + self.tensor_3 = torch.randn(1, 3, 2) + + def test_output(self): + if has_torch: + out = self.net(self.tensor_1) + out_discard = self.net_with_discard_wrapper(self.tensor_1) + self.assertTrue(torch.allclose(out[:-1], out_discard)) + + out = self.net(self.tensor_2) + out_discard = self.net_with_discard_wrapper(self.tensor_2) + self.assertTrue(torch.allclose(out[:, :-1], out_discard)) + + out = self.net(self.tensor_3) + out_discard = self.net_with_discard_wrapper(self.tensor_3) + self.assertTrue(torch.allclose(out[:, :, :-1], out_discard)) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/NN_utilities_utilities_tests.py b/tests/NN_utilities_utilities_tests.py new file mode 100644 index 00000000..37663af2 --- /dev/null +++ b/tests/NN_utilities_utilities_tests.py @@ -0,0 +1,100 @@ +import unittest + +import numpy as np + +from abcpy.statistics import Identity, LinearTransformation, NeuralEmbedding + +try: + import torch +except ImportError: + has_torch = False +else: + has_torch = True + from abcpy.NN_utilities.utilities import jacobian_second_order, jacobian, jacobian_hessian + from abcpy.NN_utilities.networks import createDefaultNN + + +class test_jacobian_functions(unittest.TestCase): + # it tests that this gives the correct errors and that the result is same if you put diffable=True or False. + # it does not test that the derivatives numerical errors are correct but they are. 
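    # (editor's note: `diffable` is assumed here to control whether the returned derivatives keep
    # the autograd graph, i.e. whether they can be differentiated further; the tests below only
    # assert that both settings return numerically identical values)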
+ + def setUp(self): + if has_torch: + net = createDefaultNN(5, 2, nonlinearity=torch.nn.Softplus(), batch_norm_last_layer=False)() + net_bn = createDefaultNN(5, 2, nonlinearity=torch.nn.Softplus(), batch_norm_last_layer=True)() + self.tensor = torch.randn((10, 5), requires_grad=True) + self.y = net(self.tensor) + self.y_bn = net_bn(self.tensor) + + self.y_with_infinities = self.y.detach().clone() + self.y_with_infinities[0, 0] = np.inf + + self.f, self.s = jacobian_second_order(self.tensor, self.y) # reference derivatives + self.f_bn, self.s_bn = jacobian_second_order(self.tensor, self.y_bn) # reference derivatives + + def test_first_der(self): + if has_torch: + # compute derivative with forward pass + f2 = jacobian(self.tensor, self.y, diffable=False) + + assert torch.allclose(self.f, f2) + + def test_first_and_second_der(self): + if has_torch: + # compute derivative with forward pass + f2, s2 = jacobian_second_order(self.tensor, self.y, diffable=False) + + assert torch.allclose(self.f, f2) + assert torch.allclose(self.s, s2) + + def test_first_der_and_hessian(self): + if has_torch: + # compute derivative with forward pass + f1, H1 = jacobian_hessian(self.tensor, self.y) + f2, H2 = jacobian_hessian(self.tensor, self.y, diffable=False) + s2 = torch.einsum('biik->bik', H2) # obtain the second order jacobian from Hessian matrix + + assert torch.allclose(self.f, f2) + assert torch.allclose(f1, f2) + assert torch.allclose(H1, H2) + assert torch.allclose(self.s, s2) + + def test_first_der_bn(self): + if has_torch: + # compute derivative with forward pass + f2 = jacobian(self.tensor, self.y_bn, diffable=False) + + assert torch.allclose(self.f_bn, f2) + + def test_first_and_second_der_bn(self): + if has_torch: + # compute derivative with forward pass + f2, s2 = jacobian_second_order(self.tensor, self.y_bn, diffable=False) + + assert torch.allclose(self.f_bn, f2) + assert torch.allclose(self.s_bn, s2) + + def test_first_der_and_hessian_bn(self): + if has_torch: + # compute derivative with forward pass + f1, H1 = jacobian_hessian(self.tensor, self.y_bn) + f2, H2 = jacobian_hessian(self.tensor, self.y_bn, diffable=False) + s2 = torch.einsum('biik->bik', H2) # obtain the second order jacobian from Hessian matrix + + assert torch.allclose(self.f_bn, f2) + assert torch.allclose(f1, f2) + assert torch.allclose(H1, H2) + assert torch.allclose(self.s_bn, s2) + + def test_errors(self): + if has_torch: + with self.assertRaises(ValueError): + f1 = jacobian(self.tensor, self.y_with_infinities) + with self.assertRaises(ValueError): + f1, s1 = jacobian_second_order(self.tensor, self.y_with_infinities) + with self.assertRaises(ValueError): + f1, H1 = jacobian_hessian(self.tensor, self.y_with_infinities) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/inferences_tests.py b/tests/inferences_tests.py index da824ae3..15f31764 100644 --- a/tests/inferences_tests.py +++ b/tests/inferences_tests.py @@ -7,12 +7,105 @@ from abcpy.continuousmodels import Normal from abcpy.continuousmodels import Uniform from abcpy.distances import Euclidean, MMD -from abcpy.inferences import RejectionABC, PMC, PMCABC, SABC, ABCsubsim, SMCABC, APMCABC, RSMCABC, \ +from abcpy.inferences import DrawFromPrior, RejectionABC, PMC, PMCABC, SABC, ABCsubsim, SMCABC, APMCABC, RSMCABC, \ MCMCMetropoliHastings from abcpy.statistics import Identity -class RejectionABCTest(unittest.TestCase): +class DrawFromPriorTests(unittest.TestCase): + def setUp(self): + # setup backend + dummy = BackendDummy() + + # define a uniform prior 
distribution + mu = Uniform([[-5.0], [5.0]], name='mu') + sigma = Uniform([[0.0], [10.0]], name='sigma') + # define a Gaussian model + self.model = Normal([mu, sigma]) + + # for correct seeding define 4 samplers (and discard large values in 3nd and 4rd to test if that works) + self.sampler = DrawFromPrior([self.model], dummy, seed=1) + self.sampler2 = DrawFromPrior([self.model], dummy, seed=1, max_chunk_size=2) + self.sampler3 = DrawFromPrior([self.model], dummy, seed=1, discard_too_large_values=True) + self.sampler4 = DrawFromPrior([self.model], dummy, seed=1, discard_too_large_values=True, max_chunk_size=2 ) + + # expected mean values from 100 prior samples: + self.mu_mean = -0.24621316447913139 + self.sigma_mean = 5.182264389159227 + + def test_sample(self): + # test drawing parameter values from the prior in a similar fashion to the other InferenceMethdod's + journal = self.sampler.sample(100, path_to_save_journal="tmp.jnl") + mu_sample = np.array(journal.get_parameters()['mu']) + sigma_sample = np.array(journal.get_parameters()['sigma']) + + accepted_parameters = journal.get_accepted_parameters() + self.assertEqual(len(accepted_parameters), 100) + self.assertEqual(len(accepted_parameters[0]), 2) + + # test shape of samples + mu_shape, sigma_shape = (len(mu_sample), mu_sample[0].shape[1]), \ + (len(sigma_sample), sigma_sample[0].shape[1]) + self.assertEqual(mu_shape, (100, 1)) + self.assertEqual(sigma_shape, (100, 1)) + + # Compute posterior mean + self.assertAlmostEqual(np.average(mu_sample), self.mu_mean) + self.assertAlmostEqual(np.average(sigma_sample), self.sigma_mean) + + self.assertTrue(journal.number_of_simulations[0] == 0) + + # test now it gives same results with max_chunk_size=2 + journal2 = self.sampler2.sample(100) + mu_sample = np.array(journal2.get_parameters()['mu']) + sigma_sample = np.array(journal2.get_parameters()['sigma']) + + accepted_parameters = journal2.get_accepted_parameters() + self.assertEqual(len(accepted_parameters), 100) + self.assertEqual(len(accepted_parameters[0]), 2) + + # test shape of samples + mu_shape, sigma_shape = (len(mu_sample), mu_sample[0].shape[1]), \ + (len(sigma_sample), sigma_sample[0].shape[1]) + self.assertEqual(mu_shape, (100, 1)) + self.assertEqual(sigma_shape, (100, 1)) + + # Compute posterior mean + self.assertAlmostEqual(np.average(mu_sample), self.mu_mean) + self.assertAlmostEqual(np.average(sigma_sample), self.sigma_mean) + + self.assertTrue(journal2.number_of_simulations[0] == 0) + + def test_param_simulation_pairs(self): + # sample single simulation for each par value + parameters, simulations = self.sampler.sample_par_sim_pairs(10, 1) + self.assertEqual(parameters.shape, (10, 2)) + self.assertEqual(simulations.shape, (10, 1, 1)) + + # sample multiple simulations for each par value + parameters, simulations = self.sampler.sample_par_sim_pairs(10, 3) + self.assertEqual(parameters.shape, (10, 2)) + self.assertEqual(simulations.shape, (10, 3, 1)) + + # now run with the new sampler to check if the means are the same as with `.sample` method: + parameters, simulations = self.sampler3.sample_par_sim_pairs(100, 1) + means = np.mean(parameters, axis=0) + self.assertAlmostEqual(means[0], self.mu_mean) + self.assertAlmostEqual(means[1], self.sigma_mean) + + # check also if that gives same results by splitting in chunks: + parameters, simulations = self.sampler4.sample_par_sim_pairs(100, 1) + means = np.mean(parameters, axis=0) + self.assertAlmostEqual(means[0], self.mu_mean) + self.assertAlmostEqual(means[1], self.sigma_mean) + + # 
+        parameters, simulations = self.sampler4.sample_par_sim_pairs(10, 3)
+        self.assertEqual(parameters.shape, (10, 2))
+        self.assertEqual(simulations.shape, (10, 3, 1))
+
+
+class RejectionABCTests(unittest.TestCase):
     def setUp(self):
         # setup backend
         dummy = BackendDummy()
@@ -49,7 +142,6 @@ def test_sample_n_samples(self):
         self.assertEqual(sigma_shape, (10, 1))

         # Compute posterior mean
-        # self.assertAlmostEqual(np.average(np.asarray(samples[:,0])),1.22301,10e-2)
         self.assertAlmostEqual(np.average(mu_sample), 1.223012836345375)
         self.assertAlmostEqual(np.average(sigma_sample), 6.992218962395242)
@@ -68,7 +160,6 @@ def test_sample_simulation_budget(self):
         self.assertEqual(sigma_shape, (3, 1))

         # Compute posterior mean
-        # self.assertAlmostEqual(np.average(np.asarray(samples[:,0])),1.22301,10e-2)
         self.assertAlmostEqual(np.average(mu_sample), 0.8175361535037666)
         self.assertAlmostEqual(np.average(sigma_sample), 8.155647092489977)
@@ -86,7 +177,6 @@ def test_sample_simulation_budget(self):
         self.assertEqual(sigma_shape, (10, 1))

         # Compute posterior mean
-        # self.assertAlmostEqual(np.average(np.asarray(samples[:,0])),1.22301,10e-2)
         self.assertAlmostEqual(np.average(mu_sample), 0.10394992719538543)
         self.assertAlmostEqual(np.average(sigma_sample), 6.746940834914168)
@@ -220,6 +310,33 @@ def test_sample(self):
         self.assertAlmostEqual(sigma_post_mean_1, 5.751158868437219)
         self.assertAlmostEqual(sigma_post_mean_2, 8.103358539327967)

+    def test_sample_with_inipoint(self):
+        # we check whether we can compute the posterior covariance, which means the right reshaping of iniPoint is used.
+
+        n_sample, n_samples_per_param = 50, 20
+
+        sampler = MCMCMetropoliHastings([self.model], [self.likfun], self.backend, seed=1)
+        journal1 = sampler.sample([self.y_obs], n_sample, n_samples_per_param, cov_matrices=None,
+                                  iniPoint=np.array([-0.8, 7]), burnin=10, adapt_proposal_cov_interval=5,
+                                  use_tqdm=False, path_to_save_journal="tmp.jnl")
+
+        journal2 = sampler.sample([self.y_obs], n_sample, n_samples_per_param, cov_matrices=None,
+                                  iniPoint=np.array([np.array([-0.8]), np.array([7])]), burnin=10,
+                                  adapt_proposal_cov_interval=5, use_tqdm=False, path_to_save_journal="tmp.jnl")
+
+        journal3 = sampler.sample([self.y_obs], n_sample, n_samples_per_param, cov_matrices=None,
+                                  iniPoint=[-0.8, 7], burnin=10, adapt_proposal_cov_interval=5, use_tqdm=False,
+                                  path_to_save_journal="tmp.jnl")
+
+        journal4 = sampler.sample([self.y_obs], n_sample, n_samples_per_param, cov_matrices=None,
+                                  iniPoint=[np.array([-0.8]), np.array([7])], burnin=10, adapt_proposal_cov_interval=5,
+                                  use_tqdm=False, path_to_save_journal="tmp.jnl")
+
+        cov1 = journal1.posterior_cov()
+        cov2 = journal2.posterior_cov()
+        cov3 = journal3.posterior_cov()
+        cov4 = journal4.posterior_cov()
+
     def test_sample_with_transformer(self):
         n_sample, n_samples_per_param = 50, 20

@@ -801,6 +918,17 @@ def test_restart_from_journal_delmoral(self):
         self.assertEqual(journal_final_1.configuration["epsilon_arr"], journal_final_2.configuration["epsilon_arr"])
         self.assertEqual(journal_final_1.posterior_mean()['mu'], journal_final_2.posterior_mean()['mu'])

+        # test now that restarting fails if I do not store simulations in the Journal at the first .sample call
+        sampler = SMCABC([self.model], [self.dist_calc], self.backend, seed=1)
+        journal_intermediate = sampler.sample([self.observation], 2, n_sample, n_simulate,
+                                              which_mcmc_kernel=which_mcmc_kernel,
+                                              store_simulations_in_journal=False)
+        journal_intermediate.save("tmp.jnl")
+        with 
self.assertRaises(RuntimeError): + journal_final_1 = sampler.sample([self.observation], 1, n_sample, n_simulate, + which_mcmc_kernel=which_mcmc_kernel, + journal_file="tmp.jnl") + def test_restart_from_journal_bernton(self): n_sample, n_simulate = 10, 10 # loop over standard MCMC kernel, r-hit kernel version 1 and r-hit kernel version 2 @@ -821,6 +949,17 @@ def test_restart_from_journal_bernton(self): self.assertEqual(journal_final_1.configuration["epsilon_arr"], journal_final_2.configuration["epsilon_arr"]) self.assertEqual(journal_final_1.posterior_mean()['mu'], journal_final_2.posterior_mean()['mu']) + # test now that restarting fails if I do not store simulations in the Journal at the first .sample call + sampler = SMCABC([self.model], [self.dist_calc_2], self.backend, seed=1, version="Bernton") + journal_intermediate = sampler.sample([self.observation_2], 1, n_sample, n_simulate, + which_mcmc_kernel=which_mcmc_kernel, + store_simulations_in_journal=False) + journal_intermediate.save("tmp.jnl") + with self.assertRaises(RuntimeError): + journal_final_1 = sampler.sample([self.observation_2], 1, n_sample, n_simulate, + which_mcmc_kernel=which_mcmc_kernel, + journal_file="tmp.jnl") + def test_errors(self): with self.assertRaises(RuntimeError): sampler = SMCABC([self.model], [self.dist_calc_2], self.backend, seed=1, version="DelMoral") diff --git a/tests/output_tests.py b/tests/output_tests.py index 7ebfd882..347cad3b 100644 --- a/tests/output_tests.py +++ b/tests/output_tests.py @@ -2,7 +2,11 @@ import numpy as np -from abcpy.output import Journal +from abcpy.backends import BackendDummy +from abcpy.continuousmodels import Normal +from abcpy.continuousmodels import Uniform +from abcpy.inferences import DrawFromPrior +from abcpy.output import Journal, GenerateFromJournal class JournalTests(unittest.TestCase): @@ -44,24 +48,52 @@ def test_add_weights(self): np.testing.assert_equal(journal_recon.weights[0], weights1) np.testing.assert_equal(journal_recon.weights[1], weights2) - def test_add_opt_values(self): - opt_values1 = np.zeros((2, 4)) - opt_values2 = np.ones((2, 4)) + def test_add_simulations(self): + simulations1 = np.zeros((2, 4)) + simulations2 = np.ones((2, 4)) # test whether production mode only stores the last set of parameters journal_prod = Journal(0) - journal_prod.add_opt_values(opt_values1) - journal_prod.add_opt_values(opt_values2) - self.assertEqual(len(journal_prod.opt_values), 1) - np.testing.assert_equal(journal_prod.opt_values[0], opt_values2) + journal_prod.add_accepted_simulations(simulations1) + journal_prod.add_accepted_simulations(simulations2) + self.assertEqual(len(journal_prod.get_accepted_simulations()), 2) + np.testing.assert_equal(journal_prod.get_accepted_simulations(), simulations2) # test whether reconstruction mode stores all parameter sets journal_recon = Journal(1) - journal_recon.add_opt_values(opt_values1) - journal_recon.add_opt_values(opt_values2) - self.assertEqual(len(journal_recon.opt_values), 2) - np.testing.assert_equal(journal_recon.opt_values[0], opt_values1) - np.testing.assert_equal(journal_recon.opt_values[1], opt_values2) + journal_recon.add_accepted_simulations(simulations1) + journal_recon.add_accepted_simulations(simulations2) + self.assertEqual(len(journal_recon.get_accepted_simulations()), 2) + np.testing.assert_equal(journal_recon.get_accepted_simulations(0), simulations1) + np.testing.assert_equal(journal_recon.get_accepted_simulations(1), simulations2) + + # test whether not storing it returns the correct value + journal_empty = 
Journal(0) + self.assertIsNone(journal_empty.get_accepted_simulations()) + + def test_add_cov_mats(self): + cov_mats1 = np.zeros((2, 4)) + cov_mats2 = np.ones((2, 4)) + + # test whether production mode only stores the last set of parameters + journal_prod = Journal(0) + journal_prod.add_accepted_cov_mats(cov_mats1) + journal_prod.add_accepted_cov_mats(cov_mats2) + self.assertEqual(len(journal_prod.get_accepted_cov_mats()), 2) + np.testing.assert_equal(journal_prod.get_accepted_cov_mats(), cov_mats2) + + # test whether reconstruction mode stores all parameter sets + journal_recon = Journal(1) + journal_recon.add_accepted_cov_mats(cov_mats1) + journal_recon.add_accepted_cov_mats(cov_mats2) + self.assertEqual(len(journal_recon.get_accepted_cov_mats()), 2) + np.testing.assert_equal(journal_recon.get_accepted_cov_mats(0), cov_mats1) + np.testing.assert_equal(journal_recon.get_accepted_cov_mats(1), cov_mats2) + + # test whether not storing it returns the correct value + journal_empty = Journal(0) + self.assertIsNone(journal_empty.get_accepted_cov_mats()) + def test_load_and_save(self): params1 = np.zeros((2, 4)) @@ -139,6 +171,41 @@ def test_plot_wass_dist(self): print(len(journal_4.accepted_parameters)) self.assertRaises(RuntimeError, journal_4.Wass_convergence_plot) + # now use lists + weights_identical = np.ones((100, 1)) + params_0 = params_0.tolist() + weights_1 = np.arange(100) + params_1 = params_1.tolist() + weights_2 = np.arange(100, 200) + params_2 = params_2.tolist() + weights_3 = np.arange(200, 300) + params_3 = params_3.tolist() + weights_4 = np.arange(300, 400) + params_4 = params_4.tolist() + journal = Journal(1) + journal.add_weights(weights_identical) + journal.add_accepted_parameters(params_0) + journal.add_weights(weights_1) + journal.add_accepted_parameters(params_1) + journal.add_weights(weights_2) + journal.add_accepted_parameters(params_2) + journal.add_weights(weights_3) + journal.add_accepted_parameters(params_3) + journal.add_weights(weights_4) + journal.add_accepted_parameters(params_4) + fig, ax, wass_dist_lists = journal.Wass_convergence_plot() + self.assertAlmostEqual(wass_dist_lists[0], 0.22829193592175878) + # check the Errors + journal_2 = Journal(0) + self.assertRaises(RuntimeError, journal_2.Wass_convergence_plot) + journal_3 = Journal(1) + journal_3.add_weights(weights_identical) + self.assertRaises(RuntimeError, journal_3.Wass_convergence_plot) + journal_4 = Journal(1) + journal_4.add_accepted_parameters(np.array([np.array([1]), np.array([1, 2])], dtype="object")) + print(len(journal_4.accepted_parameters)) + self.assertRaises(RuntimeError, journal_4.Wass_convergence_plot) + def test_plot_post_distr(self): rng = np.random.RandomState(1) weights_identical = np.ones((100, 1)) @@ -184,6 +251,148 @@ def test_traceplot(self): # now try correctly: fig, ax = journal.traceplot() + def test_resample(self): + # -- setup -- + # setup backend + dummy = BackendDummy() + + # define a uniform prior distribution + mu = Uniform([[-5.0], [5.0]], name='mu') + sigma = Uniform([[0.0], [10.0]], name='sigma') + # define a Gaussian model + model = Normal([mu, sigma]) + + sampler = DrawFromPrior([model], dummy, seed=1) + original_journal = sampler.sample(100) + + # expected mean values from bootstrapped samples: + mu_mean = -0.5631214403709973 + sigma_mean = 5.2341427118053705 + # expected mean values from subsampled samples: + mu_mean_2 = -0.6414897172489 + sigma_mean_2 = 6.217381777130734 + + # -- bootstrap -- + new_j = original_journal.resample(path_to_save_journal="tmp.jnl", 
seed=42) + mu_sample = np.array(new_j.get_parameters()['mu']) + sigma_sample = np.array(new_j.get_parameters()['sigma']) + + accepted_parameters = new_j.get_accepted_parameters() + self.assertEqual(len(accepted_parameters), 100) + self.assertEqual(len(accepted_parameters[0]), 2) + + # test shape of samples + mu_shape, sigma_shape = (len(mu_sample), mu_sample[0].shape[1]), \ + (len(sigma_sample), sigma_sample[0].shape[1]) + self.assertEqual(mu_shape, (100, 1)) + self.assertEqual(sigma_shape, (100, 1)) + + # Compute posterior mean + self.assertAlmostEqual(np.average(mu_sample), mu_mean) + self.assertAlmostEqual(np.average(sigma_sample), sigma_mean) + + self.assertTrue(new_j.number_of_simulations[0] == 0) + + # check whether the dictionary or parameter list contain same data: + self.assertEqual(new_j.get_parameters()["mu"][9], new_j.get_accepted_parameters()[9][0]) + self.assertEqual(new_j.get_parameters()["sigma"][7], new_j.get_accepted_parameters()[7][1]) + + # -- subsample (replace=False, smaller number than the full sample) -- + new_j_2 = original_journal.resample(replace=False, n_samples=10, seed=42) + mu_sample = np.array(new_j_2.get_parameters()['mu']) + sigma_sample = np.array(new_j_2.get_parameters()['sigma']) + + accepted_parameters = new_j_2.get_accepted_parameters() + self.assertEqual(len(accepted_parameters), 10) + self.assertEqual(len(accepted_parameters[0]), 2) + + # test shape of samples + mu_shape, sigma_shape = (len(mu_sample), mu_sample[0].shape[1]), \ + (len(sigma_sample), sigma_sample[0].shape[1]) + self.assertEqual(mu_shape, (10, 1)) + self.assertEqual(sigma_shape, (10, 1)) + + # Compute posterior mean + self.assertAlmostEqual(np.average(mu_sample), mu_mean_2) + self.assertAlmostEqual(np.average(sigma_sample), sigma_mean_2) + + self.assertTrue(new_j_2.number_of_simulations[0] == 0) + + # check whether the dictionary or parameter list contain same data: + self.assertEqual(new_j_2.get_parameters()["mu"][9], new_j_2.get_accepted_parameters()[9][0]) + self.assertEqual(new_j_2.get_parameters()["sigma"][7], new_j_2.get_accepted_parameters()[7][1]) + + # -- check that resampling the full samples with replace=False gives the exact same posterior mean and std -- + new_j_3 = original_journal.resample(replace=False, n_samples=100) + mu_sample = np.array(new_j_3.get_parameters()['mu']) + sigma_sample = np.array(new_j_3.get_parameters()['sigma']) + + # original journal + mu_sample_original = np.array(original_journal.get_parameters()['mu']) + sigma_sample_original = np.array(original_journal.get_parameters()['sigma']) + + # Compute posterior mean and std + self.assertAlmostEqual(np.average(mu_sample), np.average(mu_sample_original)) + self.assertAlmostEqual(np.average(sigma_sample), np.average(sigma_sample_original)) + self.assertAlmostEqual(np.std(mu_sample), np.std(mu_sample_original)) + self.assertAlmostEqual(np.std(sigma_sample), np.std(sigma_sample_original)) + + # check whether the dictionary or parameter list contain same data: + self.assertEqual(new_j_3.get_parameters()["mu"][9], new_j_3.get_accepted_parameters()[9][0]) + self.assertEqual(new_j_3.get_parameters()["sigma"][7], new_j_3.get_accepted_parameters()[7][1]) + + # -- test the error -- + with self.assertRaises(RuntimeError): + original_journal.resample(replace=False, n_samples=200) + + +class GenerateFromJournalTests(unittest.TestCase): + def setUp(self): + # setup backend + dummy = BackendDummy() + + # define a uniform prior distribution + mu = Uniform([[-5.0], [5.0]], name='mu') + sigma = Uniform([[0.0], [10.0]], 
name='sigma') + # define a Gaussian model + self.model = Normal([mu, sigma]) + + # define a stupid uniform model now + self.model2 = Uniform([[0], [10]]) + + self.sampler = DrawFromPrior([self.model], dummy, seed=1) + self.original_journal = self.sampler.sample(100) + + self.generate_from_journal = GenerateFromJournal([self.model], dummy, seed=2) + self.generate_from_journal_2 = GenerateFromJournal([self.model2], dummy, seed=2) + + # expected mean values from bootstrapped samples: + self.mu_mean = -0.2050921750330999 + self.sigma_mean = 5.178647189918053 + # expected mean values from subsampled samples: + self.mu_mean_2 = -0.021275259024241676 + self.sigma_mean_2 = 5.672004487129107 + + def test_generate(self): + # sample single simulation for each par value + parameters, simulations, normalized_weights = self.generate_from_journal.generate(journal=self.original_journal) + self.assertEqual(parameters.shape, (100, 2)) + self.assertEqual(simulations.shape, (100, 1, 1)) + self.assertEqual(normalized_weights.shape, (100,)) + + # sample multiple simulations for each par value + parameters, simulations, normalized_weights = self.generate_from_journal.generate(self.original_journal, + n_samples_per_param=3, + iteration=-1) + self.assertEqual(parameters.shape, (100, 2)) + self.assertEqual(simulations.shape, (100, 3, 1)) + self.assertEqual(normalized_weights.shape, (100,)) + + def test_errors(self): + # check whether using a different model leads to errors: + with self.assertRaises(RuntimeError): + self.generate_from_journal_2.generate(self.original_journal) + if __name__ == '__main__': unittest.main() diff --git a/tests/statistics_tests.py b/tests/statistics_tests.py index 5628920a..0e008d13 100644 --- a/tests/statistics_tests.py +++ b/tests/statistics_tests.py @@ -2,6 +2,9 @@ import numpy as np +from abcpy.backends import BackendDummy +from abcpy.continuousmodels import Uniform, Normal +from abcpy.inferences import DrawFromPrior from abcpy.statistics import Identity, LinearTransformation, NeuralEmbedding try: @@ -10,14 +13,29 @@ has_torch = False else: has_torch = True - from abcpy.NN_utilities.networks import createDefaultNN, ScalerAndNet + from abcpy.NN_utilities.networks import createDefaultNN, ScalerAndNet, DiscardLastOutputNet class IdentityTests(unittest.TestCase): def setUp(self): - self.stat_calc = Identity(degree=1, cross=0) + self.stat_calc = Identity(degree=1, cross=False) self.stat_calc_pipeline = Identity(degree=2, cross=False, previous_statistics=self.stat_calc) + # try now the statistics rescaling option: + mu = Uniform([[-5.0], [5.0]], name='mu') + sigma = Uniform([[0.0], [10.0]], name='sigma') + # define a Gaussian model + self.model = Normal([mu, sigma]) + + sampler = DrawFromPrior([self.model], BackendDummy(), seed=1) + reference_parameters, reference_simulations = sampler.sample_par_sim_pairs(30, 1) + reference_simulations = reference_simulations.reshape(reference_simulations.shape[0], + reference_simulations.shape[2]) + reference_simulations_double = np.concatenate([reference_simulations, reference_simulations], axis=1) + + self.stat_calc_rescaling = Identity(reference_simulations=reference_simulations_double) + self.stat_calc_rescaling_2 = Identity(reference_simulations=reference_simulations) + def test_statistics(self): self.assertRaises(TypeError, self.stat_calc.statistics, 3.4) vec1 = np.array([1, 2]) @@ -26,22 +44,27 @@ def test_statistics(self): self.assertTrue((self.stat_calc.statistics([vec1, vec1]) == np.array([[vec1], [vec1]])).all()) 
self.assertTrue((self.stat_calc.statistics([vec2, vec2]) == np.array([[vec2], [vec2]])).all()) + self.assertTrue((self.stat_calc_rescaling.statistics([vec1]) != self.stat_calc.statistics([vec1])).all()) + self.assertTrue((self.stat_calc_rescaling_2.statistics([vec2]) != self.stat_calc.statistics([vec2])).all()) + + self.assertRaises(RuntimeError, self.stat_calc_rescaling.statistics, [vec2]) + def test_polynomial_expansion(self): # Checks whether wrong input type produces error message self.assertRaises(TypeError, self.stat_calc._polynomial_expansion, 3.4) a = [np.array([0, 2]), np.array([2, 1])] # test cross-product part - self.stat_calc = Identity(degree=2, cross=1) + self.stat_calc = Identity(degree=2, cross=True) self.assertTrue((self.stat_calc.statistics(a) == np.array([[0, 2, 0, 4, 0], [2, 1, 4, 1, 2]])).all()) # When a tuple a = [np.array([0, 2])] - self.stat_calc = Identity(degree=2, cross=1) + self.stat_calc = Identity(degree=2, cross=True) self.assertTrue((self.stat_calc.statistics(a) == np.array([[0, 2, 0, 4, 0]])).all()) - self.stat_calc = Identity(degree=2, cross=0) + self.stat_calc = Identity(degree=2, cross=False) self.assertTrue((self.stat_calc.statistics(a) == np.array([[0, 2, 0, 4]])).all()) a = list(np.array([2])) - self.stat_calc = Identity(degree=2, cross=1) + self.stat_calc = Identity(degree=2, cross=True) self.assertTrue((self.stat_calc.statistics(a) == np.array([[2, 4]])).all()) def test_pipeline(self): @@ -52,7 +75,21 @@ def test_pipeline(self): class LinearTransformationTests(unittest.TestCase): def setUp(self): self.coeff = np.array([[3, 4], [5, 6]]) - self.stat_calc = LinearTransformation(self.coeff, degree=1, cross=0) + self.stat_calc = LinearTransformation(self.coeff, degree=1, cross=False) + + # try now the statistics rescaling option: + mu = Uniform([[-5.0], [5.0]], name='mu') + sigma = Uniform([[0.0], [10.0]], name='sigma') + # define a Gaussian model + self.model = Normal([mu, sigma]) + + sampler = DrawFromPrior([self.model], BackendDummy(), seed=1) + reference_parameters, reference_simulations = sampler.sample_par_sim_pairs(30, 1) + reference_simulations = reference_simulations.reshape(reference_simulations.shape[0], + reference_simulations.shape[2]) + reference_simulations_double = np.concatenate([reference_simulations, reference_simulations], axis=1) + + self.stat_calc_rescaling = LinearTransformation(self.coeff, reference_simulations=reference_simulations_double) def test_statistics(self): self.assertRaises(TypeError, self.stat_calc.statistics, 3.4) @@ -63,23 +100,25 @@ def test_statistics(self): [np.dot(np.array([1, 2]), self.coeff), np.dot(np.array([1, 2]), self.coeff)])).all()) self.assertRaises(ValueError, self.stat_calc.statistics, [vec2]) + self.assertTrue((self.stat_calc_rescaling.statistics([vec1]) != self.stat_calc.statistics([vec1])).all()) + def test_polynomial_expansion(self): # Checks whether wrong input type produces error message self.assertRaises(TypeError, self.stat_calc._polynomial_expansion, 3.4) a = [np.array([0, 2]), np.array([2, 1])] # test cross-product part - self.stat_calc = LinearTransformation(self.coeff, degree=2, cross=1) + self.stat_calc = LinearTransformation(self.coeff, degree=2, cross=True) self.assertTrue((self.stat_calc.statistics(a) == np.array([[10, 12, 100, 144, 120], [11, 14, 121, 196, 154]])).all()) # When a tuple a = [np.array([0, 2])] - self.stat_calc = LinearTransformation(self.coeff, degree=2, cross=1) + self.stat_calc = LinearTransformation(self.coeff, degree=2, cross=True) 
self.assertTrue((self.stat_calc.statistics(a) == np.array([[10, 12, 100, 144, 120]])).all()) - self.stat_calc = LinearTransformation(self.coeff, degree=2, cross=0) + self.stat_calc = LinearTransformation(self.coeff, degree=2, cross=False) self.assertTrue((self.stat_calc.statistics(a) == np.array([[10, 12, 100, 144]])).all()) a = list(np.array([2])) - self.stat_calc = LinearTransformation(self.coeff, degree=2, cross=1) + self.stat_calc = LinearTransformation(self.coeff, degree=2, cross=True) self.assertRaises(ValueError, self.stat_calc.statistics, a) @@ -88,8 +127,30 @@ def setUp(self): if has_torch: self.net = createDefaultNN(2, 3)() self.net_with_scaler = ScalerAndNet(self.net, None) + self.net_with_discard_wrapper = DiscardLastOutputNet(self.net) self.stat_calc = NeuralEmbedding(self.net) self.stat_calc_with_scaler = NeuralEmbedding(self.net_with_scaler) + self.stat_calc_with_discard_wrapper = NeuralEmbedding(self.net_with_discard_wrapper) + # reference input and output + torch.random.manual_seed(1) + self.tensor = torch.randn(1, 2) + self.out = self.net(self.tensor) + self.out_discard = self.net_with_discard_wrapper(self.tensor) + + # try now the statistics rescaling option: + mu = Uniform([[-5.0], [5.0]], name='mu') + sigma = Uniform([[0.0], [10.0]], name='sigma') + # define a Gaussian model + self.model = Normal([mu, sigma]) + + sampler = DrawFromPrior([self.model], BackendDummy(), seed=1) + reference_parameters, reference_simulations = sampler.sample_par_sim_pairs(30, 1) + reference_simulations = reference_simulations.reshape(reference_simulations.shape[0], + reference_simulations.shape[2]) + + self.stat_calc_rescaling = NeuralEmbedding(self.net, reference_simulations=reference_simulations, + previous_statistics=Identity(degree=2)) + if not has_torch: self.assertRaises(ImportError, NeuralEmbedding, None) @@ -102,22 +163,40 @@ def test_statistics(self): self.assertTrue((self.stat_calc.statistics([vec1, vec1])).all()) self.assertRaises(RuntimeError, self.stat_calc.statistics, [vec2]) + self.assertTrue((self.stat_calc_rescaling.statistics([vec2])).all()) + def test_save_load(self): if has_torch: self.stat_calc.save_net("net.pth") self.stat_calc_with_scaler.save_net("net.pth", path_to_scaler="scaler.pkl") - self.stat_calc_loaded = NeuralEmbedding.fromFile("net.pth", input_size=2, output_size=3) - self.stat_calc_loaded = NeuralEmbedding.fromFile("net.pth", network_class=createDefaultNN(2, 3)) - self.stat_calc_loaded_with_scaler = NeuralEmbedding.fromFile("net.pth", network_class=createDefaultNN(2, 3), - path_to_scaler="scaler.pkl") + stat_calc_loaded = NeuralEmbedding.fromFile("net.pth", input_size=2, output_size=3) + stat_calc_loaded = NeuralEmbedding.fromFile("net.pth", network_class=createDefaultNN(2, 3)) + stat_calc_loaded_with_scaler = NeuralEmbedding.fromFile("net.pth", network_class=createDefaultNN(2, 3), + path_to_scaler="scaler.pkl") + # test the network was recovered correctly + out_new = stat_calc_loaded.net(self.tensor) + self.assertTrue(torch.allclose(self.out, out_new)) + + # now with the DiscardLastOutput wrapper + self.stat_calc_with_discard_wrapper.save_net("net_with_discard_wrapper.pth") + stat_calc_with_discard_loaded = NeuralEmbedding.fromFile("net_with_discard_wrapper.pth", input_size=2, + output_size=3) + # test the network was recovered correctly + out_new_discard = stat_calc_with_discard_loaded.net(self.tensor) + self.assertTrue(torch.allclose(self.out_discard, out_new_discard)) + + # now with both DiscardLastOutput and Scaler wrappers + 
stat_calc_with_discard_and_scaler_loaded = NeuralEmbedding.fromFile("net_with_discard_wrapper.pth", + input_size=2, output_size=3, + path_to_scaler="scaler.pkl") with self.assertRaises(RuntimeError): self.stat_calc_with_scaler.save_net("net.pth") - self.stat_calc_loaded = NeuralEmbedding.fromFile("net.pth") - self.stat_calc_loaded = NeuralEmbedding.fromFile("net.pth", network_class=createDefaultNN(2, 3), - input_size=1) - self.stat_calc_loaded = NeuralEmbedding.fromFile("net.pth", network_class=createDefaultNN(2, 3), - hidden_sizes=[2, 3]) + stat_calc_loaded = NeuralEmbedding.fromFile("net.pth") + stat_calc_loaded = NeuralEmbedding.fromFile("net.pth", network_class=createDefaultNN(2, 3), + input_size=1) + stat_calc_loaded = NeuralEmbedding.fromFile("net.pth", network_class=createDefaultNN(2, 3), + hidden_sizes=[2, 3]) if __name__ == '__main__': diff --git a/tests/statisticslearning_tests.py b/tests/statisticslearning_tests.py index 8247bd40..f7fe9b07 100644 --- a/tests/statisticslearning_tests.py +++ b/tests/statisticslearning_tests.py @@ -7,7 +7,7 @@ from abcpy.continuousmodels import Uniform from abcpy.statistics import Identity from abcpy.statisticslearning import Semiautomatic, SemiautomaticNN, TripletDistanceLearning, \ - ContrastiveDistanceLearning + ContrastiveDistanceLearning, ExponentialFamilyScoreMatching try: import torch @@ -15,6 +15,7 @@ has_torch = False else: has_torch = True + from abcpy.NN_utilities.networks import createDefaultNN class SemiautomaticTests(unittest.TestCase): @@ -65,12 +66,15 @@ def setUp(self): if has_torch: # Initialize statistics learning self.statisticslearning = SemiautomaticNN([self.Y], self.statistics_cal, self.backend, n_samples=100, - n_samples_per_param=1, seed=1, n_epochs=10, scale_samples=False, - use_tqdm=False) + n_samples_val=100, n_samples_per_param=1, seed=1, n_epochs=2, + scale_samples=False, use_tqdm=False) + self.statisticslearning2 = SemiautomaticNN([self.Y], self.statistics_cal, self.backend, n_samples=10, + n_samples_val=10, n_samples_per_param=1, seed=1, n_epochs=5, + scale_samples=False, use_tqdm=False) # with sample scaler: self.statisticslearning_with_scaler = SemiautomaticNN([self.Y], self.statistics_cal, self.backend, n_samples=100, n_samples_per_param=1, seed=1, - n_epochs=10, scale_samples=True, use_tqdm=False) + n_epochs=2, scale_samples=True, use_tqdm=False) def test_initialization(self): if not has_torch: @@ -93,13 +97,17 @@ def test_transformation(self): extracted_statistics = self.new_statistics_calculator_with_scaler.statistics(y_obs) self.assertEqual(np.shape(extracted_statistics), (1, 2)) - self.assertRaises(ValueError, self.new_statistics_calculator_with_scaler.statistics, [np.array([1, 2])]) + self.assertRaises(RuntimeError, self.new_statistics_calculator_with_scaler.statistics, [np.array([1, 2])]) def test_errors(self): if has_torch: with self.assertRaises(RuntimeError): self.statisticslearning = SemiautomaticNN([self.Y], self.statistics_cal, self.backend, n_samples=1000, n_samples_per_param=1, seed=1, parameters=np.ones((100, 1))) + with self.assertRaises(RuntimeError): + self.statisticslearning = SemiautomaticNN([self.Y], self.statistics_cal, self.backend, n_samples=1000, + n_samples_per_param=1, seed=1, + embedding_net=createDefaultNN(1, 2)) with self.assertRaises(RuntimeError): self.statisticslearning = SemiautomaticNN([self.Y], self.statistics_cal, self.backend, n_samples=1000, n_samples_per_param=1, seed=1, simulations=np.ones((100, 1))) @@ -146,6 +154,17 @@ def test_errors(self): n_samples_per_param=1, 
seed=1, parameters_val=[i for i in range(10)], simulations_val=[i for i in range(10)]) + with self.assertRaises(RuntimeError): + self.statisticslearning2.test_losses = [4, 2, 1] + self.statisticslearning2.plot_losses() + with self.assertRaises(NotImplementedError): + self.statisticslearning.plot_losses(which_losses="foo") + + def test_plots(self): + if has_torch: + self.statisticslearning.plot_losses() + self.statisticslearning.plot_losses(which_losses="train") + self.statisticslearning.plot_losses(which_losses="test") class ContrastiveDistanceLearningTests(unittest.TestCase): @@ -164,13 +183,15 @@ def setUp(self): if has_torch: # Initialize statistics learning self.statisticslearning = ContrastiveDistanceLearning([self.Y], self.statistics_cal, self.backend, - n_samples=100, n_samples_per_param=1, seed=1, - n_epochs=10, scale_samples=False, use_tqdm=False) + n_samples=100, n_samples_val=100, + n_samples_per_param=1, seed=1, n_epochs=2, + scale_samples=False, use_tqdm=False) # with sample scaler: self.statisticslearning_with_scaler = ContrastiveDistanceLearning([self.Y], self.statistics_cal, self.backend, n_samples=100, n_samples_per_param=1, seed=1, - n_epochs=10, scale_samples=True, use_tqdm=False) + n_epochs=2, scale_samples=True, + use_tqdm=False) def test_initialization(self): if not has_torch: @@ -194,7 +215,13 @@ def test_transformation(self): extracted_statistics = self.new_statistics_calculator_with_scaler.statistics(y_obs) self.assertEqual(np.shape(extracted_statistics), (1, 2)) - self.assertRaises(ValueError, self.new_statistics_calculator_with_scaler.statistics, [np.array([1, 2])]) + self.assertRaises(RuntimeError, self.new_statistics_calculator_with_scaler.statistics, [np.array([1, 2])]) + + def test_plots(self): + if has_torch: + self.statisticslearning.plot_losses() + self.statisticslearning.plot_losses(which_losses="train") + self.statisticslearning.plot_losses(which_losses="test") class TripletDistanceLearningTests(unittest.TestCase): @@ -213,13 +240,13 @@ def setUp(self): if has_torch: # Initialize statistics learning self.statisticslearning = TripletDistanceLearning([self.Y], self.statistics_cal, self.backend, - scale_samples=False, use_tqdm=False, - n_samples=100, n_samples_per_param=1, seed=1, n_epochs=10) + n_samples=100, n_samples_val=100, n_samples_per_param=1, + seed=1, n_epochs=2, scale_samples=False, use_tqdm=False) # with sample scaler: self.statisticslearning_with_scaler = TripletDistanceLearning([self.Y], self.statistics_cal, self.backend, scale_samples=True, use_tqdm=False, n_samples=100, n_samples_per_param=1, seed=1, - n_epochs=10) + n_epochs=2) def test_initialization(self): if not has_torch: @@ -242,7 +269,186 @@ def test_transformation(self): extracted_statistics = self.new_statistics_calculator_with_scaler.statistics(y_obs) self.assertEqual(np.shape(extracted_statistics), (1, 2)) - self.assertRaises(ValueError, self.new_statistics_calculator_with_scaler.statistics, [np.array([1, 2])]) + self.assertRaises(RuntimeError, self.new_statistics_calculator_with_scaler.statistics, [np.array([1, 2])]) + + def test_plots(self): + if has_torch: + self.statisticslearning.plot_losses() + self.statisticslearning.plot_losses(which_losses="train") + self.statisticslearning.plot_losses(which_losses="test") + + +class ExponentialFamilyScoreMatchingTests(unittest.TestCase): + def setUp(self): + # define prior and model + sigma = Uniform([[1], [2]]) + mu = Normal([0, 1]) + self.Y = Normal([mu, sigma]) + + # define backend + self.backend = Backend() + + # define statistics + 
self.statistics_cal = Identity(degree=3, cross=False) + + if has_torch: + self.statisticslearning_all_defaults = ExponentialFamilyScoreMatching([self.Y], self.statistics_cal, self.backend, + n_samples=4, n_epochs=2, use_tqdm=False) + self.statisticslearning_no_sliced = ExponentialFamilyScoreMatching([self.Y], self.statistics_cal, self.backend, + n_samples=4, n_epochs=2, + sliced=False, use_tqdm=False) + self.statisticslearning_sphere_noise = ExponentialFamilyScoreMatching([self.Y], self.statistics_cal, self.backend, + n_samples=4, n_epochs=2, use_tqdm=False, + noise_type="sphere") + self.statisticslearning_gaussian_noise = ExponentialFamilyScoreMatching([self.Y], self.statistics_cal, self.backend, + n_samples=4, n_epochs=2, use_tqdm=False, + noise_type="gaussian") + self.statisticslearning_variance_reduction = ExponentialFamilyScoreMatching([self.Y], self.statistics_cal, self.backend, + n_samples=4, n_epochs=2, use_tqdm=False, + variance_reduction=True) + self.statisticslearning_no_bn = ExponentialFamilyScoreMatching([self.Y], self.statistics_cal, self.backend, n_samples=4, + n_epochs=2, batch_norm=False, use_tqdm=False) + self.statisticslearning_provide_nets = ExponentialFamilyScoreMatching([self.Y], self.statistics_cal, self.backend, + n_samples=4, n_epochs=2, + simulations_net=createDefaultNN(3, 3)(), + parameters_net=createDefaultNN(2, 2)(), + use_tqdm=False) + self.statisticslearning_embedding_dim = ExponentialFamilyScoreMatching([self.Y], self.statistics_cal, self.backend, + n_samples=4, n_epochs=2, + embedding_dimension=4, use_tqdm=False) + self.statisticslearning_validation_early_stop = ExponentialFamilyScoreMatching([self.Y], self.statistics_cal, + self.backend, + n_samples=4, n_epochs=20, + n_samples_val=20, early_stopping=True, + use_tqdm=False) + self.statisticslearning_scale = ExponentialFamilyScoreMatching([self.Y], self.statistics_cal, self.backend, + n_samples=4, n_epochs=2, scale_samples=False, + scale_parameters=True, use_tqdm=False) + self.statisticslearning_bounds = ExponentialFamilyScoreMatching([self.Y], self.statistics_cal, self.backend, + n_samples=4, n_epochs=2, + lower_bound_simulations=np.array([-1000, -1000, -1000]), + upper_bound_simulations=np.array([1000, 1000, 1000]), + use_tqdm=False, seed=1) + self.statisticslearning_no_schedulers = ExponentialFamilyScoreMatching([self.Y], self.statistics_cal, self.backend, + n_samples=4, n_epochs=2, + scheduler_parameters=False, + scheduler_simulations=False, use_tqdm=False) + self.statisticslearning_lam = ExponentialFamilyScoreMatching([self.Y], self.statistics_cal, self.backend, + n_samples=4, n_epochs=2, use_tqdm=False, sliced=False, + lam=0.1) + + def test_initialization(self): + if not has_torch: + self.assertRaises(ImportError, ExponentialFamilyScoreMatching, [self.Y], self.statistics_cal, self.backend) + + def test_transformation(self): + if has_torch: + self.new_statistics_calculator = self.statisticslearning_all_defaults.get_statistics() + # with no scaler on data: + self.new_statistics_calculator_no_scaler = self.statisticslearning_scale.get_statistics() + # with no rescaling of the statistics: + self.new_statistics_calculator_no_rescale = self.statisticslearning_all_defaults.get_statistics( + rescale_statistics=False) + + # Simulate observed data + Obs = Normal([2, 4]) + y_obs = Obs.forward_simulate(Obs.get_input_values(), 1)[0].tolist() + + extracted_statistics = self.new_statistics_calculator.statistics(y_obs) + self.assertEqual(np.shape(extracted_statistics), (1, 2)) + extracted_statistics_no_rescale = 
self.new_statistics_calculator_no_rescale.statistics(y_obs) + self.assertEqual(np.shape(extracted_statistics_no_rescale), (1, 2)) + self.assertFalse(np.allclose(extracted_statistics_no_rescale, extracted_statistics)) + + self.assertRaises(RuntimeError, self.new_statistics_calculator.statistics, [np.array([1, 2])]) + self.assertRaises(RuntimeError, self.new_statistics_calculator_no_scaler.statistics, [np.array([1, 2])]) + + def test_errors(self): + if has_torch: + with self.assertRaises(RuntimeError): + self.statisticslearning = ExponentialFamilyScoreMatching([self.Y], self.statistics_cal, self.backend, n_samples=1000, + seed=1, parameters=np.ones((100, 1))) + with self.assertRaises(RuntimeError): + self.statisticslearning = ExponentialFamilyScoreMatching([self.Y], self.statistics_cal, self.backend, n_samples=1000, + seed=1, simulations_net=createDefaultNN(1, 3)) + with self.assertRaises(RuntimeError): + self.statisticslearning = ExponentialFamilyScoreMatching([self.Y], self.statistics_cal, self.backend, n_samples=1000, + seed=1, parameters_net=createDefaultNN(1, 3)) + with self.assertRaises(RuntimeError): + self.statisticslearning = ExponentialFamilyScoreMatching([self.Y], self.statistics_cal, self.backend, n_samples=1000, + seed=1, noise_type="ciao", use_tqdm=False) + with self.assertRaises(RuntimeError): + self.statisticslearning = ExponentialFamilyScoreMatching([self.Y], self.statistics_cal, self.backend, n_samples=1000, + seed=1, noise_type="sphere", variance_reduction=True, + use_tqdm=False) + with self.assertRaises(RuntimeError): + self.statisticslearning = ExponentialFamilyScoreMatching([self.Y], self.statistics_cal, self.backend, n_samples=1000, + seed=1, simulations=np.ones((100, 1))) + with self.assertRaises(RuntimeError): + self.statisticslearning = ExponentialFamilyScoreMatching([self.Y], self.statistics_cal, self.backend, n_samples=1000, + seed=1, + simulations=np.ones((100, 1, 3))) + with self.assertRaises(RuntimeError): + self.statisticslearning = ExponentialFamilyScoreMatching([self.Y], self.statistics_cal, self.backend, n_samples=1000, + seed=1, + parameters=np.ones((100, 1, 2))) + with self.assertRaises(RuntimeError): + self.statisticslearning = ExponentialFamilyScoreMatching([self.Y], self.statistics_cal, self.backend, n_samples=1000, + seed=1, simulations=np.ones((100, 1)), + parameters=np.zeros((99, 1))) + with self.assertRaises(RuntimeError): + self.statisticslearning = ExponentialFamilyScoreMatching([self.Y], self.statistics_cal, self.backend, n_samples=1000, + seed=1, + parameters_val=np.ones((100, 1))) + with self.assertRaises(RuntimeError): + self.statisticslearning = ExponentialFamilyScoreMatching([self.Y], self.statistics_cal, self.backend, n_samples=1000, + seed=1, + simulations_val=np.ones((100, 1))) + with self.assertRaises(RuntimeError): + self.statisticslearning = ExponentialFamilyScoreMatching([self.Y], self.statistics_cal, self.backend, n_samples=1000, + seed=1, + simulations_val=np.ones((100, 1, 3))) + with self.assertRaises(RuntimeError): + self.statisticslearning = ExponentialFamilyScoreMatching([self.Y], self.statistics_cal, self.backend, n_samples=1000, + seed=1, + parameters_val=np.ones((100, 1, 2))) + with self.assertRaises(RuntimeError): + self.statisticslearning = ExponentialFamilyScoreMatching([self.Y], self.statistics_cal, self.backend, n_samples=1000, + seed=1, + simulations_val=np.ones((100, 1)), + parameters_val=np.zeros((99, 1))) + with self.assertRaises(TypeError): + self.statisticslearning = ExponentialFamilyScoreMatching([self.Y], 
self.statistics_cal, self.backend, n_samples=1000, + seed=1, + parameters=[i for i in range(10)], + simulations=[i for i in range(10)]) + with self.assertRaises(TypeError): + self.statisticslearning = ExponentialFamilyScoreMatching([self.Y], self.statistics_cal, self.backend, n_samples=1000, + seed=1, + parameters_val=[i for i in range(10)], + simulations_val=[i for i in range(10)]) + with self.assertRaises(RuntimeError): + self.statisticslearning = ExponentialFamilyScoreMatching([self.Y], self.statistics_cal, self.backend, n_samples=1000, + seed=1, lower_bound_simulations=[1, 2, 3]) + with self.assertRaises(RuntimeError): + self.statisticslearning = ExponentialFamilyScoreMatching([self.Y], self.statistics_cal, self.backend, n_samples=1000, + seed=1, upper_bound_simulations=[1, 2, 3]) + with self.assertRaises(RuntimeError): + self.statisticslearning = ExponentialFamilyScoreMatching([self.Y], self.statistics_cal, self.backend, n_samples=1000, + lower_bound_simulations=np.array([-1000, -1000]), seed=1, + upper_bound_simulations=np.array([1000, 1000, 1000])) + + with self.assertRaises(RuntimeError): + self.statisticslearning_all_defaults.test_losses = [4, 2, 1] + self.statisticslearning_all_defaults.plot_losses() + with self.assertRaises(NotImplementedError): + self.statisticslearning_all_defaults.plot_losses(which_losses="foo") + + def test_plots(self): + if has_torch: + self.statisticslearning_all_defaults.plot_losses() + self.statisticslearning_all_defaults.plot_losses(which_losses="train") + self.statisticslearning_all_defaults.plot_losses(which_losses="test") if __name__ == '__main__': diff --git a/tests/test_examples.py b/tests/test_examples.py index 2fb97d6d..73746aa1 100644 --- a/tests/test_examples.py +++ b/tests/test_examples.py @@ -96,6 +96,22 @@ def test_pmcabc(self): expected_result = 172.52136853079725 self.assertAlmostEqual(test_result, expected_result) + def test_gaussian_statistics_learning_DrawFromPrior_reload_NNs(self): + if has_torch: + from examples.statisticslearning.gaussian_statistics_learning_DrawFromPrior_reload_NNs import infer_parameters + journal = infer_parameters(steps=1, n_sample=50) + test_result = journal.posterior_mean()["mu"] + expected_result = 172.52136853079725 + self.assertAlmostEqual(test_result, expected_result) + + def test_gaussian_statistics_learning_exponential_family(self): + if has_torch: + from examples.statisticslearning.gaussian_statistics_learning_exponential_family import infer_parameters + journal = infer_parameters(steps=1, n_sample=50) + test_result = journal.posterior_mean()["mu"] + expected_result = 172.52136853079725 + self.assertAlmostEqual(test_result, expected_result) + if __name__ == '__main__': unittest.main() diff --git a/tests/transformers_tests.py b/tests/transformers_tests.py index 9dfa3d66..3fee3fc0 100644 --- a/tests/transformers_tests.py +++ b/tests/transformers_tests.py @@ -2,7 +2,14 @@ import numpy as np -from abcpy.transformers import DummyTransformer, BoundedVarTransformer +from abcpy.transformers import DummyTransformer, BoundedVarTransformer, BoundedVarScaler + +try: + import torch +except ImportError: + has_torch = False +else: + has_torch = True class DummyTransformerTests(unittest.TestCase): @@ -53,5 +60,113 @@ def test_errors(self): self.transformer_two_sided.transform(x=[np.array([13.2]), np.array([2.4])]) +class test_BoundedVarScaler(unittest.TestCase): + + def setUp(self): + self.scaler_lower_bounded = BoundedVarScaler(lower_bound=np.array([0, 0]), + upper_bound=np.array([None, None])) + self.scaler_two_sided 
= BoundedVarScaler(lower_bound=np.array([0, 0]), upper_bound=np.array([10, 10])) + self.scaler_mixed = BoundedVarScaler(lower_bound=np.array([0, 0]), upper_bound=np.array([10, None])) + self.scaler_dummy = BoundedVarScaler(lower_bound=np.array([None, None]), + upper_bound=np.array([None, None])) + # without minmax + self.scaler_lower_bounded_no_minmax = BoundedVarScaler(lower_bound=np.array([0, 0]), + upper_bound=np.array([None, None]), + rescale_transformed_vars=False) + self.scaler_two_sided_no_minmax = BoundedVarScaler(lower_bound=np.array([0, 0]), upper_bound=np.array([10, 10]), + rescale_transformed_vars=False) + self.scaler_mixed_no_minmax = BoundedVarScaler(lower_bound=np.array([0, 0]), upper_bound=np.array([10, None]), + rescale_transformed_vars=False) + self.scaler_dummy_no_minmax = BoundedVarScaler(lower_bound=np.array([None, None]), + upper_bound=np.array([None, None]), + rescale_transformed_vars=False) + + self.list_scalers_minmax = [self.scaler_dummy, self.scaler_mixed, + self.scaler_two_sided, self.scaler_lower_bounded] + self.list_scalers_no_minmax = [self.scaler_dummy_no_minmax, self.scaler_mixed_no_minmax, + self.scaler_two_sided_no_minmax, self.scaler_lower_bounded_no_minmax] + + self.list_scalers = self.list_scalers_minmax + self.list_scalers_no_minmax + + # data + self.x = np.array([[3.2, 4.5]]) + self.x2 = np.array([[4.2, 3.5]]) + + def test(self): + for scaler in self.list_scalers: + scaler.fit(self.x) + self.assertEqual(self.x.shape, scaler.inverse_transform(scaler.transform(self.x)).shape) + self.assertTrue(np.allclose(np.array(self.x), np.array(scaler.inverse_transform(scaler.transform(self.x))))) + self.assertAlmostEqual(scaler.jac_log_det(self.x), + scaler.jac_log_det_inverse_transform(scaler.transform(self.x)), delta=1e-7) + + # test dummy scaler actually does nothing: + self.assertTrue(np.allclose(self.x, self.scaler_dummy_no_minmax.transform(self.x))) + self.assertTrue(np.allclose(self.x, self.scaler_dummy_no_minmax.inverse_transform(self.x))) + self.assertEqual(0, self.scaler_dummy.jac_log_det_inverse_transform(self.x)) + self.assertEqual(0, self.scaler_dummy.jac_log_det(self.x)) + self.assertEqual(0, self.scaler_dummy_no_minmax.jac_log_det_inverse_transform(self.x)) + self.assertEqual(0, self.scaler_dummy_no_minmax.jac_log_det(self.x)) + + # test that the jacobian works on 1d things as well: + self.assertEqual(0, self.scaler_dummy.jac_log_det_inverse_transform(self.x.squeeze())) + self.assertEqual(0, self.scaler_dummy.jac_log_det(self.x.squeeze())) + self.assertEqual(0, self.scaler_dummy_no_minmax.jac_log_det_inverse_transform(self.x.squeeze())) + self.assertEqual(0, self.scaler_dummy_no_minmax.jac_log_det(self.x.squeeze())) + + def test_torch(self): + # same as test but using torch input + if has_torch: + x_torch = torch.from_numpy(self.x) + for scaler in self.list_scalers: + scaler.fit(x_torch) + self.assertEqual(x_torch.shape, scaler.inverse_transform(scaler.transform(x_torch)).shape) + self.assertTrue(np.allclose(self.x, np.array(scaler.inverse_transform(scaler.transform(x_torch))))) + self.assertAlmostEqual(scaler.jac_log_det(x_torch), + scaler.jac_log_det_inverse_transform(scaler.transform(x_torch)), delta=1e-7) + + # test dummy scaler actually does nothing: + self.assertTrue(np.allclose(x_torch, self.scaler_dummy_no_minmax.transform(x_torch))) + self.assertTrue(np.allclose(x_torch, self.scaler_dummy_no_minmax.inverse_transform(x_torch))) + self.assertEqual(0, self.scaler_dummy.jac_log_det_inverse_transform(x_torch)) + self.assertEqual(0, 
self.scaler_dummy.jac_log_det(x_torch))
+            self.assertEqual(0, self.scaler_dummy_no_minmax.jac_log_det_inverse_transform(x_torch))
+            self.assertEqual(0, self.scaler_dummy_no_minmax.jac_log_det(x_torch))
+
+            # test that the jacobian works on 1d things as well:
+            self.assertEqual(0, self.scaler_dummy.jac_log_det_inverse_transform(x_torch.squeeze()))
+            self.assertEqual(0, self.scaler_dummy.jac_log_det(x_torch.squeeze()))
+            self.assertEqual(0, self.scaler_dummy_no_minmax.jac_log_det_inverse_transform(x_torch.squeeze()))
+            self.assertEqual(0, self.scaler_dummy_no_minmax.jac_log_det(x_torch.squeeze()))
+
+    def test_jacobian_difference(self):
+        # the values of the jacobian log det do not take into account the linear transformation, since what
+        # really matters is the difference between them for two x values (in an MCMC acceptance rate).
+        # Hence, the difference of the jacobian for the same two points in the original and the transformed space
+        # should be the same.
+        for scaler_minmax, scaler_no_minmax in zip(self.list_scalers_minmax, self.list_scalers_no_minmax):
+            scaler_minmax.fit(self.x)
+            scaler_no_minmax.fit(self.x)
+
+            # the difference of the log det of the jacobian between two points in the original space should be the same
+            self.assertAlmostEqual(
+                scaler_minmax.jac_log_det(self.x) - scaler_minmax.jac_log_det(self.x2),
+                scaler_no_minmax.jac_log_det(self.x) - scaler_no_minmax.jac_log_det(self.x2),
+                delta=1e-7)
+
+            # the difference of the log det of the jacobian between two points corresponding to the same two points
+            # in the original space (whether or not the linear rescaling is applied) should be the same
+            self.assertAlmostEqual(
+                scaler_minmax.jac_log_det_inverse_transform(scaler_minmax.transform(self.x)) -
+                scaler_minmax.jac_log_det_inverse_transform(scaler_minmax.transform(self.x2)),
+                scaler_no_minmax.jac_log_det_inverse_transform(scaler_no_minmax.transform(self.x)) -
+                scaler_no_minmax.jac_log_det_inverse_transform(scaler_no_minmax.transform(self.x2)),
+                delta=1e-7)
+
+    def test_errors(self):
+        with self.assertRaises(RuntimeError):
+            self.scaler_mixed.jac_log_det(np.array([[1.1, 2.2], [3.3, 4.4]]))
+
+
 if __name__ == '__main__':
     unittest.main()
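
A minimal usage sketch of the new DrawFromPrior and GenerateFromJournal utilities exercised by the tests above. This is illustrative only, not part of the patch: the toy Gaussian model, seeds and sample sizes are arbitrary, and only calls that appear in the tests are used.

from abcpy.backends import BackendDummy
from abcpy.continuousmodels import Normal, Uniform
from abcpy.inferences import DrawFromPrior
from abcpy.output import GenerateFromJournal

# toy Gaussian model with uniform priors on mean and standard deviation
mu = Uniform([[-5.0], [5.0]], name='mu')
sigma = Uniform([[0.0], [10.0]], name='sigma')
model = Normal([mu, sigma])

backend = BackendDummy()

# draw parameter values from the prior and store them in a Journal
sampler = DrawFromPrior([model], backend, seed=1)
journal = sampler.sample(100)

# draw (parameter, simulation) pairs, e.g. as training data for statistics learning
parameters, simulations = sampler.sample_par_sim_pairs(100, 1)
print(parameters.shape, simulations.shape)  # (100, 2), (100, 1, 1)

# re-simulate from the parameter values stored in an existing journal
generator = GenerateFromJournal([model], backend, seed=2)
parameters, simulations, weights = generator.generate(journal, n_samples_per_param=3)
print(simulations.shape)  # (100, 3, 1)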