From 285f1fd0ef2d2f2c85516658ecf2cfdc96dc6de9 Mon Sep 17 00:00:00 2001 From: benedict-96 Date: Fri, 14 Jun 2024 12:02:31 +0200 Subject: [PATCH 01/56] Added Simple Solvers. --- Project.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/Project.toml b/Project.toml index 6807607e8..b69dee8f2 100644 --- a/Project.toml +++ b/Project.toml @@ -24,6 +24,7 @@ Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f" +SimpleSolvers = "36b790f5-6434-4fbe-b711-1f64a1e2f6a2" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" From 31c07049de78ce469c60e4d88c3d89bd41092d13 Mon Sep 17 00:00:00 2001 From: benedict-96 Date: Mon, 17 Jun 2024 16:46:31 +0200 Subject: [PATCH 02/56] Added references to relevant structs. --- src/architectures/transformer_integrator.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/architectures/transformer_integrator.jl b/src/architectures/transformer_integrator.jl index 666deed82..1dd44ba7a 100644 --- a/src/architectures/transformer_integrator.jl +++ b/src/architectures/transformer_integrator.jl @@ -1,5 +1,5 @@ @doc raw""" -Encompasses various transformer architectures, such as the structure-preserving transformer and the linear symplectic transformer. +Encompasses various transformer architectures, such as the [`VolumePreservingTransformmer`](@ref) and the [`LinearSymplecticTransformer`](@ref). """ abstract type TransformerIntegrator <: Architecture end From d153b013918380baefeada012b87e175cb8072e7 Mon Sep 17 00:00:00 2001 From: benedict-96 Date: Mon, 17 Jun 2024 16:50:06 +0200 Subject: [PATCH 03/56] Moved definition of losses to after architectures since we first need encoder and decoder. --- src/GeometricMachineLearning.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/GeometricMachineLearning.jl b/src/GeometricMachineLearning.jl index acba20ec9..f205216f5 100644 --- a/src/GeometricMachineLearning.jl +++ b/src/GeometricMachineLearning.jl @@ -49,8 +49,6 @@ module GeometricMachineLearning include("data_loader/data_loader.jl") - include("loss/losses.jl") - # INCLUDE ARRAYS include("arrays/skew_symmetric.jl") include("arrays/symmetric.jl") @@ -286,6 +284,8 @@ module GeometricMachineLearning include("architectures/default_architecture.jl") + include("loss/losses.jl") + export DataLoader, onehotbatch, accuracy export Batch, optimize_for_one_epoch! include("data_loader/tensor_assign.jl") From 660c124fe14b5455feef9e977b57c38910320240 Mon Sep 17 00:00:00 2001 From: benedict-96 Date: Mon, 17 Jun 2024 16:50:37 +0200 Subject: [PATCH 04/56] Added description of symplectic encoder and symplectic decoder. --- src/architectures/symplectic_autoencoder.jl | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/architectures/symplectic_autoencoder.jl b/src/architectures/symplectic_autoencoder.jl index cb54c6518..1042bd687 100644 --- a/src/architectures/symplectic_autoencoder.jl +++ b/src/architectures/symplectic_autoencoder.jl @@ -40,6 +40,11 @@ struct SymplecticAutoencoder{EncoderInit, DecoderInit, AT} <: SymplecticCompress activation::AT end +""" +The `NonLinearSymplecticEncoder`. + +This is typically generated by calling [`encoder`](@ref) on an instance of [`SymplecticAutoencoder`](@ref). 
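+
+A rough usage sketch (the dimensions 100 and 10 below are chosen arbitrarily for illustration):
+
+```julia
+using GeometricMachineLearning
+
+# a symplectic autoencoder compressing a 100-dimensional system to 10 dimensions
+nn = NeuralNetwork(SymplecticAutoencoder(100, 10))
+
+# the encoder part of the network, itself a neural network
+nn_encoder = encoder(nn)
+```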
+""" struct NonLinearSymplecticEncoder{AT} <: SymplecticEncoder full_dim::Int reduced_dim::Int @@ -49,6 +54,11 @@ struct NonLinearSymplecticEncoder{AT} <: SymplecticEncoder activation::AT end +""" +The `NonLinearSymplecticDecoder`. + +This is typically generated by calling [`decoder`](@ref) on an instance of [`SymplecticAutoencoder`](@ref). +""" struct NonLinearSymplecticDecoder{AT} <: SymplecticDecoder full_dim::Int reduced_dim::Int From 0efa87eebb3650763b6cd73fc96896fa2b62a51e Mon Sep 17 00:00:00 2001 From: benedict-96 Date: Mon, 17 Jun 2024 16:53:02 +0200 Subject: [PATCH 05/56] Cleaned up existing docstrings a bit and added missing ones for SymplecticEncoder and SymplecticDecoder. --- src/architectures/autoencoder.jl | 44 ++++++++++++++++++++++++++++---- 1 file changed, 39 insertions(+), 5 deletions(-) diff --git a/src/architectures/autoencoder.jl b/src/architectures/autoencoder.jl index 45e606c1b..cb6a1bc16 100644 --- a/src/architectures/autoencoder.jl +++ b/src/architectures/autoencoder.jl @@ -1,6 +1,4 @@ @doc raw""" -## The autoencoder architecture - An autoencoder [goodfellow2016deep](@cite) is a neural network consisting of an encoder ``\Psi^e`` and a decoder ``\Psi^d``. In the simplest case they are trained on some data set ``\mathcal{D}`` to reduce the following error: ```math @@ -9,26 +7,48 @@ An autoencoder [goodfellow2016deep](@cite) is a neural network consisting of an which we call the *reconstruction error* or *autoencoder error* (see the docs for [AutoEncoderLoss](@ref)) and ``||\cdot||`` is some norm. -## Implementation details. +# Implementation Abstract `AutoEncoder` type. If a custom `<:AutoEncoder` architecture is implemented it should have the fields `full_dim`, `reduced_dim`, `n_encoder_blocks` and `n_decoder_blocks`. Further the routines `encoder`, `decoder`, `encoder_parameters` and `decoder_parameters` should be extended. """ abstract type AutoEncoder <: Architecture end """ -Abstract `Encoder` type. If a custom `<:Encoder` architecture is implemented it should have the fields `full_dim`, `reduced_dim` and `n_encoder_blocks`. +Abstract `Encoder` type. + +See + +# Implementation + +If a custom `<:Encoder` architecture is implemented it should have the fields `full_dim`, `reduced_dim` and `n_encoder_blocks`. """ abstract type Encoder <: Architecture end """ -Abstract `Decoder` type. If a custom `<:Decoder` architecture is implemented it should have the fields `full_dim`, `reduced_dim` and `n_decoder_blocks`. +Abstract `Decoder` type. + +See + +# Implementation + +If a custom `<:Decoder` architecture is implemented it should have the fields `full_dim`, `reduced_dim` and `n_decoder_blocks`. """ abstract type Decoder <: Architecture end abstract type SymplecticCompression <: AutoEncoder end +""" +Abstract `SymplecticEncoder` type. + +See [`Encoder`](@ref) for the super type and [`NonLinearSymplecticEncoder`](@ref) for a derived `struct`. +""" abstract type SymplecticEncoder <: Encoder end +""" +Abstract `SymplecticDecoder` type. + +See [`Decoder`](@ref) for the super type and [`NonLinearSymplecticDecoder`](@ref) for a derived `struct`. +""" abstract type SymplecticDecoder <: Decoder end const SymplecticDimensionChange = Union{SymplecticCompression, SymplecticEncoder, SymplecticDecoder} @@ -108,10 +128,24 @@ function Chain(arch::AutoEncoder) Chain(encoder_model(arch).layers..., decoder_model(arch).layers...) end +""" + encoder(nn::NeuralNetwork{<:AutoEncoder}) + +Obtain the *encoder* from a [`AutoEncoder`](@ref) neural network. 
+ +The input is a neural network and the output is as well. +""" function encoder(nn::NeuralNetwork{<:AutoEncoder}) NeuralNetwork(UnknownEncoder(nn.architecture.full_dim, nn.architecture.reduced_dim, nn.architecture.n_encoder_blocks), encoder_model(nn.architecture), encoder_parameters(nn), get_backend(nn)) end +""" + decoder(nn::NeuralNetwork{<:AutoEncoder}) + +Obtain the *decoder* from a [`AutoEncoder`](@ref) neural network. + +The input is a neural network and the output is as well. +""" function decoder(nn::NeuralNetwork{<:AutoEncoder}) NeuralNetwork(UnknownDecoder(nn.architecture.full_dim, nn.architecture.reduced_dim, nn.architecture.n_encoder_blocks), decoder_model(nn.architecture), decoder_parameters(nn), get_backend(nn)) end From d0931e6bdda827e59bae42b9e81681f966640665 Mon Sep 17 00:00:00 2001 From: benedict-96 Date: Mon, 17 Jun 2024 16:54:09 +0200 Subject: [PATCH 06/56] Made addition between a skew-symmetric matrix and a standard matrix work on gpu. --- src/arrays/skew_symmetric.jl | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/src/arrays/skew_symmetric.jl b/src/arrays/skew_symmetric.jl index 83409f2e8..ea680e262 100644 --- a/src/arrays/skew_symmetric.jl +++ b/src/arrays/skew_symmetric.jl @@ -77,20 +77,41 @@ function SkewSymMatrix(S::AbstractMatrix{T}) where {T} SkewSymMatrix(S_vec, n) end -function Base.getindex(A::SkewSymMatrix, i::Int, j::Int) +function return_element(S::AbstractVector{T}, i::Int, j::Int) where T if j == i - return zero(eltype(A)) + return zero(T) end if i > j - return A.S[(i-2) * (i-1) ÷ 2 + j] + return S[(i-2) * (i-1) ÷ 2 + j] end - return - A.S[ (j-2) * (j-1) ÷ 2 + i] + return - S[ (j-2) * (j-1) ÷ 2 + i] +end + +function Base.getindex(A::SkewSymMatrix, i::Int, j::Int) + return_element(A.S, i, j) end Base.parent(A::SkewSymMatrix) = A.S Base.size(A::SkewSymMatrix) = (A.n,A.n) +@kernel function addition_kernel!(C::AbstractMatrix, S::AbstractVector, B::AbstractMatrix) + i, j = @index(Global, NTuple) + C[i, j] = return_element(S, i, j) + B[i, j] +end + +function Base.:+(A::SkewSymMatrix{T}, B::AbstractMatrix{T}) where T + @assert size(A) == size(B) + backend = KernelAbstractions.get_backend(B) + addition! = addition_kernel!(backend) + C = KernelAbstractions.allocate(backend, T, size(A)...) + addition!(C, A.S, B; ndrange = size(A)) + + C +end + +Base.:+(B::AbstractMatrix, A::SkewSymMatrix) = B + A + function Base.:+(A::SkewSymMatrix, B::SkewSymMatrix) @assert A.n == B.n SkewSymMatrix(A.S + B.S, A.n) @@ -190,7 +211,7 @@ function Base.:*(A::SkewSymMatrix{T}, B::AbstractMatrix{T}) where T end @kernel function skew_mat_mul_kernel!(C::AbstractMatrix{T}, S::AbstractVector{T}, B::AbstractMatrix{T}, n) where T - i,j = @index(Global, NTuple) + i, j = @index(Global, NTuple) tmp_sum = zero(T) for k = 1:(i-1) From 80ffb52b6b2fe897b878e540f75264cfa9390dc2 Mon Sep 17 00:00:00 2001 From: benedict-96 Date: Mon, 17 Jun 2024 16:54:40 +0200 Subject: [PATCH 07/56] Made addition between StiefelLieAlgHorMatrix and standard arrays work on gpu. 
--- src/arrays/stiefel_lie_algebra_horizontal.jl | 24 +++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/src/arrays/stiefel_lie_algebra_horizontal.jl b/src/arrays/stiefel_lie_algebra_horizontal.jl index 54b9c764b..09709e69a 100644 --- a/src/arrays/stiefel_lie_algebra_horizontal.jl +++ b/src/arrays/stiefel_lie_algebra_horizontal.jl @@ -121,7 +121,20 @@ function Base.:*(A::StiefelLieAlgHorMatrix, α::Real) StiefelLieAlgHorMatrix( α*A.A, α*A.B, A.N, A.n) end -Base.:*(α::Real, A::StiefelLieAlgHorMatrix) = A*α +function Base.:+(B::StiefelLieAlgHorMatrix, A::AbstractMatrix) + @assert size(A) == size(B) + + C = copy(A) + @views C[1:B.n, 1:B.n] .= B.A + A[1:B.n, 1:B.n] + @views C[(B.n+1):B.N, 1:B.n] .= B.B + A[(B.n+1):B.N, 1:B.n] + @views C[1:B.n, (B.n+1):B.N] .= A[1:B.n, (B.n+1):B.N] - B.B' + + C +end + +Base.:+(A::AbstractMatrix, B::StiefelLieAlgHorMatrix) = B + A + +Base.:*(α::Real, A::StiefelLieAlgHorMatrix) = A * α function Base.zeros(::Type{StiefelLieAlgHorMatrix{T}}, N::Integer, n::Integer) where T StiefelLieAlgHorMatrix( @@ -256,6 +269,15 @@ function assign!(A::AbstractArray, B::AbstractArray) nothing end +function Base.one(B::StiefelLieAlgHorMatrix{T}) where T + backend = get_backend(B) + oneB = KernelAbstractions.zeros(backend, T, B.N, B.N) + write_ones! = write_ones_kernel!(backend) + write_ones!(oneB; ndrange = B.N) + + oneB +end + function _round(B::StiefelLieAlgHorMatrix; kwargs...) StiefelLieAlgHorMatrix( _round(B.A; kwargs...), From 7bf4f865f9aa956391b06f6d13a8f7dce037f4e2 Mon Sep 17 00:00:00 2001 From: benedict-96 Date: Mon, 17 Jun 2024 16:55:02 +0200 Subject: [PATCH 08/56] Added ReducedLoss. --- src/loss/losses.jl | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/src/loss/losses.jl b/src/loss/losses.jl index 84bc48cec..40661dbbd 100644 --- a/src/loss/losses.jl +++ b/src/loss/losses.jl @@ -81,7 +81,7 @@ This doesn't have any parameters. struct FeedForwardLoss <: NetworkLoss end @doc raw""" -This loss should always be used together with a neural network of type [AutoEncoder](@ref) (and it is also the default for training such a network). +This loss should always be used together with a neural network of type [`AutoEncoder`](@ref) (and it is also the default for training such a network). It simply computes: @@ -97,4 +97,37 @@ end function (loss::AutoEncoderLoss)(model::Union{Chain, AbstractExplicitLayer}, ps::Union{Tuple, NamedTuple}, input) loss(model, ps, input, input) +end + +@doc raw""" + ReducedLoss(encoder, decoder) + +Make an instance of `ReducedLoss` based on an [`Encoder`](@ref) and a [`Decoder`](@ref). + +This loss should be used together with a [`NeuralNetworkIntegrator`](@ref) or [`TransformerIntegrator`](@ref). + +The loss is computed as: + +```math +\mathrm{loss}_{\mathcal{E}, \mathcal{D}}(\mathcal{NN}, \mathrm{input}, \mathrm{output}) = ||\mathcal{D}(\mathcal{NN}(\mathcal{E}(\mathrm{input}))) - \mathrm{output}||, +``` +where ``\mathcal{E}`` is the [`Encoder`](@ref), ``\mathcal{D}`` is the [`Decoder`](@ref). +``\mathcal{NN}`` is the neural network we compute the loss of. +""" +struct ReducedLoss{ET <: NeuralNetwork{<:Encoder}, DT <: NeuralNetwork{<:Decoder}} <: NetworkLoss + encoder::ET + decoder::DT +end + +""" + ReducedLoss(autoencoder) + +Make an instance of `ReducedLoss` based on a neural network of type [`AutoEncoder`](@ref). 
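+
+A rough sketch of how this constructor might be used (the architecture and dimensions are only placeholders for illustration):
+
+```julia
+using GeometricMachineLearning
+
+ae = NeuralNetwork(SymplecticAutoencoder(100, 10))
+
+# this is equivalent to calling ReducedLoss(encoder(ae), decoder(ae))
+loss = ReducedLoss(ae)
+```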
+""" +function ReducedLoss(autoencoder::NeuralNetwork{<:AutoEncoder}) + ReducedLoss(encoder(autoencoder), decoder(autoencoder)) +end + +function (loss::ReducedLoss)(model::Chain, params::Tuple, input::CT, output::CT) where {AT <:AbstractArray, CT <: NamedTuple{(:q, :p), Tuple{AT, AT}}} + _compute_loss(loss.decoder(model(loss.encoder(input), params)), output) end \ No newline at end of file From e44664a77ff3039ed2a800923f72f9002a8e01be Mon Sep 17 00:00:00 2001 From: benedict-96 Date: Mon, 17 Jun 2024 16:55:35 +0200 Subject: [PATCH 09/56] Fixed problem which meant global section for the StiefelManifold didn't work on gpu. --- src/optimizers/manifold_related/global_sections.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/optimizers/manifold_related/global_sections.jl b/src/optimizers/manifold_related/global_sections.jl index 2b57b67f1..95105df98 100644 --- a/src/optimizers/manifold_related/global_sections.jl +++ b/src/optimizers/manifold_related/global_sections.jl @@ -237,8 +237,8 @@ function update_section!(Λ⁽ᵗ⁻¹⁾::GlobalSection{T, MT}, B⁽ᵗ⁻¹⁾ N, n = B⁽ᵗ⁻¹⁾.N, B⁽ᵗ⁻¹⁾.n expB = retraction(B⁽ᵗ⁻¹⁾) apply_section!(expB, Λ⁽ᵗ⁻¹⁾, expB) - Λ⁽ᵗ⁻¹⁾.Y.A .= @view expB[:, 1:n] - Λ⁽ᵗ⁻¹⁾.λ .= @view expB[:, (n+1):N] + Λ⁽ᵗ⁻¹⁾.Y.A .= @view expB.A[:, 1:n] + Λ⁽ᵗ⁻¹⁾.λ .= @view expB.A[:, (n+1):N] nothing end From 3a3a52a581f3b7eb6e58f59f72c77adcfb17bac9 Mon Sep 17 00:00:00 2001 From: benedict-96 Date: Mon, 17 Jun 2024 16:56:02 +0200 Subject: [PATCH 10/56] Added test for addition with an element of StiefelLieAlgHorMatrix. --- test/arrays/array_tests.jl | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/arrays/array_tests.jl b/test/arrays/array_tests.jl index ace01c918..f2baca095 100644 --- a/test/arrays/array_tests.jl +++ b/test/arrays/array_tests.jl @@ -70,12 +70,16 @@ function stiefel_lie_alg_add_sub_test(N, n) S₁ = StiefelLieAlgHorMatrix(W₁,n) W₂ = SkewSymMatrix(rand(N,N)) S₂ = StiefelLieAlgHorMatrix(W₂,n) + A = rand(N, N) S₃ = S₁ + S₂ S₄ = S₁ - S₂ @test typeof(S₃) <: StiefelLieAlgHorMatrix @test typeof(S₄) <: StiefelLieAlgHorMatrix @test all(abs.(projection(W₁ + W₂) .- S₃) .< 1e-10) @test all(abs.(projection(W₁ - W₂) .- S₄) .< 1e-10) + # check custom addition + @test S₁ + A ≈ Matrix(S₁) + A + @test A + S₁ ≈ Matrix(S₁) + A end From b566565d5794c4daee8588bbe9d455b5f37d0409 Mon Sep 17 00:00:00 2001 From: benedict-96 Date: Mon, 17 Jun 2024 16:57:14 +0200 Subject: [PATCH 11/56] Fixed script and using MakieCairo instead of Plots now. 
--- .../symplectic_autoencoders/online_sympnet.jl | 84 +++++++++++++++++++ 1 file changed, 84 insertions(+) create mode 100644 scripts/symplectic_autoencoders/online_sympnet.jl diff --git a/scripts/symplectic_autoencoders/online_sympnet.jl b/scripts/symplectic_autoencoders/online_sympnet.jl new file mode 100644 index 000000000..67c91f915 --- /dev/null +++ b/scripts/symplectic_autoencoders/online_sympnet.jl @@ -0,0 +1,84 @@ +using CUDA +using GeometricIntegrators: integrate, ImplicitMidpoint +using CairoMakie +using GeometricProblems.TodaLattice: hodeproblem +using GeometricMachineLearning +import Random + +backend = CUDABackend() + +pr = hodeproblem(; tspan = (0.0, 100.)) +sol = integrate(pr, ImplicitMidpoint()) +dl_cpu = DataLoader(sol; autoencoder = true) +dl = DataLoader(( + q = reshape(dl_cpu.input.q |> cu, size(dl_cpu.input.q, 1), size(dl_cpu.input.q, 3)), + p = reshape(dl_cpu.input.p |> cu, size(dl_cpu.input.p, 1), size(dl_cpu.input.p, 3))); + autoencoder = true) + +const reduced_dim = 2 + +psd_arch = PSDArch(dl.input_dim, reduced_dim) +sae_arch = SymplecticAutoencoder(dl.input_dim, reduced_dim; n_encoder_blocks = 4, n_decoder_blocks = 4, n_encoder_layers = 2, n_decoder_layers = 2) + +Random.seed!(123) +psd_nn = NeuralNetwork(psd_arch, backend) +sae_nn = NeuralNetwork(sae_arch, backend) + +const n_epochs = 512 +const batch_size = 512 + +o = Optimizer(sae_nn, AdamOptimizer()) + +psd_error = solve!(psd_nn, dl) +sae_error = o(sae_nn, dl, Batch(batch_size), n_epochs) + +hline([psd_error]; color = 2, label = "PSD error") +lines!(sae_error; color = 3, label = "SAE error", xlabel = "epoch", ylabel = "training error") + +const mtc = GeometricMachineLearning.map_to_cpu +psd_nn_cpu = mtc(psd_nn) +sae_nn_cpu = mtc(sae_nn) +psd_rs = HRedSys(pr, encoder(psd_nn_cpu), decoder(psd_nn_cpu); integrator = ImplicitMidpoint()) +sae_rs = HRedSys(pr, encoder(sae_nn_cpu), decoder(sae_nn_cpu); integrator = ImplicitMidpoint()) + +projection_error(psd_rs) + +projection_error(sae_rs) + +sol_full = integrate_full_system(psd_rs) +sol_psd_reduced = integrate_reduced_system(psd_rs) +sol_sae_reduced = integrate_reduced_system(sae_rs) + +const t_steps = 100 +p_val = lines(sol_full.s.q[t_steps], label = "Implicit Midpoint") +lines!(p_val, psd_rs.decoder((q = sol_psd_reduced.s.q[t_steps], p = sol_psd_reduced.s.p[t_steps])).q, label = "PSD") +lines!(p_val, sae_rs.decoder((q = sol_sae_reduced.s.q[t_steps], p = sol_sae_reduced.s.p[t_steps])).q, label = "SAE") + +data_unprocessed = encoder(sae_nn)(dl.input) +data_processed = ( q = reshape(data_unprocessed.q, reduced_dim ÷ 2, length(data_unprocessed.q)), + p = reshape(data_unprocessed.p, reduced_dim ÷ 2, length(data_unprocessed.p)) + ) + +dl_reduced = DataLoader(data_processed; autoencoder = false) +integrator_batch_size = 512 +integrator_train_epochs = 512 + +integrator_nn = NeuralNetwork(GSympNet(reduced_dim; n_layers = 5), backend) +o_integrator = Optimizer(AdamOptimizer(), integrator_nn) + +loss = GeometricMachineLearning.ReducedLoss(encoder(sae_nn), decoder(sae_nn)) +dl_integration = DataLoader((q = reshape(dl.input.q, size(dl.input.q, 1), size(dl.input.q, 3)), + p = reshape(dl.input.p, size(dl.input.p, 1), size(dl.input.p, 3))); + autoencoder = false + ) + +o_integrator(integrator_nn, dl_integration, Batch(integrator_batch_size), integrator_train_epochs, loss) + +ics = (q = mtc(dl_reduced.input.q[:, 1]), p = mtc(dl_reduced.input.p[:, 1])) +time_series = iterate(mtc(integrator_nn), ics; n_points = t_steps) +prediction = (q = time_series.q[:, end], p = time_series.p[:, 
end]) +sol = decoder(sae_nn_cpu)(prediction) + +lines!(p_val, sol.q; label = "Neural Network Integrator") + +save("symplectic_autoencoder_validation.png", p_val) \ No newline at end of file From b86c428ec250cf62f75a211aa4dc4be329990e47 Mon Sep 17 00:00:00 2001 From: benedict-96 Date: Tue, 18 Jun 2024 13:09:35 +0200 Subject: [PATCH 12/56] Updated script. --- .../symplectic_autoencoders/online_sympnet.jl | 74 ++++++++++++++----- 1 file changed, 56 insertions(+), 18 deletions(-) diff --git a/scripts/symplectic_autoencoders/online_sympnet.jl b/scripts/symplectic_autoencoders/online_sympnet.jl index 67c91f915..4b1b0374b 100644 --- a/scripts/symplectic_autoencoders/online_sympnet.jl +++ b/scripts/symplectic_autoencoders/online_sympnet.jl @@ -24,16 +24,28 @@ Random.seed!(123) psd_nn = NeuralNetwork(psd_arch, backend) sae_nn = NeuralNetwork(sae_arch, backend) -const n_epochs = 512 +const n_epochs = 4096 const batch_size = 512 -o = Optimizer(sae_nn, AdamOptimizer()) +sae_method = AdamOptimizerWithDecay(n_epochs) +o = Optimizer(sae_nn, sae_method) psd_error = solve!(psd_nn, dl) sae_error = o(sae_nn, dl, Batch(batch_size), n_epochs) -hline([psd_error]; color = 2, label = "PSD error") -lines!(sae_error; color = 3, label = "SAE error", xlabel = "epoch", ylabel = "training error") +fig = Figure() +ax = Axis(fig[1, 1], xlabel = "epoch", ylabel = "training error") +morange = RGBf(255 / 256, 127 / 256, 14 / 256) +mred = RGBf(214 / 256, 39 / 256, 40 / 256) +mpurple = RGBf(148 / 256, 103 / 256, 189 / 256) +mblue = RGBf(31 / 256, 119 / 256, 180 / 256) +mgreen = RGBf(44 / 256, 160 / 256, 44 / 256) + +hlines!([psd_error]; color = morange, label = "PSD error") +lines!(sae_error; color = mgreen, label = "SAE error") +const text_color = :black +axislegend(; position = (.82, .75), backgroundcolor = :transparent, color = text_color) +save("compare_errors.png", fig) const mtc = GeometricMachineLearning.map_to_cpu psd_nn_cpu = mtc(psd_nn) @@ -41,18 +53,20 @@ sae_nn_cpu = mtc(sae_nn) psd_rs = HRedSys(pr, encoder(psd_nn_cpu), decoder(psd_nn_cpu); integrator = ImplicitMidpoint()) sae_rs = HRedSys(pr, encoder(sae_nn_cpu), decoder(sae_nn_cpu); integrator = ImplicitMidpoint()) -projection_error(psd_rs) +# projection_error(psd_rs) +# +# projection_error(sae_rs) -projection_error(sae_rs) +###################################################################### +# integrate system sol_full = integrate_full_system(psd_rs) sol_psd_reduced = integrate_reduced_system(psd_rs) sol_sae_reduced = integrate_reduced_system(sae_rs) -const t_steps = 100 -p_val = lines(sol_full.s.q[t_steps], label = "Implicit Midpoint") -lines!(p_val, psd_rs.decoder((q = sol_psd_reduced.s.q[t_steps], p = sol_psd_reduced.s.p[t_steps])).q, label = "PSD") -lines!(p_val, sae_rs.decoder((q = sol_sae_reduced.s.q[t_steps], p = sol_sae_reduced.s.p[t_steps])).q, label = "SAE") +###################################################################### + +# train sympnet data_unprocessed = encoder(sae_nn)(dl.input) data_processed = ( q = reshape(data_unprocessed.q, reduced_dim ÷ 2, length(data_unprocessed.q)), @@ -60,11 +74,12 @@ data_processed = ( q = reshape(data_unprocessed.q, reduced_dim ÷ 2, length(dat ) dl_reduced = DataLoader(data_processed; autoencoder = false) +integrator_train_epochs = 4096 integrator_batch_size = 512 -integrator_train_epochs = 512 integrator_nn = NeuralNetwork(GSympNet(reduced_dim; n_layers = 5), backend) -o_integrator = Optimizer(AdamOptimizer(), integrator_nn) +integrator_method = AdamOptimizerWithDecay(integrator_train_epochs) +o_integrator 
= Optimizer(integrator_method, integrator_nn) loss = GeometricMachineLearning.ReducedLoss(encoder(sae_nn), decoder(sae_nn)) dl_integration = DataLoader((q = reshape(dl.input.q, size(dl.input.q, 1), size(dl.input.q, 3)), @@ -74,11 +89,34 @@ dl_integration = DataLoader((q = reshape(dl.input.q, size(dl.input.q, 1), size(d o_integrator(integrator_nn, dl_integration, Batch(integrator_batch_size), integrator_train_epochs, loss) -ics = (q = mtc(dl_reduced.input.q[:, 1]), p = mtc(dl_reduced.input.p[:, 1])) -time_series = iterate(mtc(integrator_nn), ics; n_points = t_steps) -prediction = (q = time_series.q[:, end], p = time_series.p[:, end]) -sol = decoder(sae_nn_cpu)(prediction) +const ics = (q = mtc(dl_reduced.input.q[:, 1]), p = mtc(dl_reduced.input.p[:, 1])) + +###################################################################### + +# plot validation +function plot_validation(t_steps::Integer=100) + fig_val = Figure() + ax_val = Axis(fig_val[1, 1]) + lines!(ax_val, sol_full.s.q[t_steps], label = "Implicit Midpoint", color = mblue) + lines!(ax_val, psd_rs.decoder((q = sol_psd_reduced.s.q[t_steps], p = sol_psd_reduced.s.p[t_steps])).q, + label = "PSD", color = morange) + lines!(ax_val, sae_rs.decoder((q = sol_sae_reduced.s.q[t_steps], p = sol_sae_reduced.s.p[t_steps])).q, + label = "SAE", color = mgreen) + + name = "symplectic_autoencoder_validation_" * string(t_steps) + axislegend(; position = (.82, .75), backgroundcolor = :transparent, color = text_color) + save(name * ".png", fig_val) + + time_series = iterate(mtc(integrator_nn), ics; n_points = t_steps) + prediction = (q = time_series.q[:, end], p = time_series.p[:, end]) + sol = decoder(sae_nn_cpu)(prediction) + + lines!(ax_val, sol.q; label = "Neural Network Integrator", color = mpurple) -lines!(p_val, sol.q; label = "Neural Network Integrator") + axislegend(; position = (.82, .75), backgroundcolor = :transparent, color = text_color) + save(name * "_with_nn_integrator.png", fig_val) +end -save("symplectic_autoencoder_validation.png", p_val) \ No newline at end of file +for t_steps in (10, 100, 200, 300, 400, 500, 600, 700, 800, 900) + plot_validation(t_steps) +end \ No newline at end of file From 365c20d70276cfa4036e58b25b04cc0098f157aa Mon Sep 17 00:00:00 2001 From: benedict-96 Date: Tue, 18 Jun 2024 15:48:35 +0200 Subject: [PATCH 13/56] Fixed integration with qp coordinates. 
--- src/architectures/transformer_integrator.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/architectures/transformer_integrator.jl b/src/architectures/transformer_integrator.jl index 1dd44ba7a..3b9f03acb 100644 --- a/src/architectures/transformer_integrator.jl +++ b/src/architectures/transformer_integrator.jl @@ -37,8 +37,8 @@ function Base.iterate(nn::NeuralNetwork{<:TransformerIntegrator}, ics::NamedTupl start_index = (i - 1) * prediction_window + 1 @views qp_temp = (q = q_valuation[:, start_index:(start_index + seq_length - 1)], p = p_valuation[:, start_index:(start_index + seq_length - 1)]) qp_prediction = nn(qp_temp) - q_valuation[seq_length + (i - 1) * prediction_window, seq_length + i * prediction_window] = qp_prediction.q[:, (seq_length - prediction_window + 1):end] - p_valuation[seq_length + (i - 1) * prediction_window, seq_length + i * prediction_window] = qp_prediction.p[:, (seq_length - prediction_window + 1):end] + q_valuation[:, seq_length + (i - 1) * prediction_window + 1 : seq_length + i * prediction_window] = qp_prediction.q[:, (seq_length - prediction_window + 1):end] + p_valuation[:, seq_length + (i - 1) * prediction_window + 1 : seq_length + i * prediction_window] = qp_prediction.p[:, (seq_length - prediction_window + 1):end] end (q=q_valuation[:, 1:n_points], p=p_valuation[:, 1:n_points]) From 93b796b8d29dafb24e7bb40bae79b88f5431ee4e Mon Sep 17 00:00:00 2001 From: benedict-96 Date: Tue, 18 Jun 2024 15:48:47 +0200 Subject: [PATCH 14/56] Now using linear symplectic transformer. --- scripts/symplectic_autoencoders/online_sympnet.jl | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/scripts/symplectic_autoencoders/online_sympnet.jl b/scripts/symplectic_autoencoders/online_sympnet.jl index 4b1b0374b..6540b99df 100644 --- a/scripts/symplectic_autoencoders/online_sympnet.jl +++ b/scripts/symplectic_autoencoders/online_sympnet.jl @@ -77,7 +77,8 @@ dl_reduced = DataLoader(data_processed; autoencoder = false) integrator_train_epochs = 4096 integrator_batch_size = 512 -integrator_nn = NeuralNetwork(GSympNet(reduced_dim; n_layers = 5), backend) +seq_length = 4 +integrator_nn = NeuralNetwork(LinearSymplecticTransformer(reduced_dim, seq_length; n_sympnet = 3, L = 3), backend) integrator_method = AdamOptimizerWithDecay(integrator_train_epochs) o_integrator = Optimizer(integrator_method, integrator_nn) @@ -87,9 +88,9 @@ dl_integration = DataLoader((q = reshape(dl.input.q, size(dl.input.q, 1), size(d autoencoder = false ) -o_integrator(integrator_nn, dl_integration, Batch(integrator_batch_size), integrator_train_epochs, loss) +o_integrator(integrator_nn, dl_integration, Batch(integrator_batch_size, seq_length), integrator_train_epochs, loss) -const ics = (q = mtc(dl_reduced.input.q[:, 1]), p = mtc(dl_reduced.input.p[:, 1])) +const ics = (q = mtc(dl_reduced.input.q[:, 1:seq_length, 1]), p = mtc(dl_reduced.input.p[:, 1:seq_length, 1])) ###################################################################### @@ -107,7 +108,7 @@ function plot_validation(t_steps::Integer=100) axislegend(; position = (.82, .75), backgroundcolor = :transparent, color = text_color) save(name * ".png", fig_val) - time_series = iterate(mtc(integrator_nn), ics; n_points = t_steps) + time_series = iterate(mtc(integrator_nn), ics; n_points = t_steps, prediction_window = seq_length) prediction = (q = time_series.q[:, end], p = time_series.p[:, end]) sol = decoder(sae_nn_cpu)(prediction) From 9dd02db029a8b17be417f2c841e5e031426b072b Mon Sep 17 00:00:00 2001 From: 
benedict-96 Date: Wed, 19 Jun 2024 12:03:44 +0200 Subject: [PATCH 15/56] Using the transformer for the integration step now. --- scripts/symplectic_autoencoders/online_sympnet.jl | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/scripts/symplectic_autoencoders/online_sympnet.jl b/scripts/symplectic_autoencoders/online_sympnet.jl index 6540b99df..5719c6a36 100644 --- a/scripts/symplectic_autoencoders/online_sympnet.jl +++ b/scripts/symplectic_autoencoders/online_sympnet.jl @@ -78,7 +78,8 @@ integrator_train_epochs = 4096 integrator_batch_size = 512 seq_length = 4 -integrator_nn = NeuralNetwork(LinearSymplecticTransformer(reduced_dim, seq_length; n_sympnet = 3, L = 3), backend) +integrator_architecture = StandardTransformerIntegrator(reduced_dim; transformer_dim = 30, n_blocks = 4, n_heads = 5, L = 3, upscaling_activation = tanh) +integrator_nn = NeuralNetwork(integrator_architecture, backend) integrator_method = AdamOptimizerWithDecay(integrator_train_epochs) o_integrator = Optimizer(integrator_method, integrator_nn) @@ -88,9 +89,12 @@ dl_integration = DataLoader((q = reshape(dl.input.q, size(dl.input.q, 1), size(d autoencoder = false ) +# the regular transformer can't deal with symplectic data! +dl_integration = DataLoader(vcat(dl_integration.input.q, dl_integration.input.p)) o_integrator(integrator_nn, dl_integration, Batch(integrator_batch_size, seq_length), integrator_train_epochs, loss) -const ics = (q = mtc(dl_reduced.input.q[:, 1:seq_length, 1]), p = mtc(dl_reduced.input.p[:, 1:seq_length, 1])) +const ics_nt = (q = mtc(dl_reduced.input.q[:, 1:seq_length, 1]), p = mtc(dl_reduced.input.p[:, 1:seq_length, 1])) +const ics = vcat(ics_nt, ics_nt) ###################################################################### @@ -109,10 +113,11 @@ function plot_validation(t_steps::Integer=100) save(name * ".png", fig_val) time_series = iterate(mtc(integrator_nn), ics; n_points = t_steps, prediction_window = seq_length) - prediction = (q = time_series.q[:, end], p = time_series.p[:, end]) + # prediction = (q = time_series.q[:, end], p = time_series.p[:, end]) + prediction = time.series[:, end] sol = decoder(sae_nn_cpu)(prediction) - lines!(ax_val, sol.q; label = "Neural Network Integrator", color = mpurple) + lines!(ax_val, sol[1:(dl.sys_dim ÷ 2)]; label = "Neural Network Integrator", color = mpurple) axislegend(; position = (.82, .75), backgroundcolor = :transparent, color = text_color) save(name * "_with_nn_integrator.png", fig_val) From a8348453a7b5273f9f75f51fd6091c9edf07e965 Mon Sep 17 00:00:00 2001 From: benedict-96 Date: Wed, 19 Jun 2024 12:04:40 +0200 Subject: [PATCH 16/56] Added a paragraph on how to perform the online phase with a neural network. 
--- docs/src/tutorials/symplectic_autoencoder.md | 57 ++++++++++++++++++-- 1 file changed, 53 insertions(+), 4 deletions(-) diff --git a/docs/src/tutorials/symplectic_autoencoder.md b/docs/src/tutorials/symplectic_autoencoder.md index ad12f57fb..bb407230d 100644 --- a/docs/src/tutorials/symplectic_autoencoder.md +++ b/docs/src/tutorials/symplectic_autoencoder.md @@ -104,7 +104,7 @@ hline([psd_error]; color = 2, label = "PSD error") plot!(sae_error; color = 3, label = "SAE error", xlabel = "epoch", ylabel = "training error") ``` -## The online stage +## The online stage with a standard integrator After having trained our neural network we can now evaluate it in the online stage of reduced complexity modeling: @@ -126,14 +126,63 @@ sol_full = integrate_full_system(psd_rs) sol_psd_reduced = integrate_reduced_system(psd_rs) sol_sae_reduced = integrate_reduced_system(sae_rs) -const t_step = 100 +const t_steps = 100 plot(sol_full.s.q[t_step], label = "Implicit Midpoint") -plot!(psd_rs.decoder((q = sol_psd_reduced.s.q[t_step], p = sol_psd_reduced.s.p[t_step])).q, label = "PSD") -plot!(sae_rs.decoder((q = sol_sae_reduced.s.q[t_step], p = sol_sae_reduced.s.p[t_step])).q, label = "SAE") +plot!(psd_rs.decoder((q = sol_psd_reduced.s.q[t_steps], p = sol_psd_reduced.s.p[t_steps])).q, label = "PSD") +plot!(sae_rs.decoder((q = sol_sae_reduced.s.q[t_steps], p = sol_sae_reduced.s.p[t_steps])).q, label = "SAE") ``` We can see that the autoencoder approach has much more approximation capabilities than the psd approach. The jiggly lines are due to the fact that training was done for only 8 epochs. +## The online stage with a neural network + +Instead of using a standard integrator we can also use a neural network that is trained on the reduced data. For this: + +```@example toda_lattice +data_unprocessed = encoder(sae_nn)(dl.input) +data_processed = ( q = reshape(data_unprocessed.q, reduced_dim ÷ 2, length(data_unprocessed.q)), + p = reshape(data_unprocessed.p, reduced_dim ÷ 2, length(data_unprocessed.p)) + ) + +dl_reduced = DataLoader(data_processed; autoencoder = false) +integrator_batch_size = 128 +integrator_train_epochs = 4 + +integrator_nn = NeuralNetwork(GSympNet(reduced_dim)) +o_integrator = Optimizer(AdamOptimizer(Float64), integrator_nn) +struct ReducedLoss{ET, DT} <: GeometricMachineLearning.NetworkLoss + encoder::ET + decoder::DT +end +function (loss::ReducedLoss)(model::Chain, params::Tuple, input::CT, output::CT) where {AT <:Array, CT <: NamedTuple{(:q, :p), Tuple{AT, AT}}} + GeometricMachineLearning._compute_loss(loss.decoder(model(loss.encoder(input), params)), output) +end + +loss = ReducedLoss(encoder(sae_nn), decoder(sae_nn)) +dl_integration = DataLoader((q = reshape(dl.input.q, size(dl.input.q, 1), size(dl.input.q, 3)), + p = reshape(dl.input.p, size(dl.input.p, 1), size(dl.input.p, 3))); + autoencoder = false + ) + +o_integrator(integrator_nn, dl_integration, Batch(integrator_batch_size), integrator_train_epochs, loss) + +nothing # hide +``` + +We can now evaluate the solution: + +```@example toda_lattice +ics = (q = dl_reduced.input.q[:, 1], p = dl_reduced.input.p[:, 1]) +time_series = iterate(integrator_nn, ics; n_points = t_steps) +prediction = (q = time_series.q[:, end], p = time_series.p[:, end]) +sol = decoder(sae_nn)(prediction) + +plot!(sol.q; label = "Neural Network Integrator") +``` + + + + ## References ```@bibliography Pages = [] From 6c9cd9cda35b8fad0f8ab1455a6108c8e07e21ed Mon Sep 17 00:00:00 2001 From: benedict-96 Date: Wed, 19 Jun 2024 12:05:07 +0200 Subject: [PATCH 17/56] 
Added CairoMakie. --- scripts/Project.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/Project.toml b/scripts/Project.toml index d4718b8b3..1b6c03731 100644 --- a/scripts/Project.toml +++ b/scripts/Project.toml @@ -1,5 +1,6 @@ [deps] CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0" ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" GeometricEquations = "c85262ba-a08a-430a-b926-d29770767bf2" GeometricIntegrators = "dcce2d33-59f6-5b8d-9047-0defad88ae06" From 89fdc79f467522df4a76332545d55c3d50697e47 Mon Sep 17 00:00:00 2001 From: benedict-96 Date: Wed, 19 Jun 2024 19:00:16 +0200 Subject: [PATCH 18/56] Moved reduced order modeling section before architectures. --- docs/make.jl | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/make.jl b/docs/make.jl index 7f3b3728c..dfccfed37 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -177,6 +177,12 @@ makedocs(; "Multihead Attention" => "layers/multihead_attention_layer.md", "Linear Symplectic Attention" => "layers/linear_symplectic_attention.md", ], + "Reduced Order Modelling" =>[ + "POD and Autoencoders" => "reduced_order_modeling/autoencoder.md", + "PSD and Symplectic Autoencoders" => "reduced_order_modeling/symplectic_autoencoder.md", + "Kolmogorov n-width" => "reduced_order_modeling/kolmogorov_n_width.md", + "Projection and Reduction Error" => "reduced_order_modeling/projection_reduction_errors.md", + ], "Architectures" => [ "Symplectic Autoencoders" => "architectures/symplectic_autoencoder.md", "Neural Network Integrators" => "architectures/neural_network_integrators.md", @@ -190,12 +196,6 @@ makedocs(; "Routines" => "data_loader/data_loader.md", "Snapshot matrix & tensor" => "data_loader/snapshot_matrix.md", ], - "Reduced Order Modelling" =>[ - "POD and Autoencoders" => "reduced_order_modeling/autoencoder.md", - "PSD and Symplectic Autoencoders" => "reduced_order_modeling/symplectic_autoencoder.md", - "Kolmogorov n-width" => "reduced_order_modeling/kolmogorov_n_width.md", - "Projection and Reduction Error" => "reduced_order_modeling/projection_reduction_errors.md", - ], "Tutorials" =>[ "Sympnets" => "tutorials/sympnet_tutorial.md", "Symplectic Autoencoders" => "tutorials/symplectic_autoencoder.md", From 242120c5d5193915e869481f121abf31f98b9326 Mon Sep 17 00:00:00 2001 From: benedict-96 Date: Wed, 19 Jun 2024 19:37:38 +0200 Subject: [PATCH 19/56] Added a tikz picture for the offline online phase. 
--- docs/src/tikz/Makefile | 2 ++ docs/src/tikz/offline_online.tex | 33 ++++++++++++++++++++++++++ docs/src/tikz/offline_online_dark.tex | 34 +++++++++++++++++++++++++++ 3 files changed, 69 insertions(+) create mode 100644 docs/src/tikz/offline_online.tex create mode 100644 docs/src/tikz/offline_online_dark.tex diff --git a/docs/src/tikz/Makefile b/docs/src/tikz/Makefile index 65f5f04c6..2af6f4f97 100644 --- a/docs/src/tikz/Makefile +++ b/docs/src/tikz/Makefile @@ -57,6 +57,8 @@ png: pdftocairo -png -r $(res1) -transp -singlefile linear_symplectic_transformer_dark.pdf linear_symplectic_transformer_dark pdftocairo -png -r $(res1) -transp -singlefile skew_sym_visualization.pdf skew_sym_visualization pdftocairo -png -r $(res1) -transp -singlefile skew_sym_visualization_dark.pdf skew_sym_visualization_dark + pdftocairo -png -r $(res2) -transp -singlefile offline_online.pdf offline_online + pdftocairo -png -r $(res2) -transp -singlefile offline_online_dark.pdf offline_online_dark logo: cp logo_with_name.png ../assets/logo.png diff --git a/docs/src/tikz/offline_online.tex b/docs/src/tikz/offline_online.tex new file mode 100644 index 000000000..54fa400e0 --- /dev/null +++ b/docs/src/tikz/offline_online.tex @@ -0,0 +1,33 @@ +\documentclass[tikz]{standalone} +\definecolor{morange}{RGB}{255,127,14} +\definecolor{mblue}{RGB}{31,119,180} +\definecolor{mred}{RGB}{214,39,40} +\definecolor{mpurple}{RGB}{148,103,189} +\definecolor{mgreen}{RGB}{44,160,44} + +\newcommand{\pro}{\ensuremath{\mathcal{P}}} +\newcommand{\rec}{\ensuremath{\mathcal{R}}} + +\begin{document} +\begin{tikzpicture}[ + squarednodeF/.style={circle, draw=black!60, fill=gray!5, very thick, minimum size=10mm}, + squarednodeR/.style={rectangle, draw=black!60, fill=cyan!5, very thick, minimum size=5mm}, + ] + %Nodes + \node[squarednodeF] (full0) {$\mathrm{FOM}^0$}; + \node[squarednodeR] (reduced0) [below of=full0, yshift=-1cm] {$\mathrm{ROM}^0$}; + \node[squarednodeR] (reduced1) [right of=reduced0, xshift=2cm] {$\mathrm{ROM}^1$}; + \node[squarednodeF] (full1) [above of=reduced1, yshift=1cm] {$\mathrm{FOM}^1$}; + \node[squarednodeR] (reduced2) [right of=reduced1, xshift=2cm] {$\mathrm{ROM}^2$}; + \node (reduced3) [right of=reduced2, xshift=2cm] {$\cdots$}; + \node (full2) [above of=reduced2, yshift=1cm] {$\cdots$}; + + %Lines + \draw[->, mred] (full0.south) to node[right] {\pro} (reduced0.north); + \draw[->] (reduced0.east) to node[above] {$\mathcal{NN}$} (reduced1.west); + \draw[->, mred] (reduced1.north) to node[left] {\rec} (full1.south); + \draw[->, mred] (reduced2.north) to node[left] {\rec} (full2.south); + \draw[->] (reduced1.east) to node[above] {$\mathcal{NN}$} (reduced2.west); + \draw[->] (reduced2.east) to node[above] {$\mathcal{NN}$} (reduced3.west); + \end{tikzpicture} +\end{document} diff --git a/docs/src/tikz/offline_online_dark.tex b/docs/src/tikz/offline_online_dark.tex new file mode 100644 index 000000000..f3a994ff2 --- /dev/null +++ b/docs/src/tikz/offline_online_dark.tex @@ -0,0 +1,34 @@ +\documentclass[tikz]{standalone} +\definecolor{morange}{RGB}{255,127,14} +\definecolor{mblue}{RGB}{31,119,180} +\definecolor{mred}{RGB}{214,39,40} +\definecolor{mpurple}{RGB}{148,103,189} +\definecolor{mgreen}{RGB}{44,160,44} + +\newcommand{\pro}{\ensuremath{\mathcal{P}}} +\newcommand{\rec}{\ensuremath{\mathcal{R}}} + +\begin{document} +\begin{tikzpicture}[ + squarednodeF/.style={circle, draw=black!60, fill=gray!5, very thick, minimum size=10mm}, + squarednodeR/.style={rectangle, draw=black!60, fill=cyan!5, very thick, minimum size=5mm}, + 
color = white + ] + %Nodes + \node[squarednodeF] (full0) {$\mathrm{FOM}^0$}; + \node[squarednodeR] (reduced0) [below of=full0, yshift=-1cm] {$\mathrm{ROM}^0$}; + \node[squarednodeR] (reduced1) [right of=reduced0, xshift=2cm] {$\mathrm{ROM}^1$}; + \node[squarednodeF] (full1) [above of=reduced1, yshift=1cm] {$\mathrm{FOM}^1$}; + \node[squarednodeR] (reduced2) [right of=reduced1, xshift=2cm] {$\mathrm{ROM}^2$}; + \node (reduced3) [right of=reduced2, xshift=2cm] {$\cdots$}; + \node (full2) [above of=reduced2, yshift=1cm] {$\cdots$}; + + %Lines + \draw[->, mred] (full0.south) to node[right] {\pro} (reduced0.north); + \draw[->] (reduced0.east) to node[above] {$\mathcal{NN}$} (reduced1.west); + \draw[->, mred] (reduced1.north) to node[left] {\rec} (full1.south); + \draw[->, mred] (reduced2.north) to node[left] {\rec} (full2.south); + \draw[->] (reduced1.east) to node[above] {$\mathcal{NN}$} (reduced2.west); + \draw[->] (reduced2.east) to node[above] {$\mathcal{NN}$} (reduced3.west); + \end{tikzpicture} +\end{document} From 8f57b2f85d2d0337cca8175679353f24e748e749 Mon Sep 17 00:00:00 2001 From: benedict-96 Date: Wed, 19 Jun 2024 19:44:34 +0200 Subject: [PATCH 20/56] Now also exporting reduced loss. --- src/GeometricMachineLearning.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/GeometricMachineLearning.jl b/src/GeometricMachineLearning.jl index f205216f5..d65e2eef5 100644 --- a/src/GeometricMachineLearning.jl +++ b/src/GeometricMachineLearning.jl @@ -245,7 +245,7 @@ module GeometricMachineLearning include("backends/backends.jl") include("backends/lux.jl") - export TransformerLoss, FeedForwardLoss, AutoEncoderLoss + export TransformerLoss, FeedForwardLoss, AutoEncoderLoss, ReducedLoss #INCLUDE ARCHITECTURES include("architectures/neural_network_integrator.jl") From 7588c63ca675eedbe9a46b0db40f66ab00bb149d Mon Sep 17 00:00:00 2001 From: benedict-96 Date: Wed, 19 Jun 2024 19:45:09 +0200 Subject: [PATCH 21/56] Expanded on the architecture a bit. --- .../architectures/symplectic_autoencoder.md | 69 ++++++++++++++++--- 1 file changed, 61 insertions(+), 8 deletions(-) diff --git a/docs/src/architectures/symplectic_autoencoder.md b/docs/src/architectures/symplectic_autoencoder.md index e1ac8d965..7afe646d5 100644 --- a/docs/src/architectures/symplectic_autoencoder.md +++ b/docs/src/architectures/symplectic_autoencoder.md @@ -1,17 +1,70 @@ # Symplectic Autoencoder +Symplectic autoencoders offer a structure-preserving way of mapping a high-dimensional system to a low dimensional system. Concretely this means that if we obtain a reduced system by means of a symplectic autoencoder, this system will again be reduced. + +```@example +Main.include_graphics("../tikz/symplectic_autoencoder") # hide +``` + +## Intermediate Dimensions + +For a high-fidelity system of dimension ``2N`` and a reduced system of dimension ``2n``, the intermediate dimensions in the symplectic encoder and the decoder are computed according to: + +```julia +iterations = Vector{Int}(n : (N - n) ÷ (n_blocks - 1) : N) +iterations[end] = full_dim2 +iterations * 2 +``` + +So for e.g. ``2N = 100,`` ``2n = 10`` and ``\mathtt{n\_blocks} = 3`` we get + +```math +\mathrm{iterations} = 5\mathtt{:}(45 \div 2)\mathtt{:}50 = 5\mathtt{:}22\mathtt{:}50 = (5, 27, 49), +``` + +and after the further two modifications the dimensions are: + +```math +(10, 54, 100). 
+``` + + +## Example + A visualization of an instance of [SymplecticAutoencoder](@ref) is shown below: ```@example Main.include_graphics("../tikz/symplectic_autoencoder_architecture") # hide ``` -The *intermediate dimension* ``M`` is calculated via `n : (N - n) ÷ (n_blocks - 1) : N`. Further we have the following choices: -- `n_encoder_layers::Integer = 4` -- `n_encoder_blocks::Integer = 2` -- `n_decoder_layers::Integer = 2` -- `n_decoder_blocks::Integer = 3` -- `encoder_init_q::Bool = true` -- `decoder_init_q::Bool = true` +In this example shown in the figure `n_encoder_blocks` is two, `n_encoder_layers` is four, `n_decoder_blocks` is 3 and `n_decoder_layers` is 2. You can build such an instance of a symplectic autoencoder by calling: + +```@example sae +using GeometricMachineLearning + +const full_dim = 100 +const reduced_dim = 10 + +model = SymplecticAutoencoder(full_dim, reduced_dim; n_encoder_blocks = 2, n_encoder_layers = 4, n_decoder_blocks = 3, n_decoder_layers = 2) + +for layer in Chain(model) + println(stdout, layer) +end +``` + +We also see that the intermediate dimension in the decoder is 54. + +## Library Functions + +```@docs; canonical = false +SymplecticAutoencoder +``` + +## References + +```@bibliography +Pages = [] +Canonical = false -Note that all of these are keyword arguments that can be supplied to [SymplecticAutoencoder](@ref). \ No newline at end of file +brantner2023symplectic +``` \ No newline at end of file From e412445bee175b37f237c949c8a7f8f3108c831d Mon Sep 17 00:00:00 2001 From: benedict-96 Date: Wed, 19 Jun 2024 19:45:53 +0200 Subject: [PATCH 22/56] Turned this into a more general section on reduced order modeling. --- .../src/reduced_order_modeling/autoencoder.md | 41 +++++++++++-------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/docs/src/reduced_order_modeling/autoencoder.md b/docs/src/reduced_order_modeling/autoencoder.md index 13dedc3eb..d91c4b12f 100644 --- a/docs/src/reduced_order_modeling/autoencoder.md +++ b/docs/src/reduced_order_modeling/autoencoder.md @@ -1,49 +1,58 @@ -# Reduced Order modeling and Autoencoders +# Reduced Order Modeling -Reduced order modeling is a data-driven technique that exploits the structure of parametric PDEs to make solving those PDEs easier. +Reduced order modeling is a data-driven technique that exploits the structure of parametric partial differential equations (PPDEs) to make repeated simulations of this PPDE much cheaper. -Consider a parametric PDE written in the form: $F(z(\mu);\mu)=0$ where $z(\mu)$ evolves on a infinite-dimensional Hilbert space $V$. +For this consider a PPDE written in the form: ``F(z(\mu);\mu)=0`` where ``z(\mu)`` evolves on a infinite-dimensional Hilbert space ``V``. -In modeling any PDE we have to choose a discretization (particle discretization, finite element method, ...) of $V$, which will be denoted by $V_h$. +In modeling any PDE we have to choose a discretization (particle discretization, finite element method, ...) of ``V``, which will be denoted by ``V_h``. The space ``V_h`` is not infinite-dimensional but still very large. Solving a discretized PDE in this space is typically very expensive! In reduced order modeling we utilize the fact that slightly different choices of parameters ``\mu`` will give qualitatively similar solutions. We can therefore perform a few simulations in the full space ``V_h`` and then make successive simulations cheaper by *learning* from the past simulations. A crucial concept in this is the *solution manifold*. 
 ## Solution manifold
 
-To any parametric PDE we associate a **solution manifold**:
+To any PPDE and a certain parameter set ``\mathbb{P}`` we associate a *solution manifold*:
 
 ```math
 \mathcal{M} = \{z(\mu):F(z(\mu);\mu)=0, \mu\in\mathbb{P}\}.
 ```
 
-![](../tikz/solution_manifold_2.png)
+A motivation for reduced order modeling is that even though the space ``V_h`` is of very high dimension, the solution manifold will typically be a very small space. The image below shows a two-dimensional solution manifold[^1] embedded in ``V_h\equiv\mathbb{R}^3``:
+
+[^1]: The systems we deal with usually have a much greater dimension of course. The dimension of ``V_h`` will be in the thousands and the dimension of the solution manifold will be a few orders of magnitude smaller. Because this cannot be easily visualized we resort to showing a two-dimensional manifold in a three-dimensional space here.
 
-In the image above a 2-dimensional solution manifold is visualized as a sub-manifold in 3-dimensional space. In general the embedding space is an infinite-dimensional function space.
+![](../tikz/solution_manifold_2.png)
 
-As an example of this consider the 1-dimensional wave equation:
+As an example of this consider the one-dimensional wave equation [blickhan2023registration](@cite):
 ```math
 \partial_{tt}^2q(t,\xi;\mu) = \mu^2\partial_{\xi\xi}^2q(t,\xi;\mu)\text{ on }I\times\Omega,
 ```
-where $I = (0,1)$ and $\Omega=(-1/2,1/2)$. As initial condition for the first derivative we have $\partial_tq(0,\xi;\mu) = -\mu\partial_\xi{}q_0(\xi;\mu)$ and furthermore $q(t,\xi;\mu)=0$ on the boundary (i.e. $\xi\in\{-1/2,1/2\}$).
+where ``I = (0,1)`` and ``\Omega=(-1/2,1/2)``. As initial condition for the first derivative we have ``\partial_tq(0,\xi;\mu) = -\mu\partial_\xi{}q_0(\xi;\mu)`` and furthermore ``q(t,\xi;\mu)=0`` on the boundary (i.e. ``\xi\in\{-1/2,1/2\}``).
 
-The solution manifold is a 1-dimensional submanifold:
+The solution manifold is a 1-dimensional submanifold of an infinite-dimensional function space:
 ```math
 \mathcal{M} = \{(t, \xi)\mapsto{}q(t,\xi;\mu)=q_0(\xi-\mu{}t;\mu):\mu\in\mathbb{P}\subset\mathbb{R}\}.
 ```
-If we provide an initial condition $u_0$, a parameter instance $\mu$ and a time $t$, then $\xi\mapsto{}q(t,\xi;\mu)$ will be the momentary solution. If we consider the time evolution of $q(t,\xi;\mu)$, then it evolves on a two-dimensional submanifold $\bar{\mathcal{M}} := \{\xi\mapsto{}q(t,\xi;\mu):t\in{}I,\mu\in\mathbb{P}\}$.
+If we provide an initial condition ``q_0``, a parameter instance ``\mu`` and a time ``t``, then ``\xi\mapsto{}q(t,\xi;\mu)`` will be the momentary solution. If we consider the time evolution of ``q(t,\xi;\mu)``, then it evolves on a two-dimensional submanifold ``\bar{\mathcal{M}} := \{\xi\mapsto{}q(t,\xi;\mu):t\in{}I,\mu\in\mathbb{P}\}``.
 
+In reduced order modeling we try to find an approximation to this solution manifold. Neural networks offer a way of doing so efficiently!
 
 ## General workflow
-In reduced order modeling we aim to construct a mapping to a space that is close to this solution manifold. This is done through the following steps:
+In reduced order modeling we aim to construct an approximation to the solution manifold that is ideally of a dimension not much greater than that of the solution manifold itself, and to then solve so-called *reduced equations* in this small space. This approximation is performed in the following steps:
 
 1. Discretize the PDE.
 
+2. Solve the discretized PDE for a certain set of parameter instances ``\mu\in\mathbb{P}``.
+3. Build a reduced basis with the data obtained from having solved the discretized PDE. This step consists of finding two mappings: the *reduction* ``\mathcal{P}`` and the *reconstruction* ``\mathcal{R}``.
+
+The third step can be done with various machine learning (ML) techniques. Traditionally the most popular of these has been *proper orthogonal decomposition* (POD), but in recent years *autoencoders* have also become a popular alternative [fresca2021comprehensive](@cite).
 
-2. Solve the discretized PDE for a certain set of parameter instances $\mu\in\mathbb{P}$.
+After having obtained ``\mathcal{P}`` and ``\mathcal{R}`` we still need to solve the *reduced system*. Solving the reduced system is typically referred to as the *online phase* in reduced order modeling. This is sketched below:
 
-3. Build a reduced basis with the data obtained from having solved the discretized PDE. This step consists of finding two mappings: the **reduction** $\mathcal{P}$ and the **reconstruction** $\mathcal{R}$.
+```@example
+Main.include_graphics("../tikz/offline_online") # hide
+```
 
-The third step can be done with various machine learning (ML) techniques. Traditionally the most popular of these has been **Proper orthogonal decomposition** (POD), but in recent years **autoencoders** have also become a popular alternative (see (Fresca et al, 2021)).
+The online phase consists of applying the mapping ``\mathcal{NN}`` in the low-dimensional space in order to predict the next time step. Crucially, this step can be made very cheap when compared to the full-order model.
 
 ## References
 ```@bibliography

From d0ba50f10949a1c6fbc3a255812cd16a24ef27b7 Mon Sep 17 00:00:00 2001
From: benedict-96
Date: Wed, 19 Jun 2024 19:47:14 +0200
Subject: [PATCH 23/56] Improved readability of docs.

---
 src/architectures/symplectic_autoencoder.jl | 30 +++++++++++----------
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/src/architectures/symplectic_autoencoder.jl b/src/architectures/symplectic_autoencoder.jl
index 1042bd687..65d84994b 100644
--- a/src/architectures/symplectic_autoencoder.jl
+++ b/src/architectures/symplectic_autoencoder.jl
@@ -1,5 +1,9 @@
 @doc raw"""
-## The architecture
+    SymplecticAutoencoder(full_dim, reduced_dim)
+
+Make an instance of `SymplecticAutoencoder` for dimensions `full_dim` and `reduced_dim` (those have to be provided as `Integer`s).
+
+# The architecture
 
 The symplectic autoencoder architecture was introduced in [brantner2023symplectic](@cite). Like any other autoencoder it consists of an *encoder* ``\Psi^e:\mathbb{R}^{2N}\to\mathbb{R}^{2n}`` and a *decoder* ``\Psi^d:\mathbb{R}^{2n}\to\mathbb{R}^{2N}`` with ``n\ll{}N``. These satisfy the following properties:
@@ -15,19 +19,17 @@ Because the decoder has this particular property, the reduced system can be described by the Hamiltonian ``H\circ\Psi^d``:
 
 where ``(\nabla_\xi\Psi^d)^+`` is the pseudoinverse of ``\nabla_\xi\Psi^d`` (for more details see the docs on the [AutoEncoder](@ref) type).
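+
+For illustration, a sketch of how such an architecture might be set up (the dimensions and keywords below are chosen arbitrarily; see the keyword arguments listed further down):
+
+```julia
+using GeometricMachineLearning
+
+arch = SymplecticAutoencoder(100, 10; n_encoder_blocks = 2, n_decoder_blocks = 3)
+nn = NeuralNetwork(arch)
+```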
-## The constructor
-
-The constructor is called with
-- `full_dim::Integer`
-- `reduced_dim::Integer`
-- `n_encoder_layers::Integer = 4` (keyword argument)
-- `n_encoder_blocks::Integer = 2` (keyword argument)
-- `n_decoder_layers::Integer = 1` (keyword argument)
-- `n_decoder_blocks::Integer = 3` (keyword argument)
-- `sympnet_upscale::Integer = 5` (keyword argument)
-- `activation = tanh` (keyword argument)
-- `encoder_init_q::Bool = true` (keyword argument)
-- `decoder_init_q::Bool = true` (keyword argument)
+# Arguments
+
+You can provide the following keyword arguments:
+- `n_encoder_layers::Integer = 4`: The number of layers in one encoder block.
+- `n_encoder_blocks::Integer = 2`: The number of encoder blocks.
+- `n_decoder_layers::Integer = 1`: The number of layers in one decoder block.
+- `n_decoder_blocks::Integer = 3`: The number of decoder blocks.
+- `sympnet_upscale::Integer = 5`: The *upscaling dimension* of the GSympNet. See [`GradientLayerQ`](@ref) and [`GradientLayerP`](@ref).
+- `activation = tanh`: The activation in the gradient layers.
+- `encoder_init_q::Bool = true`: Specifies if the first layer in each encoder block should be of ``q`` type.
+- `decoder_init_q::Bool = true`: Specifies if the first layer in each decoder block should be of ``q`` type.
 """
 struct SymplecticAutoencoder{EncoderInit, DecoderInit, AT} <: SymplecticCompression
     full_dim::Int

From 4aad585d1d3d246e5ad5cd0a72f3047d5dae52ac Mon Sep 17 00:00:00 2001
From: benedict-96
Date: Wed, 19 Jun 2024 19:47:35 +0200
Subject: [PATCH 24/56] Put constructor at beginning of file.

---
 src/layers/sympnets.jl | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/src/layers/sympnets.jl b/src/layers/sympnets.jl
index 725811dce..c687f32d0 100644
--- a/src/layers/sympnets.jl
+++ b/src/layers/sympnets.jl
@@ -14,6 +14,10 @@ struct GradientLayer{M, N, TA, C} <: SympNetLayer{M, N}
 end
 
 @doc raw"""
+    GradientLayerQ(sys_dim, upscaling_dimension, activation)
+
+Make an instance of a gradient-``q`` layer.
+
 The gradient layer that changes the ``q`` component. It is of the form:
 
 ```math
 \begin{bmatrix}
        \mathbb{I} & \nabla{}V \\ \mathbb{O} & \mathbb{I}
 \end{bmatrix},
 ```
 
 with ``V(p) = \sum_{i=1}^Ma_i\Sigma(\sum_jk_{ij}p_j+b_i)``, where ``\Sigma`` is
@@ -27,7 +31,11 @@ with ``V(p) = \sum_{i=1}^Ma_i\Sigma(\sum_jk_{ij}p_j+b_i)``, where ``\Sigma`` is
 const GradientLayerQ{M, N, TA} = GradientLayer{M, N, TA, :Q}
 
 @doc raw"""
-The gradient layer that changes the ``q`` component. It is of the form:
+    GradientLayerP(sys_dim, upscaling_dimension, activation)
+
+Make an instance of a gradient-``p`` layer.
+
+The gradient layer that changes the ``p`` component. It is of the form:
 
 ```math
 \begin{bmatrix}
        \mathbb{I} & \mathbb{O} \\ \nabla{}V & \mathbb{I}
 \end{bmatrix},
 ```
 
-with ``V(p) = \sum_{i=1}^Ma_i\Sigma(\sum_jk_{ij}p_j+b_i)``, where ``\Sigma`` is the antiderivative of the activation function ``\sigma`` (one-layer neural network). We refer to $M$ as the *upscaling dimension*. Such layers are by construction symplectic.
+with ``V(q) = \sum_{i=1}^Ma_i\Sigma(\sum_jk_{ij}q_j+b_i)``, where ``\Sigma`` is the antiderivative of the activation function ``\sigma`` (one-layer neural network). We refer to $M$ as the *upscaling dimension*. Such layers are by construction symplectic.
 """
 const GradientLayerP{M, N, TA} = GradientLayer{M, N, TA, :P}

From 4a531625abbe2e781d17124ffc41e06ab0085c97 Mon Sep 17 00:00:00 2001
From: benedict-96
Date: Wed, 19 Jun 2024 19:49:38 +0200
Subject: [PATCH 25/56] Added neural network losses.
--- docs/make.jl | 1 + docs/src/reduced_order_modeling/losses.md | 10 ++++++++++ 2 files changed, 11 insertions(+) create mode 100644 docs/src/reduced_order_modeling/losses.md diff --git a/docs/make.jl b/docs/make.jl index dfccfed37..c0e99d281 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -179,6 +179,7 @@ makedocs(; ], "Reduced Order Modelling" =>[ "POD and Autoencoders" => "reduced_order_modeling/autoencoder.md", + "Network Losses" => "reduced_order_modeling/losses.md", "PSD and Symplectic Autoencoders" => "reduced_order_modeling/symplectic_autoencoder.md", "Kolmogorov n-width" => "reduced_order_modeling/kolmogorov_n_width.md", "Projection and Reduction Error" => "reduced_order_modeling/projection_reduction_errors.md", diff --git a/docs/src/reduced_order_modeling/losses.md b/docs/src/reduced_order_modeling/losses.md new file mode 100644 index 000000000..25bef1294 --- /dev/null +++ b/docs/src/reduced_order_modeling/losses.md @@ -0,0 +1,10 @@ +# Different Neural Network Losses + + +```@docs; canonical = false +GeometricMachineLearning.NetworkLoss +TransformerLoss +FeedForwardLoss +AutoEncoderLoss +ReducedLoss +``` \ No newline at end of file From 6c1f0fe0c30a96f6334e1db5e5d180ac2ac04a9b Mon Sep 17 00:00:00 2001 From: benedict-96 Date: Wed, 19 Jun 2024 19:50:42 +0200 Subject: [PATCH 26/56] Renamed file. --- docs/make.jl | 2 +- .../{autoencoder.md => reduced_order_modeling.md} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename docs/src/reduced_order_modeling/{autoencoder.md => reduced_order_modeling.md} (100%) diff --git a/docs/make.jl b/docs/make.jl index c0e99d281..5a0e62eb8 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -178,7 +178,7 @@ makedocs(; "Linear Symplectic Attention" => "layers/linear_symplectic_attention.md", ], "Reduced Order Modelling" =>[ - "POD and Autoencoders" => "reduced_order_modeling/autoencoder.md", + "General Framework" => "reduced_order_modeling/reduced_order_modeling.md", "Network Losses" => "reduced_order_modeling/losses.md", "PSD and Symplectic Autoencoders" => "reduced_order_modeling/symplectic_autoencoder.md", "Kolmogorov n-width" => "reduced_order_modeling/kolmogorov_n_width.md", diff --git a/docs/src/reduced_order_modeling/autoencoder.md b/docs/src/reduced_order_modeling/reduced_order_modeling.md similarity index 100% rename from docs/src/reduced_order_modeling/autoencoder.md rename to docs/src/reduced_order_modeling/reduced_order_modeling.md From dce2f505d8ec617838c56f2ee48c17587aee7962 Mon Sep 17 00:00:00 2001 From: benedict-96 Date: Fri, 21 Jun 2024 02:06:09 +0200 Subject: [PATCH 27/56] Implemented convenience constructors for the DataLoader that can be used if dl has already been allocated. --- src/data_loader/data_loader.jl | 86 ++++++++++++++++++++++++++++++---- 1 file changed, 76 insertions(+), 10 deletions(-) diff --git a/src/data_loader/data_loader.jl b/src/data_loader/data_loader.jl index dfc52ec82..bd2495946 100644 --- a/src/data_loader/data_loader.jl +++ b/src/data_loader/data_loader.jl @@ -1,7 +1,11 @@ description(::Val{:DataLoader}) = raw""" -Data Loader is a struct that creates an instance based on a tensor (or different input format) and is designed to make training convenient. + DataLoader(data) + +Make an instance based on a data set. -## Constructor +This is designed such to make training convenient. + +# Constructor The data loader can be called with various inputs: - **A single vector**: If the data loader is called with a single vector (and no other arguments are given), then this is interpreted as an autoencoder problem, i.e. 
the second axis indicates parameter values and/or time steps and the system has a single degree of freedom (i.e. the system dimension is one). @@ -17,7 +21,7 @@ When we supply a single vector or a single matrix as input to `DataLoader` and f """ $(description(Val(:DataLoader))) -## Fields of `DataLoader` +# Fields of `DataLoader` The fields of the `DataLoader` struct are the following: - `input`: The input data with axes (i) system dimension, (ii) number of time steps and (iii) number of parameters. @@ -28,9 +32,32 @@ The fields of the `DataLoader` struct are the following: - `output_dim`: The dimension of the output tensor (first axis). If `output` is of type `Nothing`, then this is also of type `Nothing`. - `output_time_steps`: The size of the second axis of the output tensor. If `output` is of type `Nothing`, then this is also of type `Nothing`. -### The `input` and `output` fields of `DataLoader` +# Implementation + +Even though `DataLoader` can be called with inputs of various forms, internally it always stores tensors with three axes. + +```jldoctest +using GeometricMachineLearning + +data = [1 2 3; 4 5 6] +dl = DataLoader(data) +dl.input + +# output -Even though the arguments to the Constructor may be vectors or matrices, internally `DataLoader` always stores tensors. +2×1×3 Array{Int64, 3}: +[:, :, 1] = + 1 + 4 + +[:, :, 2] = + 2 + 5 + +[:, :, 3] = + 3 + 6 +``` """ struct DataLoader{T, AT<:Union{NamedTuple, AbstractArray{T}}, OT<:Union{AbstractArray, Nothing}, DataType} input::AT @@ -164,13 +191,47 @@ function DataLoader(ensemble_solution::EnsembleSolution{T, T1, Vector{ST}}) wher DataLoader(data) end +function map_to_new_backend(input::AT, backend::KernelAbstractions.Backend) where AT <: AbstractArray + input₂ = KernelAbstractions.allocate(backend, size(input)...) + KernelAbstractions.copyto!(backend, input₂, input) + DataLoader(input₂) +end + +function map_to_new_backend(input::QPT, backend::KernelAbstractions.Backend) + input₂ = (q = KernelAbstractions.allocate(backend, size(input₂.q)...), p = KernelAbstractions.allocate(backend, size(input₂.p)...)) + KernelAbstractions.copyto!(backend, input₂.q, input.q) + KernelAbstractions.copyto!(backend, input₂.p, input.p) + DataLoader(input₂) +end + +function DataLoader(dl::DataLoader{T, <:QPTOAT, Nothing}, backend::KernelAbstractions.Backend) + DataLoader(map_to_new_backend(dl.input, backend)) +end + +function reshape_to_matrix(dl::DataLoader{T, <:QPT, Nothing, :RegularData}) + (q = reshape(dl.input.q, dl.input_dim ÷ 2, dl.n_params), p = reshape(dl.input.p, dl.input_dim ÷ 2, dl.n_params)) +end + +function reshape_to_matrix(dl::DataLoader{T, <:AbstractArray, Nothing, :RegularData}) + reshape(dl.input, dl.input_dim, dl.n_params) +end + +function DataLoader(dl::DataLoader{T, <: QPTOAT, Nothing, :RegularData}, + backend::KernelAbstractions.Backend=KernelAbstractions.get_backend(dl); + autoencoder::Bool=false) + + if autoencoder == true + DataLoader(map_to_new_backend(backend, dl.input)) + elseif autoencoder == false + matrix_data = reshape_to_matrix(dl) + DataLoader(map_to_new_backend(backend, matrix_data); autoencoder=true) + end +end + @doc raw""" -Computes the accuracy (as opposed to the loss) of a neural network classifier. + accuracy(model, ps, dl) -It takes as input: -- `model::Chain` -- `ps`: parameters of the network -- `dl::DataLoader` +Compute the accuracy of a neural network classifier. 
""" function accuracy(model::Chain, ps::Tuple, dl::DataLoader{T, AT, BT}) where {T, T1<:Integer, AT<:AbstractArray{T}, BT<:AbstractArray{T1}} output_tensor = model(dl.input, ps) @@ -183,6 +244,11 @@ function accuracy(model::Chain, ps::Tuple, dl::DataLoader{T, AT, BT}) where {T, (size(dl.output, 3)-sum(abs.(dl.output - tensor_of_maximum_elements))/T1(2))/size(dl.output, 3) end +""" + accuracy(nn, dl) + +Compute the accuracy of a neural network classifier. +""" accuracy(nn::NeuralNetwork, dl::DataLoader) = accuracy(nn.model, nn.params, dl) Base.eltype(::DataLoader{T}) where T = T From 1080f4da43ee445a4ae661d0b7e2f6395791ccc1 Mon Sep 17 00:00:00 2001 From: benedict-96 Date: Fri, 21 Jun 2024 02:06:36 +0200 Subject: [PATCH 28/56] Defined new constants for qp-type data. --- src/utils.jl | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/utils.jl b/src/utils.jl index 697c31128..c940790ec 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -110,6 +110,19 @@ function global_section(::AbstractVecOrMat) nothing end +""" +The type for data in ``(q, p)`` coordinates. +""" +const QPT = NamedTuple{(:q, :p), Tuple{AT, AT}} where AT <: AbstractArray + +""" +A union of two types: +```julia +const QPTOAT = Union{QPT, AbstractArray} +``` +""" +const QPTOAT = Union{QPT, AbstractArray} + _eltype(x) = eltype(x) _eltype(ps::NamedTuple) = _eltype(ps[1]) _eltype(ps::Tuple) = _eltype(ps[1]) \ No newline at end of file From c3b6a79ce723da57f20bd696e101638e030ba45b Mon Sep 17 00:00:00 2001 From: benedict-96 Date: Fri, 21 Jun 2024 02:15:16 +0200 Subject: [PATCH 29/56] Also include the type now in definition of QPT and QPTOAT. --- src/utils.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/utils.jl b/src/utils.jl index c940790ec..21f35ba27 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -113,7 +113,7 @@ end """ The type for data in ``(q, p)`` coordinates. """ -const QPT = NamedTuple{(:q, :p), Tuple{AT, AT}} where AT <: AbstractArray +const QPT{T} = NamedTuple{(:q, :p), Tuple{AT, AT}} where {T, AT <: AbstractArray{T}} """ A union of two types: @@ -121,7 +121,7 @@ A union of two types: const QPTOAT = Union{QPT, AbstractArray} ``` """ -const QPTOAT = Union{QPT, AbstractArray} +const QPTOAT{T} = Union{QPT{T}, AbstractArray{T}} where T _eltype(x) = eltype(x) _eltype(ps::NamedTuple) = _eltype(ps[1]) From 7fc97f86d8540479d7e76a34cf5d12c423f31323 Mon Sep 17 00:00:00 2001 From: benedict-96 Date: Fri, 21 Jun 2024 02:15:34 +0200 Subject: [PATCH 30/56] Changed order in which files are included in order for gml to compile. --- src/GeometricMachineLearning.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/GeometricMachineLearning.jl b/src/GeometricMachineLearning.jl index d65e2eef5..0ab7b80dc 100644 --- a/src/GeometricMachineLearning.jl +++ b/src/GeometricMachineLearning.jl @@ -47,6 +47,8 @@ module GeometricMachineLearning # from GeometricBase to print docs export description + include("utils.jl") + include("data_loader/data_loader.jl") # INCLUDE ARRAYS @@ -103,7 +105,6 @@ module GeometricMachineLearning # are these needed? export UnknownProblem, NothingFunction - include("utils.jl") # + operation has been overloaded to work with NamedTuples! export _add, apply_toNT, split_and_flatten, add! From 93f9fe4d6fc5799b9017276eb7d10fd729940bf7 Mon Sep 17 00:00:00 2001 From: benedict-96 Date: Fri, 21 Jun 2024 02:15:52 +0200 Subject: [PATCH 31/56] Specified missing types. 
---
 src/data_loader/data_loader.jl | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/data_loader/data_loader.jl b/src/data_loader/data_loader.jl
index bd2495946..62567e5cc 100644
--- a/src/data_loader/data_loader.jl
+++ b/src/data_loader/data_loader.jl
@@ -197,26 +197,26 @@ function map_to_new_backend(input::AT, backend::KernelAbstractions.Backend) wher
     DataLoader(input₂)
 end
 
-function map_to_new_backend(input::QPT, backend::KernelAbstractions.Backend)
-    input₂ = (q = KernelAbstractions.allocate(backend, size(input₂.q)...), p = KernelAbstractions.allocate(backend, size(input₂.p)...))
+function map_to_new_backend(input::QPT{T}, backend::KernelAbstractions.Backend) where T
+    input₂ = (q = KernelAbstractions.allocate(backend, T, size(input.q)...), p = KernelAbstractions.allocate(backend, T, size(input.p)...))
     KernelAbstractions.copyto!(backend, input₂.q, input.q)
     KernelAbstractions.copyto!(backend, input₂.p, input.p)
     DataLoader(input₂)
 end
 
-function DataLoader(dl::DataLoader{T, <:QPTOAT, Nothing}, backend::KernelAbstractions.Backend)
+function DataLoader(dl::DataLoader{<:Number, <:QPTOAT, Nothing}, backend::KernelAbstractions.Backend)
     DataLoader(map_to_new_backend(dl.input, backend))
 end
 
-function reshape_to_matrix(dl::DataLoader{T, <:QPT, Nothing, :RegularData})
+function reshape_to_matrix(dl::DataLoader{<:Number, <:QPT, Nothing, :RegularData})
     (q = reshape(dl.input.q, dl.input_dim ÷ 2, dl.n_params), p = reshape(dl.input.p, dl.input_dim ÷ 2, dl.n_params))
 end
 
-function reshape_to_matrix(dl::DataLoader{T, <:AbstractArray, Nothing, :RegularData})
+function reshape_to_matrix(dl::DataLoader{<:Number, <:AbstractArray, Nothing, :RegularData})
     reshape(dl.input, dl.input_dim, dl.n_params)
 end
 
-function DataLoader(dl::DataLoader{T, <: QPTOAT, Nothing, :RegularData},
+function DataLoader(dl::DataLoader{<: Number, <: QPTOAT, Nothing, :RegularData},
                     backend::KernelAbstractions.Backend=KernelAbstractions.get_backend(dl);
                     autoencoder::Bool=false)

From b9d008000fdea03bc0bd6bf8204f07964ae98b4a Mon Sep 17 00:00:00 2001
From: benedict-96
Date: Fri, 21 Jun 2024 02:23:20 +0200
Subject: [PATCH 32/56] Added info line (doctest not passing otherwise).

---
 src/data_loader/data_loader.jl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/data_loader/data_loader.jl b/src/data_loader/data_loader.jl
index 62567e5cc..be20c6769 100644
--- a/src/data_loader/data_loader.jl
+++ b/src/data_loader/data_loader.jl
@@ -45,6 +45,7 @@ dl.input
 
 # output
 
+[ Info: You have provided a matrix as input. The axes will be interpreted as (i) system dimension and (ii) number of parameters.
 2×1×3 Array{Int64, 3}:
 [:, :, 1] =
  1

From 425ad4f56c8a8f4e522363801030f9a721d3844f Mon Sep 17 00:00:00 2001
From: benedict-96
Date: Fri, 21 Jun 2024 02:29:11 +0200
Subject: [PATCH 33/56] Added two more lines of text.

---
 docs/src/architectures/symplectic_autoencoder.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/docs/src/architectures/symplectic_autoencoder.md b/docs/src/architectures/symplectic_autoencoder.md
index 7afe646d5..e0a7de83d 100644
--- a/docs/src/architectures/symplectic_autoencoder.md
+++ b/docs/src/architectures/symplectic_autoencoder.md
@@ -2,10 +2,15 @@
 
 Symplectic autoencoders offer a structure-preserving way of mapping a high-dimensional system to a low dimensional system. Concretely this means that if we obtain a reduced system by means of a symplectic autoencoder, this system will again be reduced.
+The architecture is represented by the figure below: + ```@example Main.include_graphics("../tikz/symplectic_autoencoder") # hide ``` +It is a composition of [SympNet gradient layers](@ref "SympNet Gradient Layer") and PSD-like matrices. + + ## Intermediate Dimensions For a high-fidelity system of dimension ``2N`` and a reduced system of dimension ``2n``, the intermediate dimensions in the symplectic encoder and the decoder are computed according to: From 6ec0bc73b5a9b4cabb33c4a8ca539db438bb672b Mon Sep 17 00:00:00 2001 From: benedict-96 Date: Wed, 26 Jun 2024 12:36:31 +0200 Subject: [PATCH 34/56] Also including the other sympnet layers in this file now. --- docs/make.jl | 2 +- docs/src/layers/sympnet_gradient.md | 109 ++++++++++++++++++---------- 2 files changed, 73 insertions(+), 38 deletions(-) diff --git a/docs/make.jl b/docs/make.jl index 5a0e62eb8..9805f8a71 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -171,7 +171,7 @@ makedocs(; "BFGS Optimizer" => "optimizers/bfgs_optimizer.md", ], "Special Neural Network Layers" => [ - "Sympnet Gradient Layers" => "layers/sympnet_gradient.md", + "Sympnet Layers" => "layers/sympnet_gradient.md", "Volume-Preserving Layers" => "layers/volume_preserving_feedforward.md", "Attention" => "layers/attention_layer.md", "Multihead Attention" => "layers/multihead_attention_layer.md", diff --git a/docs/src/layers/sympnet_gradient.md b/docs/src/layers/sympnet_gradient.md index c024ba3dc..4a026e4c4 100644 --- a/docs/src/layers/sympnet_gradient.md +++ b/docs/src/layers/sympnet_gradient.md @@ -1,4 +1,8 @@ -# SympNet Gradient Layer +# SympNet Layers + +The *SympNet paper* [jin2020sympnets](@cite) discusses three different kinds of sympnet layers: *activation layers*, *linear layers* and *gradient layers*. We discuss them below. Because activation layers are just a simplified form of gradient layers those two will be discussed together. A neural network that consists of many of these layers we call a [SympNet](@ref "SympNet Architecture"). + +## SympNet Gradient Layer The Sympnet gradient layer (called [`GradientLayer`](@ref) in `GeometricMachineLearning`) is based on the following theorem: @@ -6,54 +10,85 @@ The Sympnet gradient layer (called [`GradientLayer`](@ref) in `GeometricMachineL Main.theorem(raw"""Given a symplectic vector space ``\mathbb{R}^{2n}`` which coordinates ``q_1, \ldots, q_n, p_1, \ldots, p_n`` and a function ``f:\mathbb{R}^n\to\mathbb{R}`` that only acts on the ``q`` part, the map ``(q, p) \mapsto (q, p + \nabla_qf)`` is symplectic. A similar statement holds if ``f`` only acts on the ``p`` part.""") ``` -Proofing this is straightforward by looking at the gradient of the mapping: +```@eval +Main.proof(raw"""Proofing this is straightforward by looking at the gradient of the mapping: +""" * Main.indentation * raw""" +""" * Main.indentation * raw"""```math +""" * Main.indentation * raw""" \begin{pmatrix} +""" * Main.indentation * raw""" \mathbb{I} & \mathbb{O} \\ +""" * Main.indentation * raw""" \nabla_q^2f & \mathbb{I} +""" * Main.indentation * raw""" \end{pmatrix}, +""" * Main.indentation * raw"""``` +""" * Main.indentation * raw""" +""" * Main.indentation * raw"""where ``\nabla_q^2f`` is the Hessian of ``f``. 
This matrix is symmetric and for any symmetric matrix ``A`` we have that: +""" * Main.indentation * raw""" +""" * Main.indentation * raw"""```math +""" * Main.indentation * raw""" \begin{pmatrix} +""" * Main.indentation * raw""" \mathbb{I} & \mathbb{O} \\ +""" * Main.indentation * raw""" A & \mathbb{I} +""" * Main.indentation * raw""" \end{pmatrix}^T \mathbb{J}_{2n} +""" * Main.indentation * raw""" \begin{pmatrix} +""" * Main.indentation * raw""" \mathbb{I} & \mathbb{O} \\ +""" * Main.indentation * raw""" A & \mathbb{I} +""" * Main.indentation * raw""" \end{pmatrix} = +""" * Main.indentation * raw""" \begin{pmatrix} +""" * Main.indentation * raw""" \mathbb{I} & A \\ +""" * Main.indentation * raw""" \mathbb{O} & \mathbb{I} +""" * Main.indentation * raw""" \end{pmatrix} +""" * Main.indentation * raw""" \begin{pmatrix} +""" * Main.indentation * raw""" \mathbb{O} & \mathbb{I} \\ +""" * Main.indentation * raw""" -\mathbb{I} & \mathbb{O} +""" * Main.indentation * raw""" \end{pmatrix} +""" * Main.indentation * raw""" \begin{pmatrix} +""" * Main.indentation * raw""" \mathbb{I} & \mathbb{O} \\ +""" * Main.indentation * raw""" A & \mathbb{I} +""" * Main.indentation * raw""" \end{pmatrix} = +""" * Main.indentation * raw""" \begin{pmatrix} +""" * Main.indentation * raw""" \mathbb{O} & \mathbb{I} \\ +""" * Main.indentation * raw""" -\mathbb{I} & \mathbb{O} +""" * Main.indentation * raw""" \end{pmatrix} = \mathbb{J}_{2n}. +""" * Main.indentation * raw"""``` +""" * Main.indentation * raw"""Thus showing symplecticity.""") +``` + +If we deal with [`GSympNet`](@ref)s this function ``f`` is ```math - \begin{pmatrix} - \mathbb{I} & \mathbb{O} \\ - \nabla_q^2f & \mathbb{I} - \end{pmatrix}, + f(q) = a^T \Sigma(Kq + b), ``` -where ``\nabla_q^2f`` is the Hessian of ``f``. This matrix is symmetric and for any symmetric matrix ``A`` we have that: +where ``a, b\in\mathbb{R}^m``, ``K\in\mathbb{R}^{m\times{}n}`` and ``\Sigma`` is the antiderivative of some common activation function ``\sigma``. We routinely refer to ``m`` as the *upscaling dimension* in `GeometricMachineLearning`. Computing the gradient of ``f`` gives: ```math - \begin{pmatrix} - \mathbb{I} & \mathbb{O} \\ - A & \mathbb{I} - \end{pmatrix}^T \mathbb{J}_{2n} - \begin{pmatrix} - \mathbb{I} & \mathbb{O} \\ - A & \mathbb{I} - \end{pmatrix} = - \begin{pmatrix} - \mathbb{I} & A \\ - \mathbb{O} & \mathbb{I} - \end{pmatrix} - \begin{pmatrix} - \mathbb{O} & \mathbb{I} \\ - -\mathbb{I} & \mathbb{O} - \end{pmatrix} - \begin{pmatrix} - \mathbb{I} & \mathbb{O} \\ - A & \mathbb{I} - \end{pmatrix} = - \begin{pmatrix} - \mathbb{O} & \mathbb{I} \\ - -\mathbb{I} & \mathbb{O} - \end{pmatrix}. + [\nabla_qf]_k = \sum_{i=1}^m a_i \sigma(\sum_{j=1}^nk_{ij}q_j + b_i)k_{ik} = K^T a \odot \sigma(Kq + b), ``` -If we deal with [`GSympNet`](@ref)s this function ``f`` is +where ``\odot`` is the element-wise product, i.e. ``[a\odot{}v]_k = a_kv_k``. This is the form that *gradient layers* take. In addition to gradient layers `GeometricMachineLearning` also has *linear* and *activation* layers implemented. Activation layers are simplified versions of *gradient layers*. 
These are equivalent to taking ``m = n`` and ``K = \mathbb{I}.`` + +## SympNet Linear Layer + +Linear layers of type ``q`` are of the form: ```math - f(q) = a^T \Sigma(Kq + b), +\begin{pmatrix} q \\ p \end{pmatrix} \mapsto \begin{pmatrix} \mathbb{I} & \mathbb{O} \\ A & \mathbb{I} \end{pmatrix} \begin{pmatrix} q \\ p \end{pmatrix}, ``` -where ``a, b\in\mathbb{R}^m``, ``K\in\mathbb{R}^{m\times{}n}`` and ``\Sigma`` is the antiderivative of some common activation function ``\sigma``. We routinely refer to ``m`` as the *upscaling dimension* in `GeometricMachineLearning`. Computing the gradient of ``f`` gives: +where ``A`` is a symmetric matrix. This is implemented very efficiently in `GeometricMachineLearning` with the special matrix [`SymmetricMatrix`](@ref). -```math - [\nabla_qf]_k = \sum_{i=1}^m a_i \sigma(\sum_{j=1}^nk_{ij}q_j + b_i)k_{ik} = = K^T a \odot \sigma(Kq + b), +## Library Functions + +```@docs; canonical = false +GeometricMachineLearning.SympNetLayer +GeometricMachineLearning.GradientLayer +GeometricMachineLearning.GradientLayerQ +GeometricMachineLearning.GradientLayerP ``` -where ``\odot`` is the element-wise product, i.e. ``[a\odot{}v]_k = a_kv_k``. \ No newline at end of file +## References + +```@bibliography +Canonical = false +Pages = [] + +jin2020sympnet +``` \ No newline at end of file From cf9406ebbc9026adf00f41a71d8e2cf6153c44cc Mon Sep 17 00:00:00 2001 From: benedict-96 Date: Wed, 26 Jun 2024 12:37:37 +0200 Subject: [PATCH 35/56] Reworked the sympnet docs and moved the picture explaining the architecture. --- docs/src/architectures/sympnet.md | 93 +++++++++++++------------- docs/src/tutorials/sympnet_tutorial.md | 6 -- 2 files changed, 45 insertions(+), 54 deletions(-) diff --git a/docs/src/architectures/sympnet.md b/docs/src/architectures/sympnet.md index 4fb08c193..4864d2199 100644 --- a/docs/src/architectures/sympnet.md +++ b/docs/src/architectures/sympnet.md @@ -6,15 +6,20 @@ This document discusses the SympNet architecture and its implementation in `Geom ### Principle -SympNets (see [jin2020sympnets](@cite) for the eponymous paper) are a type of neural network that can model the trajectory of a Hamiltonian system in phase space. Take ``(q^T,p^T)^T=(q_1,\ldots,q_d,p_1,\ldots,p_d)^T\in \mathbb{R}^{2d}`` as the coordinates in phase space, where ``q=(q_1, \ldots, q_d)^T\in \mathbb{R}^{d}`` is refered to as the *position* and ``p=(p_1, \ldots, p_d)^T\in \mathbb{R}^{d}`` the *momentum*. Given a point ``(q^T,p^T)^T`` in ``\mathbb{R}^{2d}`` the SympNet aims to compute the *next position* ``((q')^T,(p')^T)^T`` and thus predicts the trajectory while preserving the *symplectic structure* of the system. +SympNets [jin2020sympnets](@cite) are a type of neural network that can model the trajectory of a Hamiltonian system in phase space. Take ``(q^T,p^T)^T=(q_1,\ldots,q_d,p_1,\ldots,p_d)^T\in \mathbb{R}^{2d}`` as the coordinates in phase space, where ``q=(q_1, \ldots, q_d)^T\in \mathbb{R}^{d}`` is refered to as the *position* and ``p=(p_1, \ldots, p_d)^T\in \mathbb{R}^{d}`` the *momentum*. Given a point ``(q^T,p^T)^T`` in ``\mathbb{R}^{2d}`` the SympNet aims to compute the *next position* ``((q')^T,(p')^T)^T`` and thus predicts the trajectory while preserving the *symplectic structure* of the system. SympNets are enforcing symplecticity strongly, meaning that this property is hard-coded into the network architecture. The layers are reminiscent of traditional neural network feedforward layers, but have a strong restriction imposed on them in order to be symplectic. 
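The symplecticity that these layers hard-code can be checked directly. A quick numerical sanity check for the linear q-type update discussed above, using only `LinearAlgebra` (an illustration, not part of the patch):

```julia
using LinearAlgebra

# the linear q-type update (q, p) ↦ (q + Bp, p) with symmetric B, written as a matrix
n = 3
A = rand(n, n)
B = (A + A') / 2                              # a symmetric weight matrix

M = [I(n) B; zeros(n, n) I(n)]                # the layer as a 2n × 2n matrix
J = [zeros(n, n) I(n); -I(n) zeros(n, n)]     # the Poisson matrix

M' * J * M ≈ J                                # true: the update preserves the symplectic structure
```

The gradient and activation layers admit the same kind of check, with the Jacobian of the nonlinear map taking the place of `M`.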
-SympNets can be viewed as a "symplectic integrator" (see [hairer2006geometric](@cite) and [leimkuhler2004simulating](@cite)). Their goal is to predict, based on an initial condition $((q^{(0)})^T,(p^{(0)})^T)^T$, a sequence of points in phase space that fit the training data as well as possible: +SympNets can be viewed as a *symplectic integrator* or symplectic one-step method[^1] [hairer2006geometric, leimkuhler2004simulating](@cite). Their goal is to predict, based on an initial condition ``((q^{(0)})^T,(p^{(0)})^T)^T``, a sequence of points in phase space that fit the training data as well as possible: + +[^1]: *Symplectic multi-step methods* can be modeled with [transformers](@ref "Linear Symplectic Transformers"). + ```math \begin{pmatrix} q^{(0)} \\ p^{(0)} \end{pmatrix}, \cdots, \begin{pmatrix} \tilde{q}^{(1)} \\ \tilde{p}^{(1)} \end{pmatrix}, \cdots \begin{pmatrix} \tilde{q}^{(n)} \\ \tilde{p}^{(n)} \end{pmatrix}. ``` The tilde in the above equation indicates *predicted data*. The time step between predictions is not a parameter we can choose but is related to the *temporal frequency of the training data*. This means that if data is recorded in an interval of e.g. 0.1 seconds, then this will be the time step of our integrator. +SympNets preserve symplecticity by exploiting the ``(q, p)`` structure of the system. This is visualized below: + ```@example Main.include_graphics("../tikz/sympnet_architecture"; # hide @@ -23,11 +28,11 @@ The tilde in the above equation indicates *predicted data*. The time step betwee ) # hide ``` -There are two types of SympNet architectures: $LA$-SympNets and $G$-SympNets. +In the figure above we see that an update for ``q`` is based on data coming from ``p`` and an update for ``p`` is based on data coming from ``q``. There are two types of SympNet architectures: ``LA``-SympNets and ``G``-SympNets. -#### $LA$-SympNet +#### ``LA``-SympNet -The first type of SympNets, $LA$-SympNets, are obtained from composing two types of layers: *symplectic linear layers* and *symplectic activation layers*. For a given integer $n$, a symplectic linear layer is defined by +The first type of SympNets, ``LA``-SympNets, are obtained from composing two types of layers: [symplectic linear layers](@ref "SympNet Linear Layer") and [symplectic activation layers](@ref "SympNet Gradient Layer"). For a given integer ``n``, the *linear part* of an ``LA``-SympNet is ```math \mathcal{L}^{n,q} @@ -59,69 +64,47 @@ The first type of SympNets, $LA$-SympNets, are obtained from composing two types or ```math -\mathcal{L}^{n,p} +\mathcal{L}^{w,p} \begin{pmatrix} q \\ p \end{pmatrix} = \begin{pmatrix} - I & 0/S^n \\ - S^n/0 & I + I & 0/A^w \\ + A^w/0 & I \end{pmatrix} \cdots \begin{pmatrix} - I & S^2 \\ + I & A^2 \\ 0 & I \end{pmatrix} \begin{pmatrix} I & 0 \\ - S^1 & I + A^1 & I \end{pmatrix} \begin{pmatrix} q \\ p \end{pmatrix} + b . ``` -The superscripts $q$ and $p$ indicate whether the $q$ or the $p$ part is changed. The learnable parameters are the symmetric matrices $S^i\in\mathbb{R}^{d\times d}$ and the bias $b\in\mathbb{R}^{2d}$. The integer $n$ is the width of the symplectic linear layer. It can be shown that five of these layers, i.e. $n\geq{}5$, can represent any linear symplectic map (see [jin2022optimal](@cite)), so $n$ need not be larger than five. We denote the set of symplectic linear layers by $\mathcal{M}^L$. +The superscripts ``q`` and ``p`` indicate whether the ``q`` or the ``p`` part is first changed. 
The learnable parameters are the symmetric matrices ``A^i\in\mathbb{R}^{d\times d}`` and the bias ``b\in\mathbb{R}^{2d}``. The integer ``w`` is the number of linear layers in one block. It can be shown that five of these layers, i.e. ``w\geq{}5``, can represent any linear symplectic map (see [jin2022optimal](@cite)), so ``w`` need not be larger than five. We denote the set of symplectic linear layers by ``\mathcal{M}^L``. -The second type of layer needed for $LA$-SympNets are so-called *activation layers*: - -```math - \mathcal{A}^{q} \begin{pmatrix} q \\ - p \end{pmatrix} = - \begin{bmatrix} - I&\hat{\sigma}^{a} \\ - 0&I - \end{bmatrix} \begin{pmatrix} q \\ - p \end{pmatrix} := - \begin{pmatrix} - \mathrm{diag}(a)\sigma(p)+q \\ - p - \end{pmatrix}, -``` - - and +The second type of layer needed for ``LA``-SympNet are so-called [activation layers](@ref "SympNet Gradient Layer"). -```math - \mathcal{A}^{p} \begin{pmatrix} q \\ - p \end{pmatrix} = - \begin{bmatrix} - I&0 \\ - \hat{\sigma}^{a}&I - \end{bmatrix} \begin{pmatrix} q \\ - p \end{pmatrix} - := - \begin{pmatrix} - q \\ - \mathrm{diag}(a)\sigma(q)+p - \end{pmatrix}. -``` -The activation function $\sigma$ can be any nonlinearity (on which minor restrictions are imposed below). Here the *scaling vector* $a\in\mathbb{R^{d}}$ constitutes the learnable weights. We denote the set of symplectic activation layers by $\mathcal{M}^A$. - -An $LA$-SympNet is a function of the form $\Psi=l_{k} \circ a_{k} \circ l_{k-1} \circ \cdots \circ a_1 \circ l_0$ where $(l_i)_{0\leq i\leq k} \subset (\mathcal{M}^L)^{k+1}$ and $(a_i)_{1\leq i\leq k} \subset (\mathcal{M}^A)^{k}$. We will refer to $k$ as the *number of hidden layers* of the SympNet[^1] and the number $n$ above as the *depth* of the linear layer. +An ``LA``-SympNet is a function of the form ``\Psi=l_{k} \circ a_{k} \circ l_{k-1} \circ \cdots \circ a_1 \circ l_0`` where ``(l_i)_{0\leq i\leq k} \subset \mathcal{M}^L`` and ``(a_i)_{1\leq i\leq k} \subset \mathcal{M}^A``. We will refer to ``k`` as the *number of hidden layers* of the SympNet[^1] and the number ``w`` above as the *depth* of the linear layer. + +[^1]: Note that if ``k=1`` then the ``LA``-SympNet consists of only one linear layer. + +We give an example of calling ``LA``-SympNet: + +```@example +using GeometricMachineLearning -[^1]: Note that if $k=1$ then the $LA$-SympNet consists of only one linear layer. +arch = LASympNet(4; depth=2, nhidden=1, activation=tanh, init_upper_linear=true, init_upper_act=true) + +model = Chain(arch).layers +``` - #### $G$-SympNets +#### $G$-SympNets - $G$-SympNets are an alternative to $LA$-SympNets. They are built with only one kind of layer, called *gradient layer*. For a given activation function $\sigma$ and an integer $n\geq d$, a gradient layers is a symplectic map from $\mathbb{R}^{2d}$ to $\mathbb{R}^{2d}$ defined by +$G$-SympNets are an alternative to $LA$-SympNets. They are built with only one kind of layer, called *gradient layer*. For a given activation function $\sigma$ and an integer $n\geq d$, a gradient layers is a symplectic map from $\mathbb{R}^{2d}$ to $\mathbb{R}^{2d}$ defined by ```math \mathcal{G}^{up} \begin{pmatrix} q \\ @@ -201,6 +184,20 @@ where $d$ is a distance on $\mathbb{R}^d$. See the [tutorial section](../tutorials/sympnet_tutorial.md) for an introduction into using SympNets with `GeometricMachineLearning.jl`. 
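Before the implementation notes that follow, a minimal sketch of what using a SympNet as a one-step map looks like in code (an illustration, not from the patch; the network is untrained here, the constructor keywords follow the `GSympNet` docstring, and `NeuralNetwork(arch, CPU(), Float64)` is assumed from the package's usual usage):

```julia
using GeometricMachineLearning

arch = GSympNet(2; upscaling_dimension = 10, n_layers = 2, activation = tanh)
nn = NeuralNetwork(arch, CPU(), Float64)

qp = (q = [0.1], p = [0.9])
qp_next = nn.model(qp, nn.params)   # one application of the (symplectic) one-step map
```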
+## Data Structures in `GeometricMachineLearning.jl` + +```@example +Main.include_graphics("../tikz/structs_visualization") # hide +``` + +## Library Functions + +```docs; canonical = false +SympNet +LASympNet +GSympNet +``` + ## References ```@bibliography Pages = [] diff --git a/docs/src/tutorials/sympnet_tutorial.md b/docs/src/tutorials/sympnet_tutorial.md index 7e5ce157c..98b8dd848 100644 --- a/docs/src/tutorials/sympnet_tutorial.md +++ b/docs/src/tutorials/sympnet_tutorial.md @@ -47,12 +47,6 @@ and severals keywords argument : The loss function described in the [theory section](../architectures/sympnet.md) is the default choice used in `GeometricMachineLearning.jl` for training SympNets. -## Data Structures in `GeometricMachineLearning.jl` - -```@example -Main.include_graphics("../tikz/structs_visualization") # hide -``` - ## Examples Let us see how to use it on several examples. From 60c6a9c13b07e4d047008736796662b41e8eee1a Mon Sep 17 00:00:00 2001 From: benedict-96 Date: Wed, 26 Jun 2024 12:38:18 +0200 Subject: [PATCH 36/56] Including bias layers at the points where they are needed. --- src/layers/sympnets.jl | 109 ++++++++++++++++++++++++----------------- 1 file changed, 65 insertions(+), 44 deletions(-) diff --git a/src/layers/sympnets.jl b/src/layers/sympnets.jl index c687f32d0..273f9e095 100644 --- a/src/layers/sympnets.jl +++ b/src/layers/sympnets.jl @@ -14,7 +14,7 @@ struct GradientLayer{M, N, TA, C} <: SympNetLayer{M, N} end @doc raw""" - GradientLayerQ(sys_dim, upscaling_dimension, activation) + GradientLayerQ(n, upscaling_dimension, activation) Make an instance of a gradient-``q`` layer. @@ -31,7 +31,7 @@ with ``V(p) = \sum_{i=1}^Ma_i\Sigma(\sum_jk_{ij}p_j+b_i)``, where ``\Sigma`` is const GradientLayerQ{M, N, TA} = GradientLayer{M, N, TA, :Q} @doc raw""" - GradientLayerP(sys_dim, upscaling_dimension, activation) + GradientLayerP(n, upscaling_dimension, activation) Make an instance of a gradient-``p`` layer. @@ -48,33 +48,59 @@ with ``V(p) = \sum_{i=1}^Ma_i\Sigma(\sum_jk_{ij}q_j+b_i)``, where ``\Sigma`` is const GradientLayerP{M, N, TA} = GradientLayer{M, N, TA, :P} @doc raw""" -`LinearLayer` is the `struct` corresponding to the constructors `LinearLayerQ` and `LinearLayerP`. See those for more information. +`LinearLayer` is the `struct` corresponding to the constructors [`LinearLayerQ`](@ref) and [`LinearLayerP`](@ref). See those for more information. + +# Implementation + +`LinearLayer` uses the custom matrix [`SymmetricMatrix`](@ref) for an especially efficient implementation. """ struct LinearLayer{M, N, C} <: SympNetLayer{M, N} end +@doc raw""" + LinearLayerQ(n) + +Make a linear layer of dimension ``n\times{}n`` that only changes the ``q`` component. + +Equivalent to a left multiplication by the matrix: +```math +\begin{pmatrix} +\mathbb{I} & B \\ +\mathbb{O} & \mathbb{I} +\end{pmatrix}, +``` +where $B$ is a symmetric matrix. +""" const LinearLayerQ{M, N, TA} = LinearLayer{M, N, :Q} + +@doc raw""" + LinearLayerP(n) + +Make a linear layer of dimension ``n\times{}n`` that only changes the ``p`` component. + +Equivalent to a left multiplication by the matrix: +```math +\begin{pmatrix} +\mathbb{I} & \mathbb{O} \\ +B & \mathbb{I} +\end{pmatrix}, +``` +where $B$ is a symmetric matrix. +""" const LinearLayerP{M, N, TA} = LinearLayer{M, N, :P} @doc raw""" -`ActivationLayer` is the `struct` corresponding to the constructors `ActivationLayerQ` and `ActivationLayerP`. See those for more information. 
+`ActivationLayer` is the `struct` corresponding to the constructors [`ActivationLayerQ`](@ref) and [`ActivationLayerP`](@ref). See those for more information. """ struct ActivationLayer{M, N, TA, C} <: SympNetLayer{M, N} activation::TA end -const ActivationLayerQ{M, N, TA} = ActivationLayer{M, N, TA, :Q} -const ActivationLayerP{M, N, TA} = ActivationLayer{M, N, TA, :P} - -function GradientLayerQ(M, upscaling_dimension, activation) - GradientLayer{M, M, typeof(activation), :Q}(upscaling_dimension, activation) -end +@doc raw""" + ActivationLayerQ(n, σ) -function GradientLayerP(M, upscaling_dimension, activation) - GradientLayer{M, M, typeof(activation), :P}(upscaling_dimension, activation) -end +Make an activation layer of size ``n`` and with activation ``σ`` that only changes the ``q`` component. -@doc raw""" Performs: ```math @@ -87,11 +113,13 @@ Performs: ``` """ -function ActivationLayerQ(M, activation) - ActivationLayerQ{M, M, typeof(activation)}(activation) -end +const ActivationLayerQ{M, N, TA} = ActivationLayer{M, N, TA, :Q} @doc raw""" + ActivationLayerP(n, σ) + +Make an activation layer of size ``n`` and with activation ``σ`` that only changes the ``p`` component. + Performs: ```math @@ -104,35 +132,28 @@ Performs: ``` """ +const ActivationLayerP{M, N, TA} = ActivationLayer{M, N, TA, :P} + +function GradientLayerQ(M, upscaling_dimension, activation) + GradientLayer{M, M, typeof(activation), :Q}(upscaling_dimension, activation) +end + +function GradientLayerP(M, upscaling_dimension, activation) + GradientLayer{M, M, typeof(activation), :P}(upscaling_dimension, activation) +end + +function ActivationLayerQ(M, activation) + ActivationLayerQ{M, M, typeof(activation)}(activation) +end + function ActivationLayerP(M, activation) ActivationLayerP{M, M, typeof(activation)}(activation) end - -@doc raw""" -Equivalent to a left multiplication by the matrix: -```math -\begin{pmatrix} -\mathbb{I} & B \\ -\mathbb{O} & \mathbb{I} -\end{pmatrix}, -``` -where $B$ is a symmetric matrix. -""" function LinearLayerQ(M) LinearLayerQ{M, M}() end -@doc raw""" -Equivalent to a left multiplication by the matrix: -```math -\begin{pmatrix} -\mathbb{I} & \mathbb{O} \\ -B & \mathbb{I} -\end{pmatrix}, -``` -where $B$ is a symmetric matrix. 
-""" function LinearLayerP(M) LinearLayerP{M, M}() end @@ -168,27 +189,27 @@ function initialparameters(d::GradientLayer{M, M}, backend::Backend, ::Type{T}; end function initialparameters(::ActivationLayer{M, M}, backend::Backend, ::Type{T}; rng::AbstractRNG = Random.default_rng(), init_scale = GlorotUniform()) where {M, T} - a = KernelAbstractions.zeros(backend, T, M÷2) + a = KernelAbstractions.zeros(backend, T, M ÷ 2) init_scale(rng, a) return (scale = a,) end function initialparameters(::LinearLayer{M, M}, backend::Backend, ::Type{T}; rng::AbstractRNG = Random.default_rng(), init_weight = GlorotUniform()) where {M, T} - S = KernelAbstractions.allocate(backend, T, (M÷2)*(M÷2+1)÷2) + S = KernelAbstractions.allocate(backend, T, (M ÷ 2) * (M ÷ 2 + 1) ÷ 2) init_weight(rng, S) - (weight=SymmetricMatrix(S, M÷2), ) + (weight=SymmetricMatrix(S, M ÷ 2), ) end function parameterlength(d::GradientLayer{M, M}) where {M} - d.second_dim÷2 * (M÷2 + 2) + d.second_dim ÷ 2 * (M ÷ 2 + 2) end function parameterlength(::ActivationLayer{M, M}) where {M} - M÷2 + M ÷ 2 end function parameterlength(::LinearLayer{M, M}) where {M} - (M÷2)*(M÷2+1)÷2 + (M ÷ 2) * (M ÷ 2 + 1) ÷ 2 end @doc raw""" From 14e3c593228bda04af2f55796a269edd9575d8a6 Mon Sep 17 00:00:00 2001 From: benedict-96 Date: Wed, 26 Jun 2024 12:38:56 +0200 Subject: [PATCH 37/56] Started reworking docstrings for sympnet architectures. --- src/architectures/sympnet.jl | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/src/architectures/sympnet.jl b/src/architectures/sympnet.jl index 80255e8f5..df74874b0 100644 --- a/src/architectures/sympnet.jl +++ b/src/architectures/sympnet.jl @@ -4,12 +4,20 @@ The `SympNet` type encompasses [`GSympNet`](@ref)s and [`LASympNet`](@ref)s. Sym abstract type SympNet{AT} <: NeuralNetworkIntegrator end @doc raw""" -`LASympNet` is called with **a single input argument**, the **system dimension**, or with an instance of `DataLoader`. Optional input arguments are: + LASympNet(d) + +Make an ``LA``-SympNet with dimension ``d.`` + +There exists an additional constructor that can be called by supplying an instance of [`DataLoader`](@ref). + +# Arguments + +Keyword arguments are: - `depth::Int`: The number of linear layers that are applied. The default is 5. - `nhidden::Int`: The number of hidden layers (i.e. layers that are **not** input or output layers). The default is 2. - `activation`: The activation function that is applied. By default this is `tanh`. -- `init_upper_linear::Bool`: Initialize the linear layer so that it first modifies the $q$-component. The default is `true`. -- `init_upper_act::Bool`: Initialize the activation layer so that it first modifies the $q$-component. The default is `true`. +- `init_upper_linear::Bool`: Initialize the linear layer so that it first modifies the ``q``-component. The default is `true`. +- `init_upper_act::Bool`: Initialize the activation layer so that it first modifies the ``q``-component. The default is `true`. """ struct LASympNet{AT, InitUpperLinear, InitUpperAct} <: SympNet{AT} where {InitUpperLinear, InitUpperAct} dim::Int @@ -29,7 +37,15 @@ end @inline AbstractNeuralNetworks.dim(arch::SympNet) = arch.dim @doc raw""" -`GSympNet` is called with **a single input argument**, the **system dimension**, or with an instance of `DataLoader`. Optional input arguments are: + GSympNet(d) + +Make a ``G``-SympNet with dimension ``d.`` + +There exists an additional constructor that can be called by supplying an instance of [`DataLoader`](@ref). 
+ +# Arguments + +`Keyword arguments are: - `upscaling_dimension::Int`: The *upscaling dimension* of the gradient layer. See the documentation for `GradientLayerQ` and `GradientLayerP` for further explanation. The default is `2*dim`. - `n_layers::Int`: The number of layers (i.e. the total number of [`GradientLayerQ`](@ref) and [`GradientLayerP`](@ref)). The default is 2. - `activation`: The activation function that is applied. By default this is `tanh`. @@ -98,6 +114,7 @@ function Chain(arch::LASympNet{AT, false, true}) where {AT} for j in 1:arch.depth layers = isodd(j) ? (layers..., LinearLayerP(arch.dim)) : (layers..., LinearLayerQ(arch.dim)) end + layers = (layers..., BiasLayer(arch.dim)) layers = (layers..., ActivationLayerQ(arch.dim, arch.activation)) layers = (layers..., ActivationLayerP(arch.dim, arch.activation)) end @@ -116,6 +133,7 @@ function Chain(arch::LASympNet{AT, false, false}) where {AT} for j in 1:arch.depth layers = isodd(j) ? (layers..., LinearLayerP(arch.dim)) : (layers..., LinearLayerQ(arch.dim)) end + layers = (layers..., BiasLayer(arch.dim)) layers = (layers..., ActivationLayerP(arch.dim, arch.activation)) layers = (layers..., ActivationLayerQ(arch.dim, arch.activation)) end @@ -134,6 +152,7 @@ function Chain(arch::LASympNet{AT, true, true}) where {AT} for j in 1:arch.depth layers = isodd(j) ? (layers..., LinearLayerQ(arch.dim)) : (layers..., LinearLayerP(arch.dim)) end + layers = (layers..., BiasLayer(arch.dim)) layers = (layers..., ActivationLayerQ(arch.dim, arch.activation)) layers = (layers..., ActivationLayerP(arch.dim, arch.activation)) end From b86663eaa4a5a45ba9eda32477927d2374d35c00 Mon Sep 17 00:00:00 2001 From: benedict-96 Date: Thu, 27 Jun 2024 06:33:04 +0200 Subject: [PATCH 38/56] Made SympNet theory bit better conform with docstrings etc. --- docs/src/architectures/sympnet.md | 142 +++++++++++++++--------------- 1 file changed, 73 insertions(+), 69 deletions(-) diff --git a/docs/src/architectures/sympnet.md b/docs/src/architectures/sympnet.md index 4864d2199..99ef4471d 100644 --- a/docs/src/architectures/sympnet.md +++ b/docs/src/architectures/sympnet.md @@ -2,9 +2,7 @@ This document discusses the SympNet architecture and its implementation in `GeometricMachineLearning.jl`. -## Quick overview of the theory of SympNets - -### Principle +## Principle SympNets [jin2020sympnets](@cite) are a type of neural network that can model the trajectory of a Hamiltonian system in phase space. Take ``(q^T,p^T)^T=(q_1,\ldots,q_d,p_1,\ldots,p_d)^T\in \mathbb{R}^{2d}`` as the coordinates in phase space, where ``q=(q_1, \ldots, q_d)^T\in \mathbb{R}^{d}`` is refered to as the *position* and ``p=(p_1, \ldots, p_d)^T\in \mathbb{R}^{d}`` the *momentum*. Given a point ``(q^T,p^T)^T`` in ``\mathbb{R}^{2d}`` the SympNet aims to compute the *next position* ``((q')^T,(p')^T)^T`` and thus predicts the trajectory while preserving the *symplectic structure* of the system. SympNets are enforcing symplecticity strongly, meaning that this property is hard-coded into the network architecture. The layers are reminiscent of traditional neural network feedforward layers, but have a strong restriction imposed on them in order to be symplectic. @@ -30,20 +28,20 @@ SympNets preserve symplecticity by exploiting the ``(q, p)`` structure of the sy In the figure above we see that an update for ``q`` is based on data coming from ``p`` and an update for ``p`` is based on data coming from ``q``. There are two types of SympNet architectures: ``LA``-SympNets and ``G``-SympNets. 
-#### ``LA``-SympNet +## ``LA``-SympNet -The first type of SympNets, ``LA``-SympNets, are obtained from composing two types of layers: [symplectic linear layers](@ref "SympNet Linear Layer") and [symplectic activation layers](@ref "SympNet Gradient Layer"). For a given integer ``n``, the *linear part* of an ``LA``-SympNet is +The first type of SympNets, ``LA``-SympNets, are obtained from composing two types of layers: [symplectic linear layers](@ref "SympNet Linear Layer") and [symplectic activation layers](@ref "SympNet Gradient Layer"). For a given integer ``w``, the *linear part* of an ``LA``-SympNet is ```math -\mathcal{L}^{n,q} +\mathcal{L}^{w,\mathrm{up}} \begin{pmatrix} q \\ p \\ \end{pmatrix} = \begin{pmatrix} - I & S^n/0 \\ - 0/S^n & I \\ + I & S^w/0 \\ + 0/S^w & I \\ \end{pmatrix} \cdots \begin{pmatrix} @@ -64,7 +62,7 @@ The first type of SympNets, ``LA``-SympNets, are obtained from composing two typ or ```math -\mathcal{L}^{w,p} +\mathcal{L}^{w,\mathrm{low}} \begin{pmatrix} q \\ p \end{pmatrix} = \begin{pmatrix} @@ -84,107 +82,113 @@ or + b . ``` -The superscripts ``q`` and ``p`` indicate whether the ``q`` or the ``p`` part is first changed. The learnable parameters are the symmetric matrices ``A^i\in\mathbb{R}^{d\times d}`` and the bias ``b\in\mathbb{R}^{2d}``. The integer ``w`` is the number of linear layers in one block. It can be shown that five of these layers, i.e. ``w\geq{}5``, can represent any linear symplectic map (see [jin2022optimal](@cite)), so ``w`` need not be larger than five. We denote the set of symplectic linear layers by ``\mathcal{M}^L``. +The superscripts ``\mathrm{up}`` and ``\mathrm{low}`` indicate whether the ``q`` or the ``p`` part is first changed. The learnable parameters are the symmetric matrices ``A^i\in\mathbb{R}^{d\times d}`` and the bias ``b\in\mathbb{R}^{2d}``. The integer ``w`` is the number of linear layers in one block. It can be shown that five of these layers, i.e. ``w\geq{}5``, can represent any linear symplectic map (see [jin2022optimal](@cite)), so ``w`` need not be larger than five. We denote the set of symplectic linear layers by ``\mathcal{M}^L``. -The second type of layer needed for ``LA``-SympNet are so-called [activation layers](@ref "SympNet Gradient Layer"). +The second type of layer needed for ``LA``-SympNets are [activation layers](@ref "SympNet Gradient Layer"). -An ``LA``-SympNet is a function of the form ``\Psi=l_{k} \circ a_{k} \circ l_{k-1} \circ \cdots \circ a_1 \circ l_0`` where ``(l_i)_{0\leq i\leq k} \subset \mathcal{M}^L`` and ``(a_i)_{1\leq i\leq k} \subset \mathcal{M}^A``. We will refer to ``k`` as the *number of hidden layers* of the SympNet[^1] and the number ``w`` above as the *depth* of the linear layer. +An ``LA``-SympNet is a mapping of the form ``\Psi=l_{k} \circ a_{k} \circ l_{k-1} \circ \cdots \circ a_1 \circ l_0`` where ``(l_i)_{0\leq i\leq k} \subset \mathcal{M}^L`` and ``(a_i)_{1\leq i\leq k} \subset \mathcal{M}^A``. We will refer to ``k`` as the *number of hidden layers* of the SympNet[^1] and the number ``w`` above as the *depth* of the linear layer. -[^1]: Note that if ``k=1`` then the ``LA``-SympNet consists of only one linear layer. +[^1]: Note that if ``k=0`` then the ``LA``-SympNet consists of only one linear layer. 
We give an example of calling ``LA``-SympNet: ```@example using GeometricMachineLearning -arch = LASympNet(4; depth=2, nhidden=1, activation=tanh, init_upper_linear=true, init_upper_act=true) +k = 1 +w = 2 +arch = LASympNet(4; nhidden = k, depth = 2, init_upper_linear = true, init_upper_act = true, activation = tanh) model = Chain(arch).layers ``` -#### $G$-SympNets - -$G$-SympNets are an alternative to $LA$-SympNets. They are built with only one kind of layer, called *gradient layer*. For a given activation function $\sigma$ and an integer $n\geq d$, a gradient layers is a symplectic map from $\mathbb{R}^{2d}$ to $\mathbb{R}^{2d}$ defined by - -```math - \mathcal{G}^{up} \begin{pmatrix} q \\ - p \end{pmatrix} = - \begin{bmatrix} - I&\hat{\sigma}^{K,a,b} \\ - 0&I - \end{bmatrix} \begin{pmatrix} q \\ - p \end{pmatrix} := - \begin{pmatrix} - K^T \mathrm{diag}(a)\sigma(Kp+b)+q \\ - p - \end{pmatrix}, -``` - -or - -```math - \mathcal{G}^{low} \begin{pmatrix} q \\ - p \end{pmatrix} = - \begin{bmatrix} - I&0 \\ - \hat{\sigma}^{K,a,b}&I - \end{bmatrix} \begin{pmatrix} q \\ - p \end{pmatrix} - := - \begin{pmatrix} - q \\ - K^T \mathrm{diag}(a)\sigma(Kq+b)+p - \end{pmatrix}. -``` +The keywords `init_upper_linear` and `init_upper_act` indicate whether the first linear (respectively activation) layer is of ``q`` type[2]. + +[^2]: If `init_upper_linear = true` then the first layer is of ``q`` type, i.e. changes the ``q`` component and leaves the ``p`` component unchanged. -The parameters of this layer are the *scaling matrix* $K\in\mathbb{R}^{m\times d}$, the bias $b\in\mathbb{R}^{m}$ and the *scaling vector* $a\in\mathbb{R}^{m}$. The name "gradient layer" has its origin in the fact that the expression $[K^T\mathrm{diag}(a)\sigma(Kq+b)]_i = \sum_jk_{ji}a_j\sigma(\sum_\ell{}k_{j\ell}q_\ell+b_j)$ is the gradient of a function $\sum_ja_j\tilde{\sigma}(\sum_\ell{}k_{j\ell}q_\ell+b_j)$, where $\tilde{\sigma}$ is the antiderivative of $\sigma$. The first dimension of $K$ we refer to as the *upscaling dimension*. +## ``G``-SympNets -If we denote by $\mathcal{M}^G$ the set of gradient layers, a $G$-SympNet is a function of the form $\Psi=g_k \circ g_{k-1} \circ \cdots \circ g_0$ where $(g_i)_{0\leq i\leq k} \subset (\mathcal{M}^G)^k$. The index $k$ is again the *number of hidden layers*. +``G``-SympNets are an alternative to ``LA``-SympNets. They are built with only one kind of layer, the [gradient layer](@ref "SympNet Gradient Layer"). If we denote by ``\mathcal{M}^G`` the set of gradient layers, a ``G``-SympNet is a function of the form ``\Psi=g_k \circ g_{k-1} \circ \cdots \circ g_1`` where ``(g_i)_{1\leq i\leq k} \subset \mathcal{M}^G``. The index ``k`` here is the *number of layers* in the SympNet. -Further note here the different roles played by round and square brackets: the latter indicates a nonlinear operation as opposed to a regular vector or matrix. +```@example +using GeometricMachineLearning -### Universal approximation theorems +k = 2 +n = 10 +arch = GSympNet(4; upscaling_dimension = n, n_layers = k, init_upper = true, activation = tanh) + +model = Chain(arch).layers +``` + +## Universal approximation theorem In order to state the *universal approximation theorem* for both architectures we first need a few definitions: -Let $U$ be an open set of $\mathbb{R}^{2d}$, and let us denote by $\mathcal{SP}^r(U)$ the set of $C^r$ smooth symplectic maps on $U$. 
We now define a topology on $C^r(K, \mathbb{R}^n)$, the set of $C^r$-smooth maps from a compact set $K\subset\mathbb{R}^{n}$ to $\mathbb{R}^{n}$ through the norm +Let ``U`` be an open set of ``\mathbb{R}^{2d}``, and let us denote by ``\mathcal{SP}^r(U)`` the set of ``C^r`` smooth symplectic maps on ``U``. We now define a topology on ``C^r(K, \mathbb{R}^{2d})``, the set of ``C^r``-smooth maps from a compact set ``K\subset{}U}`` to ``\mathbb{R}^{2d}`` through the norm ```math -||f||_{C^r(K,\mathbb{R}^{n})} = \underset{|\alpha|\leq r}{\sum} \underset{1\leq i \leq n}{\max}\underset{x\in K}{\sup} |D^\alpha f_i(x)|, +||f||_{C^r(K,\mathbb{R}^{2d})} = \sum_{|\alpha|\leq r} \underset{1\leq i \leq 2d}{\max}\underset{x\in K}{\sup} |D^\alpha f_i(x)|, ``` -where the differential operator $D^\alpha$ is defined by +where the differential operator ``D^\alpha`` is defined by ```math D^\alpha f = \frac{\partial^{|\alpha|} f}{\partial x_1^{\alpha_1}...x_n^{\alpha_n}}, ``` -with $|\alpha| = \alpha_1 +...+ \alpha_n$. +with ``|\alpha| = \alpha_1 +...+ \alpha_{2d}``$. We impose the following definition on the activation function: -__Definition__ $\sigma$ is **$r$-finite** if $\sigma\in C^r(\mathbb{R},\mathbb{R})$ and $\int |D^r\sigma(x)|dx <+\infty$. +```@eval +Main.definition(raw"$\sigma$ is **``r``-finite** if ``\sigma\in C^r(\mathbb{R},\mathbb{R})`` and ``\int |D^r\sigma(x)|dx <\infty``.") +``` -__Definition__ Let $m,n,r\in \mathbb{N}$ with $m,n>0$ be given, $U$ an open set of $\mathbb{R}^m$, and $I,J\subset C^r(U,\mathbb{R}^n)$. We say $J$ is **$r$-uniformly dense on compacta in $I$** if $J \subset I$ and for any $f\in I$, $\epsilon>0$, and any compact $K\subset U$, there exists $g\in J$ such that $||f-g||_{C^r(K,\mathbb{R}^{n})} < \epsilon$. +```@eval +Main.definition(raw"Let ``m,n,r\in \mathbb{N}`` with ``m,n>0`` be given, ``U`` an open set of ``\mathbb{R}^m``, and ``I,J\subset C^r(U,\mathbb{R}^n)``. We say ``J`` is **``r``-uniformly dense on compacta in ``I``** if ``J \subset I`` and for any ``f\in I``, ``\epsilon>0``, and any compact ``K\subset U``, there exists ``g\in J`` such that ``||f-g||_{C^r(K,\mathbb{R}^{n})} < \epsilon``.") +``` We can now state the universal approximation theorems: -__Theorem (Approximation theorem for LA-SympNet)__ For any positive integer $r>0$ and open set $U\in \mathbb{R}^{2d}$, the set of $LA$-SympNet is $r$-uniformly dense on compacta in $SP^r(U)$ if the activation function $\sigma$ is $r$-finite. +```@eval +Main.theorem(raw"For any positive integer ``r>0`` and open set ``U\in \mathbb{R}^{2d}``, the set of ``LA``-SympNet is ``r``-uniformly dense on compacta in ``SP^r(U)`` if the activation function ``\sigma`` is ``r``-finite."; name = raw"Approximation theorem for ``LA``-SympNets") +``` -__Theorem (Approximation theorem for G-SympNet)__ For any positive integer $r>0$ and open set $U\in \mathbb{R}^{2d}$, the set of $G$-SympNet is $r$-uniformly dense on compacta in $SP^r(U)$ if the activation function $\sigma$ is $r$-finite. +and -There are many $r$-finite activation functions commonly used in neural networks, for example: -- sigmoid $\sigma(x)=\frac{1}{1+e^{-x}}$ for any positive integer $r$, -- tanh $\tanh(x)=\frac{e^x-e^{-x}}{e^x+e^{-x}}$ for any positive integer $r$. 
+```@eval +Main.theorem(raw"For any positive integer ``r>0`` and open set ``U\in \mathbb{R}^{2d}``, the set of ``G``-SympNet is ``r``-uniformly dense on compacta in ``SP^r(U)`` if the activation function ``\sigma`` is ``r``-finite."; name = raw"Approximation theorem for ``G``-SympNets") +``` + +There are many ``r``-finite activation functions commonly used in neural networks, for example: +- The sigmoid activation function: ``\sigma(x)=\frac{1}{1+e^{-x}}``, +- The hyperbolic tangent function: ``\tanh(x)=\frac{e^x-e^{-x}}{e^x+e^{-x}}``. -The universal approximation theorems state that we can, in principle, get arbitrarily close to any symplectomorphism defined on $\mathbb{R}^{2d}$. But this does not tell us anything about how to optimize the network. This is can be done with any common [neural network optimizer](@ref "Neural Network Optimizers") and these neural network optimizers always rely on a corresponding loss function. +The universal approximation theorems state that we can, in principle, get arbitrarily close to any symplectomorphism defined on ``\mathbb{R}^{2d}``. But this does not tell us anything about how to optimize the network. This is can be done with any common [neural network optimizer](@ref "Neural Network Optimizers") and these neural network optimizers always rely on a corresponding loss function. ## Loss function -To train the SympNet, one need data along a trajectory such that the model is trained to perform an integration. These data are $(Q,P)$ where $Q[i,j]$ (respectively $P[i,j]$) is the real number $q_j(t_i)$ (respectively $p[i,j]$) which is the j-th coordinates of the generalized position (respectively momentum) at the i-th time step. One also need a loss function defined as : +To train the SympNet, one needs data along a trajectory such that the model is trained to perform an integration. The loss function is defined as[^3]: + +[^3]: This loss function is implemented as [`FeedForwardLoss`](@ref) in `GeometricMachineLearning`. + +```math +\mathrm{loss}(z^\mathrm{c}, z^\mathrm{p}) = || z^\mathrm{c} - z^\mathrm{p} || / || z^\mathrm{c} ||, +``` + +where + +```math +z^\mathrm{c} = \begin{pmatrix} q^\mathrm{c} \\ p^\mathrm{c} \end{pmatrix} +``` + +is the current state and + +```math +z^\mathrm{p} = \begin{pmatrix} q^\mathrm{p} \\ p^\mathrm{p} \end{pmatrix} +``` -$$Loss(Q,P) = \underset{i}{\sum} d(\Phi(Q[i,-],P[i,-]), [Q[i,-] P[i,-]]^T)$$ -where $d$ is a distance on $\mathbb{R}^d$. +is the predicted state. In the [tutorial section](@ref "SympNets with `GeometricMachineLearning`") we show how to use SympNets in `GeometricMachineLearning.jl` and how to [modify the loss function](@ref "Adjusting the Loss Function"). -See the [tutorial section](../tutorials/sympnet_tutorial.md) for an introduction into using SympNets with `GeometricMachineLearning.jl`. +## How are SympNets Implemented? -## Data Structures in `GeometricMachineLearning.jl` +The following picture illustrates where how the different SympNet architectures are implemented: ```@example Main.include_graphics("../tikz/structs_visualization") # hide From a6561bc24ab91723645a22363bd7f4bdd38f9a29 Mon Sep 17 00:00:00 2001 From: benedict-96 Date: Thu, 27 Jun 2024 06:33:44 +0200 Subject: [PATCH 39/56] GeometricMachineLearning.jl -> GeometricMachineLearning. 
--- docs/src/tutorials/sympnet_tutorial.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/tutorials/sympnet_tutorial.md b/docs/src/tutorials/sympnet_tutorial.md index 98b8dd848..7f5527f35 100644 --- a/docs/src/tutorials/sympnet_tutorial.md +++ b/docs/src/tutorials/sympnet_tutorial.md @@ -1,4 +1,4 @@ -# SympNets with `GeometricMachineLearning.jl` +# SympNets with `GeometricMachineLearning` This page serves as a short introduction into using SympNets with `GeometricMachineLearning.jl`. For the general theory see [the theory section](../architectures/sympnet.md). From 4767c04ffbc8f583211922ab3dea1f3a7c0bc498 Mon Sep 17 00:00:00 2001 From: benedict-96 Date: Thu, 27 Jun 2024 06:44:49 +0200 Subject: [PATCH 40/56] Started adding tikz picture showing ingredients. --- docs/src/tikz/ingredients.tex | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 docs/src/tikz/ingredients.tex diff --git a/docs/src/tikz/ingredients.tex b/docs/src/tikz/ingredients.tex new file mode 100644 index 000000000..3bbfff9c8 --- /dev/null +++ b/docs/src/tikz/ingredients.tex @@ -0,0 +1,27 @@ +\documentclass[tikz]{standalone} +\usepackage{amsmath} +\usepackage{amsfonts} + +\begin{document} + +\begin{tikzpicture} + \node[rectangle,draw] (r) at (0,0) {\textbf{Network architecture}: function $NN_\theta:\mathbb{R}^\mathrm{input-dim}\to\mathbb{R}^\mathrm{output-dim}$ parametrized by $\theta \equiv (W,b)$ }; +\end{tikzpicture} + +\vspace{2em} +{\Huge+} +\vspace{2em} + +\begin{tikzpicture} + \node[rectangle,draw] (r) at (0,0) {\textbf{Loss function} $L(\theta)$: is minimized during training.};% $\implies$ \textbf{reconstruction error} $||M - \mathcal{R}\circ\mathcal{P}(M)||$}; +\end{tikzpicture} + +\vspace{2em} +{\Huge+} +\vspace{2em} + +\begin{tikzpicture} + \node[rectangle,draw] (r) at (0,0) {\textbf{Optimization procedure (gradient descent)}: updates $\theta$ with $\nabla{}L(\theta)$; $\theta \leftarrow \theta - \eta\nabla{}L(\theta)$}; +\end{tikzpicture} + +\end{document} \ No newline at end of file From b313669fd62051664739502f43e78fd7792eaa8f Mon Sep 17 00:00:00 2001 From: benedict-96 Date: Thu, 27 Jun 2024 06:45:14 +0200 Subject: [PATCH 41/56] Added sentence on different losses. --- docs/src/reduced_order_modeling/losses.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/src/reduced_order_modeling/losses.md b/docs/src/reduced_order_modeling/losses.md index 25bef1294..06278b82e 100644 --- a/docs/src/reduced_order_modeling/losses.md +++ b/docs/src/reduced_order_modeling/losses.md @@ -1,5 +1,6 @@ # Different Neural Network Losses +`GeometricMachineLearning` has a number of loss functions implemented that can be called *standard losses*. How to implement custom losses is shown in the [tutorials](@ref "Adjusting the Loss Function"). ```@docs; canonical = false GeometricMachineLearning.NetworkLoss From 2407345e8fcb71fe87e8486b6a4c4f2b946837ad Mon Sep 17 00:00:00 2001 From: benedict-96 Date: Thu, 27 Jun 2024 09:39:38 +0200 Subject: [PATCH 42/56] Finished ingredients tikz. 
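To connect the three ingredients named in the TikZ picture (network architecture, loss function, optimization procedure), a compact training sketch (an illustration, not from the patch; the data are random placeholders, the training call follows the pattern used in the package tutorials, and the default loss for this setup is assumed to be `FeedForwardLoss`):

```julia
using GeometricMachineLearning

data = (q = rand(1, 100), p = rand(1, 100))   # placeholder (q, p) trajectory data
dl = DataLoader(data)

arch = GSympNet(2)                            # ingredient 1: the network architecture
nn = NeuralNetwork(arch, CPU(), Float64)

o = Optimizer(AdamOptimizer(), nn)            # ingredient 3: the optimization procedure
batch = Batch(16)
n_epochs = 10

loss_history = o(nn, dl, batch, n_epochs)     # ingredient 2: the loss, minimized during training
```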
--- docs/src/tikz/Makefile | 2 ++ docs/src/tikz/ingredients.tex | 22 +++++----------------- docs/src/tikz/ingredients_dark.tex | 15 +++++++++++++++ 3 files changed, 22 insertions(+), 17 deletions(-) create mode 100644 docs/src/tikz/ingredients_dark.tex diff --git a/docs/src/tikz/Makefile b/docs/src/tikz/Makefile index 2af6f4f97..68b441c29 100644 --- a/docs/src/tikz/Makefile +++ b/docs/src/tikz/Makefile @@ -59,6 +59,8 @@ png: pdftocairo -png -r $(res1) -transp -singlefile skew_sym_visualization_dark.pdf skew_sym_visualization_dark pdftocairo -png -r $(res2) -transp -singlefile offline_online.pdf offline_online pdftocairo -png -r $(res2) -transp -singlefile offline_online_dark.pdf offline_online_dark + pdftocairo -png -r $(res2) -transp -singlefile ingredients.pdf ingredients + pdftocairo -png -r $(res2) -transp -singlefile ingredients_dark.pdf ingredients_dark logo: cp logo_with_name.png ../assets/logo.png diff --git a/docs/src/tikz/ingredients.tex b/docs/src/tikz/ingredients.tex index 3bbfff9c8..df0d73b8f 100644 --- a/docs/src/tikz/ingredients.tex +++ b/docs/src/tikz/ingredients.tex @@ -5,23 +5,11 @@ \begin{document} \begin{tikzpicture} - \node[rectangle,draw] (r) at (0,0) {\textbf{Network architecture}: function $NN_\theta:\mathbb{R}^\mathrm{input-dim}\to\mathbb{R}^\mathrm{output-dim}$ parametrized by $\theta \equiv (W,b)$ }; -\end{tikzpicture} - -\vspace{2em} -{\Huge+} -\vspace{2em} - -\begin{tikzpicture} - \node[rectangle,draw] (r) at (0,0) {\textbf{Loss function} $L(\theta)$: is minimized during training.};% $\implies$ \textbf{reconstruction error} $||M - \mathcal{R}\circ\mathcal{P}(M)||$}; -\end{tikzpicture} - -\vspace{2em} -{\Huge+} -\vspace{2em} - -\begin{tikzpicture} - \node[rectangle,draw] (r) at (0,0) {\textbf{Optimization procedure (gradient descent)}: updates $\theta$ with $\nabla{}L(\theta)$; $\theta \leftarrow \theta - \eta\nabla{}L(\theta)$}; + \node[rectangle,draw] (i1) at (0,0) {\textbf{Network architecture}: function $NN_\theta:\mathbb{R}^\mathrm{input-dim}\to\mathbb{R}^\mathrm{output-dim}$ parametrized by $\theta \equiv (W,b)$ }; + \node[below of=i1] (p1) {\Huge+}; + \node[rectangle,draw, below of=p1] (i2) {\textbf{Loss function} $L(\theta)$: is minimized during training.};% $\implies$ \textbf{reconstruction error} $||M - \mathcal{R}\circ\mathcal{P}(M)||$}; + \node[below of=i2] (p2) {\Huge+}; + \node[rectangle,draw, below of=p2] (i3) {\textbf{Optimization procedure (gradient descent)}: updates $\theta$ with $\nabla{}L(\theta)$; $\theta \leftarrow \theta - \eta\nabla{}L(\theta)$}; \end{tikzpicture} \end{document} \ No newline at end of file diff --git a/docs/src/tikz/ingredients_dark.tex b/docs/src/tikz/ingredients_dark.tex new file mode 100644 index 000000000..bd86939ed --- /dev/null +++ b/docs/src/tikz/ingredients_dark.tex @@ -0,0 +1,15 @@ +\documentclass[tikz]{standalone} +\usepackage{amsmath} +\usepackage{amsfonts} + +\begin{document} + +\begin{tikzpicture}[color=white] + \node[rectangle,draw] (i1) at (0,0) {\textbf{Network architecture}: function $NN_\theta:\mathbb{R}^\mathrm{input-dim}\to\mathbb{R}^\mathrm{output-dim}$ parametrized by $\theta \equiv (W,b)$ }; + \node[below of=i1] (p1) {\Huge+}; + \node[rectangle,draw, below of=p1] (i2) {\textbf{Loss function} $L(\theta)$: is minimized during training.};% $\implies$ \textbf{reconstruction error} $||M - \mathcal{R}\circ\mathcal{P}(M)||$}; + \node[below of=i2] (p2) {\Huge+}; + \node[rectangle,draw, below of=p2] (i3) {\textbf{Optimization procedure (gradient descent)}: updates $\theta$ with $\nabla{}L(\theta)$; $\theta 
\leftarrow \theta - \eta\nabla{}L(\theta)$};
+\end{tikzpicture}
+
+\end{document} \ No newline at end of file

From b313669fd62051664739502f43e78fd7792eaa8f Mon Sep 17 00:00:00 2001
From: benedict-96
Date: Thu, 27 Jun 2024 06:45:14 +0200
Subject: [PATCH 43/56] Added PINN reference.

---
 docs/src/GeometricMachineLearning.bib     | 10 ++++++++++
 docs/src/reduced_order_modeling/losses.md | 18 ++++++++++++++++++
 2 files changed, 28 insertions(+)

diff --git a/docs/src/GeometricMachineLearning.bib b/docs/src/GeometricMachineLearning.bib
index a7ee2687a..499ad51e3 100644
--- a/docs/src/GeometricMachineLearning.bib
+++ b/docs/src/GeometricMachineLearning.bib
@@ -448,4 +448,14 @@ @inproceedings{huang2016riemannian
  pages={627--634},
  year={2016},
  organization={Springer}
+}
+
+@article{raissi2019physics,
+  title={Physics-informed neural networks: A deep learning framework for solving forward and inverse problems involving nonlinear partial differential equations},
+  author={Raissi, Maziar and Perdikaris, Paris and Karniadakis, George E},
+  journal={Journal of Computational physics},
+  volume={378},
+  pages={686--707},
+  year={2019},
+  publisher={Elsevier}
 } \ No newline at end of file
diff --git a/docs/src/reduced_order_modeling/losses.md b/docs/src/reduced_order_modeling/losses.md
index 06278b82e..ec1132bd6 100644
--- a/docs/src/reduced_order_modeling/losses.md
+++ b/docs/src/reduced_order_modeling/losses.md
@@ -2,10 +2,28 @@
 
 `GeometricMachineLearning` has a number of loss functions implemented that can be called *standard losses*. How to implement custom losses is shown in the [tutorials](@ref "Adjusting the Loss Function").
 
+## A Note on Physics-Informed Neural Networks
+
+A popular trend in recent years has been considering known physical properties of the differential equation, or the entire differential equation, through the loss function [raissi2019physics](@cite). This is one way of considering physical properties, and `GeometricMachineLearning` allows for a [flexible implementation of custom losses](@ref "Adjusting the Loss Function"), but this approach is nonetheless discouraged. In general a neural network consists of *three ingredients*:
+
+```@example
+Main.include_graphics("../tikz/ingredients") # hide
+```
+
+Instead of considering certain properties through the loss function, we enforce them strongly through the network architecture and the optimizer; the latter pertains to [manifold optimization](@ref "Generalization to Homogeneous Spaces"). The advantages of this approach are the strong enforcement of properties that we know our network should have and much easier training because we do not have to tune hyperparameters.
+
+
 ```@docs; canonical = false
 GeometricMachineLearning.NetworkLoss
 TransformerLoss
 FeedForwardLoss
 AutoEncoderLoss
 ReducedLoss
+```
+
+```@bibliography
+Canonical = false
+Pages = []
+
+raissi2019physics
 ```
\ No newline at end of file

From efb20379fde8b194c0764bb77cfa40c93ed5fbc8 Mon Sep 17 00:00:00 2001
From: benedict-96
Date: Thu, 27 Jun 2024 09:40:33 +0200
Subject: [PATCH 44/56] Fixed include_graphics.
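To make the note on physics-informed neural networks above a bit more concrete: enforcing a property *through the loss* usually means adding the residual of the differential equation as a soft penalty with a tunable weight, so the property is only encouraged, never guaranteed. The following is a purely hypothetical sketch of such a loss; none of the names below exist in `GeometricMachineLearning`:

```julia
# Hypothetical PINN-style loss: data misfit plus a weighted equation residual.
# λ is an additional hyperparameter that has to be tuned by hand.
pinn_style_loss(data_misfit, equation_residual; λ = 0.1) =
    data_misfit + λ * equation_residual

pinn_style_loss(0.02, 0.5)   # the physics only enters as a penalty term
```

With a structure-preserving architecture such as a SympNet the corresponding property (symplecticity) holds exactly by construction, and no penalty term or extra weight is needed.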
--- docs/src/reduced_order_modeling/reduced_order_modeling.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/reduced_order_modeling/reduced_order_modeling.md b/docs/src/reduced_order_modeling/reduced_order_modeling.md index d91c4b12f..58d263de0 100644 --- a/docs/src/reduced_order_modeling/reduced_order_modeling.md +++ b/docs/src/reduced_order_modeling/reduced_order_modeling.md @@ -49,7 +49,7 @@ The third step can be done with various machine learning (ML) techniques. Tradit After having obtained ``\mathcal{P}`` and ``\mathcal{R}`` we still need to solve the *reduced system*. Solving the reduced system is typically referred to as the *online phase* in reduced order modeling. This is sketched below: ```@example -include_graphics(../tikz/offline_online) # hide +Main.include_graphics("../tikz/offline_online") # hide ``` The online phase is applying the mapping ``\mathcal{NN}`` in the low-dimensional space in order to predict the next time step. Crucially this step can be made very cheap when compared to the full-order model. From 8f8626fd86f02a937709bb487f140412908b243f Mon Sep 17 00:00:00 2001 From: benedict-96 Date: Thu, 27 Jun 2024 09:41:21 +0200 Subject: [PATCH 45/56] Implemented new routines to change the type in DataLoader. --- src/data_loader/data_loader.jl | 76 ++++++++++++++++++++++++++++------ 1 file changed, 63 insertions(+), 13 deletions(-) diff --git a/src/data_loader/data_loader.jl b/src/data_loader/data_loader.jl index be20c6769..6c774f44c 100644 --- a/src/data_loader/data_loader.jl +++ b/src/data_loader/data_loader.jl @@ -76,6 +76,15 @@ function DataLoader(data::AbstractArray{T, 3}) where T DataLoader{T, typeof(data), Nothing, :TimeSeries}(data, nothing, input_dim, input_time_steps, n_params, nothing, nothing) end +""" + DataLoader(data::AbstractMatrix) + +Make an instance of `DataLoader` based on a matrix. + +# Arguments + +This has an addition keyword argument `autoencoder`, which is set to `true` by default. +""" function DataLoader(data::AbstractMatrix{T}; autoencoder=true) where T @info "You have provided a matrix as input. The axes will be interpreted as (i) system dimension and (ii) number of parameters." @@ -177,7 +186,6 @@ end Constructor for `EnsembleSolution` form package `GeometricSolutions` with fields `q` and `p`. """ function DataLoader(ensemble_solution::EnsembleSolution{T, T1, Vector{ST}}) where {T, T1, DT <: DataSeries{T}, ST <: GeometricSolution{T, T1, NamedTuple{(:q, :p), Tuple{DT, DT}}}} - sys_dim, input_time_steps, n_params = length(ensemble_solution.s[1].q[0]), length(ensemble_solution.t), length(ensemble_solution.s) data = (q = zeros(T, sys_dim, input_time_steps, n_params), p = zeros(T, sys_dim, input_time_steps, n_params)) @@ -192,20 +200,35 @@ function DataLoader(ensemble_solution::EnsembleSolution{T, T1, Vector{ST}}) wher DataLoader(data) end -function map_to_new_backend(input::AT, backend::KernelAbstractions.Backend) where AT <: AbstractArray - input₂ = KernelAbstractions.allocate(backend, size(input)...) +function map_to_new_backend(input::AbstractArray{T}, backend::KernelAbstractions.Backend) where T + input₂ = KernelAbstractions.allocate(backend, T, size(input)...) 
KernelAbstractions.copyto!(backend, input₂, input) - DataLoader(input₂) + input₂ end function map_to_new_backend(input::QPT{T}, backend::KernelAbstractions.Backend) where T - input₂ = (q = KernelAbstractions.allocate(backend, T, size(input₂.q)...), p = KernelAbstractions.allocate(backend, size(input₂.p)...)) + input₂ = (q = KernelAbstractions.allocate(backend, T, size(input.q)...), p = KernelAbstractions.allocate(backend, T, size(input.p)...)) KernelAbstractions.copyto!(backend, input₂.q, input.q) KernelAbstractions.copyto!(backend, input₂.p, input.p) - DataLoader(input₂) + input₂ +end + +function map_to_type(input::QPT, T::DataType) + (q = T.(input.q), p = T.(input.p)) end -function DataLoader(dl::DataLoader{<:Number, <:QPTOAT, Nothing}, backend::KernelAbstractions.Backend) +function map_to_type(input::AbstractArray, T::DataType) + T.(input) +end + +function DataLoader(dl::DataLoader{T1, <:QPTOAT, Nothing}, backend::KernelAbstractions.Backend, T::DataType=T1) where T1 + input = + if T==T1 + dl.input + else + map_to_new_type(dl.input, T) + end + DataLoader(map_to_new_backend(dl.input, backend)) end @@ -217,15 +240,39 @@ function reshape_to_matrix(dl::DataLoader{<:Number, <:AbstractArray, Nothing, :R reshape(dl.input, dl.input_dim, dl.n_params) end -function DataLoader(dl::DataLoader{<: Number, <: QPTOAT, Nothing, :RegularData}, - backend::KernelAbstractions.Backend=KernelAbstractions.get_backend(dl); - autoencoder::Bool=false) +function DataLoader(dl::DataLoader{T1, <: QPTOAT, Nothing, :RegularData}, + backend::KernelAbstractions.Backend=KernelAbstractions.get_backend(dl), + T::DataType=T1; + autoencoder::Bool=true) where T1 if autoencoder == true - DataLoader(map_to_new_backend(backend, dl.input)) + input = + if T == T1 + dl.input + else + map_to_type(dl.input, T) + end + + new_input = map_to_new_backend(input, backend) + DataLoader{T, typeof(new_input), Nothing, :RegularData}( + new_input, + nothing, + dl.input_dim, + dl.input_time_steps, + dl.n_params, + nothing, + nothing) elseif autoencoder == false matrix_data = reshape_to_matrix(dl) - DataLoader(map_to_new_backend(backend, matrix_data); autoencoder=true) + + matrix_data_new_type = + if T == T1 + matrix_data + else + map_to_type(matrix_data, T) + end + + DataLoader(map_to_new_backend(matrix_data_new_type, backend); autoencoder=false) end end @@ -254,4 +301,7 @@ accuracy(nn::NeuralNetwork, dl::DataLoader) = accuracy(nn.model, nn.params, dl) Base.eltype(::DataLoader{T}) where T = T -KernelAbstractions.get_backend(dl::DataLoader) = KernelAbstractions.get_backend(dl.input) \ No newline at end of file +KernelAbstractions.get_backend(dl::DataLoader) = KernelAbstractions.get_backend(dl.input) +function KernelAbstractions.get_backend(dl::DataLoader{T, QPT{T}}) where T + KernelAbstractions.get_backend(dl.input.q) +end \ No newline at end of file From b83f5ea319884b8451bd648e53d002de61ed62e5 Mon Sep 17 00:00:00 2001 From: benedict-96 Date: Thu, 27 Jun 2024 13:49:50 +0200 Subject: [PATCH 46/56] Got rid of 1.8 tests. --- .github/workflows/CI.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 3bb101591..80284fe86 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -19,7 +19,6 @@ jobs: fail-fast: false matrix: version: - - '1.8' - '1.10' - '^1.11.0-0' os: From 7c5e5972fbdd0ed534707f5bbd673de896f1fa35 Mon Sep 17 00:00:00 2001 From: benedict-96 Date: Thu, 27 Jun 2024 13:50:47 +0200 Subject: [PATCH 47/56] Fixed typos. 
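A hypothetical usage sketch of the constructors added above: an existing `DataLoader` is mapped to a GPU backend while its element type is changed to `Float32`. This mirrors how the routines are used later in the training scripts; it assumes a working CUDA setup, otherwise the backend argument can simply be omitted:

```julia
using CUDA, GeometricMachineLearning

# Build a CPU data loader from a Float64 snapshot matrix (sizes are made up) ...
dl_cpu = DataLoader(rand(Float64, 4, 250); autoencoder = true)

# ... and map it to the GPU with element type Float32.
dl_gpu = DataLoader(dl_cpu, CUDABackend(), Float32)

eltype(dl_gpu)   # Float32
```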
--- docs/src/architectures/sympnet.md | 14 +++++++------- docs/src/data_loader/snapshot_matrix.md | 4 ++-- docs/src/layers/sympnet_gradient.md | 2 +- .../src/optimizers/manifold_related/retractions.md | 3 +-- .../symplectic_autoencoder.md | 2 +- docs/src/tutorials/symplectic_autoencoder.md | 2 +- src/architectures/transformer_integrator.jl | 2 +- src/data_loader/data_loader.jl | 2 +- 8 files changed, 15 insertions(+), 16 deletions(-) diff --git a/docs/src/architectures/sympnet.md b/docs/src/architectures/sympnet.md index 99ef4471d..676f56433 100644 --- a/docs/src/architectures/sympnet.md +++ b/docs/src/architectures/sympnet.md @@ -9,7 +9,7 @@ SympNets are enforcing symplecticity strongly, meaning that this property is har SympNets can be viewed as a *symplectic integrator* or symplectic one-step method[^1] [hairer2006geometric, leimkuhler2004simulating](@cite). Their goal is to predict, based on an initial condition ``((q^{(0)})^T,(p^{(0)})^T)^T``, a sequence of points in phase space that fit the training data as well as possible: -[^1]: *Symplectic multi-step methods* can be modeled with [transformers](@ref "Linear Symplectic Transformers"). +[^1]: *Symplectic multi-step methods* can be modeled with [transformers](@ref "Linear Symplectic Transformer"). ```math \begin{pmatrix} q^{(0)} \\ p^{(0)} \end{pmatrix}, \cdots, \begin{pmatrix} \tilde{q}^{(1)} \\ \tilde{p}^{(1)} \end{pmatrix}, \cdots \begin{pmatrix} \tilde{q}^{(n)} \\ \tilde{p}^{(n)} \end{pmatrix}. @@ -86,9 +86,9 @@ The superscripts ``\mathrm{up}`` and ``\mathrm{low}`` indicate whether the ``q`` The second type of layer needed for ``LA``-SympNets are [activation layers](@ref "SympNet Gradient Layer"). -An ``LA``-SympNet is a mapping of the form ``\Psi=l_{k} \circ a_{k} \circ l_{k-1} \circ \cdots \circ a_1 \circ l_0`` where ``(l_i)_{0\leq i\leq k} \subset \mathcal{M}^L`` and ``(a_i)_{1\leq i\leq k} \subset \mathcal{M}^A``. We will refer to ``k`` as the *number of hidden layers* of the SympNet[^1] and the number ``w`` above as the *depth* of the linear layer. +An ``LA``-SympNet is a mapping of the form ``\Psi=l_{k} \circ a_{k} \circ l_{k-1} \circ \cdots \circ a_1 \circ l_0`` where ``(l_i)_{0\leq i\leq k} \subset \mathcal{M}^L`` and ``(a_i)_{1\leq i\leq k} \subset \mathcal{M}^A``. We will refer to ``k`` as the *number of hidden layers* of the SympNet[^2] and the number ``w`` above as the *depth* of the linear layer. -[^1]: Note that if ``k=0`` then the ``LA``-SympNet consists of only one linear layer. +[^2]: Note that if ``k=0`` then the ``LA``-SympNet consists of only one linear layer. We give an example of calling ``LA``-SympNet: @@ -102,9 +102,9 @@ arch = LASympNet(4; nhidden = k, depth = 2, init_upper_linear = true, init_upper model = Chain(arch).layers ``` -The keywords `init_upper_linear` and `init_upper_act` indicate whether the first linear (respectively activation) layer is of ``q`` type[2]. +The keywords `init_upper_linear` and `init_upper_act` indicate whether the first linear (respectively activation) layer is of ``q`` type[^3]. -[^2]: If `init_upper_linear = true` then the first layer is of ``q`` type, i.e. changes the ``q`` component and leaves the ``p`` component unchanged. +[^3]: If `init_upper_linear = true` then the first layer is of ``q`` type, i.e. changes the ``q`` component and leaves the ``p`` component unchanged. 
## ``G``-SympNets @@ -164,9 +164,9 @@ The universal approximation theorems state that we can, in principle, get arbitr ## Loss function -To train the SympNet, one needs data along a trajectory such that the model is trained to perform an integration. The loss function is defined as[^3]: +To train the SympNet, one needs data along a trajectory such that the model is trained to perform an integration. The loss function is defined as[^4]: -[^3]: This loss function is implemented as [`FeedForwardLoss`](@ref) in `GeometricMachineLearning`. +[^4]: This loss function is implemented as [`FeedForwardLoss`](@ref) in `GeometricMachineLearning`. ```math \mathrm{loss}(z^\mathrm{c}, z^\mathrm{p}) = || z^\mathrm{c} - z^\mathrm{p} || / || z^\mathrm{c} ||, diff --git a/docs/src/data_loader/snapshot_matrix.md b/docs/src/data_loader/snapshot_matrix.md index 90f62b6b7..524f600c1 100644 --- a/docs/src/data_loader/snapshot_matrix.md +++ b/docs/src/data_loader/snapshot_matrix.md @@ -1,6 +1,6 @@ # Snapshot matrix -The snapshot matrix stores solutions of the high-dimensional ODE (obtained from discretizing a PDE). This is then used to construct [reduced bases](../reduced_order_modeling/autoencoder.md) in a data-driven way. So (for a single parameter[^1]) the snapshot matrix takes the following form: +The snapshot matrix stores solutions of the high-dimensional ODE (obtained from discretizing a PDE). This is then used to construct [reduced bases](../reduced_order_modeling/reduced_order_modeling.md) in a data-driven way. So (for a single parameter[^1]) the snapshot matrix takes the following form: [^1]: If we deal with a parametrized PDE then there are **two stages** at which the snapshot matrix has to be processed: the offline stage and the online stage. @@ -16,7 +16,7 @@ M = \left[\begin{array}{c:c:c:c} In the above example we store a matrix whose first axis is the system dimension (i.e. a state is an element of ``\mathbb{R}^{2n}``) and the second dimension gives the time step. -The starting point for using the snapshot matrix as data for a machine learning model is that all the columns of ``M`` live on a lower-dimensional [solution manifold](../reduced_order_modeling/autoencoder.md) and we can use techniques such as *POD* and *autoencoders* to find this solution manifold. We also note that the second axis of ``M`` does not necessarily indicate time but can also represent various parameters (including initial conditions). The second axis in the `DataLoader` struct is therefore saved in the field `n_params`. +The starting point for using the snapshot matrix as data for a machine learning model is that all the columns of ``M`` live on a lower-dimensional [solution manifold](../reduced_order_modeling/reduced_order_modeling.md) and we can use techniques such as *POD* and *autoencoders* to find this solution manifold. We also note that the second axis of ``M`` does not necessarily indicate time but can also represent various parameters (including initial conditions). The second axis in the `DataLoader` struct is therefore saved in the field `n_params`. 
diff --git a/docs/src/layers/sympnet_gradient.md b/docs/src/layers/sympnet_gradient.md index 4a026e4c4..03b410338 100644 --- a/docs/src/layers/sympnet_gradient.md +++ b/docs/src/layers/sympnet_gradient.md @@ -90,5 +90,5 @@ GeometricMachineLearning.GradientLayerP Canonical = false Pages = [] -jin2020sympnet +jin2020sympnets ``` \ No newline at end of file diff --git a/docs/src/optimizers/manifold_related/retractions.md b/docs/src/optimizers/manifold_related/retractions.md index 6df431498..87f41e974 100644 --- a/docs/src/optimizers/manifold_related/retractions.md +++ b/docs/src/optimizers/manifold_related/retractions.md @@ -288,8 +288,7 @@ and where ``B = \lambda(Y)^{-1}\Omega(\Delta)\lambda(Y)``. These expressions for `geodesic` and `cayley` are the ones that we typically use in `GeometricMachineLearning` for computational reasons. We show how we can utilize the sparse structure of ``\mathfrak{g}^\mathrm{hor}`` for computing the geodesic retraction and the Cayley retraction (i.e. the expressions ``\exp(B)`` and ``\mathrm{Cayley}(B)`` for ``B\in\mathfrak{g}^\mathrm{hor}``). Similar derivations can be found in [celledoni2000approximating, fraikin2007optimization, bendokat2021real](@cite). ```@eval -Main.remark(raw"Further note that, even though the global section ``\lambda:\mathcal{M} \to G`` is not unique, the final geodesic `` -\gamma_\Delta(t) = \lambda(Y)\exp(\lambda(Y)^{-1}\Omega(\Delta)\lambda(Y))E`` does not depend on the particular section we choose.") +Main.remark(raw"Further note that, even though the global section ``\lambda:\mathcal{M} \to G`` is not unique, the final geodesic ``\gamma_\Delta(t) = \lambda(Y)\exp(\lambda(Y)^{-1}\Omega(\Delta)\lambda(Y))E`` does not depend on the particular section we choose.") ``` ### The Geodesic Retraction diff --git a/docs/src/reduced_order_modeling/symplectic_autoencoder.md b/docs/src/reduced_order_modeling/symplectic_autoencoder.md index 5ac6b7734..905c5c41c 100644 --- a/docs/src/reduced_order_modeling/symplectic_autoencoder.md +++ b/docs/src/reduced_order_modeling/symplectic_autoencoder.md @@ -24,7 +24,7 @@ I NEED A PROOF OR SOME EXPLANATION FOR THIS! ## Workflow for Symplectic ROM -As with any other [reduced order modeling technique](autoencoder.md) we first discretize the PDE. This should be done with a structure-preserving scheme, thus yielding a (high-dimensional) Hamiltonian ODE as a result. Discretizing the wave equation above with finite differences yields a Hamiltonian system: +As with any other [reduced order modeling technique](reduced_order_modeling.md) we first discretize the PDE. This should be done with a structure-preserving scheme, thus yielding a (high-dimensional) Hamiltonian ODE as a result. Discretizing the wave equation above with finite differences yields a Hamiltonian system: ```math \mathcal{H}_\mathrm{discr}(z(t;\mu);\mu) := \frac{1}{2}x(t;\mu)^T\begin{bmatrix} -\mu^2D_{\xi{}\xi} & \mathbb{O} \\ \mathbb{O} & \mathbb{I} \end{bmatrix} x(t;\mu). 
diff --git a/docs/src/tutorials/symplectic_autoencoder.md b/docs/src/tutorials/symplectic_autoencoder.md index bb407230d..3fd1924f6 100644 --- a/docs/src/tutorials/symplectic_autoencoder.md +++ b/docs/src/tutorials/symplectic_autoencoder.md @@ -127,7 +127,7 @@ sol_psd_reduced = integrate_reduced_system(psd_rs) sol_sae_reduced = integrate_reduced_system(sae_rs) const t_steps = 100 -plot(sol_full.s.q[t_step], label = "Implicit Midpoint") +plot(sol_full.s.q[t_steps], label = "Implicit Midpoint") plot!(psd_rs.decoder((q = sol_psd_reduced.s.q[t_steps], p = sol_psd_reduced.s.p[t_steps])).q, label = "PSD") plot!(sae_rs.decoder((q = sol_sae_reduced.s.q[t_steps], p = sol_sae_reduced.s.p[t_steps])).q, label = "SAE") ``` diff --git a/src/architectures/transformer_integrator.jl b/src/architectures/transformer_integrator.jl index 3b9f03acb..32a621dad 100644 --- a/src/architectures/transformer_integrator.jl +++ b/src/architectures/transformer_integrator.jl @@ -1,5 +1,5 @@ @doc raw""" -Encompasses various transformer architectures, such as the [`VolumePreservingTransformmer`](@ref) and the [`LinearSymplecticTransformer`](@ref). +Encompasses various transformer architectures, such as the [`VolumePreservingTransformer`](@ref) and the [`LinearSymplecticTransformer`](@ref). """ abstract type TransformerIntegrator <: Architecture end diff --git a/src/data_loader/data_loader.jl b/src/data_loader/data_loader.jl index 6c774f44c..4c6befb01 100644 --- a/src/data_loader/data_loader.jl +++ b/src/data_loader/data_loader.jl @@ -302,6 +302,6 @@ accuracy(nn::NeuralNetwork, dl::DataLoader) = accuracy(nn.model, nn.params, dl) Base.eltype(::DataLoader{T}) where T = T KernelAbstractions.get_backend(dl::DataLoader) = KernelAbstractions.get_backend(dl.input) -function KernelAbstractions.get_backend(dl::DataLoader{T, QPT{T}}) where T +function KernelAbstractions.get_backend(dl::DataLoader{T, <:QPT{T}}) where T KernelAbstractions.get_backend(dl.input.q) end \ No newline at end of file From 0dc73ab3f8875f7fac295d0e0af1236a02cf342a Mon Sep 17 00:00:00 2001 From: benedict-96 Date: Thu, 27 Jun 2024 13:51:04 +0200 Subject: [PATCH 48/56] Changed resolution of ingredients tikz. --- docs/src/tikz/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/src/tikz/Makefile b/docs/src/tikz/Makefile index 68b441c29..f01eadd2b 100644 --- a/docs/src/tikz/Makefile +++ b/docs/src/tikz/Makefile @@ -59,8 +59,8 @@ png: pdftocairo -png -r $(res1) -transp -singlefile skew_sym_visualization_dark.pdf skew_sym_visualization_dark pdftocairo -png -r $(res2) -transp -singlefile offline_online.pdf offline_online pdftocairo -png -r $(res2) -transp -singlefile offline_online_dark.pdf offline_online_dark - pdftocairo -png -r $(res2) -transp -singlefile ingredients.pdf ingredients - pdftocairo -png -r $(res2) -transp -singlefile ingredients_dark.pdf ingredients_dark + pdftocairo -png -r $(res1) -transp -singlefile ingredients.pdf ingredients + pdftocairo -png -r $(res1) -transp -singlefile ingredients_dark.pdf ingredients_dark logo: cp logo_with_name.png ../assets/logo.png From 1d1c081788fe6a5d9e78a7d6da0df258a5458ae3 Mon Sep 17 00:00:00 2001 From: benedict-96 Date: Thu, 27 Jun 2024 13:51:32 +0200 Subject: [PATCH 49/56] Changed alpha value for better visibility. 
--- docs/src/tikz/offline_online_dark.tex | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/src/tikz/offline_online_dark.tex b/docs/src/tikz/offline_online_dark.tex index f3a994ff2..d8358df0c 100644 --- a/docs/src/tikz/offline_online_dark.tex +++ b/docs/src/tikz/offline_online_dark.tex @@ -10,8 +10,8 @@ \begin{document} \begin{tikzpicture}[ - squarednodeF/.style={circle, draw=black!60, fill=gray!5, very thick, minimum size=10mm}, - squarednodeR/.style={rectangle, draw=black!60, fill=cyan!5, very thick, minimum size=5mm}, + squarednodeF/.style={circle, draw=black!60, fill=gray!60, very thick, minimum size=10mm}, + squarednodeR/.style={rectangle, draw=black!60, fill=cyan!60, very thick, minimum size=5mm}, color = white ] %Nodes From be0e33c2a7299af8da738f2fbf4169e19c0c0b9a Mon Sep 17 00:00:00 2001 From: benedict-96 Date: Thu, 27 Jun 2024 13:52:26 +0200 Subject: [PATCH 50/56] Fixed problem with types (arguments were flipped. --- src/layers/bias_layer.jl | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/src/layers/bias_layer.jl b/src/layers/bias_layer.jl index 64122c358..045b5cc41 100644 --- a/src/layers/bias_layer.jl +++ b/src/layers/bias_layer.jl @@ -20,8 +20,24 @@ function parameterlength(::BiasLayer{M, M}) where M M end -(::BiasLayer{M, M})(z::NT, ps::NT) where {M, AT<:AbstractVector, NT<:NamedTuple{(:q, :p), Tuple{AT, AT}}} = (q = z.q + ps.q, p = z.p + ps.p) -(::BiasLayer{M, M})(z::NT1, ps::NT2) where {M, T, AT<:AbstractVector, BT<:Union{AbstractMatrix, AbstractArray{T, 3}}, NT1<:NamedTuple{(:q, :p), Tuple{AT, AT}}, NT2<:NamedTuple{(:q, :p), Tuple{BT, BT}}} = (q = z.q .+ ps.q, p = z.p .+ ps.p) +function (::BiasLayer{M, M})(z::NT, ps::NT) where { + M, + AT<:AbstractVector, + NT<:NamedTuple{(:q, :p), Tuple{AT, AT}} + } + (q = z.q + ps.q, p = z.p + ps.p) +end + +function (::BiasLayer{M, M})(z::NT2, ps::NT1) where { + M, + T, + AT<:AbstractVector{T}, + BT<:Union{AbstractMatrix{T}, AbstractArray{T, 3}}, + NT1<:NamedTuple{(:q, :p), Tuple{AT, AT}}, + NT2<:NamedTuple{(:q, :p), Tuple{BT, BT}} + } + (q = z.q .+ ps.q, p = z.p .+ ps.p) +end function (d::BiasLayer{M, M})(z::AbstractArray, ps) where M apply_layer_to_nt_and_return_array(z, d, ps) From 18cd19151e486020908c17a426a7dfd4c8c06e0f Mon Sep 17 00:00:00 2001 From: benedict-96 Date: Thu, 27 Jun 2024 16:38:12 +0200 Subject: [PATCH 51/56] Restricted function further. 
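The two `BiasLayer` methods above differ only in whether the bias is added directly or broadcast: the parameters are stored as one vector per dimension, while the data may carry additional batch (and time) axes. A toy illustration in plain Julia (not package code):

```julia
bias = (q = [1.0, 2.0], p = [0.5, 0.5])    # one bias vector per dimension
z    = (q = rand(2, 10), p = rand(2, 10))  # a batch of ten (q, p) states

# Broadcasting adds the same bias vector to every column of the batch:
z_shifted = (q = z.q .+ bias.q, p = z.p .+ bias.p)

size(z_shifted.q)   # (2, 10)
```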
--- docs/src/tutorials/adjusting_the_loss_function.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/src/tutorials/adjusting_the_loss_function.md b/docs/src/tutorials/adjusting_the_loss_function.md index 58dff4f82..4fe94a41c 100644 --- a/docs/src/tutorials/adjusting_the_loss_function.md +++ b/docs/src/tutorials/adjusting_the_loss_function.md @@ -54,7 +54,8 @@ We now implement a custom loss such that: struct CustomLoss <: GeometricMachineLearning.NetworkLoss end function (loss::CustomLoss)(model::Chain, params::Tuple, input::CT, output::CT) where { - AT<:AbstractArray, + T, + AT<:AbstractArray{T, 3}, CT<:@NamedTuple{q::AT, p::AT} } FeedForwardLoss()(model, params, input, output) + .1 * network_parameter_norm(params) @@ -106,7 +107,6 @@ end lines!(ax, data.input.q[:], data.input.p[:], label = rich("Training Data"; color = textcolor)) lines!(ax, prediction1[1, :], prediction1[2, :], label = rich("FeedForwardLoss"; color = textcolor)) lines!(ax, prediction2[1, :], prediction2[2, :], label = rich("CustomLoss"; color = textcolor)) -text_color = :white # hide axislegend(; position = (.82, .75), backgroundcolor = :transparent) # hide fig From e1f97f90f2ea31be55d926b6b9d7049488f79db9 Mon Sep 17 00:00:00 2001 From: benedict-96 Date: Thu, 27 Jun 2024 16:38:33 +0200 Subject: [PATCH 52/56] Made transformer integrator work. --- .../symplectic_autoencoders/online_sympnet.jl | 39 +++++++++++-------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/scripts/symplectic_autoencoders/online_sympnet.jl b/scripts/symplectic_autoencoders/online_sympnet.jl index 5719c6a36..b1d7a084c 100644 --- a/scripts/symplectic_autoencoders/online_sympnet.jl +++ b/scripts/symplectic_autoencoders/online_sympnet.jl @@ -10,10 +10,7 @@ backend = CUDABackend() pr = hodeproblem(; tspan = (0.0, 100.)) sol = integrate(pr, ImplicitMidpoint()) dl_cpu = DataLoader(sol; autoencoder = true) -dl = DataLoader(( - q = reshape(dl_cpu.input.q |> cu, size(dl_cpu.input.q, 1), size(dl_cpu.input.q, 3)), - p = reshape(dl_cpu.input.p |> cu, size(dl_cpu.input.p, 1), size(dl_cpu.input.p, 3))); - autoencoder = true) +dl = DataLoader(dl_cpu, backend, Float32) const reduced_dim = 2 @@ -24,7 +21,7 @@ Random.seed!(123) psd_nn = NeuralNetwork(psd_arch, backend) sae_nn = NeuralNetwork(sae_arch, backend) -const n_epochs = 4096 +const n_epochs = 8192 const batch_size = 512 sae_method = AdamOptimizerWithDecay(n_epochs) @@ -74,27 +71,27 @@ data_processed = ( q = reshape(data_unprocessed.q, reduced_dim ÷ 2, length(dat ) dl_reduced = DataLoader(data_processed; autoencoder = false) -integrator_train_epochs = 4096 +integrator_train_epochs = 8192 integrator_batch_size = 512 seq_length = 4 -integrator_architecture = StandardTransformerIntegrator(reduced_dim; transformer_dim = 30, n_blocks = 4, n_heads = 5, L = 3, upscaling_activation = tanh) +integrator_architecture = StandardTransformerIntegrator(reduced_dim; transformer_dim = 10, n_blocks = 3, n_heads = 5, L = 2, upscaling_activation = tanh) integrator_nn = NeuralNetwork(integrator_architecture, backend) integrator_method = AdamOptimizerWithDecay(integrator_train_epochs) o_integrator = Optimizer(integrator_method, integrator_nn) loss = GeometricMachineLearning.ReducedLoss(encoder(sae_nn), decoder(sae_nn)) -dl_integration = DataLoader((q = reshape(dl.input.q, size(dl.input.q, 1), size(dl.input.q, 3)), - p = reshape(dl.input.p, size(dl.input.p, 1), size(dl.input.p, 3))); - autoencoder = false - ) + +# map autoencoder-like data to time-series like data +dl_integration = 
DataLoader(dl; autoencoder = false) # the regular transformer can't deal with symplectic data! dl_integration = DataLoader(vcat(dl_integration.input.q, dl_integration.input.p)) -o_integrator(integrator_nn, dl_integration, Batch(integrator_batch_size, seq_length), integrator_train_epochs, loss) +integrator_batch = Batch(integrator_batch_size, seq_length) +o_integrator(integrator_nn, dl_integration, integrator_batch, integrator_train_epochs, loss) const ics_nt = (q = mtc(dl_reduced.input.q[:, 1:seq_length, 1]), p = mtc(dl_reduced.input.p[:, 1:seq_length, 1])) -const ics = vcat(ics_nt, ics_nt) +const ics = vcat(ics_nt.q, ics_nt.p) ###################################################################### @@ -114,10 +111,10 @@ function plot_validation(t_steps::Integer=100) time_series = iterate(mtc(integrator_nn), ics; n_points = t_steps, prediction_window = seq_length) # prediction = (q = time_series.q[:, end], p = time_series.p[:, end]) - prediction = time.series[:, end] + prediction = time_series[:, end] sol = decoder(sae_nn_cpu)(prediction) - lines!(ax_val, sol[1:(dl.sys_dim ÷ 2)]; label = "Neural Network Integrator", color = mpurple) + lines!(ax_val, sol[1:(dl.input_dim ÷ 2)]; label = "Neural Network Integrator", color = mpurple) axislegend(; position = (.82, .75), backgroundcolor = :transparent, color = text_color) save(name * "_with_nn_integrator.png", fig_val) @@ -125,4 +122,14 @@ end for t_steps in (10, 100, 200, 300, 400, 500, 600, 700, 800, 900) plot_validation(t_steps) -end \ No newline at end of file +end + +###################################################################### + +# plot the reduced data (should be a closed orbit) +reduced_data_matrix = vcat(dl_reduced.input.q, dl_reduced.input.p) +fig_reduced = Figure() +ax_reduced = Axis(reduced_fig[1, 1]) +lines!(ax_reduced, reduced_data_matrix[1, :, 1], reduced_data_matrix[2, :, 1]; color = mgreen, label = "Reduced Data") +axislegend(; position = (.82, .75), backgroundcolor = :transparent, color = text_color) +save("reduced_data.png", fig_reduced) \ No newline at end of file From cdeaf42f102eb4901a7d3b541b668f443440aef3 Mon Sep 17 00:00:00 2001 From: benedict-96 Date: Thu, 27 Jun 2024 16:39:18 +0200 Subject: [PATCH 53/56] Returning an error if a vector is provided as initial condition. --- src/architectures/transformer_integrator.jl | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/architectures/transformer_integrator.jl b/src/architectures/transformer_integrator.jl index 32a621dad..7c03bea7d 100644 --- a/src/architectures/transformer_integrator.jl +++ b/src/architectures/transformer_integrator.jl @@ -44,6 +44,10 @@ function Base.iterate(nn::NeuralNetwork{<:TransformerIntegrator}, ics::NamedTupl (q=q_valuation[:, 1:n_points], p=p_valuation[:, 1:n_points]) end +function Base.iterate(::NeuralNetwork{<:TransformerIntegrator}, ics::AT; n_points::Int = 100, prediction_window::Union{Nothing, Int} = size(ics, 2)) where {T, AT<:AbstractVector{T}} + error("You have to provide a matrix as initial condition for the transformer!") +end + function Base.iterate(nn::NeuralNetwork{<:TransformerIntegrator}, ics::AT; n_points::Int = 100, prediction_window::Union{Nothing, Int} = size(ics, 2)) where {T, AT<:AbstractMatrix{T}} seq_length = typeof(nn.architecture) <: StandardTransformerIntegrator ? prediction_window : nn.architecture.seq_length From 2f7926166417497db58b9dfde9946da502747a97 Mon Sep 17 00:00:00 2001 From: benedict-96 Date: Thu, 27 Jun 2024 16:39:41 +0200 Subject: [PATCH 54/56] Using QPTOAT for types now. 
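A minimal sketch of the calling convention that the new `iterate` method above enforces. The architecture parameters are made up and the network is untrained; the example only shows the expected input shape (a matrix with one column per time step of the initial window):

```julia
using GeometricMachineLearning

arch = StandardTransformerIntegrator(2; transformer_dim = 10, n_heads = 5, L = 2)
nn   = NeuralNetwork(arch)

ics = rand(2, 4)                               # system dimension × sequence length
trajectory = iterate(nn, ics; n_points = 20)   # matrix input: works

# iterate(nn, ics[:, 1]; n_points = 20)        # vector input: now raises an error
```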
--- src/loss/losses.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/loss/losses.jl b/src/loss/losses.jl index 40661dbbd..35904112b 100644 --- a/src/loss/losses.jl +++ b/src/loss/losses.jl @@ -16,12 +16,12 @@ function _compute_loss(output_prediction::CT1, output::CT2) where {AT<:AbstractA _norm(_diff(output_prediction, output)) / _norm(output) end -function _compute_loss(model::Union{AbstractExplicitLayer, Chain}, ps::Union{Tuple, NamedTuple}, input::CT, output::CT) where {AT<:AbstractArray, BT <: NamedTuple{(:q, :p), Tuple{AT, AT}}, CT <: Union{AT, BT}} +function _compute_loss(model::Union{AbstractExplicitLayer, Chain}, ps::Union{Tuple, NamedTuple}, input::CT, output::CT) where {CT <: QPTOAT} output_prediction = model(input, ps) _compute_loss(output_prediction, output) end -function (loss::NetworkLoss)(model::Union{Chain, AbstractExplicitLayer}, ps::Union{Tuple, NamedTuple}, input::CT, output::CT) where {AT<:AbstractArray, BT <: NamedTuple{(:q, :p), Tuple{AT, AT}}, CT <: Union{AT, BT}} +function (loss::NetworkLoss)(model::Union{Chain, AbstractExplicitLayer}, ps::Union{Tuple, NamedTuple}, input::CT, output::CT) where {CT <: QPTOAT} _compute_loss(model, ps, input, output) end @@ -128,6 +128,6 @@ function ReducedLoss(autoencoder::NeuralNetwork{<:AutoEncoder}) ReducedLoss(encoder(autoencoder), decoder(autoencoder)) end -function (loss::ReducedLoss)(model::Chain, params::Tuple, input::CT, output::CT) where {AT <:AbstractArray, CT <: NamedTuple{(:q, :p), Tuple{AT, AT}}} +function (loss::ReducedLoss)(model::Chain, params::Tuple, input::CT, output::CT) where {CT <: QPTOAT} _compute_loss(loss.decoder(model(loss.encoder(input), params)), output) end \ No newline at end of file From fc2990cba763499ab86827b0c066eb99db0d767d Mon Sep 17 00:00:00 2001 From: benedict-96 Date: Thu, 27 Jun 2024 16:41:03 +0200 Subject: [PATCH 55/56] Removed Simple solvers. --- Project.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/Project.toml b/Project.toml index 21d259125..ea59b24ea 100644 --- a/Project.toml +++ b/Project.toml @@ -25,7 +25,6 @@ Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f" -SimpleSolvers = "36b790f5-6434-4fbe-b711-1f64a1e2f6a2" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" From d991daca0c8e75b823408d1c6e5a2d00eb4a787e Mon Sep 17 00:00:00 2001 From: benedict-96 Date: Thu, 27 Jun 2024 17:04:52 +0200 Subject: [PATCH 56/56] Fix problem with curly braces. --- docs/src/architectures/sympnet.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/architectures/sympnet.md b/docs/src/architectures/sympnet.md index 676f56433..251abc8d3 100644 --- a/docs/src/architectures/sympnet.md +++ b/docs/src/architectures/sympnet.md @@ -124,7 +124,7 @@ model = Chain(arch).layers In order to state the *universal approximation theorem* for both architectures we first need a few definitions: -Let ``U`` be an open set of ``\mathbb{R}^{2d}``, and let us denote by ``\mathcal{SP}^r(U)`` the set of ``C^r`` smooth symplectic maps on ``U``. 
We now define a topology on ``C^r(K, \mathbb{R}^{2d})``, the set of ``C^r``-smooth maps from a compact set ``K\subset{}U}`` to ``\mathbb{R}^{2d}`` through the norm +Let ``U`` be an open set of ``\mathbb{R}^{2d}``, and let us denote by ``\mathcal{SP}^r(U)`` the set of ``C^r`` smooth symplectic maps on ``U``. We now define a topology on ``C^r(K, \mathbb{R}^{2d})``, the set of ``C^r``-smooth maps from a compact set ``K\subset{}U`` to ``\mathbb{R}^{2d}`` through the norm ```math ||f||_{C^r(K,\mathbb{R}^{2d})} = \sum_{|\alpha|\leq r} \underset{1\leq i \leq 2d}{\max}\underset{x\in K}{\sup} |D^\alpha f_i(x)|,