Merge pull request #171 from JuliaGNI/adjust-to-new-interface
Adjust to new interface
michakraus authored Nov 13, 2024
2 parents 388d07b + af47e48 commit 4bc5fc5
Showing 25 changed files with 110 additions and 69 deletions.
2 changes: 1 addition & 1 deletion Project.toml
@@ -32,7 +32,7 @@ UpdateJulia = "770da0de-323d-4d28-9202-0e205c1e0aff"
Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"

[compat]
AbstractNeuralNetworks = "0.3"
AbstractNeuralNetworks = "0.3.2"
BandedMatrices = "0.17, 1"
ChainRules = "1"
ChainRulesCore = "1"
7 changes: 6 additions & 1 deletion docs/make.jl
@@ -352,7 +352,8 @@ _latex_pages = [
]

_keys = [page[1] for page in _latex_pages]
filter!(key -> (key ≠ "HOME") & (key ≠ "Index of Docstrings") & (key ≠ "References"), _keys)
# don't generate docstring indices for specific chapters (introduction, conclusion, ...)
filter!(key -> (key ≠ "HOME") & (key ≠ "Index of Docstrings") & (key ≠ "References") & (key ≠ "Summary and Outlook"), _keys)
index_latex_pages = vcat([Dict(_latex_pages)[key] for key in _keys]...)

makedocs(;
@@ -371,3 +372,7 @@ deploydocs(;
devurl = "latest",
devbranch = "main",
)

# got the error "exception = GLFWError (NOT_INITIALIZED): The GLFW library is not initialized" before; also see https://discourse.julialang.org/t/warning-error-closing-screen/111939
import GLMakie
GLMakie.closeall()
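
The `GLMakie.closeall()` call above addresses the GLFW warning referenced in the comment. As a hedged variation (not part of this commit), the cleanup can be guarded so that a headless build machine, where GLFW may never initialize, does not turn the workaround itself into an error:

```julia
# Sketch only: same cleanup, but tolerant of an uninitialized GL backend.
try
    import GLMakie
    GLMakie.closeall()   # close any GL screens left open by @example blocks
catch err
    @warn "Skipping GLMakie cleanup" err
end
```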
3 changes: 2 additions & 1 deletion docs/src/GeometricMachineLearning.bib
@@ -27,7 +27,8 @@ @book{hairer2006geometric
title={Geometric Numerical integration: structure-preserving algorithms for ordinary differential equations},
author={Hairer, Ernst and Lubich, Christian and Wanner, Gerhard},
year={2006},
publisher={Springer}
publisher={Springer},
address={Heidelberg}
}


22 changes: 11 additions & 11 deletions docs/src/docstring_index.md
@@ -7,66 +7,66 @@
### Manifolds

```@index
Pages = Dict(index_latex_pages)["Manifolds"]
Pages = Dict(Main.index_latex_pages)["Manifolds"]
```

### Geometric Structure

```@index
Pages = Dict(index_latex_pages)["Geometric Structure"]
Pages = Dict(Main.index_latex_pages)["Geometric Structure"]
```

### Reduced Order Modeling

```@index
Pages = Dict(index_latex_pages)["Reduced Order Modeling"]
Pages = Dict(Main.index_latex_pages)["Reduced Order Modeling"]
```

### General Framework for Manifold Optimization

```@index
Pages = Dict(index_latex_pages)["General Framework for Manifold Optimization"]
Pages = Dict(Main.index_latex_pages)["General Framework for Manifold Optimization"]
```

### Optimizer Methods

```@index
Pages = Dict(index_latex_pages)["Optimizer Methods"]
Pages = Dict(Main.index_latex_pages)["Optimizer Methods"]
```

### Layers

```@index
Pages = Dict(index_latex_pages)["Layers"]
Pages = Dict(Main.index_latex_pages)["Layers"]
```

### Architectures

```@index
Pages = Dict(index_latex_pages)["Architectures"]
Pages = Dict(Main.index_latex_pages)["Architectures"]
```


### Transformers with Structure

```@index
Pages = Dict(index_latex_pages)["Transformers with Structure"]
Pages = Dict(Main.index_latex_pages)["Transformers with Structure"]
```

### Learning Nonlinear Spaces

```@index
Pages = [Dict(index_latex_pages)["Learning Nonlinear Spaces"]]
Pages = [Dict(Main.index_latex_pages)["Learning Nonlinear Spaces"]]
```

### Data Loader

```@index
Pages = Dict(index_latex_pages)["Data Loader"]
Pages = Dict(Main.index_latex_pages)["Data Loader"]
```

### Special Arrays, Tensors and Pullbacks

```@index
Pages = Dict(index_latex_pages)["Special Arrays, Tensors and Pullbacks"]
Pages = Dict(Main.index_latex_pages)["Special Arrays, Tensors and Pullbacks"]
```
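
The `Main.` prefix added throughout this page reflects how Documenter evaluates the code in these blocks: each page runs in its own sandbox module, so a global created by `make.jl` (which executes in `Main`) is only reachable when qualified explicitly. A minimal, self-contained sketch of that behaviour, with made-up page names standing in for the real `index_latex_pages`:

```julia
# Assumed data; the real index_latex_pages is assembled in docs/make.jl.
index_latex_pages = ["Manifolds" => ["manifolds/basic_topology.md"],
                     "Layers"    => ["layers/attention_layer.md"]]

module DocsSandbox   # stands in for the module a docs page is evaluated in
    # Referring to `index_latex_pages` directly here raises an UndefVarError;
    # qualifying it with `Main.` makes the binding from make.jl visible.
    pages = Dict(Main.index_latex_pages)["Manifolds"]
end

DocsSandbox.pages   # ["manifolds/basic_topology.md"]
```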
8 changes: 4 additions & 4 deletions docs/src/optimizers/manifold_related/parallel_transport.md
@@ -7,7 +7,7 @@ A precise definition of parallel transport needs a notion of a *connection* [lan
```@eval
Main.theorem(raw"Given two elements ``B^A_1, B^A_2\in{}T_AG`` the parallel transport of ``B^A_2`` along the geodesic of ``B^A_1`` is given by
" * Main.indentation * raw"```math
" * Main.indentation * raw"\Pi_{A\to\gamma_{B^A_1}(t)} = A\exp(t\cdot{}A^{-1}B^A_1)A^{-1}B^A_2 = A\exp(t\cdot{}B_1)B_2,
" * Main.indentation * raw"\Pi_{A\to\gamma_{B^A_1}(t)}B^A_2 = A\exp(t\cdot{}A^{-1}B^A_1)A^{-1}B^A_2 = A\exp(t\cdot{}B_1)B_2,
" * Main.indentation * raw"```
" * Main.indentation * raw"where ``B_i := A^{-1}B^A_i.``")
```
@@ -19,18 +19,18 @@ For the Stiefel manifold this is not much more complicated[^1]:
```@eval
Main.theorem(raw"Given two elements ``\Delta_1, \Delta_2\in{}T_Y\mathcal{M}``, the parallel transport of ``\Delta_2`` along the geodesic of ``\Delta_1`` is given by
" * Main.indentation * raw"```math
" * Main.indentation * raw"\Pi_{Y\to\gamma_{\Delta_1}(t)} = \exp(t\cdot\Omega(Y, \Delta_1))\Delta_2 = \lambda(Y)\exp(\bar{B}_1)\lambda(Y)^{-1}\Delta_2,
" * Main.indentation * raw"\Pi_{Y\to\gamma_{\Delta_1}(t)}\Delta_2 = \exp(t\cdot\Omega(Y, \Delta_1))\Delta_2 = \lambda(Y)\exp(\bar{B}_1)\lambda(Y)^{-1}\Delta_2,
" * Main.indentation * raw"```
" * Main.indentation * raw"where ``\bar{B}_1 = \lambda(Y)^{-1}\Omega(Y, \Delta_1)\lambda(Y).``")
```

We can further modify the expression of parallel transport for the Stiefel manifold:

```math
\Pi_{Y\to\gamma_{\Delta_1}(t)} = \lambda(Y)\exp(B_1)\lambda(Y)\Omega(Y, \Delta_2)Y = \lambda(Y)\exp(B_1)B_2E,
\Pi_{Y\to\gamma_{\Delta_1}(t)}\Delta_2 = \lambda(Y)\exp(B_1)\lambda(Y)\Omega(Y, \Delta_2)Y = \lambda(Y)\exp(B_1)B_2E,
```

where ``B_2 = \lambda(Y)^{-1}\Omega(Y, \Delta_2)\lambda(Y).`` We can now define explicit updating rules for the [global section](@ref "Global Sections") ``\Lambda^{(\cdot)}``, the element of the homogeneous space ``Y^{(\cdot)}``, the tangent vector ``\Delta^{(\cdot)}`` and ``D^{(\cdot)} = (\Lambda^{(\cdot)})^{-1}\Omega(\Delta^{(\cdot)})\Lambda^{(cdot)}``, its representation in ``\mathfrak{g}^\mathrm{hor}``.
where ``B_2 = \lambda(Y)^{-1}\Omega(Y, \Delta_2)\lambda(Y).`` We can now define explicit updating rules for the [global section](@ref "Global Sections") ``\Lambda^{(\cdot)}``, the element of the homogeneous space ``Y^{(\cdot)}``, the tangent vector ``\Delta^{(\cdot)}`` and ``D^{(\cdot)} = (\Lambda^{(\cdot)})^{-1}\Omega(\Delta^{(\cdot)})\Lambda^{(\cdot)}``, its representation in ``\mathfrak{g}^\mathrm{hor}``.

We thus have:
1. ``\Lambda^{(t)} \leftarrow \Lambda^{(t-1)}\exp(B^{(t-1)}),``
7 changes: 5 additions & 2 deletions docs/src/tutorials/adjusting_the_loss_function.md
@@ -41,7 +41,10 @@ using LinearAlgebra: norm # hide
# norm of parameters for single layer
network_parameter_norm(params::NamedTuple) = sum([norm(params[i]) for i in 1:length(params)])
# norm of parameters for entire network
network_parameter_norm(params) = sum([network_parameter_norm(param) for param in params])
function network_parameter_norm(params::NeuralNetworkParameters)
sum([network_parameter_norm(params[key]) for key in keys(params)])
end
network_parameter_norm(nn.params)
```
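
The new method iterates over `keys(params)` because a `NeuralNetworkParameters` object is addressed by per-layer symbols (`:L1`, `:L2`, ...) rather than by integer indices. A stand-alone sketch of the same pattern, with a plain `NamedTuple` standing in for `NeuralNetworkParameters` and invented layer shapes:

```julia
using LinearAlgebra: norm

# norm of the parameters of a single layer (a NamedTuple of arrays)
layer_norm(layer::NamedTuple) = sum(norm(layer[i]) for i in 1:length(layer))

# stand-in for nn.params; the keys mirror the per-layer naming scheme
params = (L1 = (weight = ones(4, 4), bias = zeros(4)),
          L2 = (weight = ones(2, 4), bias = zeros(2)))

sum(layer_norm(params[key]) for key in keys(params))   # total parameter norm
```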

@@ -55,7 +58,7 @@ We now implement a custom loss such that:
struct CustomLoss <: GeometricMachineLearning.NetworkLoss end
const λ = .1
function (loss::CustomLoss)(model::Chain, params::Tuple, input::CT, output::CT) where {
function (loss::CustomLoss)(model::Chain, params::NeuralNetworkParameters, input::CT, output::CT) where {
T,
AT<:AbstractArray{T, 3},
CT<:@NamedTuple{q::AT, p::AT}
5 changes: 2 additions & 3 deletions docs/src/tutorials/grassmann_layer.md
@@ -112,7 +112,6 @@ We then *lift* the neural network parameters via [`GlobalSection`](@ref).

```@example rosenbrock
λY = GlobalSection(nn.params)
nothing # hide
```

@@ -258,7 +257,7 @@ where `np` is the number of points in ``\mathcal{D}_2`` and ``W_2`` is the *Wass
where ``\nabla{}W_2`` is equivalent to the function `compute_wasserstein_gradient`.

```@example rosenbrock
function compute_gradient(ps::Tuple)
function compute_gradient(ps::NeuralNetworkParameters)
samples = randn(2, size(xyz_points, 2))
estimate, nn_pullback = Zygote.pullback(ps -> model(samples, ps), ps)
@@ -283,7 +282,7 @@ loss_array = zeros(training_steps)
for i in 1:training_steps
val, dp = compute_gradient(nn.params)
loss_array[i] = val
optimization_step!(optimizer, λY, nn.params, dp)
optimization_step!(optimizer, λY, nn.params, dp.params)
end
```
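
The gradient is passed as `dp.params` because a Zygote pullback taken with respect to a parameter struct returns a `NamedTuple` mirroring the struct's fields, so the per-layer gradients sit one level down. A minimal sketch of that behaviour with a hypothetical wrapper type (not the package's own):

```julia
using Zygote

struct WrappedParams{PT}   # stands in for NeuralNetworkParameters
    params::PT
end

ps = WrappedParams((L1 = (weight = [1.0 2.0; 3.0 4.0],),))
val, pb = Zygote.pullback(p -> sum(abs2, p.params.L1.weight), ps)
dp = pb(one(val))[1]    # a NamedTuple with the single field :params
dp.params.L1.weight     # the gradient the optimizer actually needs
```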

9 changes: 5 additions & 4 deletions docs/src/tutorials/mnist/mnist_tutorial.md
@@ -151,10 +151,11 @@ accuracy_score2 = data["accuracy_score2"]
accuracy_score3 = data["accuracy_score3"]
accuracy_score4 = data["accuracy_score4"]
nn1 = NeuralNetwork(nn1.architecture, nn1.model, data["nn1weights"], CPU())
nn2 = NeuralNetwork(nn2.architecture, nn2.model, data["nn2weights"], CPU())
nn3 = NeuralNetwork(nn3.architecture, nn3.model, data["nn3weights"], CPU())
nn4 = NeuralNetwork(nn4.architecture, nn4.model, data["nn4weights"], CPU())
_nnp(ps::Tuple) = NeuralNetworkParameters{Tuple(Symbol("L$(i)") for i in 1:length(ps))}(ps)
nn1 = NeuralNetwork(nn1.architecture, nn1.model, _nnp(data["nn1weights"]), CPU())
nn2 = NeuralNetwork(nn2.architecture, nn2.model, _nnp(data["nn2weights"]), CPU())
nn3 = NeuralNetwork(nn3.architecture, nn3.model, _nnp(data["nn3weights"]), CPU())
nn4 = NeuralNetwork(nn4.architecture, nn4.model, _nnp(data["nn4weights"]), CPU())
morange = RGBf(255 / 256, 127 / 256, 14 / 256) # hide
mred = RGBf(214 / 256, 39 / 256, 40 / 256) # hide
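The `_nnp` helper re-attaches layer names to weights that were saved as a plain tuple before the interface change; the same helper reappears in the tutorials below. A toy sketch of what it produces, with a bare `NamedTuple` in place of `NeuralNetworkParameters` and invented weights:

```julia
# Weights as they might come out of an older JLD2 file: one tuple entry per layer.
ps_tuple = ((weight = ones(2, 2), bias = zeros(2)),
            (weight = ones(1, 2), bias = zeros(1)))

layer_keys = Tuple(Symbol("L$(i)") for i in 1:length(ps_tuple))   # (:L1, :L2)
ps_named   = NamedTuple{layer_keys}(ps_tuple)                     # keyed like NeuralNetworkParameters
ps_named.L2.weight
```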
10 changes: 5 additions & 5 deletions docs/src/tutorials/symplectic_autoencoder.md
@@ -156,7 +156,8 @@ sae_nn_cpu = mtc(sae_nn_gpu)
using JLD2
sae_trained_parameters = load("sae_parameters.jld2")["sae_parameters"]
sae_nn_cpu = NeuralNetwork(sae_arch, Chain(sae_arch), sae_trained_parameters, CPU())
_nnp(ps::Tuple) = NeuralNetworkParameters{Tuple(Symbol("L$(i)") for i in 1:length(ps))}(ps)
sae_nn_cpu = NeuralNetwork(sae_arch, Chain(sae_arch), _nnp(sae_trained_parameters), CPU())
nothing # hide
```
@@ -284,7 +285,6 @@ dl = dl_cpu # hide
dl_integration = DataLoader(dl; autoencoder = false)
integrator_batch = Batch(integrator_batch_size, seq_length)
nothing # hide
```
```julia
@@ -301,7 +301,7 @@ We can now evaluate the solution:

```@example toda_lattice
nn_integrator_parameters = load("integrator_parameters.jld2")["integrator_parameters"] # hide
integrator_nn = NeuralNetwork(integrator_architecture, Chain(integrator_architecture), nn_integrator_parameters, backend) # hide
integrator_nn = NeuralNetwork(integrator_architecture, Chain(integrator_architecture), _nnp(nn_integrator_parameters), backend) # hide
ics = encoder(sae_nn_cpu)((q = dl.input.q[:, 1:seq_length, 1], p = dl.input.p[:, 1:seq_length, 1])) # hide
iterate(mtc(integrator_nn), ics; n_points = length(sol.t), prediction_window = seq_length) # hide
@time "time stepping with transformer" time_series = iterate(mtc(integrator_nn), ics; n_points = length(sol.t), prediction_window = seq_length)
@@ -487,7 +487,7 @@ train_integrator_loss2 = o_integrator(integrator_nn2, dl_integration, integrator

```@setup toda_lattice
nn_integrator_parameters2 = load("integrator_parameters_psd.jld2")["integrator_parameters"] # hide
integrator_nn2 = NeuralNetwork(integrator_architecture2, Chain(integrator_architecture2), nn_integrator_parameters2, backend) # hide
integrator_nn2 = NeuralNetwork(integrator_architecture2, Chain(integrator_architecture2), _nnp(nn_integrator_parameters2), backend) # hide
ics = encoder(psd_nn2)((q = dl_cpu.input.q[:, 1:seq_length, 1], p = dl_cpu.input.p[:, 1:seq_length, 1])) # hide
nothing # hide
```
@@ -509,7 +509,7 @@ time_steps
Now we do so with:

```@example toda_lattice
time_steps = (0, 2, 4)
time_steps = (0, 4, 5)
nothing # hide
```

@@ -214,9 +214,10 @@ nn_st = mtc(nn_st)
using JLD2 # hide
# get correct parameters from jld2 file # hide
f = load("transformer_rigid_body.jld2") # hide
nn_vpff = NeuralNetwork(nn_vpff.architecture, nn_vpff.model, f["nn_vpff_params"], nn_vpff.backend) # hide
nn_vpt = NeuralNetwork(nn_vpt.architecture, nn_vpt.model, f["nn_vpt_arb_params"], nn_vpt.backend) # hide
nn_st = NeuralNetwork(nn_st.architecture, nn_st.model, f["nn_st_params"], nn_st.backend) # hide
_nnp(ps::Tuple) = NeuralNetworkParameters{Tuple(Symbol("L$(i)") for i in 1:length(ps))}(ps) # hide
nn_vpff = NeuralNetwork(nn_vpff.architecture, nn_vpff.model, _nnp(f["nn_vpff_params"]), nn_vpff.backend) # hide
nn_vpt = NeuralNetwork(nn_vpt.architecture, nn_vpt.model, _nnp(f["nn_vpt_arb_params"]), nn_vpt.backend) # hide
nn_st = NeuralNetwork(nn_st.architecture, nn_st.model, _nnp(f["nn_st_params"]), nn_st.backend) # hide
nothing # hide
```

1 change: 1 addition & 0 deletions src/GeometricMachineLearning.jl
@@ -41,6 +41,7 @@ module GeometricMachineLearning
export Dense, Linear
export initialparameters
export parameterlength
export NeuralNetworkParameters

export σ, sigmoid, softmax

14 changes: 11 additions & 3 deletions src/architectures/autoencoder.jl
@@ -85,7 +85,8 @@ using GeometricMachineLearning
using GeometricMachineLearning: UnknownEncoder
model = Chain(Dense(5, 3, tanh; use_bias = false), Dense(3, 2, identity; use_bias = false))
nn = NeuralNetwork(UnknownEncoder(5, 2, 2), model, initialparameters(model), CPU())
params = NeuralNetworkParameters(initialparameters(model))
nn = NeuralNetwork(UnknownEncoder(5, 2, 2), model, params, CPU())
typeof(nn) <: NeuralNetwork{<:GeometricMachineLearning.Encoder}
@@ -171,7 +172,8 @@ end
# """
function encoder_parameters(nn::NeuralNetwork{<:AutoEncoder})
n_encoder_layers = length(encoder_model(nn.architecture).layers)
nn.params[1:n_encoder_layers]
keys = Tuple(Symbol.(["L$(i)" for i in 1:n_encoder_layers]))
NeuralNetworkParameters(NamedTuple{keys}(Tuple([nn.params[key] for key in keys])))
end

# """
@@ -181,7 +183,13 @@ end
# """
function decoder_parameters(nn::NeuralNetwork{<:AutoEncoder})
n_decoder_layers = length(decoder_model(nn.architecture).layers)
nn.params[(end - (n_decoder_layers - 1)):end]
all_keys = keys(nn.params)
# "old keys" are the ones describing the correct parameters in nn.params
keys_old = Tuple(Symbol.(["L$(i)" for i in (length(all_keys) - (n_decoder_layers - 1)):length(all_keys)]))
n_keys = length(keys_old)
# "new keys" are the ones describing the keys in the new NamedTuple
keys_new = Tuple(Symbol.(["L$(i)" for i in 1:n_keys]))
NeuralNetworkParameters(NamedTuple{keys_new}(Tuple([nn.params[key] for key in keys_old])))
end

function Chain(arch::AutoEncoder)
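A stand-alone sketch of the key-renaming step in `decoder_parameters` above, with plain `NamedTuple`s and made-up values: the decoder owns the last layers of `nn.params`, but inside the decoder network they have to be addressed as `:L1`, `:L2`, ... again:

```julia
all_params = (L1 = (w = 1,), L2 = (w = 2,), L3 = (w = 3,), L4 = (w = 4,))
n_decoder_layers = 2

all_keys = keys(all_params)
keys_old = Tuple(Symbol.(["L$(i)" for i in (length(all_keys) - (n_decoder_layers - 1)):length(all_keys)]))
keys_new = Tuple(Symbol.(["L$(i)" for i in 1:length(keys_old)]))

NamedTuple{keys_new}(Tuple([all_params[key] for key in keys_old]))   # (L1 = (w = 3,), L2 = (w = 4,))
```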
3 changes: 2 additions & 1 deletion src/architectures/neural_network_integrator.jl
@@ -73,7 +73,8 @@ using GeometricMachineLearning
model = ResNet(3, 0, identity)
weight = [1 0 0; 0 2 0; 0 0 1]
bias = [0, 0, 1]
nn = NeuralNetwork(model, Chain(model), ((weight = weight, bias = bias), ), CPU())
ps = NeuralNetworkParameters((L1 = (weight = weight, bias = bias), ))
nn = NeuralNetwork(model, Chain(model), ps, CPU())
ics = [1, 1, 1]
iterate(nn, ics; n_points = 4)
12 changes: 7 additions & 5 deletions src/data_loader/batch.jl
@@ -19,17 +19,18 @@ Consider the following example for drawing batches of size 2 for an instance of
using GeometricMachineLearning
import Random
Random.seed!(123)
rng = Random.TaskLocalRNG()
Random.seed!(rng, 123)
dl = DataLoader(rand(5))
dl = DataLoader(rand(rng, 5))
batch = Batch(2)
batch(dl)
# output
[ Info: You have provided a matrix as input. The axes will be interpreted as (i) system dimension and (ii) number of parameters.
([(1, 4), (1, 3)], [(1, 2), (1, 1)], [(1, 5)])
([(1, 5), (1, 3)], [(1, 4), (1, 1)], [(1, 2)])
```
Here the first index is always 1 (the time dimension). We get a total number of 3 batches.
@@ -106,6 +107,7 @@ Here the distinction is between data that are *time-series like* and data that a
using GeometricMachineLearning
using GeometricMachineLearning: number_of_batches
import Random
Random.seed!(123)
dat = [1, 2, 3, 4, 5]
@@ -123,8 +125,8 @@ println(stdout, batch(dl₁), "\n", batch(dl₂))
Number of batches of dl₁: 2
Number of batches of dl₂: 2
([(1, 1), (3, 1), (4, 1)], [(2, 1)])
([(1, 4), (1, 3), (1, 2)], [(1, 5), (1, 1)])
([(1, 1), (4, 1), (2, 1)], [(3, 1)])
([(1, 3), (1, 2), (1, 4)], [(1, 1), (1, 5)])
```
Here we see that in the *autoencoder case* that last minibatch has an additional element.
2 changes: 1 addition & 1 deletion src/data_loader/data_loader.jl
@@ -483,7 +483,7 @@ Compute the accuracy of a neural network classifier.
This needs an instance of [`DataLoader`](@ref) that stores the *test data*.
"""
function accuracy(model::Chain, ps::Tuple, dl::DataLoader{T, AT, BT}) where {T, T1<:Integer, AT<:AbstractArray{T}, BT<:AbstractArray{T1}}
function accuracy(model::Chain, ps::NeuralNetworkParameters, dl::DataLoader{T, AT, BT}) where {T, T1<:Integer, AT<:AbstractArray{T}, BT<:AbstractArray{T1}}
output_tensor = model(dl.input, ps)
output_estimate = assign_output_estimate(output_tensor, dl.output_time_steps)
backend = KernelAbstractions.get_backend(output_estimate)
8 changes: 6 additions & 2 deletions src/data_loader/optimize.jl
@@ -47,7 +47,7 @@ number_of_batches(dl, batch)
3
```
"""
function optimize_for_one_epoch!(opt::Optimizer, model, ps::Union{Tuple, NamedTuple}, dl::DataLoader{T}, batch::Batch, loss::Union{typeof(loss), NetworkLoss}, λY) where T
function optimize_for_one_epoch!(opt::Optimizer, model, ps::Union{NeuralNetworkParameters, NamedTuple}, dl::DataLoader{T}, batch::Batch, loss::Union{typeof(loss), NetworkLoss}, λY) where T
count = 0
total_error = T(0)
batches = batch(dl)
@@ -61,12 +61,16 @@ function optimize_for_one_epoch!(opt::Optimizer, model, ps::Union{Tuple, NamedTu
Zygote.pullback(ps -> loss(model, ps, input_nt_output_nt), ps)
end
total_error += loss_value
dp = pullback(one(loss_value))[1]
dp = return_correct_named_tuple(pullback(one(loss_value))[1])
optimization_step!(opt, λY, ps, dp)
end
total_error / count
end

# this is needed because of the specific way in which we store nn parameters
return_correct_named_tuple(dx::NamedTuple{(:params, )}) = dx.params
return_correct_named_tuple(dx) = dx

_copy(a::AbstractArray) = copy(a)
_copy(qp::QPT) = (q = copy(qp.q), p = copy(qp.p))
_copy(t::Tuple{<:QPTOAT, <:QPTOAT}) = _copy.(t)
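A toy illustration of the two `return_correct_named_tuple` methods added above (definitions repeated here so the snippet runs on its own; values are invented): a gradient that arrives wrapped in a `(:params,)` `NamedTuple` is unwrapped, anything else passes through unchanged.

```julia
return_correct_named_tuple(dx::NamedTuple{(:params, )}) = dx.params
return_correct_named_tuple(dx) = dx

return_correct_named_tuple((params = (L1 = (weight = 1.0,),),))   # -> (L1 = (weight = 1.0,),)
return_correct_named_tuple((L1 = (weight = 1.0,),))               # -> passed through unchanged
```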