diff --git a/Project.toml b/Project.toml index 735b2131c..06df85b61 100644 --- a/Project.toml +++ b/Project.toml @@ -41,7 +41,6 @@ GeometricEquations = "0.16" GeometricIntegrators = "0.13" HDF5 = "0.16, 0.17" KernelAbstractions = "0.9" -Lux = "0.4, 0.5" NNlib = "0.8, 0.9" ProgressMeter = "1" SafeTestsets = "0.1" @@ -55,11 +54,10 @@ Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" ChainRulesTestUtils = "cdddcdb0-9152-4a09-a978-84456f9df70a" GeometricEquations = "c85262ba-a08a-430a-b926-d29770767bf2" GeometricProblems = "18cb22b4-ad41-5c80-9c5f-710df63fbdc9" -Lux = "b2108857-7c20-44ae-9111-449ecde12c47" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" RungeKutta = "fb486d5c-30a0-4a8a-8415-a8b4ace5a6f7" SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["ChainRulesTestUtils", "Lux", "Random", "SafeTestsets", "Test"] +test = ["ChainRulesTestUtils", "Random", "SafeTestsets", "Test"] diff --git a/code_generation/matrixinverse.jl b/code_generation/matrixinverse.jl new file mode 100644 index 000000000..1ce7dd61a --- /dev/null +++ b/code_generation/matrixinverse.jl @@ -0,0 +1,21 @@ +using Symbolics + +function matrix_inverse(n) + @variables A[1:n,1:n] + + B = inv(collect(A)) + + if n ≤ 6 + B = simplify.(B) + end + + build_function(B, A)[2] +end + +for n in 2:8 + println("$(n)x$(n)...") + expr = matrix_inverse(n) + str = replace(replace(replace(string(expr), r"#= [^*\s]* =#" => ""), r"\n[\s]*\n" => "\n"), "Num" => "") + str = replace(str, "function (ˍ₋out, A)" => "function matinv$(n)x$(n)(ˍ₋out, A)") + write("matrixinverse_$(n)x$(n).jl", str) +end diff --git a/docs/src/tikz/Makefile b/docs/src/tikz/Makefile index a85607aa9..ee0a13f6e 100644 --- a/docs/src/tikz/Makefile +++ b/docs/src/tikz/Makefile @@ -22,8 +22,6 @@ convert_with_pdftocairo: $(MYDIR)/*.pdf done png: - pdftocairo -png -r 500 -transp -singlefile logo_with_name.pdf logo_with_name - pdftocairo -png -r 500 -transp -singlefile logo_with_name_dark.pdf 
logo_with_name_dark pdftocairo -png -r $(res1) -transp -singlefile adam_optimizer.pdf adam_optimizer pdftocairo -png -r $(res1) -transp -singlefile adam_optimizer_dark.pdf adam_optimizer_dark pdftocairo -png -r $(res1) -transp -singlefile general_optimization.pdf general_optimization @@ -36,9 +34,9 @@ png: pdftocairo -png -r $(res2) -transp -singlefile sympnet_architecture_dark.pdf sympnet_architecture_dark pdftocairo -png -r $(res3) -transp -singlefile structs_visualization.pdf structs_visualization pdftocairo -png -r $(res3) -transp -singlefile structs_visualization_dark.pdf structs_visualization_dark - pdftocairo -png -r $(res1) -transp -singlefile logo.pdf logo pdftocairo -png -r 500 -transp -singlefile logo_with_name.pdf logo_with_name pdftocairo -png -r 500 -transp -singlefile logo_with_name_dark.pdf logo_with_name_dark + pdftocairo -png -r $(res1) -transp -singlefile logo.pdf logo pdftocairo -png -r $(res1) -transp -singlefile symplectic_autoencoder.pdf symplectic_autoencoder pdftocairo -png -r $(res1) -transp -singlefile symplectic_autoencoder_dark.pdf symplectic_autoencoder_dark pdftocairo -png -r $(res1) -transp -singlefile solution_manifold_2.pdf solution_manifold_2 diff --git a/docs/src/tutorials/sympnet_tutorial.md b/docs/src/tutorials/sympnet_tutorial.md index 35066f75e..30c4710f1 100644 --- a/docs/src/tutorials/sympnet_tutorial.md +++ b/docs/src/tutorials/sympnet_tutorial.md @@ -78,7 +78,10 @@ H:(q,p)\in\mathbb{R}^2 \mapsto \frac{1}{2}p^2-cos(q) \in \mathbb{R}. 
Here we generate pendulum data with the script `GeometricMachineLearning/scripts/pendulum.jl`: ```@example sympnet -using GeometricMachineLearning +using GeometricMachineLearning # hide +import Random # hide + +Random.seed!(1234) # load script include("../../../scripts/pendulum.jl") @@ -165,7 +168,7 @@ The train function will change the parameters of the neural networks and gives a The trainings data `data_q` and `data_p` must be matrices of $\mathbb{R}^{n\times d}$ where $n$ is the length of data and $d$ is the half of the dimension of the system, i.e `data_q[i,j]` is $q_j(t_i)$ where $(t_1,...,t_n)$ are the corresponding time of the training data. -Then we can make prediction. Let's compare the initial data with a prediction starting from the same phase space point using the provided function Iterate_Sympnet: +Then we can make prediction. Let's compare the initial data with a prediction starting from the same phase space point using the provided function `iterate`: ```@example sympnet ics = (q=qp_data.q[:,1], p=qp_data.p[:,1]) diff --git a/scripts/Project.toml b/scripts/Project.toml index 23feb61e7..4bf77383d 100644 --- a/scripts/Project.toml +++ b/scripts/Project.toml @@ -1,10 +1,13 @@ [deps] -Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +GeometricEquations = "c85262ba-a08a-430a-b926-d29770767bf2" GeometricIntegrators = "dcce2d33-59f6-5b8d-9047-0defad88ae06" GeometricMachineLearning = "194d25b2-d3f5-49f0-af24-c124f4aa80cc" +GeometricProblems = "18cb22b4-ad41-5c80-9c5f-710df63fbdc9" +GeometricSolutions = "7843afe4-64f4-4df4-9231-049495c56661" LaTeXStrings = "b964fa9f-0449-5b57-a5c2-d3ea65f4040f" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" -Lux = "b2108857-7c20-44ae-9111-449ecde12c47" Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" +ProfileView = "c46f51b8-102a-5cf2-8d2c-8597cb0e0da7" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" diff --git 
a/scripts/transformer_coupled_oscillator/auxiliary_functions.jl b/scripts/transformer_coupled_oscillator/auxiliary_functions.jl deleted file mode 100644 index 458aa7a6f..000000000 --- a/scripts/transformer_coupled_oscillator/auxiliary_functions.jl +++ /dev/null @@ -1,81 +0,0 @@ -using KernelAbstractions, ChainRulesCore, LinearAlgebra - -batch = KernelAbstractions.allocate(backend, T, sys_dim, seq_length, batch_size) -output = KernelAbstractions.allocate(backend, T, sys_dim, prediction_window, batch_size) -#output_estimate = prediction_window == 1 ? KernelAbstractions.allocate(backend, T, dim, batch_size) : KernelAbstractions.allocate(backend, T, dim, prediction_window, batch_size) - -# this kernel draws a batch based on arrays of parameters and time_steps -@kernel function assign_batch_kernel!(batch::AbstractArray{T, 3}, data::AbstractArray{T, 3}, params, time_steps) where T - i,j,k = @index(Global, NTuple) - time_step = time_steps[k] - param = params[k] - batch[i,j,k] = data[i,param,time_step-1+j] -end -assign_batch! = assign_batch_kernel!(backend) - -# this kernel assigns the output based on the batch -@kernel function assign_output_kernel!(output::AbstractArray{T, 3}, data::AbstractArray{T,3}, params, time_steps) where T - i,j,k = @index(Global, NTuple) - time_step = time_steps[k] - param = params[k] - output[i,j,k] = data[i,param,time_step+seq_length+j-1] -end -assign_output! = assign_output_kernel!(backend) - -# this kernel assigns the output estimate -@kernel function assign_output_estimate_kernel!(output_estimate::AbstractArray{T, 3}, batch::AbstractArray{T,3}, seq_length, prediction_window) where T - i,j,k = @index(Global, NTuple) - output_estimate[i,j,k] = batch[i,seq_length-prediction_window+j,k] -end -assign_output_estimate! 
= assign_output_estimate_kernel!(backend) -function assign_output_estimate(batch::AbstractArray{T, 3}, seq_length, prediction_window) where T - output_estimate = KernelAbstractions.allocate(backend, T, sys_dim, prediction_window, batch_size) - assign_output_estimate!(output_estimate, batch, seq_length, prediction_window, ndrange=size(output_estimate)) - output_estimate -end - -# draw batch (for one training step) -function draw_batch!(batch::AbstractArray{T, 3}, output::AbstractArray{T, 3}, seq_length, prediction_window) where T - params = KernelAbstractions.allocate(backend, T, batch_size) - time_steps = KernelAbstractions.allocate(backend, T, batch_size) - rand!(Random.default_rng(), params) - rand!(Random.default_rng(), time_steps) - params = Int.(ceil.(n_params*params)) - time_steps = Int.(ceil.((n_time_steps-seq_length+1-prediction_window)*time_steps)) - assign_batch!(batch, data, params, time_steps, ndrange=size(batch)) - assign_output!(output, data, params, time_steps, ndrange=size(output)) -end - -function loss(model, ps, batch::AbstractArray{T, 3}, output::AbstractArray{T}, seq_length) where T - prediction_window = size(output, 2) - batch_output = model(batch, ps) - output_estimate = assign_output_estimate(batch_output, seq_length, prediction_window) - norm(output - output_estimate)/T(sqrt(batch_size))/T(sqrt(prediction_window)) -end -loss(model, ps) = loss(model, ps, batch, output, seq_length) - -@kernel function augment_zeros_kernel!(zero_tensor::AbstractArray{T, 3}, output_diff::AbstractArray{T, 3}, seq_length, prediction_window) where T - i,j,k = @index(Global, NTuple) - zero_tensor[i,seq_length-prediction_window+j,k] = output_diff[i,j,k] -end -augment_zeros! 
= augment_zeros_kernel!(backend) -function augment_zeros(output_diff::AbstractArray{T, 3}, seq_length) where T - dim, prediction_window, batch_size = size(output_diff) - zero_tensor = KernelAbstractions.zeros(backend, T, sys_dim, seq_length, batch_size) - augment_zeros!(zero_tensor, output_diff, seq_length, prediction_window, ndrange=size(output_diff)) - zero_tensor -end - -function ChainRulesCore.rrule(::typeof(assign_output_estimate), batch::AbstractArray{T, 3}, seq_length, prediction_window) where T - output_estimate = assign_output_estimate(batch, seq_length, prediction_window) - function assign_output_estimate_pullback(output_diff::AbstractArray) - f̄ = NoTangent() - batch_diff = @thunk augment_zeros(output_diff, seq_length) - return f̄, batch_diff, NoTangent(), NoTangent() - end - return output_estimate, assign_output_estimate_pullback -end - -# using ChainRulesTestUtils -# draw_batch!(batch, output) -# test_rrule(assign_output_estimate, batch) \ No newline at end of file diff --git a/scripts/transformer_coupled_oscillator/generate_data.jl b/scripts/transformer_coupled_oscillator/generate_data.jl deleted file mode 100644 index 2b4d55318..000000000 --- a/scripts/transformer_coupled_oscillator/generate_data.jl +++ /dev/null @@ -1,68 +0,0 @@ -using GeometricIntegrators, KernelAbstractions, JLD2 - -T = Float64 - -initial_conditions_collection = ( (q=[T(1.),T(0.)], p=[T(2.),T(0.)]), ) - -m1 = T(2.) -m2 = T(1.) 
-k1 = T(1.5) -k2 = T(0.3) -k = T.(0.0:0.1:4) - -params_collection = Tuple([(m1=m1,m2=m2,k1=k1,k2=k2,k=k_val) for k_val in k]) - -const t_integration = 100 -const time_step = T(.4) - -function q̇(v, t, q, p, params) - v[1] = p[1]/params.m1 - v[2] = p[2]/params.m2 -end - -function sigmoid(x::T) where {T<:Real} - T(1)/(T(1) + exp(-x)) -end - -function ṗ(f, t, q, p, params) - f[1] = -params.k1 * q[1] - params.k * (q[1] - q[2]) * sigmoid(q[1]) - params.k /2 * (q[1] - q[2])^2 * sigmoid(q[1])^2 * exp(-q[1]) - f[2] = -params.k2 * q[2] + params.k * (q[1] - q[2]) * sigmoid(q[1]) -end - -@kernel function create_tensor_kernel_q!(data_tensor, sols) - i,j,k = @index(Global, NTuple) - data_tensor[i,j,k] = sols[j].q[k-1][i] -end -@kernel function create_tensor_kernel_p!(data_tensor, sols) - i,j,k = @index(Global, NTuple) - data_tensor[i+2,j,k] = sols[j].p[k-1][i] -end -function assign_tensor(data_tensor, sols) - assign_q! = create_tensor_kernel_q!(CPU()) - assign_p! = create_tensor_kernel_p!(CPU()) - dims = (2, size(data_tensor,2), size(data_tensor,3)) - assign_q!(data_tensor, sols, ndrange=dims) - assign_p!(data_tensor, sols, ndrange=dims) -end - -function generate_data(params_collection=params_collection, initial_conditions=initial_conditions, t_integration=t_integration, Float32_not_working_yet=true) - Float32_not_working_yet ? (@warn "Float32 not used in integration!!") : nothing - sols = [] - for params in params_collection - for initial_conditions in initial_conditions_collection - pode = PODEProblem(q̇, ṗ, (0.0, t_integration), time_step, initial_conditions; parameters = params) - sol = integrate(pode,ImplicitMidpoint()) - push!(sols, sol) - end - end - - time_steps = length(sols[1].q) - data_tensor = zeros(4, length(sols), time_steps) - assign_tensor(data_tensor, sols) - Float32_not_working_yet ? 
Float32.(data_tensor) : data_tensor -end - -data_tensor = generate_data() - -jldsave("data", tensor=data_tensor, params=params_collection, initial_conditions=initial_conditions_collection, - time_interval=(0, t_integration), time_step=time_step) \ No newline at end of file diff --git a/scripts/transformer_coupled_oscillator/train_symplectic_transformer.jl b/scripts/transformer_coupled_oscillator/train_symplectic_transformer.jl deleted file mode 100644 index ffaa3b291..000000000 --- a/scripts/transformer_coupled_oscillator/train_symplectic_transformer.jl +++ /dev/null @@ -1,59 +0,0 @@ -using GeometricMachineLearning, ProgressMeter, Zygote -using KernelAbstractions -using CUDA -using Random -using JLD2 - -backend = CPU() -T = Float32 - -file = jldopen("data", "r") -data_raw = file["tensor"] -sys_dim, n_params, n_time_steps = size(data_raw) - -data = KernelAbstractions.allocate(backend, T, size(data_raw)) -copyto!(data, data_raw) - -transformer_dim = 20 -num_heads = 4 -seq_length = 20 -n_epochs = 2000 -batch_size = 512 -prediction_window = 5 -include("auxiliary_functions.jl") - -model = Chain( PSDLayer(sys_dim, transformer_dim), - Attention(transformer_dim), - Gradient(transformer_dim, 2*transformer_dim, tanh), - Gradient(transformer_dim, 2*transformer_dim, tanh, change_q=false), - Attention(transformer_dim), - Gradient(transformer_dim, 2*transformer_dim, tanh, change_q=false), - Gradient(transformer_dim, 2*transformer_dim, tanh), - PSDLayer(transformer_dim, sys_dim) - ) - -loss(ps) = loss(model, ps) -ps = initialparameters(backend, T, model) - -o = Optimizer(AdamOptimizer(), ps) - -n_training_steps_per_epoch = Int(ceil(n_time_steps/batch_size)) -n_training_steps = n_epochs*n_training_steps_per_epoch - -progress_object = Progress(n_training_steps; enabled=true) - -for t in 1:n_training_steps - draw_batch!(batch, output, seq_length, prediction_window) - loss_val, pullback = Zygote.pullback(loss, ps) - dx = pullback(1)[1] - optimization_step!(o, model, ps, dx) - 
ProgressMeter.next!(progress_object; showvalues = [(:TrainingLoss,loss_val)]) -end - -map_to_cpu(ps::Tuple) = Tuple([map_to_cpu(layer) for layer in ps]) -map_to_cpu(layer::NamedTuple) = apply_toNT(map_to_cpu, layer) -function map_to_cpu(A::AbstractArray{T}) where T - Array{T}(A) -end - -jldsave("nn_model_symplectic", model=model, ps=ps, seq_length=seq_length, prediction_window=prediction_window, transformer_dim=transformer_dim, num_heads=num_heads) diff --git a/scripts/transformer_coupled_oscillator/train_transformer.jl b/scripts/transformer_coupled_oscillator/train_transformer.jl deleted file mode 100644 index 176e83785..000000000 --- a/scripts/transformer_coupled_oscillator/train_transformer.jl +++ /dev/null @@ -1,71 +0,0 @@ -using GeometricMachineLearning, ProgressMeter, Zygote -using KernelAbstractions -using CUDA -using Random -using JLD2 - -backend = CPU() -T = Float32 - -file = jldopen("data", "r") -data_raw = file["tensor"] -sys_dim, n_params, n_time_steps = size(data_raw) - -data = KernelAbstractions.allocate(backend, T, size(data_raw)) -copyto!(data, data_raw) - -transformer_dim = 20 -num_heads = 4 -seq_length = 100 -n_epochs = 200 -batch_size = 128 -prediction_window = 50 -include("auxiliary_functions.jl") - -#= -model = Chain( MultiHeadAttention(dim,2,Stiefel=true), - Gradient(dim,5*dim,tanh,change_q=true), - Gradient(dim,5*dim,tanh,change_q=false), - MultiHeadAttention(dim,2,Stiefel=true), - Gradient(dim,5*dim,tanh,change_q=false), - Gradient(dim,5*dim,tanh,change_q=true), - MultiHeadAttention(dim,2,Stiefel=true), - Gradient(dim,5*dim,tanh,change_q=true), - Gradient(dim,5*dim,tanh,change_q=false), - Gradient(dim,5*dim,identity,change_q=true), - Gradient(dim,5*dim,identity,change_q=false)) -=# - -model = Chain( Dense(sys_dim, transformer_dim, tanh), - MultiHeadAttention(transformer_dim, num_heads), - ResNet(transformer_dim, tanh), - MultiHeadAttention(transformer_dim, num_heads), - ResNet(transformer_dim, tanh), - Dense(transformer_dim, sys_dim, 
identity) - ) - -loss(ps) = loss(model, ps) -ps = initialparameters(backend, T, model) - -o = Optimizer(AdamOptimizer(), ps) - -n_training_steps_per_epoch = Int(ceil(n_time_steps/batch_size)) -n_training_steps = n_epochs*n_training_steps_per_epoch - -progress_object = Progress(n_training_steps; enabled=true) - -for t in 1:n_training_steps - draw_batch!(batch, output, seq_length, prediction_window) - loss_val, pullback = Zygote.pullback(loss, ps) - dx = pullback(1)[1] - optimization_step!(o, model, ps, dx) - ProgressMeter.next!(progress_object; showvalues = [(:TrainingLoss,loss_val)]) -end - -map_to_cpu(ps::Tuple) = Tuple([map_to_cpu(layer) for layer in ps]) -map_to_cpu(layer::NamedTuple) = apply_toNT(map_to_cpu, layer) -function map_to_cpu(A::AbstractArray{T}) where T - Array{T}(A) -end - -jldsave("nn_model", model=model, ps=ps, seq_length=seq_length, prediction_window=prediction_window, transformer_dim=transformer_dim, num_heads=num_heads) diff --git a/scripts/transformer_coupled_oscillator/validation.jl b/scripts/transformer_coupled_oscillator/validation.jl deleted file mode 100644 index 878e29763..000000000 --- a/scripts/transformer_coupled_oscillator/validation.jl +++ /dev/null @@ -1,72 +0,0 @@ -using Plots, JLD2, GeometricIntegrators, AbstractNeuralNetworks, GeometricMachineLearning - -# this file stores parameters relevant for the NN -file_nn = jldopen("nn_model", "r") -model = file_nn["model"] -ps = file_nn["ps"] -seq_length = file_nn["seq_length"] -prediction_window = file_nn["prediction_window"] -transformer_dim = file_nn["transformer_dim"] -num_heads = file_nn["num_heads"] - -# this file stores parameters relevant for dataset/vector field (theoretically only needed for validation!) 
-file_data = jldopen("data", "r") - -# somehow find a routine so that you don't have to define the vector field twice -function q̇(v, t, q, p, params) - v[1] = p[1]/params.m1 - v[2] = p[2]/params.m2 -end -function sigmoid(x::T) where {T<:Real} - T(1)/(T(1) + exp(-x)) -end -function ṗ(f, t, q, p, params) - f[1] = -params.k1 * q[1] - params.k * (q[1] - q[2]) * sigmoid(q[1]) - params.k /2 * (q[1] - q[2])^2 * sigmoid(q[1])^2 * exp(-q[1]) - f[2] = -params.k2 * q[2] + params.k * (q[1] - q[2]) * sigmoid(q[1]) -end -time_step = file_data["time_step"] - -model = Chain( Dense(dim, transformer_dim, tanh), - MultiHeadAttention(transformer_dim, num_heads), - ResNet(transformer_dim, tanh), - MultiHeadAttention(transformer_dim, num_heads), - ResNet(transformer_dim, tanh), - Dense(transformer_dim, dim, identity) - ) - -T = Float64 -m1 = T(2.) -m2 = T(1.) -k1 = T(1.5) -k2 = T(0.3) -k = T(3.5) -params = (m1=m1, m2=m2, k1=k1, k2=k2, k=k) -initial_conditions_val = (q=[T(1.),T(0.)], p=[T(2.),T(0.)]) -t_integration = 50 -pode = PODEProblem(q̇, ṗ, (T(0.0), T(t_integration)), time_step, initial_conditions_val; parameters = params) -sol = integrate(pode,ImplicitMidpoint()) - -data_matrix = zeros(T, 4, Int(t_integration/time_step) + 1) -for i in 1:seq_length data_matrix[1:2, i] = sol.q[i-1] end -for i in 1:seq_length data_matrix[3:4, i] = sol.p[i-1] end - -function compute_step(input::AbstractMatrix{T}) - model(input, ps)[:,(seq_length-prediction_window+1):end] -end - -total_steps = Int(floor((Int(t_integration/time_step) - seq_length + 1)/prediction_window)) -for i in 1:total_steps - data_matrix[:,(i-1)*prediction_window + seq_length + 1: i*prediction_window + seq_length] = compute_step(data_matrix[:,(i-1)*prediction_window+1:(i-1)*prediction_window+seq_length]) -end - -q1 = zeros(size(data_matrix,2)) -t = zeros(size(data_matrix,2)) -for i in 1:length(q1) - q1[i] = sol.q[i-1][1] - t[i] = sol.t[i-1] -end -plot1 = plot(t, q1, label="Numeric Integration", size=(1000,600)) -plot!(plot1, t, 
data_matrix[1,:], label="Neural Network") -vline!(plot1, [seq_length*time_step], color="red",label="Start of Prediction") - -png(plot1, "seq_length"*string(seq_length)*"_prediction_window"*string(prediction_window)) \ No newline at end of file diff --git a/scripts/transformer_coupled_oscillator/visualize_data.jl b/scripts/transformer_coupled_oscillator/visualize_data.jl deleted file mode 100644 index fc0488a96..000000000 --- a/scripts/transformer_coupled_oscillator/visualize_data.jl +++ /dev/null @@ -1,42 +0,0 @@ -using Plots - -include("generate_data.jl") -# here the second point mass is altered -params_collection = ( (m1=2, m2=1, k1=1.5, k2=0.3, k=0.0), - (m1=2, m2=1, k1=1.5, k2=0.3, k=0.5), - (m1=2, m2=1, k1=1.5, k2=0.3, k=0.75), - (m1=2, m2=1, k1=1.5, k2=0.3, k=1.0), - #(m1=2, m2=1, k1=1.5, k2=0.3, k=1.5), - (m1=2, m2=1, k1=1.5, k2=0.3, k=2.0), - #(m1=2, m2=1, k1=1.5, k2=0.3, k=2.5), - (m1=2, m2=1, k1=1.5, k2=0.3, k=3.0), - #(m1=2, m2=1, k1=1.5, k2=0.3, k=3.5), - (m1=2, m2=1, k1=1.5, k2=0.3, k=4.0) - ) - -initial_conditions_collection = ( (q=[1.,0.], p=[2.,0.]), ) - -t_integration_plots = 100 - -function plot_curves(data_tensor::AbstractArray{T, 3}, one_plot=true, psize=(1500,1000)) where T - q₁ = data_tensor[1,:,:] - q₂ = data_tensor[2,:,:] - p₁ = data_tensor[3,:,:] - p₂ = data_tensor[4,:,:] - h = t_integration_plots/(size(q₁, 2) - 1) - t = 0.0:h:t_integration_plots - n_param_sets = length(params_collection) - labels = reshape(["k = "*string(params.k) for params in params_collection], 1, n_param_sets) - plot_q₁ = one_plot ? plot(t, q₁', size=psize) : plot(t, q₁', layout=(n_param_sets, 1), size=psize, label=labels, legend=:topright) - plot_q₂ = one_plot ? plot(t, q₂', size=psize) : plot(t, q₂', layout=(n_param_sets, 1), size=psize, label=labels, legend=:topright) - plot_p₁ = one_plot ? plot(t, p₁', size=psize) : plot(t, p₁', layout=(n_param_sets, 1), size=psize, label=labels, legend=:topright) - plot_p₂ = one_plot ? 
plot(t, p₂', size=psize) : plot(t, p₁', layout=(n_param_sets, 1), size=psize, label=labels, legend=:topright) - png(plot_q₁, "plots_of_data_set/q1") - png(plot_q₂, "plots_of_data_set/q2") - png(plot_p₁, "plots_of_data_set/p1") - png(plot_p₂, "plots_of_data_set/p2") -end - -data = generate_data(params_collection, initial_conditions_collection, t_integration_plots) -plot_curves(data, false) - diff --git a/scripts/volume_preserving_transformer/rigid_body.jl b/scripts/volume_preserving_transformer/rigid_body.jl new file mode 100644 index 000000000..e85564064 --- /dev/null +++ b/scripts/volume_preserving_transformer/rigid_body.jl @@ -0,0 +1,189 @@ +using CUDA +using Plots; pyplot() +using GeometricMachineLearning +using GeometricMachineLearning: map_to_cpu +using GeometricIntegrators: integrate, ImplicitMidpoint +using GeometricProblems.RigidBody: odeproblem, odeensemble, default_parameters +using LaTeXStrings +import Random + +# hyperparameters for the problem +const tstep = .2 +const tspan = (0., 20.) +const ics₁ = [[sin(val), 0., cos(val)] for val in .1:.01:(2*π)] +const ics₂ = [[0., sin(val), cos(val)] for val in .1:.01:(2*π)] +const ics = [ics₁..., ics₂...] + +ensemble_problem = odeensemble(ics; tspan = tspan, tstep = tstep, parameters = default_parameters) +ensemble_solution = integrate(ensemble_problem, ImplicitMidpoint()) + +dl₁ = DataLoader(ensemble_solution) + +# hyperparameters concerning architecture +const sys_dim = size(dl₁.input, 1) +const n_heads = 1 +const L = 3 # transformer blocks +const activation = tanh +const n_linear = 1 +const n_blocks = 2 +const skew_sym = true + +# backend +const backend = CUDABackend() + +# data type +const T = Float32 + +# data loader +const dl = backend == CPU() ? 
DataLoader(dl₁.input) : DataLoader(dl₁.input |> CuArray{T}) + +# hyperparameters concerning training +const n_epochs = 500000 +const batch_size = 16384 +const seq_length = 3 +const opt_method = AdamOptimizerWithDecay(n_epochs, T; η₁ = 1e-2, η₂ = 1e-6) +const resnet_activation = tanh + +# parameters for evaluation +ics_val = [sin(1.1), 0., cos(1.1)] +ics_val₂ = [0., sin(1.1), cos(1.1)] +const t_validation = 14 +const t_validation_long = 100 + +function train_the_network(nn₀::GeometricMachineLearning.NeuralNetwork, batch::Batch) + Random.seed!(1234) + + o₀ = Optimizer(opt_method, nn₀) + + loss_array = o₀(nn₀, dl, batch, n_epochs) + + GeometricMachineLearning.map_to_cpu(nn₀), loss_array +end + +function setup_and_train(model::GeometricMachineLearning.Architecture, batch::Batch) + Random.seed!(1234) + + nn₀ = NeuralNetwork(model, backend, T) + + train_the_network(nn₀, batch) +end + +function setup_and_train(model::GeometricMachineLearning.Chain, batch::Batch) + Random.seed!(1234) + + nn₀ = NeuralNetwork(model, backend, T) + nn₀ = NeuralNetwork(GeometricMachineLearning.DummyTransformer(seq_length), nn₀.model, nn₀.params) + + train_the_network(nn₀, batch) +end + +feedforward_batch = Batch(batch_size) +transformer_batch = Batch(batch_size, seq_length, seq_length) + +# attention only +# model₁ = Chain(VolumePreservingAttention(sys_dim, seq_length; skew_sym = skew_sym)) + +model₂ = VolumePreservingFeedForward(sys_dim, n_blocks * L, n_linear, resnet_activation) + +model₃ = VolumePreservingTransformer(sys_dim, seq_length; n_blocks = n_blocks, n_linear = n_linear, L = L, activation = resnet_activation, skew_sym = skew_sym) + +model₄ = RegularTransformerIntegrator(sys_dim, sys_dim, n_heads; n_blocks = n_blocks, L = L, resnet_activation = resnet_activation, add_connection = false) + +# nn₁, loss_array₁ = setup_and_train(model₁, transformer_batch) +nn₂, loss_array₂ = setup_and_train(model₂, feedforward_batch) +nn₃, loss_array₃ = setup_and_train(model₃, transformer_batch) +nn₄, 
loss_array₄ = setup_and_train(model₄, transformer_batch) + +function numerical_solution(sys_dim::Int, t_integration::Int, tstep::Real, ics_val::Vector) + validation_problem = odeproblem(ics_val; tspan = (0.0, t_integration), tstep = tstep, parameters = default_parameters) + sol = integrate(validation_problem, ImplicitMidpoint()) + + numerical_solution = zeros(sys_dim, length(sol.t)) + for i in axes(sol.t, 1) numerical_solution[:, i+1] = sol.q[i] end + + t_array = zeros(length(sol.t)) + for i in axes(sol.t, 1) t_array[i+1] = sol.t[i] end + + T.(numerical_solution), T.(t_array) +end + +function plot_validation(t_validation; nn₂=nn₂, nn₃=nn₃, nn₄=nn₄, plot_regular_transformer = false, plot_vp_transformer = false) + + numerical, t_array = numerical_solution(sys_dim, t_validation, tstep, ics_val) + + # nn₁_solution = iterate(nn₁, numerical[:, 1:seq_length]; n_points = Int(floor(t_validation / tstep)) + 1) + nn₂_solution = iterate(nn₂, numerical[:, 1]; n_points = Int(floor(t_validation / tstep)) + 1) + nn₃_solution = iterate(nn₃, numerical[:, 1:seq_length]; n_points = Int(floor(t_validation / tstep)) + 1, prediction_window = seq_length) + nn₄_solution = iterate(nn₄, numerical[:, 1:seq_length]; n_points = Int(floor(t_validation / tstep)) + 1, prediction_window = seq_length) + + ########################### plot validation + + p_validation = plot(t_array, numerical[1, :], label = "implicit midpoint", color = 1, linewidth = 2, dpi = 400, ylabel = "z", xlabel = "time") + + # plot!(p_validation, t_array, nn₁_solution[1, :], label = "attention only", color = 2, linewidth = 2) + + plot!(p_validation, t_array, nn₂_solution[1, :], label = "feedforward", color = 3, linewidth = 2) + + if plot_vp_transformer + plot!(p_validation, t_array, nn₃_solution[1, :], label = "transformer", color = 4, linewidth = 2) + end + + if plot_regular_transformer + plot!(p_validation, t_array, nn₄_solution[1, :], label = "standard transformer", color = 5, linewidth = 2) + end + + p_validation +end + 
+p_validation = plot_validation(t_validation; plot_regular_transformer = true, plot_vp_transformer = true) +p_validation_long = plot_validation(t_validation_long) + +########################### plot training loss + +p_training_loss = plot(loss_array₃, label = "transformer", color = 4, linewidth = 2, yaxis = :log, dpi = 400, ylabel = "training loss", xlabel = "epoch") + +# plot!(loss_array₁, label = "attention only", color = 2, linewidth = 2) + +plot!(p_training_loss, loss_array₂, label = "feedforward", color = 3, linewidth = 2) + +plot!(p_training_loss, loss_array₄, label = "standard transformer", color = 5, linewidth = 2) + +########################## plot 3d validation + +function sphere(r, C) # r: radius; C: center [cx,cy,cz] + n = 100 + u = range(-π, π; length = n) + v = range(0, π; length = n) + x = C[1] .+ r*cos.(u) * sin.(v)' + y = C[2] .+ r*sin.(u) * sin.(v)' + z = C[3] .+ r*ones(n) * cos.(v)' + return x, y, z +end + +function make_validation_plot3d(t_validation::Int, nn::NeuralNetwork) + numerical, _ = numerical_solution(sys_dim, t_validation, tstep, ics_val) + numerical₂, _ = numerical_solution(sys_dim, t_validation, tstep, ics_val₂) + + prediction_window = typeof(nn) <: NeuralNetwork{<:GeometricMachineLearning.TransformerIntegrator} ? 
seq_length : 1 + + nn₁_solution = iterate(nn, numerical[:, 1:seq_length]; n_points = Int(floor(t_validation / tstep)) + 1, prediction_window = prediction_window) + nn₁_solution₂ = iterate(nn, numerical₂[:, 1:seq_length]; n_points = Int(floor(t_validation / tstep)) + 1, prediction_window = prediction_window) + + ########################### plot validation + + p_validation = surface(sphere(1., [0., 0., 0.]), alpha = .2, colorbar = false, dpi = 400, xlabel = L"z_1", ylabel = L"z_2", zlabel = L"z_3", xlims = (-1, 1), ylims = (-1, 1), zlims = (-1, 1), aspect_ratio = :equal) + + plot!(p_validation, numerical[1, :], numerical[2, :], numerical[3, :], label = "implicit midpoint", color = 1, linewidth = 2, dpi = 400) + plot!(p_validation, numerical₂[1, :], numerical₂[2, :], numerical₂[3, :], label = nothing, color = 1, linewidth = 2, dpi = 400) + + plot!(p_validation, nn₁_solution[1, :], nn₁_solution[2,:], nn₁_solution[3, :], label = "volume-preserving transformer", color = 4, linewidth = 2) + plot!(p_validation, nn₁_solution₂[1, :], nn₁_solution₂[2,:], nn₁_solution₂[3, :], label = nothing, color = 4, linewidth = 2) + + p_validation +end + +p_validation3d = make_validation_plot3d(t_validation_long, nn₃) + +png(p_validation, joinpath(@__DIR__, "rigid_body/validation_"*string(seq_length))) +png(p_training_loss, joinpath(@__DIR__, "rigid_body/training_loss_"*string(seq_length))) +png(p_validation3d, joinpath(@__DIR__, "rigid_body/validation3d_"*string(seq_length))) \ No newline at end of file diff --git a/src/GeometricMachineLearning.jl b/src/GeometricMachineLearning.jl index aedc02d64..b32407f06 100644 --- a/src/GeometricMachineLearning.jl +++ b/src/GeometricMachineLearning.jl @@ -30,8 +30,24 @@ module GeometricMachineLearning export dim import GeometricIntegrators.Integrators: method, GeometricIntegrator import NNlib: σ, sigmoid, softmax + import Base: iterate #import LogExpFunctions: softmax + export CPU, GPU + export Chain, NeuralNetwork + export Dense, Linear + export 
initialparameters + export parameterlength + + export σ, sigmoid, softmax + + # from GeometricBase to print docs + export description + + include("data_loader/data_loader.jl") + + include("loss/losses.jl") + # INCLUDE ARRAYS include("arrays/skew_symmetric.jl") include("arrays/symmetric.jl") @@ -43,28 +59,12 @@ module GeometricMachineLearning include("arrays/lower_triangular.jl") include("arrays/upper_triangular.jl") - export SymmetricMatrix, SymplecticPotential, SkewSymMatrix, LowerTriangular, UpperTriangular + export SymmetricMatrix, SymplecticPotential, SkewSymMatrix export StiefelLieAlgHorMatrix export SymplecticLieAlgMatrix, SymplecticLieAlgHorMatrix export GrassmannLieAlgHorMatrix export StiefelProjection, SymplecticProjection - - - export CPU, GPU - export Chain, NeuralNetwork - export Dense, Linear - export initialparameters - export parameterlength - - export σ, sigmoid, softmax - - # from GeometricBase to print docs - export description - - include("data_loader/data_loader.jl") - - include("loss/loss_routines.jl") - include("loss/losses.jl") + export LowerTriangular, UpperTriangular include("kernels/assign_q_and_p.jl") include("kernels/tensor_mat_mul.jl") @@ -148,18 +148,21 @@ module GeometricMachineLearning include("layers/grassmann_layer.jl") include("layers/multi_head_attention.jl") include("layers/volume_preserving_attention.jl") + include("layers/volume_preserving_feedforward.jl") include("layers/transformer.jl") include("layers/psd_like_layer.jl") include("layers/classification.jl") - include("layers/volume_preserving_feedforward.jl") # include("layers/symplectic_stiefel_layer.jl") export StiefelLayer, GrassmannLayer, ManifoldLayer export PSDLayer export MultiHeadAttention export VolumePreservingAttention - export ResNetLayer + export VolumePreservingLowerLayer + export VolumePreservingUpperLayer + export ResNet export Transformer + export RegularTransformerIntegrator export Classification export VolumePreservingLowerLayer export 
VolumePreservingUpperLayer @@ -170,6 +173,7 @@ module GeometricMachineLearning export MomentumOptimizer, MomentumCache export AdamOptimizerWithDecay export AdamOptimizer, AdamCache + export AdamOptimizerWithDecay export BFGSOptimizer, BFGSCache export Optimizer @@ -237,12 +241,7 @@ module GeometricMachineLearning include("backends/backends.jl") include("backends/lux.jl") - export DataLoader, onehotbatch, accuracy - export Batch, optimize_for_one_epoch! - include("data_loader/tensor_assign.jl") - include("data_loader/matrix_assign.jl") - include("data_loader/mnist_utils.jl") - include("data_loader/batch.jl") + export TransformerLoss, FeedForwardLoss #INCLUDE ARCHITECTURES include("architectures/neural_network_integrator.jl") @@ -259,6 +258,7 @@ module GeometricMachineLearning include("architectures/LSTM_neural_network.jl") include("architectures/transformer_neural_network.jl") include("architectures/volume_preserving_feedforward.jl") + include("architectures/volume_preserving_transformer.jl") export HamiltonianNeuralNetwork export LagrangianNeuralNetwork @@ -268,17 +268,23 @@ module GeometricMachineLearning export RecurrentNeuralNetwork export LSTMNeuralNetwork export ClassificationTransformer - export ResNet - export RegularTransformerIntegrator export VolumePreservingFeedForward export train!, apply!, jacobian! - export Iterate_Sympnet + export iterate export default_arch include("architectures/default_architecture.jl") + export DataLoader, onehotbatch, accuracy + export Batch, optimize_for_one_epoch! 
+ include("data_loader/tensor_assign.jl") + include("data_loader/matrix_assign.jl") + include("data_loader/mnist_utils.jl") + include("data_loader/batch.jl") + include("data_loader/optimize.jl") + export default_optimizer include("optimizers/default_optimizer.jl") @@ -299,7 +305,7 @@ module GeometricMachineLearning include("nnsolution/history.jl") export NeuralNetSolution - export nn, problem, tstep, history, size_history + export problem, tstep, history, size_history export set_sizemax_history include("nnsolution/neural_net_solution.jl") @@ -312,13 +318,13 @@ module GeometricMachineLearning # INCLUDE TRAINING integrator export TrainingSet - export nn, parameters # , data + export parameters # , data include("training/training_set.jl") export EnsembleTraining export isnnShared, isParametersShared, isDataShared - export nn, parameters, data + export parameters, data export push!, merge!, size include("training/ensemble_training.jl") @@ -390,5 +396,5 @@ module GeometricMachineLearning export ReducedSystem, compute_reduction_error, compute_projection_error, reduced_vector_field_from_full_explicit_vector_field, perform_integration_reduced, perform_integration_full - include("data_loader/optimize.jl") + include("map_to_cpu.jl") end \ No newline at end of file diff --git a/src/architectures/volume_preserving_transformer.jl b/src/architectures/volume_preserving_transformer.jl new file mode 100644 index 000000000..f9bc84800 --- /dev/null +++ b/src/architectures/volume_preserving_transformer.jl @@ -0,0 +1,36 @@ +@doc raw""" +The volume-preserving transformer with the Cayley activation function and built-in upscaling. The arguments for the constructor are: +- `sys_dim::Int` +- `seq_length::Int`: The sequence length of the data fed into the transformer. +- `n_blocks::Int=1` (keyword argument): The number of blocks in one transformer unit (containing linear layers and nonlinear layers). Default is `1`. 
+- `n_linear::Int=1` (keyword argument): The number of linear `VolumePreservingLowerLayer`s and `VolumePreservingUpperLayer`s in one block. Default is `1`. +- `L::Int=2` (keyword argument): The number of transformer units (default is 2). +- `activation=tanh` (keyword argument): The activation function (`tanh` by default). +- `init_upper::Bool=false` (keyword argument): Specifies if the network first acts on the ``q`` component. +- `skew_sym::Bool=false` (keyword argument): specifies if the weight matrix is skew symmetric or arbitrary. +""" +struct VolumePreservingTransformer{AT} <: TransformerIntegrator + sys_dim::Int + seq_length::Int + n_blocks::Int + n_linear::Int + L::Int + activation::AT + init_upper::Bool + skew_sym::Bool +end + +function VolumePreservingTransformer(sys_dim::Int, seq_length::Int; n_blocks::Int=1, n_linear::Int=1, activation=tanh, L::Int=2, init_upper::Bool=false, skew_sym::Bool=false) + VolumePreservingTransformer{typeof(activation)}(sys_dim, seq_length, n_blocks, n_linear, L, activation, init_upper, skew_sym) +end + +function Chain(arch::VolumePreservingTransformer) + layers = () + + for _ in 1:arch.L + layers = (layers..., VolumePreservingAttention(arch.sys_dim, arch.seq_length; skew_sym = arch.skew_sym)) + layers = (layers..., Chain(VolumePreservingFeedForward(arch.sys_dim, arch.n_blocks, arch.n_linear, arch.activation; init_upper = arch.init_upper)).layers...) + end + + Chain(layers...) +end \ No newline at end of file diff --git a/src/kernels/inverses/inverse_kernel.jl b/src/kernels/inverses/inverse_kernel.jl deleted file mode 100644 index e39af406e..000000000 --- a/src/kernels/inverses/inverse_kernel.jl +++ /dev/null @@ -1,86 +0,0 @@ -""" -For now this implements the inverse for a batch in serial (fix!!!)
-""" - -using ChainRulesCore, KernelAbstractions - -@kernel function assign_matrix_kernel!(B::AbstractMatrix{T}, A::AbstractArray{T, 3}, k::Integer) where T - i, j = @index(Global, NTuple) - B[i,j] = A[i,j,k] -end -@kernel function assign_tensor_kernel!(A::AbstractArray{T, 3}, B::AbstractMatrix{T}, k::Integer) where T - i, j = @index(Global, NTuple) - A[i,j,k] = B[i,j] -end -function assign_matrix(A::AbstractArray{T, 3}, k::Integer) where T - m1, m2, m3 = size(A) - backend = KernelAbstractions.get_backend(A) - B = allocate(backend, T, m1, m2) - assign_matrix! = assign_matrix_kernel!(backend) - assign_matrix!(B, A, k, ndrange=size(B)) - B -end -function assign_tensor!(A::AbstractArray{T,3}, B::AbstractMatrix{T}, total_length::Integer, k::Integer) where T - m1, m2, _ = size(A) - backend = KernelAbstractions.get_backend(B) - assign_tensor_apply! = assign_tensor_kernel!(backend) - assign_tensor_apply!(A, B, k, ndrange=size(B)) -end -function assign_tensor!(A::AbstractArray{T,3}, B::AbstractMatrix{T}, k::Integer) where T - total_length = size(A, 3) - assign_tensor!(A, B, total_length, k) -end -function assign_tensor(B::AbstractMatrix{T}, total_length::Integer, k::Integer) where T - backend = KernelAbstractions.get_backend(B) - m1, m2 = size(B) - A = KernelAbstractions.zeros(backend, T, m1, m2, total_length) - assign_tensor!(A, B, total_length, k) - A -end - -function ChainRulesCore.rrule(::typeof(assign_matrix), A::AbstractArray{T, 3}, k) where T - B = assign_matrix(A, k) - function assign_matrix_pullback(B_diff) - total_length = size(A, 3) - A_diff = assign_tensor(B_diff, total_length, k) - NoTangent(), A_diff, NoTangent() - end - B, assign_matrix_pullback -end - -function ChainRulesCore.rrule(::typeof(assign_tensor), B::AbstractMatrix{T}, total_length::Integer, k::Integer) where T - A = assign_tensor(B, total_length, k) - function assign_tensor_pullback(A_diff) - B_diff = @thunk assign_matrix(A_diff, k) - return NoTangent(), B_diff, NoTangent(), NoTangent() - end - A, 
assign_tensor_pullback -end - -function tensor_inverse(A::AbstractArray{T, 3}) where T - A_inv = zero(A) - total_length = size(A, 3) - #for k in 1:total_length - # B = assign_matrix(A, k) - # B_inv = B\one(B) - # A_inv += assign_tensor(B_inv, total_length, k) - #end - - for k in axes(A_inv, 3) - B = @view A[:,:,k] - A_inv[:,:,k] .= B \ one(B) - end - - A_inv -end - -assign_matrix(A::Thunk, k) = Thunk(() -> assign_matrix(unthunk(A), k)) -assign_tensor(B::Thunk, total_length, k) = Thunk(() -> assign_tensor(unthunk(B), total_length, k)) -function assign_tensor!(A::AbstractArray{T, 3}, B::Thunk, total_length, k) where T - Thunk(() -> assign_tensor!(A, unthunk(B), total_length, k)) -end - -function tensor_cayley(A::AbstractArray{T, 3}) where T - one_tensor = init_output(A) - tensor_tensor_mul(one_tensor - A, tensor_inverse(one_tensor + A)) -end \ No newline at end of file diff --git a/src/layers/attention_layer.jl b/src/layers/attention_layer.jl deleted file mode 100644 index 6a74a69c3..000000000 --- a/src/layers/attention_layer.jl +++ /dev/null @@ -1,145 +0,0 @@ -""" -MultiHeadAttention (MHA) serves as a preprocessing step in the transformer. It reweights the input vectors bases on correlations within those data. -""" -struct Attention{M, N, Stiefel, retraction, add_connection, FT} <: LayerWithOptionalManifold{M, N, Stiefel, retraction} - activation::FT -end - -default_retr = Geodesic() -function orthonormal_activation(A::AbstractMatrix{T}) where T - reshape(orthonormal_activation(reshape(A, size(A)..., 1)), size(A)...) 
-end - -function orthonormal_activation(A::AbstractArray{T, 3}) where T - A_ut = upper_triangular_asymmetrize(A) - fac = ceil(norm(A_ut)/size(A,3)) - expA = tensor_exponential(A_ut/fac) - expA_mul = copy(expA) - for _ in 2:fac - expA_mul = tensor_tensor_mul(expA, expA_mul) - end - expA_mul -end - -function orthonormal_activation_cayley(A::AbstractArray{T, 3}) where T - A_ut = upper_triangular_asymmetrize(A) - tensor_cayley(A_ut) -end - -function orthonormal_activation_cayley(A::AbstractMatrix{T}) where T - reshape(orthonormal_activation_cayley(reshape(A, size(A)..., 1)), size(A)...) -end - - -function Attention(dim::Integer, activation=orthonormal_activation_cayley; Stiefel::Bool=false, retraction::AbstractRetraction=default_retr, add_connection::Bool=false) - Attention{dim, dim, Stiefel, typeof(retraction), add_connection, typeof(activation)}(activation) -end - -function parameterlength(::Attention{M, M, false}) where M - 2*M^2 -end - -function parameterlength(d::Attention{M, M, true}) where M - M*(M-1) -end - -function initialparameters(d::Attention{M, M, false}, backend::KernelAbstractions.Backend, T::Type; rng::AbstractRNG=Random.default_rng(), initializer::AbstractNeuralNetworks.AbstractInitializer=GlorotUniform()) where {M} - # transformations for queries and keys. - PQ_weight = KernelAbstractions.allocate(backend, T, M, M) - PK_weight = KernelAbstractions.allocate(backend, T, M, M) - initializer(rng, PQ_weight) - initializer(rng, PK_weight) - (PQ=PQ_weight, PK=PK_weight) -end - -function initialparameters(d::Attention{M, M, true}, backend::KernelAbstractions.Backend, T::Type; rng::AbstractRNG=Random.default_rng(), initializer::AbstractNeuralNetworks.AbstractInitializer=GlorotUniform()) where {M} - # projections for queries, keys and vectors. 
- PQ_weight = rand(backend, rng, StiefelManifold{T}, M, M) - PK_weight = rand(backend, rng, StiefelManifold{T}, M, M) - (PQ=PQ_weight, PK=PK_weight) -end - -function (d::Attention{M, M, Stiefel, Retraction, true})(x::AbstractMatrix{T}, ps::NamedTuple) where {M, Stiefel, Retraction, T} - dim, input_length = size(x) - @assert dim == M - - x + x*d.activation((ps.PQ'*x)'*(ps.PK'*x)/T(sqrt(M))) -end - -function (d::Attention{M, M, Stiefel, Retraction, false})(x::AbstractMatrix{T}, ps::NamedTuple) where {M, Stiefel, Retraction, T} - dim, input_length = size(x) - @assert dim == M - - x*d.activation((ps.PQ'*x)'*(ps.PK'*x)/T(sqrt(M))) -end - -function (d::Attention{M, M, Stiefel, Retraction, true})(x::AbstractArray{T, 3}, ps::NamedTuple) where {M, Stiefel, Retraction, T} - dim, input_length, number_data = size(x) - @assert dim == M - - Q_tensor = mat_tensor_mul(ps.PQ', x) - K_tensor = mat_tensor_mul(ps.PK', x) - QK_tensor = tensor_transpose_tensor_mul(Q_tensor, K_tensor) - x + tensor_tensor_mul(x, d.activation(QK_tensor/T(sqrt(M)))) -end - -function (d::Attention{M, M, Stiefel, Retraction, false})(x::AbstractArray{T, 3}, ps::NamedTuple) where {M, Stiefel, Retraction, T} - dim, input_length, number_data = size(x) - @assert dim == M - - Q_tensor = mat_tensor_mul(ps.PQ', x) - K_tensor = mat_tensor_mul(ps.PK', x) - QK_tensor = tensor_transpose_tensor_mul(Q_tensor, K_tensor) - tensor_tensor_mul(x, d.activation(QK_tensor/T(sqrt(M)))) -end - -@kernel function upper_triangular_asymmetrize_kernel!(output::AbstractArray{T, 3}, input::AbstractArray{T, 3}) where T - i,j,k = @index(Global, NTuple) - if i < j - output[i,j,k] = input[i,j,k] - elseif i > j - output[i,j,k] = -input[j,i,k] - end -end - -function upper_triangular_asymmetrize(A::AbstractArray{T, 3}) where T - output = zero(A) - backend = KernelAbstractions.get_backend(A) - upper_triangular_asymmetrize! 
= upper_triangular_asymmetrize_kernel!(backend) - upper_triangular_asymmetrize!(output, A, ndrange=size(A)) - output -end - -### the functions starting from here are needed for computing the derivative. - -@kernel function assign_upper_triangular_kernel!(output, input, size1, size2) - k = @index(Global) - for j in 1:size2 - for i = 1:(j-1) - output[i,j,k] += input[i,j,k] - end - for i = (j+1):size1 - output[j,i,k] -= input[i,j,k] - end - end -end - -function assign_upper_triangular(A::AbstractArray{T, 3}) where T - output = zero(A) - backend = KernelAbstractions.get_backend(A) - assign_upper_triangular! = assign_upper_triangular_kernel!(backend) - assign_upper_triangular!(output, A, size(A, 1), size(A, 2), ndrange=size(A, 3)) - output -end - -function ChainRulesCore.rrule(::typeof(upper_triangular_asymmetrize), A::AbstractArray{T, 3}) where T - output = upper_triangular_asymmetrize(A) - function upper_triangular_asymmetrize_pullback(output_diff) - A_diff = @thunk assign_upper_triangular(output_diff) - return NoTangent(), A_diff - end - return output, upper_triangular_asymmetrize_pullback -end - -#upper_triangular_asymmetrize(output_diff::Thunk) = Thunk(() -> upper_triangular_asymmetrize(unthunk(output_diff))) -assign_upper_triangular(output_diff::Thunk) = Thunk(() -> assign_upper_triangular(unthunk(output_diff))) \ No newline at end of file diff --git a/src/layers/manifold_layer.jl b/src/layers/manifold_layer.jl index cbc9bb794..49a71daef 100644 --- a/src/layers/manifold_layer.jl +++ b/src/layers/manifold_layer.jl @@ -3,10 +3,14 @@ This defines a manifold layer that only has one matrix-valued manifold $A$ assoc """ abstract type ManifoldLayer{M, N, retraction} <: LayerWithManifold{M, N, retraction} end -function (d::ManifoldLayer{M, N})(x::AbstractArray, ps::NamedTuple) where {M, N} +function (d::ManifoldLayer{M, N})(x::AbstractVecOrMat, ps::NamedTuple) where {M, N} N > M ? 
ps.weight*x : ps.weight'*x end +function (d::ManifoldLayer{M, N})(x::AbstractArray{T, 3}, ps::NamedTuple) where {M, N, T} + N > M ? mat_tensor_mul(ps.weight, x) : mat_tensor_mul(ps.weight', x) +end + function retraction(::ManifoldLayer{N, M, Geodesic}, B::NamedTuple{(:weight,),Tuple{AT}}) where {N, M, AT<:AbstractLieAlgHorMatrix} geodesic(B) end diff --git a/src/layers/multi_head_attention.jl b/src/layers/multi_head_attention.jl index 8d3ac6e61..3f4a03d2b 100644 --- a/src/layers/multi_head_attention.jl +++ b/src/layers/multi_head_attention.jl @@ -1,12 +1,20 @@ """ MultiHeadAttention (MHA) serves as a preprocessing step in the transformer. It reweights the input vectors bases on correlations within those data. + +### Constructor +Takes input arguments: +- `dim::Int`: The system dimension. +- `n_heads::Int`: The number of heads. +- `Stiefel::Bool=false` (keyword argument): whether the weights should be put on the Stiefel manifold. +- `retraction::AbstractRetraction` (keyword argument): what kind of retraction should be used. By default this is the geodesic retraction. +- `add_connection::Bool=true` (keyword argument): determines if the input should be added to the output for the final result.
""" struct MultiHeadAttention{M, N, Stiefel, retraction, add_connection} <: LayerWithOptionalManifold{M, N, Stiefel, retraction} - n_heads::Integer + n_heads::Int end default_retr = Geodesic() -function MultiHeadAttention(dim::Integer, n_heads::Integer; Stiefel::Bool=false, retraction::AbstractRetraction=default_retr, add_connection::Bool=true) +function MultiHeadAttention(dim::Int, n_heads::Int; Stiefel::Bool=false, retraction::AbstractRetraction=default_retr, add_connection::Bool=true) @assert dim % n_heads == 0 MultiHeadAttention{dim, dim, Stiefel, typeof(retraction), add_connection}(n_heads) end diff --git a/src/layers/stiefel_layer.jl b/src/layers/stiefel_layer.jl index 51492410a..712ed9b5c 100644 --- a/src/layers/stiefel_layer.jl +++ b/src/layers/stiefel_layer.jl @@ -13,5 +13,5 @@ function AbstractNeuralNetworks.initialparameters(d::StiefelLayer{M,N}, backend: end function parameterlength(::StiefelLayer{M, N}) where {M, N} - N > M ? M*(M-1)÷2 + (N-M)*M : N*(N-1)÷2 + (M-N)*N + N > M ? M * (M - 1) ÷ 2 + (N - M) * M : N * (N - 1) ÷ 2 + (M - N) * N end \ No newline at end of file diff --git a/src/loss/loss_routines.jl b/src/loss/loss_routines.jl deleted file mode 100644 index ea5935b3d..000000000 --- a/src/loss/loss_routines.jl +++ /dev/null @@ -1,109 +0,0 @@ -@doc raw""" -Computes the loss for a neural network and a data set. -The computed loss is -```math -||output - \mathcal{NN}(input)||_F/||output||_F, -``` -where ``||A||_F := \sqrt{\sum_{i_1,\ldots,i_k}|a_{i_1,\ldots,i_k}^2}|^2`` is the Frobenius norm. 
- -It takes as input: -- `model::Union{Chain, AbstractExplicitLayer}` -- `ps::Union{Tuple, NamedTuple}` -- `input::Union{Array, NamedTuple}` -- `output::Uniont{Array, NamedTuple}` -""" -function loss(model::Union{Chain, AbstractExplicitLayer}, ps::Union{Tuple, NamedTuple}, input::AT, output::BT) where {T, T1, AT<:AbstractArray{T}, BT<:AbstractArray{T1}} - output_estimate = model(input, ps) - norm(output - output_estimate) / norm(output) -end - -@doc raw""" -The *autoencoder loss*: -```math -||output - \mathcal{NN}(input)||_F/||output||_F. -``` - -It takes as input: -- `model::Union{Chain, AbstractExplicitLayer}` -- `ps::Union{Tuple, NamedTuple}` -- `input::Union{Array, NamedTuple}` -""" -function loss(model::Union{Chain, AbstractExplicitLayer}, ps::Union{Tuple, NamedTuple}, input::BT) where {T, BT<:AbstractArray{T}} - output_estimate = model(input, ps) - norm(output_estimate - input) / norm(input) -end - -nt_diff(A, B) = (q = A.q - B.q, p = A.p - B.p) -nt_norm(A) = norm(A.q) + norm(A.p) - -function loss(model::Union{Chain, AbstractExplicitLayer}, ps::Union{Tuple, NamedTuple}, input::NT, output::NT) where {T, AT<:AbstractArray{T}, NT<:NamedTuple{(:q, :p,), Tuple{AT, AT}}} - output_estimate = model(input, ps) - nt_norm(nt_diff(output_estimate, output)) / nt_norm(input) -end - -function loss(model::Union{Chain, AbstractExplicitLayer}, ps::Union{Tuple, NamedTuple}, input::NT) where {T, AT<:AbstractArray{T}, NT<:NamedTuple{(:q, :p,), Tuple{AT, AT}}} - output_estimate = model(input, ps) - nt_norm(nt_diff(output_estimate, input)) / nt_norm(input) -end - -@doc raw""" -The transformer works similarly to the regular loss, but with the difference that ``\mathcal{NN}(input)`` and ``output`` may have different sizes. 
- -It takes as input: -- `model::Union{Chain, AbstractExplicitLayer}` -- `ps::Union{Tuple, NamedTuple}` -- `input::Union{Array, NamedTuple}` -- `output::Uniont{Array, NamedTuple}` -""" -function transformer_loss(model::Union{Chain, AbstractExplicitLayer}, ps::Union{Tuple, NamedTuple}, input::BT, output::BT) where {T, BT<:AbstractArray{T}} - norm( - crop_array_for_transformer_loss(model(input, ps), output) - - output - ) / - norm(input) -end - -function transformer_loss(model::Union{Chain, AbstractExplicitLayer}, ps::Union{Tuple, NamedTuple}, input::NT, output::NT) where {T, AT<:AbstractArray{T}, NT<:NamedTuple{(:q, :p,), Tuple{AT, AT}}} - output_estimate = model(input, ps) - - nt_norm( - nt_diff( - (q = crop_array_for_transformer_loss(output_estimate.q, output.q), - p = crop_array_for_transformer_loss(output_estimate.p, output.p)), - output - ) - ) / - nt_norm(input) -end - - -@doc raw""" -Alternative call of the loss function. This takes as input: -- `model` -- `ps` -- `dl::DataLoader` -""" -function loss(model::Union{Chain, AbstractExplicitLayer}, ps::Union{Tuple, NamedTuple}, dl::DataLoader{T, AT, BT}) where {T, T1, AT<:AbstractArray{T, 3}, BT<:AbstractArray{T1, 3}} - loss(model, ps, dl.input, dl.output) -end - -function loss(model::Chain, ps::Tuple, dl::DataLoader{T, BT, Nothing}) where {T, BT<:AbstractArray{T, 3}} - loss(model, ps, dl.input) -end - -function loss(model::Chain, ps::Tuple, dl::DataLoader{T, BT, Nothing}) where {T, BT<:AbstractArray{T, 2}} - loss(model, ps, dl.input) -end - -function loss(model::Chain, ps::Tuple, dl::DataLoader{T, BT}) where {T, BT<:NamedTuple} - loss(model, ps, dl.input) -end - -@doc raw""" -Wrapper if we deal with a neural network. - -You can supply an instance of `NeuralNetwork` instead of the two arguments model (of type `Union{Chain, AbstractExplicitLayer}`) and parameters (of type `Union{Tuple, NamedTuple}`). -""" -function loss(nn::NeuralNetwork, var_args...) - loss(nn.model, nn.params, var_args...) 
-end \ No newline at end of file diff --git a/src/map_to_cpu.jl b/src/map_to_cpu.jl new file mode 100644 index 000000000..b721d6894 --- /dev/null +++ b/src/map_to_cpu.jl @@ -0,0 +1,30 @@ +map_to_cpu(ps::Tuple) = Tuple([map_to_cpu(layer) for layer in ps]) +map_to_cpu(layer::NamedTuple) = apply_toNT(map_to_cpu, layer) +function map_to_cpu(A::AbstractArray{T}) where T + Array{T}(A) +end + +function map_to_cpu(Y::StiefelManifold{T}) where T + StiefelManifold(Array{T}(Y.A)) +end + +function map_to_cpu(U::UpperTriangular{T}) where T + UpperTriangular(Array{T}(U.S), U.n) +end + +function map_to_cpu(L::LowerTriangular{T}) where T + LowerTriangular(Array{T}(L.S), L.n) +end + +function map_to_cpu(A::SkewSymMatrix{T}) where T + SkewSymMatrix(Array{T}(A.S), A.n) +end + +function map_to_cpu(A::SymmetricMatrix{T}) where T + SymmetricMatrix(Array{T}(A.S), A.n) +end + +function map_to_cpu(nn::NeuralNetwork{AT, MT}) where {AT, MT} + ps = map_to_cpu(nn.params) + NeuralNetwork{AT, MT, typeof(ps)}(nn.architecture, nn.model, ps) +end \ No newline at end of file diff --git a/test/arrays/map_to_skew.jl b/test/arrays/map_to_skew.jl new file mode 100644 index 000000000..58f5e9063 --- /dev/null +++ b/test/arrays/map_to_skew.jl @@ -0,0 +1,8 @@ +using GeometricMachineLearning: SkewSymMatrix, map_to_Skew + +function test_map_to_Skew(n::Int = 5) + A = rand(SkewSymMatrix, n) + @test A.S ≈ map_to_Skew(A) +end + +test_map_to_Skew() \ No newline at end of file diff --git a/test/arrays/skew_sym_conv_test.jl b/test/arrays/skew_sym_conv_test.jl new file mode 100644 index 000000000..292723257 --- /dev/null +++ b/test/arrays/skew_sym_conv_test.jl @@ -0,0 +1,23 @@ +using Zygote +import Random + +Random.seed!(123) + +function test_skew_symmetric_matrix_convergence(n::Int = 5, T::Type = Float32) + A = rand(T, n, n) + A = .5 * (A - A') + ps = (weight = rand(SkewSymMatrix{T}, n), ) + o = Optimizer(AdamOptimizer(), ps) + _loss(ps::NamedTuple, A::AbstractMatrix) = norm(ps.weight - A) + for _ in 1:1200 + o.step 
+= 1 + dp = Zygote.gradient(ps -> _loss(ps, A), ps)[1] + update!(o, o.cache.weight, dp.weight) + ps.weight .= ps.weight + dp.weight + end + # check if type stays SkewSymMatrix + @test typeof(ps.weight) <: SkewSymMatrix + @test ps.weight ≈ A +end + +test_skew_symmetric_matrix_convergence() \ No newline at end of file diff --git a/test/custom_ad_rules/kernel_pullbacks.jl b/test/custom_ad_rules/kernel_pullbacks.jl index 644b3f84d..1c1369130 100644 --- a/test/custom_ad_rules/kernel_pullbacks.jl +++ b/test/custom_ad_rules/kernel_pullbacks.jl @@ -1,5 +1,4 @@ -using GeometricMachineLearning: tensor_mat_mul, tensor_tensor_mul, tensor_transpose_tensor_mul, assign_q_and_p, tensor_transpose, assign_output_estimate, vec_tensor_mul -using GeometricMachineLearning: mat_tensor_mul, lo_mat_mul, up_mat_mul, skew_mat_mul, symmetric_mat_mul +using GeometricMachineLearning: lo_mat_mul, up_mat_mul, skew_mat_mul, symmetric_mat_mul using GeometricMachineLearning: tensor_mat_mul, mat_tensor_mul, tensor_tensor_mul, tensor_transpose_tensor_mul, assign_q_and_p, tensor_transpose, assign_output_estimate, vec_tensor_mul, tensor_mat_skew_sym_assign using ChainRulesTestUtils using Printf diff --git a/test/layers/resnet_tests.jl b/test/layers/resnet_tests.jl index 959fffadc..80817ddab 100644 --- a/test/layers/resnet_tests.jl +++ b/test/layers/resnet_tests.jl @@ -1,4 +1,5 @@ using GeometricMachineLearning, Test +using GeometricMachineLearning: ResNetLayer import Random Random.seed!(1234) diff --git a/test/runtests.jl b/test/runtests.jl index aab723333..9372ed008 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -3,6 +3,7 @@ using SafeTestsets @safetestset "Check parameterlength " begin include("parameterlength/check_parameterlengths.jl") end @safetestset "Arrays #1 " begin include("arrays/array_tests.jl") end +@safetestset "Map to skew " begin include("arrays/map_to_skew.jl") end @safetestset "Sampling of arrays " begin include("arrays/random_generation_of_custom_arrays.jl") end @safetestset 
"Addition tests for custom arrays " begin include("arrays/addition_tests_for_custom_arrays.jl") end @safetestset "Scalar multiplication tests for custom arrays " begin include("arrays/scalar_multiplication_for_custom_arrays.jl") end @@ -64,5 +65,5 @@ using SafeTestsets @safetestset "Batch functor(s) " begin include("batch/batch_functor.jl") end -@safetestset "SympNet integrator " begin include("sympnet_integrator.jl") end -@safetestset "Regular transformer integrator " begin include("regular_transformer_integrator.jl") end \ No newline at end of file +@safetestset "Volume-Preserving Transformer (skew-symmetric tests) " begin include("volume_preserving_attention/test_skew_map.jl") end +@safetestset "Volume-Preserving Transformer (cayley-transform tests) " begin include("volume_preserving_attention/test_cayley_transforms.jl") end \ No newline at end of file diff --git a/test/train!/test_neuralnet_solution.jl b/test/train!/test_neuralnet_solution.jl index 321d4a987..36072b67f 100644 --- a/test/train!/test_neuralnet_solution.jl +++ b/test/train!/test_neuralnet_solution.jl @@ -20,7 +20,7 @@ training_parameters = TrainingParameters(nruns, method, mopt; batch_size = batch neural_net_solution = train!(neuralnet, training_data, training_parameters) -@test nn(neural_net_solution) == neuralnet +@test GeometricMachineLearning.nn(neural_net_solution) == neuralnet @test problem(neural_net_solution) == UnknownProblem() @test tstep(neural_net_solution) == 0.1 @test length(loss(neural_net_solution)) == nruns @@ -41,7 +41,7 @@ training_parameters2 = TrainingParameters(2*nruns, BasicSympNet(), AdamOptimizer training_data2 = training_data train!(neural_net_solution, training_data2, training_parameters2) -@test nn(neural_net_solution) == neuralnet +@test GeometricMachineLearning.nn(neural_net_solution) == neuralnet @test problem(neural_net_solution) == UnknownProblem() @test tstep(neural_net_solution) == 0.1 @test length(loss(neural_net_solution)) == 2*nruns @@ -61,7 +61,7 @@ 
training_parameters3 = TrainingParameters(nruns, BasicSympNet(), MomentumOptimiz training_data3 = training_data train!(neural_net_solution, training_data3, training_parameters3) -@test nn(neural_net_solution) == neuralnet +@test GeometricMachineLearning.nn(neural_net_solution) == neuralnet @test problem(neural_net_solution) == UnknownProblem() @test tstep(neural_net_solution) == 0.1 @test length(loss(neural_net_solution)) == nruns diff --git a/test/volume_preserving_attention/test_cayley_transforms.jl b/test/volume_preserving_attention/test_cayley_transforms.jl new file mode 100644 index 000000000..040b347a2 --- /dev/null +++ b/test/volume_preserving_attention/test_cayley_transforms.jl @@ -0,0 +1,27 @@ +using GeometricMachineLearning: tensor_cayley4, tensor_cayley3, cpu_tensor_cayley, tensor_transpose +using Test + +function test_orthonormal(A::AbstractMatrix) + @test A' * A ≈ one(A) +end + +function test_orthonormal(A::AbstractArray{T, 3}) where T + for i in axes(A, 3) + A_temp = @view A[:, :, i] + test_orthonormal(A_temp) + end +end + +function test_cayley_transforms(third_dim::Int = 10) + A₄ = rand(4, 4, third_dim) + A₃ = rand(3, 3, third_dim) + # asymmetrize + A₄ = A₄ - tensor_transpose(A₄) + A₃ = A₃ - tensor_transpose(A₃) + test_orthonormal(tensor_cayley4(A₄)) + test_orthonormal(tensor_cayley3(A₃)) + test_orthonormal(cpu_tensor_cayley(A₄)) + test_orthonormal(cpu_tensor_cayley(A₃)) +end + +test_cayley_transforms() \ No newline at end of file diff --git a/test/volume_preserving_attention/test_skew_map.jl b/test/volume_preserving_attention/test_skew_map.jl new file mode 100644 index 000000000..fbcb3279d --- /dev/null +++ b/test/volume_preserving_attention/test_skew_map.jl @@ -0,0 +1,32 @@ +using GeometricMachineLearning +using GeometricMachineLearning: mat_tensor_mul, tensor_mat_skew_sym_assign, tensor_transpose_tensor_mul, tensor_transpose +using Test + +function isskew(A::AbstractMatrix) + @test -A ≈ A' +end + +function isskew(A::AbstractArray{T, 3}) where T + 
for i in axes(A, 3) + A_matrix = @view A[:, :, i] + isskew(A_matrix) + end +end + + +function test_first_option(n::Int = 10, seq_length::Int = 10, batch_size::Int = 20) + A = rand(SkewSymMatrix, n) + x = rand(n, seq_length, batch_size) + xAx = tensor_transpose_tensor_mul(x, mat_tensor_mul(A, x)) + isskew(xAx) +end + +function test_second_option(n::Int = 10, seq_length::Int = 10, batch_size::Int = 20) + A = rand(n, n) + x = rand(n, seq_length, batch_size) + xAx = tensor_mat_skew_sym_assign(x, A) / √n + isskew(xAx - tensor_transpose(xAx)) +end + +test_first_option() +test_second_option() \ No newline at end of file