Merge pull request #105 from JuliaGNI/fix_mnist_tutorial

Fixed mnist tutorial

michakraus authored Dec 20, 2023
2 parents 767dd78 + fb6dcb9 commit 830c2c9
Showing 5 changed files with 68 additions and 51 deletions.
14 changes: 14 additions & 0 deletions docs/src/GeometricMachineLearning.bib
@@ -151,3 +151,17 @@ @article{vaswani2017attention
volume={30},
year={2017}
}

@article{brantner2023symplectic,
title={Symplectic Autoencoders for Model Reduction of Hamiltonian Systems},
author={Brantner, Benedikt and Kraus, Michael},
journal={arXiv preprint arXiv:2312.10004},
year={2023}
}

@article{brantner2023structure,
author = {Brantner, Benedikt and de Romemont, Guillaume and Kraus, Michael and Li, Zeyuan},
title = {Structure-Preserving Transformers for Learning Parametrized Hamiltonian Systems},
journal = {arXiv preprint arXiv:2312.11166},
year = {2023}
}
1 change: 1 addition & 0 deletions docs/src/references.md
@@ -1,4 +1,5 @@
# References

```@bibliography
*
```
63 changes: 38 additions & 25 deletions docs/src/tutorials/mnist_tutorial.md
@@ -1,15 +1,15 @@
# MNIST tutorial

-This is a short tutorial that shows how we can use `GeometricMachineLearning` to build a vision transformer and apply it for MNIST, while also putting some of the weights on a manifold.
+This is a short tutorial that shows how we can use `GeometricMachineLearning` to build a vision transformer and apply it to MNIST, while also putting some of the weights on a manifold. This is also the result presented in [brantner2023generalizing](@cite).

First, we need to import the relevant packages:

```julia
-using GeometricMachineLearning, CUDA
-import Zygote, MLDatasets
+using GeometricMachineLearning, CUDA, Plots
+import Zygote, MLDatasets, KernelAbstractions
```

-In this example `Zygote` as an AD routine and we get the dataset from `MLDatasets`. First we need to load the data set, and put it on GPU (if you have one):
+For the AD routine we use the `GeometricMachineLearning` default, and we get the dataset from `MLDatasets`. First we need to load the data set and put it on GPU (if you have one):

```julia
train_x, train_y = MLDatasets.MNIST(split=:train)[:]
@@ -24,36 +24,49 @@ test_y = test_y |> cu

```julia
patch_length = 7
-dl = DataLoader(train_x, train_y, batch_size=512, patch_length=patch_length)
-dl_test = DataLoader(train_x, train_y, batch_size=length(y), patch_length=patch_length)
+dl = DataLoader(train_x, train_y, patch_length=patch_length)
+dl_test = DataLoader(train_x, train_y, patch_length=patch_length)
```

-The second line in the above code snippet indicates that we use the entire data set as one "batch" when processing the test set. For training, the batch size was here set to 512.
+Here `patch_length` indicates the size one patch has. One image in MNIST is of dimension ``28\times28``; this means that we decompose it into 16 ``(7\times7)`` images (also see [brantner2023generalizing](@cite)).
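
As an aside (not part of this commit, and not how `DataLoader` is implemented internally), the patch decomposition can be sketched in a few lines of plain Julia:

```julia
# Sketch: split one 28×28 image into 16 non-overlapping 7×7 patches.
# The exact patch ordering used by DataLoader may differ.
image = rand(Float32, 28, 28)      # stand-in for a single MNIST image
patch_length = 7
patches = [image[i:i+patch_length-1, j:j+patch_length-1]
           for i in 1:patch_length:28, j in 1:patch_length:28]

length(patches)                    # 16 patches per image
length(vec(first(patches)))        # 49, the feature dimension fed to the transformer
```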

We next define the model with which we want to train:

```julia
-ps = initialparameters(backend, eltype(dl.data), Ψᵉ)
+model = ClassificationTransformer(dl, n_heads=n_heads, n_layers=n_layers, Stiefel=true)
```

-optimizer_instance = Optimizer(o, ps)
+Here we have chosen a `ClassificationTransformer`, i.e. a composition of a specific number of transformer layers followed by a classification layer. We also set the *Stiefel option* to `true`, i.e. we are optimizing on the Stiefel manifold.
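
As a brief illustration of what that constraint means (plain linear algebra, independent of `GeometricMachineLearning`): a point on the Stiefel manifold ``St(n, N)`` is an ``N\times n`` matrix with orthonormal columns, and it is this property that the optimizer preserves during training. The dimensions below are made up for illustration only:

```julia
using LinearAlgebra

N, n = 49, 10                          # illustrative dimensions only
Y = Matrix(qr(randn(N, n)).Q)[:, 1:n]  # a matrix with orthonormal columns

Y' * Y ≈ I(n)                          # true: the defining property Y'Y = I
```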

println("initial test accuracy: ", accuracy(Ψᵉ, ps, dl_test), "\n")
We now have to initialize the neural network weights. This is done with the constructor for `NeuralNetwork`:

+```julia
+backend = KernelAbstractions.get_backend(dl)
+T = eltype(dl)
+nn = NeuralNetwork(model, backend, T)
+```

+And with this we can finally perform the training:

-progress_object = Progress(n_training_steps; enabled=true)
+```julia
+# an instance of batch is needed for the optimizer
+batch = Batch(batch_size)

+optimizer_instance = Optimizer(AdamOptimizer(), nn)

-loss_array = zeros(eltype(train_x), n_training_steps)
-for i in 1:n_training_steps
-    redraw_batch!(dl)
-    # get rid of try catch statement. This softmax issue should be solvable!
-    loss_val, pb = try Zygote.pullback(ps -> loss(Ψᵉ, ps, dl), ps)
-    catch
-        loss_array[i] = loss_array[i-1]
-        continue
-    end
-    dp = pb(one(loss_val))[1]
+# this prints the accuracy and is optional
+println("initial test accuracy: ", accuracy(Ψᵉ, ps, dl_test), "\n")

-    optimization_step!(optimizer_instance, Ψᵉ, ps, dp)
-    ProgressMeter.next!(progress_object; showvalues = [(:TrainingLoss, loss_val)])
-    loss_array[i] = loss_val
-end
+loss_array = optimizer_instance(nn, dl, batch, n_epochs)

println("final test accuracy: ", accuracy(Ψᵉ, ps, dl_test), "\n")
```
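
Not part of this commit, but since `Plots` is now among the imports, the returned loss values can be visualized; a minimal sketch, assuming `loss_array` contains one loss value per epoch:

```julia
# Sketch: plot the per-epoch training loss returned by the optimizer call above.
plot(loss_array, xlabel="epoch", ylabel="training loss", label="vision transformer")
```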

It is instructive to play with `n_layers`, `n_epochs` and the Stiefel property.
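
One way of doing so (a sketch only, reusing `dl`, `batch`, `backend` and `T` from the snippets above, and assuming `n_heads`, `n_layers` and `n_epochs` are set, since they are not defined in the excerpts shown here) is to train a second, unconstrained network and compare the two loss curves:

```julia
# Sketch: the same architecture without the Stiefel constraint, trained on the same data.
model2 = ClassificationTransformer(dl, n_heads=n_heads, n_layers=n_layers, Stiefel=false)
nn2 = NeuralNetwork(model2, backend, T)
optimizer2 = Optimizer(AdamOptimizer(), nn2)
loss_array2 = optimizer2(nn2, dl, batch, n_epochs)

plot(loss_array, label="Stiefel=true", xlabel="epoch", ylabel="training loss")
plot!(loss_array2, label="Stiefel=false")
```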

+```@bibliography
+Pages = []
+Canonical = false
+brantner2023generalizing
+```
37 changes: 12 additions & 25 deletions scripts/transformer_mnist.jl
@@ -2,19 +2,16 @@
TODO: Add a better predictor at the end! It should set the biggest value of the softmax to 1 and the rest to zero!
"""

-using GeometricMachineLearning, ProgressMeter, Plots, CUDA
-import Zygote, MLDatasets
+using GeometricMachineLearning, Plots, CUDA
+import MLDatasets

# MNIST images are 28×28, so a sequence_length of 16 = 4² means the image patches are of size 7² = 49
-image_dim = 28
-patch_length = 7
-transformer_dim = 49
-n_heads = 7
-n_layers = 16
-number_of_patch = (image_dim÷patch_length)^2
-batch_size = 2048
-activation = softmax
-n_epochs = 1000
+const patch_length = 7
+const n_heads = 7
+const n_layers = 16
+const batch_size = 2048
+const activation = softmax
+const n_epochs = 100
add_connection = false

train_x, train_y = MLDatasets.MNIST(split=:train)[:]
@@ -36,8 +33,8 @@ backend, train_x, test_x, train_y, test_y =
test_y
end

-dl = DataLoader(train_x, train_y)
-dl_test = DataLoader(test_x, test_y)
+dl = DataLoader(train_x, train_y, patch_length=7)
+dl_test = DataLoader(test_x, test_y, patch_length=7)

# the difference between the first and the second model is that we put the weights on the Stiefel manifold in the second case
model1 = ClassificationTransformer(dl, n_heads=n_heads, n_layers=n_layers, Stiefel=false, add_connection=add_connection)
@@ -52,23 +49,13 @@ function transformer_training(Ψᵉ::GeometricMachineLearning.Architecture; n_ep

println("initial test accuracy: ", GeometricMachineLearning.accuracy(nn, dl_test), "\n")

-progress_object = Progress(n_epochs; enabled=true)

# use the `time` function to get the system time.
init_time = time()
total_time = init_time - time()

-loss_array = zeros(eltype(train_x), n_epochs)
-for i in 1:n_epochs
-    # there is some functionality in a recent PR that streamlines some of this -> make sure to include this!
-    loss_val = optimize_for_one_epoch!(optimizer_instance, nn, dl, batch)

-    ProgressMeter.next!(progress_object; showvalues = [(:TrainingLoss, loss_val)])
-    loss_array[i] = loss_val
+loss_array = optimizer_instance(nn, dl, batch, n_epochs)

-    # update runtime
-    total_time = init_time - time()
-end
+total_time = init_time - time()

accuracy_score = GeometricMachineLearning.accuracy(nn, dl_test)
println("final test accuracy: ", accuracy_score, "\n")
4 changes: 3 additions & 1 deletion src/data_loader/data_loader.jl
@@ -174,4 +174,6 @@ end

accuracy(nn::NeuralNetwork, dl::DataLoader) = accuracy(nn.model, nn.params, dl)

Base.eltype(::DataLoader{T}) where T = T

KernelAbstractions.get_backend(dl::DataLoader) = KernelAbstractions.get_backend(dl.input)
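
A small usage sketch of the method added above, mirroring the updated tutorial (so `dl` and `model` are assumed to exist as defined there): the backend and element type can now both be queried from the `DataLoader` and passed straight to the `NeuralNetwork` constructor.

```julia
# Sketch: infer backend and element type directly from a DataLoader.
backend = KernelAbstractions.get_backend(dl)   # uses the method defined above
T = eltype(dl)
nn = NeuralNetwork(model, backend, T)
```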
