From fec5faf9c67de100909d528f8487e953321d18bf Mon Sep 17 00:00:00 2001 From: Saransh Date: Fri, 15 Jul 2022 02:09:42 +0530 Subject: [PATCH] Update code blocks to get rid of `Flux.params` --- docs/src/getting_started/linear_regression.md | 180 +++++++++--------- 1 file changed, 89 insertions(+), 91 deletions(-) diff --git a/docs/src/getting_started/linear_regression.md b/docs/src/getting_started/linear_regression.md index 0f3fc08fad..9349bd876f 100644 --- a/docs/src/getting_started/linear_regression.md +++ b/docs/src/getting_started/linear_regression.md @@ -65,13 +65,13 @@ We can now proceed ahead and build a model for our dataset! A linear regression model is mathematically defined as - ```math -model(x) = Wx + b +model(W, b, x) = Wx + b ``` where `W` is the weight matrix and `b` is the bias. For our case, the weight matrix (`W`) would constitute only a single element, as we have only a single feature. We can define our model in `Julia` using the exact same notation! ```jldoctest linear_regression_simple -julia> model(x) = @. W*x + b +julia> model(W, b, x) = @. W*x + b model (generic function with 1 method) ``` @@ -82,7 +82,7 @@ The next step would be to initialize the model parameters, which are the weight ```jldoctest linear_regression_simple; filter = r"[+-]?([0-9]*[.])?[0-9]+" julia> W = rand(Float32, 1, 1) 1×1 Matrix{Float32}: - 0.33832288 + 0.99285793 julia> b = [0.0f0] 1-element Vector{Float32}: @@ -92,23 +92,23 @@ julia> b = [0.0f0] Time to test if our model works! ```jldoctest linear_regression_simple; filter = r"[+-]?([0-9]*[.])?[0-9]+" -julia> model(x) |> size +julia> model(W, b, x) |> size (1, 61) -julia> model(x)[1], y[1] -(-0.5491928f0, -7.0f0) +julia> model(W, b, x)[1], y[1] +(-1.6116865f0, -7.0f0) ``` It does! But the predictions are way off. We need to train the model to improve the predictions, but before training the model we need to define the loss function. The loss function would ideally output a quantity that we will try to minimize during the entire training process. Here we will use the mean sum squared error loss function. ```jldoctest linear_regression_simple; filter = r"[+-]?([0-9]*[.])?[0-9]+" -julia> function loss(x, y) - ŷ = model(x) +julia> function loss(W, b, x, y) + ŷ = model(W, b, x) sum((y .- ŷ).^2) / length(x) end; -julia> loss(x, y) -28.880724f0 +julia> loss(W, b, x, y) +23.772217f0 ``` Calling the loss function on our `x`s and `y`s shows how far our predictions (`ŷ`) are from the real labels. More precisely, it calculates the sum of the squares of residuals and divides it by the total number of data points. @@ -124,7 +124,7 @@ A [`Dense(1 => 1)`](@ref Dense) layer denotes a layer of one neuron with one inp ```jldoctest linear_regression_simple; filter = r"[+-]?([0-9]*[.])?[0-9]+" julia> flux_model.weight, flux_model.bias -(Float32[1.0764818], Float32[0.0]) +(Float32[1.1412252], Float32[0.0]) ``` Now we can check if our model is acting right. We can pass the complete data in one go, with each `x` having exactly one feature (one input) - @@ -134,7 +134,7 @@ julia> flux_model(x) |> size (1, 61) julia> flux_model(x)[1], y[1] -(-1.7474315f0, -7.0f0) +(-1.8525281f0, -7.0f0) ``` It is! The next step would be defining the loss function using `Flux`'s functions - @@ -146,55 +146,51 @@ julia> function flux_loss(x, y) end; julia> flux_loss(x, y) -23.189152f0 +22.74856f0 ``` Everything works as before! It almost feels like `Flux` provides us with smart wrappers for the functions we could have written on our own. 
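+To make this point concrete, here is a small sketch (not one of the tutorial's doctests) checking that `Flux.mse` computes exactly the same quantity as our handwritten loss:
+
+```julia
+# Sketch: the handwritten loss is simply the mean squared error that Flux.mse computes.
+ŷ = model(W, b, x)                               # predictions from the custom model
+sum((y .- ŷ) .^ 2) / length(x) ≈ Flux.mse(ŷ, y)  # expected to return true
+```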
Now, as the last step of this section, let's see how different the `flux_model` is from our custom `model`. A good way to go about this would be to fix the parameters of both models to be the same. Let's change the parameters of our custom `model` to match that of the `flux_model` -

```jldoctest linear_regression_simple; filter = r"[+-]?([0-9]*[.])?[0-9]+"
-julia> W = Float32[1.0764818]
+julia> W = Float32[1.1412252]
1-element Vector{Float32}:
- 1.0764818
+ 1.1412252
```

To check how both the models are performing on the data, let's find out the losses using the `loss` and `flux_loss` functions -

```jldoctest linear_regression_simple; filter = r"[+-]?([0-9]*[.])?[0-9]+"
-julia> loss(x, y), flux_loss(x, y)
-(23.189152f0, 23.189152f0)
+julia> loss(W, b, x, y), flux_loss(x, y)
+(22.74856f0, 22.74856f0)
```

The losses are identical! This means that our `model` and the `flux_model` are identical on some level, and the loss functions are completely identical! The difference in models would be that `Flux`'s [`Dense`](@ref) layer supports many other arguments that can be used to customize the layer further. But, for this tutorial, let us stick to our simple custom `model`.

### Training the model
-Before we begin the training procedure with `Flux`, let's initialize an optimiser, finalize our data, and pass our parameters through [`Flux.params`](@ref) to specify that we want all derivatives of `W` and `b`. We will be using the classic [`Gradient Descent`](@ref Descent) algorithm. `Flux` comes loaded with a lot of different optimisers; refer to [Optimisers](@ref) for more information on the same.
+To train the model, let's compute the gradient of the loss function with respect to `W` and `b` using `Flux.gradient`, and then take a single step of the classic [`Gradient Descent`](@ref Descent) algorithm by updating the parameters manually with a learning rate of `0.1`. `Flux` also comes loaded with a lot of different optimisers; refer to [Optimisers](@ref) for more information.

```jldoctest linear_regression_simple; filter = r"[+-]?([0-9]*[.])?[0-9]+"
-julia> opt = Descent(0.01);
+julia> dLdW, dLdb, _, _ = gradient(loss, W, b, x, y)
+(Float32[-6.7322206], Float32[-4.132563], Float32[0.1926041 0.14162663 … -0.39782608 -0.29997927], Float32[-0.16876957 -0.12410051 … 0.3485956 0.2628572])

-julia> data = [(x, y)];
+julia> W .= W .- 0.1 .* dLdW
+1-element Vector{Float32}:
+ 1.8144473

-julia> params = Flux.params(W, b)
-Params([Float32[0.71305436], Float32[0.0]])
+julia> b .= b .- 0.1 .* dLdb
+1-element Vector{Float32}:
+ 0.41325632
```

-Now, we can move to the actual training! The training consists of obtaining the gradient and updating the current parameters with the obtained derivatives using backpropagation. This is achieved using `Flux.gradient` (see see [Taking Gradients](@ref)) and [`Flux.Optimise.update!`](@ref) functions respectively.
+That is one complete step of training! Training consists of obtaining the gradient with `Flux.gradient` (see [Taking Gradients](@ref)) and updating the current parameters with the obtained derivatives, exactly as done above.

-```jldoctest linear_regression_simple
-julia> gs = Flux.gradient(params) do
-          loss(x, y)
-        end;
-
-julia> Flux.Optimise.update!(opt, params, gs)
-```
-
We can now check the values of our parameters and the value of the loss function -

```jldoctest linear_regression_simple; filter = r"[+-]?([0-9]*[.])?[0-9]+"
-julia> params, loss(x, y)
-(Params([Float32[1.145264], Float32[0.041250423]]), 22.5526f0)
+julia> loss(W, b, x, y)
+17.157953f0
```

The parameters changed, and the loss went down! This means that we successfully trained our model for one epoch. We can plug the training code written above into a loop and train the model for a higher number of epochs. It can be customized either to have a fixed number of epochs or to stop when certain conditions are met, for example, `change in loss < 0.1`. This loop can be customized to suit a user's needs, and the conditions can be specified in plain `Julia`!
@@ -202,21 +198,27 @@
-`Flux` also provides a convenience function to train a model. The [`Flux.train!`](@ref) function performs the same task described above and does not require calculating the gradient manually.
+Let's wrap the gradient computation and the parameter updates in a single `train_model` function, which performs one epoch of training every time it is called.

```jldoctest linear_regression_simple; filter = r"[+-]?([0-9]*[.])?[0-9]+"
-julia> Flux.train!(loss, params, data, opt)
+julia> function train_model()
+           dLdW, dLdb, _, _ = gradient(loss, W, b, x, y)
+           @. W = W - 0.1 * dLdW
+           @. b = b - 0.1 * dLdb
+       end;
+
+julia> train_model();

-julia> params, loss(x, y)
-(Params([Float32[1.2125431], Float32[0.08175573]]), 21.94231f0)
+julia> W, b, loss(W, b, x, y)
+(Float32[2.340657], Float32[0.7516814], 13.64972f0)
```

-The parameters changed again, and the loss went down again! This was the second epoch of our training procedure. Let's plug this in a for loop and train the model for 60 epochs.
+The parameters changed again, and the loss went down again! This was the second epoch of our training procedure. Let's plug this into a for loop and train the model for 30 more epochs.

```jldoctest linear_regression_simple; filter = r"[+-]?([0-9]*[.])?[0-9]+"
-julia> for i = 1:60
-          Flux.train!(loss, params, data, opt)
+julia> for i = 1:30
+          train_model()
       end

-julia> params, loss(x, y)
-(Params([Float32[3.426797], Float32[1.5412952]]), 8.848401f0)
+julia> W, b, loss(W, b, x, y)
+(Float32[4.2408285], Float32[2.243728], 7.668049f0)
```

The loss went down significantly!

@@ -224,7 +226,7 @@ The loss went down significantly!
`Flux` provides yet another convenience functionality, the [`Flux.@epochs`](@ref) macro, which can be used to train a model for a specific number of epochs.

```jldoctest linear_regression_simple; filter = r"[+-]?([0-9]*[.])?[0-9]+"
-julia> Flux.@epochs 10 Flux.train!(loss, params, data, opt)
+julia> Flux.@epochs 10 train_model()
[ Info: Epoch 1
[ Info: Epoch 2
[ Info: Epoch 3
[ Info: Epoch 4
[ Info: Epoch 5
[ Info: Epoch 6
[ Info: Epoch 7
[ Info: Epoch 8
[ Info: Epoch 9
[ Info: Epoch 10

-julia> params, loss(x, y)
-(Params([Float32[3.58633], Float32[1.6624337]]), 8.44982f0)
+julia> W, b, loss(W, b, x, y)
+(Float32[4.2422233], Float32[2.2460847], 7.6680417f0)
```

-We can train the model even more or tweak the hyperparameters to achieve the desired result faster, but let's stop here. We trained our model for 72 epochs, and loss went down from `23.189152` to `8.44982`. Time for some visualization!
+We can train the model even more or tweak the hyperparameters to achieve the desired result faster, but let's stop here. We trained our model for 42 epochs, and the loss went down from `22.74856` to `7.6680417`. Time for some visualization!

### Results
The main objective of this tutorial was to fit a line to our dataset using the linear regression algorithm. The training procedure went well, and the loss went down significantly! Let's see what the fitted line looks like. Remember, `Wx + b` is nothing more than a line's equation, with `slope = W[1]` and `y-intercept = b[1]` (indexing at `1` as `W` and `b` are iterable).
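+Since `W` and `b` are iterable, reading the fitted line off the trained parameters takes only a couple of lines. The sketch below is not one of the tutorial's doctests, and the names `slope` and `intercept` are purely illustrative; it simply prints the equation of the learned line before we plot it:
+
+```julia
+# Sketch: report the learned line y = W[1] * x + b[1].
+slope, intercept = W[1], b[1]
+println("fitted line: y = ", round(slope; digits = 3), "x + ", round(intercept; digits = 3))
+```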
@@ -252,7 +254,8 @@ julia> plot(reshape(x, (61, 1)), reshape(y, (61, 1)), lw = 3, seriestype = :scat

julia> plot!((x) -> b[1] + W[1] * x, -3, 3, label="Custom model", lw=2);
```

-![linear-regression-line](https://user-images.githubusercontent.com/74055102/177034985-d53adf40-5527-4a83-b9f6-7a62e5cc678f.png)
+![linear-regression-line](https://user-images.githubusercontent.com/74055102/179050736-366bedcc-6990-40ee-83be-e11d07492e05.png)
+

The line fits well! There is room for improvement, but we leave that up to you! You can play with the optimisers, the number of epochs, learning rate, etc. to improve the fitting and reduce the loss!

@@ -308,13 +311,7 @@ julia> std(x_train_n)
 1.0000843694328236
```

-The standard deviation is now close to one! The last step for this section would be to wrap the `x`s and `y`s together to create the training data.
-
-```jldoctest linear_regression_complex
-julia> train_data = [(x_train_n, y_train)];
-```
-
-Our data is ready!
+The standard deviation is now close to one! Our data is ready!

### Model
We can now directly use `Flux` and let it do all the work internally! Let's define a model that takes in 13 inputs (13 features) and gives us a single output (the label). We will then pass our entire data through this model in one go, and `Flux` will handle everything for us! Remember, we could have declared a model in plain `Julia` as well. The model will have 14 parameters, 13 weights, and one bias.

@@ -327,13 +324,13 @@ Dense(13 => 1) # 14 parameters

Same as before, our next step would be to define a loss function to quantify our accuracy somehow. The lower the loss, the better the model!

```jldoctest linear_regression_complex; filter = r"[+-]?([0-9]*[.])?[0-9]+"
-julia> function loss(x, y)
+julia> function loss(model, x, y)
           ŷ = model(x)
           Flux.mse(ŷ, y)
       end;

-julia> loss(x_train_n, y_train)
-685.4700669900504
+julia> loss(model, x_train_n, y_train)
+676.165591625047
```

We can now proceed to the training phase!

@@ -342,9 +339,11 @@ We can now proceed to the training phase!
### Training
-Before training the model, let's initialize the optimiser and let `Flux` know that we want all the derivatives of all the parameters of our `model`.
+Before training the model, let's define a `train_model` function that computes the gradient of the loss with respect to the `model` and updates the model's weight and bias with one step of gradient descent, using a small learning rate of `0.000001`.

```jldoctest linear_regression_complex
-julia> opt = Descent(0.05);
-
-julia> params = Flux.params(model);
+julia> function train_model()
+           dLdm, _, _ = gradient(loss, model, x_train_n, y_train)
+           @. model.weight = model.weight - 0.000001 * dLdm.weight
+           @. model.bias = model.bias - 0.000001 * dLdm.bias
+       end;
```

Contrary to our last training procedure, let's say that this time we don't want to hardcode the number of epochs. We want the training procedure to stop when the loss converges, that is, when `change in loss < δ`. The quantity `δ` can be altered according to a user's need, but let's fix it to `10⁻³` for this tutorial.

We can write such custom training loops effortlessly using Flux and plain Julia!
```jldoctest linear_regression_complex
julia> loss_init = Inf;

julia> while true
-           Flux.train!(loss, params, train_data, opt)
+           train_model()
           if loss_init == Inf
-               loss_init = loss(x_train_n, y_train)
+               loss_init = loss(model, x_train_n, y_train)
               continue
           end
-           if abs(loss_init - loss(x_train_n, y_train)) < 1e-3
+           if abs(loss_init - loss(model, x_train_n, y_train)) < 1e-3
               break
           else
-               loss_init = loss(x_train_n, y_train)
+               loss_init = loss(model, x_train_n, y_train)
           end
       end;
```

@@ -374,7 +373,7 @@ This custom loop works! This shows how easily a user can write down any custom t
Let's have a look at the loss -

```jldoctest linear_regression_complex; filter = r"[+-]?([0-9]*[.])?[0-9]+"
-julia> loss(x_train_n, y_train)
+julia> loss(model, x_train_n, y_train)
27.127200028562164
```

@@ -386,7 +385,7 @@ The last step of this tutorial would be to test our model using the testing data

```jldoctest linear_regression_complex; filter = r"[+-]?([0-9]*[.])?[0-9]+"
julia> x_test_n = Flux.normalise(x_test);

-julia> loss(x_test_n, y_test)
+julia> loss(model, x_test_n, y_test)
66.91014769713368
```

@@ -406,7 +405,6 @@ After getting familiar with the basics of Flux and Julia, we moved ahead to buil
using Flux
using Plots

-
# data
x = hcat(collect(Float32, -3:0.1:3)...)
f(x) = @. 3x + 2
@@ -417,29 +415,30 @@ x = x .* reshape(rand(Float32, 61), (1, 61))
plot(reshape(x, (61, 1)), reshape(y, (61, 1)), lw = 3, seriestype = :scatter, label = "", title = "Generated data", xlabel = "x", ylabel= "y")

# custom model and parameters
-model(x) = @. W*x + b
+model(W, b, x) = @. W*x + b
W = rand(Float32, 1, 1)
b = [0.0f0]

# loss function
-function loss(x, y)
-    ŷ = model(x)
+function loss(W, b, x, y)
+    ŷ = model(W, b, x)
    sum((y .- ŷ).^2) / length(x)
end;

-print("Initial loss", loss(x, y), "\n")
-
-# optimiser, data, and parameters
-opt = Descent(0.01);
-data = [(x, y)];
-params = Flux.params(W, b)
+print("Initial loss", loss(W, b, x, y), "\n")

# train
-for i = 1:72
-    Flux.train!(loss, params, data, opt)
+function train_model()
+    dLdW, dLdb, _, _ = gradient(loss, W, b, x, y)
+    @. W = W - 0.1 * dLdW
+    @. b = b - 0.1 * dLdb
+end
+
+for i = 1:42
+    train_model()
end

-print("Final loss", loss(x, y), "\n")
+print("Final loss", loss(W, b, x, y), "\n")

# plot data and results
plot(reshape(x, (61, 1)), reshape(y, (61, 1)), lw = 3, seriestype = :scatter, label = "", title = "Simple Linear Regression", xlabel = "x", ylabel= "y")
@@ -451,47 +450,46 @@ using Flux
using Statistics
using MLDatasets: BostonHousing

-
# data
x, y = BostonHousing(as_df=false)[:]
x_train, x_test, y_train, y_test = x[:, 1:400], x[:, 401:end], y[:, 1:400], y[:, 401:end]

x_train_n = Flux.normalise(x_train)
-train_data = [(x_train_n, y_train)]

# model
model = Dense(13 => 1)

# loss function
-function loss(x, y)
+function loss(model, x, y)
    ŷ = model(x)
    Flux.mse(ŷ, y)
end;

-print("Initial loss", loss(x_train_n, y_train), "\n")
-
-# optimiser and parameters
-opt = Descent(0.05);
-params = Flux.params(model);
+print("Initial loss", loss(model, x_train_n, y_train), "\n")

# train
+function train_model()
+    dLdm, _, _ = gradient(loss, model, x_train_n, y_train)
+    @. model.weight = model.weight - 0.000001 * dLdm.weight
+    @. model.bias = model.bias - 0.000001 * dLdm.bias
+end
+
loss_init = Inf;
while true
-    Flux.train!(loss, params, data, opt)
+    train_model()
    if loss_init == Inf
-        loss_init = loss(x_train_n, y_train)
+        loss_init = loss(model, x_train_n, y_train)
        continue
    end
-
-    if abs(loss_init - loss(x_train_n, y_train)) < 1e-3
+    if abs(loss_init - loss(model, x_train_n, y_train)) < 1e-3
        break
    else
-        loss_init = loss(x_train_n, y_train)
+        loss_init = loss(model, x_train_n, y_train)
    end
end

-print("Final loss", loss(x_train_n, y_train), "\n")
+print("Final loss", loss(model, x_train_n, y_train), "\n")

-# testing
+# test
x_test_n = Flux.normalise(x_test);
-print("Test loss", loss(x_test_n, y_test), "\n")
+print("Test loss", loss(model, x_test_n, y_test), "\n")
```
\ No newline at end of file
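+
+As an optional follow-up to the multiple linear regression script (a sketch rather than one of the tutorial's doctests; the name `ŷ_test` is purely illustrative), one could print a few test-set predictions next to the true labels to sanity-check the trained model:
+
+```julia
+# Sketch: compare the first few test-set predictions with the true labels.
+ŷ_test = model(x_test_n)
+for i in 1:5
+    println("predicted = ", round(ŷ_test[i]; digits = 2), ", actual = ", y_test[i])
+end
+```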