running on eagle

vpuri3 · Aug 26, 2023 · d391430 · d391430
1 parent 81f035c
commit d391430
Show file tree

Hide file tree

Showing 18 changed files with 63 additions and 59 deletions.
diff --git a/Project.toml b/Project.toml
@@ -12,7 +12,6 @@ Colors = "5ae59095-9a9b-59fe-a467-6f913c188581"
 ComponentArrays = "b0b7db55-cfe3-40fc-9ded-d10e2dbeff66"
 DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
 FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
-Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
 FourierSpaces = "f3d7ce31-8807-4b9f-8144-4bd3ed8f0112"
 HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"

diff --git a/examples/diffusion_fourier/exp_bilinear_scale/bilin.jl b/examples/diffusion_fourier/exp_bilinear_scale/bilin.jl
@@ -30,8 +30,8 @@ import Lux: cpu, gpu
 using Tullio, Zygote
 
 using FFTW, LinearAlgebra
-BLAS.set_num_threads(2)
-FFTW.set_num_threads(8)
+BLAS.set_num_threads(12)
+FFTW.set_num_threads(24)
 
 include("../datagen.jl")
 
@@ -64,15 +64,15 @@ o = size(__data[2], 1) # out channels
 
 NN = Lux.Chain(
     PermutedBatchNorm(c, 3),
-    Dense(c , w, tanh),
-    OpKernel(w, w, m, tanh),
-    OpKernel(w, w, m, tanh),
-    OpKernel(w, w, m, tanh),
+    Dense(c , w, Lux.relu),
+    OpKernel(w, w, m, Lux.relu),
+    OpKernel(w, w, m, Lux.relu),
+    OpKernel(w, w, m, Lux.relu),
     Dense(w , o)
 )
 
 opt = Optimisers.Adam()
-batchsize = size(__data[1])[end]
+batchsize = 128 #size(__data[1])[end]
 learning_rates = (1f-2, 1f-3,)
 nepochs  = E .* (0.10, 0.90,) .|> Int
 dir = joinpath(@__DIR__, "exp_FNO_nonlin")
@@ -109,18 +109,20 @@ bilin  = OpConvBilinear(w1, w2, o, m)
 NN = linear_nonlinear(split, nonlin, linear, bilin)
 
 opt = Optimisers.Adam()
-batchsize = 256 # size(__data[1])[end] # 1024
+batchsize = 128 # size(__data[1])[end] # 1024
 learning_rates = (1f-3,)
 nepochs = E .* (1.00,) .|> Int
 # learning_rates = (1f-3, 5f-4, 2.5f-4, 1.25f-4,)
 # nepochs        = E .* (0.25, 0.25, 0.25, 0.25,) .|> Int
 dir = joinpath(@__DIR__, "exp_FNO_linear_nonlinear")
-device = Lux.cpu
+device = Lux.gpu
 
 model, ST = train_model(rng, NN, __data, data__, _V, opt;
         batchsize, learning_rates, nepochs, dir, cbstep = 1, device)
 
 end
 
+plot_training(ST...) |> display
+
 nothing
 #
diff --git a/examples/diffusion_fourier/exp_bilinear_scale/exp_FNO_linear_nonlinear/plt_r2_test.png b/examples/diffusion_fourier/exp_bilinear_scale/exp_FNO_linear_nonlinear/plt_r2_test.png
diff --git a/examples/diffusion_fourier/exp_bilinear_scale/exp_FNO_linear_nonlinear/plt_r2_train.png b/examples/diffusion_fourier/exp_bilinear_scale/exp_FNO_linear_nonlinear/plt_r2_train.png
diff --git a/examples/diffusion_fourier/exp_bilinear_scale/exp_FNO_linear_nonlinear/plt_training.png b/examples/diffusion_fourier/exp_bilinear_scale/exp_FNO_linear_nonlinear/plt_training.png
diff --git a/examples/diffusion_fourier/exp_bilinear_scale/exp_FNO_linear_nonlinear/plt_traj_test.png b/examples/diffusion_fourier/exp_bilinear_scale/exp_FNO_linear_nonlinear/plt_traj_test.png
diff --git a/examples/diffusion_fourier/exp_bilinear_scale/exp_FNO_linear_nonlinear/plt_traj_train.png b/examples/diffusion_fourier/exp_bilinear_scale/exp_FNO_linear_nonlinear/plt_traj_train.png
diff --git a/examples/diffusion_fourier/exp_bilinear_scale/exp_FNO_linear_nonlinear/statistics.txt b/examples/diffusion_fourier/exp_bilinear_scale/exp_FNO_linear_nonlinear/statistics.txt
@@ -1,19 +1,19 @@
-	 TRAIN LOSS: 0.02262737	 TEST LOSS: 0.03570642
+	 TRAIN LOSS: 0.03218706	 TEST LOSS: 0.04251174
 #======================#
 TRAIN STATS
-R² score:                   0.9730648
-MSE (mean SQR error):       0.02510134
-RMSE (root mean SQR error): 0.15843402
-MAE (mean ABS error):       0.11758655
-maxAE (max ABS error)       1.0394883
+R² score:                   0.96962214
+MSE (mean SQR error):       0.03111367
+RMSE (root mean SQR error): 0.17639066
+MAE (mean ABS error):       0.12991187
+maxAE (max ABS error)       1.218371
 
 #======================#
 #======================#
 TEST  STATS
-R² score:                   0.97130257
-MSE (mean SQR error):       0.03251882
-RMSE (root mean SQR error): 0.18032974
-MAE (mean ABS error):       0.13228929
-maxAE (max ABS error)       1.1780059
+R² score:                   0.9665479
+MSE (mean SQR error):       0.04228398
+RMSE (root mean SQR error): 0.20563067
+MAE (mean ABS error):       0.15284078
+maxAE (max ABS error)       1.322362
 
 #======================#
diff --git a/examples/diffusion_fourier/exp_bilinear_scale/exp_FNO_nonlin/plt_r2_test.png b/examples/diffusion_fourier/exp_bilinear_scale/exp_FNO_nonlin/plt_r2_test.png
diff --git a/examples/diffusion_fourier/exp_bilinear_scale/exp_FNO_nonlin/plt_r2_train.png b/examples/diffusion_fourier/exp_bilinear_scale/exp_FNO_nonlin/plt_r2_train.png
diff --git a/examples/diffusion_fourier/exp_bilinear_scale/exp_FNO_nonlin/plt_training.png b/examples/diffusion_fourier/exp_bilinear_scale/exp_FNO_nonlin/plt_training.png
diff --git a/examples/diffusion_fourier/exp_bilinear_scale/exp_FNO_nonlin/plt_traj_test.png b/examples/diffusion_fourier/exp_bilinear_scale/exp_FNO_nonlin/plt_traj_test.png
diff --git a/examples/diffusion_fourier/exp_bilinear_scale/exp_FNO_nonlin/plt_traj_train.png b/examples/diffusion_fourier/exp_bilinear_scale/exp_FNO_nonlin/plt_traj_train.png
diff --git a/examples/diffusion_fourier/exp_bilinear_scale/exp_FNO_nonlin/statistics.txt b/examples/diffusion_fourier/exp_bilinear_scale/exp_FNO_nonlin/statistics.txt
@@ -1,17 +1,19 @@
-	 TRAIN LOSS: 0.00225263	 TEST LOSS: 0.00903751
+	 TRAIN LOSS: 0.00091016	 TEST LOSS: 0.0081393
 #======================#
 TRAIN STATS
-R² score:       0.9977584
-mean SQR error: 0.00225267
-mean ABS error: 0.03524051
-max  ABS error: 0.48005915
+R² score:                   0.99895054
+MSE (mean SQR error):       0.00102989
+RMSE (root mean SQR error): 0.03209183
+MAE (mean ABS error):       0.02561593
+maxAE (max ABS error)       0.20224762
 
 #======================#
 #======================#
 TEST  STATS
-R² score:       0.99156046
-mean SQR error: 0.00903739
-mean ABS error: 0.0693927
-max  ABS error: 0.84695154
+R² score:                   0.9920869
+MSE (mean SQR error):       0.0082745
+RMSE (root mean SQR error): 0.09096426
+MAE (mean ABS error):       0.06705302
+maxAE (max ABS error)       0.65916514
 
 #======================#
diff --git a/examples/pdebench/darcy2d.jl b/examples/pdebench/darcy2d.jl
@@ -30,18 +30,18 @@ import Lux: cpu, gpu
 using Tullio, Zygote
 
 using FFTW, LinearAlgebra
-BLAS.set_num_threads(4)
-FFTW.set_num_threads(8)
+BLAS.set_num_threads(12)
+FFTW.set_num_threads(24)
 
 rng = Random.default_rng()
 Random.seed!(rng, 345)
 
 N = 128
-E = 40
+E = 100
 
 # trajectories
-_K = 128
-K_ = 32
+_K = 512
+K_ = 128
 
 # get data
 dir = @__DIR__
@@ -55,31 +55,32 @@ V = FourierSpace(N, N)
 # FNO model
 ###
 
-w = 32        # width
+w = 64        # width
 m = (32, 32,) # modes
 c = size(_data[1], 1) # in  channels
 o = size(_data[2], 1) # out channels
+act = Lux.relu
 
 NN = Lux.Chain(
-    Dense(c, w, tanh),
-    OpKernel(w, w, m, tanh),
-    OpKernel(w, w, m, tanh),
-    OpKernel(w, w, m, tanh),
-    OpKernel(w, w, m, tanh),
+    Dense(c, w, act),
+    OpKernel(w, w, m, act),
+    OpKernel(w, w, m, act),
+    OpKernel(w, w, m, act),
+    OpKernel(w, w, m, act),
     Dense(w, o),
 )
 
 opt = Optimisers.Adam()
-batchsize = 16
+batchsize = 32
 learning_rates = (1f-2, 1f-3, 5f-4, 2.5f-4,)
 nepochs  = E .* (0.25, 0.25, 0.25, 0.25,) .|> Int
 dir = joinpath(@__DIR__, "model_darcy2D")
-device = Lux.cpu # Lux.gpu
+device = Lux.gpu
 
 model, ST = train_model(rng, NN, _data, data_, V, opt;
     batchsize, learning_rates, nepochs, dir, device)
 
-plot_training(ST...)
+plot_training(ST...) |> display
 
-# nothing
+nothing
 #
diff --git a/examples/pdebench/model_darcy2D/plt_training.png b/examples/pdebench/model_darcy2D/plt_training.png
diff --git a/examples/pdebench/model_darcy2D/statistics.txt b/examples/pdebench/model_darcy2D/statistics.txt
@@ -1,19 +1,19 @@
-	 TRAIN LOSS: 0.00020804	 TEST LOSS: 8.247e-5
+	 TRAIN LOSS: 2.19e-6	 TEST LOSS: 0.00093983
 #======================#
 TRAIN STATS
-R² score:                   0.76355976
-MSE (mean SQR error):       0.0003289
-RMSE (root mean SQR error): 0.01813549
-MAE (mean ABS error):       0.00971096
-maxAE (max ABS error)       0.26131245
+R² score:                   0.9909916
+MSE (mean SQR error):       9.42e-6
+RMSE (root mean SQR error): 0.00306904
+MAE (mean ABS error):       0.00133418
+maxAE (max ABS error)       0.08552231
 
 #======================#
 #======================#
 TEST  STATS
-R² score:                   0.85738647
-MSE (mean SQR error):       0.00025149
-RMSE (root mean SQR error): 0.01585852
-MAE (mean ABS error):       0.00898928
-maxAE (max ABS error)       0.16604173
+R² score:                   0.27241534
+MSE (mean SQR error):       0.00044176
+RMSE (root mean SQR error): 0.02101806
+MAE (mean ABS error):       0.00636383
+maxAE (max ABS error)       0.277627
 
 #======================#
diff --git a/src/train.jl b/src/train.jl
@@ -74,8 +74,8 @@ function train_model(
 
     # cb_batch = 
 
-    # early stopping: (need fullbatch validation loss for early stopping?)
-    # https://github.com/jeffheaton/app_deep_learning/blob/main/t81_558_class_03_4_early_stop.ipynb
+    # TODO: early stopping: (need fullbatch validation loss for early stopping?)
+    # TODO: https://github.com/jeffheaton/app_deep_learning/blob/main/t81_558_class_03_4_early_stop.ipynb
 
     # callback for training
     cb_epoch = (p, st, epoch, nepoch; io = io) -> callback(p, st; io,
@@ -121,7 +121,7 @@ function train_model(
         cb_stats(p, st)
     end
 
-    # TODO - output a train.log file with timings
+    # TODO: output a train.log file with timings
 
     # save statistics
     statsfile = open(joinpath(dir, "statistics.txt"), "w")