Skip to content

Commit

Permalink
running on eagle
Browse files Browse the repository at this point in the history
  • Loading branch information
vpuri3 committed Aug 26, 2023
1 parent 81f035c commit d391430
Show file tree
Hide file tree
Showing 18 changed files with 63 additions and 59 deletions.
1 change: 0 additions & 1 deletion Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ Colors = "5ae59095-9a9b-59fe-a467-6f913c188581"
ComponentArrays = "b0b7db55-cfe3-40fc-9ded-d10e2dbeff66"
DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
FourierSpaces = "f3d7ce31-8807-4b9f-8144-4bd3ed8f0112"
HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f"
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
Expand Down
20 changes: 11 additions & 9 deletions examples/diffusion_fourier/exp_bilinear_scale/bilin.jl
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,8 @@ import Lux: cpu, gpu
using Tullio, Zygote

using FFTW, LinearAlgebra
BLAS.set_num_threads(2)
FFTW.set_num_threads(8)
BLAS.set_num_threads(12)
FFTW.set_num_threads(24)

include("../datagen.jl")

Expand Down Expand Up @@ -64,15 +64,15 @@ o = size(__data[2], 1) # out channels

NN = Lux.Chain(
PermutedBatchNorm(c, 3),
Dense(c , w, tanh),
OpKernel(w, w, m, tanh),
OpKernel(w, w, m, tanh),
OpKernel(w, w, m, tanh),
Dense(c , w, Lux.relu),
OpKernel(w, w, m, Lux.relu),
OpKernel(w, w, m, Lux.relu),
OpKernel(w, w, m, Lux.relu),
Dense(w , o)
)

opt = Optimisers.Adam()
batchsize = size(__data[1])[end]
batchsize = 128 #size(__data[1])[end]
learning_rates = (1f-2, 1f-3,)
nepochs = E .* (0.10, 0.90,) .|> Int
dir = joinpath(@__DIR__, "exp_FNO_nonlin")
Expand Down Expand Up @@ -109,18 +109,20 @@ bilin = OpConvBilinear(w1, w2, o, m)
NN = linear_nonlinear(split, nonlin, linear, bilin)

opt = Optimisers.Adam()
batchsize = 256 # size(__data[1])[end] # 1024
batchsize = 128 # size(__data[1])[end] # 1024
learning_rates = (1f-3,)
nepochs = E .* (1.00,) .|> Int
# learning_rates = (1f-3, 5f-4, 2.5f-4, 1.25f-4,)
# nepochs = E .* (0.25, 0.25, 0.25, 0.25,) .|> Int
dir = joinpath(@__DIR__, "exp_FNO_linear_nonlinear")
device = Lux.cpu
device = Lux.gpu

model, ST = train_model(rng, NN, __data, data__, _V, opt;
batchsize, learning_rates, nepochs, dir, cbstep = 1, device)

end

plot_training(ST...) |> display

nothing
#
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Original file line number Diff line number Diff line change
@@ -1,19 +1,19 @@
TRAIN LOSS: 0.02262737 TEST LOSS: 0.03570642
TRAIN LOSS: 0.03218706 TEST LOSS: 0.04251174
#======================#
TRAIN STATS
R² score: 0.9730648
MSE (mean SQR error): 0.02510134
RMSE (root mean SQR error): 0.15843402
MAE (mean ABS error): 0.11758655
maxAE (max ABS error) 1.0394883
R² score: 0.96962214
MSE (mean SQR error): 0.03111367
RMSE (root mean SQR error): 0.17639066
MAE (mean ABS error): 0.12991187
maxAE (max ABS error) 1.218371

#======================#
#======================#
TEST STATS
R² score: 0.97130257
MSE (mean SQR error): 0.03251882
RMSE (root mean SQR error): 0.18032974
MAE (mean ABS error): 0.13228929
maxAE (max ABS error) 1.1780059
R² score: 0.9665479
MSE (mean SQR error): 0.04228398
RMSE (root mean SQR error): 0.20563067
MAE (mean ABS error): 0.15284078
maxAE (max ABS error) 1.322362

#======================#
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Original file line number Diff line number Diff line change
@@ -1,17 +1,19 @@
TRAIN LOSS: 0.00225263 TEST LOSS: 0.00903751
TRAIN LOSS: 0.00091016 TEST LOSS: 0.0081393
#======================#
TRAIN STATS
R² score: 0.9977584
mean SQR error: 0.00225267
mean ABS error: 0.03524051
max ABS error: 0.48005915
R² score: 0.99895054
MSE (mean SQR error): 0.00102989
RMSE (root mean SQR error): 0.03209183
MAE (mean ABS error): 0.02561593
maxAE (max ABS error) 0.20224762

#======================#
#======================#
TEST STATS
R² score: 0.99156046
mean SQR error: 0.00903739
mean ABS error: 0.0693927
max ABS error: 0.84695154
R² score: 0.9920869
MSE (mean SQR error): 0.0082745
RMSE (root mean SQR error): 0.09096426
MAE (mean ABS error): 0.06705302
maxAE (max ABS error) 0.65916514

#======================#
31 changes: 16 additions & 15 deletions examples/pdebench/darcy2d.jl
Original file line number Diff line number Diff line change
Expand Up @@ -30,18 +30,18 @@ import Lux: cpu, gpu
using Tullio, Zygote

using FFTW, LinearAlgebra
BLAS.set_num_threads(4)
FFTW.set_num_threads(8)
BLAS.set_num_threads(12)
FFTW.set_num_threads(24)

rng = Random.default_rng()
Random.seed!(rng, 345)

N = 128
E = 40
E = 100

# trajectories
_K = 128
K_ = 32
_K = 512
K_ = 128

# get data
dir = @__DIR__
Expand All @@ -55,31 +55,32 @@ V = FourierSpace(N, N)
# FNO model
###

w = 32 # width
w = 64 # width
m = (32, 32,) # modes
c = size(_data[1], 1) # in channels
o = size(_data[2], 1) # out channels
act = Lux.relu

NN = Lux.Chain(
Dense(c, w, tanh),
OpKernel(w, w, m, tanh),
OpKernel(w, w, m, tanh),
OpKernel(w, w, m, tanh),
OpKernel(w, w, m, tanh),
Dense(c, w, act),
OpKernel(w, w, m, act),
OpKernel(w, w, m, act),
OpKernel(w, w, m, act),
OpKernel(w, w, m, act),
Dense(w, o),
)

opt = Optimisers.Adam()
batchsize = 16
batchsize = 32
learning_rates = (1f-2, 1f-3, 5f-4, 2.5f-4,)
nepochs = E .* (0.25, 0.25, 0.25, 0.25,) .|> Int
dir = joinpath(@__DIR__, "model_darcy2D")
device = Lux.cpu # Lux.gpu
device = Lux.gpu

model, ST = train_model(rng, NN, _data, data_, V, opt;
batchsize, learning_rates, nepochs, dir, device)

plot_training(ST...)
plot_training(ST...) |> display

# nothing
nothing
#
Binary file modified examples/pdebench/model_darcy2D/plt_training.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
22 changes: 11 additions & 11 deletions examples/pdebench/model_darcy2D/statistics.txt
Original file line number Diff line number Diff line change
@@ -1,19 +1,19 @@
TRAIN LOSS: 0.00020804 TEST LOSS: 8.247e-5
TRAIN LOSS: 2.19e-6 TEST LOSS: 0.00093983
#======================#
TRAIN STATS
R² score: 0.76355976
MSE (mean SQR error): 0.0003289
RMSE (root mean SQR error): 0.01813549
MAE (mean ABS error): 0.00971096
maxAE (max ABS error) 0.26131245
R² score: 0.9909916
MSE (mean SQR error): 9.42e-6
RMSE (root mean SQR error): 0.00306904
MAE (mean ABS error): 0.00133418
maxAE (max ABS error) 0.08552231

#======================#
#======================#
TEST STATS
R² score: 0.85738647
MSE (mean SQR error): 0.00025149
RMSE (root mean SQR error): 0.01585852
MAE (mean ABS error): 0.00898928
maxAE (max ABS error) 0.16604173
R² score: 0.27241534
MSE (mean SQR error): 0.00044176
RMSE (root mean SQR error): 0.02101806
MAE (mean ABS error): 0.00636383
maxAE (max ABS error) 0.277627

#======================#
6 changes: 3 additions & 3 deletions src/train.jl
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,8 @@ function train_model(

# cb_batch =

# early stopping: (need fullbatch validation loss for early stopping?)
# https://github.com/jeffheaton/app_deep_learning/blob/main/t81_558_class_03_4_early_stop.ipynb
# TODO: early stopping: (need fullbatch validation loss for early stopping?)
# TODO: https://github.com/jeffheaton/app_deep_learning/blob/main/t81_558_class_03_4_early_stop.ipynb

# callback for training
cb_epoch = (p, st, epoch, nepoch; io = io) -> callback(p, st; io,
Expand Down Expand Up @@ -121,7 +121,7 @@ function train_model(
cb_stats(p, st)
end

# TODO - output a train.log file with timings
# TODO: output a train.log file with timings

# save statistics
statsfile = open(joinpath(dir, "statistics.txt"), "w")
Expand Down

0 comments on commit d391430

Please sign in to comment.