Skip to content

Commit

Permalink
update to latest ClimaCore + CUDA, fix GC issues in pipeline
Browse files Browse the repository at this point in the history
  • Loading branch information
simonbyrne committed Nov 2, 2023
1 parent 6845c98 commit f428068
Show file tree
Hide file tree
Showing 9 changed files with 95 additions and 64 deletions.
3 changes: 2 additions & 1 deletion .buildkite/JuliaProject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@ HDF5_jll = "0234f1f7-429e-5d53-9886-15a909be8d59"
MPIPreferences = "3da0fdf6-3ccc-4f1b-acd9-58baa6c99267"

[preferences.CUDA_Runtime_jll]
version = "local"
version = "12.2"
local = "true"

[preferences.HDF5_jll]
libhdf5_path = "libhdf5"
Expand Down
50 changes: 35 additions & 15 deletions .buildkite/gpu_pipeline/pipeline.yml
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
agents:
queue: clima
slurm_mem: 8G
modules: julia/1.9.3 cuda/julia-pref openmpi/4.1.5
slurm_mem_per_cpu: 16G
modules: julia/1.9.3 cuda/julia-pref openmpi/4.1.5 nsight-systems/2023.3.1

env:
OPENBLAS_NUM_THREADS: 1
JULIA_NVTX_CALLBACKS: gc
JULIA_MAX_NUM_PRECOMPILE_FILES: 100
CLIMAATMOS_GC_NSTEPS: 10
GPU_CONFIG_PATH: "config/gpu_configs/"
SLURM_GPU_BIND: none # https://github.com/open-mpi/ompi/issues/11949#issuecomment-1737712291

Expand All @@ -23,12 +24,13 @@ steps:
# force the initialization of the CUDA runtime as it is lazily loaded by default
- julia --project=examples -e 'using CUDA; CUDA.precompile_runtime()'
- julia --project=examples -e 'using CUDA; CUDA.versioninfo()'

- echo "--- Download artifacts"
- julia --project=examples artifacts/download_artifacts.jl

agents:
slurm_gpus: 1
slurm_cpus_per_task: 8
env:
JULIA_NUM_PRECOMPILE_TASKS: 8
JULIA_MAX_NUM_PRECOMPILE_FILES: 50
Expand All @@ -49,41 +51,59 @@ steps:
artifact_paths: "target_gpu_implicit_baroclinic_wave/*"
agents:
slurm_gpus: 1
slurm_time: 23:00:00
slurm_cpus_per_task: 4

- label: "gpu_aquaplanet_dyamond"
command:
- mkdir -p gpu_aquaplanet_dyamond
- >
- >
nsys profile --trace=nvtx,cuda --output=gpu_aquaplanet_dyamond/report
julia --color=yes --project=examples examples/hybrid/driver.jl
--config_file ${GPU_CONFIG_PATH}gpu_aquaplanet_dyamond.yml
artifact_paths: "gpu_aquaplanet_dyamond/*"
agents:
slurm_gpus: 1
slurm_cpus_per_task: 4

- label: "moist Held-Suarez"
key: "gpu_hs_rhoe_equilmoist_nz63_0M_55km_rs35km"
command:
- mkdir -p gpu_hs_rhoe_equilmoist_nz63_0M_55km_rs35km
key: "target_gpu_hs_rhoe_equilmoist_nz63_0M_55km_rs35km"
command:
- mkdir -p target_gpu_hs_rhoe_equilmoist_nz63_0M_55km_rs35km
- >
nsys profile --trace=nvtx,cuda --output=gpu_hs_rhoe_equilmoist_nz63_0M_55km_rs35km/report
nsys profile --trace=nvtx,cuda --output=target_gpu_hs_rhoe_equilmoist_nz63_0M_55km_rs35km/report
julia --color=yes --project=examples examples/hybrid/driver.jl
--config_file ${GPU_CONFIG_PATH}gpu_hs_rhoe_equilmoist_nz63_0M_55km_rs35km.yml
artifact_paths: "gpu_hs_rhoe_equilmoist_nz63_0M_55km_rs35km/*"
--config_file ${GPU_CONFIG_PATH}target_gpu_hs_rhoe_equilmoist_nz63_0M_55km_rs35km.yml
artifact_paths: "target_gpu_hs_rhoe_equilmoist_nz63_0M_55km_rs35km/*"
agents:
slurm_gpus: 1
slurm_cpus_per_task: 4

- label: "moist Held-Suarez - 8 gpus"
key: "target_gpu_hs_rhoe_equilmoist_nz63_0M_55km_rs35km_4process"
command:
- mkdir -p target_gpu_hs_rhoe_equilmoist_nz63_0M_55km_rs35km_4process
- >
srun --cpu-bind=cores
nsys profile --trace=nvtx,cuda --output=target_gpu_hs_rhoe_equilmoist_nz63_0M_55km_rs35km_4process/report-%q{PMI_RANK}
julia --color=yes --project=examples examples/hybrid/driver.jl
--config_file ${GPU_CONFIG_PATH}target_gpu_hs_rhoe_equilmoist_nz63_0M_55km_rs35km_4process.yml
artifact_paths: "target_gpu_hs_rhoe_equilmoist_nz63_0M_55km_rs35km_4process/*"
agents:
slurm_gpus_per_task: 1
slurm_cpus_per_task: 4
slurm_ntasks: 4

- label: "dry baroclinic wave - 4 gpus"
- label: "dry baroclinic wave - 8 gpus"
key: "target_gpu_implicit_baroclinic_wave_4process"
command:
- mkdir -p target_gpu_implicit_baroclinic_wave_4process
- >
srun
nsys profile --trace=nvtx,cuda,mpi --output=target_gpu_implicit_baroclinic_wave_4process/report-%q{PMI_RANK}
srun --cpu-bind=cores
nsys profile --trace=osrt,nvtx,cuda,mpi,ucx --output=target_gpu_implicit_baroclinic_wave_4process/report-%q{PMI_RANK}
julia --color=yes --project=examples examples/hybrid/driver.jl
--config_file ${GPU_CONFIG_PATH}target_gpu_implicit_baroclinic_wave_4process.yml
artifact_paths: "target_gpu_implicit_baroclinic_wave_4process/*"
agents:
slurm_gpus_per_task: 1
slurm_cpus_per_task: 4
slurm_ntasks: 4
1 change: 1 addition & 0 deletions .buildkite/pipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ steps:
- echo "--- Instantiate perf"
- "julia --project=perf -e 'using Pkg; Pkg.instantiate(;verbose=true)'"
- "julia --project=perf -e 'using Pkg; Pkg.precompile()'"
- "julia --project=perf --check-bounds=yes -e 'using Pkg; Pkg.precompile()'"
- "julia --project=perf -e 'using Pkg; Pkg.status()'"

- echo "--- Download artifacts"
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
dt_save_to_disk: "10days"
dt_save_to_sol: "Inf"
dt: "100secs"
t_end: "1days"
h_elem: 30
Expand All @@ -12,5 +12,5 @@ moist: "equil"
precip_model: "0M"
rayleigh_sponge: true
forcing: "held_suarez"
job_id: "gpu_hs_rhoe_equilmoist_nz63_0M_55km_rs35km"
job_id: "target_gpu_hs_rhoe_equilmoist_nz63_0M_55km_rs35km"
toml: [toml/longrun_hs_rhoe_equilmoist_nz63_0M_55km_rs35km.toml]
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
dt_save_to_sol: "Inf"
dt: "100secs"
t_end: "1days"
h_elem: 30
z_elem: 63
dz_bottom: 30.0
dz_top: 3000.0
z_max: 55000.0
kappa_4: 1.0e15
vert_diff: "true"
moist: "equil"
precip_model: "0M"
rayleigh_sponge: true
forcing: "held_suarez"
job_id: "target_gpu_hs_rhoe_equilmoist_nz63_0M_55km_rs35km_4process"
toml: [toml/longrun_hs_rhoe_equilmoist_nz63_0M_55km_rs35km.toml]
34 changes: 14 additions & 20 deletions docs/Manifest.toml
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,7 @@ weakdeps = ["SparseArrays"]
deps = ["ArgParse", "ArtifactWrappers", "Artifacts", "AtmosphericProfilesLibrary", "CLIMAParameters", "ClimaComms", "ClimaCore", "ClimaTimeSteppers", "CloudMicrophysics", "Colors", "Dates", "Dierckx", "DiffEqBase", "DiffEqCallbacks", "DocStringExtensions", "FastGaussQuadrature", "ImageFiltering", "Insolation", "Interpolations", "IntervalSets", "Krylov", "LinearAlgebra", "Logging", "NCDatasets", "NVTX", "Pkg", "Printf", "RRTMGP", "Random", "RootSolvers", "SciMLBase", "StaticArrays", "Statistics", "StatsBase", "SurfaceFluxes", "Test", "Thermodynamics", "YAML"]
path = ".."
uuid = "b2c96348-7fb7-4fe0-8da9-78d88439e717"
version = "0.16.2"
version = "0.16.3"

[[deps.ClimaComms]]
deps = ["CUDA", "MPI"]
Expand Down Expand Up @@ -268,9 +268,9 @@ version = "0.1.12"

[[deps.CloudMicrophysics]]
deps = ["CLIMAParameters", "DocStringExtensions", "ForwardDiff", "RootSolvers", "SpecialFunctions", "Thermodynamics"]
git-tree-sha1 = "3d4273c5b64ea2270115abbffbb2494572bd2583"
git-tree-sha1 = "a6993c57af74e2e0f2191b556178cc8d8c0df9d6"
uuid = "6a9e3e04-43cd-43ba-94b9-e8782df3c71b"
version = "0.15.0"
version = "0.15.1"

[[deps.ColorTypes]]
deps = ["FixedPointNumbers", "Random"]
Expand Down Expand Up @@ -460,12 +460,6 @@ git-tree-sha1 = "23163d55f885173722d1e4cf0f6110cdbaf7e272"
uuid = "b552c78f-8df3-52c6-915a-8e097449b14b"
version = "1.15.1"

[[deps.DiskArrays]]
deps = ["OffsetArrays"]
git-tree-sha1 = "1bfa9de80f35ac63c6c381b2d43c590875896a1f"
uuid = "3c3547ce-8d99-4f5e-a174-61eb10b00ae3"
version = "0.3.22"

[[deps.Distances]]
deps = ["LinearAlgebra", "Statistics", "StatsAPI"]
git-tree-sha1 = "5225c965635d8c21168e32a12954675e7bea1151"
Expand Down Expand Up @@ -495,9 +489,9 @@ version = "1.1.2"

[[deps.DocumenterCitations]]
deps = ["AbstractTrees", "Bibliography", "Dates", "Documenter", "Logging", "Markdown", "MarkdownAST", "OrderedCollections", "Unicode"]
git-tree-sha1 = "6059e3ea85130a0c69db74d56d0649656b0ef439"
git-tree-sha1 = "5bab3faaa2e87b09efd75c674c039364a6d85104"
uuid = "daee34ce-89f3-4625-b898-19384cb65244"
version = "1.3.0"
version = "1.3.1"

[[deps.Downloads]]
deps = ["ArgTools", "FileWatching", "LibCURL", "NetworkOptions"]
Expand Down Expand Up @@ -1000,9 +994,9 @@ version = "2.28.2+0"

[[deps.MicrosoftMPI_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "a7023883872e52bc29bcaac74f19adf39347d2d5"
git-tree-sha1 = "b01beb91d20b0d1312a9471a36017b5b339d26de"
uuid = "9237b28f-5490-5468-be7b-bb81f5f5e6cf"
version = "10.1.4+0"
version = "10.1.4+1"

[[deps.Missings]]
deps = ["DataAPI"]
Expand All @@ -1029,10 +1023,10 @@ uuid = "46d2c3a1-f734-5fdb-9937-b9b9aeba4221"
version = "0.2.4"

[[deps.NCDatasets]]
deps = ["CFTime", "CommonDataModel", "DataStructures", "Dates", "DiskArrays", "NetCDF_jll", "NetworkOptions", "Printf"]
git-tree-sha1 = "7fcb4378f9c648a186bcb996fa29acc929a179ed"
deps = ["CFTime", "CommonDataModel", "DataStructures", "Dates", "NetCDF_jll", "NetworkOptions", "Printf"]
git-tree-sha1 = "4263c4220f22e20729329838bf7e94a49d1ac32f"
uuid = "85f8d34a-cbdd-5861-8df4-14fed0d494ab"
version = "0.13.1"
version = "0.12.17"

[[deps.NLSolversBase]]
deps = ["DiffResults", "Distributed", "FiniteDiff", "ForwardDiff"]
Expand Down Expand Up @@ -1332,9 +1326,9 @@ version = "0.3.6"

[[deps.Scratch]]
deps = ["Dates"]
git-tree-sha1 = "30449ee12237627992a99d5e30ae63e4d78cd24a"
git-tree-sha1 = "3bac05bc7e74a75fd9cba4295cde4045d9fe2386"
uuid = "6c6a2e73-6563-6170-7368-637461726353"
version = "1.2.0"
version = "1.2.1"

[[deps.SentinelArrays]]
deps = ["Dates", "Random"]
Expand Down Expand Up @@ -1638,9 +1632,9 @@ version = "5.8.0+0"

[[deps.libevent_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "OpenSSL_jll"]
git-tree-sha1 = "7dcec01ff5bf9c2febffaa7ec00d3199172ade8b"
git-tree-sha1 = "f04ec6d9a186115fb38f858f05c0c4e1b7fc9dcb"
uuid = "1080aeaf-3a6a-583e-a51c-c537b09f60ec"
version = "2.1.13+0"
version = "2.1.13+1"

[[deps.nghttp2_jll]]
deps = ["Artifacts", "Libdl"]
Expand Down
24 changes: 12 additions & 12 deletions examples/Manifest.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

julia_version = "1.9.3"
manifest_format = "2.0"
project_hash = "97503acc47d6551f0f811a36b2d6bc5c5bc7e710"
project_hash = "314cba4e1f6458dfa3aaef9b3bde09b147ed6b03"

[[deps.ADTypes]]
git-tree-sha1 = "5d2e21d7b0d8c22f67483ef95ebdc39c0e6b6003"
Expand Down Expand Up @@ -285,7 +285,7 @@ weakdeps = ["SparseArrays"]
deps = ["ArgParse", "ArtifactWrappers", "Artifacts", "AtmosphericProfilesLibrary", "CLIMAParameters", "ClimaComms", "ClimaCore", "ClimaTimeSteppers", "CloudMicrophysics", "Colors", "Dates", "Dierckx", "DiffEqBase", "DiffEqCallbacks", "DocStringExtensions", "FastGaussQuadrature", "ImageFiltering", "Insolation", "Interpolations", "IntervalSets", "Krylov", "LinearAlgebra", "Logging", "NCDatasets", "NVTX", "Pkg", "Printf", "RRTMGP", "Random", "RootSolvers", "SciMLBase", "StaticArrays", "Statistics", "StatsBase", "SurfaceFluxes", "Test", "Thermodynamics", "YAML"]
path = ".."
uuid = "b2c96348-7fb7-4fe0-8da9-78d88439e717"
version = "0.16.2"
version = "0.16.3"

[[deps.ClimaComms]]
deps = ["CUDA", "MPI"]
Expand Down Expand Up @@ -343,9 +343,9 @@ version = "0.1.12"

[[deps.CloudMicrophysics]]
deps = ["CLIMAParameters", "DocStringExtensions", "ForwardDiff", "RootSolvers", "SpecialFunctions", "Thermodynamics"]
git-tree-sha1 = "3d4273c5b64ea2270115abbffbb2494572bd2583"
git-tree-sha1 = "a6993c57af74e2e0f2191b556178cc8d8c0df9d6"
uuid = "6a9e3e04-43cd-43ba-94b9-e8782df3c71b"
version = "0.15.0"
version = "0.15.1"

[[deps.CodeTracking]]
deps = ["InteractiveUtils", "UUIDs"]
Expand Down Expand Up @@ -427,9 +427,9 @@ version = "0.3.2"

[[deps.ConcurrentUtilities]]
deps = ["Serialization", "Sockets"]
git-tree-sha1 = "5372dbbf8f0bdb8c700db5367132925c0771ef7e"
git-tree-sha1 = "8cfa272e8bdedfa88b6aefbbca7c19f1befac519"
uuid = "f0e56b4a-5159-44fe-b623-3e5288b988bb"
version = "2.2.1"
version = "2.3.0"

[[deps.ConstructionBase]]
deps = ["LinearAlgebra"]
Expand Down Expand Up @@ -1488,9 +1488,9 @@ version = "0.3.2"

[[deps.MicrosoftMPI_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "a7023883872e52bc29bcaac74f19adf39347d2d5"
git-tree-sha1 = "b01beb91d20b0d1312a9471a36017b5b339d26de"
uuid = "9237b28f-5490-5468-be7b-bb81f5f5e6cf"
version = "10.1.4+0"
version = "10.1.4+1"

[[deps.Missings]]
deps = ["DataAPI"]
Expand Down Expand Up @@ -2082,9 +2082,9 @@ version = "0.3.6"

[[deps.Scratch]]
deps = ["Dates"]
git-tree-sha1 = "30449ee12237627992a99d5e30ae63e4d78cd24a"
git-tree-sha1 = "3bac05bc7e74a75fd9cba4295cde4045d9fe2386"
uuid = "6c6a2e73-6563-6170-7368-637461726353"
version = "1.2.0"
version = "1.2.1"

[[deps.SentinelArrays]]
deps = ["Dates", "Random"]
Expand Down Expand Up @@ -2194,9 +2194,9 @@ weakdeps = ["ChainRulesCore"]

[[deps.StableHashTraits]]
deps = ["Compat", "SHA", "Tables", "TupleTools"]
git-tree-sha1 = "30edbce1c797dc7d4c74bc07b2b6a57b891bead3"
git-tree-sha1 = "d29023a76780bb8a3f2273b29153fd00828cb73f"
uuid = "c5dd0088-6c3f-4803-b00e-f31a60c170fa"
version = "1.1.0"
version = "1.1.1"

[[deps.StackViews]]
deps = ["OffsetArrays"]
Expand Down
Loading

0 comments on commit f428068

Please sign in to comment.