Skip to content

Commit

Permalink
update to latest ClimaCore + CUDA, fix GC issues in pipeline
Browse files (browse the repository at this point in the history)
  • Loading branch information
simonbyrne committed Oct 30, 2023
1 parent 098e92c commit dbdb99a
Show file tree
Hide file tree
Showing 11 changed files with 285 additions and 268 deletions.
3 changes: 2 additions & 1 deletion .buildkite/JuliaProject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@ HDF5_jll = "0234f1f7-429e-5d53-9886-15a909be8d59"
MPIPreferences = "3da0fdf6-3ccc-4f1b-acd9-58baa6c99267"

[preferences.CUDA_Runtime_jll]
version = "local"
version = "12.2"
local = "true"

[preferences.HDF5_jll]
libhdf5_path = "libhdf5"
Expand Down
52 changes: 36 additions & 16 deletions .buildkite/gpu_pipeline/pipeline.yml
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
agents:
queue: clima
slurm_mem: 8G
modules: julia/1.9.3 cuda/julia-pref openmpi/4.1.5
slurm_mem_per_cpu: 16G
modules: julia/1.9.3 cuda/julia-pref openmpi/4.1.5 nsight-systems/2023.3.1

env:
OPENBLAS_NUM_THREADS: 1
JULIA_NVTX_CALLBACKS: gc
JULIA_MAX_NUM_PRECOMPILE_FILES: 100
CLIMAATMOS_GC_NSTEPS: 10
GPU_CONFIG_PATH: "config/gpu_configs/"
SLURM_GPU_BIND: none # https://github.com/open-mpi/ompi/issues/11949#issuecomment-1737712291

Expand All @@ -23,12 +24,13 @@ steps:
# force the initialization of the CUDA runtime as it is lazily loaded by default
- julia --project=examples -e 'using CUDA; CUDA.precompile_runtime()'
- julia --project=examples -e 'using CUDA; CUDA.versioninfo()'

- echo "--- Download artifacts"
- julia --project=examples artifacts/download_artifacts.jl

agents:
slurm_gpus: 1
slurm_cpus_per_task: 8
env:
JULIA_NUM_PRECOMPILE_TASKS: 8
JULIA_MAX_NUM_PRECOMPILE_FILES: 50
Expand All @@ -49,41 +51,59 @@ steps:
artifact_paths: "target_gpu_implicit_baroclinic_wave/*"
agents:
slurm_gpus: 1
slurm_time: 23:00:00
slurm_cpus_per_task: 4

- label: "gpu_aquaplanet_dyamond"
command:
- mkdir -p gpu_aquaplanet_dyamond
- >
- >
nsys profile --trace=nvtx,cuda --output=gpu_aquaplanet_dyamond/report
julia --color=yes --project=examples examples/hybrid/driver.jl
--config_file ${GPU_CONFIG_PATH}gpu_aquaplanet_dyamond.yml
artifact_paths: "gpu_aquaplanet_dyamond/*"
agents:
slurm_gpus: 1
slurm_cpus_per_task: 4

- label: "moist Held-Suarez"
key: "gpu_hs_rhoe_equilmoist_nz63_0M_55km_rs35km"
command:
- mkdir -p gpu_hs_rhoe_equilmoist_nz63_0M_55km_rs35km
key: "target_gpu_hs_rhoe_equilmoist_nz63_0M_55km_rs35km"
command:
- mkdir -p target_gpu_hs_rhoe_equilmoist_nz63_0M_55km_rs35km
- >
nsys profile --trace=nvtx,cuda --output=gpu_hs_rhoe_equilmoist_nz63_0M_55km_rs35km/report
nsys profile --trace=nvtx,cuda --output=target_gpu_hs_rhoe_equilmoist_nz63_0M_55km_rs35km/report
julia --color=yes --project=examples examples/hybrid/driver.jl
--config_file ${GPU_CONFIG_PATH}gpu_hs_rhoe_equilmoist_nz63_0M_55km_rs35km.yml
artifact_paths: "gpu_hs_rhoe_equilmoist_nz63_0M_55km_rs35km/*"
--config_file ${GPU_CONFIG_PATH}target_gpu_hs_rhoe_equilmoist_nz63_0M_55km_rs35km.yml
artifact_paths: "target_gpu_hs_rhoe_equilmoist_nz63_0M_55km_rs35km/*"
agents:
slurm_gpus: 1
slurm_cpus_per_task: 4

- label: "moist Held-Suarez - 8 gpus"
key: "target_gpu_hs_rhoe_equilmoist_nz63_0M_55km_rs35km_4process"
command:
- mkdir -p target_gpu_hs_rhoe_equilmoist_nz63_0M_55km_rs35km_4process
- >
srun --cpu-bind=cores
nsys profile --trace=nvtx,cuda --output=target_gpu_hs_rhoe_equilmoist_nz63_0M_55km_rs35km_4process/report-%q{PMI_RANK}
julia --color=yes --project=examples examples/hybrid/driver.jl
--config_file ${GPU_CONFIG_PATH}target_gpu_hs_rhoe_equilmoist_nz63_0M_55km_rs35km_4process.yml
artifact_paths: "target_gpu_hs_rhoe_equilmoist_nz63_0M_55km_rs35km_4process/*"
agents:
slurm_gpus_per_task: 1
slurm_cpus_per_task: 4
slurm_ntasks: 8

- label: "dry baroclinic wave - 4 gpus"
- label: "dry baroclinic wave - 8 gpus"
key: "target_gpu_implicit_baroclinic_wave_4process"
command:
- mkdir -p target_gpu_implicit_baroclinic_wave_4process
- >
srun
nsys profile --trace=nvtx,cuda,mpi --output=target_gpu_implicit_baroclinic_wave_4process/report-%q{PMI_RANK}
srun --cpu-bind=cores
nsys profile --trace=osrt,nvtx,cuda,mpi,ucx --output=target_gpu_implicit_baroclinic_wave_4process/report-%q{PMI_RANK}
julia --color=yes --project=examples examples/hybrid/driver.jl
--config_file ${GPU_CONFIG_PATH}target_gpu_implicit_baroclinic_wave_4process.yml
artifact_paths: "target_gpu_implicit_baroclinic_wave_4process/*"
agents:
slurm_gpus_per_task: 1
slurm_ntasks: 4
slurm_cpus_per_task: 4
slurm_ntasks: 8
1 change: 1 addition & 0 deletions .buildkite/pipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ steps:
- echo "--- Instantiate perf"
- "julia --project=perf -e 'using Pkg; Pkg.instantiate(;verbose=true)'"
- "julia --project=perf -e 'using Pkg; Pkg.precompile()'"
- "julia --project=perf --check-bounds=yes -e 'using Pkg; Pkg.precompile()'"
- "julia --project=perf -e 'using Pkg; Pkg.status()'"

- echo "--- Download artifacts"
Expand Down
4 changes: 2 additions & 2 deletions .dev/Manifest.toml
Original file line number Diff line number Diff line change
Expand Up @@ -79,9 +79,9 @@ version = "0.21.4"

[[deps.JuliaFormatter]]
deps = ["CSTParser", "CommonMark", "DataStructures", "Glob", "Pkg", "PrecompileTools", "Tokenize"]
git-tree-sha1 = "80031f6e58b09b0de4553bf63d9a36ec5db57967"
git-tree-sha1 = "b11c259ae111b08aabdc2ff5186dad12a95fedca"
uuid = "98e50ef6-434e-11e9-1051-2b60c6c9e899"
version = "1.0.39"
version = "1.0.41"

[[deps.LibCURL]]
deps = ["LibCURL_jll", "MozillaCACerts_jll"]
Expand Down
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ ArgParse = "1"
ArtifactWrappers = "0.2"
AtmosphericProfilesLibrary = "0.1"
CLIMAParameters = "0.7.24"
CUDA = "4"
CUDA = "4, 5"
ClimaComms = "0.5.5"
ClimaCore = "0.10.55"
ClimaTimeSteppers = "0.7.14"
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
dt_save_to_disk: "10days"
dt_save_to_sol: "Inf"
dt: "100secs"
t_end: "1days"
h_elem: 30
Expand All @@ -12,5 +12,5 @@ moist: "equil"
precip_model: "0M"
rayleigh_sponge: true
forcing: "held_suarez"
job_id: "gpu_hs_rhoe_equilmoist_nz63_0M_55km_rs35km"
job_id: "target_gpu_hs_rhoe_equilmoist_nz63_0M_55km_rs35km"
toml: [toml/longrun_hs_rhoe_equilmoist_nz63_0M_55km_rs35km.toml]
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
dt_save_to_sol: "Inf"
dt: "100secs"
t_end: "1days"
h_elem: 30
z_elem: 63
dz_bottom: 30.0
dz_top: 3000.0
z_max: 55000.0
kappa_4: 1.0e15
vert_diff: "true"
moist: "equil"
precip_model: "0M"
rayleigh_sponge: true
forcing: "held_suarez"
job_id: "target_gpu_hs_rhoe_equilmoist_nz63_0M_55km_rs35km_4process"
toml: [toml/longrun_hs_rhoe_equilmoist_nz63_0M_55km_rs35km.toml]
Loading

0 comments on commit dbdb99a

Please sign in to comment.