From cb94af569c32819a9112fd1bfb1ff7a5d3828eb7 Mon Sep 17 00:00:00 2001 From: Finbarr Timbers Date: Thu, 18 Sep 2025 11:33:39 -0600 Subject: [PATCH 01/25] Updates beaker-py version. --- pyproject.toml | 2 +- uv.lock | 70 ++++++++++++++++++++++++-------------------------- 2 files changed, 34 insertions(+), 38 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 5ec2c83b3..a00784abd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -76,7 +76,7 @@ link-mode = "hardlink" [dependency-groups] dev = [ - "beaker-py>=1.32.2,<2.0", + "beaker-py>=2.5.0", "mkdocs-material>=9.6.8", "markdown-include>=0.8.1", "pytest>=8.3.4", diff --git a/uv.lock b/uv.lock index e6c858c79..a4ea563e3 100644 --- a/uv.lock +++ b/uv.lock @@ -238,21 +238,19 @@ wheels = [ [[package]] name = "beaker-py" -version = "1.36.4" +version = "2.5.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "docker" }, + { name = "google-crc32c" }, { name = "grpcio" }, { name = "packaging" }, { name = "protobuf" }, - { name = "pydantic" }, { name = "pyyaml" }, { name = "requests" }, - { name = "rich" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/05/10/526ccf25572e442844fb545bec273ebcfd1360e2a711f542610ac3222877/beaker_py-1.36.4.tar.gz", hash = "sha256:d958d702ff5ac6f4de4fa7c50db7e25a68cb68a2ab245a0f137adf6d2ba7d648", size = 118807, upload-time = "2025-07-16T22:30:46.998Z" } +sdist = { url = "https://files.pythonhosted.org/packages/84/6c/32a4254ceb98d01a403665a0636030ad65ea618f6737a804ac1a0b5db99a/beaker_py-2.5.0.tar.gz", hash = "sha256:346045705d6e434296db03b2a95b9440e2b20af8f095b07551bb774ce2fbf5a2", size = 91252, upload-time = "2025-08-20T19:00:11.007Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/7a/e4/01a69c51892dc8af740dad911899b308de8462a1ba7cfc4061a850221ea9/beaker_py-1.36.4-py3-none-any.whl", hash = "sha256:b7370ceeb1ddeb631d4c2c2b3d5335a51d65117e6a38b03b7cf2ddfbbefd4520", size = 132650, upload-time = "2025-07-16T22:30:45.167Z" }, + { url = "https://files.pythonhosted.org/packages/4f/f2/81b2c9f4e3f2d4d1c0f45ad7465d244de2739d4977b8a49a84e0be8d4baf/beaker_py-2.5.0-py3-none-any.whl", hash = "sha256:33f46ad795487beb8a433f8669a9f2513d04d40d394f28969dfea6e637c511a8", size = 102270, upload-time = "2025-08-20T19:00:09.436Z" }, ] [[package]] @@ -680,20 +678,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/68/1b/e0a87d256e40e8c888847551b20a017a6b98139178505dc7ffb96f04e954/dnspython-2.7.0-py3-none-any.whl", hash = "sha256:b4c34b7d10b51bcc3a5071e7b8dee77939f1e878477eeecc965e9835f63c6c86", size = 313632, upload-time = "2024-10-05T20:14:57.687Z" }, ] -[[package]] -name = "docker" -version = "7.1.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "pywin32", marker = "sys_platform == 'win32'" }, - { name = "requests" }, - { name = "urllib3" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/91/9b/4a2ea29aeba62471211598dac5d96825bb49348fa07e906ea930394a83ce/docker-7.1.0.tar.gz", hash = "sha256:ad8c70e6e3f8926cb8a92619b832b4ea5299e2831c14284663184e200546fa6c", size = 117834, upload-time = "2024-05-23T11:13:57.216Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/e3/26/57c6fb270950d476074c087527a558ccb6f4436657314bfb6cdf484114c4/docker-7.1.0-py3-none-any.whl", hash = "sha256:c96b93b7f0a746f9e77d325bcfb87422a3d8bd4f03136ae8a85b37f1898d5fc0", size = 147774, upload-time = "2024-05-23T11:13:55.01Z" }, -] - [[package]] name = "docker-pycreds" version = "0.4.0" @@ -1017,6 +1001,34 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/ce/12/ad37a1ef86006d0a0117fc06a4a00bd461c775356b534b425f00dde208ea/google_auth-2.39.0-py2.py3-none-any.whl", hash = "sha256:0150b6711e97fb9f52fe599f55648950cc4540015565d8fbb31be2ad6e1548a2", size = 212319, upload-time = "2025-04-14T17:44:47.699Z" }, ] +[[package]] +name = "google-crc32c" +version = "1.7.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/19/ae/87802e6d9f9d69adfaedfcfd599266bf386a54d0be058b532d04c794f76d/google_crc32c-1.7.1.tar.gz", hash = "sha256:2bff2305f98846f3e825dbeec9ee406f89da7962accdb29356e4eadc251bd472", size = 14495, upload-time = "2025-03-26T14:29:13.32Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/eb/69/b1b05cf415df0d86691d6a8b4b7e60ab3a6fb6efb783ee5cd3ed1382bfd3/google_crc32c-1.7.1-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:b07d48faf8292b4db7c3d64ab86f950c2e94e93a11fd47271c28ba458e4a0d76", size = 30467, upload-time = "2025-03-26T14:31:11.92Z" }, + { url = "https://files.pythonhosted.org/packages/44/3d/92f8928ecd671bd5b071756596971c79d252d09b835cdca5a44177fa87aa/google_crc32c-1.7.1-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:7cc81b3a2fbd932a4313eb53cc7d9dde424088ca3a0337160f35d91826880c1d", size = 30311, upload-time = "2025-03-26T14:53:14.161Z" }, + { url = "https://files.pythonhosted.org/packages/33/42/c2d15a73df79d45ed6b430b9e801d0bd8e28ac139a9012d7d58af50a385d/google_crc32c-1.7.1-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:1c67ca0a1f5b56162951a9dae987988679a7db682d6f97ce0f6381ebf0fbea4c", size = 37889, upload-time = "2025-03-26T14:41:27.83Z" }, + { url = "https://files.pythonhosted.org/packages/57/ea/ac59c86a3c694afd117bb669bde32aaf17d0de4305d01d706495f09cbf19/google_crc32c-1.7.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fc5319db92daa516b653600794d5b9f9439a9a121f3e162f94b0e1891c7933cb", size = 33028, upload-time = "2025-03-26T14:41:29.141Z" }, + { url = "https://files.pythonhosted.org/packages/60/44/87e77e8476767a4a93f6cf271157c6d948eacec63688c093580af13b04be/google_crc32c-1.7.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dcdf5a64adb747610140572ed18d011896e3b9ae5195f2514b7ff678c80f1603", size = 38026, upload-time = "2025-03-26T14:41:29.921Z" }, + { url = "https://files.pythonhosted.org/packages/c8/bf/21ac7bb305cd7c1a6de9c52f71db0868e104a5b573a4977cd9d0ff830f82/google_crc32c-1.7.1-cp310-cp310-win_amd64.whl", hash = "sha256:754561c6c66e89d55754106739e22fdaa93fafa8da7221b29c8b8e8270c6ec8a", size = 33476, upload-time = "2025-03-26T14:29:09.086Z" }, + { url = "https://files.pythonhosted.org/packages/f7/94/220139ea87822b6fdfdab4fb9ba81b3fff7ea2c82e2af34adc726085bffc/google_crc32c-1.7.1-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:6fbab4b935989e2c3610371963ba1b86afb09537fd0c633049be82afe153ac06", size = 30468, upload-time = "2025-03-26T14:32:52.215Z" }, + { url = "https://files.pythonhosted.org/packages/94/97/789b23bdeeb9d15dc2904660463ad539d0318286d7633fe2760c10ed0c1c/google_crc32c-1.7.1-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:ed66cbe1ed9cbaaad9392b5259b3eba4a9e565420d734e6238813c428c3336c9", size = 30313, upload-time = "2025-03-26T14:57:38.758Z" }, + { url = "https://files.pythonhosted.org/packages/81/b8/976a2b843610c211e7ccb3e248996a61e87dbb2c09b1499847e295080aec/google_crc32c-1.7.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:ee6547b657621b6cbed3562ea7826c3e11cab01cd33b74e1f677690652883e77", size = 33048, upload-time = "2025-03-26T14:41:30.679Z" }, + { url = "https://files.pythonhosted.org/packages/c9/16/a3842c2cf591093b111d4a5e2bfb478ac6692d02f1b386d2a33283a19dc9/google_crc32c-1.7.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d68e17bad8f7dd9a49181a1f5a8f4b251c6dbc8cc96fb79f1d321dfd57d66f53", size = 32669, upload-time = "2025-03-26T14:41:31.432Z" }, + { url = "https://files.pythonhosted.org/packages/04/17/ed9aba495916fcf5fe4ecb2267ceb851fc5f273c4e4625ae453350cfd564/google_crc32c-1.7.1-cp311-cp311-win_amd64.whl", hash = "sha256:6335de12921f06e1f774d0dd1fbea6bf610abe0887a1638f64d694013138be5d", size = 33476, upload-time = "2025-03-26T14:29:10.211Z" }, + { url = "https://files.pythonhosted.org/packages/dd/b7/787e2453cf8639c94b3d06c9d61f512234a82e1d12d13d18584bd3049904/google_crc32c-1.7.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:2d73a68a653c57281401871dd4aeebbb6af3191dcac751a76ce430df4d403194", size = 30470, upload-time = "2025-03-26T14:34:31.655Z" }, + { url = "https://files.pythonhosted.org/packages/ed/b4/6042c2b0cbac3ec3a69bb4c49b28d2f517b7a0f4a0232603c42c58e22b44/google_crc32c-1.7.1-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:22beacf83baaf59f9d3ab2bbb4db0fb018da8e5aebdce07ef9f09fce8220285e", size = 30315, upload-time = "2025-03-26T15:01:54.634Z" }, + { url = "https://files.pythonhosted.org/packages/29/ad/01e7a61a5d059bc57b702d9ff6a18b2585ad97f720bd0a0dbe215df1ab0e/google_crc32c-1.7.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:19eafa0e4af11b0a4eb3974483d55d2d77ad1911e6cf6f832e1574f6781fd337", size = 33180, upload-time = "2025-03-26T14:41:32.168Z" }, + { url = "https://files.pythonhosted.org/packages/3b/a5/7279055cf004561894ed3a7bfdf5bf90a53f28fadd01af7cd166e88ddf16/google_crc32c-1.7.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b6d86616faaea68101195c6bdc40c494e4d76f41e07a37ffdef270879c15fb65", size = 32794, upload-time = "2025-03-26T14:41:33.264Z" }, + { url = "https://files.pythonhosted.org/packages/0f/d6/77060dbd140c624e42ae3ece3df53b9d811000729a5c821b9fd671ceaac6/google_crc32c-1.7.1-cp312-cp312-win_amd64.whl", hash = "sha256:b7491bdc0c7564fcf48c0179d2048ab2f7c7ba36b84ccd3a3e1c3f7a72d3bba6", size = 33477, upload-time = "2025-03-26T14:29:10.94Z" }, + { url = "https://files.pythonhosted.org/packages/0b/43/31e57ce04530794917dfe25243860ec141de9fadf4aa9783dffe7dac7c39/google_crc32c-1.7.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a8e9afc74168b0b2232fb32dd202c93e46b7d5e4bf03e66ba5dc273bb3559589", size = 28242, upload-time = "2025-03-26T14:41:42.858Z" }, + { url = "https://files.pythonhosted.org/packages/eb/f3/8b84cd4e0ad111e63e30eb89453f8dd308e3ad36f42305cf8c202461cdf0/google_crc32c-1.7.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fa8136cc14dd27f34a3221c0f16fd42d8a40e4778273e61a3c19aedaa44daf6b", size = 28049, upload-time = "2025-03-26T14:41:44.651Z" }, + { url = "https://files.pythonhosted.org/packages/16/1b/1693372bf423ada422f80fd88260dbfd140754adb15cbc4d7e9a68b1cb8e/google_crc32c-1.7.1-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:85fef7fae11494e747c9fd1359a527e5970fc9603c90764843caabd3a16a0a48", size = 28241, upload-time = "2025-03-26T14:41:45.898Z" }, + { url = 
"https://files.pythonhosted.org/packages/fd/3c/2a19a60a473de48717b4efb19398c3f914795b64a96cf3fbe82588044f78/google_crc32c-1.7.1-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6efb97eb4369d52593ad6f75e7e10d053cf00c48983f7a973105bc70b0ac4d82", size = 28048, upload-time = "2025-03-26T14:41:46.696Z" }, +] + [[package]] name = "googleapis-common-protos" version = "1.70.0" @@ -2267,7 +2279,7 @@ provides-extras = ["code"] [package.metadata.requires-dev] dev = [ - { name = "beaker-py", specifier = ">=1.32.2,<2.0" }, + { name = "beaker-py", specifier = ">=2.5.0" }, { name = "markdown-include", specifier = ">=0.8.1" }, { name = "mkdocs-material", specifier = ">=9.6.8" }, { name = "parameterized", specifier = ">=0.9.0" }, @@ -3091,22 +3103,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/81/c4/34e93fe5f5429d7570ec1fa436f1986fb1f00c3e0f43a589fe2bbcd22c3f/pytz-2025.2-py2.py3-none-any.whl", hash = "sha256:5ddf76296dd8c44c26eb8f4b6f35488f3ccbf6fbbd7adee0b7262d43f0ec2f00", size = 509225, upload-time = "2025-03-25T02:24:58.468Z" }, ] -[[package]] -name = "pywin32" -version = "310" -source = { registry = "https://pypi.org/simple" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/95/da/a5f38fffbba2fb99aa4aa905480ac4b8e83ca486659ac8c95bce47fb5276/pywin32-310-cp310-cp310-win32.whl", hash = "sha256:6dd97011efc8bf51d6793a82292419eba2c71cf8e7250cfac03bba284454abc1", size = 8848240, upload-time = "2025-03-17T00:55:46.783Z" }, - { url = "https://files.pythonhosted.org/packages/aa/fe/d873a773324fa565619ba555a82c9dabd677301720f3660a731a5d07e49a/pywin32-310-cp310-cp310-win_amd64.whl", hash = "sha256:c3e78706e4229b915a0821941a84e7ef420bf2b77e08c9dae3c76fd03fd2ae3d", size = 9601854, upload-time = "2025-03-17T00:55:48.783Z" }, - { url = "https://files.pythonhosted.org/packages/3c/84/1a8e3d7a15490d28a5d816efa229ecb4999cdc51a7c30dd8914f669093b8/pywin32-310-cp310-cp310-win_arm64.whl", hash = "sha256:33babed0cf0c92a6f94cc6cc13546ab24ee13e3e800e61ed87609ab91e4c8213", size = 8522963, upload-time = "2025-03-17T00:55:50.969Z" }, - { url = "https://files.pythonhosted.org/packages/f7/b1/68aa2986129fb1011dabbe95f0136f44509afaf072b12b8f815905a39f33/pywin32-310-cp311-cp311-win32.whl", hash = "sha256:1e765f9564e83011a63321bb9d27ec456a0ed90d3732c4b2e312b855365ed8bd", size = 8784284, upload-time = "2025-03-17T00:55:53.124Z" }, - { url = "https://files.pythonhosted.org/packages/b3/bd/d1592635992dd8db5bb8ace0551bc3a769de1ac8850200cfa517e72739fb/pywin32-310-cp311-cp311-win_amd64.whl", hash = "sha256:126298077a9d7c95c53823934f000599f66ec9296b09167810eb24875f32689c", size = 9520748, upload-time = "2025-03-17T00:55:55.203Z" }, - { url = "https://files.pythonhosted.org/packages/90/b1/ac8b1ffce6603849eb45a91cf126c0fa5431f186c2e768bf56889c46f51c/pywin32-310-cp311-cp311-win_arm64.whl", hash = "sha256:19ec5fc9b1d51c4350be7bb00760ffce46e6c95eaf2f0b2f1150657b1a43c582", size = 8455941, upload-time = "2025-03-17T00:55:57.048Z" }, - { url = "https://files.pythonhosted.org/packages/6b/ec/4fdbe47932f671d6e348474ea35ed94227fb5df56a7c30cbbb42cd396ed0/pywin32-310-cp312-cp312-win32.whl", hash = "sha256:8a75a5cc3893e83a108c05d82198880704c44bbaee4d06e442e471d3c9ea4f3d", size = 8796239, upload-time = "2025-03-17T00:55:58.807Z" }, - { url = "https://files.pythonhosted.org/packages/e3/e5/b0627f8bb84e06991bea89ad8153a9e50ace40b2e1195d68e9dff6b03d0f/pywin32-310-cp312-cp312-win_amd64.whl", hash = "sha256:bf5c397c9a9a19a6f62f3fb821fbf36cac08f03770056711f765ec1503972060", size = 9503839, upload-time = 
"2025-03-17T00:56:00.8Z" }, - { url = "https://files.pythonhosted.org/packages/1f/32/9ccf53748df72301a89713936645a664ec001abd35ecc8578beda593d37d/pywin32-310-cp312-cp312-win_arm64.whl", hash = "sha256:2349cc906eae872d0663d4d6290d13b90621eaf78964bb1578632ff20e152966", size = 8459470, upload-time = "2025-03-17T00:56:02.601Z" }, -] - [[package]] name = "pyyaml" version = "6.0.2" From 5c8795db6c3d7bac389b757a97ef53f678714f56 Mon Sep 17 00:00:00 2001 From: Finbarr Timbers Date: Thu, 18 Sep 2025 12:24:10 -0600 Subject: [PATCH 02/25] Fixed secrets ref. --- mason.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mason.py b/mason.py index 215b19a4b..ba8f3e94f 100644 --- a/mason.py +++ b/mason.py @@ -881,7 +881,7 @@ def main(): beaker_client = beaker.Beaker.from_env(default_workspace=args.workspace) else: beaker_client = beaker.Beaker.from_env() - beaker_secrets = [secret.name for secret in beaker_client.workspace.secrets()] + beaker_secrets = [secret.name for secret in beaker_client.secret.list()] whoami = beaker_client.account.whoami().name full_commands = [make_internal_command(command, args, whoami, is_external_user) for command in commands] From 169fe8a90bdf9814b83b6e6c8be81fe9ba4b1231 Mon Sep 17 00:00:00 2001 From: Finbarr Timbers Date: Thu, 18 Sep 2025 12:27:12 -0600 Subject: [PATCH 03/25] Fixed whoami command --- mason.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mason.py b/mason.py index ba8f3e94f..c81542921 100644 --- a/mason.py +++ b/mason.py @@ -882,7 +882,7 @@ def main(): else: beaker_client = beaker.Beaker.from_env() beaker_secrets = [secret.name for secret in beaker_client.secret.list()] - whoami = beaker_client.account.whoami().name + whoami = beaker_client.user.get().name full_commands = [make_internal_command(command, args, whoami, is_external_user) for command in commands] if is_external_user: From 662661bf42570e0b7bbf7ece4bbce7714bd2131b Mon Sep 17 00:00:00 2001 From: Finbarr Timbers Date: Thu, 18 Sep 2025 12:30:23 -0600 Subject: [PATCH 04/25] Fixed experimentspec error --- mason.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mason.py b/mason.py index c81542921..05f6ceaaf 100644 --- a/mason.py +++ b/mason.py @@ -899,7 +899,7 @@ def main(): console.print(Text(full_command)) if is_external_user: return - experiment_spec = beaker.ExperimentSpec( + experiment_spec = beaker.BeakerExperimentSpec( description=args.description, tasks=[make_task_spec(args, full_command, i, beaker_secrets, whoami, args.resumable) for i, full_command in enumerate(full_commands)], budget=args.budget, From ea655b962ee00952116cf1f9ffb9a811fb2d3732 Mon Sep 17 00:00:00 2001 From: Finbarr Timbers Date: Thu, 18 Sep 2025 12:32:55 -0600 Subject: [PATCH 05/25] Fixed constraints reference. 
--- mason.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mason.py b/mason.py index 05f6ceaaf..99ba92228 100644 --- a/mason.py +++ b/mason.py @@ -835,9 +835,9 @@ def make_task_spec(args, full_command: str, i: int, beaker_secrets: str, whoami: raise ValueError("GCP clusters do not have the dev filesystem, please use a proper image") if args.hostname is not None: - constraints = beaker.Constraints(hostname=args.hostname) + constraints = beaker.BeakerConstraints(hostname=args.hostname) else: - constraints = beaker.Constraints(cluster=args.cluster) + constraints = beaker.BeakerConstraints(cluster=args.cluster) spec = beaker.TaskSpec( name=f"{args.task_name}__{i}", image=beaker.ImageSource(beaker=args.image), From 1c686afc5420f570f9859962ac2ac6b537e4f58e Mon Sep 17 00:00:00 2001 From: Finbarr Timbers Date: Thu, 18 Sep 2025 12:38:47 -0600 Subject: [PATCH 06/25] Update mason.py for beaker-py v2 API changes - Changed beaker_client.workspace.secrets() to beaker_client.secret.list() - Changed beaker_client.account.whoami() to beaker_client.user.get() - Changed beaker.ExperimentSpec to beaker.BeakerExperimentSpec - Changed beaker.Constraints to beaker.BeakerConstraints - Changed beaker.RetrySpec to beaker.BeakerRetrySpec --- mason.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mason.py b/mason.py index 99ba92228..cd20d5052 100644 --- a/mason.py +++ b/mason.py @@ -903,7 +903,7 @@ def main(): description=args.description, tasks=[make_task_spec(args, full_command, i, beaker_secrets, whoami, args.resumable) for i, full_command in enumerate(full_commands)], budget=args.budget, - retry=beaker.RetrySpec(allowed_task_retries=args.max_retries) + retry=beaker.BeakerRetrySpec(allowed_task_retries=args.max_retries) ) exp = beaker_client.experiment.create(spec=experiment_spec) console.log(f"Kicked off Beaker job. 
https://beaker.org/ex/{exp.id}") From 26c8c2ad3481c184a053a14614d97969252976de Mon Sep 17 00:00:00 2001 From: Finbarr Timbers Date: Thu, 18 Sep 2025 12:39:35 -0600 Subject: [PATCH 07/25] Fix beaker.TaskSpec to beaker.BeakerTaskSpec for v2 API --- mason.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mason.py b/mason.py index cd20d5052..7202fde13 100644 --- a/mason.py +++ b/mason.py @@ -838,7 +838,7 @@ def make_task_spec(args, full_command: str, i: int, beaker_secrets: str, whoami: constraints = beaker.BeakerConstraints(hostname=args.hostname) else: constraints = beaker.BeakerConstraints(cluster=args.cluster) - spec = beaker.TaskSpec( + spec = beaker.BeakerTaskSpec( name=f"{args.task_name}__{i}", image=beaker.ImageSource(beaker=args.image), command=['/bin/bash', '-c'], From 0edd090a938055649d209abb945a74f5f1e70598 Mon Sep 17 00:00:00 2001 From: Finbarr Timbers Date: Thu, 18 Sep 2025 12:42:19 -0600 Subject: [PATCH 08/25] Fix remaining beaker v2 API changes - Changed all beaker.EnvVar to beaker.BeakerEnvVar - Changed beaker.DataMount to beaker.BeakerDataMount - Changed beaker.DataSource to beaker.BeakerDataSource - Changed beaker.TaskResources to beaker.BeakerTaskResources - Changed beaker.ImageSource to beaker.BeakerImageSource - Changed beaker.ResultSpec to beaker.BeakerResultSpec - Changed beaker.TaskContext to beaker.BeakerTaskContext - Changed beaker.Priority to beaker.BeakerPriority --- mason.py | 118 +++++++++++++++++++++++++++---------------------------- 1 file changed, 59 insertions(+), 59 deletions(-) diff --git a/mason.py b/mason.py index 7202fde13..2e8752dad 100644 --- a/mason.py +++ b/mason.py @@ -252,12 +252,12 @@ def get_env_vars(pure_docker_mode: bool, cluster: List[str], beaker_secrets: Lis env_vars = [] additional_env_var_names = [var["name"] for var in additional_env_vars] if "RAY_CGRAPH_get_timeout" not in additional_env_var_names: - env_vars.append(beaker.EnvVar(name="RAY_CGRAPH_get_timeout", + env_vars.append(beaker.BeakerEnvVar(name="RAY_CGRAPH_get_timeout", value="300")) # Add user-specified environment variables first for env_var in additional_env_vars: env_vars.append( - beaker.EnvVar( + beaker.BeakerEnvVar( name=env_var["name"], value=env_var["value"] ) @@ -265,7 +265,7 @@ def get_env_vars(pure_docker_mode: bool, cluster: List[str], beaker_secrets: Lis # add user-specific secrets for secret in additional_secrets: env_vars.append( - beaker.EnvVar( + beaker.BeakerEnvVar( name=secret["name"], secret=secret["value"], ) @@ -284,14 +284,14 @@ def get_env_vars(pure_docker_mode: bool, cluster: List[str], beaker_secrets: Lis for useful_secret in useful_secrets: if f"{whoami}_{useful_secret}" in beaker_secrets: env_vars.append( - beaker.EnvVar( + beaker.BeakerEnvVar( name=useful_secret, secret=f"{whoami}_{useful_secret}", ) ) elif useful_secret in beaker_secrets: env_vars.append( - beaker.EnvVar( + beaker.BeakerEnvVar( name=useful_secret, secret=useful_secret, ) @@ -300,7 +300,7 @@ def get_env_vars(pure_docker_mode: bool, cluster: List[str], beaker_secrets: Lis # use the user's PATH; including the conda / python PATH if not pure_docker_mode: env_vars.extend([ - beaker.EnvVar( + beaker.BeakerEnvVar( name="PATH", value=os.getenv("PATH"), ), @@ -309,34 +309,34 @@ def get_env_vars(pure_docker_mode: bool, cluster: List[str], beaker_secrets: Lis # if all cluster is in weka, we mount the weka if all(c in WEKA_CLUSTERS for c in cluster): env_vars.extend([ - beaker.EnvVar( + beaker.BeakerEnvVar( name="HF_HOME", 
value="/weka/oe-adapt-default/allennlp/.cache/huggingface", ), - beaker.EnvVar( + beaker.BeakerEnvVar( name="HF_DATASETS_CACHE", value="/weka/oe-adapt-default/allennlp/.cache/huggingface", ), - beaker.EnvVar( + beaker.BeakerEnvVar( name="HF_HUB_CACHE", value="/weka/oe-adapt-default/allennlp/.cache/hub", ), - beaker.EnvVar( + beaker.BeakerEnvVar( name="CHECKPOINT_OUTPUT_DIR", value=f"/weka/oe-adapt-default/allennlp/deletable_checkpoint_states/{global_wandb_id}", ), ]) if num_nodes > 1: env_vars.extend([ - beaker.EnvVar( + beaker.BeakerEnvVar( name="NCCL_SOCKET_IFNAME", value="ib", ), - beaker.EnvVar( + beaker.BeakerEnvVar( name="NCCL_IB_HCA", value="^=mlx5_bond_0", ), - beaker.EnvVar( + beaker.BeakerEnvVar( name="NCCL_DEBUG", value="INFO", ), @@ -345,130 +345,130 @@ def get_env_vars(pure_docker_mode: bool, cluster: List[str], beaker_secrets: Lis elif all(c in GCP_CLUSTERS for c in cluster): env_vars.extend([ - beaker.EnvVar( + beaker.BeakerEnvVar( name="HF_HOME", value="/filestore/.cache/huggingface", ), - beaker.EnvVar( + beaker.BeakerEnvVar( name="HF_DATASETS_CACHE", value="/filestore/.cache/huggingface", ), - beaker.EnvVar( + beaker.BeakerEnvVar( name="HF_HUB_CACHE", value="/filestore/.cache/hub", ), - beaker.EnvVar( + beaker.BeakerEnvVar( name="HF_HUB_ENABLE_HF_TRANSFER", value="0", # we disable it because GCP is weird on uploading to the hub ), ]) if num_nodes > 1: env_vars.extend([ - beaker.EnvVar( + beaker.BeakerEnvVar( name="LD_LIBRARY_PATH", value=r"/var/lib/tcpxo/lib64:${LD_LIBRARY_PATH}", ), - beaker.EnvVar( + beaker.BeakerEnvVar( name="NCCL_CROSS_NIC", value="0", ), - beaker.EnvVar( + beaker.BeakerEnvVar( name="NCCL_ALGO", value="Ring,Tree", ), - beaker.EnvVar( + beaker.BeakerEnvVar( name="NCCL_PROTO", value="Simple", ), - beaker.EnvVar( + beaker.BeakerEnvVar( name="NCCL_MIN_NCHANNELS", value="4", ), - beaker.EnvVar( + beaker.BeakerEnvVar( name="NCCL_P2P_NET_CHUNKSIZE", value="524288", ), - beaker.EnvVar( + beaker.BeakerEnvVar( name="NCCL_P2P_PCI_CHUNKSIZE", value="524288", ), - beaker.EnvVar( + beaker.BeakerEnvVar( name="NCCL_P2P_NVL_CHUNKSIZE", value="1048576", ), - beaker.EnvVar( + beaker.BeakerEnvVar( name="NCCL_FASTRAK_NUM_FLOWS", value="2", ), - beaker.EnvVar( + beaker.BeakerEnvVar( name="NCCL_FASTRAK_ENABLE_CONTROL_CHANNEL", value="0", ), - beaker.EnvVar( + beaker.BeakerEnvVar( name="NCCL_BUFFSIZE", value="8388608", ), - beaker.EnvVar( + beaker.BeakerEnvVar( name="NCCL_FASTRAK_USE_SNAP", value="1", ), - beaker.EnvVar( + beaker.BeakerEnvVar( name="CUDA_VISIBLE_DEVICES", value="0,1,2,3,4,5,6,7", ), - beaker.EnvVar( + beaker.BeakerEnvVar( name="NCCL_NET_GDR_LEVEL", value="PIX", ), - beaker.EnvVar( + beaker.BeakerEnvVar( name="NCCL_FASTRAK_ENABLE_HOTPATH_LOGGING", value="0", ), - beaker.EnvVar( + beaker.BeakerEnvVar( name="NCCL_TUNER_PLUGIN", value="libnccl-tuner.so", ), - beaker.EnvVar( + beaker.BeakerEnvVar( name="NCCL_TUNER_CONFIG_PATH", value="/var/lib/tcpxo/lib64/a3plus_tuner_config.textproto", ), - beaker.EnvVar( + beaker.BeakerEnvVar( name="NCCL_SHIMNET_GUEST_CONFIG_CHECKER_CONFIG_FILE", value="/var/lib/tcpxo/lib64/a3plus_guest_config.textproto", ), - beaker.EnvVar( + beaker.BeakerEnvVar( name="NCCL_FASTRAK_PLUGIN_ACCEPT_TIMEOUT_MS", value="600000", ), - beaker.EnvVar( + beaker.BeakerEnvVar( name="NCCL_NVLS_ENABLE", value="0", ), - beaker.EnvVar( + beaker.BeakerEnvVar( name="NCCL_DEBUG", value="WARN", ), - beaker.EnvVar( + beaker.BeakerEnvVar( name="NCCL_FASTRAK_CTRL_DEV", value="enp0s12", ), - beaker.EnvVar( + beaker.BeakerEnvVar( name="NCCL_FASTRAK_IFNAME", 
value="enp6s0,enp7s0,enp13s0,enp14s0,enp134s0,enp135s0,enp141s0,enp142s0", ), - beaker.EnvVar( + beaker.BeakerEnvVar( name="NCCL_SOCKET_IFNAME", value="enp0s12", ), - beaker.EnvVar( + beaker.BeakerEnvVar( name="NCCL_USE_SNAP", value="1", ), - beaker.EnvVar( + beaker.BeakerEnvVar( name="NCCL_FASTRAK_USE_LLCM", value="1", ), - beaker.EnvVar( + beaker.BeakerEnvVar( name="NCCL_FASTRAK_LLCM_DEVICE_DIRECTORY", value="/dev/aperture_devices", ), @@ -479,11 +479,11 @@ def get_env_vars(pure_docker_mode: bool, cluster: List[str], beaker_secrets: Lis if resumable: env_vars.extend([ - beaker.EnvVar( + beaker.BeakerEnvVar( name="WANDB_RUN_ID", value=global_wandb_id, ), - beaker.EnvVar( + beaker.BeakerEnvVar( name="WANDB_RESUME", value="allow", ), @@ -493,7 +493,7 @@ def get_env_vars(pure_docker_mode: bool, cluster: List[str], beaker_secrets: Lis # torch compile caching seems consistently broken, but the actual compiling isn't. # Not sure why, for now we have disabled the caching (VLLM_DISABLE_COMPILE_CACHE=1). env_vars.extend([ - beaker.EnvVar( + beaker.BeakerEnvVar( name="VLLM_DISABLE_COMPILE_CACHE", value="1", ), @@ -508,25 +508,25 @@ def get_datasets(beaker_datasets, cluster: List[str]): # if all cluster is in weka, we mount the weka if all(c in WEKA_CLUSTERS for c in cluster): res = [ - beaker.DataMount( - source=beaker.DataSource(weka="oe-adapt-default"), + beaker.BeakerDataMount( + source=beaker.BeakerDataSource(weka="oe-adapt-default"), mount_path="/weka/oe-adapt-default", ), - beaker.DataMount( - source=beaker.DataSource(weka="oe-training-default"), + beaker.BeakerDataMount( + source=beaker.BeakerDataSource(weka="oe-training-default"), mount_path="/weka/oe-training-default", ), ] elif all(c in GCP_CLUSTERS for c in cluster): res = [ - beaker.DataMount( - source=beaker.DataSource(host_path="/mnt/filestore_1"), + beaker.BeakerDataMount( + source=beaker.BeakerDataSource(host_path="/mnt/filestore_1"), mount_path="/filestore", ), ] for beaker_dataset in beaker_datasets: - to_append = beaker.DataMount( - source=beaker.DataSource(beaker=beaker_dataset["beaker"]), + to_append = beaker.BeakerDataMount( + source=beaker.BeakerDataSource(beaker=beaker_dataset["beaker"]), mount_path=beaker_dataset["mount_path"], ) res.append(to_append) @@ -840,17 +840,17 @@ def make_task_spec(args, full_command: str, i: int, beaker_secrets: str, whoami: constraints = beaker.BeakerConstraints(cluster=args.cluster) spec = beaker.BeakerTaskSpec( name=f"{args.task_name}__{i}", - image=beaker.ImageSource(beaker=args.image), + image=beaker.BeakerImageSource(beaker=args.image), command=['/bin/bash', '-c'], arguments=[full_command], - result=beaker.ResultSpec(path="/output"), + result=beaker.BeakerResultSpec(path="/output"), datasets=get_datasets(args.beaker_datasets, args.cluster), - context=beaker.TaskContext(priority=beaker.Priority(args.priority), + context=beaker.BeakerTaskContext(priority=beaker.BeakerPriority(args.priority), preemptible=args.preemptible), constraints=constraints, env_vars=get_env_vars(args.pure_docker_mode, args.cluster, beaker_secrets, whoami, resumable, args.num_nodes, args.env, args.secret), - resources=beaker.TaskResources(gpu_count=args.gpus), + resources=beaker.BeakerTaskResources(gpu_count=args.gpus), replicas=args.num_nodes, ) if args.num_nodes > 1: From 7f8e4db9750fa51ae60346c97c789d3c310d9a84 Mon Sep 17 00:00:00 2001 From: Finbarr Timbers Date: Thu, 18 Sep 2025 12:43:51 -0600 Subject: [PATCH 09/25] Fix BeakerPriority to BeakerJobPriority for v2 API --- mason.py | 2 +- 1 file changed, 1 insertion(+), 
1 deletion(-) diff --git a/mason.py b/mason.py index 2e8752dad..2a7cd5a7c 100644 --- a/mason.py +++ b/mason.py @@ -845,7 +845,7 @@ def make_task_spec(args, full_command: str, i: int, beaker_secrets: str, whoami: arguments=[full_command], result=beaker.BeakerResultSpec(path="/output"), datasets=get_datasets(args.beaker_datasets, args.cluster), - context=beaker.BeakerTaskContext(priority=beaker.BeakerPriority(args.priority), + context=beaker.BeakerTaskContext(priority=beaker.BeakerJobPriority(args.priority), preemptible=args.preemptible), constraints=constraints, env_vars=get_env_vars(args.pure_docker_mode, args.cluster, beaker_secrets, From 5aee8fc8d3a89b07d4c649a47f78228b3752ffd7 Mon Sep 17 00:00:00 2001 From: Finbarr Timbers Date: Thu, 18 Sep 2025 12:44:51 -0600 Subject: [PATCH 10/25] Fix BeakerJobPriority enum access to use string key --- mason.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mason.py b/mason.py index 2a7cd5a7c..8826f8433 100644 --- a/mason.py +++ b/mason.py @@ -845,7 +845,7 @@ def make_task_spec(args, full_command: str, i: int, beaker_secrets: str, whoami: arguments=[full_command], result=beaker.BeakerResultSpec(path="/output"), datasets=get_datasets(args.beaker_datasets, args.cluster), - context=beaker.BeakerTaskContext(priority=beaker.BeakerJobPriority(args.priority), + context=beaker.BeakerTaskContext(priority=beaker.BeakerJobPriority[args.priority], preemptible=args.preemptible), constraints=constraints, env_vars=get_env_vars(args.pure_docker_mode, args.cluster, beaker_secrets, From 1e4677046e8ea9ed8e0fd766191a3eb4c5fd6e2d Mon Sep 17 00:00:00 2001 From: Finbarr Timbers Date: Thu, 18 Sep 2025 12:46:25 -0600 Subject: [PATCH 11/25] Fix experiment ID access for beaker v2 API The experiment.create() method now returns a BeakerWorkload object, which has an experiment field containing the ID. --- mason.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mason.py b/mason.py index 8826f8433..7851122ff 100644 --- a/mason.py +++ b/mason.py @@ -906,7 +906,7 @@ def main(): retry=beaker.BeakerRetrySpec(allowed_task_retries=args.max_retries) ) exp = beaker_client.experiment.create(spec=experiment_spec) - console.log(f"Kicked off Beaker job. https://beaker.org/ex/{exp.id}") + console.log(f"Kicked off Beaker job. 
https://beaker.org/ex/{exp.experiment.id}") if __name__ == "__main__": From 73a4f3f0a55a712193fadc35d3a1b48fe10e558c Mon Sep 17 00:00:00 2001 From: Finbarr Timbers Date: Thu, 18 Sep 2025 13:00:08 -0600 Subject: [PATCH 12/25] Fix beaker v2 API compatibility in utils.py and test_utils.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Fixed exception names: ConfigurationError → BeakerConfigurationError, ExperimentNotFound → BeakerExperimentNotFound - Updated to use workload.get() and experiment.get_spec() instead of experiment.get() - Changed description update to use workload.update() instead of experiment.set_description() - Updated test mocks to match the new API structure --- open_instruct/test_utils.py | 14 ++++++++++---- open_instruct/utils.py | 12 ++++++++---- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/open_instruct/test_utils.py b/open_instruct/test_utils.py index 925a4c21a..5f72253c1 100644 --- a/open_instruct/test_utils.py +++ b/open_instruct/test_utils.py @@ -85,16 +85,22 @@ def _setup_beaker_mocks(mock_beaker_from_env, mock_is_beaker_job, initial_descri mock_client = mock.MagicMock() mock_beaker_from_env.return_value = mock_client + # Mock the workload object + mock_workload = mock.MagicMock() + mock_client.workload.get.return_value = mock_workload + + # Mock the spec object returned by experiment.get_spec mock_spec = mock.MagicMock() mock_spec.description = initial_description - mock_client.experiment.get.return_value = mock_spec + mock_client.experiment.get_spec.return_value = mock_spec description_history = [] - def track_description(exp_id, desc): - description_history.append(desc) + def track_description(workload, description=None): + if description is not None: + description_history.append(description) - mock_client.experiment.set_description.side_effect = track_description + mock_client.workload.update.side_effect = track_description return mock_client, mock_spec, description_history diff --git a/open_instruct/utils.py b/open_instruct/utils.py index be5d40099..c1184ff0c 100644 --- a/open_instruct/utils.py +++ b/open_instruct/utils.py @@ -978,13 +978,16 @@ def maybe_update_beaker_description( try: client = beaker.Beaker.from_env() - except beaker.exceptions.ConfigurationError as e: + except beaker.exceptions.BeakerConfigurationError as e: logger.warning(f"Failed to initialize Beaker client: {e}") return try: - spec = client.experiment.get(experiment_id) - except beaker.exceptions.ExperimentNotFound: + # Get the workload first (experiment_id is actually BEAKER_WORKLOAD_ID) + workload = client.workload.get(experiment_id) + # Then get the experiment spec from the workload + spec = client.experiment.get_spec(workload) + except beaker.exceptions.BeakerExperimentNotFound: logger.warning( f"Failed to get Beaker experiment with ID: {experiment_id}" "This might be fine if you are e.g. running in an interactive job." 
@@ -1025,7 +1028,8 @@ def maybe_update_beaker_description( description_components.append(progress_bar) new_description = " ".join(description_components) try: - client.experiment.set_description(experiment_id, new_description) + # Update the workload description using the workload object we got earlier + client.workload.update(workload, description=new_description) except requests.exceptions.HTTPError as e: logger.warning( f"Failed to update Beaker description due to HTTP error: {e}" From c1a6768682870dfe49a0560f0d5be175e8a92fb5 Mon Sep 17 00:00:00 2001 From: Finbarr Timbers Date: Wed, 1 Oct 2025 11:14:16 -0600 Subject: [PATCH 13/25] Updated finetune script --- scripts/train/tulu3/finetune_8b.sh | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/scripts/train/tulu3/finetune_8b.sh b/scripts/train/tulu3/finetune_8b.sh index 8a42076e2..b88f0b910 100644 --- a/scripts/train/tulu3/finetune_8b.sh +++ b/scripts/train/tulu3/finetune_8b.sh @@ -2,11 +2,14 @@ python mason.py \ --cluster ai2/jupiter \ --workspace ai2/tulu-3-dev \ --priority high \ - --image nathanl/open_instruct_auto --pure_docker_mode \ + --image nathanl/open_instruct_auto \ + --pure_docker_mode \ --preemptible \ --num_nodes 8 \ --budget ai2/oe-adapt \ - --gpus 8 -- accelerate launch \ + --gpus 8 \ + -- \ + accelerate launch \ --mixed_precision bf16 \ --num_processes 8 \ --use_deepspeed \ @@ -19,7 +22,7 @@ python mason.py \ --tokenizer_name meta-llama/Llama-3.1-8B \ --tokenizer_revision main \ --use_slow_tokenizer \ - --dataset_mixer_list allenai/tulu-3-sft-mixture 1.0 \ + --dataset_mixer_list allenai/tulu-3-sft-mixture 512 \ --max_seq_length 4096 \ --per_device_train_batch_size 1 \ --gradient_accumulation_steps 2 \ @@ -28,9 +31,10 @@ python mason.py \ --warmup_ratio 0.03 \ --weight_decay 0.0 \ --num_train_epochs 2 \ + --reduce_loss sum \ --use_flash_attn \ --gradient_checkpointing \ --report_to wandb \ --with_tracking \ --logging_steps 1 \ - --seed 8 \ No newline at end of file + --seed 8 From 7e4cfb1fb663aa644144a8105cfc18581e68dcd3 Mon Sep 17 00:00:00 2001 From: Finbarr Timbers Date: Wed, 1 Oct 2025 11:14:43 -0600 Subject: [PATCH 14/25] Now, finetune_8b.sh uses uv. --- scripts/train/tulu3/finetune_8b.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) mode change 100644 => 100755 scripts/train/tulu3/finetune_8b.sh diff --git a/scripts/train/tulu3/finetune_8b.sh b/scripts/train/tulu3/finetune_8b.sh old mode 100644 new mode 100755 index b88f0b910..607e175e7 --- a/scripts/train/tulu3/finetune_8b.sh +++ b/scripts/train/tulu3/finetune_8b.sh @@ -1,4 +1,4 @@ -python mason.py \ +uv run python mason.py \ --cluster ai2/jupiter \ --workspace ai2/tulu-3-dev \ --priority high \ From 50423b26e2aab1b09b502325c30ef65e7b14e713 Mon Sep 17 00:00:00 2001 From: Finbarr Timbers Date: Wed, 1 Oct 2025 11:20:57 -0600 Subject: [PATCH 15/25] Updated finetune_8b.sh to work with build_image_and_launch.sh. 
--- scripts/train/tulu3/finetune_8b.sh | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/scripts/train/tulu3/finetune_8b.sh b/scripts/train/tulu3/finetune_8b.sh index 607e175e7..80a761882 100755 --- a/scripts/train/tulu3/finetune_8b.sh +++ b/scripts/train/tulu3/finetune_8b.sh @@ -1,8 +1,14 @@ +#!/bin/bash + +BEAKER_IMAGE="${1:-nathanl/open_instruct_auto}" + +echo "Using Beaker image: $BEAKER_IMAGE" + uv run python mason.py \ --cluster ai2/jupiter \ --workspace ai2/tulu-3-dev \ --priority high \ - --image nathanl/open_instruct_auto \ + --image "$BEAKER_IMAGE" \ --pure_docker_mode \ --preemptible \ --num_nodes 8 \ From cdd2391f04b0f5be5b7b82114ad70a8cc08e3ad9 Mon Sep 17 00:00:00 2001 From: Finbarr Timbers Date: Wed, 1 Oct 2025 11:40:34 -0600 Subject: [PATCH 16/25] Updated code --- scripts/train/tulu3/finetune_8b.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/train/tulu3/finetune_8b.sh b/scripts/train/tulu3/finetune_8b.sh index 80a761882..fed7c1932 100755 --- a/scripts/train/tulu3/finetune_8b.sh +++ b/scripts/train/tulu3/finetune_8b.sh @@ -37,7 +37,6 @@ uv run python mason.py \ --warmup_ratio 0.03 \ --weight_decay 0.0 \ --num_train_epochs 2 \ - --reduce_loss sum \ --use_flash_attn \ --gradient_checkpointing \ --report_to wandb \ From abf1b45396bc41dfb6f04f22e9e9b8d182bf1124 Mon Sep 17 00:00:00 2001 From: Finbarr Timbers Date: Wed, 1 Oct 2025 12:17:14 -0600 Subject: [PATCH 17/25] added chat template to script. --- scripts/train/tulu3/finetune_8b.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/train/tulu3/finetune_8b.sh b/scripts/train/tulu3/finetune_8b.sh index fed7c1932..b43ee4412 100755 --- a/scripts/train/tulu3/finetune_8b.sh +++ b/scripts/train/tulu3/finetune_8b.sh @@ -28,6 +28,7 @@ uv run python mason.py \ --tokenizer_name meta-llama/Llama-3.1-8B \ --tokenizer_revision main \ --use_slow_tokenizer \ + --chat_template tulu \ --dataset_mixer_list allenai/tulu-3-sft-mixture 512 \ --max_seq_length 4096 \ --per_device_train_batch_size 1 \ From 64fa69241bed1d4100a6a12af7e6f159953db743 Mon Sep 17 00:00:00 2001 From: Finbarr Timbers Date: Wed, 1 Oct 2025 12:18:37 -0600 Subject: [PATCH 18/25] changed priority --- scripts/train/tulu3/finetune_8b.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/train/tulu3/finetune_8b.sh b/scripts/train/tulu3/finetune_8b.sh index b43ee4412..4502fd5f8 100755 --- a/scripts/train/tulu3/finetune_8b.sh +++ b/scripts/train/tulu3/finetune_8b.sh @@ -6,7 +6,7 @@ echo "Using Beaker image: $BEAKER_IMAGE" uv run python mason.py \ --cluster ai2/jupiter \ - --workspace ai2/tulu-3-dev \ + --workspace ai2/open-instruct-dev \ --priority high \ --image "$BEAKER_IMAGE" \ --pure_docker_mode \ From f20e13e2deb9f052bed7790e8f15bf46b23afb36 Mon Sep 17 00:00:00 2001 From: Finbarr Timbers Date: Wed, 1 Oct 2025 12:23:17 -0600 Subject: [PATCH 19/25] updated priority --- scripts/train/tulu3/finetune_8b.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/train/tulu3/finetune_8b.sh b/scripts/train/tulu3/finetune_8b.sh index 4502fd5f8..f9a1131f1 100755 --- a/scripts/train/tulu3/finetune_8b.sh +++ b/scripts/train/tulu3/finetune_8b.sh @@ -7,7 +7,7 @@ echo "Using Beaker image: $BEAKER_IMAGE" uv run python mason.py \ --cluster ai2/jupiter \ --workspace ai2/open-instruct-dev \ - --priority high \ + --priority normal \ --image "$BEAKER_IMAGE" \ --pure_docker_mode \ --preemptible \ From 5bcda4044fbd87b4c42f2809e0ef2bcf230bfaf6 Mon Sep 17 00:00:00 2001 From: Finbarr Timbers Date: Wed, 1 Oct 2025 12:28:59 
-0600 Subject: [PATCH 20/25] Updated scripts/train/debug/finetune.sh. --- scripts/train/debug/finetune.sh | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/scripts/train/debug/finetune.sh b/scripts/train/debug/finetune.sh index e307055c1..469800e43 100755 --- a/scripts/train/debug/finetune.sh +++ b/scripts/train/debug/finetune.sh @@ -1,4 +1,21 @@ -uv run accelerate launch \ +#!/bin/bash + +BEAKER_IMAGE="${1:-nathanl/open_instruct_auto}" + +echo "Using Beaker image: $BEAKER_IMAGE" + +uv run python mason.py \ + --cluster ai2/jupiter \ + --workspace ai2/open-instruct-dev \ + --priority normal \ + --image "$BEAKER_IMAGE" \ + --pure_docker_mode \ + --preemptible \ + --num_nodes 1 \ + --budget ai2/oe-adapt \ + --gpus 1 \ + -- \ + accelerate launch \ --mixed_precision bf16 \ --num_processes 1 \ open_instruct/finetune.py \ @@ -12,11 +29,10 @@ uv run accelerate launch \ --warmup_ratio 0.03 \ --weight_decay 0.0 \ --num_train_epochs 2 \ - --output_dir output/ \ --report_to wandb \ --logging_steps 1 \ --model_revision main \ --dataset_mixer_list allenai/tulu-3-sft-personas-algebra 100 \ --add_bos \ --seed 123 \ - # --with_tracking \ \ No newline at end of file + --with_tracking From 8bdd7187eaad91a1be5f69f05ac44ebd674d11b2 Mon Sep 17 00:00:00 2001 From: Finbarr Timbers Date: Wed, 1 Oct 2025 12:29:47 -0600 Subject: [PATCH 21/25] added non-resumable flag. --- scripts/train/debug/finetune.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/train/debug/finetune.sh b/scripts/train/debug/finetune.sh index 469800e43..db5941a7c 100755 --- a/scripts/train/debug/finetune.sh +++ b/scripts/train/debug/finetune.sh @@ -14,6 +14,7 @@ uv run python mason.py \ --num_nodes 1 \ --budget ai2/oe-adapt \ --gpus 1 \ + --non_resumable \ -- \ accelerate launch \ --mixed_precision bf16 \ From 892f5a9c8dc81b702f6134e9d901f906454f43da Mon Sep 17 00:00:00 2001 From: Finbarr Timbers Date: Wed, 1 Oct 2025 12:31:05 -0600 Subject: [PATCH 22/25] Set description for debug finetune script. --- scripts/train/debug/finetune.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/train/debug/finetune.sh b/scripts/train/debug/finetune.sh index db5941a7c..2166ac01d 100755 --- a/scripts/train/debug/finetune.sh +++ b/scripts/train/debug/finetune.sh @@ -9,6 +9,7 @@ uv run python mason.py \ --workspace ai2/open-instruct-dev \ --priority normal \ --image "$BEAKER_IMAGE" \ + --description "Single GPU finetune job." \ --pure_docker_mode \ --preemptible \ --num_nodes 1 \ From 92add3e8222063b8c9fecb04fd6a2adaa3ac2442 Mon Sep 17 00:00:00 2001 From: Finbarr Timbers Date: Wed, 1 Oct 2025 12:41:12 -0600 Subject: [PATCH 23/25] Actually set chat template name. 
--- scripts/train/debug/finetune.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/train/debug/finetune.sh b/scripts/train/debug/finetune.sh index 2166ac01d..46d2b82d1 100755 --- a/scripts/train/debug/finetune.sh +++ b/scripts/train/debug/finetune.sh @@ -37,4 +37,5 @@ uv run python mason.py \ --dataset_mixer_list allenai/tulu-3-sft-personas-algebra 100 \ --add_bos \ --seed 123 \ + --chat_template_name tulu \ --with_tracking From 8371060a21f13e2616424e585857e3e096601988 Mon Sep 17 00:00:00 2001 From: Finbarr Timbers Date: Wed, 1 Oct 2025 13:20:55 -0600 Subject: [PATCH 24/25] Updated env var name --- mason.py | 558 +++++++++++++++++++++++-------------------------------- 1 file changed, 233 insertions(+), 325 deletions(-) diff --git a/mason.py b/mason.py index bed0d17ff..91617084a 100644 --- a/mason.py +++ b/mason.py @@ -30,9 +30,8 @@ "open_instruct/reward_modeling.py", ] -OPEN_INSTRUCT_RESUMABLES = [ - "open_instruct/grpo_fast.py", -] +OPEN_INSTRUCT_RESUMABLES = ["open_instruct/grpo_fast.py"] + # ---------------------------------------------------------------------- # Mason logic @@ -46,34 +45,18 @@ def parse_beaker_dataset(dataset_str): def parse_env_var(env_var_str: str) -> Dict[str, str]: """Parse environment variable string in the format 'name=value'""" - if '=' not in env_var_str: - raise argparse.ArgumentTypeError( - f"Environment variable must be in format 'name=value', got: {env_var_str}" - ) - name, value = env_var_str.split('=', 1) + if "=" not in env_var_str: + raise argparse.ArgumentTypeError(f"Environment variable must be in format 'name=value', got: {env_var_str}") + name, value = env_var_str.split("=", 1) if not name: raise argparse.ArgumentTypeError("Environment variable name cannot be empty") return {"name": name, "value": value} -WEKA_CLUSTERS = [ - "ai2/jupiter", - "ai2/saturn", - "ai2/titan", - "ai2/neptune", - "ai2/ceres", - "ai2/triton", - "ai2/rhea", -] -GCP_CLUSTERS = [ - "ai2/augusta" -] -INTERCONNECT_CLUSTERS = [ - "ai2/jupiter", - "ai2/ceres", - "ai2/titan", - "ai2/augusta", -] +WEKA_CLUSTERS = ["ai2/jupiter", "ai2/saturn", "ai2/titan", "ai2/neptune", "ai2/ceres", "ai2/triton", "ai2/rhea"] +GCP_CLUSTERS = ["ai2/augusta"] + +INTERCONNECT_CLUSTERS = ["ai2/jupiter", "ai2/ceres", "ai2/titan", "ai2/augusta"] # by default, we turn off vllm compile cache # torch compile caching seems consistently broken, but the actual compiling isn't. @@ -89,18 +72,10 @@ def parse_env_var(env_var_str: str) -> Dict[str, str]: def get_args(): parser = argparse.ArgumentParser() parser.add_argument( - "--cluster", - type=str, - nargs="+", - help="Beaker clusters on which the job could be run.", - required=True, + "--cluster", type=str, nargs="+", help="Beaker clusters on which the job could be run.", required=True ) parser.add_argument( - "--hostname", - type=str, - nargs="+", - help="Beaker hostname on which the job could be run.", - default=None + "--hostname", type=str, nargs="+", help="Beaker hostname on which the job could be run.", default=None ) parser.add_argument("--max_retries", type=int, help="Number of retries", default=0) parser.add_argument("--budget", type=str, help="Budget to use.", required=True) @@ -113,10 +88,7 @@ def get_args(): default="ai2/cuda11.8-cudnn8-dev-ubuntu20.04", ) parser.add_argument( - "--workspace", - type=str, - help="The Beaker workspace to use. If not set, use your default.", - default=None, + "--workspace", type=str, help="The Beaker workspace to use. 
If not set, use your default.", default=None ) parser.add_argument( "--beaker_datasets", @@ -134,40 +106,27 @@ def get_args(): help="Optionally, a description for this job in Beaker.", default="Beaker-Mason job.", ) - parser.add_argument( - "--task_name", - type=str, - help="Name for the Beaker task.", - default="beaker_mason" - ) - parser.add_argument( - "--priority", type=str, help="Beaker job priority.", default="normal" - ) - parser.add_argument( - "--preemptible", action="store_true", help="If given, run as preemptible" - ) - parser.add_argument( - "--pure_docker_mode", action="store_true", help="If given, run in pure docker mode" - ) - parser.add_argument( - "--no_hf_cache_env", action="store_true", help="Getting deprecated; it does nothing" - ) - parser.add_argument( - "--no_mount_nfs", action="store_true", help="Getting deprecated; it does nothing" - ) - parser.add_argument( - "--non_resumable", action="store_true", help="If given, disable resumable mode" - ) + parser.add_argument("--task_name", type=str, help="Name for the Beaker task.", default="beaker_mason") + parser.add_argument("--priority", type=str, help="Beaker job priority.", default="normal") + parser.add_argument("--preemptible", action="store_true", help="If given, run as preemptible") + parser.add_argument("--pure_docker_mode", action="store_true", help="If given, run in pure docker mode") + parser.add_argument("--no_hf_cache_env", action="store_true", help="Getting deprecated; it does nothing") + parser.add_argument("--no_mount_nfs", action="store_true", help="Getting deprecated; it does nothing") + parser.add_argument("--non_resumable", action="store_true", help="If given, disable resumable mode") parser.add_argument( "--no_auto_dataset_cache", action="store_true", help="If given, don't cache the dataset automatically" ) parser.add_argument( - "--auto_output_dir_path", type=str, default="/weka/oe-adapt-default/allennlp/deletable_checkpoint", - help="If given, automatically replace the `--output_dir` argument with this path, essentially using it as a prefix" + "--auto_output_dir_path", + type=str, + default="/weka/oe-adapt-default/allennlp/deletable_checkpoint", + help="If given, automatically replace the `--output_dir` argument with this path, essentially using it as a prefix", ) parser.add_argument( - "--auto_checkpoint_state_dir", type=str, default="/weka/oe-adapt-default/allennlp/deletable_checkpoint_states", - help="If given, automatically replace the `--checkpoint_state_dir` argument with this path, essentially using it as a prefix" + "--auto_checkpoint_state_dir", + type=str, + default="/weka/oe-adapt-default/allennlp/deletable_checkpoint_states", + help="If given, automatically replace the `--checkpoint_state_dir` argument with this path, essentially using it as a prefix", ) parser.add_argument( "--gs_model_name", @@ -179,13 +138,13 @@ def get_args(): "--env", type=parse_env_var, action="append", - help="""Additional environment variables in the format 'name=value'. + help="""Additional environment variables in the format 'name=value'. Can be specified multiple times. Example: --env MY_VAR=value1 --env OTHER_VAR=value2""", default=[], ) parser.add_argument( "--secret", - type=parse_env_var, + type=parse_env_var, action="append", help="""Additional secret env variables in the format 'name=value'. Can be specified multiple times. 
Example: --secret MY_VAR=value1 --secret OTHER_VAR=value2""", @@ -216,7 +175,9 @@ def _commands_include_resumable_target(cmds: List[List[str]]) -> bool: # can resume if the command is in OPEN_INSTRUCT_RESUMABLES and --non_resumable is not set is_resumable = _commands_include_resumable_target(commands) and not mason_args.non_resumable if not is_resumable and not mason_args.non_resumable: - console.log("--non_resumable is not set, but the command is not in OPEN_INSTRUCT_RESUMABLES, so the job will not be resumable") + console.log( + "--non_resumable is not set, but the command is not in OPEN_INSTRUCT_RESUMABLES, so the job will not be resumable" + ) setattr(mason_args, "resumable", is_resumable) return mason_args, commands @@ -256,32 +217,29 @@ def parse_commands(command_args: List[str]) -> List[List[str]]: return commands -def get_env_vars(pure_docker_mode: bool, cluster: List[str], beaker_secrets: List[str], - whoami: str, resumable: bool, num_nodes: int, additional_env_vars: List[Dict[str, str]], - additional_secrets: List[Dict[str, str]]): +def get_env_vars( + pure_docker_mode: bool, + cluster: List[str], + beaker_secrets: List[str], + whoami: str, + resumable: bool, + num_nodes: int, + additional_env_vars: List[Dict[str, str]], + additional_secrets: List[Dict[str, str]], +): env_vars = [] conflicting_vars = {var["name"] for var in additional_env_vars} & DEFAULT_ENV_VARS.keys() if conflicting_vars: raise ValueError(f"Cannot override default environment variables: {conflicting_vars}") for name, value in DEFAULT_ENV_VARS.items(): - env_vars.append(beaker.EnvVar(name=name, value=value)) + env_vars.append(beaker.BeaverEnvVar(name=name, value=value)) for env_var in additional_env_vars: - env_vars.append( - beaker.BeakerEnvVar( - name=env_var["name"], - value=env_var["value"] - ) - ) + env_vars.append(beaker.BeakerEnvVar(name=env_var["name"], value=env_var["value"])) # add user-specific secrets for secret in additional_secrets: - env_vars.append( - beaker.BeakerEnvVar( - name=secret["name"], - secret=secret["value"], - ) - ) + env_vars.append(beaker.BeakerEnvVar(name=secret["name"], secret=secret["value"])) useful_secrets = [ "HF_TOKEN", @@ -295,203 +253,100 @@ def get_env_vars(pure_docker_mode: bool, cluster: List[str], beaker_secrets: Lis ] for useful_secret in useful_secrets: if f"{whoami}_{useful_secret}" in beaker_secrets: - env_vars.append( - beaker.BeakerEnvVar( - name=useful_secret, - secret=f"{whoami}_{useful_secret}", - ) - ) + env_vars.append(beaker.BeakerEnvVar(name=useful_secret, secret=f"{whoami}_{useful_secret}")) elif useful_secret in beaker_secrets: - env_vars.append( - beaker.BeakerEnvVar( - name=useful_secret, - secret=useful_secret, - ) - ) + env_vars.append(beaker.BeakerEnvVar(name=useful_secret, secret=useful_secret)) - # use the user's PATH; including the conda / python PATH + # use the user's PATH; including the conda / python PATH if not pure_docker_mode: - env_vars.extend([ - beaker.BeakerEnvVar( - name="PATH", - value=os.getenv("PATH"), - ), - ]) + env_vars.extend([beaker.BeakerEnvVar(name="PATH", value=os.getenv("PATH"))]) # if all cluster is in weka, we mount the weka if all(c in WEKA_CLUSTERS for c in cluster): - env_vars.extend([ - beaker.BeakerEnvVar( - name="HF_HOME", - value="/weka/oe-adapt-default/allennlp/.cache/huggingface", - ), - beaker.BeakerEnvVar( - name="HF_DATASETS_CACHE", - value="/weka/oe-adapt-default/allennlp/.cache/huggingface", - ), - beaker.BeakerEnvVar( - name="HF_HUB_CACHE", - value="/weka/oe-adapt-default/allennlp/.cache/hub", - ), - 
beaker.BeakerEnvVar(
-                name="CHECKPOINT_OUTPUT_DIR",
-                value=f"/weka/oe-adapt-default/allennlp/deletable_checkpoint_states/{global_wandb_id}",
-            ),
-        ])
-        if num_nodes > 1:
-            env_vars.extend([
+        env_vars.extend(
+            [
+                beaker.BeakerEnvVar(name="HF_HOME", value="/weka/oe-adapt-default/allennlp/.cache/huggingface"),
                 beaker.BeakerEnvVar(
-                    name="NCCL_SOCKET_IFNAME",
-                    value="ib",
+                    name="HF_DATASETS_CACHE", value="/weka/oe-adapt-default/allennlp/.cache/huggingface"
                 ),
+                beaker.BeakerEnvVar(name="HF_HUB_CACHE", value="/weka/oe-adapt-default/allennlp/.cache/hub"),
                 beaker.BeakerEnvVar(
-                    name="NCCL_IB_HCA",
-                    value="^=mlx5_bond_0",
+                    name="CHECKPOINT_OUTPUT_DIR",
+                    value=f"/weka/oe-adapt-default/allennlp/deletable_checkpoint_states/{global_wandb_id}",
                 ),
-            ])
+            ]
+        )
+        if num_nodes > 1:
+            env_vars.extend(
+                [
+                    beaker.BeakerEnvVar(name="NCCL_SOCKET_IFNAME", value="ib"),
+                    beaker.BeakerEnvVar(name="NCCL_IB_HCA", value="^=mlx5_bond_0"),
+                ]
+            )
     # if all cluster is in gcp we add the following env
     elif all(c in GCP_CLUSTERS for c in cluster):
-        env_vars.extend([
-            beaker.BeakerEnvVar(
-                name="HF_HOME",
-                value="/filestore/.cache/huggingface",
-            ),
-            beaker.BeakerEnvVar(
-                name="HF_DATASETS_CACHE",
-                value="/filestore/.cache/huggingface",
-            ),
-            beaker.BeakerEnvVar(
-                name="HF_HUB_CACHE",
-                value="/filestore/.cache/hub",
-            ),
-            beaker.BeakerEnvVar(
-                name="HF_HUB_ENABLE_HF_TRANSFER",
-                value="0",  # we disable it because GCP is weird on uploading to the hub
-            ),
-        ])
-        if num_nodes > 1:
-            env_vars.extend([
-                beaker.BeakerEnvVar(
-                    name="LD_LIBRARY_PATH",
-                    value=r"/var/lib/tcpxo/lib64:${LD_LIBRARY_PATH}",
-                ),
-                beaker.BeakerEnvVar(
-                    name="NCCL_CROSS_NIC",
-                    value="0",
-                ),
-                beaker.BeakerEnvVar(
-                    name="NCCL_ALGO",
-                    value="Ring,Tree",
-                ),
-                beaker.BeakerEnvVar(
-                    name="NCCL_PROTO",
-                    value="Simple",
-                ),
-                beaker.BeakerEnvVar(
-                    name="NCCL_MIN_NCHANNELS",
-                    value="4",
-                ),
+        env_vars.extend(
+            [
+                beaker.BeakerEnvVar(name="HF_HOME", value="/filestore/.cache/huggingface"),
+                beaker.BeakerEnvVar(name="HF_DATASETS_CACHE", value="/filestore/.cache/huggingface"),
+                beaker.BeakerEnvVar(name="HF_HUB_CACHE", value="/filestore/.cache/hub"),
                 beaker.BeakerEnvVar(
-                    name="NCCL_P2P_NET_CHUNKSIZE",
-                    value="524288",
+                    name="HF_HUB_ENABLE_HF_TRANSFER",
+                    value="0",  # we disable it because GCP is weird on uploading to the hub
                 ),
-                beaker.BeakerEnvVar(
-                    name="NCCL_P2P_PCI_CHUNKSIZE",
-                    value="524288",
-                ),
-                beaker.BeakerEnvVar(
-                    name="NCCL_P2P_NVL_CHUNKSIZE",
-                    value="1048576",
-                ),
-                beaker.BeakerEnvVar(
-                    name="NCCL_FASTRAK_NUM_FLOWS",
-                    value="2",
-                ),
-                beaker.BeakerEnvVar(
-                    name="NCCL_FASTRAK_ENABLE_CONTROL_CHANNEL",
-                    value="0",
-                ),
-                beaker.BeakerEnvVar(
-                    name="NCCL_BUFFSIZE",
-                    value="8388608",
-                ),
-                beaker.BeakerEnvVar(
-                    name="NCCL_FASTRAK_USE_SNAP",
-                    value="1",
-                ),
-                beaker.BeakerEnvVar(
-                    name="CUDA_VISIBLE_DEVICES",
-                    value="0,1,2,3,4,5,6,7",
-                ),
-                beaker.BeakerEnvVar(
-                    name="NCCL_NET_GDR_LEVEL",
-                    value="PIX",
-                ),
-                beaker.BeakerEnvVar(
-                    name="NCCL_FASTRAK_ENABLE_HOTPATH_LOGGING",
-                    value="0",
-                ),
-                beaker.BeakerEnvVar(
-                    name="NCCL_TUNER_PLUGIN",
-                    value="libnccl-tuner.so",
-                ),
-                beaker.BeakerEnvVar(
-                    name="NCCL_TUNER_CONFIG_PATH",
-                    value="/var/lib/tcpxo/lib64/a3plus_tuner_config.textproto",
-                ),
-                beaker.BeakerEnvVar(
-                    name="NCCL_SHIMNET_GUEST_CONFIG_CHECKER_CONFIG_FILE",
-                    value="/var/lib/tcpxo/lib64/a3plus_guest_config.textproto",
-                ),
-                beaker.BeakerEnvVar(
-                    name="NCCL_FASTRAK_PLUGIN_ACCEPT_TIMEOUT_MS",
-                    value="600000",
-                ),
-                beaker.BeakerEnvVar(
-                    name="NCCL_NVLS_ENABLE",
-                    value="0",
-                ),
-                beaker.EnvVar(
-                    name="NCCL_FASTRAK_CTRL_DEV",
-                    value="enp0s12",
-                ),
-                beaker.BeakerEnvVar(
-                    name="NCCL_FASTRAK_IFNAME",
-                    value="enp6s0,enp7s0,enp13s0,enp14s0,enp134s0,enp135s0,enp141s0,enp142s0",
-                ),
-                beaker.BeakerEnvVar(
-                    name="NCCL_SOCKET_IFNAME",
-                    value="enp0s12",
-                ),
-                beaker.BeakerEnvVar(
-                    name="NCCL_USE_SNAP",
-                    value="1",
-                ),
-                beaker.BeakerEnvVar(
-                    name="NCCL_FASTRAK_USE_LLCM",
-                    value="1",
-                ),
-                beaker.BeakerEnvVar(
-                    name="NCCL_FASTRAK_LLCM_DEVICE_DIRECTORY",
-                    value="/dev/aperture_devices",
-                ),
-            ])
+            ]
+        )
+        if num_nodes > 1:
+            env_vars.extend(
+                [
+                    beaker.BeakerEnvVar(name="LD_LIBRARY_PATH", value=r"/var/lib/tcpxo/lib64:${LD_LIBRARY_PATH}"),
+                    beaker.BeakerEnvVar(name="NCCL_CROSS_NIC", value="0"),
+                    beaker.BeakerEnvVar(name="NCCL_ALGO", value="Ring,Tree"),
+                    beaker.BeakerEnvVar(name="NCCL_PROTO", value="Simple"),
+                    beaker.BeakerEnvVar(name="NCCL_MIN_NCHANNELS", value="4"),
+                    beaker.BeakerEnvVar(name="NCCL_P2P_NET_CHUNKSIZE", value="524288"),
+                    beaker.BeakerEnvVar(name="NCCL_P2P_PCI_CHUNKSIZE", value="524288"),
+                    beaker.BeakerEnvVar(name="NCCL_P2P_NVL_CHUNKSIZE", value="1048576"),
+                    beaker.BeakerEnvVar(name="NCCL_FASTRAK_NUM_FLOWS", value="2"),
+                    beaker.BeakerEnvVar(name="NCCL_FASTRAK_ENABLE_CONTROL_CHANNEL", value="0"),
+                    beaker.BeakerEnvVar(name="NCCL_BUFFSIZE", value="8388608"),
+                    beaker.BeakerEnvVar(name="NCCL_FASTRAK_USE_SNAP", value="1"),
+                    beaker.BeakerEnvVar(name="CUDA_VISIBLE_DEVICES", value="0,1,2,3,4,5,6,7"),
+                    beaker.BeakerEnvVar(name="NCCL_NET_GDR_LEVEL", value="PIX"),
+                    beaker.BeakerEnvVar(name="NCCL_FASTRAK_ENABLE_HOTPATH_LOGGING", value="0"),
+                    beaker.BeakerEnvVar(name="NCCL_TUNER_PLUGIN", value="libnccl-tuner.so"),
+                    beaker.BeakerEnvVar(
+                        name="NCCL_TUNER_CONFIG_PATH", value="/var/lib/tcpxo/lib64/a3plus_tuner_config.textproto"
+                    ),
+                    beaker.BeakerEnvVar(
+                        name="NCCL_SHIMNET_GUEST_CONFIG_CHECKER_CONFIG_FILE",
+                        value="/var/lib/tcpxo/lib64/a3plus_guest_config.textproto",
+                    ),
+                    beaker.BeakerEnvVar(name="NCCL_FASTRAK_PLUGIN_ACCEPT_TIMEOUT_MS", value="600000"),
+                    beaker.BeakerEnvVar(name="NCCL_NVLS_ENABLE", value="0"),
+                    beaker.BeaverEnvVar(name="NCCL_FASTRAK_CTRL_DEV", value="enp0s12"),
+                    beaker.BeakerEnvVar(
+                        name="NCCL_FASTRAK_IFNAME",
+                        value="enp6s0,enp7s0,enp13s0,enp14s0,enp134s0,enp135s0,enp141s0,enp142s0",
+                    ),
+                    beaker.BeakerEnvVar(name="NCCL_SOCKET_IFNAME", value="enp0s12"),
+                    beaker.BeakerEnvVar(name="NCCL_USE_SNAP", value="1"),
+                    beaker.BeakerEnvVar(name="NCCL_FASTRAK_USE_LLCM", value="1"),
+                    beaker.BeakerEnvVar(name="NCCL_FASTRAK_LLCM_DEVICE_DIRECTORY", value="/dev/aperture_devices"),
+                ]
+            )
     # don't mount anything; assume no cache
     else:
         pass
     if resumable:
-        env_vars.extend([
-            beaker.BeakerEnvVar(
-                name="WANDB_RUN_ID",
-                value=global_wandb_id,
-            ),
-            beaker.BeakerEnvVar(
-                name="WANDB_RESUME",
-                value="allow",
-            ),
-        ])
+        env_vars.extend(
+            [
+                beaker.BeakerEnvVar(name="WANDB_RUN_ID", value=global_wandb_id),
+                beaker.BeakerEnvVar(name="WANDB_RESUME", value="allow"),
+            ]
+        )
 
     return env_vars
 
@@ -503,25 +358,21 @@ def get_datasets(beaker_datasets, cluster: List[str]):
     if all(c in WEKA_CLUSTERS for c in cluster):
         res = [
             beaker.BeakerDataMount(
-                source=beaker.BeakerDataSource(weka="oe-adapt-default"),
-                mount_path="/weka/oe-adapt-default",
+                source=beaker.BeakerDataSource(weka="oe-adapt-default"), mount_path="/weka/oe-adapt-default"
             ),
             beaker.BeakerDataMount(
-                source=beaker.BeakerDataSource(weka="oe-training-default"),
-                mount_path="/weka/oe-training-default",
+                source=beaker.BeakerDataSource(weka="oe-training-default"), mount_path="/weka/oe-training-default"
             ),
         ]
     elif all(c in GCP_CLUSTERS for c in cluster):
         res = [
             beaker.BeakerDataMount(
-                source=beaker.BeakerDataSource(host_path="/mnt/filestore_1"),
-                mount_path="/filestore",
-            ),
+                source=beaker.BeakerDataSource(host_path="/mnt/filestore_1"), mount_path="/filestore"
+            )
         ]
     for beaker_dataset in beaker_datasets:
         to_append = beaker.BeakerDataMount(
-            source=beaker.BeakerDataSource(beaker=beaker_dataset["beaker"]),
-            mount_path=beaker_dataset["mount_path"],
+            source=beaker.BeakerDataSource(beaker=beaker_dataset["beaker"]), mount_path=beaker_dataset["mount_path"]
         )
         res.append(to_append)
 
@@ -547,6 +398,7 @@ def make_internal_command(command: List[str], args: argparse.Namespace, whoami:
     if is_open_instruct_training:
         from open_instruct.dataset_transformation import get_commit_hash
         from open_instruct.utils import download_from_hf, gs_folder_exists, upload_to_gs_bucket
+
         # HACK: Cache dataset logic:
         # Here we basically try to run the tokenization full_command locally before running it on beaker
         # We could in theory submit a cpu only job to beaker to do this, but that requires setting up
@@ -589,6 +441,7 @@ def remove_arg_from_list(lst: List[str], item: str, remove_value: bool = False):
             caching_command = "python " + " ".join(caching_command[idx:]) + " --cache_dataset_only"
             console.log("📦📦📦 Running the caching command with `--cache_dataset_only`")
             import subprocess
+
             # Use Popen to get real-time output while also capturing it
             process = subprocess.Popen(
                 caching_command,
@@ -596,7 +449,7 @@ def remove_arg_from_list(lst: List[str], item: str, remove_value: bool = False):
                 stdout=subprocess.PIPE,
                 stderr=subprocess.PIPE,
                 text=True,
-                bufsize=1
+                bufsize=1,
             )
 
             stdout_data, stderr_data = [], []
@@ -622,11 +475,15 @@ def remove_arg_from_list(lst: List[str], item: str, remove_value: bool = False):
                 if done and process.poll() is not None:
                     break
 
-            result = type('SubprocessResult', (), {
-                'returncode': process.returncode,
-                'stdout': ''.join(stdout_data),
-                'stderr': ''.join(stderr_data)
-            })
+            result = type(
+                "SubprocessResult",
+                (),
+                {
+                    "returncode": process.returncode,
+                    "stdout": "".join(stdout_data),
+                    "stderr": "".join(stderr_data),
+                },
+            )
             stdout = result.stdout
             # Extract the cached dataset path from stdout if it exists
             for line in stdout.splitlines():
@@ -657,7 +514,9 @@ def remove_arg_from_list(lst: List[str], item: str, remove_value: bool = False):
 
     if need_to_override_checkpoint_state_dir and is_open_instruct_training and not is_external_user:
         new_checkpoint_state_dir = f"{args.auto_checkpoint_state_dir}/{whoami}/{int(time.time())}_{random.randint(0, 1000000)}"
-        console.log(f"🔍🔍🔍 Automatically overriding the `--checkpoint_state_dir` argument to be in `{new_checkpoint_state_dir}`")
+        console.log(
+            f"🔍🔍🔍 Automatically overriding the `--checkpoint_state_dir` argument to be in `{new_checkpoint_state_dir}`"
+        )
         command.append("--checkpoint_state_dir")
         command.append(new_checkpoint_state_dir)
         command.append("--checkpoint_state_freq")
@@ -676,7 +535,9 @@ def remove_arg_from_list(lst: List[str], item: str, remove_value: bool = False):
             break
     if need_to_override_output_dir and is_open_instruct_training and not is_external_user:
         new_output_dir = f"{args.auto_output_dir_path}/{whoami}/"
-        console.log(f"🔍🔍🔍 Automatically overriding the `--output_dir` argument to be in `{new_output_dir}`")
+        console.log(
+            f"🔍🔍🔍 Automatically overriding the `--output_dir` argument to be in `{new_output_dir}`"
+        )
         command.append("--output_dir")
         command.append(new_output_dir)
     else:
@@ -689,10 +550,12 @@ def remove_arg_from_list(lst: List[str], item: str, remove_value: bool = False):
         no_eval_concat_commands = [" ".join(cmd) for cmd in no_eval_commands]
         no_eval_concat_command_exists = any(cmd in command for cmd in no_eval_concat_commands)
         if not no_eval_concat_command_exists:
-            raise ValueError("To auto-evaluation is turned on by default, to make sure it works, you must:\n"
-                             "1. run mason with`--auto_output_dir_path /weka/...`, or\n"
-                             "2. in the training command, disable auto-evaluation with `--no_try_launch_beaker_eval_jobs`, or\n"
-                             "3. in the training command, use a `--output_dir` that starts with `/weka/`")
+            raise ValueError(
+                "To auto-evaluation is turned on by default, to make sure it works, you must:\n"
+                "1. run mason with`--auto_output_dir_path /weka/...`, or\n"
+                "2. in the training command, disable auto-evaluation with `--no_try_launch_beaker_eval_jobs`, or\n"
+                "3. in the training command, use a `--output_dir` that starts with `/weka/`"
+            )
 
     # For GCP clusters, since shared storage is slow, we optimize model loading by:
     if any(c in GCP_CLUSTERS for c in args.cluster):
@@ -721,25 +584,38 @@ def remove_arg_from_list(lst: List[str], item: str, remove_value: bool = False):
                     f"Local model is already downloaded, using gs_model_name {model_name_or_path}, with hash of model path {commit_hash}"
                 )
             else:
-                download_from_hf(model_name_or_path, model_revision) # first download the model
-                path = download_from_hf(model_name_or_path, model_revision) # then get the path
+                download_from_hf(model_name_or_path, model_revision)  # first download the model
+                path = download_from_hf(model_name_or_path, model_revision)  # then get the path
                 gs_saved_path = f"gs://ai2-llm/post-training/deletable_cache_models/{model_name_or_path}/{commit_hash}"
-                gs_folder = gs_folder_exists(gs_saved_path) # race condition exists, but it's fine since we are launching mason sequentially
+                gs_folder = gs_folder_exists(
+                    gs_saved_path
+                )  # race condition exists, but it's fine since we are launching mason sequentially
                 if not gs_folder:
                     upload_to_gs_bucket(path, gs_saved_path)
                 download_path = gs_saved_path.replace("gs://", "/gs/")
                 download_path_without_last_folder = download_path.rsplit("/", 1)[0]
                 gs_download_command = [
-                    "mkdir", "-p", download_path,
+                    "mkdir",
+                    "-p",
+                    download_path,
                     "&&",
                     "gsutil",
-                    "-o", "GSUtil:parallel_thread_count=1",
-                    "-o", "GSUtil:sliced_object_download_threshold=150",
+                    "-o",
+                    "GSUtil:parallel_thread_count=1",
+                    "-o",
+                    "GSUtil:sliced_object_download_threshold=150",
                     "-m",
-                    "cp", "-r", gs_saved_path, download_path_without_last_folder,
-                    "&&", "ls", download_path_without_last_folder,
-                    "&&", "ls", download_path,
+                    "cp",
+                    "-r",
+                    gs_saved_path,
+                    download_path_without_last_folder,
+                    "&&",
+                    "ls",
+                    download_path_without_last_folder,
+                    "&&",
+                    "ls",
+                    download_path,
                     "&&",
                 ]
 
 
@@ -758,19 +634,32 @@ def remove_arg_from_list(lst: List[str], item: str, remove_value: bool = False):
 
         # Save dataset to GCS
        if len(dataset_cache_paths) > 0:
-            for cidx, (dataset_cache_path, dataset_config_hash) in enumerate(zip(dataset_cache_paths, dataset_config_hashes)):
+            for cidx, (dataset_cache_path, dataset_config_hash) in enumerate(
+                zip(dataset_cache_paths, dataset_config_hashes)
+            ):
                 gs_saved_path = f"gs://ai2-llm/post-training/deletable_cache_datasets/{dataset_cache_path}"
-                gs_folder = gs_folder_exists(gs_saved_path) # race condition exists, but it's fine since we are launching mason sequentially
+                gs_folder = gs_folder_exists(
+                    gs_saved_path
+                )  # race condition exists, but it's fine since we are launching mason sequentially
                 if not gs_folder:
                     upload_to_gs_bucket(dataset_cache_path, gs_saved_path)
                 dataset_cache_path_without_last_folder = dataset_cache_path.rsplit("/", 1)[0]
                 gs_download_command += [
-                    "mkdir", "-p", dataset_cache_path_without_last_folder,
+                    "mkdir",
+                    "-p",
+                    dataset_cache_path_without_last_folder,
                     "&&",
                     "gsutil",
-                    "cp", "-r", gs_saved_path, dataset_cache_path_without_last_folder,
-                    "&&", "ls", dataset_cache_path_without_last_folder,
-                    "&&", "ls", dataset_cache_path,
+                    "cp",
+                    "-r",
+                    gs_saved_path,
+                    dataset_cache_path_without_last_folder,
+                    "&&",
+                    "ls",
+                    dataset_cache_path_without_last_folder,
+                    "&&",
+                    "ls",
+                    dataset_cache_path,
                     "&&",
                 ]
                 if cidx == 0:
@@ -798,31 +687,36 @@ def remove_arg_from_list(lst: List[str], item: str, remove_value: bool = False):
         if "--num_processes" not in join_full_command and "accelerate" in join_full_command:
             raise ValueError("num_processes must be specified in the command for accelerate-based multi-node jobs.")
         join_full_command = re.sub(
-            r'--num_processes (\d+)',
+            r"--num_processes (\d+)",
             lambda m: (
-                f'--num_processes {int(m.group(1)) * args.num_nodes} '
-                f'--num_machines {args.num_nodes} '
-                '--machine_rank $BEAKER_REPLICA_RANK '
-                '--main_process_ip $BEAKER_LEADER_REPLICA_HOSTNAME '
-                '--main_process_port 29400 '
+                f"--num_processes {int(m.group(1)) * args.num_nodes} "
+                f"--num_machines {args.num_nodes} "
+                "--machine_rank $BEAKER_REPLICA_RANK "
+                "--main_process_ip $BEAKER_LEADER_REPLICA_HOSTNAME "
+                "--main_process_port 29400 "
             ),
-            join_full_command
+            join_full_command,
         )
     full_command = setup_commands + join_full_command
     console.log("🔍🔍🔍 Full command")
     print(full_command)
     return full_command
 
+
 def make_task_spec(args, full_command: str, i: int, beaker_secrets: str, whoami: str, resumable: bool):
     # Add a check to ensure that the user is using the correct clusters for multi-node jobs
     if args.num_nodes > 1 and not all(c in INTERCONNECT_CLUSTERS for c in args.cluster):
         confirmation = False
         while not confirmation:
-            confirmation = input("Interconnect clusters are required for multi-node jobs. Are you sure you want to continue? (y/n)")
+            confirmation = input(
+                "Interconnect clusters are required for multi-node jobs. Are you sure you want to continue? (y/n)"
+            )
             if confirmation == "y":
                 confirmation = True
             elif confirmation == "n":
-                raise ValueError(f"Interconnect clusters are required for multi-node jobs; please only use the following clusters: {INTERCONNECT_CLUSTERS}")
+                raise ValueError(
+                    f"Interconnect clusters are required for multi-node jobs; please only use the following clusters: {INTERCONNECT_CLUSTERS}"
+                )
             else:
                 print("Invalid input. Please enter 'y' or 'n'.")
     if args.image == "ai2/cuda11.8-cudnn8-dev-ubuntu20.04" and any(c in GCP_CLUSTERS for c in args.cluster):
@@ -835,15 +729,24 @@ def make_task_spec(args, full_command: str, i: int, beaker_secrets: str, whoami:
     spec = beaker.BeakerTaskSpec(
         name=f"{args.task_name}__{i}",
         image=beaker.BeakerImageSource(beaker=args.image),
-        command=['/bin/bash', '-c'],
+        command=["/bin/bash", "-c"],
         arguments=[full_command],
         result=beaker.BeakerResultSpec(path="/output"),
         datasets=get_datasets(args.beaker_datasets, args.cluster),
-        context=beaker.BeakerTaskContext(priority=beaker.BeakerJobPriority[args.priority],
-                                         preemptible=args.preemptible),
+        context=beaker.BeakerTaskContext(
+            priority=beaker.BeakerJobPriority[args.priority], preemptible=args.preemptible
+        ),
         constraints=constraints,
-        env_vars=get_env_vars(args.pure_docker_mode, args.cluster, beaker_secrets,
-                              whoami, resumable, args.num_nodes, args.env, args.secret),
+        env_vars=get_env_vars(
+            args.pure_docker_mode,
+            args.cluster,
+            beaker_secrets,
+            whoami,
+            resumable,
+            args.num_nodes,
+            args.env,
+            args.secret,
+        ),
         resources=beaker.BeakerTaskResources(gpu_count=args.gpus),
         replicas=args.num_nodes,
     )
@@ -855,7 +758,7 @@ def make_task_spec(args, full_command: str, i: int, beaker_secrets: str, whoami:
         spec.host_networking = False
     else:
         spec.host_networking = True
-    
+
     if args.timeout is not None:
         spec.timeout = args.timeout
 
@@ -881,23 +784,28 @@ def main():
     full_commands = [make_internal_command(command, args, whoami, is_external_user) for command in commands]
     if is_external_user:
         console.rule("[bold red]Non-Ai2 User Detected[/bold red]")
-        console.print(Text(
-            (
-                "👋 Hi external user! The following command will be executed in our internal server; feel free to modify it to your needs. "
-                "(For example, you might need to replace `\"$BEAKER_LEADER_REPLICA_HOSTNAME\"` with your own hostname)"
-            ),
-            style="bold",
-        ))
+        console.print(
+            Text(
+                (
+                    "👋 Hi external user! The following command will be executed in our internal server; feel free to modify it to your needs. "
+                    '(For example, you might need to replace `"$BEAKER_LEADER_REPLICA_HOSTNAME"` with your own hostname)'
+                ),
+                style="bold",
+            )
+        )
     for idx, full_command in enumerate(full_commands):
-        console.rule(f"[bold blue]Command {idx+1}[/bold blue]")
+        console.rule(f"[bold blue]Command {idx + 1}[/bold blue]")
         console.print(Text(full_command))
     if is_external_user:
         return
     experiment_spec = beaker.BeakerExperimentSpec(
         description=args.description,
-        tasks=[make_task_spec(args, full_command, i, beaker_secrets, whoami, args.resumable) for i, full_command in enumerate(full_commands)],
+        tasks=[
+            make_task_spec(args, full_command, i, beaker_secrets, whoami, args.resumable)
+            for i, full_command in enumerate(full_commands)
+        ],
         budget=args.budget,
-        retry=beaker.BeakerRetrySpec(allowed_task_retries=args.max_retries)
+        retry=beaker.BeakerRetrySpec(allowed_task_retries=args.max_retries),
     )
     exp = beaker_client.experiment.create(spec=experiment_spec)
     console.log(f"Kicked off Beaker job. https://beaker.org/ex/{exp.experiment.id}")

From c291c042aa3bb1f4b19ec0a2d7d28425ce91528f Mon Sep 17 00:00:00 2001
From: Finbarr Timbers
Date: Wed, 1 Oct 2025 13:22:04 -0600
Subject: [PATCH 25/25] updated typo

---
 mason.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mason.py b/mason.py
index 91617084a..16dfe7f00 100644
--- a/mason.py
+++ b/mason.py
@@ -233,7 +233,7 @@ def get_env_vars(
         raise ValueError(f"Cannot override default environment variables: {conflicting_vars}")
 
     for name, value in DEFAULT_ENV_VARS.items():
-        env_vars.append(beaker.BeaverEnvVar(name=name, value=value))
+        env_vars.append(beaker.BeakerEnvVar(name=name, value=value))
 
     for env_var in additional_env_vars:
         env_vars.append(beaker.BeakerEnvVar(name=env_var["name"], value=env_var["value"]))
@@ -325,7 +325,7 @@ def get_env_vars(
                     ),
                     beaker.BeakerEnvVar(name="NCCL_FASTRAK_PLUGIN_ACCEPT_TIMEOUT_MS", value="600000"),
                     beaker.BeakerEnvVar(name="NCCL_NVLS_ENABLE", value="0"),
-                    beaker.BeaverEnvVar(name="NCCL_FASTRAK_CTRL_DEV", value="enp0s12"),
+                    beaker.BeakerEnvVar(name="NCCL_FASTRAK_CTRL_DEV", value="enp0s12"),
                     beaker.BeakerEnvVar(
                         name="NCCL_FASTRAK_IFNAME",
                         value="enp6s0,enp7s0,enp13s0,enp14s0,enp134s0,enp135s0,enp141s0,enp142s0",