diff --git a/MODULE.bazel b/MODULE.bazel index 453cbd2a..6f017042 100644 --- a/MODULE.bazel +++ b/MODULE.bazel @@ -8,7 +8,7 @@ module( # Lower-bound versions of direct dependencies. # When bumping, add a comment explaining what's required from the newer release. -bazel_dep(name = "aspect_bazel_lib", version = "1.40.0") +bazel_dep(name = "aspect_bazel_lib", version = "2.9.1") # py_image_layer requires 2.x for the `tar` rule. bazel_dep(name = "bazel_skylib", version = "1.4.2") bazel_dep(name = "rules_python", version = "0.29.0") bazel_dep(name = "platforms", version = "0.0.7") diff --git a/docs/BUILD.bazel b/docs/BUILD.bazel index 59785a2d..563385ab 100644 --- a/docs/BUILD.bazel +++ b/docs/BUILD.bazel @@ -31,6 +31,11 @@ stardoc_with_diff_test( bzl_library_target = "//py/private:py_pex_binary", ) +stardoc_with_diff_test( + name = "py_image_layer", + bzl_library_target = "//py/private:py_image_layer", +) + stardoc_with_diff_test( name = "venv", bzl_library_target = "//py/private:py_venv", diff --git a/docs/py_image_layer.md b/docs/py_image_layer.md new file mode 100644 index 00000000..650d2555 --- /dev/null +++ b/docs/py_image_layer.md @@ -0,0 +1,81 @@ + + +py_image_layer macro for creating multiple layers from a py_binary + +> [!WARNING] +> This macro is EXPERIMENTAL and is not subject to our SemVer guarantees. + +A py_binary that uses `torch` and `numpy` can use the following layer groups: + +``` +load("@rules_oci//oci:defs.bzl", "oci_image") +load("@aspect_rules_py//py:defs.bzl", "py_image_layer", "py_binary") + +py_binary( + name = "my_app_bin", + deps = [ + "@pip_deps//numpy", + "@pip_deps//torch" + ] +) + +oci_image( + tars = py_image_layer( + name = "my_app", + py_binary = ":my_app_bin", + layer_groups = { + "torch": "pip_deps_torch.*", + "numpy": "pip_deps_numpy.*", + } + ) +) +``` + + + + +## py_image_layer + +
+py_image_layer(name, py_binary, root, layer_groups, compress, tar_args, kwargs)
+
+ +Produce a separate tar output for each layer of a python app + +> Requires `awk` to be installed on the host machine/rbe runner. + +For better performance, it is recommended to split the output of a py_binary into multiple layers. +This can be done by grouping files into layers based on their path by using the `layer_groups` attribute. + +The matching order for layer groups is as follows: + 1. `layer_groups` are checked first. + 2. If no match is found for `layer_groups`, the `default layer groups` are checked. + 3. Any remaining files are placed into the default layer. + +The default layer groups are: +``` +{ + "packages": "\.runfiles/.*/site-packages", # contains third-party deps + "interpreter": "\.runfiles/python.*-.*/", # contains the python interpreter +} +``` + + +**PARAMETERS** + + +| Name | Description | Default Value | +| :------------- | :------------- | :------------- | +| name | base name for targets | none | +| py_binary | a py_binary target | none | +| root | Path to where the layers should be rooted. If not specified, the layers will be rooted at the workspace root. | None | +| layer_groups | Additional layer groups to create. They are used to group files into layers based on their path. In the form of:
{"<name>": "regex_to_match_against_file_paths"}
| {} | +| compress | Compression algorithm to use. Default is gzip. See: https://github.com/bazel-contrib/bazel-lib/blob/main/docs/tar.md#tar_rule | "gzip" | +| tar_args | Additional arguments to pass to the tar rule. Default is ["--options", "gzip:!timestamp"]. See: https://github.com/bazel-contrib/bazel-lib/blob/main/docs/tar.md#tar_rule | ["--options", "gzip:!timestamp"] | +| kwargs | attributes that apply to all targets expanded by the macro | none | + +**RETURNS** + +A list of labels for each layer. + + diff --git a/py/BUILD.bazel b/py/BUILD.bazel index f90c3b2a..231b06bb 100644 --- a/py/BUILD.bazel +++ b/py/BUILD.bazel @@ -38,6 +38,7 @@ bzl_library( "//py/private:py_wheel", "//py/private:virtual", "//py/private:py_pex_binary", + "//py/private:py_image_layer", "@aspect_bazel_lib//lib:utils", ], ) diff --git a/py/defs.bzl b/py/defs.bzl index abd76926..fa250767 100644 --- a/py/defs.bzl +++ b/py/defs.bzl @@ -38,12 +38,13 @@ python.toolchain(python_version = "3.9", is_default = True) load("@aspect_bazel_lib//lib:utils.bzl", "propagate_common_rule_attributes") load("//py/private:py_binary.bzl", _py_binary = "py_binary", _py_test = "py_test") load("//py/private:py_executable.bzl", "determine_main") +load("//py/private:py_image_layer.bzl", _py_image_layer = "py_image_layer") load("//py/private:py_library.bzl", _py_library = "py_library") load("//py/private:py_pex_binary.bzl", _py_pex_binary = "py_pex_binary") load("//py/private:py_pytest_main.bzl", _py_pytest_main = "py_pytest_main") load("//py/private:py_unpacked_wheel.bzl", _py_unpacked_wheel = "py_unpacked_wheel") -load("//py/private:virtual.bzl", _resolutions = "resolutions") load("//py/private:py_venv.bzl", _py_venv = "py_venv") +load("//py/private:virtual.bzl", _resolutions = "resolutions") py_pex_binary = _py_pex_binary py_pytest_main = _py_pytest_main @@ -54,6 +55,8 @@ py_test_rule = _py_test py_library = _py_library py_unpacked_wheel = _py_unpacked_wheel +py_image_layer = _py_image_layer + resolutions = 
_resolutions def _py_binary_or_test(name, rule, srcs, main, deps = [], resolutions = {}, **kwargs): diff --git a/py/private/BUILD.bazel b/py/private/BUILD.bazel index 943d0fb8..886408ba 100644 --- a/py/private/BUILD.bazel +++ b/py/private/BUILD.bazel @@ -22,6 +22,14 @@ exports_files( visibility = ["//docs:__pkg__"], ) +bzl_library( + name = "py_image_layer", + srcs = ["py_image_layer.bzl"], + deps = [ + "@aspect_bazel_lib//lib:tar", + ], +) + bzl_library( name = "py_binary", srcs = ["py_binary.bzl"], diff --git a/py/private/py_image_layer.bzl b/py/private/py_image_layer.bzl new file mode 100644 index 00000000..023f97d8 --- /dev/null +++ b/py/private/py_image_layer.bzl @@ -0,0 +1,156 @@ +"""py_image_layer macro for creating multiple layers from a py_binary + +> [!WARNING] +> This macro is EXPERIMENTAL and is not subject to our SemVer guarantees. + +A py_binary that uses `torch` and `numpy` can use the following layer groups: + +``` +load("@rules_oci//oci:defs.bzl", "oci_image") +load("@aspect_rules_py//py:defs.bzl", "py_image_layer", "py_binary") + +py_binary( + name = "my_app_bin", + deps = [ + "@pip_deps//numpy", + "@pip_deps//torch" + ] +) + +oci_image( + tars = py_image_layer( + name = "my_app", + py_binary = ":my_app_bin", + layer_groups = { + "torch": "pip_deps_torch.*", + "numpy": "pip_deps_numpy.*", + } + ) +) +``` +""" + +load("@aspect_bazel_lib//lib:tar.bzl", "mtree_spec", "tar") + +default_layer_groups = { + # match *only* external pip like repositories that contain the string "site-packages" + "packages": "\\.runfiles/.*/site-packages", + # match *only* external repositories that begins with the string "python" + # e.g. 
this will match + # `/hello_world/hello_world_bin.runfiles/rules_python~0.21.0~python~python3_9_aarch64-unknown-linux-gnu/bin/python3` + # but not match + # `/hello_world/hello_world_bin.runfiles/_main/python_app` + "interpreter": "\\.runfiles/python.*-.*/", +} + +def _split_mtree_into_layer_groups(name, root, groups, group_names, **kwargs): + mtree_begin_blocks = "\n".join([ + 'print "#mtree" >> "$(RULEDIR)/%s.%s.manifest.spec";' % (name, gn) + for gn in group_names + ]) + + # When an mtree entry matches a layer group, it will be moved into the mtree + # for that group. + ifs = "\n".join([ + """\ +if ($$1 ~ "%s") { + print $$0 >> "$(RULEDIR)/%s.%s.manifest.spec"; + next +}""" % (regex, name, gn) + for (gn, regex) in groups.items() + ]) + + cmd = """\ +awk < $< 'BEGIN { + %s +} +{ + # Exclude .whl files from container images + if ($$1 ~ ".whl") { + next + } + # Move everything under the specified root + sub(/^/, ".%s") + # Match by regexes and write to the destination. + %s + # Every line that did not match the layer groups will go into the default layer. + print $$0 >> "$(RULEDIR)/%s.default.manifest.spec" +}' +""" % (mtree_begin_blocks, root, ifs, name) + + native.genrule( + name = "_{}_manifests".format(name), + srcs = [name + ".manifest"], + outs = [ + "{}.{}.manifest.spec".format(name, group_name) + for group_name in group_names + ], + cmd = cmd, + **kwargs + ) + + +def py_image_layer(name, py_binary, root = None, layer_groups = {}, compress = "gzip", tar_args = ["--options", "gzip:!timestamp"], **kwargs): + """Produce a separate tar output for each layer of a python app + + > Requires `awk` to be installed on the host machine/rbe runner. + + For better performance, it is recommended to split the output of a py_binary into multiple layers. + This can be done by grouping files into layers based on their path by using the `layer_groups` attribute. + + The matching order for layer groups is as follows: + 1. `layer_groups` are checked first. + 2. 
If no match is found for `layer_groups`, the `default layer groups` are checked. + 3. Any remaining files are placed into the default layer. + + The default layer groups are: + ``` + { + "packages": "\\.runfiles/.*/site-packages", # contains third-party deps + "interpreter": "\\.runfiles/python.*-.*/", # contains the python interpreter + } + ``` + + Args: + name: base name for targets + py_binary: a py_binary target + root: Path to where the layers should be rooted. If not specified, the layers will be rooted at the workspace root. + layer_groups: Additional layer groups to create. They are used to group files into layers based on their path. In the form of: ```{"<name>": "regex_to_match_against_file_paths"}``` + compress: Compression algorithm to use. Default is gzip. See: https://github.com/bazel-contrib/bazel-lib/blob/main/docs/tar.md#tar_rule + tar_args: Additional arguments to pass to the tar rule. Default is `["--options", "gzip:!timestamp"]`. See: https://github.com/bazel-contrib/bazel-lib/blob/main/docs/tar.md#tar_rule + **kwargs: attributes that apply to all targets expanded by the macro + + Returns: + A list of labels for each layer. + """ + if root != None and not root.startswith("/"): + fail("root path must start with '/' but got '{root}', expected '/{root}'".format(root = root)) + + # Produce the manifest for a tar file of our py_binary, but don't tar it up yet, so we can split + # into fine-grained layers for better pull, push and remote cache performance. 
+ mtree_spec( + name = name + ".manifest", + srcs = [py_binary], + **kwargs + ) + + groups = dict(**layer_groups) + group_names = groups.keys() + ["default"] + + _split_mtree_into_layer_groups(name, root, groups, group_names, **kwargs) + + # Finally create layers using the tar rule + result = [] + for group_name in group_names: + tar_target = "_{}_{}".format(name, group_name) + tar( + name = tar_target, + srcs = [py_binary], + mtree = "{}.{}.manifest.spec".format(name, group_name), + compress = compress, + args = tar_args, + **kwargs + ) + result.append(tar_target) + + return result diff --git a/py/repositories.bzl b/py/repositories.bzl index 77d3261e..e012bf20 100644 --- a/py/repositories.bzl +++ b/py/repositories.bzl @@ -30,11 +30,12 @@ def rules_py_dependencies(): url = "https://github.com/bazelbuild/bazel-skylib/archive/refs/tags/1.5.0.tar.gz", ) + # py_image_layer requires 2.x for the `tar` rule. http_archive( name = "aspect_bazel_lib", - sha256 = "6e6f8ac3c601d6df25810cd51e51d85831e3437e873b152c5c4ecd3b96964bc8", - strip_prefix = "bazel-lib-1.42.3", - url = "https://github.com/aspect-build/bazel-lib/archive/refs/tags/v1.42.3.tar.gz", + sha256 = "f93d386d8d0b0149031175e81df42a488be4267c3ca2249ba5321c23c60bc1f0", + strip_prefix = "bazel-lib-2.9.1", + url = "https://github.com/bazel-contrib/bazel-lib/releases/download/v2.9.1/bazel-lib-v2.9.1.tar.gz", ) http_archive( diff --git a/py/toolchains.bzl b/py/toolchains.bzl index ecd60532..a42fcad0 100644 --- a/py/toolchains.bzl +++ b/py/toolchains.bzl @@ -1,12 +1,12 @@ """Declare toolchains""" +load("@aspect_bazel_lib//lib:repositories.bzl", "register_tar_toolchains") load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_file") load("//py/private/toolchain:autodetecting.bzl", _register_autodetecting_python_toolchain = "register_autodetecting_python_toolchain") load("//py/private/toolchain:repo.bzl", "prerelease_toolchains_repo", "toolchains_repo") load("//py/private/toolchain:tools.bzl", "TOOLCHAIN_PLATFORMS", 
"prebuilt_tool_repo") load("//tools:version.bzl", "IS_PRERELEASE") - register_autodetecting_python_toolchain = _register_autodetecting_python_toolchain DEFAULT_TOOLS_REPOSITORY = "rules_py_tools" @@ -19,6 +19,9 @@ def rules_py_toolchains(name = DEFAULT_TOOLS_REPOSITORY, register = True, is_pre register: whether to call the register_toolchains, should be True for WORKSPACE and False for bzlmod. is_prerelease: True iff there are no pre-built tool binaries for this version of rules_py """ + + register_tar_toolchains(register = register) + if is_prerelease: prerelease_toolchains_repo(name = name) if register: