
Commit

Merge branch 'tixxx/determine_local' into terryysun/all2all_memcpyp2p
terryysun committed Aug 1, 2024
2 parents 6e2fcf3 + f7a7d5d commit 6db4cec
Showing 927 changed files with 25,870 additions and 21,243 deletions.
15 changes: 7 additions & 8 deletions .bazelrc
@@ -42,9 +42,7 @@
# rocm: Build with AMD GPU support (rocm)
# mkl: Enable full mkl support.
# tensorrt: Enable Tensorrt support.
# noaws: Disable AWS S3 storage support
# nogcp: Disable GCS support.
# nohdfs: Disable hadoop hdfs support.
# nonccl: Disable nccl support.
#
#
@@ -117,10 +115,6 @@ build --config=short_logs
# TODO(mihaimaruseac): Document this option or remove if no longer needed
build --config=v2

# Disable AWS/HDFS support by default
build --define=no_aws_support=true
build --define=no_hdfs_support=true

# TF now has `cc_shared_library` targets, so it needs the experimental flag
# TODO(rostam): Remove when `cc_shared_library` is enabled by default
build --experimental_cc_shared_library
@@ -296,9 +290,7 @@ build:sycl --define=tensorflow_mkldnn_contraction_kernel=0
build:sycl --repo_env TF_NEED_SYCL=1

# Options to disable default on features
build:noaws --define=no_aws_support=true
build:nogcp --define=no_gcp_support=true
build:nohdfs --define=no_hdfs_support=true
build:nonccl --define=no_nccl_support=true

# Modular TF build options
@@ -359,6 +351,13 @@ build:windows --features=archive_param_file
build:windows --copt=/d2ReducedOptimizeHugeFunctions
build:windows --host_copt=/d2ReducedOptimizeHugeFunctions

# Before VS 2017 15.8, the member "type" would non-conformingly have an
# alignment of only alignof(max_align_t). VS 2017 15.8 was fixed to handle this
# correctly, but the fix inherently changes layout and breaks binary
# compatibility (*only* for uses of aligned_storage with extended alignments).
build:windows --copt=-D_ENABLE_EXTENDED_ALIGNED_STORAGE
build:windows --host_copt=-D_ENABLE_EXTENDED_ALIGNED_STORAGE

# Enable the runfiles symlink tree on Windows. This makes it possible to build
# the pip package on Windows without an intermediate data-file archive, as the
# build_pip_package script in its current form (as of Aug 2023) uses the
1 change: 1 addition & 0 deletions .github/workflows/clang_format.yml
@@ -26,6 +26,7 @@ jobs:
shell: bash
timeout-minutes: 1
if: |
github.event.sender.type == 'User' ||
contains(github.event.pull_request.body, 'FORCE_TEST_ACTIONS')
steps:
- name: "Checking out repository"
16 changes: 5 additions & 11 deletions build_tools/build.py
@@ -100,7 +100,9 @@ def _pull_docker_image_with_retries(self, retries=3) -> None:
"""Pulls docker image with retries to avoid transient rate limit errors."""
for _ in range(retries):
pull_proc = sh(["docker", "pull", self.image_url], check=False)
if pull_proc.returncode != 0:
if pull_proc.returncode == 0:
break # Don't keep pulling after successful pull.
else:
time.sleep(15)

# write SHA of image to the sponge config
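The retry fix in this hunk — break out of the loop on success instead of only sleeping on failure — can be sketched in isolation. This is a minimal sketch, not the actual `build_tools/build.py` API; `pull_with_retries` and its parameters are illustrative names:

```python
import time

def pull_with_retries(pull_image, retries=3, delay=15):
    """Retry a pull command, stopping as soon as one attempt succeeds.

    `pull_image` is any callable returning a process-style returncode
    (0 on success). Illustrative sketch of the fixed control flow.
    """
    returncode = 1
    for _ in range(retries):
        returncode = pull_image()
        if returncode == 0:
            break  # Don't keep pulling after a successful pull.
        time.sleep(delay)  # Back off before the next attempt.
    return returncode
```

With the pre-fix logic (sleep only on failure, never break), all `retries` attempts ran even after an early success.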
@@ -245,11 +247,7 @@ def nvidia_gpu_build_with_compute_capability(
repo="openxla/xla",
docker_image=_DEFAULT_IMAGE,
configs=("warnings", "nonccl", "rbe_linux_cpu"),
target_patterns=_XLA_DEFAULT_TARGET_PATTERNS
+ (
"-//xla/service/gpu/model/fuzztest/...",
"-//xla/service/gpu/fusions/triton:triton_support_test",
),
target_patterns=_XLA_DEFAULT_TARGET_PATTERNS,
build_tag_filters=cpu_x86_tag_filter,
test_tag_filters=cpu_x86_tag_filter,
options=_DEFAULT_BAZEL_OPTIONS,
@@ -267,11 +265,7 @@ def nvidia_gpu_build_with_compute_capability(
repo="openxla/xla",
docker_image=_ARM64_JAX_MULTI_PYTHON_IMAGE,
configs=("warnings", "rbe_cross_compile_linux_arm64_xla", "nonccl"),
target_patterns=_XLA_DEFAULT_TARGET_PATTERNS
+ (
"-//xla/service/gpu/model/fuzztest/...",
"-//xla/service/gpu/fusions/triton:triton_support_test",
),
target_patterns=_XLA_DEFAULT_TARGET_PATTERNS,
options={**_DEFAULT_BAZEL_OPTIONS, "build_tests_only": True},
build_tag_filters=cpu_arm_tag_filter,
test_tag_filters=cpu_arm_tag_filter,
2 changes: 2 additions & 0 deletions docs/_toc.yaml
@@ -24,6 +24,8 @@ toc:
path: /xla/custom_call
- title: Persisted autotuning
path: /xla/persisted_autotuning
- title: Determinism
path: /xla/determinism
- title: XLA Tooling
path: /xla/tools
- title: Using LSP autocompletion
17 changes: 17 additions & 0 deletions docs/determinism.md
@@ -0,0 +1,17 @@
# Determinism (GPU)

## Compilation

XLA compilation is deterministic if
[persisted autotuning](./persisted_autotuning) is used to perform autotuning
once and reuse the results in subsequent compilations. Otherwise, because the
timing measurements fluctuate, different kernels can be picked as the fastest
ones in different compilation runs.
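As a usage sketch, autotuning results are typically persisted and reloaded via the `XLA_FLAGS` environment variable before the XLA-backed framework starts. The flag names below are assumed from the persisted-autotuning docs and the file path is illustrative:

```python
import os

# First run: dump autotuning results once (assumed flag name).
os.environ["XLA_FLAGS"] = (
    "--xla_gpu_dump_autotune_results_to=/tmp/autotune_results.txtpb"
)
# Later runs: load the saved results instead of re-autotuning, e.g.:
# os.environ["XLA_FLAGS"] = (
#     "--xla_gpu_load_autotune_results_from=/tmp/autotune_results.txtpb"
# )
```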

## Execution

Programs compiled by XLA can be non-deterministic in operations such as
scatter, select-and-scatter, GEMMs, convolutions, and multi-headed attention.
The flag `--xla_gpu_exclude_nondeterministic_ops` switches these operations to
deterministic, potentially slower implementations, and makes compilation fail
for select-and-scatter, which has no deterministic implementation.
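A minimal sketch of enabling the flag (setting `XLA_FLAGS` in the environment before the XLA client initializes is the usual route; no GPU is needed just to set it):

```python
import os

# Request deterministic implementations for the operations listed above.
os.environ["XLA_FLAGS"] = "--xla_gpu_exclude_nondeterministic_ops=true"
```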
