9 changes: 9 additions & 0 deletions CMakeLists.txt
@@ -800,6 +800,15 @@ if(EXECUTORCH_BUILD_PYBIND)
       torch
   )
 
+  # Build common AOTI functionality if needed by CUDA or Metal backends
+  if(EXECUTORCH_BUILD_CUDA OR EXECUTORCH_BUILD_METAL)
+    list(APPEND _dep_libs aoti_common)
+  endif()
+
+  if(EXECUTORCH_BUILD_CUDA)
+    list(APPEND _dep_libs aoti_cuda)
+  endif()
+
   if(EXECUTORCH_BUILD_EXTENSION_MODULE)
     # Always use static linking for pybindings to avoid runtime symbol
     # resolution issues
7 changes: 7 additions & 0 deletions extension/pybindings/portable_lib.py
@@ -51,6 +51,12 @@
 # wouldn't preserve the static type annotations.
 #
 # Note that all of these are experimental, and subject to change without notice.
+
+# Set dlopen flags to RTLD_GLOBAL to ensure that the symbols in _portable_lib can
+# be found by another shared library (for example, in AOTI where we want to load
+# an AOTI compiled .so file with needed symbols defined in _portable_lib).
+prev = sys.getdlopenflags()
+sys.setdlopenflags(prev | os.RTLD_GLOBAL)
 from executorch.extension.pybindings._portable_lib import (  # noqa: F401
     # Disable "imported but unused" (F401) checks.
     _create_profile_block,  # noqa: F401
@@ -73,6 +79,7 @@
     MethodMeta,  # noqa: F401
     Verification,  # noqa: F401
 )
+sys.setdlopenflags(prev)
 
 # Clean up so that `dir(portable_lib)` is the same as `dir(_portable_lib)`
 # (apart from some __dunder__ names).
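Note on the pattern above: saving the dlopen flags, OR-ing in RTLD_GLOBAL, importing, and then restoring them is the standard POSIX recipe for making an extension module's symbols visible to shared libraries loaded afterwards. A minimal standalone sketch of the same idea (the module name _some_extension is hypothetical):

    import os
    import sys

    # Save the current flags, then OR in RTLD_GLOBAL so that symbols exported
    # by the next extension module loaded are added to the process-global
    # symbol table, where later dlopen'd libraries can resolve them.
    prev = sys.getdlopenflags()
    sys.setdlopenflags(prev | os.RTLD_GLOBAL)
    try:
        import _some_extension  # hypothetical C extension module
    finally:
        # Restore the original flags so later imports are unaffected.
        sys.setdlopenflags(prev)

One caveat: the change in this diff restores the flags only after a successful import; a try/finally as in the sketch would also restore them if the import raised.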
12 changes: 2 additions & 10 deletions install_requirements.py
@@ -18,14 +18,6 @@
 # This will be dynamically set based on CUDA availability and CUDA backend enabled/disabled.
 TORCH_NIGHTLY_URL_BASE = "https://download.pytorch.org/whl/nightly"
 
-# Supported CUDA versions - modify this to add/remove supported versions
-# Format: tuple of (major, minor) version numbers
-SUPPORTED_CUDA_VERSIONS = (
-    (12, 6),
-    (12, 8),
-    (13, 0),
-)
-
 # Since ExecuTorch often uses main-branch features of pytorch, only the nightly
 # pip versions will have the required features.
 #
@@ -51,7 +43,7 @@ def install_requirements(use_pytorch_nightly):
         sys.exit(1)
 
     # Determine the appropriate PyTorch URL based on CUDA delegate status
-    torch_url = determine_torch_url(TORCH_NIGHTLY_URL_BASE, SUPPORTED_CUDA_VERSIONS)
+    torch_url = determine_torch_url(TORCH_NIGHTLY_URL_BASE)
 
     # pip packages needed by exir.
     TORCH_PACKAGE = [
@@ -121,7 +113,7 @@
 
 def install_optional_example_requirements(use_pytorch_nightly):
     # Determine the appropriate PyTorch URL based on CUDA delegate status
-    torch_url = determine_torch_url(TORCH_NIGHTLY_URL_BASE, SUPPORTED_CUDA_VERSIONS)
+    torch_url = determine_torch_url(TORCH_NIGHTLY_URL_BASE)
 
     print("Installing torch domain libraries")
     DOMAIN_LIBRARIES = [
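With SUPPORTED_CUDA_VERSIONS now owned by install_utils, callers pass only the base URL. A rough sketch of the resulting behavior at this call site (the printed URLs are illustrative):

    from install_utils import determine_torch_url

    TORCH_NIGHTLY_URL_BASE = "https://download.pytorch.org/whl/nightly"
    torch_url = determine_torch_url(TORCH_NIGHTLY_URL_BASE)
    # -> ".../cpu" when nvcc is absent or the detected CUDA version is unsupported,
    # -> ".../cu126" (for example) when CUDA 12.6 is detected
    print(torch_url)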
88 changes: 73 additions & 15 deletions install_utils.py
@@ -10,31 +10,57 @@
 import re
 import subprocess
 import sys
+from typing import List, Optional
+
+# Supported CUDA versions - modify this to add/remove supported versions
+# Format: tuple of (major, minor) version numbers
+SUPPORTED_CUDA_VERSIONS = (
+    (12, 6),
+    (12, 8),
+    (13, 0),
+)
 
 
-def _cuda_version_to_pytorch_suffix(major, minor):
+def is_cmake_option_on(
+    cmake_configuration_args: List[str], var_name: str, default: bool
+) -> bool:
     """
-    Generate PyTorch CUDA wheel suffix from CUDA version numbers.
+    Get a boolean CMake variable, from a list of CMake configuration arguments.
+    The var_name should not include the "-D" prefix.
 
     Args:
-        major: CUDA major version (e.g., 12)
-        minor: CUDA minor version (e.g., 6)
+        cmake_configuration_args: List of CMake configuration arguments.
+        var_name: Name of the CMake variable.
+        default: Default boolean value if the variable is not set.
 
     Returns:
-        PyTorch wheel suffix string (e.g., "cu126")
+        Boolean value of the CMake variable.
     """
-    return f"cu{major}{minor}"
+    cmake_define = _extract_cmake_define(cmake_configuration_args, var_name)
+
+    return _normalize_cmake_bool(cmake_define, default)
+
+
+def is_cuda_available() -> bool:
+    """
+    Check if CUDA is available on the system by attempting to get the CUDA version.
+
+    Returns:
+        True if CUDA is available and supported, False otherwise.
+    """
+    try:
+        _get_cuda_version()
+        return True
+    except Exception:
+        return False
 
 
 @functools.lru_cache(maxsize=1)
-def _get_cuda_version(supported_cuda_versions):
+def _get_cuda_version():
     """
     Get the CUDA version installed on the system using nvcc command.
     Returns a tuple (major, minor).
 
-    Args:
-        supported_cuda_versions: List of supported CUDA versions as tuples
-
     Raises:
         RuntimeError: if nvcc is not found or version cannot be parsed
     """
@@ -50,9 +76,9 @@ def _get_cuda_version(supported_cuda_versions):
     major, minor = int(match.group(1)), int(match.group(2))
 
     # Check if the detected version is supported
-    if (major, minor) not in supported_cuda_versions:
+    if (major, minor) not in SUPPORTED_CUDA_VERSIONS:
         available_versions = ", ".join(
-            [f"{maj}.{min}" for maj, min in supported_cuda_versions]
+            [f"{maj}.{min}" for maj, min in SUPPORTED_CUDA_VERSIONS]
         )
         raise RuntimeError(
             f"Detected CUDA version {major}.{minor} is not supported. "
@@ -76,6 +102,39 @@ def _get_cuda_version(supported_cuda_versions):
         )
 
 
+def _extract_cmake_define(args: List[str], name: str) -> Optional[str]:
+    prefix = f"-D{name}="
+    for arg in args:
+        if arg.startswith(prefix):
+            return arg[len(prefix) :]
+    return None
+
+
+def _normalize_cmake_bool(value: Optional[str], default: bool = False) -> bool:
+    if value is None:
+        return default
+    normalized = value.strip().upper()
+    if normalized in {"ON", "1", "TRUE", "YES"}:
+        return True
+    if normalized in {"OFF", "0", "FALSE", "NO"}:
+        return False
+    return default
+
+
+def _cuda_version_to_pytorch_suffix(major, minor):
+    """
+    Generate PyTorch CUDA wheel suffix from CUDA version numbers.
+
+    Args:
+        major: CUDA major version (e.g., 12)
+        minor: CUDA minor version (e.g., 6)
+
+    Returns:
+        PyTorch wheel suffix string (e.g., "cu126")
+    """
+    return f"cu{major}{minor}"
+
+
 def _get_pytorch_cuda_url(cuda_version, torch_nightly_url_base):
     """
     Get the appropriate PyTorch CUDA URL for the given CUDA version.
@@ -95,14 +154,13 @@ def _get_pytorch_cuda_url(cuda_version, torch_nightly_url_base):
 
 
 @functools.lru_cache(maxsize=1)
-def determine_torch_url(torch_nightly_url_base, supported_cuda_versions):
+def determine_torch_url(torch_nightly_url_base):
     """
     Determine the appropriate PyTorch installation URL based on CUDA availability.
     Uses @functools.lru_cache to avoid redundant CUDA detection and print statements.
 
     Args:
         torch_nightly_url_base: Base URL for PyTorch nightly packages
-        supported_cuda_versions: List of supported CUDA versions as tuples
 
     Returns:
         URL string for PyTorch packages
@@ -116,7 +174,7 @@ def determine_torch_url(torch_nightly_url_base):
     print("Attempting to detect CUDA via nvcc...")
 
     try:
-        cuda_version = _get_cuda_version(supported_cuda_versions)
+        cuda_version = _get_cuda_version()
     except Exception as err:
         print(f"CUDA detection failed ({err}), using CPU-only PyTorch")
         return f"{torch_nightly_url_base}/cpu"
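For reference, a hypothetical session showing how the new helpers compose (all argument values are made up for the example):

    from install_utils import _normalize_cmake_bool, is_cmake_option_on

    args = ["-DEXECUTORCH_BUILD_CUDA=ON", "-DCMAKE_BUILD_TYPE=Release"]

    assert is_cmake_option_on(args, "EXECUTORCH_BUILD_CUDA", default=False) is True
    assert is_cmake_option_on(args, "EXECUTORCH_BUILD_METAL", default=False) is False  # unset -> default
    assert _normalize_cmake_bool("yes") is True      # ON/1/TRUE/YES, case-insensitive
    assert _normalize_cmake_bool("off") is False     # OFF/0/FALSE/NO
    assert _normalize_cmake_bool("maybe") is False   # unrecognized -> default (False)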
24 changes: 24 additions & 0 deletions setup.py
@@ -65,6 +65,20 @@
 from pathlib import Path
 from typing import List, Optional
 
+# Add the current directory to sys.path to import install_utils
+CWD = Path(__file__).absolute().parent
+# Add the current directory to the Python path so that we can import `install_utils`.
+# This is required when running this script with a PEP-517-enabled build backend.
+#
+# From the PEP-517 documentation: https://peps.python.org/pep-0517
+#
+# > When importing the module path, we do *not* look in the directory containing
+# > the source tree, unless that would be on `sys.path` anyway (e.g. because it
+# > is specified in `PYTHONPATH`).
+#
+sys.path.insert(0, str(CWD))  # this only affects the current process
+import install_utils
+
 from setuptools import Extension, setup
 from setuptools.command.build import build
 from setuptools.command.build_ext import build_ext
@@ -769,6 +783,12 @@ def run(self): # noqa C901
             item for item in re.split(r"\s+", os.environ.get("CMAKE_ARGS", "")) if item
         ]
 
+        # Check if CUDA is available, and if so, enable building the CUDA
+        # backend by default.
+        if install_utils.is_cuda_available() and install_utils.is_cmake_option_on(
+            cmake_configuration_args, "EXECUTORCH_BUILD_CUDA", default=True
+        ):
+            cmake_configuration_args += ["-DEXECUTORCH_BUILD_CUDA=ON"]
         with Buck2EnvironmentFixer():
             # Generate the cmake cache from scratch to ensure that the cache state
             # is predictable.
@@ -821,6 +841,10 @@ def run(self): # noqa C901
             cmake_build_args += ["--target", "portable_lib"]
             cmake_build_args += ["--target", "selective_build"]
 
+        if cmake_cache.is_enabled("EXECUTORCH_BUILD_CUDA"):
+            cmake_build_args += ["--target", "aoti_cuda"]
+            cmake_build_args += ["--target", "aoti_common"]
+
         if cmake_cache.is_enabled("EXECUTORCH_BUILD_EXTENSION_LLM_RUNNER"):
             cmake_build_args += ["--target", "_llm_runner"]
 
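A subtlety in the auto-enable check added to setup.py: default=True means the CUDA backend is switched on whenever a supported toolkit is detected, unless the user explicitly passes -DEXECUTORCH_BUILD_CUDA=OFF through CMAKE_ARGS. A small sketch of the decision, assuming install_utils is importable:

    import install_utils

    for user_args in ([], ["-DEXECUTORCH_BUILD_CUDA=OFF"]):
        enable = install_utils.is_cuda_available() and install_utils.is_cmake_option_on(
            user_args, "EXECUTORCH_BUILD_CUDA", default=True
        )
        # On a machine with a supported CUDA toolkit: True for the empty list,
        # False once the user opts out with OFF. Without CUDA: always False.
        print(user_args, "->", enable)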