Torch install back to build (#365)

Move torch install back to smart build, revert CI build to use `[ml]`, update docs with proper build steps [ committed by @MattToast ] [ reviewed by @al-rigazzi @ashao @mellis13 ]
CrayLabs · Sep 14, 2023 · e1a5783 · e1a5783
1 parent fa59b18
commit e1a5783
Show file tree

Hide file tree

Showing 9 changed files with 189 additions and 202 deletions.
diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml
@@ -108,7 +108,7 @@ jobs:
       - name: Install SmartSim (with ML backends)
         run: |
           python -m pip install git+https://github.com/CrayLabs/SmartRedis.git@develop#egg=smartredis
-          python -m pip install .[dev,ml-cpu]
+          python -m pip install .[dev,ml]
 
       - name: Install ML Runtimes with Smart (with pt, tf, and onnx support)
         if: (matrix.py_v != '3.10')

diff --git a/doc/changelog.rst b/doc/changelog.rst
@@ -15,7 +15,7 @@ SmartSim
 0.5.1
 -----
 
-Released on 13 September, 2023
+Released on 14 September, 2023
 
 Description
 
@@ -45,7 +45,6 @@ Detailed Notes
 - Create public properties where appropriate to mitigate `protected-access` errors. (PR341_)
 - Fix a failure to execute `_prep_colocated_db` due to incorrect named attr check. (PR339_)
 - Enabled and mitigated mypy `disallow_any_generics` and `warn_return_any`. (PR338_)
-- Move installation of all optional SmartSim Python ML dependencies to `pip install` time. (PR336_)
 - Add a `smart validate` target to provide a simple smoke test to assess a SmartSim build. (PR336_, PR351_)
 - Add typehints to `smartsim._core.launcher.step.*`. (PR334_)
 - Log errors reported from slurm WLM when attempts to retrieve status fail. (PR331_, PR332_)

diff --git a/doc/installation_instructions/basic.rst b/doc/installation_instructions/basic.rst
@@ -160,15 +160,15 @@ and install SmartSim from PyPI with the following command:
 
 If you would like SmartSim to also install python machine learning libraries
 that can be used outside SmartSim to build SmartSim-compatible models, you
-can request their installation through the ``ml-*`` optional dependencies,
+can request their installation through the ``[ml]`` optional dependencies,
 as follows:
 
 .. code-block:: bash
 
-    # For CPU based models
-    pip install smartsim[ml-cpu]
-    # For CPU and CUDA based models
-    pip install smartsim[ml-cuda]
+    # For bash
+    pip install smartsim[ml]
+    # For zsh
+    pip install smartsim\[ml\]
 
 At this point, SmartSim is installed and can be used for more basic features.
 If you want to use the machine learning features of SmartSim, you will need
@@ -287,9 +287,8 @@ source remains at the site of the clone instead of in site-packages.
 .. code-block:: bash
 
   cd smartsim
-  pip install -e .[dev,ml-cpu]   # for CPU only
-  # OR
-  pip install -e .[dev,ml-cuda]  # for CUDA support
+  pip install -e .[dev,ml]    # for bash users
+  pip install -e .\[dev,ml\]  # for zsh users
 
 Use the now installed ``smart`` cli to install the machine learning runtimes.
 

diff --git a/doc/installation_instructions/site-install.rst b/doc/installation_instructions/site-install.rst
@@ -11,4 +11,5 @@ from source with the following steps replacing ``COMPILER_VERSION`` and
 
     module use -a /lus/scratch/smartsim/local/modulefiles
     module load cudatoolkit/11.8 cudnn smartsim-deps/COMPILER_VERSION/SMARTSIM_VERSION
-    pip install smartsim[ml-cuda]
+    pip install smartsim[ml]
+    smart build --only_python_packages --device gpu [--onnx]
diff --git a/smartsim/_core/_cli/build.py b/smartsim/_core/_cli/build.py
@@ -32,15 +32,15 @@
 
 from tabulate import tabulate
 
-from smartsim._core._cli.utils import color_bool, SMART_LOGGER_FORMAT
+from smartsim._core._cli.utils import SMART_LOGGER_FORMAT, color_bool, pip
 from smartsim._core._install import builder
 from smartsim._core._install.buildenv import (
     BuildEnv,
+    DbEngine,
     SetupError,
     Version_,
-    Versioner,
-    DbEngine,
     VersionConflictError,
+    Versioner,
 )
 from smartsim._core._install.builder import BuildError
 from smartsim._core.config import CONFIG
@@ -244,16 +244,34 @@ def check_py_torch_version(versions: Versioner, device: _TDeviceStr = "cpu") ->
         else:
             raise BuildError("Unrecognized device requested")
 
-    _check_packages_in_python_env(
-        {
-            "torch": Version_(f"{versions.TORCH}{device_suffix}"),
-            "torchvision": Version_(f"{versions.TORCHVISION}{device_suffix}"),
-        },
+    torch_deps = {
+        "torch": Version_(f"{versions.TORCH}{device_suffix}"),
+        "torchvision": Version_(f"{versions.TORCHVISION}{device_suffix}"),
+    }
+    missing, conflicts = _assess_python_env(
+        torch_deps,
+        package_pinning="==",
         validate_installed_version=_create_torch_version_validator(
             with_suffix=device_suffix
         ),
     )
 
+    if len(missing) == len(torch_deps) and not conflicts:
+        # None of the PyTorch deps are installed and no python packages
+        # conflict with them. We can try to install torch deps into the current env.
+        logger.info(
+            "Torch version not found in python environment. "
+            "Attempting to install via `pip`"
+        )
+        pip(
+            "install",
+            "-f",
+            "https://download.pytorch.org/whl/torch_stable.html",
+            *(f"{package}=={version}" for package, version in torch_deps.items()),
+        )
+    elif missing or conflicts:
+        logger.warning(_format_incompatible_python_env_message(missing, conflicts))
+
 
 def _create_torch_version_validator(
     with_suffix: str,
@@ -297,20 +315,7 @@ def _check_packages_in_python_env(
     )
 
     if missing or conflicts:
-        indent = "\n\t"
-        fmt_list: t.Callable[[str, t.List[str]], str] = (
-            lambda n, l: f"{n}:{indent}{indent.join(l)}" if l else ""
-        )
-        missing_str = fmt_list("Missing", missing)
-        conflict_str = fmt_list("Conflicting", conflicts)
-        sep = "\n" if missing_str and conflict_str else ""
-        logger.warning(
-            "Python Env Status Warning!\n"
-            "Requested Packages are Missing or Conflicting:\n\n"
-            f"{missing_str}{sep}{conflict_str}"
-            "\n\nConsider installing packages at the requested versions via "
-            "`pip` or installing SmartSim with optional ML dependencies"
-        )
+        logger.warning(_format_incompatible_python_env_message(missing, conflicts))
 
 
 def _assess_python_env(
@@ -334,6 +339,26 @@ def _assess_python_env(
     return missing, conflicts
 
 
+def _format_incompatible_python_env_message(
+    missing: t.Iterable[str], conflicting: t.Iterable[str]
+) -> str:
+    indent = "\n\t"
+    fmt_list: t.Callable[[str, t.Iterable[str]], str] = (
+        lambda n, l: f"{n}:{indent}{indent.join(l)}" if l else ""
+    )
+    missing_str = fmt_list("Missing", missing)
+    conflict_str = fmt_list("Conflicting", conflicting)
+    sep = "\n" if missing_str and conflict_str else ""
+    return (
+        "Python Env Status Warning!\n"
+        "Requested Packages are Missing or Conflicting:\n\n"
+        f"{missing_str}{sep}{conflict_str}\n\n"
+        "Consider installing packages at the requested versions via `pip` or "
+        "uninstalling them, installing SmartSim with optional ML dependencies "
+        "(`pip install smartsim[ml]`), and running `smart clean && smart build ...`"
+    )
+
+
 def execute(args: argparse.Namespace) -> int:
     verbose = args.v
     keydb = args.keydb
@@ -376,21 +401,22 @@ def execute(args: argparse.Namespace) -> int:
         print(tabulate(vers, headers=version_names, tablefmt="github"), "\n")
 
     try:
-        # REDIS/KeyDB
-        build_database(build_env, versions, keydb, verbose)
-
-        # REDISAI
-        build_redis_ai(
-            build_env,
-            versions,
-            device,
-            pt,
-            tf,
-            onnx,
-            args.torch_dir,
-            args.libtensorflow_dir,
-            verbose=verbose,
-        )
+        if not args.only_python_packages:
+            # REDIS/KeyDB
+            build_database(build_env, versions, keydb, verbose)
+
+            # REDISAI
+            build_redis_ai(
+                build_env,
+                versions,
+                device,
+                pt,
+                tf,
+                onnx,
+                args.torch_dir,
+                args.libtensorflow_dir,
+                verbose=verbose,
+            )
     except (SetupError, BuildError) as e:
         logger.error(str(e))
         return 1
@@ -406,7 +432,7 @@ def execute(args: argparse.Namespace) -> int:
             check_py_tf_version(versions)
         if "onnxruntime" in backends:
             check_py_onnx_version(versions)
-    except SetupError as e:
+    except (SetupError, BuildError) as e:
         logger.error(str(e))
         return 1
 
@@ -430,6 +456,12 @@ def configure_parser(parser: argparse.ArgumentParser) -> None:
         choices=["cpu", "gpu"],
         help="Device to build ML runtimes for",
     )
+    parser.add_argument(
+        "--only_python_packages",
+        action="store_true",
+        default=False,
+        help="Only evaluate the python packages (i.e. skip building backends)",
+    )
     parser.add_argument(
         "--no_pt",
         action="store_true",

diff --git a/smartsim/_core/_cli/utils.py b/smartsim/_core/_cli/utils.py
@@ -26,11 +26,14 @@
 
 import importlib
 import shutil
+import subprocess as sp
+import sys
 import typing as t
 from argparse import ArgumentParser, Namespace
 from pathlib import Path
 
 from smartsim._core._install.buildenv import SetupError
+from smartsim._core._install.builder import BuildError
 from smartsim._core.utils import colorize
 from smartsim.log import get_logger
 
@@ -60,6 +63,16 @@ def color_bool(trigger: bool = True) -> str:
     return colorize(str(trigger), color=_color)
 
 
+def pip(*args: str) -> None:
+    cmd = (sys.executable, "-m", "pip") + args
+    with sp.Popen(cmd, stdout=sp.PIPE, stderr=sp.PIPE) as proc:
+        _, err = proc.communicate()
+        if int(proc.returncode) != 0:
+            raise BuildError(
+                f"`pip` returned with a non-zero exit code:\n{err.decode('utf-8')}"
+            )
+
+
 def clean(core_path: Path, _all: bool = False) -> int:
     """Remove pre existing installations of ML runtimes
 

diff --git a/smartsim/_core/_cli/validate.py b/smartsim/_core/_cli/validate.py
@@ -194,7 +194,7 @@ def _test_tf_install(client: Client, tmp_dir: str, device: _TCapitalDeviceStr) -
         model_path, inputs, outputs = recv_conn.recv()
     except EOFError as e:
         raise Exception(
-            "Failed to recieve serialized model from subprocess. "
+            "Failed to receive serialized model from subprocess. "
             "Is the `tensorflow` python package installed?"
         ) from e
 

diff --git a/smartsim/_core/_install/buildenv.py b/smartsim/_core/_install/buildenv.py
@@ -26,7 +26,6 @@
 
 # pylint: disable=invalid-name
 
-import itertools
 import importlib.metadata
 import os
 import platform
@@ -262,30 +261,6 @@ def get_defaults(self) -> t.Dict[str, str]:
         return self.defaults[self.version].copy()
 
 
-def _format_linux_torch_py_package_req(
-    arch: str, python_version: str, torch_version: str
-) -> str:
-    pyv_no_dot = python_version.replace(".", "")
-    return (
-        "torch"
-        # pylint: disable-next=line-too-long
-        f"  @ https://download.pytorch.org/whl/{arch}/torch-{torch_version}%2B{arch}-cp{pyv_no_dot}-cp{pyv_no_dot}-linux_x86_64.whl"
-        f'  ; python_version == "{python_version}" and sys_platform != "darwin"'
-    )
-
-
-def _format_linux_torchvision_py_package_req(
-    arch: str, python_version: str, torchvision_version: str
-) -> str:
-    pyv_no_dot = python_version.replace(".", "")
-    return (
-        "torchvision"
-        # pylint: disable-next=line-too-long
-        f"  @ https://download.pytorch.org/whl/{arch}/torchvision-{torchvision_version}%2B{arch}-cp{pyv_no_dot}-cp{pyv_no_dot}-linux_x86_64.whl"
-        f'  ; python_version == "{python_version}" and sys_platform != "darwin"'
-    )
-
-
 class Versioner:
     """Versioner is responsible for managing all the versions
     within SmartSim including SmartSim itself.
@@ -376,26 +351,8 @@ def ml_extras_required(self) -> t.Dict[str, t.List[str]]:
         """
         ml_defaults = self.REDISAI.get_defaults()
 
-        def _format_custom_linux_torch_deps(
-            torchv: str, torchvisionv: str, arch: str
-        ) -> t.Tuple[str, ...]:
-            # The correct versions and suffixes were scraped from
-            # https://pytorch.org/get-started/previous-versions/
-            supported_py_versions = ("3.8", "3.9", "3.10")
-            return tuple(
-                itertools.chain.from_iterable(
-                    (
-                        _format_linux_torch_py_package_req(arch, pyv, torchv),
-                        _format_linux_torchvision_py_package_req(
-                            arch, pyv, torchvisionv
-                        ),
-                    )
-                    for pyv in supported_py_versions
-                )
-            )
-
         # remove torch-related fields as they are subject to change
-        # by having the user set env vars
+        # depending on the user's target hardware (cpu/gpu)
         _torch_fields = [
             "torch",
             "torchvision",
@@ -405,25 +362,8 @@ def _format_custom_linux_torch_deps(
         for field in _torch_fields:
             ml_defaults.pop(field)
 
-        common = tuple(f"{lib}=={vers}" for lib, vers in ml_defaults.items())
         return {
-            "ml-cpu": [
-                *common,
-                # osx
-                f'torch=={self.TORCH} ; sys_platform == "darwin"',
-                f'torchvision=={self.TORCHVISION} ; sys_platform == "darwin"',
-                # linux
-                *_format_custom_linux_torch_deps(
-                    self.TORCH, self.TORCHVISION, self.TORCH_CPU_SUFFIX.lstrip("+")
-                ),
-            ],
-            "ml-cuda": [
-                *common,
-                # linux
-                *_format_custom_linux_torch_deps(
-                    self.TORCH, self.TORCHVISION, self.TORCH_CUDA_SUFFIX.lstrip("+")
-                ),
-            ],
+            "ml": [f"{lib}=={vers}" for lib, vers in ml_defaults.items()]
         }
 
     @staticmethod