Skip to content

Commit

Permalink
swancustomenvironments: If resolved, install requirements using uv
Browse files Browse the repository at this point in the history
When creating a custom environment, two types of requirements file
can now be provided:
1. High-level requirements, i.e. not fully resolved. These are provided by
a file called requirements.in.
2. Fully resolved requirements, i.e. all the necessary packages including
their dependencies and specific versions. These are provided by a file
called requirements.txt.

If (2) is present, the packages will be installed with uv for performance
reasons. Since a full resolution has already been done, it is safe to do
so (we don't depend on the resolution of uv). If (2) is not present but
(1) is, the packages will be installed with pip — likely slower, but with
pip's dependency resolution. This guarantees a more stable solution in the
medium to long term.
  • Loading branch information
rodrigo-sobral authored and etejedor committed Nov 28, 2024
1 parent 36d7172 commit 220b804
Show file tree
Hide file tree
Showing 3 changed files with 70 additions and 28 deletions.
Original file line number Diff line number Diff line change
@@ -1,24 +1,26 @@
#!/bin/bash

# NOTE(review): this span is a unified-diff capture with the +/- markers
# stripped, so removed (pre-commit) and added (post-commit) lines appear
# interleaved below; it is not a runnable script as-is.
# If using NXCALS, we need to install the Spark extensions and the nxcals package.
if [ -n "${INSTALL_NXCALS}" ]; then
# Added lines: seed a separate "middle layer" venv with uv, install ipykernel
# into it, record its site-packages path, then deactivate before touching the
# user environment.
# Create a middle layer for installing ipykernel, putting it apart from the user environment
uv venv $SWAN_ENV --seed 2>&1
source $SWAN_ENV/bin/activate
uv pip install "ipykernel==${IPYKERNEL_VERSION}"
SWAN_PACKAGES_PATH=$(python3 -c 'import sysconfig; print(sysconfig.get_paths()["purelib"])')
deactivate

# If using NXCALS, we need to also install the Spark packages and their dependencies in the SWAN environment
if [ -n "${USE_NXCALS}" ]; then
# Pin the Spark extension versions to whatever is importable in the base image.
SPARKCONNECTOR="sparkconnector==$(python -c 'import sparkconnector; print(sparkconnector.__version__)')"
SPARKMONITOR="sparkmonitor==$(python -c 'import sparkmonitor; print(sparkmonitor.__version__)')"
NXCALS="nxcals"
SPARKCONNECTOR_DEPENDENCIES="swanportallocator requests" # TODO: Remove swanportallocator and requests installation when the SparkConnector package gets properly updated

# Removed lines (presumably — matches the commit message): the old code built
# the middle layer here with plain `python -m venv` instead of uv.
# Create a middle layer for installing Spark extensions, putting them apart from the user environment
SWAN_ENV="${HOME}/swan"
python -m venv ${SWAN_ENV} 2>&1
source ${SWAN_ENV}/bin/activate
SWAN_PACKAGES_PATH=$(python3 -c 'import sysconfig; print(sysconfig.get_paths()["purelib"])')

# Removed: pip-based install of sparkmonitor + connector dependencies.
pip install ${SPARKMONITOR} ${SPARKCONNECTOR_DEPENDENCIES} 2>&1
# Added: same install, now done with uv inside the reactivated SWAN env.
# Activate the SWAN environment for installing the Spark packages
source $SWAN_ENV/bin/activate
uv pip install ${SPARKMONITOR} ${SPARKCONNECTOR_DEPENDENCIES} 2>&1

# -------------- HACK SECTION --------------
# Install SPARKCONNECTOR_DEPENDENCIES separately, install SparkConnector without its dependencies and change the configuration file
# TODO: Remove this when the SparkConnector package gets properly updated
# Removed (pip) vs added (uv) variants of the no-deps SparkConnector install:
pip install ${SPARKCONNECTOR} --no-deps 2>&1
uv pip install ${SPARKCONNECTOR} --no-deps 2>&1
# Overwrite the connector's configuration file with the swan-on-tn branch copy.
wget https://raw.githubusercontent.com/swan-cern/jupyter-extensions/refs/heads/swan-on-tn/SparkConnector/sparkconnector/configuration.py -O ${SWAN_PACKAGES_PATH}/sparkconnector/configuration.py 2>&1
fi

Expand All @@ -31,16 +33,23 @@ _log "Setting up the environment..."
ACTIVATE_ENV_CMD="source ${ENV_PATH}/bin/activate"
eval "${ACTIVATE_ENV_CMD}"

# NOTE(review): diff capture — the old single pip line and the new
# RESOLVED_REQ branch both appear below; +/- markers were stripped.
# Install packages in the environment and the same ipykernel that the Jupyter server uses
# Install user-requested packages in the environment.
# Use uv for better performance if environment is fully resolved;
# Otherwise, use pip for resolution (more reliable long-term).
_log "Installing packages from ${REQ_PATH}..."
# Removed: one-shot pip install of requirements + ipykernel (+ optional nxcals).
pip install -r "${REQ_PATH}" "ipykernel==${IPYKERNEL_VERSION}" ${NXCALS} 2>&1
# Added: choose the installer based on whether requirements are fully resolved.
if [ "${RESOLVED_REQ}" = true ]; then
# Use the same pip configuration as the Acc-Py default pip
ACCPY_PIP_CONF="-i $(pip config get global.index-url) --allow-insecure-host $(pip config get global.trusted-host)"
uv pip install ${ACCPY_PIP_CONF} -r "${REQ_PATH}" 2>&1
else
pip install -r "${REQ_PATH}" 2>&1
fi
# Abort the build if the install step above failed.
if [ $? -ne 0 ]; then
exit 1
fi

# Inject middle layer packages into the user environment by adding a .pth file to
# the environment site-packages that contains the path to the middle layer site-packages
# Removed: the .pth injection used to be guarded by INSTALL_NXCALS...
if [ -n "${INSTALL_NXCALS}" ]; then
USER_PACKAGES_PATH=$(python3 -c 'import sysconfig; print(sysconfig.get_paths()["purelib"])')
echo ${SWAN_PACKAGES_PATH} > ${USER_PACKAGES_PATH}/$(basename ${SWAN_ENV}).pth
fi
# ...Added: now it runs unconditionally.
USER_PACKAGES_PATH=$(python3 -c 'import sysconfig; print(sysconfig.get_paths()["purelib"])')
echo ${SWAN_PACKAGES_PATH} > ${USER_PACKAGES_PATH}/$(basename $SWAN_ENV).pth

29 changes: 25 additions & 4 deletions SwanCustomEnvironments/swancustomenvironments/scripts/builders/venv.sh
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,16 +1,37 @@
#!/bin/bash

# NOTE(review): diff capture of venv.sh — removed and added lines interleave
# below because the +/- markers were stripped from the diff.
# Removed: the user environment used to be created directly with python3 -m venv.
# Create the environment
python3 -m venv ${ENV_PATH} 2>&1
# Added: build a uv-seeded middle layer holding ipykernel, and remember its
# site-packages path for the .pth injection at the bottom of the script.
# Create a middle layer for installing ipykernel, putting it apart from the user environment
uv venv $SWAN_ENV --seed 2>&1
source $SWAN_ENV/bin/activate
uv pip install "ipykernel==${IPYKERNEL_VERSION}"
SWAN_PACKAGES_PATH=$(python3 -c 'import sysconfig; print(sysconfig.get_paths()["purelib"])')
deactivate

# Added: pick the venv builder by resolution state (uv when fully resolved).
if [ "${RESOLVED_REQ}" = true ]; then
uv venv ${ENV_PATH} --seed 2>&1
else
python -m venv ${ENV_PATH} 2>&1
fi

# Activate the environment
_log "Setting up the environment..."
ACTIVATE_ENV_CMD="source ${ENV_PATH}/bin/activate"
eval "${ACTIVATE_ENV_CMD}"

# Install packages in the environment and the same ipykernel that the Jupyter server uses
# Install user-requested packages in the environment.
# Use uv for better performance if environment is fully resolved;
# Otherwise, use pip for resolution (more reliable long-term).
_log "Installing packages from ${REQ_PATH}..."
# Removed (pip + pinned ipykernel) vs added (uv/pip split on RESOLVED_REQ):
pip install -r "${REQ_PATH}" "ipykernel==${IPYKERNEL_VERSION}" 2>&1
if [ "${RESOLVED_REQ}" = true ]; then
uv pip install -r "${REQ_PATH}" 2>&1
else
pip install -r "${REQ_PATH}" 2>&1
fi
# Abort the build if the install step above failed.
if [ $? -ne 0 ]; then
exit 1
fi

# Inject middle layer packages into the user environment by adding a .pth file to
# the environment site-packages that contains the path to the middle layer site-packages
USER_PACKAGES_PATH=$(python3 -c 'import sysconfig; print(sysconfig.get_paths()["purelib"])')
echo ${SWAN_PACKAGES_PATH} > ${USER_PACKAGES_PATH}/$(basename $SWAN_ENV).pth
26 changes: 19 additions & 7 deletions SwanCustomEnvironments/swancustomenvironments/scripts/makenv.sh
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ while [ $# -gt 0 ]; do
# NOTE(review): diff capture of the makenv.sh option parser — the --nxcals
# arm shows both the removed (INSTALL_NXCALS) and added (USE_NXCALS) flag names.
shift
;;
--nxcals)
INSTALL_NXCALS=true
USE_NXCALS=true
shift
;;
--help|-h)
Expand Down Expand Up @@ -157,13 +157,25 @@ fi
# Create and set up the environment

ENV_PATH="/home/$USER/${ENV_NAME}"
# NOTE(review): diff capture — removed lines (single REQ_PATH, hard failure on
# a missing requirements.txt) and added lines (SWAN_ENV path, RESOLVED_REQ
# detection) interleave below; +/- markers were stripped.
# Removed: requirements.txt was the only accepted input.
REQ_PATH="${TMP_REPO_PATH}/requirements.txt"
# Added: fixed location of the uv middle-layer env, plus the ipykernel version
# of the Jupyter server (pinned so the user kernel matches it).
SWAN_ENV="/home/$USER/swan"
IPYKERNEL_VERSION=$(python -c "import ipykernel; print(ipykernel.__version__)")

# Removed: unconditional error when requirements.txt was absent.
# Check if requirements.txt exists in the repository
if [ ! -f "${REQ_PATH}" ]; then
_log "ERROR: Requirements file not found (${REQ_PATH})."
exit 1
# Added: accept requirements.txt (fully resolved) or requirements.in (high-level).
if [ -f "${TMP_REPO_PATH}/requirements.txt" ]; then
# Fully resolved requirements (requirements.txt) take precedence
RESOLVED_REQ=true
REQ_PATH="${TMP_REPO_PATH}/requirements.txt"
elif [ -f "${TMP_REPO_PATH}/requirements.in" ]; then
# If only requirements.in is present, proceed with high-level requirements
RESOLVED_REQ=false
REQ_PATH="${TMP_REPO_PATH}/requirements.in"
else
# There are no requirements files (neither requirements.txt nor requirements.in) in the repository
_log "ERROR: No requirements file found. You must provide a requirements.in or requirements.txt file." && exit 1
fi

# Check if the requirements file contains the nxcals package, if the user activated the nxcals option
if [ -n "${USE_NXCALS}" ] && ! grep -q "nxcals" "${REQ_PATH}"; then
_log "ERROR: The NXCALS cluster was selected but the requirements file (${REQ_PATH}) does not contain the nxcals package." && exit 1
fi

_log "Creating environment ${ENV_NAME} using ${BUILDER}${BUILDER_VERSION:+ (${BUILDER_VERSION})}..."
Expand All @@ -182,7 +194,7 @@ ln -f -s ${KERNEL_JSON} /home/$USER/.local/share/jupyter/kernels/python3/kernel.

# For NXCALS, configure the environment kernel and terminal with some variables to
# ensure the connection with the cluster works properly.
# NOTE(review): diff capture — removed INSTALL_NXCALS guard vs added USE_NXCALS guard.
if [ -n "${INSTALL_NXCALS}" ]; then
if [ -n "${USE_NXCALS}" ]; then
# Kernel configuration
# - SPARK_HOME: needed to point to the SPARK installation provided by the nxcals package
# - PYSPARK_PYTHON: needed to point to the Python executable in the environment shipped to the cluster
Expand Down

0 comments on commit 220b804

Please sign in to comment.