# check_log_level()
# Compare a log level against a reference log level.
# Arguments:
#   $1 - Name of the log level to test (default: INFO)
#   $2 - Name of the reference log level (optional;
#        default: $HYAKVNC_LOG_LEVEL, or INFO if that is unset)
# Globals:
#   Log_Levels        - (read) associative array mapping level names to numbers
#   HYAKVNC_LOG_LEVEL - (read) used as the reference when $2 is not given
# Returns:
#   0 if both names are known and the number for $1 is >= the number for
#   the reference level; 1 otherwise (including unknown level names)
function check_log_level() {
  # Plain (non-integer) locals on purpose: with `local -i`, an unknown level
  # would be coerced to 0 and could never be detected as "empty".
  local loglevel refloglevel loglevelno refloglevelno
  loglevel="${1:-INFO}"
  refloglevel="${2:-${HYAKVNC_LOG_LEVEL:-INFO}}"

  loglevelno="${Log_Levels[${loglevel}]:-}"
  refloglevelno="${Log_Levels[${refloglevel}]:-}"

  # Reject names that are not keys of Log_Levels
  [[ -n "${loglevelno}" && -n "${refloglevelno}" ]] || return 1

  # Compare the locals resolved above (not a caller's `levelno`)
  ((loglevelno >= refloglevelno))
}
# check_command()
# Check if a command is available
# Arguments:
#   $1    - The command to check
#   $2... - Passed to log() if the command is not available (optional);
#           log() takes the level first, e.g. `check_command sacctmgr ERROR`
# Returns: 0 if `command -v` finds the command, 1 otherwise (or if $1 is empty)
function check_command() {
  # `--` keeps a name that starts with `-` from being read as an option
  # to the `command` builtin.
  if [[ -z "${1:-}" ]] || ! command -v -- "${1}" >/dev/null 2>&1; then
    # Only call log() when the caller supplied something to log
    (($# > 1)) && log "${@:2}"
    return 1
  fi
  return 0
}
+HYAKVNC_DEFAULT_TIMEOUT="${HYAKVNC_DEFAULT_TIMEOUT:-30}" # %% Seconds to wait for most commands to complete before timing out (default: `30`) HYAKVNC_SSH_HOST="${HYAKVNC_SSH_HOST:-localhost}" # %% Default SSH host to use for connection strings (default: `localhost` if backend is apptainer or docker, `klone.hyak.uw.edu` if backend is `klone`) @@ -129,26 +152,16 @@ HYAKVNC_VNC_DISPLAY="${HYAKVNC_VNC_DISPLAY:-:10}" # %% VNC display to use HYAKVNC_MACOS_VNC_VIEWER_BUNDLEIDS="${HYAKVNC_MACOS_VNC_VIEWER_BUNDLEIDS:-com.turbovnc.vncviewer.VncViewer com.turbovnc.vncviewer}" # macOS bundle identifiers for VNC viewer executables (default: `com.turbovnc.vncviewer com.realvnc.vncviewer com.tigervnc.vncviewer`) # ## Apptainer preferences: -HYAKVNC_APPTAINER_CONTAINERS_DIR="${HYAKVNC_APPTAINER_CONTAINERS_DIR:-}" # %% Directory to look for apptainer containers (default: (none)) -HYAKVNC_APPTAINER_GHCR_ORAS_PRELOAD="${HYAKVNC_APPTAINER_GHCR_ORAS_PRELOAD:-1}" # %% Whether to preload SIF files from the ORAS GitHub Container Registry (default: `0`) -HYAKVNC_APPTAINER_BIN="${HYAKVNC_APPTAINER_BIN:-apptainer}" # %% Name of apptainer binary (default: `apptainer`) -HYAKVNC_APPTAINER_CONTAINER="${HYAKVNC_APPTAINER_CONTAINER:-}" # %% Path to container image to use (default: (none; set by `--container` option)) -HYAKVNC_APPTAINER_APP_VNCSERVER="${HYAKVNC_APPTAINER_APP_VNCSERVER:-vncserver}" # %% Name of app in the container that starts the VNC session (default: `vncserver`) +HYAKVNC_APPTAINER_CONTAINERS_DIR="${HYAKVNC_APPTAINER_CONTAINERS_DIR:-}" # %% Directory to look for apptainer containers (default: (none)) APPTAINER_WRITABLE_TMPFS="${APPTAINER_WRITABLE_TMPFS:-1}" # %% Whether to mount a writable tmpfs at /tmp (default: `1`) APPTAINER_CONTAIN="${APPTAINER_CONTAIN:-1}" # %% Whether to run the container in a separate process (default: `1`) -HYAKVNC_JOB_PREFIX="${HYAKVNC_JOB_PREFIX:-hyakvnc-}" # %% Prefix to use for hyakvnc job names (default: `hyakvnc-`) # ## Slurm preferences: if [[ 
"${HYAKVNC_BACKEND}" == "slurm" ]]; then - HYAKVNC_SLURM_SUBMIT_TIMEOUT="${HYAKVNC_SLURM_SUBMIT_TIMEOUT:-120}" # %% Seconds after submitting job to wait for the job to start before timing out (default: `120`) - HYAKVNC_SLURM_OUTPUT_DIR="${SBATCH_OUTPUT:-${HYAKVNC_DIR}/${HYAKVNC_BACKEND}/slurm-output}" # %% Directory to store SLURM output files (default: `$HYAKVNC_DIR/slurm/slurm-output`) - SBATCH_OUTPUT="${SBATCH_OUTPUT:-${HYAKVNC_SLURM_OUTPUT_DIR}/%j.out}}" # %% Where to send SLURM job output (default: `$HYAKVNC_SLURM_OUTPUT_DIR/%j.out`) - SBATCH_ACCOUNT="${SBATCH_ACCOUNT:-}" # %% Slurm account to use (default: (autodetected)) - SBATCH_PARTITION="${SBATCH_PARTITION:-}" # %% Slurm partition to use (default: (autodetected)) - SBATCH_CLUSTERS="${SBATCH_CLUSTERS:-}" # %% Slurm cluster to use (default: (autodetected)) - SBATCH_GPUS="${SBATCH_GPUS:-}" # %% Number of GPUs to request (default: (none)) - + HYAKVNC_SLURM_SUBMIT_TIMEOUT="${HYAKVNC_SLURM_SUBMIT_TIMEOUT:-120}" # %% Seconds after submitting job to wait for the job to start before timing out (default: `120`) + HYAKVNC_SLURM_OUTPUT_DIR="${SBATCH_OUTPUT:-${HYAKVNC_DIR}/jobs/slurm/slurm-output}" # %% Directory to store SLURM output files (default: `$HYAKVNC_DIR/slurm/slurm-output`) + SBATCH_OUTPUT="${SBATCH_OUTPUT:-${HYAKVNC_SLURM_OUTPUT_DIR}/%j.out}}" # %% Where to send SLURM job output (default: `$HYAKVNC_SLURM_OUTPUT_DIR/%j.out`) if [[ "${SBATCH_CLUSTERS:-}" == "klone" ]]; then if [[ -z "${APPTAINER_CACHEDIR:-}" ]] && [[ -d "/gscratch/scrubbed" ]]; then APPTAINER_CACHEDIR="/gscratch/scrubbed/${USER}/cache/apptainer" @@ -288,12 +301,9 @@ function hyakvnc_autoupdate() { log DEBUG "Checking for updates because the last check was more than ${update_frequency_value}${update_frequency_unit} ago." fi - hyakvnc_check_updates || { - log DEBUG "No updates found." 
# relativize_date()
# Convert a date to a relative date
# Arguments:
#   $1 - Date to describe (any string accepted by `date -d`)
#   $2 - Date to measure from (optional; default: the current time)
# Stdout: Relative date, e.g. "1h 2m 3s ago" or "1d 0h 0m 0s from now";
#         "now" when both dates fall on the same second
# Returns: 0 on success, 1 on a wrong argument count or an unparseable date
function relativize_date() {
  local date_a date_base
  local -i diff_seconds abs_seconds days hours minutes seconds
  (($# == 1 || $# == 2)) || return 1

  # Resolve the base date; an explicit if/else so that the two-argument
  # form is not rejected and a bad $2 is reported as a failure.
  if (($# == 2)); then
    date_base="$(date -d "$2" '+%s')" || return 1
  else
    date_base="$(date '+%s')" || return 1
  fi
  date_a="$(date -d "$1" '+%s')" || return 1

  # Calculate the difference in seconds
  diff_seconds=$((date_a - date_base))
  if ((diff_seconds == 0)); then
    printf 'now\n'
    return 0
  fi

  # Work on the magnitude; the sign only selects the suffix below
  abs_seconds=$((diff_seconds < 0 ? -diff_seconds : diff_seconds))
  days=$((abs_seconds / 86400))
  hours=$(((abs_seconds % 86400) / 3600))
  minutes=$(((abs_seconds % 3600) / 60))
  seconds=$((abs_seconds % 60))

  # Start the output at the largest non-zero unit
  if ((days != 0)); then
    printf '%dd %dh %dm %ds' "${days}" "${hours}" "${minutes}" "${seconds}"
  elif ((hours != 0)); then
    printf '%dh %dm %ds' "${hours}" "${minutes}" "${seconds}"
  elif ((minutes != 0)); then
    printf '%dm %ds' "${minutes}" "${seconds}"
  else
    printf '%ds' "${seconds}"
  fi

  if ((diff_seconds < 0)); then
    printf ' ago'
  else
    printf ' from now'
  fi
  printf '\n'
  return 0
}
clusters -# Returns: 0 if successful, 1 otherwise function slurm_list_clusters() { check_command sacctmgr ERROR || return 1 local clusters max_count @@ -485,36 +541,6 @@ function slurm_get_default_account() { return 0 } -# klone_read_qos() -# Return the correct QOS on Hyak for the given partition on hyak -# Logic copied from hyakalloc's hyakqos.py:QosResource.__init__(): -# Arguments: -# shellcheck disable=SC2120 -function klone_read_qos() { - local qos_name="${1:-$(/dev/null || true)" - [[ -z "${repo_token}" ]] && { - log ERROR "Failed to get token for repository ${repo}" - return 1 - } - - # Request the manifest for the image tag: - local manifest - manifest="$(curl -sSL \ - -H "Accept: application/vnd.oci.image.manifest.v1+json" \ - -H "Authorization: Bearer ${repo_token}" \ - "https://ghcr.io/v2/${repo}/manifests/${image_tag}" \ - 2>/dev/null || true)" - [[ -z "${manifest}" ]] && { - log ERROR "Failed to get manifest for repository ${repo}" - return 1 - } - - local image_sha256 - image_sha256="$(echo "${manifest}" | python3 -I -c \ - 'import sys, json; s=[ x for x in json.load(sys.stdin)["layers"] if x.get("mediaType", "") == "application/vnd.sylabs.sif.layer.v1.sif" and x.get("digest", "").startswith("sha256")]; sys.exit(1) if len(s) != 1 else print(s[0]["digest"])' \ - 2>/dev/null || true)" - [[ -z "${image_sha256:-}" ]] && { - log ERROR "Failed to get image info for repository ${repo}" - return 1 - } - [[ -d "${output_path}" ]] && output_path="${output_path}/${image_sha256}" # Append the image SHA256 to the output path if it's a directory - - if [[ -r "${output_path}" ]]; then - log DEBUG "Image already exists at ${output_path}" - if check_command sha256sum; then - if sha256sum --quiet --status --ignore-missing --check <(echo "${image_sha256##sha256:}" "${output_path}"); then - log DEBUG "Image at ${output_path} matches expected SHA256 ${image_sha256}" - echo "${output_path}" - return 0 - else - log DEBUG "Image at ${output_path} does not match expected 
SHA256 ${image_sha256}. Will redownload and overwrite." - fi - fi - fi - - # Download the image: - local image_url - image_url="https://ghcr.io/v2/${repo}/blobs/${image_sha256}" - curl --progress -fSL -H "Authorization: Bearer ${repo_token}" -o "${output_path}" "${image_url}" || { - log ERROR "Failed to download image from ${image_url} to ${output_path}" - rm -f "${output_path}" && log DEBUG "Removed output file at ${output_path}" # Remove the file if it exists - return 1 - } - chmod +x "${output_path}" - log DEBUG "Downloaded image to ${output_path}" - echo "${output_path}" - return 0 -} - # # Commands # ## Command: create @@ -878,7 +802,6 @@ Options: -c, --container Path to container image (required) Apptainer options: - --no-ghcr-oras-preload Don't preload ORAS GitHub Container Registry images --apptainer-args Any arguments after this will be passed to apptainer. SLURM options (available when running on SLURM login node): @@ -912,17 +835,10 @@ function cmd_create() { help_create return 0 ;; - -d | --debug) # Debug mode - export HYAKVNC_LOG_LEVEL="DEBUG" - ;; -c | --container) shift || { log ERROR "$1 requires an argument"; return 1; } export HYAKVNC_APPTAINER_CONTAINER="${1:-}" ;; - --no-ghcr-oras-preload) # Don't preload ORAS GitHub Container Registry images - shift || { log ERROR "$1 requires an argument"; return 1; } - export HYAKVNC_APPTAINER_GHCR_ORAS_PRELOAD=0 - ;; --apptainer-args) # Args to pass to Apptainer shift while (($# > 0)); do @@ -948,6 +864,8 @@ function cmd_create() { shift done + + [[ -z "${HYAKVNC_APPTAINER_CONTAINER}" ]] && { log ERROR "Container image must be specified"; return 1; } container_basename="$(basename "${HYAKVNC_APPTAINER_CONTAINER}")" || { log ERROR "Failed to get container basename from \"${HYAKVNC_APPTAINER_CONTAINER}\""; return 1; } [[ -z "${container_basename}" ]] && { log ERROR "The basename for the container \"${HYAKVNC_APPTAINER_CONTAINER}\"" is empty; return 1; } @@ -959,18 +877,6 @@ function cmd_create() { # Add a tag 
if none is specified: [[ "${container_basename}" =~ .*:.* ]] || HYAKVNC_APPTAINER_CONTAINER="${HYAKVNC_APPTAINER_CONTAINER}:latest" - ;;& # Fallthrough - oras://*) - if [[ "${HYAKVNC_APPTAINER_CONTAINER}" =~ ^oras:// ]] && [[ "${HYAKVNC_APPTAINER_GHCR_ORAS_PRELOAD:-1}" == 1 ]]; then - local oras_cache_dir - oras_cache_dir="${APPTAINER_CACHEDIR:-${HOME}/.apptainer/cache}/oras" - if mkdir -p "${oras_cache_dir}"; then - log INFO "Preloading ORAS image for \"${HYAKVNC_APPTAINER_CONTAINER}\"" - ghcr_get_oras_sif "${HYAKVNC_APPTAINER_CONTAINER}" "${APPTAINER_CACHEDIR}/cache/oras" || log ERROR "Failed to preload ORAS image for \"${HYAKVNC_APPTAINER_CONTAINER}\". Apptainer will try to download the image by itself. If you don't want to preload ORAS images, use the --no-ghcr-oras-preload option." - else - log ERROR "Failed to create ORAS cache directory \"${oras_cache_dir}\"" - fi - fi ;; *) @@ -996,9 +902,8 @@ function cmd_create() { case "${HYAKVNC_BACKEND:-}" in apptainer) local timestr - timestr=$(date --utc '+%Y%m%d%H%M%S') || { log ERROR "Failed to get date"; timestr=0; } - HYAKVNC_JOB_DIR=$(mktemp --tmpdir="${HYAKVNC_JOBS_DIR}" --directory "${timestr:-0}-XXX") || { log ERROR "Failed to create job directory"; return 1; } - + timestr=$(date '+%s') || { log ERROR "Failed to get date"; timestr=0; } + HYAKVNC_JOB_DIR=$(mktemp --tmpdir="${HYAKVNC_JOBS_DIR}" --directory "${HYAKVNC_JOB_PREFIX}${timestr:-0}-XXX") || { log ERROR "Failed to create job directory"; return 1; } HYAKVNC_JOB_ID=$(basename "${HYAKVNC_JOB_DIR}") export HYAKVNC_JOB_DIR HYAKVNC_JOB_ID @@ -1013,17 +918,17 @@ function cmd_create() { apptainer_args+=("${HYAKVNC_APPTAINER_CONTAINER}") apptainer_args+=("${HYAKVNC_JOB_ID}") - log DEBUG "Launching job with command: \"apptainer instance start ${apptainer_args[*]}\"" + log DEBUG "Launching job with command: \"apptainer instance run ${apptainer_args[*]}\"" # shellcheck disable=SC2016 [[ -z "${XNOTRAP:-}" ]] && trap 'log WARN "Interrupted while running 
\"${BASH_COMMAND:-}\". Cleaning up and exiting!"; apptainer instance stop "${HYAKVNC_JOB_ID}" && [[ -n "${HYAKVNC_JOB_DIR:-}" ]] && [[ -d "${HYAKVNC_JOB_DIR}" ]] && rm -rf "${HYAKVNC_JOB_DIR}" && log INFO "Removed job directory ${HYAKVNC_JOB_DIR}"; trap - SIGINT SIGTSTP ERR EXIT; exit 1' SIGINT SIGTSTP ERR EXIT - apptainer instance start "${apptainer_args[@]}" || { log ERROR "Failed to launch job"; return 1; } + apptainer instance run "${apptainer_args[@]}" || { log ERROR "Failed to launch job"; return 1; } # Wait for job to start running by monitoring the output of squeue: log INFO "Waiting for job ${HYAKVNC_JOB_ID} (\"${HYAKVNC_JOB_NAME}\") to start" # Get the path to the instance log file: - instance_err_log="$(apptainer instance list -l "${HYAKVNC_JOB_ID}" | grep -m 1 -oE '\/.*\.err$' || true)" + instance_err_log="$(apptainer instance list -l "${HYAKVNC_JOB_NAME}" | grep -m 1 -oE '\/.*\.err$' || true)" ;; slurm) @@ -1084,7 +989,7 @@ function cmd_create() { log INFO "Waiting for job ${HYAKVNC_JOB_ID} (\"${HYAKVNC_JOB_NAME}\") to start" # shellcheck disable=SC2016 - [[ -z "${XNOTRAP:-}" ]] && 'log WARN "Interrupted while running \"${BASH_COMMAND:-}\". Cleaning up and exiting!"; scancel --quiet "${HYAKVNC_JOB_ID}" && [[ -n "${HYAKVNC_JOB_DIR:-}" ]] && [[ -d "${HYAKVNC_JOB_DIR}" ]] && rm -rf "${HYAKVNC_JOB_DIR}" && log INFO "Removed job directory ${HYAKVNC_JOB_DIR}"; trap - RETURN SIGINT SIGTSTP ERR EXIT; return 1' RETURN SIGINT SIGTSTP ERR EXIT + [[ -z "${XNOTRAP:-}" ]] && 'log WARN "Interrupted while running \"${BASH_COMMAND:-}\". 
Cleaning up and exiting!"; scancel --quiet "${HYAKVNC_JOB_ID}" && [[ -n "${HYAKVNC_JOB_DIR:-}" ]] && [[ -d "${HYAKVNC_JOB_DIR}" ]] && rm -rf "${HYAKVNC_JOB_DIR}" && log INFO "Removed job directory ${HYAKVNC_JOB_DIR}"; trap - SIGINT SIGTSTP ERR EXIT; return 1' SIGINT SIGTSTP ERR EXIT local starttime="${SECONDS:-0}" while true; do @@ -1190,12 +1095,8 @@ Description: Options: -h, --help Show this help message and exit - -d, --debug Print debug info - -j, --jobid Only check status of provided SLURM job ID (optional) Examples: - # Check the status of job no. 12345: - hyakvnc status -j 12345 # Check the status of all VNC jobs: hyakvnc status EOF @@ -1203,17 +1104,13 @@ EOF # cmd_status() function cmd_status() { - local running_jobid running_jobids + local running_jobid running_jobids headerstr rowsstr='' while true; do case ${1:-} in -h | --help) help_status return 0 ;; - -d | --debug) # Debug mode - shift - export HYAKVNC_LOG_LEVEL=DEBUG - ;; -j | --jobid) # Job ID to attach to (optional) shift running_jobid="${1:-}" @@ -1228,43 +1125,78 @@ function cmd_status() { ;; esac done - # Loop over directories in ${HYAKVNC_DIR}/jobs - squeue_args=(--me --states=RUNNING --noheader --format '%j %i') - [[ -n "${running_jobid:-}" ]] && squeue_args+=(--job "${running_jobid}") - running_jobids=$(squeue "${squeue_args[@]}" | grep -E "^${HYAKVNC_JOB_PREFIX}" | grep -oE '[0-9]+$') || { - log WARN "Found no running job IDs with names that match the set job name prefix ${HYAKVNC_JOB_PREFIX}" - return 1 - } - [[ -z "${running_jobids:-}" ]] && { - log WARN "Found no running job IDs with names that match the prefix ${HYAKVNC_JOB_PREFIX}" - return 1 + + headerstr="ID\tCreated\tState\tName" + + case "${HYAKVNC_BACKEND:-apptainer}" in + apptainer) + local adds s + s="$(apptainer instance list "${HYAKVNC_JOB_PREFIX:-}"'*' || true)" + [[ -z "${s:-}" ]] && { log INFO "Found no running jobs with names that match the prefix \"${HYAKVNC_JOB_PREFIX}\""; return 0; } + local -i i=0 + while IFS= read -r; 
do + local pid created state name id rest relative_created + ((i++ > 0)) || continue + read -r name pid rest <<<"${REPLY}" || return 1 + created="$(date --date "$(ps -p "${pid}" -o lstart= || true)" -Is)" || return 1 + state="$(ps -p "${pid}" -o state=)" || return 1 + relative_created="$(relativize_date "${created}")" || return 1 + + id="${pid}--${name}" + printf -v adds "%s\t%s\t%s\t%s\n" "${id}" "${relative_created}" "${state}" "${name}" + rowsstr+="${adds}" + done <<<"${s}" + ;; + + slurm) + + # Loop over directories in ${HYAKVNC_DIR}/jobs + squeue_args=(--me --states=RUNNING --noheader --format '%j %i') + [[ -n "${running_jobid:-}" ]] && squeue_args+=(--job "${running_jobid}") + running_jobids=$(squeue "${squeue_args[@]}" | grep -E "^${HYAKVNC_JOB_PREFIX}" | grep -oE '[0-9]+$') || { + log WARN "Found no running job IDs with names that match the set job name prefix ${HYAKVNC_JOB_PREFIX}" + return 1 + } + [[ -z "${running_jobids:-}" ]] && { + log INFO "Found no running job IDs with names that match the prefix ${HYAKVNC_JOB_PREFIX}" + return 0 + } + + for running_jobid in ${running_jobids:-}; do + local running_job_node jobdir + running_job_node=$(squeue --job "${running_jobid}" --format "%N" --noheader --states=RUNNING) || { + log WARN "Failed to get node for job ${running_jobid}" + continue + } + [[ -z "${running_job_node}" ]] && { + log WARN "Failed to get node for job ${running_jobid}" + continue + } + jobdir="${HYAKVNC_DIR}/jobs/${running_jobid}" + [[ ! -d "${HYAKVNC_JOB_DIR}" ]] && { + log WARN "Job directory ${HYAKVNC_JOB_DIR} does not exist" + continue + } + [[ ! -e "${HYAKVNC_JOB_DIR}/vnc/socket.uds" ]] && { + log WARN "Job socket not found at ${HYAKVNC_JOB_DIR}/vnc/socket.uds" + continue + } + [[ ! 
-S "${HYAKVNC_JOB_DIR}/vnc/socket.uds" ]] && { + log WARN "Job socket at ${HYAKVNC_JOB_DIR}/vnc/socket.uds is not a socket" + continue + } + echo "HyakVNC job ${running_jobid} is running on node ${running_job_node}" + done + ;; + *) log ERROR "Unsupported \$HYAKVNC_BACKEND backend \"${HYAKVNC_BACKEND}\""; return 1 ;; + esac + + [[ -z "${rowsstr:-}" ]] && { + log INFO "Found no running jobs with names that match the prefix \"${HYAKVNC_JOB_PREFIX}\"" + return 0 } + printf '%b\n%b' "${headerstr:-}" "${rowsstr:-}" | column -t -s $'\t' - for running_jobid in ${running_jobids:-}; do - local running_job_node jobdir - running_job_node=$(squeue --job "${running_jobid}" --format "%N" --noheader --states=RUNNING) || { - log WARN "Failed to get node for job ${running_jobid}" - continue - } - [[ -z "${running_job_node}" ]] && { - log WARN "Failed to get node for job ${running_jobid}" - continue - } - jobdir="${HYAKVNC_DIR}/jobs/${running_jobid}" - [[ ! -d "${HYAKVNC_JOB_DIR}" ]] && { - log WARN "Job directory ${HYAKVNC_JOB_DIR} does not exist" - continue - } - [[ ! -e "${HYAKVNC_JOB_DIR}/vnc/socket.uds" ]] && { - log WARN "Job socket not found at ${HYAKVNC_JOB_DIR}/vnc/socket.uds" - continue - } - [[ ! 
-S "${HYAKVNC_JOB_DIR}/vnc/socket.uds" ]] && { - log WARN "Job socket at ${HYAKVNC_JOB_DIR}/vnc/socket.uds is not a socket" - continue - } - echo "HyakVNC job ${running_jobid} is running on node ${running_job_node}" - done } # ## COMMAND: stop @@ -1309,10 +1241,6 @@ function cmd_stop() { help_stop return 0 ;; - -d | --debug) # Debug mode - shift - export HYAKVNC_LOG_LEVEL=DEBUG - ;; -a | --all) shift all=1 @@ -1335,33 +1263,43 @@ function cmd_stop() { stop_hyakvnc_session_args+=("--cancel") fi - if [[ -n "${all}" ]]; then - jobids=$(squeue --me --format '%j %i' --noheader | grep -E "^${HYAKVNC_JOB_PREFIX}" | grep -oE '[0-9]+$') || log WARN "Found no running job IDs with names that match the prefix ${HYAKVNC_JOB_PREFIX}" - fi + case "${HYAKVNC_BACKEND:-apptainer}" in + apptainer) ;; - if [[ -z "${jobids}" ]]; then - if [[ -t 0 ]]; then - echo "Reading available job IDs to select from a menu" - running_jobids=$(squeue --me --noheader --format '%j %i' | grep -E "^${HYAKVNC_JOB_PREFIX}" | grep -oE '[0-9]+$') || { - log WARN "Found no running jobs with names that match the prefix ${HYAKVNC_JOB_PREFIX}" - return 1 + slurm) + + if [[ -n "${all}" ]]; then + jobids=$(squeue --me --format '%j %i' --noheader | grep -E "^${HYAKVNC_JOB_PREFIX}" | grep -oE '[0-9]+$') || log WARN "Found no running job IDs with names that match the prefix ${HYAKVNC_JOB_PREFIX}" + fi + + if [[ -z "${jobids}" ]]; then + if [[ -t 1 ]]; then + echo "Reading available job IDs to select from a menu" + running_jobids=$(squeue --me --noheader --format '%j %i' | grep -E "^${HYAKVNC_JOB_PREFIX}" | grep -oE '[0-9]+$') || { + log WARN "Found no running jobs with names that match the prefix ${HYAKVNC_JOB_PREFIX}" + return 1 + } + PS3="Enter a number: " + select jobids in ${running_jobids}; do + echo "Selected job: ${jobids}" && echo && break + done + fi + fi + + [[ -z "${jobids}" ]] && { + log ERROR "Must specify running job IDs" + exit 1 } - PS3="Enter a number: " - select jobids in ${running_jobids}; do - 
echo "Selected job: ${jobids}" && echo && break + + # Cancel any jobs that were launched: + for jobid in ${jobids}; do + stop_hyakvnc_session "${stop_hyakvnc_session_args[@]}" "${jobid}" && log INFO "Stopped job ${jobid}" done - fi - fi + ;; - [[ -z "${jobids}" ]] && { - log ERROR "Must specify running job IDs" - exit 1 - } + *) log ERROR "Unsupported \$HYAKVNC_BACKEND backend \"${HYAKVNC_BACKEND}\""; return 1 ;; + esac - # Cancel any jobs that were launched: - for jobid in ${jobids}; do - stop_hyakvnc_session "${stop_hyakvnc_session_args[@]}" "${jobid}" && log INFO "Stopped job ${jobid}" - done return 0 } @@ -1418,7 +1356,7 @@ function cmd_show() { done if [[ -z "${jobid:-}" ]]; then - if [[ -t 0 ]]; then + if [[ -t 1 ]]; then echo "Reading available job IDs to select from a menu" running_jobids=$(squeue --me --noheader --format '%j %i' --states RUNNING | grep -E "^${HYAKVNC_JOB_PREFIX}" | grep -oE '[0-9]+$') || { log WARN "Found no running jobs with names that match the prefix ${HYAKVNC_JOB_PREFIX}" @@ -1445,126 +1383,6 @@ function cmd_show() { return 0 } -# ## COMMAND: install - -# help_install() -function help_install() { - cat <>"${shellrcpath}" && echo "Added \$HOME/.local/bin to PATH in ${shellrcpath}" - else - echo "export PATH=\"${install_dir}:\$PATH\"" >>"${shellrcpath}" && echo "Added ${install_dir} to PATH in ${shellrcpath}" - fi - echo "Run 'source ${shellrcpath}' to update your PATH" - fi - - echo "Installed hyakvnc to ${install_dir}/hyakvnc" - [[ "${myshell}" == "zsh" ]] && echo "Run 'rehash' to update your PATH" -} - # ## COMMAND: update # help_update() @@ -1657,7 +1475,7 @@ function cmd_help() { if [[ "${1:-help}" == "help" ]]; then cat <' for more information on a specific command. 
@@ -1705,10 +1522,22 @@ function main() { [[ $# -eq 0 ]] && cmd_help && exit 0 # Show help if no arguments are provided while true; do case "${1:-}" in + --?*=* | -?*=*) # Handle --flag=value args + set -- "${1%%=*}" "${1#*=}" "${@:2}" + continue + ;; -d | --debug) # Debug mode export HYAKVNC_LOG_LEVEL=DEBUG shift ;; + --log-level) + shift || { log ERROR "$1 requires an argument"; return 1; } + export HYAKVNC_LOG_LEVEL="$1" + if check_log_level "${HYAKVNC_LOG_LEVEL}" "${HYAKVNC_LOG_FILE_LEVEL:-INFO}"; then + export HYAKVNC_LOG_FILE_LEVEL="${HYAKVNC_LOG_LEVEL}" + fi + shift + ;; -h | --help) shift cmd_help "${@:-}" @@ -1745,8 +1574,6 @@ function main() { ${action} "$@" } -# shellcheck disable=SC2046 -export $(compgen -v HYAKVNC_) # Export all variables starting with HYAKVNC_ # Invoke main with args if not sourced: if ! (return 0 2>/dev/null); then main "$@"