Skip to content
This repository has been archived by the owner on Oct 2, 2024. It is now read-only.

Commit

Permalink
Merge branch 'master' into run-oci-bundles_1754
Browse files Browse the repository at this point in the history
  • Loading branch information
kchilleri committed May 14, 2024
2 parents 94545fe + 649c710 commit a50d23b
Show file tree
Hide file tree
Showing 20 changed files with 342 additions and 63 deletions.
9 changes: 9 additions & 0 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,15 @@ jobs:
steps:
- uses: actions/checkout@v3

# This allows SSH access to the GitHub Actions VM to debug things that
# only happen on CI. Comment out unless needed. WARNING: tmate.io has
# access to unencrypted SSH traffic.
# See: https://github.com/marketplace/actions/debugging-with-tmate
#- name: set up tmate session
# uses: mxschmitt/action-tmate@v3
# with:
# detached: true

- name: early setup & validation
run: |
[[ -n $CH_TEST_BUILDER ]]
Expand Down
3 changes: 2 additions & 1 deletion bin/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@ if HAVE_LIBSQUASHFUSE
ch_run_SOURCES += ch_fuse.h ch_fuse.c
endif

ch_run_CFLAGS = $(CFLAGS) $(PTHREAD_CFLAGS)
# additional build flags for ch-run
ch_run_CFLAGS = $(PTHREAD_CFLAGS)
ch_run_LDADD = $(CH_RUN_LIBS)


Expand Down
2 changes: 2 additions & 0 deletions bin/ch-image.py.in
Original file line number Diff line number Diff line change
Expand Up @@ -415,6 +415,8 @@ if (__name__ == "__main__"):
for (opt, arg) in zip(sys.argv[1:], sys.argv[2:] + [None]):
(opt, _, arg_eq) = opt.partition("=")
if (opt == "--break"):
if (not sys.stdin.isatty()):
ch.FATAL("--break: standard input must be a terminal")
if (arg_eq != ""):
arg = arg_eq
try:
Expand Down
12 changes: 12 additions & 0 deletions bin/ch-run.c
Original file line number Diff line number Diff line change
Expand Up @@ -449,6 +449,12 @@ static error_t parse_opt(int key, char *arg, struct argp_state *state)
exit(0);
#else
exit(1);
#endif
} else if (!strcmp(arg, "overlayfs")) {
#ifdef HAVE_OVERLAYFS
exit(0);
#else
exit(1);
#endif
} else if (!strcmp(arg, "seccomp")) {
#ifdef HAVE_SECCOMP
Expand All @@ -461,6 +467,12 @@ static error_t parse_opt(int key, char *arg, struct argp_state *state)
exit(0);
#else
exit(1);
#endif
} else if (!strcmp(arg, "tmpfs-xattrs")) {
#ifdef HAVE_TMPFS_XATTRS
exit(0);
#else
exit(1);
#endif
}
else
Expand Down
45 changes: 33 additions & 12 deletions bin/ch_core.c
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,27 @@
/* Timeout in seconds for waiting for join semaphore. */
#define JOIN_TIMEOUT 30

/* Maximum length of paths we're willing to deal with. (Note that
/* Maximum length of paths we’re willing to deal with. (Note that
system-defined PATH_MAX isn't reliable.) */
#define PATH_CHARS 4096

/* Mount point for the tmpfs used by -W. We want this to be (a) always
available [1], (b) short, (c) not used by anything else we care about
during container setup, and (d) not wildly confusing if users see it in an
error message. Must be a string literal because we use C’s literal
concatenation feature. Options considered (all of these required by FHS):
/boot Not present if host is booted in some strange way?
/etc Likely very reliable but seems risky
/mnt Used for images on GitHub Actions and causes CI failures
/opt Seems very omittable
/srv I’ve never actually seen it used; reliable?
/var Too aggressive?
/var/spool Long; omittable for lightweight hosts?
[1]: https://www.pathname.com/fhs/pub/fhs-2.3.pdf */
#define WF_MNT "/srv"


/** Constants **/

Expand Down Expand Up @@ -306,26 +323,30 @@ void enter_udss(struct container *c)
// https://www.kernel.org/doc/html/v5.11/filesystems/tmpfs.html
// https://www.kernel.org/doc/html/v5.11/filesystems/overlayfs.html
if (c->overlay_size != NULL) {
VERBOSE("overlaying tmpfs for --write-fake (%s)", c->overlay_size);
char *options;
struct stat st;
VERBOSE("overlaying tmpfs for --write-fake (%s)", c->overlay_size);
T_ (1 <= asprintf(&options, "size=%s", c->overlay_size));
Zf (mount(NULL, "/mnt", "tmpfs", 0, options), // host should have /mnt
Zf (mount(NULL, WF_MNT, "tmpfs", 0, options),
"cannot mount tmpfs for overlay");
free(options);
Z_ (mkdir("/mnt/upper", 0700));
Z_ (mkdir("/mnt/work", 0700));
Z_ (mkdir("/mnt/merged", 0700));
mkdir_scratch = "/mnt/mkdir_overmount";
Z_ (mkdir(WF_MNT "/upper", 0700));
Z_ (mkdir(WF_MNT "/work", 0700));
Z_ (mkdir(WF_MNT "/merged", 0700));
mkdir_scratch = WF_MNT "/mkdir_overmount";
Z_ (mkdir(mkdir_scratch, 0700));
T_ (1 <= asprintf(&options, "lowerdir=%s,upperdir=%s,workdir=%s,"
"index=on,userxattr,volatile",
c->newroot, "/mnt/upper", "/mnt/work"));
T_ (1 <= asprintf(&options, ("lowerdir=%s,upperdir=%s,workdir=%s,"
"index=on,userxattr,volatile"),
c->newroot, WF_MNT "/upper", WF_MNT "/work"));
// update newroot
c->newroot = "/mnt/merged";
Zf (stat(c->newroot, &st),
"can't stat new root; overmounted by tmpfs for -W?: %s", c->newroot);
c->newroot = WF_MNT "/merged";
free(nr_parent);
free(nr_base);
path_split(c->newroot, &nr_parent, &nr_base);
Zf (mount(NULL, c->newroot, "overlay", 0, options), "can't overlay");
Zf (mount(NULL, c->newroot, "overlay", 0, options),
"can't overlay: %s, %s", c->newroot, options);
VERBOSE("newroot updated: %s", c->newroot);
free(options);
}
Expand Down
5 changes: 5 additions & 0 deletions bin/ch_fuse.c
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,11 @@
// SquashFUSE redefines __le16 unless HAVE_LINUX_TYPES_LE16 is defined. We are
// assuming it is defined in <linux/types.h> on your machine.
#define HAVE_LINUX_TYPES_LE16
// The forget operation in libfuse3 takes uint64_t as third parameter,
// while SquashFUSE defaults to unsigned long as used in libfuse2.
// This causes a mess on arches with different size of these types,
// so explicitly switch to the libfuse3 variant.
#define HAVE_FUSE_LL_FORGET_OP_64T
// Now we can include ll.h.
#include <squashfuse/ll.h>

Expand Down
7 changes: 6 additions & 1 deletion configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -343,6 +343,7 @@ AC_MSG_RESULT($have_userns)
AC_DEFUN([CH_OVERLAY_C], [[
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
Expand Down Expand Up @@ -786,6 +787,10 @@ AC_SUBST([CH_RUN_LIBS])
AC_SUBST([PYTHON_SHEBANG])
AC_SUBST([SPHINX])

AS_IF([test $have_overlayfs = yes],
[AC_DEFINE([HAVE_OVERLAYFS], [1], [unprivileged overlayfs])])
AS_IF([test $have_tmpfs_xattrs = yes],
[AC_DEFINE([HAVE_TMPFS_XATTRS], [1], [tmpfs user xattrs])])
AS_IF([test $have_fnm_extmatch = yes],
[AC_DEFINE([HAVE_FNM_EXTMATCH], [1], [extended globs supported])])
AS_IF([test $have_seccomp = yes],
Expand Down Expand Up @@ -941,7 +946,7 @@ Building Charliecloud
test suite ... ${enable_test}
required:
C99 compiler ... ${CC} ${CC_VERSION}
C99 compiler ... ${CC} ${CFLAGS}
optional:
extended glob patterns in --unset-env ... ${have_fnm_extmatch}
Expand Down
152 changes: 150 additions & 2 deletions doc/best_practices.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
Best practices
**************

.. contents::
:depth: 3
:local:

Other best practices information
================================

Expand Down Expand Up @@ -303,5 +307,149 @@ building, and then run using a separate container invoked from a different
terminal.


.. LocalWords: userguide Gruening Souppaya Morello Scarfone openmpi nist
.. LocalWords: ident OCFS MAGICK
MPI
===

Problems that best practices help you avoid
-------------------------------------------

These recommendations are derived from our experience in mitigating container
MPI issues. It is important to note that, despite marketing claims, no single
container implementation has “solved” MPI or is free of warts; the issues are
numerous, multifaceted, and dynamic.

Key concepts and related issues include:

1. **Workload management**. Running applications on HPC clusters requires
resource management and job scheduling. Put simply, resource management
is the act of allocating and restricting compute resources, e.g., CPU and
memory, whereas job scheduling is the act of prioritizing and enforcing
resource management. *Both require privileged operations.*

Some privileged container implementations attempt to provide their own
workload management, often referred to as “container orchestration”.

Charliecloud is lightweight and completely unprivileged. We rely on
existing, reputable, and well-established HPC workload managers such as
Slurm.

2. **Job launch**. When a multi-node MPI job is launched, each node must
launch a number of containerized processes, i.e., *ranks*. Doing this
unprivileged and at scale requires interaction between the application
and workload manager. That is, something like Process Management
Interface (PMI) is needed to facilitate the job launch.

3. **Shared memory**. Processes in separate sibling containers cannot use
single-copy *cross-memory attach* (CMA), as opposed to double-copy POSIX
or SysV shared memory. The solution is to put all ranks in the *same*
container with :code:`ch-run --join`. (See :ref:`faq_join` for
details.)

4. **Network fabric.** Performant MPI jobs must recognize and use a system’s
high-speed interconnect. Common issues that arise are:

a. Libraries required to use the interconnect are proprietary or
otherwise unavailable to the container.

b. The interconnect is not supported by the container MPI.

In both cases, the containerized MPI application will either fail or run
significantly slower.

These problems can be avoided, and this section describes our recommendations
to do so.

Recommendations TL;DR
---------------------

Generally, we recommend building a flexible MPI container using:

a. **libfabric** to flexibly manage process communication over a diverse
set of network fabrics;

b. a parallel **process management interface** (PMI), compatible with the
host workload manager (e.g., PMI2, PMIx, flux-pmi); and

c. an **MPI** that supports (1) libfabric and (2) the selected PMI.

More experienced MPI and unprivileged container users can find success through
MPI replacement (injection); however, such practices are beyond the scope of
this document.

The remaining sections detail the reasoning behind our approach. We recommend
referencing, or directly using, our examples
:code:`examples/Dockerfile.{libfabric,mpich,openmpi}`.

Use libfabric
-------------

`libfabric <https://ofiwg.github.io/libfabric>`_ (a.k.a. Open Fabrics
Interfaces or OFI) is a low-level communication library that abstracts diverse
networking technologies. It defines *providers* that implement the mapping
between application-facing software (e.g., MPI) and network specific drivers,
protocols, and hardware. These providers have been co-designed with fabric
hardware and application developers with a focus on HPC needs. libfabric lets
us more easily manage MPI communication over diverse high-speed network
interconnects (a.k.a. *fabrics*).

From our libfabric example (:code:`examples/Dockerfile.libfabric`):

.. literalinclude:: ../examples/Dockerfile.libfabric
:language: docker
:lines: 116-135

The above compiles libfabric with several “built-in” providers, i.e.,
:code:`psm3` (on x86-64), :code:`rxm`, :code:`shm`, :code:`tcp`, and
:code:`verbs`, which enables MPI applications to run efficiently over most
verb devices using TCP, IB, OPA, and RoCE protocols.

Two key advantages of using libfabric are: (1) the container’s libfabric can
make use of “external”, i.e., dynamic-shared-object (DSO), providers, and
(2) libfabric replacement is simpler than MPI replacement and preserves the
original container MPI. That is, managing host/container ABI compatibility is
difficult and error-prone, so we instead manage the more forgiving libfabric
ABI compatibility.

A DSO provider can be used by a libfabric that did not originally compile it,
i.e., it can be compiled on a target host, later injected into the container
along with any missing shared library dependencies, and then used by the
container’s libfabric. To build a libfabric provider as a DSO, add :code:`=dl`
to its :code:`configure` argument, e.g., :code:`--with-cxi=dl`.

A container's libfabric can also be replaced by a host libfabric. This is a
brittle but usually effective way to give containers access to the Cray
libfabric Slingshot provider :code:`cxi`.

In Charliecloud, both of these injection operations are currently done with
:code:`ch-fromhost`, though see `issue #1861
<https://github.com/hpc/charliecloud/issues/1861>`_.

Choose a compatible PMI
-----------------------

Unprivileged processes, including unprivileged containerized processes, are
unable to independently launch containerized processes on different nodes,
aside from using SSH, which isn’t scalable. We must either (1) rely on a
host-supported parallel process management interface (PMI), or (2) achieve
host/container MPI ABI compatibility through unsavory practices such as
complete container MPI replacement.

The preferred PMI implementation, e.g., PMI1, PMI2, OpenPMIx, or flux-pmi,
will be that which is best supported by your host workload manager and
container MPI.

In :code:`examples/Dockerfile.libfabric`, we selected :code:`OpenPMIx` because
(1) it is supported by Slurm, OpenMPI, and MPICH, (2) it is required for
exascale, and (3) OpenMPI versions 5 and newer will no longer support PMI2.

Choose an MPI compatible with your libfabric and PMI
----------------------------------------------------

There are various MPI implementations, e.g., OpenMPI, MPICH, MVAPICH2,
Intel-MPI, etc., to consider. We generally recommend OpenMPI; however, your
MPI implementation of choice will ultimately be that which best supports the
libfabric and PMI most compatible with your hardware and workload manager.


.. LocalWords: userguide Gruening Souppaya Morello Scarfone openmpi nist dl
.. LocalWords: ident OCFS MAGICK mpich psm rxm shm DSO pmi MVAPICH
13 changes: 10 additions & 3 deletions doc/ch-run.rst
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,16 @@ mounting SquashFS images with FUSE.
Don’t expand variables when using :code:`--set-env`.

:code:`--feature=FEAT`
If feature :code:`FEAT` is enabled, exit with success. Valid values of
:code:`FEAT` are :code:`extglob` for extended globs, :code:`seccomp` for
:code:`seccomp(2)`, and :code:`squash` for squashfs archives.
If feature :code:`FEAT` is enabled, exit successfully (zero); otherwise,
exit unsuccessfully (non-zero). Note this just communicates the results of
:code:`configure` rather than testing the feature. Valid values of
:code:`FEAT` are:

* :code:`extglob`: extended globs in :code:`--unset-env`
* :code:`seccomp`: :code:`--seccomp` available
* :code:`squash`: internal SquashFUSE image mounts
* :code:`overlayfs`: unprivileged overlayfs support
* :code:`tmpfs-xattrs`: :code:`user` xattrs on tmpfs

:code:`-g`, :code:`--gid=GID`
Run as group :code:`GID` within container.
Expand Down
2 changes: 1 addition & 1 deletion doc/faq.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1364,4 +1364,4 @@ conversion. Important caveats include:

.. LocalWords: CAs SY Gutmann AUTH rHsFFqwwqh MrieaQ Za loc mpihello mvo du
.. LocalWords: VirtualSize linuxcontainers jour uk lxd rwxr xr qq qqq drwxr
.. LocalWords: drwx
.. LocalWords: drwx mpich
2 changes: 1 addition & 1 deletion doc/install.rst
Original file line number Diff line number Diff line change
Expand Up @@ -224,7 +224,7 @@ package managers.
Maintained by us:

* `Spack
<https://spack.readthedocs.io/en/latest/package_list.html#charliecloud>`_;
<https://packages.spack.io/package.html?name=charliecloud>`_;
install with :code:`+builder` to get :code:`ch-image`.
* `Fedora/EPEL <https://bodhi.fedoraproject.org/updates/?search=charliecloud>`_;
check for available versions with :code:`{yum,dnf} list charliecloud`.
Expand Down
8 changes: 6 additions & 2 deletions examples/obspy/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,13 @@ WORKDIR /usr/local/src
#
# 2. Use latest version so we catch sooner if things explode.
#
# 3. ObsPy 1.4.0, the latest as of 2024-03-27, is incompatible with Python
# 3.12 [2], which recently became the default in Miniconda (see PR #1885 and
# issue #1886).
#
# [1]: https://docs.anaconda.com/anaconda/user-guide/faq/
ARG MC_VERSION=latest
# [2]: https://github.com/obspy/obspy/issues/3313#issuecomment-1818165937
ARG MC_VERSION=py311_24.1.2-0
ARG MC_FILE=Miniconda3-$MC_VERSION-Linux-x86_64.sh
RUN wget -nv https://repo.anaconda.com/miniconda/$MC_FILE
# Miniconda will fail if the HOME variable is not set.
Expand All @@ -32,7 +37,6 @@ RUN conda config --set auto_update_conda False
# new environment for obspy.
# See: https://github.com/obspy/obspy/wiki/Installation-via-Anaconda
RUN conda config --add channels conda-forge
# Use numpy 1.21 to avoid isse: https://github.com/obspy/obspy/issues/2940
RUN conda install --yes obspy=1.4.0
RUN conda update obspy

Expand Down
Loading

0 comments on commit a50d23b

Please sign in to comment.