Skip to content

Commit ccba19f

Browse files
committed
Merge remote-tracking branch 'origin/main'
2 parents a6e9e66 + b3546b3 commit ccba19f

File tree

5 files changed

+333
-3
lines changed

5 files changed

+333
-3
lines changed

doc/changes/unreleased.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,5 @@
11
# Unreleased
2+
3+
## Features
4+
5+
* #42: Optionally wait until SLC is deployed to all nodes in the database cluster
Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
import re
2+
import exasol.bucketfs as bfs # type: ignore
3+
import pyexasol # type: ignore
4+
5+
from datetime import datetime, timedelta
6+
from typing import Callable, List
7+
from tenacity import Retrying
8+
from tenacity.wait import wait_fixed
9+
from tenacity.stop import stop_after_delay
10+
11+
from exasol.python_extension_common.deployment.language_container_validator import (
12+
temp_schema
13+
)
14+
15+
MANIFEST_FILE = "exasol-manifest.json"
16+
17+
18+
def _udf_name(schema: str | None, name: str) -> str:
    """Return a quoted, timestamped UDF script name, optionally qualified
    with the given schema."""
    seconds = datetime.now().timestamp()
    quoted = '"{}_manifest_{:.0f}"'.format(name, seconds)
    if not schema:
        return quoted
    return '"{}".{}'.format(schema, quoted)
22+
23+
24+
class ExtractException(Exception):
    """
    Raised when the expected file MANIFEST_FILE could not be detected on
    all nodes of the database cluster.
    """
29+
30+
31+
class ExtractValidator:
    """
    Validates that a given archive (e.g. tgz) has been extracted on
    all nodes of an Exasol database cluster by checking if MANIFEST_FILE
    exists.

    The specified timeout applies to the max. total duration of both phases:
    P1) creating the UDF script and P2) checking if the UDF in the SLC can
    be executed and finds the extracted MANIFEST_FILE on each node.

    If a callback is specified then this function will be called multiple
    times while detecting the MANIFEST_FILE on the nodes.

    The callback is called with two arguments: the total number of nodes in
    the database cluster as returned by nproc() and a list of the IDs of the
    pending nodes on which the MANIFEST_FILE could not be found, yet.
    """
    def __init__(self,
                 pyexasol_connection: pyexasol.ExaConnection,
                 timeout: timedelta,
                 interval: timedelta = timedelta(seconds=10),
                 callback: Callable[[int, List[int]], None] | None = None,
                 ) -> None:
        self._pyexasol_conn = pyexasol_connection
        self._timeout = timeout
        self._interval = interval
        # Substitute a no-op so the callback can be invoked unconditionally.
        self._callback = callback if callback else lambda x, y: None

    def _create_manifest_udf_with_retry(self, language_alias: str, udf_name: str) -> None:
        """Create the manifest-checking UDF, retrying with the configured
        interval until self._timeout expires; the last error is re-raised."""
        for attempt in Retrying(
                wait=wait_fixed(self._interval),
                stop=stop_after_delay(self._timeout),
                reraise=True):
            with attempt:
                self._create_manifest_udf(language_alias, udf_name)

    def _create_manifest_udf(self, language_alias: str, udf_name: str) -> None:
        """
        Create a UDF script that emits, for each node, whether the file
        given as argument exists on that node.

        The SQL statements "ALTER SESSION SET SCRIPT_LANGUAGES" and "ALTER
        SYSTEM SET SCRIPT_LANGUAGES" do not check whether the specified
        BucketFS path exists and has permissions allowing it to be accessed
        by UDFs. Instead, only a later statement "CREATE SCRIPT" will fail
        with an error message. Hence we need to use a retry here, as well.
        """
        self._pyexasol_conn.execute(
            f"""
            CREATE OR REPLACE {language_alias} SET SCRIPT
            {udf_name}(my_path VARCHAR(256))
            EMITS (node INTEGER, manifest BOOL) AS
            import os
            def run(ctx):
                ctx.emit(exa.meta.node_id, os.path.isfile(ctx.my_path))
            /
            """
        )

    def _check_all_nodes_with_retry(self, udf_name: str, nproc: int, manifest: str, timeout: timedelta) -> None:
        """Poll the UDF until all nodes report the manifest or the given
        timeout (the remainder of the total budget) expires."""
        for attempt in Retrying(
                wait=wait_fixed(self._interval),
                stop=stop_after_delay(timeout),
                reraise=True):
            with attempt:
                self._check_all_nodes(udf_name, nproc, manifest)

    def _check_all_nodes(self, udf_name: str, nproc: int, manifest: str) -> None:
        """
        Call the UDF once per node, report progress to the callback, and
        raise an ExtractException if the manifest is still missing on any
        node.
        """
        result = self._pyexasol_conn.execute(
            # fix: the manifest path is a SQL string literal and must be
            # enclosed in single quotes.
            f"""
            SELECT {udf_name}('{manifest}')
            FROM VALUES BETWEEN 1 AND {nproc} t(i) GROUP BY i
            """
        ).fetchall()
        # Each row is (node_id, manifest_found).
        pending = [node for node, found in result if not found]
        self._callback(nproc, pending)
        if pending:
            raise ExtractException(
                f"{len(pending)} of {nproc} nodes are still pending."
                f" IDs: {pending}")

    def verify_all_nodes(self, schema: str, language_alias: str, bfs_archive_path: bfs.path.PathLike) -> None:
        """
        Verify if the given bfs_archive_path was extracted on all nodes
        successfully.

        :param schema: schema in which to create the temporary UDF script.
        :param language_alias: language alias of the SLC under test.
        :param bfs_archive_path: BucketFS path of the uploaded archive.
        :raises ExtractException: if after the configured timeout there are
            still nodes pending, for which the extraction could not be
            verified, yet.
        """
        manifest = f"{bfs_archive_path.as_udf_path()}/{MANIFEST_FILE}"
        # NOTE(review): pyexasol's fetchone() usually returns a whole row
        # (a list), not a scalar - confirm whether this needs fetchone()[0].
        nproc = self._pyexasol_conn.execute("SELECT nproc()").fetchone()
        udf_name = _udf_name(schema, language_alias)
        start = datetime.now()
        try:
            self._create_manifest_udf_with_retry(language_alias, udf_name)
            # Phase 1 may have consumed part of the total budget; only the
            # remainder is available for checking the nodes.
            elapsed = datetime.now() - start
            self._check_all_nodes_with_retry(udf_name, nproc, manifest, self._timeout - elapsed)
        finally:
            # Remove the temporary UDF script even in case of failure.
            self._pyexasol_conn.execute(f"DROP SCRIPT IF EXISTS {udf_name}")

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "exasol-python-extension-common"
3-
version = "0.3.1"
3+
version = "0.4.0"
44
description = "A collection of common utilities for Exasol extensions."
55
packages = [ {include = "exasol"}, ]
66
authors = ["Mikhail Beck <[email protected]>"]
Lines changed: 196 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,196 @@
1+
import contextlib
2+
import logging
3+
import pytest
4+
import re
5+
import exasol.bucketfs as bfs # type: ignore
6+
from pyexasol import ExaConnection
7+
8+
from typing import Any, Dict, List
9+
from unittest.mock import Mock, call, patch
10+
from datetime import timedelta
11+
12+
from exasol.python_extension_common.deployment.extract_validator import (
13+
ExtractValidator,
14+
ExtractException,
15+
_udf_name,
16+
)
17+
from tenacity import RetryError
18+
19+
LOG = logging.getLogger(__name__)
20+
21+
22+
def bucket_path(path: str):
    """Create a BucketPath for *path* inside a mounted test bucket."""
    api = bfs.MountedBucket("svc", "bkt")
    return bfs.path.BucketPath(path, bucket_api=api)
25+
26+
27+
@pytest.fixture
def archive_bucket_path():
    """BucketFS path of a sample archive inside the test bucket."""
    return bucket_path("/folder/a.tgz")
30+
31+
32+
class ConnectionMock:
    """
    Simulates a pyexasol connection.

    The constructor takes a dict mapping regular expressions to result
    rows. execute() matches the first line of the SQL statement against
    these expressions (case-insensitively, first match wins) and makes the
    associated rows available via fetchone() / fetchall(). A callable
    value is invoked to produce the rows, which allows simulating errors
    or varying results per call.
    """
    def __init__(self, spec: Dict[str, Any]):
        self.spec = spec
        self.values = iter(())

    def _get_values(self, first_line: str):
        for pattern, rows in self.spec.items():
            if re.match(pattern, first_line, re.IGNORECASE):
                if callable(rows):
                    return rows()
                return rows
        # No pattern matched; warn so unexpected statements show up in logs.
        LOG.warning(f"ConnectionMock.execute() called with '{first_line[:40]}...'")
        return ()

    def execute(self, *args, **kwargs):
        if args:
            statement = args[0]
        else:
            statement = kwargs["query"]
        first_line = statement.strip().splitlines()[0]
        self.values = iter(self._get_values(first_line))
        return self

    def fetchone(self):
        return next(self.values)

    def fetchall(self):
        return list(self.values)
55+
56+
57+
class Simulator:
    """
    Bundles an ExtractValidator wired to a ConnectionMock together with
    mocks for the UDF results, the CREATE SCRIPT statement, and the
    progress callback.
    """
    def __init__(self, nodes: int, udf_results: List[List[any]],
                 create_script=()):
        self.create_script = create_script
        self.nodes = nodes
        # Each call to the simulated UDF yields the next entry of udf_results.
        self.udf = Mock(side_effect=udf_results)
        self.callback = Mock(side_effect=self._callback)

    def _callback(self, n, pending):
        # Log progress for easier debugging of the retry simulation.
        LOG.debug(f"{len(pending)} of {n} nodes pending: {pending}")

    @property
    def testee(self):
        """A fresh ExtractValidator under test, rebuilt on each access."""
        mocked_connection = ConnectionMock({
            r"CREATE .* SCRIPT": self.create_script,
            r"(CREATE|DROP) ": (),
            r"SELECT nproc\(\)": [ self.nodes ],
            r'SELECT .*_manifest_': self.udf,
        })
        return ExtractValidator(
            pyexasol_connection=Mock(execute=mocked_connection.execute),
            timeout=timedelta(seconds=10),
            interval=timedelta(seconds=1),
            callback=self.callback,
        )
82+
83+
84+
@contextlib.contextmanager
def mock_tenacity_wait(*wait_lists: List[int|float], max: int = 1000):
    """
    This context mocks internals of library ``tenacity`` in order to
    simulate waiting for timeouts in ``tenacity.Retrying()``. All specified
    durations are interpreted as numbers of seconds which can be floats.

    A test case may provide multiple lists of waiting periods to cover
    multiple consecutive retry phases in the class under test, see
    ``ExtractValidator`` for example.

        mock_tenacity_wait([1, 2], [3, 4], max=100)

    After all wait lists are exhausted, i.e. the mock simulated waiting for
    the specified periods, the mock will constantly simulate
    ``time.monotonic()`` to return the specified max time, typically making
    tenacity detect a timeout.

    Internally the mock needs to prefix each list of waiting periods with
    two additional entries [0, 0] which are used by ``tenacity.Retrying()``
    to initialize its start times in ``BaseRetrying.begin()`` and
    ``RetryCallState.__init__()``, see
    https://github.com/jd/tenacity/blob/main/tenacity/__init__.py.
    """
    def clock_readings():
        for waits in wait_lists:
            yield from [0, 0] + waits

    readings = clock_readings()

    def fake_monotonic():
        # Fall back to `max` once all simulated periods are consumed.
        return next(readings, max)

    with patch("tenacity.time.sleep"):
        with patch("tenacity.time.monotonic", side_effect=fake_monotonic):
            yield
122+
123+
124+
@pytest.mark.parametrize(
    "schema, expected",
    [
        (None, r'"alias_manifest_[0-9]+"'),
        ("schema", r'"schema"\."alias_manifest_[0-9]+"'),
    ])
def test_udf_name(schema, expected):
    """The UDF name is quoted, timestamped, and schema-qualified on request."""
    actual = _udf_name(schema, "alias")
    assert re.match(expected, actual)
132+
133+
134+
def test_create_script_failure(archive_bucket_path):
    """If creating the UDF script keeps failing until the timeout, the
    last error is re-raised."""
    create_script = Mock(side_effect=Exception("failed to create UDF script"))
    sim = Simulator(nodes=4, udf_results=[], create_script=create_script)
    with pytest.raises(Exception, match="failed to create UDF script"):
        with mock_tenacity_wait([1]):
            # fix: verify_all_nodes(schema, language_alias, ...) - the
            # schema comes first, then the language alias.
            sim.testee.verify_all_nodes("schema", "alias", archive_bucket_path)
140+
141+
142+
def test_failure(archive_bucket_path):
    """A node staying pending until the timeout raises an ExtractException
    naming the pending node."""
    sim = Simulator(
        nodes=4,
        udf_results=[
            [[1, False]],
            [[1, False]],
            [[1, False]],
        ])
    with pytest.raises(ExtractException) as ex:
        with mock_tenacity_wait([1], [2, 4]):
            # fix: schema first, then language alias
            sim.testee.verify_all_nodes("schema", "alias", archive_bucket_path)
    assert "1 of 4 nodes are still pending. IDs: [1]" == str(ex.value)
154+
155+
156+
def test_success(archive_bucket_path):
    """All nodes eventually report the manifest; the callback observes the
    shrinking list of pending nodes."""
    sim = Simulator(
        nodes=4,
        udf_results=[
            [[1, False], [2, False]],
            [[1, True ], [2, False]],
            [[1, True ], [2, True ]],
        ])
    with mock_tenacity_wait([1], [2, 4]):
        # fix: schema first, then language alias
        sim.testee.verify_all_nodes("schema", "alias", archive_bucket_path)
    assert sim.callback.call_args_list == [
        call(4, [1, 2]),
        call(4, [2]),
        call(4, []),
    ]
171+
172+
173+
def test_reduced_timeout(archive_bucket_path):
    """
    This test simulates a retry being required for creating the UDF
    script, hence already eating up part of the total timeout.

    The test then verifies the remaining part of the total timeout for
    actual calls to the UDF being too short for successfully detecting the
    manifest on all nodes.
    """
    create_script = Mock(side_effect=[Exception("failure"), ()])
    udf_results = [
        [[1, False], [2, False]],
        [[1, True ], [2, False]],
        [[1, True ], [2, True ]],
    ]
    sim = Simulator(
        nodes=4,
        udf_results=udf_results,
        create_script=create_script,
    )
    with pytest.raises(ExtractException) as ex:
        with mock_tenacity_wait([1], [2, 4]):
            # fix: schema first, then language alias
            sim.testee.verify_all_nodes("schema", "alias", archive_bucket_path)
    assert "1 of 4 nodes are still pending. IDs: [2]" == str(ex.value)

version.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,6 @@
55
# Do not edit this file manually!
66
# If you need to change the version, do so in the project.toml, e.g. by using `poetry version X.Y.Z`.
77
MAJOR = 0
8-
MINOR = 3
9-
PATCH = 1
8+
MINOR = 4
9+
PATCH = 0
1010
VERSION = f"{MAJOR}.{MINOR}.{PATCH}"

0 commit comments

Comments
 (0)