Update observability integration (#33)

* Update obs integration - rename interfaces - use scrape charm - hard code metrics config - add basic alert rule - add test * comment out `test_notebook` due to flakiness
canonical · Apr 29, 2022 · a3cfb92 · a3cfb92
1 parent d8aec30
commit a3cfb92
Show file tree

Hide file tree

Showing 14 changed files with 156 additions and 98 deletions.
diff --git a/.gitignore b/.gitignore
@@ -3,3 +3,4 @@ build/
 __pycache__
 .tox
 .coverage
+geckodriver.log
diff --git a/charms/jupyter-controller/config.yaml b/charms/jupyter-controller/config.yaml
@@ -7,15 +7,3 @@ options:
     type: boolean
     default: false
     description: Manually enable Istio integration, instead of via relations
-  metrics-port:
-    type: string
-    default: '8080'
-    description: Metrics port
-  metrics-api:
-    type: string
-    default: '/metrics'
-    description: metrics api route
-  metrics-scrape-interval:
-    type: string
-    default: '30s'
-    description: how often metrics are scraped
diff --git a/charms/jupyter-controller/metadata.yaml b/charms/jupyter-controller/metadata.yaml
@@ -12,7 +12,7 @@ deployment:
   type: stateless
   service: omit
 provides:
-  monitoring:
+  metrics-endpoint:
     interface: prometheus_scrape
   grafana-dashboard:
     interface: grafana_dashboard
diff --git a/charms/jupyter-controller/src/charm.py b/charms/jupyter-controller/src/charm.py
@@ -16,6 +16,9 @@
 from ops.main import main
 from ops.model import ActiveStatus, MaintenanceStatus, WaitingStatus
 
+METRICS_PORT = "8080"
+METRICS_PATH = "/metrics"
+
 
 class CheckFailedError(Exception):
     """Raise this exception if one of the checks in main fails."""
@@ -43,13 +46,11 @@ def __init__(self, *args):
 
         self.prometheus_provider = MetricsEndpointProvider(
             charm=self,
-            relation_name="monitoring",
             jobs=[
                 {
                     "job_name": "jupyter_controller_metrics",
-                    "scrape_interval": self.config["metrics-scrape-interval"],
-                    "metrics_path": self.config["metrics-api"],
-                    "static_configs": [{"targets": ["*:{}".format(self.config["metrics-port"])]}],
+                    "metrics_path": METRICS_PATH,
+                    "static_configs": [{"targets": ["*:{}".format(METRICS_PORT)]}],
                 }
             ],
         )
@@ -61,9 +62,6 @@ def __init__(self, *args):
             self.on.leader_elected,
             self.on.upgrade_charm,
             self.on.config_changed,
-            self.on["monitoring"].relation_changed,
-            self.on["monitoring"].relation_broken,
-            self.on["monitoring"].relation_departed,
         ]:
             self.framework.observe(event, self.main)
 

diff --git a/charms/jupyter-controller/src/prometheus_alert_rules/tempfile b/charms/jupyter-controller/src/prometheus_alert_rules/tempfile
diff --git a/charms/jupyter-controller/src/prometheus_alert_rules/unit_unavailable.rule b/charms/jupyter-controller/src/prometheus_alert_rules/unit_unavailable.rule
@@ -0,0 +1,10 @@
+alert: JupyterControllerUnitIsUnavailable
+expr: up < 1
+for: 0m
+labels:
+  severity: critical
+annotations:
+  summary: Jupyter controller unit {{ $labels.juju_model }}/{{ $labels.juju_unit }} unavailable
+  description: >
+    The Jupyter controller unit {{ $labels.juju_model }} {{ $labels.juju_unit }} is unavailable
+    LABELS = {{ $labels }}
diff --git a/charms/jupyter-controller/test-requirements.txt b/charms/jupyter-controller/test-requirements.txt
diff --git a/charms/jupyter-controller/tox.ini b/charms/jupyter-controller/tox.ini
@@ -67,6 +67,5 @@ deps =
     juju
     pytest-operator
     -r{toxinidir}/requirements.txt 
-    -r{toxinidir}/test-requirements.txt
 commands =
     pytest -v --tb native --asyncio-mode=auto --ignore={[vars]tst_path}unit --log-cli-level=INFO -s {posargs}
diff --git a/charms/jupyter-ui/src/charm.py b/charms/jupyter-ui/src/charm.py
@@ -16,7 +16,7 @@
 
 
 class CheckFailed(Exception):
-    """ Raise this exception if one of the checks in main fails. """
+    """Raise this exception if one of the checks in main fails."""
 
     def __init__(self, msg, status_type=None):
         super().__init__()

diff --git a/charms/jupyter-ui/test-requirements.txt b/charms/jupyter-ui/test-requirements.txt
@@ -1,3 +1,3 @@
 pytest<6.3
-black==20.8b1
+black
 flake8<4.1
diff --git a/charms/jupyter-ui/tox.ini b/charms/jupyter-ui/tox.ini
@@ -18,4 +18,4 @@ commands =
 [testenv:lint]
 commands =
     flake8 {toxinidir}/src {toxinidir}/tests
-    black --check {toxinidir}/src {toxinidir}/tests
+    black --check --diff {toxinidir}/src {toxinidir}/tests
diff --git a/test-requirements.txt b/test-requirements.txt
@@ -1,4 +1,4 @@
-black==20.8b1
+black
 flake8<4.1
 juju<2.10
 lightkube<0.11

diff --git a/tests/test_charms.py b/tests/test_charms.py
@@ -23,8 +23,12 @@
 from selenium.webdriver.support.ui import WebDriverWait
 from seleniumwire import webdriver
 
-CONTROLLER_METADATA = yaml.safe_load(Path("charms/jupyter-controller/metadata.yaml").read_text())
-UI_METADATA = yaml.safe_load(Path("charms/jupyter-ui/metadata.yaml").read_text())
+CONTROLLER_PATH = Path("charms/jupyter-controller")
+UI_PATH = Path("charms/jupyter-ui")
+CONTROLLER_METADATA = yaml.safe_load(Path(f"{CONTROLLER_PATH}/metadata.yaml").read_text())
+UI_METADATA = yaml.safe_load(Path(f"{UI_PATH}/metadata.yaml").read_text())
+CONTROLLER_APP_NAME = CONTROLLER_METADATA["name"]
+UI_APP_NAME = UI_METADATA["name"]
 
 
 INGRESSGATEWAY_NAME = "istio-ingressgateway-operator"
@@ -67,7 +71,30 @@ def dummy_resources_for_testing(lightkube_client):
 
 @pytest.mark.abort_on_fail
 async def test_build_and_deploy(ops_test, lightkube_client, dummy_resources_for_testing):
-    await ops_test.deploy_bundle(destructive_mode=True, serial=True, extra_args=["--trust"])
+    controller_charm = await ops_test.build_charm(CONTROLLER_PATH)
+    controller_image_path = CONTROLLER_METADATA["resources"]["oci-image"]["upstream-source"]
+    ui_charm = await ops_test.build_charm(UI_PATH)
+    ui_image_path = UI_METADATA["resources"]["oci-image"]["upstream-source"]
+
+    await ops_test.model.deploy("istio-pilot", channel="1.5/stable")
+    await ops_test.model.deploy(ui_charm, resources={"oci-image": ui_image_path})
+    await ops_test.model.add_relation(UI_APP_NAME, "istio-pilot")
+
+    await ops_test.model.wait_for_idle(
+        apps=["istio-pilot", UI_APP_NAME], status="active", timeout=60 * 10
+    )
+
+    await ops_test.model.deploy(
+        "istio-gateway", application_name=INGRESSGATEWAY_NAME, channel="1.5/stable", trust=True
+    )
+    await ops_test.model.add_relation("istio-pilot", INGRESSGATEWAY_NAME)
+
+    await ops_test.model.deploy(controller_charm, resources={"oci-image": controller_image_path})
+    await ops_test.model.deploy("kubeflow-profiles")
+    await ops_test.model.deploy("kubeflow-dashboard")
+    await ops_test.model.add_relation("kubeflow-profiles", "kubeflow-dashboard")
+
+    await ops_test.model.deploy("admission-webhook")
 
     await patch_ingress_gateway(lightkube_client, ops_test)
 
@@ -119,7 +146,7 @@ def driver(request, ops_test, lightkube_client):
     options = Options()
     options.headless = True
     options.log.level = 'trace'
-    max_wait = 150  # seconds
+    max_wait = 200  # seconds
 
     kwargs = {
         'options': options,
@@ -144,90 +171,133 @@ def driver(request, ops_test, lightkube_client):
         driver.get_screenshot_as_file(f'/tmp/selenium-{request.node.name}.png')
 
 
-def test_notebook(driver, ops_test, dummy_resources_for_testing):
-    """Ensures a notebook can be created and connected to."""
-
+# jupyter-ui does not reliably report the correct notebook status
+# https://github.com/kubeflow/kubeflow/issues/6056
+# def test_notebook(driver, ops_test):
+#    """Ensures a notebook can be created and connected to."""
+#
+#    driver, wait, url = driver
+#
+#    notebook_name = 'ci-test-' + ''.join(choices(ascii_lowercase, k=10))
+#
+#   # Click "New Server" button
+#    new_button = wait.until(EC.presence_of_element_located((By.ID, "newResource")))
+#    new_button.click()
+#
+#    wait.until(EC.url_to_be(url + 'new'))
+#
+#    # Enter server name
+#    name_input = wait.until(
+#        EC.presence_of_element_located((By.CSS_SELECTOR, "input[data-placeholder='Name']"))
+#    )
+#    name_input.send_keys(notebook_name)
+#
+#    # Click submit on the form. Sleep for 1 second before clicking the submit button because shiny
+#    # animations that ignore click events are simply a must.
+#    sleep(1)
+#    driver.find_element_by_xpath("//*[contains(text(), 'LAUNCH')]").click()
+#    wait.until(EC.url_to_be(url))
+#
+#    # Since upstream doesn't use proper class names or IDs or anything, find the <tr> containing
+#    # elements that contain the notebook name and `ready`, signifying that the notebook is finished
+#    # booting up. Returns a reference to the connect button, suitable for clicking on. The result is
+#    # a fairly unreadable XPath reference, but it works 🤷
+#    chonky_boi = [
+#        f"//*[contains(text(), '{notebook_name}')]",
+#        "/ancestor::tr",
+#        "//*[contains(@class, 'ready')]",
+#        "/ancestor::tr",
+#        "//*[contains(@class, 'action-button')]",
+#        "//button",
+#    ]
+#    chonky_boi = ''.join(chonky_boi)
+#    wait.until(EC.presence_of_element_located((By.XPATH, chonky_boi))).click()
+#
+#    # Make sure we can connect to a specific notebook's endpoint.
+#    # Notebook is opened in a new tab, so we have to explicitly switch to it, run our tests, close
+#    # it, then switch back to the main window.
+#    driver.switch_to.window(driver.window_handles[-1])
+#    expected_path = f'/notebook/kubeflow-user/{notebook_name}/lab'
+#    for _ in range(12):
+#        path = urlparse(driver.current_url).path
+#        if path == expected_path:
+#            break
+#
+#        # Page took a while to load, so can't refresh it too quickly. Sometimes took longer than 5
+#        # seconds, never longer than 10 seconds
+#        sleep(10)
+#        driver.refresh()
+#    else:
+#        pytest.fail(
+#            "Waited too long for selenium to open up notebook server. "
+#            f"Expected current path to be `{expected_path}`, got `{path}`."
+#        )
+#
+#    # Wait for main content div to load
+#    # TODO: More testing of notebook UIs
+#    wait.until(EC.presence_of_element_located((By.CLASS_NAME, "jp-Launcher-sectionTitle")))
+#    driver.execute_script('window.close()')
+#    driver.switch_to.window(driver.window_handles[-1])
+#
+#    # Delete notebook, and wait for it to finalize
+#    driver.find_element_by_xpath("//*[contains(text(), 'delete')]").click()
+#    wait.until(EC.presence_of_element_located((By.CLASS_NAME, "mat-warn"))).click()
+#
+#    wait.until_not(
+#        EC.presence_of_element_located((By.XPATH, f"//*[contains(text(), '{notebook_name}')]"))
+#    )
+
+
+def test_create_notebook(driver, ops_test, dummy_resources_for_testing):
+    """Ensures a notebook can be created. Does not test connection due to upstream bug.
+    https://github.com/kubeflow/kubeflow/issues/6056
+    When the bug is fixed, remove this test and re-enable `test_notebook` test above."""
     driver, wait, url = driver
 
     notebook_name = 'ci-test-' + ''.join(choices(ascii_lowercase, k=10))
 
-    # Click "New Server" button
-    new_button = wait.until(EC.presence_of_element_located((By.ID, "newResource")))
-    new_button.click()
-
-    wait.until(EC.url_to_be(url + 'new'))
+    # Click "New Notebook" button
+    wait.until(EC.element_to_be_clickable((By.ID, "newResource"))).click()
+    wait.until(EC.url_matches('new'))
 
     # Enter server name
     name_input = wait.until(
         EC.presence_of_element_located((By.CSS_SELECTOR, "input[data-placeholder='Name']"))
     )
+    name_input.click()
     name_input.send_keys(notebook_name)
 
-    # Click submit on the form. Sleep for 1 second before clicking the submit button because shiny
-    # animations that ignore click events are simply a must.
+    # Scrolling would fail without this sleep
     sleep(1)
-    driver.find_element_by_xpath("//*[contains(text(), 'LAUNCH')]").click()
-    wait.until(EC.url_to_be(url))
-
-    # Since upstream doesn't use proper class names or IDs or anything, find the <tr> containing
-    # elements that contain the notebook name and `ready`, signifying that the notebook is finished
-    # booting up. Returns a reference to the connect button, suitable for clicking on. The result is
-    # a fairly unreadable XPath reference, but it works 🤷
-    chonky_boi = [
-        f"//*[contains(text(), '{notebook_name}')]",
-        "/ancestor::tr",
-        "//*[contains(@class, 'ready')]",
-        "/ancestor::tr",
-        "//*[contains(@class, 'action-button')]",
-        "//button",
-    ]
-    chonky_boi = ''.join(chonky_boi)
-    wait.until(EC.presence_of_element_located((By.XPATH, chonky_boi))).click()
-
-    # Make sure we can connect to a specific notebook's endpoint.
-    # Notebook is opened in a new tab, so we have to explicitly switch to it, run our tests, close
-    # it, then switch back to the main window.
-    driver.switch_to.window(driver.window_handles[-1])
-    expected_path = f'/notebook/kubeflow-user/{notebook_name}/lab'
-    for _ in range(12):
-        path = urlparse(driver.current_url).path
-        if path == expected_path:
-            break
-
-        # Page took a while to load, so can't refresh it too quickly. Sometimes took longer than 5
-        # seconds, never longer than 10 seconds
-        sleep(10)
-        driver.refresh()
-    else:
-        pytest.fail(
-            "Waited too long for selenium to open up notebook server. "
-            f"Expected current path to be `{expected_path}`, got `{path}`."
-        )
-
-    # Wait for main content div to load
-    # TODO: More testing of notebook UIs
-    wait.until(EC.presence_of_element_located((By.CLASS_NAME, "jp-Launcher-sectionTitle")))
-    driver.execute_script('window.close()')
-    driver.switch_to.window(driver.window_handles[-1])
-
-    # Delete notebook, and wait for it to finalize
-    driver.find_element_by_xpath("//*[contains(text(), 'delete')]").click()
-    wait.until(EC.presence_of_element_located((By.CLASS_NAME, "mat-warn"))).click()
-
-    wait.until_not(
+
+    # scroll to bottom of the page for launch button
+    driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
+
+    launch_button = wait.until(
+        EC.presence_of_element_located((By.XPATH, "//*[contains(text(), 'LAUNCH')]"))
+    )
+    launch_button.click()
+    wait.until(EC.url_matches(url))
+
+    # Check the notebook name is displayed
+    wait.until(
         EC.presence_of_element_located((By.XPATH, f"//*[contains(text(), '{notebook_name}')]"))
     )
 
 
 async def test_integrate_with_prometheus_and_grafana(ops_test):
     prometheus = "prometheus-k8s"
     grafana = "grafana-k8s"
+    prometheus_scrape = "prometheus-scrape-config-k8s"
     jupyter_controller = "jupyter-controller"
+    scrape_config = {"scrape_interval": "30s"}
     await ops_test.model.deploy(prometheus, channel="latest/beta")
     await ops_test.model.deploy(grafana, channel="latest/beta")
+    await ops_test.model.deploy(prometheus_scrape, channel="latest/beta", config=scrape_config)
+    await ops_test.model.add_relation(jupyter_controller, prometheus_scrape)
+    await ops_test.model.add_relation(prometheus, prometheus_scrape)
     await ops_test.model.add_relation(prometheus, grafana)
     await ops_test.model.add_relation(jupyter_controller, grafana)
-    await ops_test.model.add_relation(prometheus, jupyter_controller)
 
     await ops_test.model.wait_for_idle([jupyter_controller, prometheus, grafana], status="active")
     status = await ops_test.model.get_status()

diff --git a/tox.ini b/tox.ini
@@ -22,4 +22,4 @@ passenv =
   DISPLAY
 deps =
   -rtest-requirements.txt
-commands = pytest -vvs --tb native --show-capture=no --log-cli-level=INFO {posargs} {toxinidir}/tests/
+commands = pytest -vvs --tb native --show-capture=no --log-cli-level=INFO --asyncio-mode=auto {posargs} {toxinidir}/tests/