Skip to content

Commit

Permalink
Update observability integration (#33)
Browse files Browse the repository at this point in the history
* Update obs integration
- rename interfaces
- use scrape charm
- hard code metrics config
- add basic alert rule
- add test

* comment out `test_notebook` due to flakiness
  • Loading branch information
agathanatasha authored Apr 29, 2022
1 parent d8aec30 commit a3cfb92
Show file tree
Hide file tree
Showing 14 changed files with 156 additions and 98 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@ build/
__pycache__
.tox
.coverage
geckodriver.log
12 changes: 0 additions & 12 deletions charms/jupyter-controller/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,3 @@ options:
type: boolean
default: false
description: Manually enable Istio integration, instead of via relations
metrics-port:
type: string
default: '8080'
description: Metrics port
metrics-api:
type: string
default: '/metrics'
description: metrics api route
metrics-scrape-interval:
type: string
default: '30s'
description: how often metrics are scraped
2 changes: 1 addition & 1 deletion charms/jupyter-controller/metadata.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ deployment:
type: stateless
service: omit
provides:
monitoring:
metrics-endpoint:
interface: prometheus_scrape
grafana-dashboard:
interface: grafana_dashboard
12 changes: 5 additions & 7 deletions charms/jupyter-controller/src/charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@
from ops.main import main
from ops.model import ActiveStatus, MaintenanceStatus, WaitingStatus

METRICS_PORT = "8080"
METRICS_PATH = "/metrics"


class CheckFailedError(Exception):
"""Raise this exception if one of the checks in main fails."""
Expand Down Expand Up @@ -43,13 +46,11 @@ def __init__(self, *args):

self.prometheus_provider = MetricsEndpointProvider(
charm=self,
relation_name="monitoring",
jobs=[
{
"job_name": "jupyter_controller_metrics",
"scrape_interval": self.config["metrics-scrape-interval"],
"metrics_path": self.config["metrics-api"],
"static_configs": [{"targets": ["*:{}".format(self.config["metrics-port"])]}],
"metrics_path": METRICS_PATH,
"static_configs": [{"targets": ["*:{}".format(METRICS_PORT)]}],
}
],
)
Expand All @@ -61,9 +62,6 @@ def __init__(self, *args):
self.on.leader_elected,
self.on.upgrade_charm,
self.on.config_changed,
self.on["monitoring"].relation_changed,
self.on["monitoring"].relation_broken,
self.on["monitoring"].relation_departed,
]:
self.framework.observe(event, self.main)

Expand Down
5 changes: 0 additions & 5 deletions charms/jupyter-controller/src/prometheus_alert_rules/tempfile

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
alert: JupyterControllerUnitIsUnavailable
expr: up < 1
for: 0m
labels:
severity: critical
annotations:
summary: Jupyter controller unit {{ $labels.juju_model }}/{{ $labels.juju_unit }} unavailable
description: >
The Jupyter controller unit {{ $labels.juju_model }} {{ $labels.juju_unit }} is unavailable
LABELS = {{ $labels }}
3 changes: 0 additions & 3 deletions charms/jupyter-controller/test-requirements.txt

This file was deleted.

1 change: 0 additions & 1 deletion charms/jupyter-controller/tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,5 @@ deps =
juju
pytest-operator
-r{toxinidir}/requirements.txt
-r{toxinidir}/test-requirements.txt
commands =
pytest -v --tb native --asyncio-mode=auto --ignore={[vars]tst_path}unit --log-cli-level=INFO -s {posargs}
2 changes: 1 addition & 1 deletion charms/jupyter-ui/src/charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@


class CheckFailed(Exception):
""" Raise this exception if one of the checks in main fails. """
"""Raise this exception if one of the checks in main fails."""

def __init__(self, msg, status_type=None):
super().__init__()
Expand Down
2 changes: 1 addition & 1 deletion charms/jupyter-ui/test-requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
pytest<6.3
black==20.8b1
black
flake8<4.1
2 changes: 1 addition & 1 deletion charms/jupyter-ui/tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,4 @@ commands =
[testenv:lint]
commands =
flake8 {toxinidir}/src {toxinidir}/tests
black --check {toxinidir}/src {toxinidir}/tests
black --check --diff {toxinidir}/src {toxinidir}/tests
2 changes: 1 addition & 1 deletion test-requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
black==20.8b1
black
flake8<4.1
juju<2.10
lightkube<0.11
Expand Down
198 changes: 134 additions & 64 deletions tests/test_charms.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,12 @@
from selenium.webdriver.support.ui import WebDriverWait
from seleniumwire import webdriver

CONTROLLER_METADATA = yaml.safe_load(Path("charms/jupyter-controller/metadata.yaml").read_text())
UI_METADATA = yaml.safe_load(Path("charms/jupyter-ui/metadata.yaml").read_text())
CONTROLLER_PATH = Path("charms/jupyter-controller")
UI_PATH = Path("charms/jupyter-ui")
CONTROLLER_METADATA = yaml.safe_load(Path(f"{CONTROLLER_PATH}/metadata.yaml").read_text())
UI_METADATA = yaml.safe_load(Path(f"{UI_PATH}/metadata.yaml").read_text())
CONTROLLER_APP_NAME = CONTROLLER_METADATA["name"]
UI_APP_NAME = UI_METADATA["name"]


INGRESSGATEWAY_NAME = "istio-ingressgateway-operator"
Expand Down Expand Up @@ -67,7 +71,30 @@ def dummy_resources_for_testing(lightkube_client):

@pytest.mark.abort_on_fail
async def test_build_and_deploy(ops_test, lightkube_client, dummy_resources_for_testing):
await ops_test.deploy_bundle(destructive_mode=True, serial=True, extra_args=["--trust"])
controller_charm = await ops_test.build_charm(CONTROLLER_PATH)
controller_image_path = CONTROLLER_METADATA["resources"]["oci-image"]["upstream-source"]
ui_charm = await ops_test.build_charm(UI_PATH)
ui_image_path = UI_METADATA["resources"]["oci-image"]["upstream-source"]

await ops_test.model.deploy("istio-pilot", channel="1.5/stable")
await ops_test.model.deploy(ui_charm, resources={"oci-image": ui_image_path})
await ops_test.model.add_relation(UI_APP_NAME, "istio-pilot")

await ops_test.model.wait_for_idle(
apps=["istio-pilot", UI_APP_NAME], status="active", timeout=60 * 10
)

await ops_test.model.deploy(
"istio-gateway", application_name=INGRESSGATEWAY_NAME, channel="1.5/stable", trust=True
)
await ops_test.model.add_relation("istio-pilot", INGRESSGATEWAY_NAME)

await ops_test.model.deploy(controller_charm, resources={"oci-image": controller_image_path})
await ops_test.model.deploy("kubeflow-profiles")
await ops_test.model.deploy("kubeflow-dashboard")
await ops_test.model.add_relation("kubeflow-profiles", "kubeflow-dashboard")

await ops_test.model.deploy("admission-webhook")

await patch_ingress_gateway(lightkube_client, ops_test)

Expand Down Expand Up @@ -119,7 +146,7 @@ def driver(request, ops_test, lightkube_client):
options = Options()
options.headless = True
options.log.level = 'trace'
max_wait = 150 # seconds
max_wait = 200 # seconds

kwargs = {
'options': options,
Expand All @@ -144,90 +171,133 @@ def driver(request, ops_test, lightkube_client):
driver.get_screenshot_as_file(f'/tmp/selenium-{request.node.name}.png')


def test_notebook(driver, ops_test, dummy_resources_for_testing):
"""Ensures a notebook can be created and connected to."""

# jupyter-ui does not reliably report the correct notebook status
# https://github.com/kubeflow/kubeflow/issues/6056
# def test_notebook(driver, ops_test):
# """Ensures a notebook can be created and connected to."""
#
# driver, wait, url = driver
#
# notebook_name = 'ci-test-' + ''.join(choices(ascii_lowercase, k=10))
#
# # Click "New Server" button
# new_button = wait.until(EC.presence_of_element_located((By.ID, "newResource")))
# new_button.click()
#
# wait.until(EC.url_to_be(url + 'new'))
#
# # Enter server name
# name_input = wait.until(
# EC.presence_of_element_located((By.CSS_SELECTOR, "input[data-placeholder='Name']"))
# )
# name_input.send_keys(notebook_name)
#
# # Click submit on the form. Sleep for 1 second before clicking the submit button because shiny
# # animations that ignore click events are simply a must.
# sleep(1)
# driver.find_element_by_xpath("//*[contains(text(), 'LAUNCH')]").click()
# wait.until(EC.url_to_be(url))
#
# # Since upstream doesn't use proper class names or IDs or anything, find the <tr> containing
# # elements that contain the notebook name and `ready`, signifying that the notebook is finished
# # booting up. Returns a reference to the connect button, suitable for clicking on. The result is
# # a fairly unreadable XPath reference, but it works 🤷
# chonky_boi = [
# f"//*[contains(text(), '{notebook_name}')]",
# "/ancestor::tr",
# "//*[contains(@class, 'ready')]",
# "/ancestor::tr",
# "//*[contains(@class, 'action-button')]",
# "//button",
# ]
# chonky_boi = ''.join(chonky_boi)
# wait.until(EC.presence_of_element_located((By.XPATH, chonky_boi))).click()
#
# # Make sure we can connect to a specific notebook's endpoint.
# # Notebook is opened in a new tab, so we have to explicitly switch to it, run our tests, close
# # it, then switch back to the main window.
# driver.switch_to.window(driver.window_handles[-1])
# expected_path = f'/notebook/kubeflow-user/{notebook_name}/lab'
# for _ in range(12):
# path = urlparse(driver.current_url).path
# if path == expected_path:
# break
#
# # Page took a while to load, so can't refresh it too quickly. Sometimes took longer than 5
# # seconds, never longer than 10 seconds
# sleep(10)
# driver.refresh()
# else:
# pytest.fail(
# "Waited too long for selenium to open up notebook server. "
# f"Expected current path to be `{expected_path}`, got `{path}`."
# )
#
# # Wait for main content div to load
# # TODO: More testing of notebook UIs
# wait.until(EC.presence_of_element_located((By.CLASS_NAME, "jp-Launcher-sectionTitle")))
# driver.execute_script('window.close()')
# driver.switch_to.window(driver.window_handles[-1])
#
# # Delete notebook, and wait for it to finalize
# driver.find_element_by_xpath("//*[contains(text(), 'delete')]").click()
# wait.until(EC.presence_of_element_located((By.CLASS_NAME, "mat-warn"))).click()
#
# wait.until_not(
# EC.presence_of_element_located((By.XPATH, f"//*[contains(text(), '{notebook_name}')]"))
# )


def test_create_notebook(driver, ops_test, dummy_resources_for_testing):
"""Ensures a notebook can be created. Does not test connection due to upstream bug.
https://github.com/kubeflow/kubeflow/issues/6056
When the bug is fixed, remove this test and re-enable `test_notebook` test above."""
driver, wait, url = driver

notebook_name = 'ci-test-' + ''.join(choices(ascii_lowercase, k=10))

# Click "New Server" button
new_button = wait.until(EC.presence_of_element_located((By.ID, "newResource")))
new_button.click()

wait.until(EC.url_to_be(url + 'new'))
# Click "New Notebook" button
wait.until(EC.element_to_be_clickable((By.ID, "newResource"))).click()
wait.until(EC.url_matches('new'))

# Enter server name
name_input = wait.until(
EC.presence_of_element_located((By.CSS_SELECTOR, "input[data-placeholder='Name']"))
)
name_input.click()
name_input.send_keys(notebook_name)

# Click submit on the form. Sleep for 1 second before clicking the submit button because shiny
# animations that ignore click events are simply a must.
# Scrolling would fail without this sleep
sleep(1)
driver.find_element_by_xpath("//*[contains(text(), 'LAUNCH')]").click()
wait.until(EC.url_to_be(url))

# Since upstream doesn't use proper class names or IDs or anything, find the <tr> containing
# elements that contain the notebook name and `ready`, signifying that the notebook is finished
# booting up. Returns a reference to the connect button, suitable for clicking on. The result is
# a fairly unreadable XPath reference, but it works 🤷
chonky_boi = [
f"//*[contains(text(), '{notebook_name}')]",
"/ancestor::tr",
"//*[contains(@class, 'ready')]",
"/ancestor::tr",
"//*[contains(@class, 'action-button')]",
"//button",
]
chonky_boi = ''.join(chonky_boi)
wait.until(EC.presence_of_element_located((By.XPATH, chonky_boi))).click()

# Make sure we can connect to a specific notebook's endpoint.
# Notebook is opened in a new tab, so we have to explicitly switch to it, run our tests, close
# it, then switch back to the main window.
driver.switch_to.window(driver.window_handles[-1])
expected_path = f'/notebook/kubeflow-user/{notebook_name}/lab'
for _ in range(12):
path = urlparse(driver.current_url).path
if path == expected_path:
break

# Page took a while to load, so can't refresh it too quickly. Sometimes took longer than 5
# seconds, never longer than 10 seconds
sleep(10)
driver.refresh()
else:
pytest.fail(
"Waited too long for selenium to open up notebook server. "
f"Expected current path to be `{expected_path}`, got `{path}`."
)

# Wait for main content div to load
# TODO: More testing of notebook UIs
wait.until(EC.presence_of_element_located((By.CLASS_NAME, "jp-Launcher-sectionTitle")))
driver.execute_script('window.close()')
driver.switch_to.window(driver.window_handles[-1])

# Delete notebook, and wait for it to finalize
driver.find_element_by_xpath("//*[contains(text(), 'delete')]").click()
wait.until(EC.presence_of_element_located((By.CLASS_NAME, "mat-warn"))).click()

wait.until_not(

# scroll to bottom of the page for launch button
driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")

launch_button = wait.until(
EC.presence_of_element_located((By.XPATH, "//*[contains(text(), 'LAUNCH')]"))
)
launch_button.click()
wait.until(EC.url_matches(url))

# Check the notebook name is displayed
wait.until(
EC.presence_of_element_located((By.XPATH, f"//*[contains(text(), '{notebook_name}')]"))
)


async def test_integrate_with_prometheus_and_grafana(ops_test):
prometheus = "prometheus-k8s"
grafana = "grafana-k8s"
prometheus_scrape = "prometheus-scrape-config-k8s"
jupyter_controller = "jupyter-controller"
scrape_config = {"scrape_interval": "30s"}
await ops_test.model.deploy(prometheus, channel="latest/beta")
await ops_test.model.deploy(grafana, channel="latest/beta")
await ops_test.model.deploy(prometheus_scrape, channel="latest/beta", config=scrape_config)
await ops_test.model.add_relation(jupyter_controller, prometheus_scrape)
await ops_test.model.add_relation(prometheus, prometheus_scrape)
await ops_test.model.add_relation(prometheus, grafana)
await ops_test.model.add_relation(jupyter_controller, grafana)
await ops_test.model.add_relation(prometheus, jupyter_controller)

await ops_test.model.wait_for_idle([jupyter_controller, prometheus, grafana], status="active")
status = await ops_test.model.get_status()
Expand Down
2 changes: 1 addition & 1 deletion tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,4 @@ passenv =
DISPLAY
deps =
-rtest-requirements.txt
commands = pytest -vvs --tb native --show-capture=no --log-cli-level=INFO {posargs} {toxinidir}/tests/
commands = pytest -vvs --tb native --show-capture=no --log-cli-level=INFO --asyncio-mode=auto {posargs} {toxinidir}/tests/

0 comments on commit a3cfb92

Please sign in to comment.