From a3cfb92d7bc3a294d9a758a6f982b4fc8ca306c4 Mon Sep 17 00:00:00 2001 From: Natasha Ho Date: Fri, 29 Apr 2022 11:29:04 -0400 Subject: [PATCH] Update observability integration (#33) * Update obs integration - rename interfaces - use scrape charm - hard code metrics config - add basic alert rule - add test * comment out `test_notebook` due to flakiness --- .gitignore | 1 + charms/jupyter-controller/config.yaml | 12 -- charms/jupyter-controller/metadata.yaml | 2 +- charms/jupyter-controller/src/charm.py | 12 +- .../src/prometheus_alert_rules/tempfile | 5 - .../unit_unavailable.rule | 10 + .../jupyter-controller/test-requirements.txt | 3 - charms/jupyter-controller/tox.ini | 1 - charms/jupyter-ui/src/charm.py | 2 +- charms/jupyter-ui/test-requirements.txt | 2 +- charms/jupyter-ui/tox.ini | 2 +- test-requirements.txt | 2 +- tests/test_charms.py | 198 ++++++++++++------ tox.ini | 2 +- 14 files changed, 156 insertions(+), 98 deletions(-) delete mode 100644 charms/jupyter-controller/src/prometheus_alert_rules/tempfile create mode 100644 charms/jupyter-controller/src/prometheus_alert_rules/unit_unavailable.rule delete mode 100644 charms/jupyter-controller/test-requirements.txt diff --git a/.gitignore b/.gitignore index fa524f00..2f30bd54 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,4 @@ build/ __pycache__ .tox .coverage +geckodriver.log diff --git a/charms/jupyter-controller/config.yaml b/charms/jupyter-controller/config.yaml index d384cad4..d2960190 100644 --- a/charms/jupyter-controller/config.yaml +++ b/charms/jupyter-controller/config.yaml @@ -7,15 +7,3 @@ options: type: boolean default: false description: Manually enable Istio integration, instead of via relations - metrics-port: - type: string - default: '8080' - description: Metrics port - metrics-api: - type: string - default: '/metrics' - description: metrics api route - metrics-scrape-interval: - type: string - default: '30s' - description: how often metrics are scraped diff --git a/charms/jupyter-controller/metadata.yaml b/charms/jupyter-controller/metadata.yaml index 1ef25dc3..7831dab9 100755 --- a/charms/jupyter-controller/metadata.yaml +++ b/charms/jupyter-controller/metadata.yaml @@ -12,7 +12,7 @@ deployment: type: stateless service: omit provides: - monitoring: + metrics-endpoint: interface: prometheus_scrape grafana-dashboard: interface: grafana_dashboard diff --git a/charms/jupyter-controller/src/charm.py b/charms/jupyter-controller/src/charm.py index c847bfd6..04489ac2 100755 --- a/charms/jupyter-controller/src/charm.py +++ b/charms/jupyter-controller/src/charm.py @@ -16,6 +16,9 @@ from ops.main import main from ops.model import ActiveStatus, MaintenanceStatus, WaitingStatus +METRICS_PORT = "8080" +METRICS_PATH = "/metrics" + class CheckFailedError(Exception): """Raise this exception if one of the checks in main fails.""" @@ -43,13 +46,11 @@ def __init__(self, *args): self.prometheus_provider = MetricsEndpointProvider( charm=self, - relation_name="monitoring", jobs=[ { "job_name": "jupyter_controller_metrics", - "scrape_interval": self.config["metrics-scrape-interval"], - "metrics_path": self.config["metrics-api"], - "static_configs": [{"targets": ["*:{}".format(self.config["metrics-port"])]}], + "metrics_path": METRICS_PATH, + "static_configs": [{"targets": ["*:{}".format(METRICS_PORT)]}], } ], ) @@ -61,9 +62,6 @@ def __init__(self, *args): self.on.leader_elected, self.on.upgrade_charm, self.on.config_changed, - self.on["monitoring"].relation_changed, - self.on["monitoring"].relation_broken, - self.on["monitoring"].relation_departed, ]: self.framework.observe(event, self.main) diff --git a/charms/jupyter-controller/src/prometheus_alert_rules/tempfile b/charms/jupyter-controller/src/prometheus_alert_rules/tempfile deleted file mode 100644 index 6e0d914a..00000000 --- a/charms/jupyter-controller/src/prometheus_alert_rules/tempfile +++ /dev/null @@ -1,5 +0,0 @@ -"""prometheus_alert_rules folder is only added to .charm file -if there is any relevant content. -Adding this file only to ensure the folder is added to the -charm and MetricsEndpointProvider does not complain. -""" diff --git a/charms/jupyter-controller/src/prometheus_alert_rules/unit_unavailable.rule b/charms/jupyter-controller/src/prometheus_alert_rules/unit_unavailable.rule new file mode 100644 index 00000000..748c1eb8 --- /dev/null +++ b/charms/jupyter-controller/src/prometheus_alert_rules/unit_unavailable.rule @@ -0,0 +1,10 @@ +alert: JupyterControllerUnitIsUnavailable +expr: up < 1 +for: 0m +labels: + severity: critical +annotations: + summary: Jupyter controller unit {{ $labels.juju_model }}/{{ $labels.juju_unit }} unavailable + description: > + The Jupyter controller unit {{ $labels.juju_model }} {{ $labels.juju_unit }} is unavailable + LABELS = {{ $labels }} diff --git a/charms/jupyter-controller/test-requirements.txt b/charms/jupyter-controller/test-requirements.txt deleted file mode 100644 index 96add76a..00000000 --- a/charms/jupyter-controller/test-requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -pytest<6.3 -black==20.8b1 -flake8<4.1 diff --git a/charms/jupyter-controller/tox.ini b/charms/jupyter-controller/tox.ini index fe3aa8df..4be25470 100644 --- a/charms/jupyter-controller/tox.ini +++ b/charms/jupyter-controller/tox.ini @@ -67,6 +67,5 @@ deps = juju pytest-operator -r{toxinidir}/requirements.txt - -r{toxinidir}/test-requirements.txt commands = pytest -v --tb native --asyncio-mode=auto --ignore={[vars]tst_path}unit --log-cli-level=INFO -s {posargs} diff --git a/charms/jupyter-ui/src/charm.py b/charms/jupyter-ui/src/charm.py index 52cdade6..34eb3a6a 100755 --- a/charms/jupyter-ui/src/charm.py +++ b/charms/jupyter-ui/src/charm.py @@ -16,7 +16,7 @@ class CheckFailed(Exception): - """ Raise this exception if one of the checks in main fails. """ + """Raise this exception if one of the checks in main fails.""" def __init__(self, msg, status_type=None): super().__init__() diff --git a/charms/jupyter-ui/test-requirements.txt b/charms/jupyter-ui/test-requirements.txt index 96add76a..90f4d537 100644 --- a/charms/jupyter-ui/test-requirements.txt +++ b/charms/jupyter-ui/test-requirements.txt @@ -1,3 +1,3 @@ pytest<6.3 -black==20.8b1 +black flake8<4.1 diff --git a/charms/jupyter-ui/tox.ini b/charms/jupyter-ui/tox.ini index 12ea7ff5..b9f205d6 100644 --- a/charms/jupyter-ui/tox.ini +++ b/charms/jupyter-ui/tox.ini @@ -18,4 +18,4 @@ commands = [testenv:lint] commands = flake8 {toxinidir}/src {toxinidir}/tests - black --check {toxinidir}/src {toxinidir}/tests + black --check --diff {toxinidir}/src {toxinidir}/tests diff --git a/test-requirements.txt b/test-requirements.txt index 978169cb..82710e07 100644 --- a/test-requirements.txt +++ b/test-requirements.txt @@ -1,4 +1,4 @@ -black==20.8b1 +black flake8<4.1 juju<2.10 lightkube<0.11 diff --git a/tests/test_charms.py b/tests/test_charms.py index a21b238f..d285030b 100644 --- a/tests/test_charms.py +++ b/tests/test_charms.py @@ -23,8 +23,12 @@ from selenium.webdriver.support.ui import WebDriverWait from seleniumwire import webdriver -CONTROLLER_METADATA = yaml.safe_load(Path("charms/jupyter-controller/metadata.yaml").read_text()) -UI_METADATA = yaml.safe_load(Path("charms/jupyter-ui/metadata.yaml").read_text()) +CONTROLLER_PATH = Path("charms/jupyter-controller") +UI_PATH = Path("charms/jupyter-ui") +CONTROLLER_METADATA = yaml.safe_load(Path(f"{CONTROLLER_PATH}/metadata.yaml").read_text()) +UI_METADATA = yaml.safe_load(Path(f"{UI_PATH}/metadata.yaml").read_text()) +CONTROLLER_APP_NAME = CONTROLLER_METADATA["name"] +UI_APP_NAME = UI_METADATA["name"] INGRESSGATEWAY_NAME = "istio-ingressgateway-operator" @@ -67,7 +71,30 @@ def dummy_resources_for_testing(lightkube_client): @pytest.mark.abort_on_fail async def test_build_and_deploy(ops_test, lightkube_client, dummy_resources_for_testing): - await ops_test.deploy_bundle(destructive_mode=True, serial=True, extra_args=["--trust"]) + controller_charm = await ops_test.build_charm(CONTROLLER_PATH) + controller_image_path = CONTROLLER_METADATA["resources"]["oci-image"]["upstream-source"] + ui_charm = await ops_test.build_charm(UI_PATH) + ui_image_path = UI_METADATA["resources"]["oci-image"]["upstream-source"] + + await ops_test.model.deploy("istio-pilot", channel="1.5/stable") + await ops_test.model.deploy(ui_charm, resources={"oci-image": ui_image_path}) + await ops_test.model.add_relation(UI_APP_NAME, "istio-pilot") + + await ops_test.model.wait_for_idle( + apps=["istio-pilot", UI_APP_NAME], status="active", timeout=60 * 10 + ) + + await ops_test.model.deploy( + "istio-gateway", application_name=INGRESSGATEWAY_NAME, channel="1.5/stable", trust=True + ) + await ops_test.model.add_relation("istio-pilot", INGRESSGATEWAY_NAME) + + await ops_test.model.deploy(controller_charm, resources={"oci-image": controller_image_path}) + await ops_test.model.deploy("kubeflow-profiles") + await ops_test.model.deploy("kubeflow-dashboard") + await ops_test.model.add_relation("kubeflow-profiles", "kubeflow-dashboard") + + await ops_test.model.deploy("admission-webhook") await patch_ingress_gateway(lightkube_client, ops_test) @@ -119,7 +146,7 @@ def driver(request, ops_test, lightkube_client): options = Options() options.headless = True options.log.level = 'trace' - max_wait = 150 # seconds + max_wait = 200 # seconds kwargs = { 'options': options, @@ -144,77 +171,116 @@ def driver(request, ops_test, lightkube_client): driver.get_screenshot_as_file(f'/tmp/selenium-{request.node.name}.png') -def test_notebook(driver, ops_test, dummy_resources_for_testing): - """Ensures a notebook can be created and connected to.""" - +# jupyter-ui does not reliably report the correct notebook status +# https://github.com/kubeflow/kubeflow/issues/6056 +# def test_notebook(driver, ops_test): +# """Ensures a notebook can be created and connected to.""" +# +# driver, wait, url = driver +# +# notebook_name = 'ci-test-' + ''.join(choices(ascii_lowercase, k=10)) +# +# # Click "New Server" button +# new_button = wait.until(EC.presence_of_element_located((By.ID, "newResource"))) +# new_button.click() +# +# wait.until(EC.url_to_be(url + 'new')) +# +# # Enter server name +# name_input = wait.until( +# EC.presence_of_element_located((By.CSS_SELECTOR, "input[data-placeholder='Name']")) +# ) +# name_input.send_keys(notebook_name) +# +# # Click submit on the form. Sleep for 1 second before clicking the submit button because shiny +# # animations that ignore click events are simply a must. +# sleep(1) +# driver.find_element_by_xpath("//*[contains(text(), 'LAUNCH')]").click() +# wait.until(EC.url_to_be(url)) +# +# # Since upstream doesn't use proper class names or IDs or anything, find the containing +# # elements that contain the notebook name and `ready`, signifying that the notebook is finished +# # booting up. Returns a reference to the connect button, suitable for clicking on. The result is +# # a fairly unreadable XPath reference, but it works 🤷 +# chonky_boi = [ +# f"//*[contains(text(), '{notebook_name}')]", +# "/ancestor::tr", +# "//*[contains(@class, 'ready')]", +# "/ancestor::tr", +# "//*[contains(@class, 'action-button')]", +# "//button", +# ] +# chonky_boi = ''.join(chonky_boi) +# wait.until(EC.presence_of_element_located((By.XPATH, chonky_boi))).click() +# +# # Make sure we can connect to a specific notebook's endpoint. +# # Notebook is opened in a new tab, so we have to explicitly switch to it, run our tests, close +# # it, then switch back to the main window. +# driver.switch_to.window(driver.window_handles[-1]) +# expected_path = f'/notebook/kubeflow-user/{notebook_name}/lab' +# for _ in range(12): +# path = urlparse(driver.current_url).path +# if path == expected_path: +# break +# +# # Page took a while to load, so can't refresh it too quickly. Sometimes took longer than 5 +# # seconds, never longer than 10 seconds +# sleep(10) +# driver.refresh() +# else: +# pytest.fail( +# "Waited too long for selenium to open up notebook server. " +# f"Expected current path to be `{expected_path}`, got `{path}`." +# ) +# +# # Wait for main content div to load +# # TODO: More testing of notebook UIs +# wait.until(EC.presence_of_element_located((By.CLASS_NAME, "jp-Launcher-sectionTitle"))) +# driver.execute_script('window.close()') +# driver.switch_to.window(driver.window_handles[-1]) +# +# # Delete notebook, and wait for it to finalize +# driver.find_element_by_xpath("//*[contains(text(), 'delete')]").click() +# wait.until(EC.presence_of_element_located((By.CLASS_NAME, "mat-warn"))).click() +# +# wait.until_not( +# EC.presence_of_element_located((By.XPATH, f"//*[contains(text(), '{notebook_name}')]")) +# ) + + +def test_create_notebook(driver, ops_test, dummy_resources_for_testing): + """Ensures a notebook can be created. Does not test connection due to upstream bug. + https://github.com/kubeflow/kubeflow/issues/6056 + When the bug is fixed, remove this test and re-enable `test_notebook` test above.""" driver, wait, url = driver notebook_name = 'ci-test-' + ''.join(choices(ascii_lowercase, k=10)) - # Click "New Server" button - new_button = wait.until(EC.presence_of_element_located((By.ID, "newResource"))) - new_button.click() - - wait.until(EC.url_to_be(url + 'new')) + # Click "New Notebook" button + wait.until(EC.element_to_be_clickable((By.ID, "newResource"))).click() + wait.until(EC.url_matches('new')) # Enter server name name_input = wait.until( EC.presence_of_element_located((By.CSS_SELECTOR, "input[data-placeholder='Name']")) ) + name_input.click() name_input.send_keys(notebook_name) - # Click submit on the form. Sleep for 1 second before clicking the submit button because shiny - # animations that ignore click events are simply a must. + # Scrolling would fail without this sleep sleep(1) - driver.find_element_by_xpath("//*[contains(text(), 'LAUNCH')]").click() - wait.until(EC.url_to_be(url)) - - # Since upstream doesn't use proper class names or IDs or anything, find the containing - # elements that contain the notebook name and `ready`, signifying that the notebook is finished - # booting up. Returns a reference to the connect button, suitable for clicking on. The result is - # a fairly unreadable XPath reference, but it works 🤷 - chonky_boi = [ - f"//*[contains(text(), '{notebook_name}')]", - "/ancestor::tr", - "//*[contains(@class, 'ready')]", - "/ancestor::tr", - "//*[contains(@class, 'action-button')]", - "//button", - ] - chonky_boi = ''.join(chonky_boi) - wait.until(EC.presence_of_element_located((By.XPATH, chonky_boi))).click() - - # Make sure we can connect to a specific notebook's endpoint. - # Notebook is opened in a new tab, so we have to explicitly switch to it, run our tests, close - # it, then switch back to the main window. - driver.switch_to.window(driver.window_handles[-1]) - expected_path = f'/notebook/kubeflow-user/{notebook_name}/lab' - for _ in range(12): - path = urlparse(driver.current_url).path - if path == expected_path: - break - - # Page took a while to load, so can't refresh it too quickly. Sometimes took longer than 5 - # seconds, never longer than 10 seconds - sleep(10) - driver.refresh() - else: - pytest.fail( - "Waited too long for selenium to open up notebook server. " - f"Expected current path to be `{expected_path}`, got `{path}`." - ) - - # Wait for main content div to load - # TODO: More testing of notebook UIs - wait.until(EC.presence_of_element_located((By.CLASS_NAME, "jp-Launcher-sectionTitle"))) - driver.execute_script('window.close()') - driver.switch_to.window(driver.window_handles[-1]) - - # Delete notebook, and wait for it to finalize - driver.find_element_by_xpath("//*[contains(text(), 'delete')]").click() - wait.until(EC.presence_of_element_located((By.CLASS_NAME, "mat-warn"))).click() - - wait.until_not( + + # scroll to bottom of the page for launch button + driver.execute_script("window.scrollTo(0,document.body.scrollHeight)") + + launch_button = wait.until( + EC.presence_of_element_located((By.XPATH, "//*[contains(text(), 'LAUNCH')]")) + ) + launch_button.click() + wait.until(EC.url_matches(url)) + + # Check the notebook name is displayed + wait.until( EC.presence_of_element_located((By.XPATH, f"//*[contains(text(), '{notebook_name}')]")) ) @@ -222,12 +288,16 @@ def test_notebook(driver, ops_test, dummy_resources_for_testing): async def test_integrate_with_prometheus_and_grafana(ops_test): prometheus = "prometheus-k8s" grafana = "grafana-k8s" + prometheus_scrape = "prometheus-scrape-config-k8s" jupyter_controller = "jupyter-controller" + scrape_config = {"scrape_interval": "30s"} await ops_test.model.deploy(prometheus, channel="latest/beta") await ops_test.model.deploy(grafana, channel="latest/beta") + await ops_test.model.deploy(prometheus_scrape, channel="latest/beta", config=scrape_config) + await ops_test.model.add_relation(jupyter_controller, prometheus_scrape) + await ops_test.model.add_relation(prometheus, prometheus_scrape) await ops_test.model.add_relation(prometheus, grafana) await ops_test.model.add_relation(jupyter_controller, grafana) - await ops_test.model.add_relation(prometheus, jupyter_controller) await ops_test.model.wait_for_idle([jupyter_controller, prometheus, grafana], status="active") status = await ops_test.model.get_status() diff --git a/tox.ini b/tox.ini index 4c52c917..08383557 100644 --- a/tox.ini +++ b/tox.ini @@ -22,4 +22,4 @@ passenv = DISPLAY deps = -rtest-requirements.txt -commands = pytest -vvs --tb native --show-capture=no --log-cli-level=INFO {posargs} {toxinidir}/tests/ +commands = pytest -vvs --tb native --show-capture=no --log-cli-level=INFO --asyncio-mode=auto {posargs} {toxinidir}/tests/