diff --git a/.cirun.yml b/.cirun.yml
index bdabe6500..dcc829bb8 100644
--- a/.cirun.yml
+++ b/.cirun.yml
@@ -4,8 +4,8 @@ runners:
   - name: run-k8s-tests
     # Cloud Provider: AWS
     cloud: aws
-    # Instance Type has 4 vcpu, 16 GiB memory, Up to 5 Gbps Network Performance
-    instance_type: t3a.xlarge
+    # Instance Type has 8 vcpu, 32 GiB memory, Up to 5 Gbps Network Performance
+    instance_type: t3a.2xlarge
     # Custom AMI with docker/cypress/hub pre-installed
     machine_image: ami-0a388df278199ff52
     # Region: Oregon
diff --git a/.github/workflows/test_local_integration.yaml b/.github/workflows/test_local_integration.yaml
index ac5ff87b4..05dec384b 100644
--- a/.github/workflows/test_local_integration.yaml
+++ b/.github/workflows/test_local_integration.yaml
@@ -96,7 +96,6 @@ jobs:
           sed -i -E 's/(cpu_guarantee):\s+[0-9\.]+/\1: 0.25/g' "nebari-config.yaml"
           sed -i -E 's/(mem_guarantee):\s+[A-Za-z0-9\.]+/\1: 0.25G/g' "nebari-config.yaml"
 
-
           # Change default JupyterLab theme
           cat >> nebari-config.yaml <<- EOM
           jupyterlab:
@@ -105,6 +104,16 @@ jobs:
                 theme: JupyterLab Dark
           EOM
 
+          # Change default value for minio persistence size
+          cat >> nebari-config.yaml <<- EOM
+          monitoring:
+            enabled: true
+            overrides:
+              minio:
+                persistence:
+                  size: 1Gi
+          EOM
+
           cat nebari-config.yaml
 
       - name: Deploy Nebari
@@ -115,7 +124,7 @@ jobs:
       - name: Basic kubectl checks after deployment
         if: always()
         run: |
-          kubectl get all,cm,secret,ing -A
+          kubectl get all,cm,secret,pv,pvc,ing -A
 
       - name: Check github-actions.nebari.dev resolves
         run: |
diff --git a/RELEASE.md b/RELEASE.md
index f3f93499a..076754b3a 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -11,6 +11,8 @@ This file is copied to nebari-dev/nebari-docs using a GitHub Action.
 -->
 ## Upcoming Release
 
+* Added Grafana Loki to aggregate, index and search logs
+
 ## Release 2024.1.1 - January 17, 2024
 
 ### Feature changes and enhancements
diff --git a/src/_nebari/stages/kubernetes_services/__init__.py b/src/_nebari/stages/kubernetes_services/__init__.py
index 0702a27c5..a9124f41a 100644
--- a/src/_nebari/stages/kubernetes_services/__init__.py
+++ b/src/_nebari/stages/kubernetes_services/__init__.py
@@ -199,8 +199,16 @@ class JHubApps(schema.Base):
     enabled: bool = False
 
 
+class MonitoringOverrides(schema.Base):
+    loki: typing.Dict = {}
+    promtail: typing.Dict = {}
+    minio: typing.Dict = {}
+
+
 class Monitoring(schema.Base):
     enabled: bool = True
+    overrides: MonitoringOverrides = MonitoringOverrides()
+    minio_enabled: bool = True
 
 
 class JupyterLabPioneer(schema.Base):
@@ -381,6 +389,12 @@ class DaskGatewayInputVars(schema.Base):
 
 class MonitoringInputVars(schema.Base):
     monitoring_enabled: bool = Field(alias="monitoring-enabled")
+    minio_enabled: bool = Field(alias="minio-enabled")
+    grafana_loki_overrides: List[str] = Field(alias="grafana-loki-overrides")
+    grafana_promtail_overrides: List[str] = Field(alias="grafana-promtail-overrides")
+    grafana_loki_minio_overrides: List[str] = Field(
+        alias="grafana-loki-minio-overrides"
+    )
 
 
 class TelemetryInputVars(schema.Base):
@@ -524,6 +538,14 @@ def input_vars(self, stage_outputs: Dict[str, Dict[str, Any]]):
 
         monitoring_vars = MonitoringInputVars(
             monitoring_enabled=self.config.monitoring.enabled,
+            minio_enabled=self.config.monitoring.minio_enabled,
+            grafana_loki_overrides=[json.dumps(self.config.monitoring.overrides.loki)],
+            grafana_promtail_overrides=[
+                json.dumps(self.config.monitoring.overrides.promtail)
+            ],
+            grafana_loki_minio_overrides=[
+                json.dumps(self.config.monitoring.overrides.minio)
+            ],
         )
 
         telemetry_vars = TelemetryInputVars(
diff --git a/src/_nebari/stages/kubernetes_services/template/modules/kubernetes/services/monitoring/loki/main.tf b/src/_nebari/stages/kubernetes_services/template/modules/kubernetes/services/monitoring/loki/main.tf
new file mode 100644
index 000000000..8180d46fb
--- /dev/null
+++ b/src/_nebari/stages/kubernetes_services/template/modules/kubernetes/services/monitoring/loki/main.tf
@@ -0,0 +1,103 @@
+resource "random_password" "minio_root_password" {
+  length  = 32
+  special = false
+}
+
+locals {
+  minio-url = "http://${var.minio-release-name}:${var.minio-port}"
+  node-selector = {
+    "${var.node-group.key}" = "${var.node-group.value}"
+  }
+}
+
+resource "helm_release" "loki-minio" {
+  count      = var.minio-enabled ? 1 : 0
+  name       = var.minio-release-name
+  namespace  = var.namespace
+  repository = "https://raw.githubusercontent.com/bitnami/charts/defb094c658024e4aa8245622dab202874880cbc/bitnami"
+  chart      = "minio"
+  # last release that was Apache-2.0
+  version = var.minio-helm-chart-version
+
+  set {
+    name  = "accessKey.password"
+    value = "admin"
+  }
+
+  set {
+    name  = "secretKey.password"
+    value = random_password.minio_root_password.result
+  }
+
+  set {
+    name  = "defaultBuckets"
+    value = join(" ", var.buckets)
+  }
+
+  set {
+    name  = "persistence.size"
+    value = var.minio-storage
+  }
+
+  values = concat([
+    file("${path.module}/values_minio.yaml"),
+    jsonencode({
+      nodeSelector : local.node-selector
+    })
+  ], var.grafana-loki-minio-overrides)
+}
+
+
+resource "helm_release" "grafana-loki" {
+  name       = "nebari-loki"
+  namespace  = var.namespace
+  repository = "https://grafana.github.io/helm-charts"
+  chart      = "loki"
+  version    = var.loki-helm-chart-version
+
+  values = concat([
+    file("${path.module}/values_loki.yaml"),
+    jsonencode({
+      loki : {
+        storage : {
+          s3 : {
+            endpoint : local.minio-url,
+            accessKeyId : "admin"
+            secretAccessKey : random_password.minio_root_password.result,
+            s3ForcePathStyle : true
+          }
+        }
+      }
+      storageConfig : {
+        # We configure MinIO by using the AWS config because MinIO implements the S3 API
+        aws : {
+          s3 : local.minio-url
+          s3ForcePathStyle : true
+        }
+      }
+      write : { nodeSelector : local.node-selector }
+      read : { nodeSelector : local.node-selector }
+      backend : { nodeSelector : local.node-selector }
+      gateway : { nodeSelector : local.node-selector }
+    })
+  ], var.grafana-loki-overrides)
+
+  depends_on = [helm_release.loki-minio]
+}
+
+resource "helm_release" "grafana-promtail" {
+  # Promtail ships the contents of logs to Loki instance
+  name       = "nebari-promtail"
+  namespace  = var.namespace
+  repository = "https://grafana.github.io/helm-charts"
+  chart      = "promtail"
+  version    = var.promtail-helm-chart-version
+
+  values = concat([
+    file("${path.module}/values_promtail.yaml"),
+    jsonencode({
+    })
+  ], var.grafana-promtail-overrides)
+
+  depends_on = [helm_release.grafana-loki]
+}
diff --git a/src/_nebari/stages/kubernetes_services/template/modules/kubernetes/services/monitoring/loki/values_loki.yaml b/src/_nebari/stages/kubernetes_services/template/modules/kubernetes/services/monitoring/loki/values_loki.yaml
new file mode 100644
index 000000000..c11ebe5d1
--- /dev/null
+++ b/src/_nebari/stages/kubernetes_services/template/modules/kubernetes/services/monitoring/loki/values_loki.yaml
@@ -0,0 +1,78 @@
+# https://github.com/grafana/loki/blob/4cae003ecedd474e4c15feab4ea2ef435afff83f/production/helm/loki/values.yaml
+
+loki:
+  storage:
+    type: s3
+  commonConfig:
+    replication_factor: 1
+  # Not required as it is inside cluster and not exposed to the public network
+  auth_enabled: false
+
+  # The Compactor deduplicates index entries and also applies granular retention.
+  compactor:
+    # is the directory where marked chunks and temporary tables will be saved.
+    working_directory: /var/loki/compactor/data/retention
+    # minio s3
+    shared_store: s3
+    # how often compaction will happen
+    compaction_interval: 1h
+    # should delete old logs after retention delete delay
+    # ideally we would want to do storage based retention, but this is not
+    # currently implemented in loki, that's why we're doing time based retention.
+    retention_enabled: true
+    # is the delay after which the Compactor will delete marked chunks.
+    retention_delete_delay: 1h
+    # specifies the maximum quantity of goroutine workers instantiated to delete chunks.
+    retention_delete_worker_count: 150
+
+  limits_config:
+    # The minimum retention period is 24h.
+    # This is reasonable in most cases, but if people would like to retain logs for longer
+    # then they can override this variable from nebari-config.yaml
+    retention_period: 60d
+
+  schema_config:
+    configs:
+      # list of period_configs
+      # The date of the first day that index buckets should be created.
+      - from: "2024-03-01"
+        index:
+          period: 24h
+          prefix: loki_index_
+        object_store: s3
+        schema: v11
+        store: boltdb-shipper
+  storage_config:
+    boltdb_shipper:
+      # Directory where ingesters would write index files which would then be
+      # uploaded by shipper to configured storage
+      active_index_directory: /var/loki/compactor/data/index
+      # Cache location for restoring index files from storage for queries
+      cache_location: /var/loki/compactor/data/boltdb-cache
+      # Shared store for keeping index files
+      shared_store: s3
+
+# Configuration for the write pod(s)
+write:
+  # -- Number of replicas for the write
+  # Keeping cost of running Nebari in mind
+  # We don't need so many replicas, if people need it
+  # they can always override from nebari-config.yaml
+  replicas: 1
+
+read:
+  # -- Number of replicas for the read
+  replicas: 1
+
+backend:
+  # -- Number of replicas for the backend
+  replicas: 1
+
+minio:
+  # We are deploying minio from bitnami chart separately
+  enabled: false
+
+monitoring:
+  selfMonitoring:
+    grafanaAgent:
+      installOperator: false
diff --git a/src/_nebari/stages/kubernetes_services/template/modules/kubernetes/services/monitoring/loki/values_minio.yaml b/src/_nebari/stages/kubernetes_services/template/modules/kubernetes/services/monitoring/loki/values_minio.yaml
new file mode 100644
index 000000000..666542bb4
--- /dev/null
+++ b/src/_nebari/stages/kubernetes_services/template/modules/kubernetes/services/monitoring/loki/values_minio.yaml
@@ -0,0 +1 @@
+# https://github.com/bitnami/charts/blob/440ec159c26e4ff0748b9e9866b345d98220c40a/bitnami/minio/values.yaml
diff --git a/src/_nebari/stages/kubernetes_services/template/modules/kubernetes/services/monitoring/loki/values_promtail.yaml b/src/_nebari/stages/kubernetes_services/template/modules/kubernetes/services/monitoring/loki/values_promtail.yaml
new file mode 100644
index 000000000..5a18a9bc0
--- /dev/null
+++ b/src/_nebari/stages/kubernetes_services/template/modules/kubernetes/services/monitoring/loki/values_promtail.yaml
@@ -0,0 +1 @@
+# https://github.com/grafana/helm-charts/blob/3831194ba2abd2a0ca7a14ca00e578f8e9d2abc6/charts/promtail/values.yaml
diff --git a/src/_nebari/stages/kubernetes_services/template/modules/kubernetes/services/monitoring/loki/variables.tf b/src/_nebari/stages/kubernetes_services/template/modules/kubernetes/services/monitoring/loki/variables.tf
new file mode 100644
index 000000000..a43695252
--- /dev/null
+++ b/src/_nebari/stages/kubernetes_services/template/modules/kubernetes/services/monitoring/loki/variables.tf
@@ -0,0 +1,84 @@
+variable "namespace" {
+  description = "deploy monitoring services on this namespace"
+  type        = string
+  default     = "dev"
+}
+
+variable "loki-helm-chart-version" {
+  description = "version to deploy for the loki helm chart"
+  type        = string
+  default     = "5.43.3"
+}
+
+variable "promtail-helm-chart-version" {
+  description = "version to deploy for the promtail helm chart"
+  type        = string
+  default     = "6.15.5"
+}
+
+variable "minio-helm-chart-version" {
+  description = "version to deploy for the minio helm chart"
+  type        = string
+  default     = "6.7.4"
+}
+
+variable "grafana-loki-overrides" {
+  description = "Grafana Loki helm chart overrides"
+  type        = list(string)
+  default     = []
+}
+
+variable "grafana-promtail-overrides" {
+  description = "Grafana Promtail helm chart overrides"
+  type        = list(string)
+  default     = []
+}
+
+variable "grafana-loki-minio-overrides" {
+  description = "Grafana Loki minio helm chart overrides"
+  type        = list(string)
+  default     = []
+}
+
+variable "minio-release-name" {
+  description = "Grafana Loki minio release name"
+  type        = string
+  default     = "nebari-loki-minio"
+}
+
+variable "minio-port" {
+  description = "Grafana Loki minio port"
+  type        = number
+  default     = 9000
+}
+
+variable "buckets" {
+  description = "Minio buckets"
+  type        = list(string)
+  default = [
+    "chunks",
+    "ruler",
+    "admin",
+    "loki"
+  ]
+}
+
+variable "minio-storage" {
+  description = "Minio storage"
+  type        = string
+  default     = "50Gi"
+}
+
+variable "minio-enabled" {
+  description = "Deploy minio along with loki or not"
+  type        = bool
+  default     = true
+}
+
+variable "node-group" {
+  description = "Node key value pair for bound resources"
+  type = object({
+    key   = string
+    value = string
+  })
+}
diff --git a/src/_nebari/stages/kubernetes_services/template/modules/kubernetes/services/monitoring/values.yaml b/src/_nebari/stages/kubernetes_services/template/modules/kubernetes/services/monitoring/values.yaml
index ada868882..f3cf47c88 100644
--- a/src/_nebari/stages/kubernetes_services/template/modules/kubernetes/services/monitoring/values.yaml
+++ b/src/_nebari/stages/kubernetes_services/template/modules/kubernetes/services/monitoring/values.yaml
@@ -1 +1,7 @@
 # https://github.com/prometheus-community/helm-charts/blob/main/charts/kube-prometheus-stack/values.yaml
+
+grafana:
+  additionalDataSources:
+    - name: Loki
+      type: loki
+      url: http://loki-gateway.dev
diff --git a/src/_nebari/stages/kubernetes_services/template/monitoring.tf b/src/_nebari/stages/kubernetes_services/template/monitoring.tf
index ec20a75ba..39487c4bb 100644
--- a/src/_nebari/stages/kubernetes_services/template/monitoring.tf
+++ b/src/_nebari/stages/kubernetes_services/template/monitoring.tf
@@ -14,3 +14,14 @@ module "monitoring" {
 
   node-group = var.node_groups.general
 }
+
+module "grafana-loki" {
+  count                        = var.monitoring-enabled ? 1 : 0
+  source                       = "./modules/kubernetes/services/monitoring/loki"
+  namespace                    = var.environment
+  grafana-loki-overrides       = var.grafana-loki-overrides
+  grafana-promtail-overrides   = var.grafana-promtail-overrides
+  grafana-loki-minio-overrides = var.grafana-loki-minio-overrides
+  node-group                   = var.node_groups.general
+  minio-enabled                = var.minio-enabled
+}
diff --git a/src/_nebari/stages/kubernetes_services/template/variables.tf b/src/_nebari/stages/kubernetes_services/template/variables.tf
index 4b78f5994..9e36e6597 100644
--- a/src/_nebari/stages/kubernetes_services/template/variables.tf
+++ b/src/_nebari/stages/kubernetes_services/template/variables.tf
@@ -63,3 +63,27 @@ variable "cloud-provider" {
   description = "Name of cloud provider."
   type        = string
 }
+
+variable "grafana-loki-overrides" {
+  description = "Helm chart overrides for loki"
+  type        = list(string)
+  default     = []
+}
+
+variable "grafana-promtail-overrides" {
+  description = "Helm chart overrides for promtail"
+  type        = list(string)
+  default     = []
+}
+
+variable "grafana-loki-minio-overrides" {
+  description = "Grafana Loki minio helm chart overrides"
+  type        = list(string)
+  default     = []
+}
+
+variable "minio-enabled" {
+  description = "Deploy minio along with loki or not"
+  type        = bool
+  default     = true
+}
diff --git a/tests/common/kube_api.py b/tests/common/kube_api.py
new file mode 100644
index 000000000..eec1d05d7
--- /dev/null
+++ b/tests/common/kube_api.py
@@ -0,0 +1,42 @@
+import socket
+import typing
+
+from kubernetes import config
+from kubernetes.client.api import core_v1_api
+from kubernetes.client.models import V1Pod
+from kubernetes.stream import portforward
+
+
+def kubernetes_port_forward(
+    pod_labels: typing.Dict[str, str], port: int, namespace: str = "dev"
+) -> V1Pod:
+    """Given pod labels and port, finds the pod name and port forwards to
+    the given port.
+    Note: this replaces ``socket.create_connection`` for the whole process and
+    never restores it; the replacement ignores the requested address.
+    :param pod_labels: dict of labels, by which to search the pod
+    :param port: port number to forward
+    :param namespace: kubernetes namespace name
+    :return: kubernetes pod object
+    """
+    config.load_kube_config()
+    core_v1 = core_v1_api.CoreV1Api()
+    label_selector = ",".join(f"{k}={v}" for k, v in pod_labels.items())
+    pods = core_v1.list_namespaced_pod(
+        namespace=namespace, label_selector=label_selector
+    )
+    assert pods.items, f"no pod matches {label_selector!r} in {namespace!r}"
+    pod = pods.items[0]
+    pod_name = pod.metadata.name
+
+    def kubernetes_create_connection(address, *args, **kwargs):
+        pf = portforward(
+            core_v1.connect_get_namespaced_pod_portforward,
+            pod_name,
+            namespace,
+            ports=str(port),
+        )
+        return pf.socket(port)
+
+    socket.create_connection = kubernetes_create_connection
+    return pod
diff --git a/tests/tests_deployment/test_loki_deployment.py b/tests/tests_deployment/test_loki_deployment.py
new file mode 100644
index 000000000..59210a8fc
--- /dev/null
+++ b/tests/tests_deployment/test_loki_deployment.py
@@ -0,0 +1,123 @@
+import json
+import urllib.parse
+import urllib.request as urllib_request
+
+import pytest
+from kubernetes.client import V1Pod
+
+from tests.common.kube_api import kubernetes_port_forward
+
+LOKI_BACKEND_PORT = 3100
+LOKI_BACKEND_POD_LABELS = {
+    "app.kubernetes.io/instance": "nebari-loki",
+    "app.kubernetes.io/component": "backend",
+}
+
+MINIO_PORT = 9000
+MINIO_POD_LABELS = {
+    "app.kubernetes.io/instance": "nebari-loki-minio",
+    "app.kubernetes.io/name": "minio",
+}
+
+LOKI_GATEWAY_PORT = 8080
+LOKI_GATEWAY_POD_LABELS = {
+    "app.kubernetes.io/instance": "nebari-loki",
+    "app.kubernetes.io/component": "gateway",
+}
+
+
+@pytest.fixture(scope="module")
+def port_forward_fixture(request):
+    """Pytest fixture to port forward the requested pod to make it accessible
+    on localhost so that we can run some tests on it.
+    """
+    return kubernetes_port_forward(
+        pod_labels=request.param["labels"], port=request.param["port"]
+    )
+
+
+def port_forward(labels, port):
+    """Parametrize ``port_forward_fixture`` with the pod labels and port."""
+    params = {"labels": labels, "port": port}
+    return pytest.mark.parametrize("port_forward_fixture", [params], indirect=True)
+
+
+@pytest.mark.parametrize(
+    "endpoint_path",
+    (
+        "metrics",
+        "services",
+        "config",
+        "ready",
+        "log_level",
+    ),
+)
+@port_forward(labels=LOKI_BACKEND_POD_LABELS, port=LOKI_BACKEND_PORT)
+def test_loki_endpoint(endpoint_path: str, port_forward_fixture: V1Pod):
+    """This will hit some endpoints in the loki API and verify that we
+    get a 200 status code, to make sure Loki is working properly.
+    :param endpoint_path: a loki api endpoint path
+    :param port_forward_fixture: pytest fixture to port forward.
+    :return:
+    """
+    pod_name = port_forward_fixture.metadata.name
+    url = f"http://{pod_name}.pod.dev.kubernetes:{LOKI_BACKEND_PORT}/{endpoint_path}"
+    with urllib_request.urlopen(url) as response:
+        response.read().decode("utf-8")
+        assert response.code == 200
+
+
+@port_forward(labels=MINIO_POD_LABELS, port=MINIO_PORT)
+def test_minio_accessible(port_forward_fixture: V1Pod):
+    """This will hit liveness endpoint of minio API and verify that we
+    get a 200 status code, to make sure minio is up and running.
+    :param port_forward_fixture: pytest fixture to port forward.
+    :return:
+    """
+    pod_name = port_forward_fixture.metadata.name
+    url = f"http://{pod_name}.pod.dev.kubernetes:{MINIO_PORT}/minio/health/live"
+    with urllib_request.urlopen(url) as response:
+        response.read().decode("utf-8")
+        assert response.code == 200
+
+
+@port_forward(labels=LOKI_GATEWAY_POD_LABELS, port=LOKI_GATEWAY_PORT)
+def test_loki_gateway(port_forward_fixture: V1Pod):
+    """This will hit an endpoint of loki gateway API and verify that we
+    get a 200 status code, to make sure the loki gateway is up and running.
+    :param port_forward_fixture: pytest fixture to port forward.
+    :return:
+    """
+    pod_name = port_forward_fixture.metadata.name
+    url = f"http://{pod_name}.pod.dev.kubernetes:{LOKI_GATEWAY_PORT}/loki/api/v1/labels"
+    with urllib_request.urlopen(url) as response:
+        response_content = response.read().decode("utf-8")
+        response_json = json.loads(response_content)
+        assert response.code == 200
+        assert response_json["status"] == "success"
+
+
+@port_forward(labels=LOKI_GATEWAY_POD_LABELS, port=LOKI_GATEWAY_PORT)
+def test_loki_gateway_fetch_logs(port_forward_fixture: V1Pod):
+    """This will hit an endpoint of loki gateway API to fetch some logs
+    and verify logs received.
+    :param port_forward_fixture: pytest fixture to port forward.
+    :return: None
+    """
+    pod_name = port_forward_fixture.metadata.name
+    query_params = {
+        "limit": "5",
+        # Fetch logs for jupyterhub app
+        "query": '{app="jupyterhub"}',
+    }
+
+    encoded_params = urllib.parse.urlencode(query_params)
+    path = f"loki/api/v1/query_range?{encoded_params}"
+    url = f"http://{pod_name}.pod.dev.kubernetes:{LOKI_GATEWAY_PORT}/{path}"
+    with urllib_request.urlopen(url) as response:
+        response_content = response.read().decode("utf-8")
+        response_json = json.loads(response_content)
+        assert response.code == 200
+        assert response_json["status"] == "success"
+        # Make sure log lines received
+        assert len(response_json["data"]["result"][0]["values"]) > 0
diff --git a/tests/tests_unit/cli_validate/min.happy.monitoring.overrides.yaml b/tests/tests_unit/cli_validate/min.happy.monitoring.overrides.yaml
new file mode 100644
index 000000000..587c0cf5c
--- /dev/null
+++ b/tests/tests_unit/cli_validate/min.happy.monitoring.overrides.yaml
@@ -0,0 +1,10 @@
+project_name: test
+monitoring:
+  enabled: true
+  overrides:
+    loki:
+      loki: foobar
+    promtail:
+      promtail: foobar
+    minio:
+      minio: foobar