Skip to content

Commit

Permalink
Add Grafana Loki integration (#2156)
Browse files Browse the repository at this point in the history
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Vinicius D. Cerutti <[email protected]>
  • Loading branch information
3 people authored Mar 8, 2024
1 parent 50e2f8a commit 0210a47
Show file tree
Hide file tree
Showing 15 changed files with 521 additions and 4 deletions.
4 changes: 2 additions & 2 deletions .cirun.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@ runners:
- name: run-k8s-tests
# Cloud Provider: AWS
cloud: aws
# Instance Type has 4 vcpu, 16 GiB memory, Up to 5 Gbps Network Performance
instance_type: t3a.xlarge
# Instance Type has 8 vcpu, 32 GiB memory, Up to 5 Gbps Network Performance
instance_type: t3a.2xlarge
# Custom AMI with docker/cypress/hub pre-installed
machine_image: ami-0a388df278199ff52
# Region: Oregon
Expand Down
13 changes: 11 additions & 2 deletions .github/workflows/test_local_integration.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,6 @@ jobs:
sed -i -E 's/(cpu_guarantee):\s+[0-9\.]+/\1: 0.25/g' "nebari-config.yaml"
sed -i -E 's/(mem_guarantee):\s+[A-Za-z0-9\.]+/\1: 0.25G/g' "nebari-config.yaml"
# Change default JupyterLab theme
cat >> nebari-config.yaml <<- EOM
jupyterlab:
Expand All @@ -105,6 +104,16 @@ jobs:
theme: JupyterLab Dark
EOM
# Change default value for minio persistence size
cat >> nebari-config.yaml <<- EOM
monitoring:
enabled: true
overrides:
minio:
persistence:
size: 1Gi
EOM
cat nebari-config.yaml
- name: Deploy Nebari
Expand All @@ -115,7 +124,7 @@ jobs:
- name: Basic kubectl checks after deployment
if: always()
run: |
kubectl get all,cm,secret,ing -A
kubectl get all,cm,secret,pv,pvc,ing -A
- name: Check github-actions.nebari.dev resolves
run: |
Expand Down
2 changes: 2 additions & 0 deletions RELEASE.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ This file is copied to nebari-dev/nebari-docs using a GitHub Action. -->

## Upcoming Release

* Added Grafana Loki to aggregate, index and search logs

## Release 2024.1.1 - January 17, 2024

### Feature changes and enhancements
Expand Down
22 changes: 22 additions & 0 deletions src/_nebari/stages/kubernetes_services/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,8 +199,16 @@ class JHubApps(schema.Base):
enabled: bool = False


# Free-form Helm value overrides for the log-aggregation stack, one dict per chart.
# Every dict defaults to empty, i.e. no overrides.
class MonitoringOverrides(schema.Base):
# Overrides for the Grafana Loki chart
loki: typing.Dict = {}
# Overrides for the Promtail chart
promtail: typing.Dict = {}
# Overrides for the MinIO chart deployed alongside Loki
minio: typing.Dict = {}


# Settings read from the `monitoring` section of the Nebari configuration.
class Monitoring(schema.Base):
# Whether the monitoring stack is deployed; on by default
enabled: bool = True
# Per-chart Helm overrides (Loki, Promtail, MinIO); empty by default
overrides: MonitoringOverrides = MonitoringOverrides()
# Whether the MinIO release used by Loki is deployed; on by default
minio_enabled: bool = True


class JupyterLabPioneer(schema.Base):
Expand Down Expand Up @@ -381,6 +389,12 @@ class DaskGatewayInputVars(schema.Base):

# Monitoring-related input variables for the kubernetes_services stage.
# The hyphenated aliases match the variable names used on the Terraform side.
class MonitoringInputVars(schema.Base):
monitoring_enabled: bool = Field(alias="monitoring-enabled")
minio_enabled: bool = Field(alias="minio-enabled")
# Each override list holds JSON-encoded Helm values documents
grafana_loki_overrides: List[str] = Field(alias="grafana-loki-overrides")
grafana_promtail_overrides: List[str] = Field(alias="grafana-promtail-overrides")
grafana_loki_minio_overrides: List[str] = Field(
alias="grafana-loki-minio-overrides"
)


class TelemetryInputVars(schema.Base):
Expand Down Expand Up @@ -524,6 +538,14 @@ def input_vars(self, stage_outputs: Dict[str, Dict[str, Any]]):

monitoring_vars = MonitoringInputVars(
monitoring_enabled=self.config.monitoring.enabled,
minio_enabled=self.config.monitoring.minio_enabled,
grafana_loki_overrides=[json.dumps(self.config.monitoring.overrides.loki)],
grafana_promtail_overrides=[
json.dumps(self.config.monitoring.overrides.promtail)
],
grafana_loki_minio_overrides=[
json.dumps(self.config.monitoring.overrides.minio)
],
)

telemetry_vars = TelemetryInputVars(
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
# Generated secret for the MinIO instance used by Loki.
# Limited to 32 characters without special characters.
resource "random_password" "minio_root_password" {
  special = false
  length  = 32
}

locals {
  # URL of the MinIO service, built from its release name and port;
  # used as the S3 endpoint in the Loki configuration.
  minio-url = "http://${var.minio-release-name}:${var.minio-port}"

  # Single-entry node selector built from the node-group key/value pair.
  # A non-literal object key is wrapped in parentheses, and the value is
  # referenced directly instead of through an interpolation-only string.
  node-selector = {
    (var.node-group.key) = var.node-group.value
  }
}

# MinIO release providing the S3-compatible object store for Loki.
# Deployed only when var.minio-enabled is true.
resource "helm_release" "loki-minio" {
  count      = var.minio-enabled ? 1 : 0
  name       = var.minio-release-name
  namespace  = var.namespace
  repository = "https://raw.githubusercontent.com/bitnami/charts/defb094c658024e4aa8245622dab202874880cbc/bitnami"
  chart      = "minio"
  # last release that was Apache-2.0
  version = var.minio-helm-chart-version

  set {
    name  = "accessKey.password"
    value = "admin"
  }

  # The generated secret key goes through set_sensitive so the provider
  # treats it as a sensitive value rather than a plain chart setting.
  set_sensitive {
    name  = "secretKey.password"
    value = random_password.minio_root_password.result
  }

  # Buckets are passed to the chart as one space-separated string
  set {
    name  = "defaultBuckets"
    value = join(" ", var.buckets)
  }

  set {
    name  = "persistence.size"
    value = var.minio-storage
  }

  # Values are layered in order: static file, node selector, then user overrides
  values = concat([
    file("${path.module}/values_minio.yaml"),
    jsonencode({
      nodeSelector : local.node-selector
    })
  ], var.grafana-loki-minio-overrides)
}


# Grafana Loki release, pointed at the MinIO service as its S3 object store.
resource "helm_release" "grafana-loki" {
  name       = "nebari-loki"
  chart      = "loki"
  repository = "https://grafana.github.io/helm-charts"
  version    = var.loki-helm-chart-version
  namespace  = var.namespace

  # The MinIO release is created before Loki
  depends_on = [helm_release.loki-minio]

  # Values are layered in order: static file, generated settings, then user overrides
  values = concat([
    file("${path.module}/values_loki.yaml"),
    jsonencode({
      # Run every Loki component on the configured node group
      backend = { nodeSelector = local.node-selector }
      gateway = { nodeSelector = local.node-selector }
      read    = { nodeSelector = local.node-selector }
      write   = { nodeSelector = local.node-selector }

      loki = {
        storage = {
          s3 = {
            endpoint         = local.minio-url
            accessKeyId      = "admin"
            secretAccessKey  = random_password.minio_root_password.result
            s3ForcePathStyle = true
          }
        }
      }

      storageConfig = {
        # We configure MinIO by using the AWS config because MinIO implements the S3 API
        aws = {
          s3               = local.minio-url
          s3ForcePathStyle = true
        }
      }
    })
  ], var.grafana-loki-overrides)
}

# Promtail ships the contents of logs to the Loki instance.
resource "helm_release" "grafana-promtail" {
  name       = "nebari-promtail"
  chart      = "promtail"
  repository = "https://grafana.github.io/helm-charts"
  version    = var.promtail-helm-chart-version
  namespace  = var.namespace

  # The Loki release is created before Promtail
  depends_on = [helm_release.grafana-loki]

  # Values are layered in order: static file, an empty generated document,
  # then user overrides
  values = concat([
    file("${path.module}/values_promtail.yaml"),
    jsonencode({})
  ], var.grafana-promtail-overrides)
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# https://github.com/grafana/loki/blob/4cae003ecedd474e4c15feab4ea2ef435afff83f/production/helm/loki/values.yaml

loki:
storage:
type: s3
commonConfig:
replication_factor: 1
# Authentication is not required: Loki runs inside the cluster and is not
# exposed to the public network.
auth_enabled: false

# The Compactor deduplicates index entries and also applies granular retention.
compactor:
# Directory where marked chunks and temporary tables are saved.
working_directory: /var/loki/compactor/data/retention
# Shared object store (MinIO, accessed as S3).
shared_store: s3
# How often compaction happens.
compaction_interval: 1h
# Delete old logs once the retention delete delay has passed.
# Ideally we would use storage-based retention, but this is not currently
# implemented in Loki, which is why we use time-based retention.
retention_enabled: true
# Delay after which the Compactor deletes marked chunks.
retention_delete_delay: 1h
# Maximum number of goroutine workers instantiated to delete chunks.
retention_delete_worker_count: 150

limits_config:
# The minimum retention period is 24h.
# The value below is reasonable in most cases; anyone who wants to retain
# logs for longer can override it from nebari-config.yaml.
retention_period: 60d

schema_config:
configs:
# List of period_configs.
# `from` is the date of the first day that index buckets should be created.
- from: "2024-03-01"
index:
period: 24h
prefix: loki_index_
object_store: s3
schema: v11
store: boltdb-shipper
storage_config:
boltdb_shipper:
# Directory where ingesters write index files, which the shipper then
# uploads to the configured storage.
active_index_directory: /var/loki/compactor/data/index
# Cache location for restoring index files from storage for queries.
cache_location: /var/loki/compactor/data/boltdb-cache
# Shared store for keeping index files.
shared_store: s3

# Configuration for the write pod(s)
write:
# -- Number of replicas for the write
# To keep the cost of running Nebari down we default to a single replica;
# anyone who needs more can override this from nebari-config.yaml.
replicas: 1

read:
# -- Number of replicas for the read
replicas: 1

backend:
# -- Number of replicas for the backend
replicas: 1

minio:
# MinIO is deployed separately from the Bitnami chart, so the bundled one is disabled.
enabled: false

monitoring:
selfMonitoring:
grafanaAgent:
installOperator: false
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# https://github.com/bitnami/charts/blob/440ec159c26e4ff0748b9e9866b345d98220c40a/bitnami/minio/values.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# https://github.com/grafana/helm-charts/blob/3831194ba2abd2a0ca7a14ca00e578f8e9d2abc6/charts/promtail/values.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
# ---------------------------------------------------------------------------
# Placement
# ---------------------------------------------------------------------------

variable "namespace" {
  type        = string
  description = "deploy monitoring services on this namespace"
  default     = "dev"
}

# ---------------------------------------------------------------------------
# Helm chart versions
# ---------------------------------------------------------------------------

variable "loki-helm-chart-version" {
  type        = string
  description = "version to deploy for the loki helm chart"
  default     = "5.43.3"
}

variable "promtail-helm-chart-version" {
  type        = string
  description = "version to deploy for the promtail helm chart"
  default     = "6.15.5"
}

variable "minio-helm-chart-version" {
  type        = string
  description = "version to deploy for the minio helm chart"
  default     = "6.7.4"
}

# ---------------------------------------------------------------------------
# Helm value overrides (each list entry is a values document; none by default)
# ---------------------------------------------------------------------------

variable "grafana-loki-overrides" {
  type        = list(string)
  description = "Grafana Loki helm chart overrides"
  default     = []
}

variable "grafana-promtail-overrides" {
  type        = list(string)
  description = "Grafana Promtail helm chart overrides"
  default     = []
}

variable "grafana-loki-minio-overrides" {
  type        = list(string)
  description = "Grafana Loki minio helm chart overrides"
  default     = []
}

# ---------------------------------------------------------------------------
# MinIO settings
# ---------------------------------------------------------------------------

variable "minio-release-name" {
  type        = string
  description = "Grafana Loki minio release name"
  default     = "nebari-loki-minio"
}

variable "minio-port" {
  type        = number
  description = "Grafana Loki minio port"
  default     = 9000
}

variable "buckets" {
  type        = list(string)
  description = "Minio buckets"
  default     = ["chunks", "ruler", "admin", "loki"]
}

variable "minio-storage" {
  type        = string
  description = "Minio storage"
  default     = "50Gi"
}

variable "minio-enabled" {
  type        = bool
  description = "Deploy minio along with loki or not"
  default     = true
}

# ---------------------------------------------------------------------------
# Scheduling (no default: the caller must supply it)
# ---------------------------------------------------------------------------

variable "node-group" {
  type = object({
    key   = string
    value = string
  })
  description = "Node key value pair for bound resources"
}
Original file line number Diff line number Diff line change
@@ -1 +1,7 @@
# https://github.com/prometheus-community/helm-charts/blob/main/charts/kube-prometheus-stack/values.yaml

grafana:
additionalDataSources:
# Register Loki as an additional Grafana data source.
# NOTE(review): the ".dev" suffix in the URL below is hard-coded (presumably the
# namespace), while the grafana-loki module is deployed into var.environment.
# Confirm the data source still resolves when the environment is not "dev".
- name: Loki
type: loki
url: http://loki-gateway.dev
11 changes: 11 additions & 0 deletions src/_nebari/stages/kubernetes_services/template/monitoring.tf
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,14 @@ module "monitoring" {

node-group = var.node_groups.general
}

# Loki log-aggregation stack; instantiated only when monitoring is enabled.
module "grafana-loki" {
  source = "./modules/kubernetes/services/monitoring/loki"
  count  = var.monitoring-enabled ? 1 : 0

  namespace     = var.environment
  node-group    = var.node_groups.general
  minio-enabled = var.minio-enabled

  # Helm value overrides are forwarded unchanged
  grafana-loki-overrides       = var.grafana-loki-overrides
  grafana-promtail-overrides   = var.grafana-promtail-overrides
  grafana-loki-minio-overrides = var.grafana-loki-minio-overrides
}
Loading

0 comments on commit 0210a47

Please sign in to comment.