From f0d8cf044e3a94e63fbe75fb3d242d2e0e259dd6 Mon Sep 17 00:00:00 2001 From: Dustin Kaiser Date: Thu, 19 Sep 2024 16:49:57 +0200 Subject: [PATCH 01/19] wip --- charts/Makefile | 33 +++++++++++++++++++++------------ charts/README.md | 8 ++++++++ 2 files changed, 29 insertions(+), 12 deletions(-) diff --git a/charts/Makefile b/charts/Makefile index 8bf61fee..f33c5391 100644 --- a/charts/Makefile +++ b/charts/Makefile @@ -7,33 +7,32 @@ CONFIG_DIR := $(shell dirname $(REPO_CONFIG_LOCATION)) CHART_DIRS := $(wildcard $(REPO_BASE_DIR)/charts/*/) .PHONY: .check-helmfile-installed -.check-helmfile-installed: +.check-helmfile-installed: ## Checks if helmfile is installed @if ! command -v helmfile >/dev/null 2>&1; then \ echo "'helmfile' is not installed. Install it to continue ...";\ fi -helmfile.yaml: simcore-charts/helmfile.yaml +helmfile.yaml: simcore-charts/helmfile.yaml ## Copies the helmfile.yaml to the charts directory cp $(CONFIG_DIR)/$@ $(REPO_BASE_DIR)/charts/helmfile.yaml -simcore-charts/helmfile.yaml: +simcore-charts/helmfile.yaml: ## Copies the simcore helmfile to the charts directory cp $(CONFIG_DIR)/helmfile.simcore.yaml $(REPO_BASE_DIR)/charts/$@ .PHONY: helmfile-lint -helmfile-lint: .check-helmfile-installed helmfile.yaml +helmfile-lint: .check-helmfile-installed helmfile.yaml ## Lints the helmfile set -a; source $(REPO_CONFIG_LOCATION); set +a; \ helmfile lint .PHONY: .helmfile-local-post-install -.helmfile-local-post-install: +.helmfile-local-post-install: ## Post install steps for local helmfile deployment @$(MAKE) -s configure-local-hosts @echo ""; @echo "Cluster has been deployed locally: http://$(MACHINE_FQDN)"; @echo " For secure connections self-signed certificates are used."; - @echo " Install their root-ca certificate in your system for smooth experience."; - @echo " For insecure connections make sure to disable automatic https redirects in your browser."; + @echo " .PHONY: helmfile-apply -helmfile-apply: .check-helmfile-installed helmfile.yaml +helmfile-apply: .check-helmfile-installed helmfile.yaml ## Applies the helmfile configuration set -a; source $(REPO_CONFIG_LOCATION); set +a; \ helmfile -f $(REPO_BASE_DIR)/charts/helmfile.yaml apply @@ -41,17 +40,27 @@ helmfile-apply: .check-helmfile-installed helmfile.yaml $(MAKE) -s .helmfile-local-post-install; \ fi +.PHONY: helmfile-sync +helmfile-sync: .check-helmfile-installed helmfile.yaml ## Syncs the helmfile configuration + set -a; source $(REPO_CONFIG_LOCATION); set +a; \ + helmfile -f $(REPO_BASE_DIR)/charts/helmfile.yaml sync + + @if [ "$(MACHINE_FQDN)" = "osparc.local" ]; then \ + $(MAKE) -s .helmfile-local-post-install; \ + fi + + .PHONY: configure-local-hosts -configure-local-hosts: - @echo "Addings $(MACHINE_FQDN) hosts to /etc/hosts ..." +configure-local-hosts: ## Adds local hosts entries for the machine + @echo "Adding $(MACHINE_FQDN) hosts to /etc/hosts ..." @grep -q '127.0.0.1 k8s.monitoring.$(MACHINE_FQDN)' /etc/hosts || echo '127.0.0.1 k8s.monitoring.$(MACHINE_FQDN)' | sudo tee -a /etc/hosts .PHONY: helmfile-diff -helmfile-diff: .check-helmfile-installed helmfile.yaml +helmfile-diff: .check-helmfile-installed helmfile.yaml ## Shows the differences that would be applied by helmfile @set -a; source $(REPO_CONFIG_LOCATION); set +a; \ helmfile -f $(REPO_BASE_DIR)/charts/helmfile.yaml diff .PHONY: helmfile-delete -helmfile-delete: .check-helmfile-installed helmfile.yaml +helmfile-delete: .check-helmfile-installed helmfile.yaml ## Deletes the helmfile configuration @set -a; source $(REPO_CONFIG_LOCATION); set +a; \ helmfile -f $(REPO_BASE_DIR)/charts/helmfile.yaml delete diff --git a/charts/README.md b/charts/README.md index 13767e80..45d83630 100644 --- a/charts/README.md +++ b/charts/README.md @@ -23,6 +23,14 @@ source: https://kind.sigs.k8s.io/docs/user/quick-start Follow the instructions here: https://helm.sh/docs/intro/install/ +Install the helm-diff plugin: `helm plugin install https://github.com/databus23/helm-diff` + +`via https://doc.traefik.io/traefik/user-guides/crd-acme/#ingressroute-definition` +Install traefik-v3 CRDs: `kubectl apply -f https://raw.githubusercontent.com/traefik/traefik/v3.1/docs/content/reference/dynamic-configuration/kubernetes-crd-definition-v1.yml` + +`via https://doc.traefik.io/traefik/user-guides/crd-acme/#ingressroute-definition` +Install traefik-v3 RBAC: `kubectl apply -f https://raw.githubusercontent.com/traefik/traefik/v3.1/docs/content/reference/dynamic-configuration/kubernetes-crd-rbac.yml` + #### helmfile If you have a different OS / architecture, pick a different link from [release artifacts](https://github.com/helmfile/helmfile/releases) From 293f63c8c7971afeb0de64af16d01153bf76eed4 Mon Sep 17 00:00:00 2001 From: Dustin Kaiser Date: Thu, 24 Oct 2024 10:13:28 +0200 Subject: [PATCH 02/19] Add csi-s3 and have portainer use it --- .gitignore | 3 +++ charts/csi-s3/values.yaml.gotmpl | 7 +++++++ charts/portainer/values.yaml.gotmpl | 6 ++++++ 3 files changed, 16 insertions(+) create mode 100644 charts/csi-s3/values.yaml.gotmpl diff --git a/.gitignore b/.gitignore index 0c825bcd..24edb7f6 100644 --- a/.gitignore +++ b/.gitignore @@ -149,3 +149,6 @@ docker-compose.simcore.yml repo.config .temp .temp/** + +# By convention: `.secret` files are gitignored +**/*.secret diff --git a/charts/csi-s3/values.yaml.gotmpl b/charts/csi-s3/values.yaml.gotmpl new file mode 100644 index 00000000..7e6ff4c9 --- /dev/null +++ b/charts/csi-s3/values.yaml.gotmpl @@ -0,0 +1,7 @@ +secret: + accessKey: {{ requiredEnv "S3_ACCESS_KEY" }} + secretKey: {{ requiredEnv "S3_SECRET_KEY" }} + region: {{ requiredEnv "S3_REGION" }} + endpoint: {{ requiredEnv "S3_ENDPOINT" }} +storageClass: + singleBucket: {{ requiredEnv "S3_K8S_CSI_BUCKET_NAME" }} diff --git a/charts/portainer/values.yaml.gotmpl b/charts/portainer/values.yaml.gotmpl index e89f2457..edc56479 100644 --- a/charts/portainer/values.yaml.gotmpl +++ b/charts/portainer/values.yaml.gotmpl @@ -18,6 +18,12 @@ serviceAccount: # The name of the service account to use. # If not set and create is true, a name is generated using the fullname template name: portainer-sa-clusteradmin +persistence: + enabled: true + size: "10Gi" + annotations: {} + storageClass: "csi-s3" + existingClaim: podAnnotations: {} podLabels: {} From f7f72ec27e13232dababef209c92e2a3a1b983d4 Mon Sep 17 00:00:00 2001 From: Dustin Kaiser Date: Fri, 25 Oct 2024 08:37:18 +0200 Subject: [PATCH 03/19] Change request @hrytsuk 1GB max portainer volume size --- charts/portainer/values.yaml.gotmpl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charts/portainer/values.yaml.gotmpl b/charts/portainer/values.yaml.gotmpl index edc56479..1f5f5c44 100644 --- a/charts/portainer/values.yaml.gotmpl +++ b/charts/portainer/values.yaml.gotmpl @@ -20,7 +20,7 @@ serviceAccount: name: portainer-sa-clusteradmin persistence: enabled: true - size: "10Gi" + size: "1Gi" annotations: {} storageClass: "csi-s3" existingClaim: From c9c70d642e632e920a25cc76395cc56f1be8f134 Mon Sep 17 00:00:00 2001 From: Dustin Kaiser Date: Tue, 3 Dec 2024 16:08:00 +0100 Subject: [PATCH 04/19] Arch Linux Certificates Customization --- certificates/Makefile | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/certificates/Makefile b/certificates/Makefile index a9326900..9af700df 100644 --- a/certificates/Makefile +++ b/certificates/Makefile @@ -66,16 +66,10 @@ install-root-certificate: rootca.crt ## installs a certificate in the host syste echo "Is the DOCKER service ready? press when ready" && read -n 1; \ fi;\ echo "======================================";,\ - $(if $(IS_OSX), \ - sudo security add-trusted-cert -d -k /Library/Keychains/System.keychain $<; \ - echo "Please restart the DOCKER service now..." && read -n 1; \ - echo "Is the DOCKER service ready? press when ready" && read -n 1; \ - , \ - sudo cp $< /usr/local/share/ca-certificates/osparc.crt; \ - sudo update-ca-certificates -f; \ - echo "# restarting docker daemon"; \ + sudo cp $< /etc/ca-certificates/trust-source/anchors/osparc.crt; \ + sudo trust extract-compat && \ + echo "# restarting docker daemon" && \ sudo systemctl restart docker \ - ) \ ) From 48fbbca2ea90fefe6f23eac88d066e49816ec0fc Mon Sep 17 00:00:00 2001 From: Dustin Kaiser Date: Thu, 24 Apr 2025 17:41:41 +0200 Subject: [PATCH 05/19] Fix pgsql exporter failure --- services/monitoring/pgsql_query_exporter_config.yaml.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/monitoring/pgsql_query_exporter_config.yaml.j2 b/services/monitoring/pgsql_query_exporter_config.yaml.j2 index c082d31f..5b5dabff 100644 --- a/services/monitoring/pgsql_query_exporter_config.yaml.j2 +++ b/services/monitoring/pgsql_query_exporter_config.yaml.j2 @@ -58,7 +58,7 @@ queries:{% for _gid in MONITORING_PROMETHEUS_PGSQL_GID_MONITORED.split(",") if _ query_total_number_of_dollars_paid_successfully: interval: 55 databases: [postgres] - metrics: [osparc_total_number_of_dollars_paid] + metrics: [osparc_total_number_of_dollars_paid_successfully] sql: | SELECT SUM(price_dollars) as osparc_total_number_of_dollars_paid_successfully FROM payments_transactions WHERE state = 'SUCCESS'; From 5ecbfec728aaadac483d29ffa0503bf775a523f5 Mon Sep 17 00:00:00 2001 From: YH <50014626+YuryHrytsuk@users.noreply.github.com> Date: Tue, 6 May 2025 14:57:03 +0200 Subject: [PATCH 06/19] [Kubernetes] Introduce on-prem persistent Storage (Longhorn) :tada: (#979) * Introduce longhorn chart * Further longhorn configuration * Longhorn: further settings configuration * Fix longhorn configuration bugs Extra: introduce longhorn pv vales for portainer * Add comment for deletion longhorn * Further longhorn configuration * Add README.md for Longhorn wit FAQ * Update Longhorn readme * Update readme * Futher LH configuration * Update LH's Readme * Update Longhorn Readme * Improve LH's Readme * LH: Reduce reserved default disk space to 5% Since we use a dedicated disk for LH, we can go ahead with 5% * Use values to set Longhorn storage class * Update LH's Readme * LH Readme: add requirements reference * PR Review: bring back portainer s3 pv * LH: decrease portinaer volume size --- charts/Makefile | 1 - charts/longhorn/README.md | 50 ++++++++++++++ charts/longhorn/values.yaml.gotmpl | 68 +++++++++++++++++++ .../portainer/values.longhorn-pv.yaml.gotmpl | 4 ++ charts/traefik/values.insecure.yaml.gotmpl | 14 ++++ charts/traefik/values.secure.yaml.gotmpl | 12 ++++ 6 files changed, 148 insertions(+), 1 deletion(-) create mode 100644 charts/longhorn/README.md create mode 100644 charts/longhorn/values.yaml.gotmpl create mode 100644 charts/portainer/values.longhorn-pv.yaml.gotmpl diff --git a/charts/Makefile b/charts/Makefile index 71ee46e9..20051d86 100644 --- a/charts/Makefile +++ b/charts/Makefile @@ -49,7 +49,6 @@ helmfile-sync: .check-helmfile-installed helmfile.yaml ## Syncs the helmfile con $(MAKE) -s .helmfile-local-post-install; \ fi - .PHONY: configure-local-hosts configure-local-hosts: ## Adds local hosts entries for the machine @echo "Adding $(MACHINE_FQDN) hosts to /etc/hosts ..." diff --git a/charts/longhorn/README.md b/charts/longhorn/README.md new file mode 100644 index 00000000..73508294 --- /dev/null +++ b/charts/longhorn/README.md @@ -0,0 +1,50 @@ +# Longhorn (LH) Knowledge Base + +### Can LH be used for critical services (e.g., Databases)? + +No (as of now). , we should not use it for volumes of critical services. + +As of now, we should avoid using LH for critical services. Instead, we should rely on easier-to-maintain solutions (e.g., application-level replication [Postgres Operators], S3, etc.). Once we get hands-on experience, extensive monitoring and ability to scale LH, we can consider using it for critical services. + +LH uses networking to keep replicas in sync, and IO-heavy workloads may easily overload it, leading to unpredictable consequences. Until we can extensively monitor LH and scale it properly on demand, it should not be used for critical or IO-heavy services. + +### How does LH decide which node's disk to use as storage? + +It depends on the configuration. There are three possibilities: +* https://longhorn.io/kb/tip-only-use-storage-on-a-set-of-nodes/ + +When using the `Create Default Disk on Labeled Nodes` option, it relies on the `node.longhorn.io/create-default-disk` Kubernetes node label. + +Source: https://longhorn.io/docs/1.8.1/nodes-and-volumes/nodes/default-disk-and-node-config/#customizing-default-disks-for-new-nodes + +### Will LH pick up storage from a newly added node? + +By default, LH will use storage on all nodes (including newly created ones) where it runs. If `createDefaultDiskLabeledNodes` is configured, it will depend on the label of the node. + +Source: +* https://longhorn.io/kb/tip-only-use-storage-on-a-set-of-nodes/ +* https://longhorn.io/docs/1.8.1/nodes-and-volumes/nodes/default-disk-and-node-config/#customizing-default-disks-for-new-nodes + +### Can workloads be run on nodes where LH is not installed? + +Workloads can run on nodes without LH as long as LH is not restricted to specific nodes via the `nodeSelector` or `systemManagedComponentsNodeSelector` settings. If LH is configured to run on specific nodes, workloads can only run on those nodes. + +Note: There is an [ongoing bug](https://github.com/longhorn/longhorn/discussions/7312#discussioncomment-13030581) where LH will raise warnings when workloads run on nodes without LH. However, it will still function correctly. + +Source: https://longhorn.io/kb/tip-only-use-storage-on-a-set-of-nodes/ + +### Adding new volumes to (PVs that rely on) LH + +Monitor carefully whether LH is capable of handling new volumes. Test the new volume under load (when many read/write operations occur) and ensure LH does not fail due to insufficient resource capacities (e.g., network or CPU). You can also consider LH's performance section from this Readme. + +LH's minimum recommended resource requirements: +* https://longhorn.io/docs/1.8.1/best-practices/#minimum-recommended-hardware + +### LH's performance / resources + +Insights into LH's performance: +* https://longhorn.io/blog/performance-scalability-report-aug-2020/ +* https://github.com/longhorn/longhorn/wiki/Performance-Benchmark + +Resource requirements: +* https://github.com/longhorn/longhorn/issues/1691 diff --git a/charts/longhorn/values.yaml.gotmpl b/charts/longhorn/values.yaml.gotmpl new file mode 100644 index 00000000..edcd0ac1 --- /dev/null +++ b/charts/longhorn/values.yaml.gotmpl @@ -0,0 +1,68 @@ +# Values documentation: +# https://github.com/longhorn/longhorn/tree/v1.8.1/chart#values + +global: + # Warning: updating node selectors (after installation) will cause downtime + # https://longhorn.io/docs/archives/1.2.2/advanced-resources/deploy/node-selector/#setting-up-node-selector-after-longhorn-has-been-installed + # + # Warning: using node selectors will restrict our workloads to the same nodes + # https://longhorn.io/kb/tip-only-use-storage-on-a-set-of-nodes/#deploy-longhorn-components-only-on-a-specific-set-of-nodes + nodeSelector: {} + systemManagedComponentsNodeSelector: {} + +defaultSettings: + replicaAutoBalance: best-effort + + # control on which nodes LH will use disks + # use `node.longhorn.io/create-default-disk` node label for control + createDefaultDiskLabeledNodes: true + # use dedicated folder (disk) for storage + defaultDataPath: /longhorn + + # https://longhorn.io/docs/1.8.1/best-practices/#minimal-available-storage-and-over-provisioning + storageMinimalAvailablePercentage: 10 + + # Prevent LH deletion. Set to true if you want to delete LH + deletingConfirmationFlag: false + + # let replicas to be scheduled on the same node + replicaSoftAntiAffinity: false + + # we always use dedicated disks. 5% is a good value + storageReservedPercentageForDefaultDisk: 5 + +persistence: + # use only for non-critical ops workloads + # for critical workloads (e.g. database) + # use application replication (e.g. postgres HA operator) + defaultClass: false + + # https://longhorn.io/docs/1.8.1/best-practices/#io-performance + defaultDataLocality: best-effort + defaultClassReplicaCount: 2 + + # minimum volume size is 300Mi + # https://github.com/longhorn/longhorn/issues/8488 + defaultFsType: xfs + +resources: # https://longhorn.io/docs/1.8.1/best-practices/#minimum-recommended-hardware + requests: + cpu: 0.5 + memory: 128Mi + limits: + cpu: 4 + memory: 4Gi + +ingress: + enabled: true + className: "" + annotations: + namespace: {{ .Release.Namespace }} + cert-manager.io/cluster-issuer: "cert-issuer" + traefik.ingress.kubernetes.io/router.entrypoints: websecure + traefik.ingress.kubernetes.io/router.middlewares: traefik-traefik-basic-auth@kubernetescrd,traefik-longhorn-strip-prefix@kubernetescrd # namespace + middleware name + tls: true + tlsSecret: monitoring-tls + host: {{ requiredEnv "K8S_MONITORING_FQDN" }} + path: /longhorn + pathType: Prefix diff --git a/charts/portainer/values.longhorn-pv.yaml.gotmpl b/charts/portainer/values.longhorn-pv.yaml.gotmpl new file mode 100644 index 00000000..755420c0 --- /dev/null +++ b/charts/portainer/values.longhorn-pv.yaml.gotmpl @@ -0,0 +1,4 @@ +persistence: + enabled: true + size: "300Mi" # cannot be lower https://github.com/longhorn/longhorn/issues/8488 + storageClass: "{{.Values.longhornStorageClassName}}" diff --git a/charts/traefik/values.insecure.yaml.gotmpl b/charts/traefik/values.insecure.yaml.gotmpl index 2b60aae2..0c5dc75d 100644 --- a/charts/traefik/values.insecure.yaml.gotmpl +++ b/charts/traefik/values.insecure.yaml.gotmpl @@ -14,6 +14,7 @@ extraObjects: name: traefik targetPort: 9000 protocol: TCP + - apiVersion: v1 kind: Secret metadata: @@ -22,6 +23,7 @@ extraObjects: data: users: |2 {{ requiredEnv "TRAEFIK_K8S_AUTHORIZED_USER" }} + - apiVersion: traefik.io/v1alpha1 kind: Middleware metadata: @@ -29,6 +31,7 @@ extraObjects: spec: basicAuth: secret: traefik-authorized-users # https://doc.traefik.io/traefik/middlewares/http/basicauth/#users + - apiVersion: traefik.io/v1alpha1 kind: Middleware metadata: @@ -38,6 +41,17 @@ extraObjects: stripPrefix: prefixes: - /portainer + +- apiVersion: traefik.io/v1alpha1 + kind: Middleware + metadata: + name: longhorn-strip-prefix + namespace: {{.Release.Namespace}} + spec: + stripPrefix: + prefixes: + - /longhorn + - apiVersion: networking.k8s.io/v1 kind: Ingress metadata: diff --git a/charts/traefik/values.secure.yaml.gotmpl b/charts/traefik/values.secure.yaml.gotmpl index 55cfb1ed..8abb2e7e 100644 --- a/charts/traefik/values.secure.yaml.gotmpl +++ b/charts/traefik/values.secure.yaml.gotmpl @@ -39,6 +39,7 @@ extraObjects: spec: basicAuth: secret: traefik-authorized-users # https://doc.traefik.io/traefik/middlewares/http/basicauth/#users + - apiVersion: traefik.io/v1alpha1 kind: Middleware metadata: @@ -48,6 +49,17 @@ extraObjects: stripPrefix: prefixes: - /portainer + +- apiVersion: traefik.io/v1alpha1 + kind: Middleware + metadata: + name: longhorn-strip-prefix + namespace: {{.Release.Namespace}} + spec: + stripPrefix: + prefixes: + - /longhorn + - apiVersion: traefik.io/v1alpha1 kind: Middleware metadata: From 3ea41b58043b267fe9ffe4eec57aeeef82f383e5 Mon Sep 17 00:00:00 2001 From: Dustin Kaiser <8209087+mrnicegyu11@users.noreply.github.com> Date: Fri, 9 May 2025 12:03:41 +0200 Subject: [PATCH 07/19] Experimental: Try to add tracing to simcore-traefik on master --- services/simcore/docker-compose.deploy.master.yml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/services/simcore/docker-compose.deploy.master.yml b/services/simcore/docker-compose.deploy.master.yml index 792d9ca6..40a5114f 100644 --- a/services/simcore/docker-compose.deploy.master.yml +++ b/services/simcore/docker-compose.deploy.master.yml @@ -48,10 +48,11 @@ services: - "--providers.swarm.refreshSeconds=1" - "--providers.swarm.exposedByDefault=false" - "--providers.swarm.constraints=Label(`io.simcore.zone`, `${TRAEFIK_SIMCORE_ZONE}`)" - - "--tracing=true" - - "--tracing.addinternals" - - "--tracing.otlp=true" - - "--tracing.otlp.http=true" + #- "--tracing=true" + #- "--tracing.addinternals" + #- "--tracing.otlp=true" + #- "--tracing.otlp.http=true" + - "--tracing.otlp.http.endpoint=https://otel-collector:4318/v1/traces" deploy: resources: limits: From 29f2f2e6f40da3b3dd2677a922b31ffd1d0f3cd7 Mon Sep 17 00:00:00 2001 From: Dustin Kaiser Date: Wed, 14 May 2025 15:52:19 +0200 Subject: [PATCH 08/19] Fixes https://github.com/ITISFoundation/osparc-simcore/issues/7363 --- .../contentpacks/osparc-custom-content-pack-v2.json | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/services/graylog/data/contentpacks/osparc-custom-content-pack-v2.json b/services/graylog/data/contentpacks/osparc-custom-content-pack-v2.json index 59574fba..410d2d6d 100644 --- a/services/graylog/data/contentpacks/osparc-custom-content-pack-v2.json +++ b/services/graylog/data/contentpacks/osparc-custom-content-pack-v2.json @@ -1,9 +1,9 @@ { "v": "1", - "id": "dfaeea11-bde6-4203-9cfe-6ca2a23ca22e", - "rev": 44, - "name": "osparc-custom-content-pack-v2", - "summary": "osparc-custom-content-pack-v2", + "id": "daaeea11-bde6-4203-9cfe-6ca2a23ca22e", + "rev": 1, + "name": "osparc-custom-content-pack-v3", + "summary": "osparc-custom-content-pack-v3", "description": "", "vendor": "Osparc team", "url": "", @@ -623,7 +623,7 @@ "configuration": { "grok_pattern": { "@type": "string", - "@value": "log_level=%{WORD:log_level} \\| log_timestamp=%{TIMESTAMP_ISO8601:log_timestamp} \\| log_source=%{NOTSPACE:log_source} \\| log_uid=%{NOTSPACE:log_uid} \\| log_oec=%{NOTSPACE:log_oec}\\| log_trace_id=%{NOTSPACE:log_trace_id} \\| (log_span_id=%{DATA:log_span_id}\\|)? log_msg=%{GREEDYDATA:log_msg}" + "@value": "log_level=%{WORD:log_level} \\| log_timestamp=%{TIMESTAMP_ISO8601:log_timestamp} \\| log_source=%{DATA:log_source} \\| (log_uid=%{WORD:log_uid} \\| (log_oec=%{WORD:log_oec} \\| )?log_msg=%{GREEDYDATA:log_msg}" }, "named_captures_only": { "@type": "boolean", From b856eb0138df55417ad2fac1dc21b824df59cec2 Mon Sep 17 00:00:00 2001 From: Dustin Kaiser <8209087+mrnicegyu11@users.noreply.github.com> Date: Wed, 30 Jul 2025 10:29:20 +0200 Subject: [PATCH 09/19] Arch Linux Certificates Customization - 2 --- certificates/Makefile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/certificates/Makefile b/certificates/Makefile index 9af700df..019fa89c 100644 --- a/certificates/Makefile +++ b/certificates/Makefile @@ -84,7 +84,8 @@ remove-root-certificate: ## removes the certificate from the host system $(if $(IS_OSX), \ sudo security remove-trusted-cert -d rootca.crt; \ , \ - sudo rm -f /usr/local/share/ca-certificates/osparc.crt; \ - sudo update-ca-certificates -f; \ + sudo rm -f /etc/ca-certificates/trust-source/anchors/osparc.crt; \ + sudo trust extract-compat; \ + sudo systemctl restart docker; \ ) \ ) From 54480870e2c2b31c1317aaa0896ebaec13a6fc9d Mon Sep 17 00:00:00 2001 From: Dustin Kaiser <8209087+mrnicegyu11@users.noreply.github.com> Date: Thu, 25 Sep 2025 08:36:14 +0200 Subject: [PATCH 10/19] Revert: disable loki & vector-dev, oldschool graylog logging (#1223) * wip * Add csi-s3 and have portainer use it * Change request @hrytsuk 1GB max portainer volume size * Arch Linux Certificates Customization * Fix pgsql exporter failure * [Kubernetes] Introduce on-prem persistent Storage (Longhorn) :tada: (#979) * Introduce longhorn chart * Further longhorn configuration * Longhorn: further settings configuration * Fix longhorn configuration bugs Extra: introduce longhorn pv vales for portainer * Add comment for deletion longhorn * Further longhorn configuration * Add README.md for Longhorn wit FAQ * Update Longhorn readme * Update readme * Futher LH configuration * Update LH's Readme * Update Longhorn Readme * Improve LH's Readme * LH: Reduce reserved default disk space to 5% Since we use a dedicated disk for LH, we can go ahead with 5% * Use values to set Longhorn storage class * Update LH's Readme * LH Readme: add requirements reference * PR Review: bring back portainer s3 pv * LH: decrease portinaer volume size * Experimental: Try to add tracing to simcore-traefik on master * Fixes https://github.com/ITISFoundation/osparc-simcore/issues/7363 * Arch Linux Certificates Customization - 2 * Send docker logs directly to graylog * revert arch linux customization --------- Co-authored-by: Dustin Kaiser Co-authored-by: YH <50014626+YuryHrytsuk@users.noreply.github.com> --- services/logging/docker-compose.yml.j2 | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/services/logging/docker-compose.yml.j2 b/services/logging/docker-compose.yml.j2 index be2aa1a9..0fa34c0e 100644 --- a/services/logging/docker-compose.yml.j2 +++ b/services/logging/docker-compose.yml.j2 @@ -75,7 +75,7 @@ services: aliases: - graylog ports: - - 12200:12201/udp + - 12201:12201/udp - 12202:12202/udp deploy: replicas: 1 @@ -102,7 +102,7 @@ services: vector: image: timberio/vector:0.49.X-debian ports: - - "12201:12201/udp" # GELF input + - "12200:12201/udp" # GELF input volumes: - /var/run/docker.sock:/var/run/docker.sock:ro environment: @@ -112,6 +112,7 @@ services: - source: vector_config target: /etc/vector/vector.yaml deploy: + replicas: 0 resources: limits: cpus: "1.0" @@ -140,7 +141,7 @@ services: deploy: placement: constraints: [] - replicas: 1 + replicas: 0 restart_policy: condition: any delay: 5s From 534f6f4638a44ad9dbf29619d95745a742f78671 Mon Sep 17 00:00:00 2001 From: Dustin Kaiser <8209087+mrnicegyu11@users.noreply.github.com> Date: Thu, 25 Sep 2025 16:57:29 +0200 Subject: [PATCH 11/19] Enable Chatbot for S4L products (#1221) * wip * Add csi-s3 and have portainer use it * Change request @hrytsuk 1GB max portainer volume size * Arch Linux Certificates Customization * Fix pgsql exporter failure * [Kubernetes] Introduce on-prem persistent Storage (Longhorn) :tada: (#979) * Introduce longhorn chart * Further longhorn configuration * Longhorn: further settings configuration * Fix longhorn configuration bugs Extra: introduce longhorn pv vales for portainer * Add comment for deletion longhorn * Further longhorn configuration * Add README.md for Longhorn wit FAQ * Update Longhorn readme * Update readme * Futher LH configuration * Update LH's Readme * Update Longhorn Readme * Improve LH's Readme * LH: Reduce reserved default disk space to 5% Since we use a dedicated disk for LH, we can go ahead with 5% * Use values to set Longhorn storage class * Update LH's Readme * LH Readme: add requirements reference * PR Review: bring back portainer s3 pv * LH: decrease portinaer volume size * Experimental: Try to add tracing to simcore-traefik on master * Fixes https://github.com/ITISFoundation/osparc-simcore/issues/7363 * Arch Linux Certificates Customization - 2 * Remove frontend vendor chatbot service * wip --------- Co-authored-by: Dustin Kaiser Co-authored-by: YH <50014626+YuryHrytsuk@users.noreply.github.com> --- services/vendors/docker-compose.yml.j2 | 35 +------------------------- services/vendors/template.env | 7 ++---- 2 files changed, 3 insertions(+), 39 deletions(-) diff --git a/services/vendors/docker-compose.yml.j2 b/services/vendors/docker-compose.yml.j2 index c1a49ade..7a8149c2 100644 --- a/services/vendors/docker-compose.yml.j2 +++ b/services/vendors/docker-compose.yml.j2 @@ -63,40 +63,7 @@ services: - traefik.http.services.vendor_chat_backend.loadbalancer.server.port=${VENDOR_CHATBOT_BACKEND_PORT} - traefik.http.routers.vendor_chat_backend.entrypoints=https - traefik.http.routers.vendor_chat_backend.tls=true - - traefik.http.routers.vendor_chat_backend.rule=(PathPrefix(`/v1/`) && ({{ generate_vendors_traefik_rule(VENDOR_CHATBOT_FRONTEND_PRODUCTS, VENDOR_CHATBOT_FRONTEND_SUBDOMAIN_PREFIX) }})) - networks: - - public - chat-frontend: - image: ${VENDOR_CHATBOT_FRONTEND_IMAGE} - init: true -{%- raw %} - hostname: "v-chat-frontend-{{.Node.Hostname}}-{{.Task.Slot}}" -{%- endraw %} - deploy: - replicas: ${VENDOR_CHATBOT_FRONTEND_REPLICAS} - placement: - constraints: - - node.labels.simcore==true - resources: - limits: - cpus: "1.0" - memory: 2.5G - reservations: - cpus: "0.1" - memory: 512M - update_config: - parallelism: 1 - order: start-first - failure_action: continue - delay: 10s - labels: - - traefik.enable=true - - traefik.swarm.network=${PUBLIC_NETWORK} - - traefik.http.services.vendor_chat_frontend.loadbalancer.server.port=${VENDOR_CHATBOT_FRONTEND_PORT} - - traefik.http.routers.vendor_chat_frontend.entrypoints=https - - traefik.http.routers.vendor_chat_frontend.tls=true - - traefik.http.routers.vendor_chat_frontend.rule=(!PathPrefix(`/v1/`) && ({{ generate_vendors_traefik_rule(VENDOR_CHATBOT_FRONTEND_PRODUCTS, VENDOR_CHATBOT_FRONTEND_SUBDOMAIN_PREFIX) }})) - - traefik.http.routers.vendor_chat_frontend.middlewares=authenticated_platform_user@swarm + - traefik.http.routers.vendor_chat_backend.rule=(PathPrefix(`/v1/`) && ({{ generate_vendors_traefik_rule(VENDOR_CHATBOT_PRODUCTS, VENDOR_CHATBOT_SUBDOMAIN_PREFIX) }})) networks: - public networks: diff --git a/services/vendors/template.env b/services/vendors/template.env index 7fbe5a61..23a1d554 100644 --- a/services/vendors/template.env +++ b/services/vendors/template.env @@ -6,10 +6,7 @@ VENDOR_MANUAL_PORT=${VENDOR_MANUAL_PORT} VENDOR_CHATBOT_BACKEND_IMAGE=${VENDOR_CHATBOT_BACKEND_IMAGE} VENDOR_CHATBOT_BACKEND_PORT=${VENDOR_CHATBOT_BACKEND_PORT} VENDOR_CHATBOT_BACKEND_REPLICAS=${VENDOR_CHATBOT_BACKEND_REPLICAS} -VENDOR_CHATBOT_FRONTEND_IMAGE=${VENDOR_CHATBOT_FRONTEND_IMAGE} -VENDOR_CHATBOT_FRONTEND_PORT=${VENDOR_CHATBOT_FRONTEND_PORT} -VENDOR_CHATBOT_FRONTEND_PRODUCTS=${VENDOR_CHATBOT_FRONTEND_PRODUCTS} -VENDOR_CHATBOT_FRONTEND_REPLICAS=${VENDOR_CHATBOT_FRONTEND_REPLICAS} -VENDOR_CHATBOT_FRONTEND_SUBDOMAIN_PREFIX=${VENDOR_CHATBOT_FRONTEND_SUBDOMAIN_PREFIX} +VENDOR_CHATBOT_PRODUCTS=${VENDOR_CHATBOT_PRODUCTS} +VENDOR_CHATBOT_SUBDOMAIN_PREFIX=${VENDOR_CHATBOT_SUBDOMAIN_PREFIX} PUBLIC_NETWORK=${PUBLIC_NETWORK} OPENAI_API_KEY=${OPENAI_API_KEY} From 1e15c940b3789aa8c9a6d4f47d7a3d48702a7398 Mon Sep 17 00:00:00 2001 From: YH <50014626+YuryHrytsuk@users.noreply.github.com> Date: Thu, 2 Oct 2025 10:25:58 +0200 Subject: [PATCH 12/19] Kubernetes: fix global network policy (#1227) --- .../templates/globalpolicy.yaml | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/charts/calico-configuration/templates/globalpolicy.yaml b/charts/calico-configuration/templates/globalpolicy.yaml index cb579df2..30d787f0 100644 --- a/charts/calico-configuration/templates/globalpolicy.yaml +++ b/charts/calico-configuration/templates/globalpolicy.yaml @@ -26,10 +26,11 @@ spec: # IP from https://github.com/kubernetes-sigs/kubespray/blob/v2.24.1/roles/kubespray-defaults/defaults/main/main.yml#L108 - action: Allow protocol: UDP - nets: - - 169.254.25.10/32 - ports: - - 53 + destination: + nets: + - 169.254.25.10/32 + ports: + - 53 - action: Allow protocol: TCP destination: @@ -38,7 +39,8 @@ spec: - 53 - action: Allow protocol: TCP - nets: - - 169.254.25.10/32 - ports: - - 53 + destination: + nets: + - 169.254.25.10/32 + ports: + - 53 From 6571bb821c0b422a65fca8fdaa46e5fa5153616b Mon Sep 17 00:00:00 2001 From: Dustin Kaiser Date: Fri, 3 Oct 2025 14:57:29 +0200 Subject: [PATCH 13/19] Add authentication middleware to cahtbot vendor service --- services/vendors/docker-compose.yml.j2 | 1 + 1 file changed, 1 insertion(+) diff --git a/services/vendors/docker-compose.yml.j2 b/services/vendors/docker-compose.yml.j2 index 7a8149c2..0b139cea 100644 --- a/services/vendors/docker-compose.yml.j2 +++ b/services/vendors/docker-compose.yml.j2 @@ -64,6 +64,7 @@ services: - traefik.http.routers.vendor_chat_backend.entrypoints=https - traefik.http.routers.vendor_chat_backend.tls=true - traefik.http.routers.vendor_chat_backend.rule=(PathPrefix(`/v1/`) && ({{ generate_vendors_traefik_rule(VENDOR_CHATBOT_PRODUCTS, VENDOR_CHATBOT_SUBDOMAIN_PREFIX) }})) + - traefik.http.routers.vendor_chat_backend.middlewares=authenticated_platform_user@swarm networks: - public networks: From c05f58c460d13427b6ea7d79c473dd8e4ce5cdaf Mon Sep 17 00:00:00 2001 From: Dustin Kaiser Date: Mon, 6 Oct 2025 17:21:47 +0200 Subject: [PATCH 14/19] Revert "Kubernetes: fix global network policy (#1227)" This reverts commit 2d3adb13172cbc577d92ad499d80ff8f0948e267. --- .../templates/globalpolicy.yaml | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/charts/calico-configuration/templates/globalpolicy.yaml b/charts/calico-configuration/templates/globalpolicy.yaml index 30d787f0..cb579df2 100644 --- a/charts/calico-configuration/templates/globalpolicy.yaml +++ b/charts/calico-configuration/templates/globalpolicy.yaml @@ -26,11 +26,10 @@ spec: # IP from https://github.com/kubernetes-sigs/kubespray/blob/v2.24.1/roles/kubespray-defaults/defaults/main/main.yml#L108 - action: Allow protocol: UDP - destination: - nets: - - 169.254.25.10/32 - ports: - - 53 + nets: + - 169.254.25.10/32 + ports: + - 53 - action: Allow protocol: TCP destination: @@ -39,8 +38,7 @@ spec: - 53 - action: Allow protocol: TCP - destination: - nets: - - 169.254.25.10/32 - ports: - - 53 + nets: + - 169.254.25.10/32 + ports: + - 53 From 9a8113b3f6b1165e1e7fa702ed654c6122316b29 Mon Sep 17 00:00:00 2001 From: Dustin Kaiser <8209087+mrnicegyu11@users.noreply.github.com> Date: Tue, 7 Oct 2025 16:02:30 +0200 Subject: [PATCH 15/19] Add ACME DNS Resolver for gitlabCD and k8s (#1217) * wip * Add csi-s3 and have portainer use it * Change request @hrytsuk 1GB max portainer volume size * Arch Linux Certificates Customization * Fix pgsql exporter failure * [Kubernetes] Introduce on-prem persistent Storage (Longhorn) :tada: (#979) * Introduce longhorn chart * Further longhorn configuration * Longhorn: further settings configuration * Fix longhorn configuration bugs Extra: introduce longhorn pv vales for portainer * Add comment for deletion longhorn * Further longhorn configuration * Add README.md for Longhorn wit FAQ * Update Longhorn readme * Update readme * Futher LH configuration * Update LH's Readme * Update Longhorn Readme * Improve LH's Readme * LH: Reduce reserved default disk space to 5% Since we use a dedicated disk for LH, we can go ahead with 5% * Use values to set Longhorn storage class * Update LH's Readme * LH Readme: add requirements reference * PR Review: bring back portainer s3 pv * LH: decrease portinaer volume size * Experimental: Try to add tracing to simcore-traefik on master * Fixes https://github.com/ITISFoundation/osparc-simcore/issues/7363 * Arch Linux Certificates Customization - 2 * wip * wip * this might work * k8s wip * wip * wip --------- Co-authored-by: Dustin Kaiser Co-authored-by: YH <50014626+YuryHrytsuk@users.noreply.github.com> --- charts/Makefile | 2 +- .../cert-manager/values.acme-dns.yaml.gotmpl | 40 +++++++++++++++++++ charts/cert-manager/values.common.yaml.gotmpl | 5 +++ .../cert-manager/values.rfc2136.yaml.gotmpl | 36 ----------------- 4 files changed, 46 insertions(+), 37 deletions(-) create mode 100644 charts/cert-manager/values.acme-dns.yaml.gotmpl delete mode 100644 charts/cert-manager/values.rfc2136.yaml.gotmpl diff --git a/charts/Makefile b/charts/Makefile index 39844abb..40c6945d 100644 --- a/charts/Makefile +++ b/charts/Makefile @@ -4,7 +4,7 @@ REPO_BASE_DIR := $(shell git rev-parse --show-toplevel) include ${REPO_BASE_DIR}/scripts/common.Makefile include $(REPO_CONFIG_LOCATION) -CONFIG_DIR := $(shell dirname $(REPO_CONFIG_LOCATION)) +export CONFIG_DIR := $(shell dirname $(REPO_CONFIG_LOCATION)) CHART_DIRS := $(wildcard $(REPO_BASE_DIR)/charts/*/) .PHONY: .check-helmfile-installed diff --git a/charts/cert-manager/values.acme-dns.yaml.gotmpl b/charts/cert-manager/values.acme-dns.yaml.gotmpl new file mode 100644 index 00000000..5fcd0d60 --- /dev/null +++ b/charts/cert-manager/values.acme-dns.yaml.gotmpl @@ -0,0 +1,40 @@ +cert-manager: + extraArgs: + - --dns01-recursive-nameservers="8.8.8.8:53" + - --dns01-recursive-nameservers-only + startupapicheck: + enabled: false + skipDNSResolutionCheck: true + maxConcurrentChallenges: 2 + extraObjects: + - | + apiVersion: v1 + kind: Secret + metadata: + name: acme-dns-secret + namespace: {{ .Release.Namespace }} # secret must be in same namespace as Cert Manager deployment + type: Opaque + stringData: +{{ $configDir := requiredEnv "CONFIG_DIR" }} + acmedns.json: | +{{ readFile (printf "%s/lego-acme-accounts/acme-dns-accounts.json" $configDir) | indent 8 }} + - | + apiVersion: cert-manager.io/v1 + kind: ClusterIssuer + metadata: + name: cert-issuer + namespace: {{ .Release.Namespace }} + spec: + acme: + email: {{ requiredEnv "OSPARC_DEVOPS_MAIL_ADRESS" }} + server: {{ requiredEnv "DNS_CHALLENGE_ACME_SERVER" }} + privateKeySecretRef: + name: cert-manager-acme-private-key + solvers: + - dns01: + cnameStrategy: Follow + acmeDNS: + accountSecretRef: + name: acme-dns-secret + key: acmedns.json + host: {{ requiredEnv "ACME_DNS_API_BASE" }} diff --git a/charts/cert-manager/values.common.yaml.gotmpl b/charts/cert-manager/values.common.yaml.gotmpl index ba797108..6affd24a 100644 --- a/charts/cert-manager/values.common.yaml.gotmpl +++ b/charts/cert-manager/values.common.yaml.gotmpl @@ -8,3 +8,8 @@ cert-manager: webhook: securePort: 10250 + cainjector: + replicaCount: 1 + replicaCount: 1 + webhook: + replicaCount: 1 diff --git a/charts/cert-manager/values.rfc2136.yaml.gotmpl b/charts/cert-manager/values.rfc2136.yaml.gotmpl deleted file mode 100644 index f6a24905..00000000 --- a/charts/cert-manager/values.rfc2136.yaml.gotmpl +++ /dev/null @@ -1,36 +0,0 @@ -cert-manager: - extraObjects: - - | - apiVersion: v1 - kind: Secret - metadata: - name: rfc2136-credentials - namespace: {{ .Release.Namespace }} # secret must be in same namespace as Cert Manager deployment - type: Opaque - data: - tsig-secret-key: {{ requiredEnv "RFC2136_TSIG_SECRET" | b64enc }} # Base64 encoded Secret Access Key - - | - apiVersion: cert-manager.io/v1 - kind: ClusterIssuer - metadata: - name: cert-issuer - namespace: {{ .Release.Namespace }} - annotations: - # ClusterIssuer depends on cert-manager CRDs. We need to wait for them to be installed before creating the ClusterIssuer - "helm.sh/hook": post-install,post-upgrade - "helm.sh/hook-weight": "1" - spec: - acme: - email: {{ requiredEnv "OSPARC_DEVOPS_MAIL_ADRESS" }} - server: {{ requiredEnv "DNS_CHALLENGE_ACME_SERVER" }} - privateKeySecretRef: - name: cert-manager-acme-private-key - solvers: - - dns01: - rfc2136: - nameserver: {{ requiredEnv "RFC2136_NAMESERVER" }} - tsigKeyName: {{ requiredEnv "RFC2136_TSIG_KEY" }} - tsigAlgorithm: {{ requiredEnv "RFC2136_TSIG_ALGORITHM_CERT_MANAGER" }} - tsigSecretSecretRef: - name: rfc2136-credentials - key: tsig-secret-key From a0d93e1c3ef8fb5d73afb690add0e1ab3c1a6df2 Mon Sep 17 00:00:00 2001 From: Dustin Kaiser Date: Wed, 15 Oct 2025 14:01:23 +0200 Subject: [PATCH 16/19] Experimental: vectordev via host ports --- services/logging/docker-compose.yml.j2 | 17 +++++++++-------- services/logging/template.env | 1 + services/logging/vector.yaml | 4 ++-- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/services/logging/docker-compose.yml.j2 b/services/logging/docker-compose.yml.j2 index 0fa34c0e..1c2f8b26 100644 --- a/services/logging/docker-compose.yml.j2 +++ b/services/logging/docker-compose.yml.j2 @@ -75,7 +75,7 @@ services: aliases: - graylog ports: - - 12201:12201/udp + - 12203:12201/udp - 12202:12202/udp deploy: replicas: 1 @@ -100,19 +100,20 @@ services: - traefik.http.middlewares.graylog_replace_regex.replacepathregex.replacement=/$${1} - traefik.http.routers.graylog.middlewares=ops_whitelist_ips@swarm, ops_gzip@swarm, graylog_replace_regex vector: - image: timberio/vector:0.49.X-debian + image: timberio/vector:0.50.X-debian ports: - - "12200:12201/udp" # GELF input + - "12201:12203/udp" # GELF input volumes: - /var/run/docker.sock:/var/run/docker.sock:ro environment: - VECTOR_CONFIG=/etc/vector/vector.yaml - VECTOR_LOG=info + - VECTOR_LOG_DESTINATION=${VECTOR_LOG_DESTINATION} configs: - source: vector_config target: /etc/vector/vector.yaml deploy: - replicas: 0 + replicas: 1 resources: limits: cpus: "1.0" @@ -120,12 +121,12 @@ services: reservations: memory: 256M labels: [] - networks: - - monitoring - - graylog + networks: [] loki: - image: grafana/loki:3.5.4 + image: grafana/loki:3.5.7 + ports: + - "12204:3100/udp" # GELF input configs: - source: loki_config target: /etc/loki/loki.yaml diff --git a/services/logging/template.env b/services/logging/template.env index cb34724b..3878dc1d 100644 --- a/services/logging/template.env +++ b/services/logging/template.env @@ -24,3 +24,4 @@ S3_FORCE_PATH_STYLE_LOKI=${S3_FORCE_PATH_STYLE_LOKI} S3_REGION_LOKI=${S3_REGION_LOKI} S3_SECRET_KEY_LOKI=${S3_SECRET_KEY_LOKI} STORAGE_DOMAIN=${STORAGE_DOMAIN} +VECTOR_LOG_DESTINATION=${VECTOR_LOG_DESTINATION} diff --git a/services/logging/vector.yaml b/services/logging/vector.yaml index 16ddf97d..5a12f0c0 100644 --- a/services/logging/vector.yaml +++ b/services/logging/vector.yaml @@ -58,7 +58,7 @@ sinks: loki: type: loki inputs: ["process_logs"] - endpoint: "http://loki:3100" + endpoint: "http://${VECTOR_LOG_DESTINATION:?err}:12204" encoding: codec: json labels: @@ -76,7 +76,7 @@ sinks: graylog: type: socket inputs: ["process_logs"] - address: "logging_graylog:12201" + address: "${VECTOR_LOG_DESTINATION:?err}:12203" mode: udp encoding: codec: gelf From 4cf5f1e0e5532a0f105546c1d85d3db87f7e162b Mon Sep 17 00:00:00 2001 From: Dustin Kaiser Date: Wed, 15 Oct 2025 14:05:11 +0200 Subject: [PATCH 17/19] revert --- certificates/Makefile | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/certificates/Makefile b/certificates/Makefile index 019fa89c..a9326900 100644 --- a/certificates/Makefile +++ b/certificates/Makefile @@ -66,10 +66,16 @@ install-root-certificate: rootca.crt ## installs a certificate in the host syste echo "Is the DOCKER service ready? press when ready" && read -n 1; \ fi;\ echo "======================================";,\ - sudo cp $< /etc/ca-certificates/trust-source/anchors/osparc.crt; \ - sudo trust extract-compat && \ - echo "# restarting docker daemon" && \ + $(if $(IS_OSX), \ + sudo security add-trusted-cert -d -k /Library/Keychains/System.keychain $<; \ + echo "Please restart the DOCKER service now..." && read -n 1; \ + echo "Is the DOCKER service ready? press when ready" && read -n 1; \ + , \ + sudo cp $< /usr/local/share/ca-certificates/osparc.crt; \ + sudo update-ca-certificates -f; \ + echo "# restarting docker daemon"; \ sudo systemctl restart docker \ + ) \ ) @@ -84,8 +90,7 @@ remove-root-certificate: ## removes the certificate from the host system $(if $(IS_OSX), \ sudo security remove-trusted-cert -d rootca.crt; \ , \ - sudo rm -f /etc/ca-certificates/trust-source/anchors/osparc.crt; \ - sudo trust extract-compat; \ - sudo systemctl restart docker; \ + sudo rm -f /usr/local/share/ca-certificates/osparc.crt; \ + sudo update-ca-certificates -f; \ ) \ ) From 41cc5c5419e9946edf9be28a8ced826df6c6616e Mon Sep 17 00:00:00 2001 From: Dustin Kaiser Date: Wed, 15 Oct 2025 14:08:20 +0200 Subject: [PATCH 18/19] fix --- services/logging/docker-compose.yml.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/logging/docker-compose.yml.j2 b/services/logging/docker-compose.yml.j2 index 1c2f8b26..5b02ec10 100644 --- a/services/logging/docker-compose.yml.j2 +++ b/services/logging/docker-compose.yml.j2 @@ -102,7 +102,7 @@ services: vector: image: timberio/vector:0.50.X-debian ports: - - "12201:12203/udp" # GELF input + - "12201:12201/udp" # GELF input volumes: - /var/run/docker.sock:/var/run/docker.sock:ro environment: From 035b61e87cf62652dba370b93448f640cc027a96 Mon Sep 17 00:00:00 2001 From: Dustin Kaiser Date: Wed, 15 Oct 2025 15:10:36 +0200 Subject: [PATCH 19/19] fixes --- services/logging/docker-compose.yml.j2 | 33 ++++++++++++++++++-------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/services/logging/docker-compose.yml.j2 b/services/logging/docker-compose.yml.j2 index 5b02ec10..19b29b0a 100644 --- a/services/logging/docker-compose.yml.j2 +++ b/services/logging/docker-compose.yml.j2 @@ -18,7 +18,7 @@ services: memory: 300M cpus: "0.1" networks: - graylog: + logging: aliases: - mongo # needed because of graylog configuration @@ -46,7 +46,7 @@ services: memory: 1G cpus: "0.1" networks: - graylog: + logging: # Graylog: https://hub.docker.com/r/graylog/graylog/ graylog: image: graylog/graylog:6.0.5 @@ -71,12 +71,18 @@ services: networks: public: monitoring: - graylog: + logging: aliases: - graylog ports: - - 12203:12201/udp - - 12202:12202/udp + - target: 12201 + published: 12203 + protocol: udp + mode: host + - target: 12202 + published: 12202 + protocol: udp + mode: host deploy: replicas: 1 restart_policy: @@ -102,7 +108,10 @@ services: vector: image: timberio/vector:0.50.X-debian ports: - - "12201:12201/udp" # GELF input + - target: 12201 + published: 12201 + protocol: udp + mode: host volumes: - /var/run/docker.sock:/var/run/docker.sock:ro environment: @@ -121,12 +130,16 @@ services: reservations: memory: 256M labels: [] - networks: [] + networks: + logging: loki: image: grafana/loki:3.5.7 ports: - - "12204:3100/udp" # GELF input + - target: 3100 + published: 12204 + protocol: tcp + mode: host configs: - source: loki_config target: /etc/loki/loki.yaml @@ -142,7 +155,7 @@ services: deploy: placement: constraints: [] - replicas: 0 + replicas: 1 restart_policy: condition: any delay: 5s @@ -173,7 +186,7 @@ volumes: graylog_journal: networks: - graylog: + logging: public: external: true name: ${PUBLIC_NETWORK}