Skip to content

Commit

Permalink
Merge pull request #552 from raft-tech/release/v3.7.2-sprint-112
Browse files Browse the repository at this point in the history
Release/v3.7.2 sprint 112
  • Loading branch information
ADPennington authored Dec 2, 2024
2 parents c98eb57 + dd5ea65 commit e9bdc33
Show file tree
Hide file tree
Showing 27 changed files with 567 additions and 123 deletions.
3 changes: 1 addition & 2 deletions scripts/cf-check.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,7 @@ else
NEXUS_URL="https://tdp-nexus.dev.raftlabs.tech/repository/tdp-bin/cloudfoundry-cli/$NEXUS_ARCHIVE"
curl $NEXUS_URL -o $NEXUS_ARCHIVE # prefers anonymous, use of -u failed.
tar xzf $NEXUS_ARCHIVE
mv ./cf /usr/local/bin/
chmod +x /usr/local/bin/cf
mv ./cf7 /usr/local/bin/cf
cf --version

fi
11 changes: 7 additions & 4 deletions scripts/deploy-backend.sh
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,6 @@ set_cf_envs()
"LOGGING_LEVEL"
"REDIS_URI"
"JWT_KEY"
"STAGING_JWT_KEY"
"SENDGRID_API_KEY"
)

Expand All @@ -71,9 +70,13 @@ set_cf_envs()
cf_cmd="cf unset-env $CGAPPNAME_BACKEND $var_name ${!var_name}"
$cf_cmd
continue
elif [[ ("$var_name" =~ "STAGING_") && ("$CF_SPACE" = "tanf-staging") ]]; then
sed_var_name=$(echo "$var_name" | sed -e 's@STAGING_@@g')
cf_cmd="cf set-env $CGAPPNAME_BACKEND $sed_var_name ${!var_name}"
elif [[ ("$CF_SPACE" = "tanf-staging") ]]; then
var_value=${!var_name}
staging_var="STAGING_$var_name"
if [[ "${!staging_var}" ]]; then
var_value=${!staging_var}
fi
cf_cmd="cf set-env $CGAPPNAME_BACKEND $var_name ${var_value}"
else
cf_cmd="cf set-env $CGAPPNAME_BACKEND $var_name ${!var_name}"
fi
Expand Down
4 changes: 2 additions & 2 deletions tdrs-backend/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,9 @@ RUN apt-get install -y gcc graphviz graphviz-dev libpq-dev python3-dev vim curl
# Postgres client setup
#RUN bash -c 'echo "deb [trusted=yes] https://tdp-nexus.dev.raftlabs.tech/repository/apt-proxy-postgres/ bullseye-pdpg main" >> /etc/apt/sources.list'
RUN apt-get update -y && apt-get upgrade -y
RUN apt install -y postgresql-common && install -d /usr/share/postgresql-common/pgdg && \
sh -c 'echo "deb [signed-by=/usr/share/postgresql-common/pgdg/apt.postgresql.org.asc trusted=yes] https://tdp-nexus.dev.raftlabs.tech/repository/apt-proxy-postgres/ bullseye-pgdg main" >> /etc/apt/sources.list' && \
RUN apt --purge remove postgresql postgresql-* && apt install -y postgresql-common curl ca-certificates && install -d /usr/share/postgresql-common/pgdg && \
curl -o /usr/share/postgresql-common/pgdg/apt.postgresql.org.asc --fail https://www.postgresql.org/media/keys/ACCC4CF8.asc && \
sh -c 'echo "deb [signed-by=/usr/share/postgresql-common/pgdg/apt.postgresql.org.asc] https://apt.postgresql.org/pub/repos/apt bullseye-pgdg main" > /etc/apt/sources.list.d/pgdg.list' && \
apt -y update && apt -y upgrade && apt install postgresql-client-15 -y

# Install pipenv
Expand Down
13 changes: 12 additions & 1 deletion tdrs-backend/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,15 @@ services:
command: --config /usr/share/grafana/conf/custom.ini
depends_on:
- grafana-pg

# Alertmanager container for the local stack. Its configuration file is
# bind-mounted from ./plg/alertmanager and passed via --config.file.
alertmanager:
  restart: always
  image: prom/alertmanager:v0.27.0
  ports:
    - "9093:9093"
  volumes:
    - ./plg/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml
  # Folded scalar: the lines below are joined with single spaces into one
  # command string.
  command: >-
    --config.file=/etc/alertmanager/alertmanager.yml
    --storage.path=/alertmanager
    --log.level=debug
    --web.external-url=http://localhost:3000/alerts
    --web.route-prefix=/alerts
    --cluster.listen-address=""

prometheus:
restart: always
Expand All @@ -109,12 +118,14 @@ services:
- 9090:9090
volumes:
- ./plg/prometheus/prometheus.local.yml:/etc/prometheus/prometheus.yml
- ./plg/prometheus/django_rules.yml:/etc/prometheus/django_rules.yml
- ./plg/prometheus/django-rules.yml:/etc/prometheus/django-rules.yml
- ./plg/prometheus/alerts.local.yml:/etc/prometheus/alerts.yml
- prometheus_data:/prometheus
depends_on:
- web
- celery-exporter
- postgres-exporter
- alertmanager

promtail:
restart: always
Expand Down
71 changes: 71 additions & 0 deletions tdrs-backend/plg/alertmanager/alertmanager.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
global:
  # SMTP relay (host:port) and sender address used for e-mail notifications.
  smtp_smarthost: 'smtp.sendgrid.net:587'
  # NOTE(review): this value looks redacted/obfuscated here; confirm the real
  # sender address.
  smtp_from: '[email protected]'
  # SMTP credentials. The password is a `{{ ... }}` placeholder, presumably
  # substituted at deploy time (confirm).
  smtp_auth_username: 'apikey'
  smtp_auth_password: '{{ sendgrid_api_key }}'

# Glob pattern(s) for the notification template files to load.
templates: ['/etc/alertmanager/template/*.tmpl']

# Root of the routing tree: every incoming alert enters here.
route:
  # Alerts that share the same values for all of these labels are batched
  # into a single group (for example, several alerts with the same
  # alertname, env and service).
  group_by:
    - 'alertname'
    - 'env'
    - 'service'

  # How long to wait before sending the first notification for a newly
  # created group, so that alerts which start firing shortly after one
  # another are delivered together.
  group_wait: 30s

  # How long to wait before notifying about new alerts that were added to a
  # group for which a notification has already been sent.
  group_interval: 5m

  # How long to wait before re-sending a notification that was already
  # delivered successfully.
  repeat_interval: 5m

  # Default receiver, used when no child route matches.
  receiver: admin-team-emails

  # Child routes inherit all of the settings above and can override any of
  # them.
  routes:
    # Alerts whose alertname matches the regular expression "UpTime" are
    # sent to the dev team, with a longer initial group_wait.
    - matchers:
        - alertname=~"UpTime"
      receiver: dev-team-emails
      group_wait: 30m

# Inhibition rules mute a set of alerts while another alert is firing.
# This rule mutes WARNING-level notifications when the same alert is already
# firing at CRITICAL level.
inhibit_rules:
  - source_matchers:
      - severity="CRITICAL"
    target_matchers:
      - severity="WARNING"
    # Inhibit only when source and target carry equal values for these labels.
    # CAUTION: if every label listed in `equal` is missing from both the
    # source and the target alert, the inhibition rule still applies.
    equal:
      - alertname
      - env
      - service


# Notification targets, referenced by name from the routing tree.
# The `to` values are `{{ ... }}` placeholders, presumably substituted at
# deploy time (confirm).
receivers:
  - name: 'admin-team-emails'
    email_configs:
      - to: '{{ admin_team_emails }}'

  - name: 'dev-team-emails'
    email_configs:
      - to: '{{ dev_team_emails }}'
10 changes: 10 additions & 0 deletions tdrs-backend/plg/alertmanager/manifest.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Cloud Foundry application manifest for the Alertmanager app.
version: 1
applications:
  - name: alertmanager
    memory: 512M
    disk_quota: 1G
    instances: 1
    # NOTE(review): the start command only runs `mkdir /tmp`; presumably a
    # placeholder that is replaced or extended at deploy time. Confirm.
    command: |
      mkdir /tmp
    buildpacks:
      - https://github.com/cloudfoundry/binary-buildpack
120 changes: 115 additions & 5 deletions tdrs-backend/plg/grafana/dashboards/logs_dashboard.json
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,95 @@
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": 4,
"links": [],
"panels": [
{
"datasource": {
"default": true,
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "dark-red",
"value": 80
},
{
"color": "light-red",
"value": 85
},
{
"color": "#EAB839",
"value": 90
},
{
"color": "semi-dark-green",
"value": 95
},
{
"color": "dark-green",
"value": 100
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 14,
"w": 24,
"x": 0,
"y": 0
},
"id": 2,
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "center",
"orientation": "auto",
"percentChangeColorMode": "standard",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "11.2.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"editorMode": "code",
"expr": "100 * avg_over_time(up{job=~\"$app\"}[$__range])",
"instant": false,
"legendFormat": "__auto",
"range": true,
"refId": "A"
}
],
"title": "App Uptime",
"type": "stat"
},
{
"datasource": {
"type": "loki",
Expand All @@ -31,7 +117,7 @@
"h": 28,
"w": 24,
"x": 0,
"y": 0
"y": 14
},
"id": 1,
"options": {
Expand All @@ -56,7 +142,7 @@
"refId": "A"
}
],
"title": "Logs",
"title": "Job Logs",
"type": "logs"
}
],
Expand All @@ -71,7 +157,7 @@
"list": [
{
"current": {
"selected": false,
"selected": true,
"text": "All",
"value": "$__all"
},
Expand All @@ -98,11 +184,35 @@
"skipUrlSync": false,
"sort": 0,
"type": "query"
},
{
"current": {
"selected": true,
"text": "All",
"value": "$__all"
},
"definition": "query_result(up)",
"hide": 0,
"includeAll": true,
"label": "App",
"multi": false,
"name": "app",
"options": [],
"query": {
"qryType": 3,
"query": "query_result(up)",
"refId": "PrometheusVariableQueryEditor-VariableQuery"
},
"refresh": 1,
"regex": "/.*job=\"([^\"]+).*/",
"skipUrlSync": false,
"sort": 0,
"type": "query"
}
]
},
"time": {
"from": "now-3h",
"from": "now-24h",
"to": "now"
},
"timepicker": {},
Expand Down
39 changes: 39 additions & 0 deletions tdrs-backend/plg/prometheus/alerts.local.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Prometheus alerting rules for the local stack.
# Every rule sets a `severity` label. The annotations interpolate the
# `service` and `env` labels of the firing series (presumably attached by the
# scrape configuration; confirm).
groups:
  - name: database.alerts
    rules:
      # Fires when the latest pg_up sample in the last minute is 0 and that
      # stays true for 1 minute.
      - alert: LocalDatabaseDown
        expr: last_over_time(pg_up{job="postgres"}[1m]) == 0
        for: 1m
        labels:
          severity: CRITICAL
        annotations:
          summary: "The {{ $labels.service }} service is down."
          description: "The {{ $labels.service }} service in the {{ $labels.env }} environment has been down for more than 1 minute."
  - name: backend.alerts
    rules:
      # Same pattern as above, for any job whose name starts with
      # "tdp-backend".
      - alert: LocalBackendDown
        expr: last_over_time(up{job=~"tdp-backend.*"}[1m]) == 0
        for: 1m
        labels:
          severity: ERROR
        annotations:
          summary: "The {{ $labels.service }} service is down."
          description: "The {{ $labels.service }} service in the {{ $labels.env }} environment has been down for more than 1 minute."
  - name: plg.alerts
    rules:
      - alert: LocalLokiDown
        expr: last_over_time(up{job="loki"}[1m]) == 0
        # Require the condition to hold for 1 minute before firing, so the
        # rule matches its "down for more than 1 minute" description and the
        # other *Down rules in this file.
        for: 1m
        labels:
          severity: ERROR
        annotations:
          summary: "The {{ $labels.service }} service is down."
          description: "The {{ $labels.service }} service in the {{ $labels.env }} environment has been down for more than 1 minute."
  - name: app.alerts
    rules:
      # No `for` clause: fires as soon as the 1-minute average of `up` for
      # any target drops below 0.95.
      - alert: UpTime
        expr: avg_over_time(up[1m]) < 0.95
        labels:
          severity: WARNING
        annotations:
          summary: "The {{ $labels.service }} service has a uptime warning."
          description: "The {{ $labels.service }} service in the {{ $labels.env }} environment is not maintaining 95% uptime."
Loading

0 comments on commit e9bdc33

Please sign in to comment.