Skip to content

Commit

Permalink
Merge #131339
Browse files Browse the repository at this point in the history
131339: drtprod: yaml config for drt clusters r=sambhav-jain-16 ,vidit-bhat a=nameisbhaskar

This PR adds the YAML configurations for the drt-large and drt-chaos clusters. These configurations create the clusters and perform the default setup, including the Datadog configuration. They also create the workload clusters.

The Datadog setup scripts that are referred to in the configuration are also part of this PR.

Fixes: #125381
Epic: None

Co-authored-by: Bhaskarjyoti Bora <[email protected]>
  • Loading branch information
craig[bot] and nameisbhaskar committed Sep 27, 2024
2 parents 67dc7a1 + 2c0eb17 commit 497c316
Show file tree
Hide file tree
Showing 6 changed files with 417 additions and 0 deletions.
87 changes: 87 additions & 0 deletions pkg/cmd/drtprod/configs/drt_chaos.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
# Yaml for creating and configuring the drt-chaos and workload-chaos clusters. This also configures the datadog.
environment:
  # NOTE(review): this value looks like a redacted/obfuscated e-mail address;
  # restore the real GCE service-account e-mail before using this file. It is
  # quoted so that the leading "[" is read as a string, not a flow sequence.
  ROACHPROD_GCE_DEFAULT_SERVICE_ACCOUNT: "[email protected]"
  ROACHPROD_DNS: drt.crdb.io
  ROACHPROD_GCE_DNS_DOMAIN: drt.crdb.io
  ROACHPROD_GCE_DNS_ZONE: drt
  ROACHPROD_GCE_DEFAULT_PROJECT: cockroach-drt
  # Referenced below as $CLUSTER / $WORKLOAD_CLUSTER.
  CLUSTER: drt-chaos
  WORKLOAD_CLUSTER: workload-chaos

targets:
  # CockroachDB cluster: 6 nodes across three us-east1 zones, 4 local SSDs each.
  - target_name: $CLUSTER
    steps:
      - command: create
        args:
          - $CLUSTER
        flags:
          clouds: gce
          gce-managed: true
          gce-enable-multiple-stores: true
          gce-zones: "us-east1-d,us-east1-b,us-east1-c"
          nodes: 6
          gce-machine-type: n2-standard-16
          local-ssd: true
          gce-local-ssd-count: 4
          username: drt
          lifetime: 8760h
          gce-image: "ubuntu-2204-jammy-v20240319"
        # Presumably run to undo this step when the target fails; confirm
        # against the drtprod executor.
        on_rollback:
          - command: destroy
            args:
              - $CLUSTER
      - command: sync
        flags:
          clouds: gce
      - command: stage
        args:
          - $CLUSTER
          - cockroach
      # The script reads CLUSTER from the environment defined above.
      - script: "pkg/cmd/drtprod/configs/setup_datadog_cluster"
      - command: start
        args:
          - $CLUSTER
          - "--binary"
          - "./cockroach"
        flags:
          enable-fluent-sink: true
          # The create step enables multiple stores over 4 local SSDs; pass the
          # matching store count (as drt_large.yaml does) so all disks are used.
          # NOTE(review): confirm a 4-store layout is intended for drt-chaos.
          store-count: 4
          restart: false
          sql-port: 26257
        on_rollback:
          - command: stop
            args:
              - $CLUSTER
      # Enable cron on the nodes and install an @reboot entry that runs
      # ~/cockroach.sh 100 seconds after boot.
      - command: run
        args:
          - $CLUSTER
          - --
          - "sudo systemctl unmask cron.service ; sudo systemctl enable cron.service ; echo \"crontab -l ; echo '@reboot sleep 100 && ~/cockroach.sh' | crontab -\" > t.sh ; sh t.sh ; rm t.sh"
  # Single-node workload driver cluster.
  - target_name: $WORKLOAD_CLUSTER
    steps:
      - command: create
        args:
          - $WORKLOAD_CLUSTER
        flags:
          clouds: gce
          gce-zones: "us-east1-c"
          nodes: 1
          gce-machine-type: n2-standard-8
          os-volume-size: 100
          username: workload
          lifetime: 8760h
        on_rollback:
          - command: destroy
            args:
              - $WORKLOAD_CLUSTER
      - command: sync
        flags:
          clouds: gce
      - command: stage
        args:
          - $WORKLOAD_CLUSTER
          - cockroach
      - command: stage
        args:
          - $WORKLOAD_CLUSTER
          - workload
      - script: "pkg/cmd/drtprod/configs/setup_datadog_workload"
21 changes: 21 additions & 0 deletions pkg/cmd/drtprod/configs/drt_chaos_destroy.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Yaml for destroying the drt-chaos and workload-chaos clusters.
environment:
  # NOTE(review): this value looks like a redacted/obfuscated e-mail address;
  # restore the real GCE service-account e-mail before using this file. It is
  # quoted so that the leading "[" is read as a string, not a flow sequence.
  ROACHPROD_GCE_DEFAULT_SERVICE_ACCOUNT: "[email protected]"
  ROACHPROD_DNS: drt.crdb.io
  ROACHPROD_GCE_DNS_DOMAIN: drt.crdb.io
  ROACHPROD_GCE_DNS_ZONE: drt
  ROACHPROD_GCE_DEFAULT_PROJECT: cockroach-drt
  # Referenced below as $CLUSTER / $WORKLOAD_CLUSTER.
  CLUSTER: drt-chaos
  WORKLOAD_CLUSTER: workload-chaos

targets:
  # One target per cluster; each has a single destroy step.
  - target_name: $CLUSTER
    steps:
      - command: destroy
        args:
          - $CLUSTER
  - target_name: $WORKLOAD_CLUSTER
    steps:
      - command: destroy
        args:
          - $WORKLOAD_CLUSTER
100 changes: 100 additions & 0 deletions pkg/cmd/drtprod/configs/drt_large.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
# Yaml for creating and configuring the drt-large and workload-large clusters. This also configures the datadog.
environment:
  # NOTE(review): this value looks like a redacted/obfuscated e-mail address;
  # restore the real GCE service-account e-mail before using this file. It is
  # quoted so that the leading "[" is read as a string, not a flow sequence.
  ROACHPROD_GCE_DEFAULT_SERVICE_ACCOUNT: "[email protected]"
  ROACHPROD_DNS: drt.crdb.io
  ROACHPROD_GCE_DNS_DOMAIN: drt.crdb.io
  ROACHPROD_GCE_DNS_ZONE: drt
  ROACHPROD_GCE_DEFAULT_PROJECT: cockroach-drt
  # Referenced below as $CLUSTER / $WORKLOAD_CLUSTER.
  CLUSTER: drt-large
  WORKLOAD_CLUSTER: workload-large

targets:
  # CockroachDB cluster: 15 nodes, 4 local SSDs each.
  - target_name: $CLUSTER
    steps:
      - command: create
        args:
          - $CLUSTER
        flags:
          clouds: gce
          gce-managed: true
          gce-enable-multiple-stores: true
          # The per-zone counts (zone:count) add up to the 15 nodes requested.
          gce-zones: "northamerica-northeast2-a:2,northamerica-northeast2-b:2,northamerica-northeast2-c:1,us-east5-a:2,us-east5-b:2,us-east5-c:1,us-east1-b:2,us-east1-c:2,us-east1-d:1"
          nodes: 15
          gce-machine-type: n2-standard-16
          local-ssd: true
          gce-local-ssd-count: 4
          os-volume-size: 100
          username: drt
          lifetime: 8760h
        # Presumably run to undo this step when the target fails; confirm
        # against the drtprod executor.
        on_rollback:
          - command: destroy
            args:
              - $CLUSTER
      - command: sync
        flags:
          clouds: gce
      - command: stage
        args:
          - $CLUSTER
          - cockroach
      # The script reads CLUSTER from the environment defined above.
      - script: "pkg/cmd/drtprod/configs/setup_datadog_cluster"
      - command: start
        args:
          - $CLUSTER
          - "--binary"
          - "./cockroach"
        flags:
          enable-fluent-sink: true
          # Matches gce-local-ssd-count in the create step.
          store-count: 4
          restart: false
          sql-port: 26257
        on_rollback:
          - command: stop
            args:
              - $CLUSTER
      # Enable cron on the nodes and install an @reboot entry that runs
      # ~/cockroach.sh 100 seconds after boot.
      - command: run
        args:
          - $CLUSTER
          - --
          - "sudo systemctl unmask cron.service ; sudo systemctl enable cron.service ; echo \"crontab -l ; echo '@reboot sleep 100 && ~/cockroach.sh' | crontab -\" > t.sh ; sh t.sh ; rm t.sh"
      # Raise the timeseries and default ranges to 5 replicas / 5 voters,
      # issuing the statements through node 1.
      - command: sql
        args:
          - $CLUSTER:1
          - --
          - -e
          - "ALTER RANGE timeseries CONFIGURE ZONE USING num_replicas=5,num_voters=5"
      - command: sql
        args:
          - $CLUSTER:1
          - --
          - -e
          - "ALTER RANGE default CONFIGURE ZONE USING num_replicas=5,num_voters=5"
  # Workload driver cluster: one node in each of three zones.
  - target_name: $WORKLOAD_CLUSTER
    steps:
      - command: create
        args:
          - $WORKLOAD_CLUSTER
        flags:
          clouds: gce
          gce-zones: "northamerica-northeast2-a,us-east5-a,us-east1-b"
          nodes: 3
          gce-machine-type: n2d-standard-4
          os-volume-size: 100
          username: workload
          lifetime: 8760h
        on_rollback:
          - command: destroy
            args:
              - $WORKLOAD_CLUSTER
      - command: sync
        flags:
          clouds: gce
      - command: stage
        args:
          - $WORKLOAD_CLUSTER
          - cockroach
      - command: stage
        args:
          - $WORKLOAD_CLUSTER
          - workload
      - script: "pkg/cmd/drtprod/configs/setup_datadog_workload"
21 changes: 21 additions & 0 deletions pkg/cmd/drtprod/configs/drt_large_destroy.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Yaml for destroying the drt-large and workload-large clusters.
environment:
  # NOTE(review): this value looks like a redacted/obfuscated e-mail address;
  # restore the real GCE service-account e-mail before using this file. It is
  # quoted so that the leading "[" is read as a string, not a flow sequence.
  ROACHPROD_GCE_DEFAULT_SERVICE_ACCOUNT: "[email protected]"
  ROACHPROD_DNS: drt.crdb.io
  ROACHPROD_GCE_DNS_DOMAIN: drt.crdb.io
  ROACHPROD_GCE_DNS_ZONE: drt
  ROACHPROD_GCE_DEFAULT_PROJECT: cockroach-drt
  # Referenced below as $CLUSTER / $WORKLOAD_CLUSTER.
  CLUSTER: drt-large
  WORKLOAD_CLUSTER: workload-large

targets:
  # One target per cluster; each has a single destroy step.
  - target_name: $CLUSTER
    steps:
      - command: destroy
        args:
          - $CLUSTER
  - target_name: $WORKLOAD_CLUSTER
    steps:
      - command: destroy
        args:
          - $WORKLOAD_CLUSTER
69 changes: 69 additions & 0 deletions pkg/cmd/drtprod/configs/setup_datadog_cluster
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
#!/bin/bash

# Sets up datadog for the drt clusters.
# NOTE - This uses CLUSTER environment variable, if not set the script fails
#
# Steps:
#   1. Fetch the Datadog API key from the cockroach-drt project's secrets.
#   2. Write a fluent-bit override on the cluster that tails the audit log
#      and ships it to Datadog.
#   3. Write /etc/profile.d/99-datadog.sh exporting DD_SITE, DD_API_KEY and
#      DD_TAGS on the cluster.
#   4. Start the opentelemetry and fluent-bit services through roachprod.

if [ -z "${CLUSTER}" ]; then
  echo "environment CLUSTER is not set"
  exit 1
fi

# TODO - this command does not work. We need to replace this with the actual dd_api_key for the script to work

dd_api_key="$(gcloud --project=cockroach-drt secrets versions access latest --secret datadog-api-key)"


if [ -z "${dd_api_key}" ]; then
  echo "Missing Datadog API key!"
  exit 1
fi

dd_site="us5.datadoghq.com"

# The remote command is a double-quoted string, so every ${...} below is
# expanded locally before it is sent to the cluster. ${CLUSTER%:*} drops a
# trailing ":<suffix>" (e.g. a node selector) so that the tag carries only the
# cluster name.
roachprod ssh "$CLUSTER" -- "sudo mkdir -p /etc/fluent-bit && sudo tee /etc/fluent-bit/config-override.yaml > /dev/null << EOF
---
pipeline:
  inputs:
    - name: tail
      path: /var/log/audit/audit.log
      tag: audit
      key: message
      storage.type: filesystem
      alias: audit
  outputs:
    - name: datadog
      match: audit
      host: http-intake.logs.${dd_site}
      tls: on
      compress: gzip
      apikey: ${dd_api_key}
      dd_source: audit
      dd_service: drt-cockroachdb
      dd_tags: env:development,cluster:${CLUSTER%:*},service:drt-cockroachdb,team:drt
      alias: audit
      storage.total_limit_size: 25MB
EOF"

roachprod ssh "$CLUSTER" -- "sudo tee /etc/profile.d/99-datadog.sh > /dev/null << EOF
export DD_SITE=${dd_site}
export DD_API_KEY=${dd_api_key}
export DD_TAGS=env:development,cluster:${CLUSTER%:*},team:drt,service:drt-cockroachdb
EOF"

roachprod opentelemetry-start "$CLUSTER" \
  --datadog-api-key "${dd_api_key}" \
  --datadog-tags 'service:drt-cockroachdb,team:drt'

roachprod fluent-bit-start "$CLUSTER" \
  --datadog-api-key "${dd_api_key}" \
  --datadog-service drt-cockroachdb \
  --datadog-tags 'service:drt-cockroachdb,team:drt'

echo
echo "Updated $CLUSTER configuration to send telemetry data to Datadog."
echo
echo "If this was the first time this script was run against $CLUSTER then"
echo "CockroachDB must be restarted to reload its logging configuration."
echo

exit 0
Loading

0 comments on commit 497c316

Please sign in to comment.