Skip to content

Commit

Permalink
Merge pull request #1 from anyscale/release-0.1.0
Browse files Browse the repository at this point in the history
Release: anyscale-kubernetes-operator 0.1.0
  • Loading branch information
csivanich authored Oct 30, 2024
2 parents 9629980 + 72d84ef commit 412d840
Show file tree
Hide file tree
Showing 14 changed files with 635 additions and 0 deletions.
3 changes: 3 additions & 0 deletions charts/anyscale-kubernetes-operator/Chart.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Chart metadata for the anyscale-kubernetes-operator Helm chart.
apiVersion: v2
name: anyscale-kubernetes-operator
# Chart version; quoted so it is always read as a string.
version: "0.1.0"
3 changes: 3 additions & 0 deletions charts/anyscale-kubernetes-operator/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Anyscale Kubernetes Operator Helm Chart

Refer to the [Anyscale documentation](https://docs.anyscale.com/administration/cloud-deployment/kubernetes/) for the latest installation instructions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
{{- /*
  Secret "anyscale-cli-token": rendered only when .Values.anyscaleCliToken is
  set. The token is stored base64-encoded under the ANYSCALE_CLI_TOKEN key.
*/}}
{{- if .Values.anyscaleCliToken }}
apiVersion: v1
kind: Secret
metadata:
  name: anyscale-cli-token
  namespace: {{ .Release.Namespace }}
type: Opaque
data:
  # Quoted so the encoded token is always emitted as a YAML string.
  ANYSCALE_CLI_TOKEN: {{ .Values.anyscaleCliToken | b64enc | quote }}
{{- end }}
14 changes: 14 additions & 0 deletions charts/anyscale-kubernetes-operator/templates/cluster_role.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# ClusterRole that allows creating TokenReviews and reading (get/watch/list)
# core events and nodes.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  # ClusterRoles are cluster-global, so the release namespace is appended to
  # the name. This supports launching multiple cloud deployments into a single
  # Kubernetes cluster (we assume that clouds do not share namespaces).
  name: anyscale-operator-token-reviewer-{{ .Release.Namespace }}
rules:
  - apiGroups:
      - authentication.k8s.io
    resources:
      - tokenreviews
    verbs:
      - create
  # The empty string selects the core API group.
  - apiGroups:
      - ""
    resources:
      - events
      - nodes
    verbs:
      - get
      - watch
      - list
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Binds the namespace-suffixed token-reviewer ClusterRole to the
# "anyscale-operator" ServiceAccount in the release namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  # ClusterRoleBindings are cluster-global, so the release namespace is
  # appended to the name. This supports launching multiple cloud deployments
  # into a single Kubernetes cluster (we assume that clouds do not share
  # namespaces).
  name: anyscale-operator-token-reviewer-{{ .Release.Namespace }}
subjects:
  - kind: ServiceAccount
    name: anyscale-operator
    namespace: {{ .Release.Namespace }}
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  # The referenced ClusterRole carries the same namespace suffix, for the
  # same reason as the binding name above.
  name: anyscale-operator-token-reviewer-{{ .Release.Namespace }}
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{{- /*
  ConfigMap "instance-types": rendered only when at least one of
  .Values.defaultInstanceTypes / .Values.additionalInstanceTypes is set.
  Whichever values are present are serialized, defaults first, into the single
  instance_types.yaml document.
*/}}
{{- if or .Values.defaultInstanceTypes .Values.additionalInstanceTypes }}
apiVersion: v1
kind: ConfigMap
metadata:
  name: instance-types
  namespace: {{ .Release.Namespace }}
  labels:
    anyscale.com/name: instance-types
data:
  version: v1
  instance_types.yaml: |-
    {{- with .Values.defaultInstanceTypes }}
    {{- toYaml . | nindent 4 }}
    {{- end }}
    {{- with .Values.additionalInstanceTypes }}
    {{- toYaml . | nindent 4 }}
    {{- end }}
{{- end }}
162 changes: 162 additions & 0 deletions charts/anyscale-kubernetes-operator/templates/configmap_patches.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
{{- /*
  ConfigMap "patches": stores, under the patches.yaml key, a list of entries
  of the form { kind, selector (optional), patch }, where patch is a list of
  JSON-Patch-style operations (op / path / value / from). Path segments use
  JSON Pointer escaping, so "/" inside a label or annotation key is written
  as "~1".

  Values read by this template:
    workloadServiceAccountName   optional; sets /spec/serviceAccountName
    workloadDefaultTolerances    maps (all / spot / gpu) of taint key to
                                 { value, effect }; effect defaults to
                                 NoSchedule, operator is Equal when a value
                                 is given and Exists otherwise
    cloudProvider                "aws" or "gcp" selects provider-specific
                                 node selectors
    enableZoneNodeSelector       optional; adds the zone node selector patch
    supportedAccelerators        per-provider map of accelerator type to the
                                 node label value to select
    acceleratorNodeSelector      optional override of the node label key used
                                 for accelerator selection
    additionalPatches            optional list appended verbatim

  Templated scalars are quoted so that values such as "true" or "1" stay
  strings when patches.yaml is parsed again by its consumer.
*/}}
apiVersion: v1
kind: ConfigMap
metadata:
  name: patches
  namespace: {{ .Release.Namespace }}
data:
  version: v1
  patches.yaml: |-
    {{- if .Values.workloadServiceAccountName }}
    ########################################
    # Service Account Support
    ########################################
    - kind: Pod
      patch:
        - op: add
          path: /spec/serviceAccountName
          value: {{ .Values.workloadServiceAccountName | quote }}
    {{- end }}
    ########################################
    # Taint tolerations
    # NOTE: we handle these separate from affinity
    ########################################
    # Tolerations for ALL workloads
    - kind: Pod
      patch:
        {{- range $k, $v := .Values.workloadDefaultTolerances.all }}
        - op: add
          path: /spec/tolerations/-
          value:
            key: {{ $k | quote }}
            operator: {{ if $v.value }}Equal{{ else }}Exists{{ end }}
            {{- if $v.value }}
            value: {{ $v.value | quote }}
            {{- end }}
            effect: {{ $v.effect | default "NoSchedule" | quote }}
        {{- end }}
    # Tolerations for SPOT workloads
    - kind: Pod
      selector: "anyscale.com/market-type in (SPOT)"
      patch:
        {{- range $k, $v := .Values.workloadDefaultTolerances.spot }}
        - op: add
          path: /spec/tolerations/-
          value:
            key: {{ $k | quote }}
            operator: {{ if $v.value }}Equal{{ else }}Exists{{ end }}
            {{- if $v.value }}
            value: {{ $v.value | quote }}
            {{- end }}
            effect: {{ $v.effect | default "NoSchedule" | quote }}
        {{- end }}
    # Tolerations for GPU workloads
    - kind: Pod
      selector: "anyscale.com/accelerator-type"
      patch:
        {{- range $k, $v := .Values.workloadDefaultTolerances.gpu }}
        - op: add
          path: /spec/tolerations/-
          value:
            key: {{ $k | quote }}
            operator: {{ if $v.value }}Equal{{ else }}Exists{{ end }}
            {{- if $v.value }}
            value: {{ $v.value | quote }}
            {{- end }}
            effect: {{ $v.effect | default "NoSchedule" | quote }}
        {{- end }}
    ########################################
    # Market Type Support
    ########################################
    {{- if eq .Values.cloudProvider "aws" }}
    - kind: Pod
      selector: "anyscale.com/market-type in (ON_DEMAND)"
      patch:
        - op: add
          path: /spec/nodeSelector/eks.amazonaws.com~1capacityType
          value: "ON_DEMAND"
        - op: add
          path: /metadata/annotations/cluster-autoscaler.kubernetes.io~1safe-to-evict
          value: "false"
    - kind: Pod
      selector: "anyscale.com/market-type in (SPOT)"
      patch:
        - op: add
          path: /spec/nodeSelector/eks.amazonaws.com~1capacityType
          value: "SPOT"
    {{- else if eq .Values.cloudProvider "gcp" }}
    - kind: Pod
      selector: "anyscale.com/market-type in (SPOT)"
      patch:
        - op: add
          path: /spec/nodeSelector/cloud.google.com~1gke-spot
          value: "true"
    {{- end }}
    {{- if .Values.enableZoneNodeSelector }}
    {{- /*
      NOTE(review): the first "add" below carries no value and is immediately
      followed by a "copy" to the same path. Presumably the consumer accepts
      a value-less add to create the key before copying; confirm against the
      operator's patch semantics before changing it.
    */}}
    ########################################
    # Zone Support
    ########################################
    - kind: Pod
      selector: anyscale.com/zone
      patch:
        - op: add
          path: /spec/nodeSelector/topology.kubernetes.io~1zone
        - op: copy
          from: /metadata/annotations/anyscale.com~1zone
          path: /spec/nodeSelector/topology.kubernetes.io~1zone
    {{- end }}
    ########################################
    # GPU Support
    ########################################
    {{- if eq .Values.cloudProvider "aws" }}
    # Prevent CPU workloads from being scheduled on GPU nodes.
    - kind: Pod
      selector: "!anyscale.com/accelerator-type"
      patch:
        - op: add
          path: /spec/affinity
          value:
            nodeAffinity:
              requiredDuringSchedulingIgnoredDuringExecution:
                nodeSelectorTerms:
                  - matchExpressions:
                      - key: "nvidia.com/gpu.count"
                        operator: DoesNotExist
    {{- range $key, $value := .Values.supportedAccelerators.aws }}
    - kind: Pod
      selector: "anyscale.com/accelerator-type in ({{ $key }})"
      patch:
        - op: add
          {{- if $.Values.acceleratorNodeSelector }}
          {{- /* Escape "/" as "~1" so the label key stays one JSON Pointer segment. */}}
          path: /spec/nodeSelector/{{ $.Values.acceleratorNodeSelector | replace "/" "~1" }}
          {{- else }}
          path: /spec/nodeSelector/nvidia.com~1gpu.product
          {{- end }}
          value: "{{ $value }}"
    {{- end }}
    {{- else if eq .Values.cloudProvider "gcp" }}
    {{- range $key, $value := .Values.supportedAccelerators.gcp }}
    - kind: Pod
      selector: "anyscale.com/accelerator-type in ({{ $key }})"
      patch:
        - op: add
          {{- if $.Values.acceleratorNodeSelector }}
          {{- /* Escape "/" as "~1" so the label key stays one JSON Pointer segment. */}}
          path: /spec/nodeSelector/{{ $.Values.acceleratorNodeSelector | replace "/" "~1" }}
          {{- else }}
          path: /spec/nodeSelector/cloud.google.com~1gke-accelerator
          {{- end }}
          value: "{{ $value }}"
    {{- end }}
    {{- end }}
    ########################################
    # Additional Patches
    ########################################
    {{- with .Values.additionalPatches }}
    {{- toYaml . | nindent 4 }}
    {{- end }}
123 changes: 123 additions & 0 deletions charts/anyscale-kubernetes-operator/templates/configmap_vector.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
{{- /*
  ConfigMap "vector": holds two files.

  vector.yaml    Vector configuration. It tails /tmp/anyscale/logs/*.log and
                 Vector's own internal logs, scrapes two Prometheus endpoints
                 on localhost:2112, tags everything with pod metadata, then
                 ships logs to a Loki endpoint on localhost:3100 and metrics
                 via Prometheus remote write to localhost:3101.
  entrypoint.sh  Waits for the runtime_metadata.csv enrichment file to exist
                 and then starts Vector.

  The dollar-brace POD_NAME / POD_IP / POD_NAMESPACE placeholders are left
  untouched by Helm; presumably Vector substitutes them from the container
  environment when it loads the config (confirm against the pod spec).
  Templated label values are quoted so they always parse as strings.
*/}}
apiVersion: v1
kind: ConfigMap
metadata:
  name: vector
  namespace: {{ .Release.Namespace }}
data:
  vector.yaml: |
    data_dir: /tmp/vector
    sources:
      raw_operator_logs:
        type: file
        include:
          - /tmp/anyscale/logs/*.log
      raw_vector_logs:
        type: internal_logs
      raw_metrics:
        type: prometheus_scrape
        endpoints:
          - http://localhost:2112/metrics
        scrape_interval_secs: 15
      raw_metrics_grpc:
        type: prometheus_scrape
        endpoints:
          - http://localhost:2112/metrics_grpc
        scrape_interval_secs: 15
    transforms:
      vector_logs:
        type: remap
        inputs:
          - raw_vector_logs
        source: |-
          .file = "vector"
      operator_logs:
        type: remap
        inputs:
          - raw_operator_logs
        source: |-
          .file = "anyscale-operator"
      combined_logs:
        type: remap
        inputs:
          - operator_logs
          - vector_logs
        source: |-
          x = .file
          if (parsed, err = parse_json(.message); err == null) {
            . = parsed
          } else {
            y = .message
            . = {}
            .msg = y
          }
          if !exists(.cloud_id) {
            .cloud_id = "unknown"
          }
          .file = x
          .pod = "${POD_NAME}"
          .pod_ip = "${POD_IP}"
          .namespace = "${POD_NAMESPACE}"
          {{- if .Values.region }}
          .region = "{{ .Values.region }}"
          {{- end }}
      system_metrics:
        type: remap
        inputs:
          - raw_metrics
          - raw_metrics_grpc
        source: |-
          .tags.pod = "${POD_NAME}"
          .tags.pod_ip = "${POD_IP}"
          .tags.namespace = "${POD_NAMESPACE}"
          {{- if .Values.region }}
          .tags.region = "{{ .Values.region }}"
          {{- end }}
          .tags.cloud_provider = "{{ .Values.cloudProvider }}"
          .tags.cloud_resource_id = "{{ .Values.cloudDeploymentId }}"
          .tags.cloud_id = get_enrichment_table_record("runtime_metadata", {"key": "cloud_id"}).value ?? "unknown"
    sinks:
      # Forward the combined operator and Vector logs to Loki.
      sink_loki:
        healthcheck: false
        type: loki
        inputs:
          - combined_logs
        endpoint: http://localhost:3100
        labels:
          cloud_id: '{{ "{{" }} cloud_id {{ "}}" }}'
          pod_ip: "${POD_IP}"
          cloudProvider: {{ .Values.cloudProvider | quote }}
          cloud_resource_id: {{ .Values.cloudDeploymentId | quote }}
          file: '{{ "{{" }} file {{ "}}" }}'
          source: anyscale-operator
        encoding:
          codec: json
        compression: snappy
      # Push the scraped, tagged metrics via Prometheus remote write.
      az_metrics_sink:
        type: prometheus_remote_write
        inputs:
          - system_metrics
        endpoint: http://localhost:3101/api/v1/push
        default_namespace: dataplane_aop
        healthcheck:
          enabled: false
    api:
      enabled: true
    enrichment_tables:
      runtime_metadata:
        type: file
        file:
          path: /tmp/config/vector/runtime_metadata.csv
          encoding:
            type: csv
        schema:
          key: string
          value: string
  entrypoint.sh: |
    #!/bin/sh
    # Vector's config references the runtime_metadata enrichment table, so
    # wait until the CSV backing it has been written before starting.
    while [ ! -f '/tmp/config/vector/runtime_metadata.csv' ]; do
      echo Waiting for metadata
      sleep 1
    done
    # exec replaces this shell with Vector so signals reach Vector directly.
    exec /usr/bin/vector --watch-config --log-format json --config-yaml /etc/vector/vector.yaml
Loading

0 comments on commit 412d840

Please sign in to comment.