diff --git a/applications/rag/variables.tf b/applications/rag/variables.tf index f9bbf537f..c80f39791 100644 --- a/applications/rag/variables.tf +++ b/applications/rag/variables.tf @@ -398,7 +398,7 @@ variable "gpu_pools" { name = "gpu-pool-l4" machine_type = "g2-standard-24" autoscaling = true - min_count = 1 + min_count = 0 max_count = 3 disk_size_gb = 200 disk_type = "pd-balanced" diff --git a/applications/ray/kuberay-tpu-webhook/bin/kuberay-tpu-webhook b/applications/ray/kuberay-tpu-webhook/bin/kuberay-tpu-webhook deleted file mode 100755 index 2fe6ec469..000000000 Binary files a/applications/ray/kuberay-tpu-webhook/bin/kuberay-tpu-webhook and /dev/null differ diff --git a/applications/ray/variables.tf b/applications/ray/variables.tf index 63d20c0cf..7b760959b 100644 --- a/applications/ray/variables.tf +++ b/applications/ray/variables.tf @@ -172,7 +172,7 @@ variable "gpu_pools" { name = "gpu-pool-l4" machine_type = "g2-standard-24" autoscaling = true - min_count = 1 + min_count = 0 max_count = 3 disk_size_gb = 100 disk_type = "pd-balanced" diff --git a/benchmarks/README.md b/benchmarks/README.md index 341393709..fed2f4954 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -88,7 +88,7 @@ cd inference-server/text-generation-inference # Copy the sample variables and update the project number and cluster name in # the fleet_host variable "https://connectgateway.googleapis.com/v1/projects//locations/global/gkeMemberships/" # in the `terraform.tfvars` file. -cp ./sample-tfvars/gpu-sample.tfvars terraform.tfvars +cp ./sample-terraform.tfvars terraform.tfvars # Initialize the Terraform modules. 
terraform init diff --git a/benchmarks/benchmark/tools/locust-load-inference/README.md b/benchmarks/benchmark/tools/locust-load-inference/README.md index cc3727b9f..3e5e0f23d 100644 --- a/benchmarks/benchmark/tools/locust-load-inference/README.md +++ b/benchmarks/benchmark/tools/locust-load-inference/README.md @@ -58,6 +58,8 @@ You will set the `gcs_path` in your `terraform.tfvars` to this gcs path containi The Locust workload requires storage.admin access to view the dataset in the given gcs bucket. If you are running with workload identity, it obtains this access via a kubernetes service account that is backed by a gcloud service account. If you followed steps in `../../infra`, then you already have a kubernetes and gcloud service account created that you can use here. +**Note: If you would like your raw benchmark data as a CSV, add the Locust master serviceAccount as a Storage Admin to the GCS bucket** + To give viewer permissions on the gcs bucket to the gcloud service account, run the following: ``` diff --git a/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/custom_metric_aggregator.py b/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/custom_metric_aggregator.py index 7a159e340..474bade8e 100644 --- a/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/custom_metric_aggregator.py +++ b/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/custom_metric_aggregator.py @@ -1,9 +1,14 @@ import datetime import logging import json +import csv +import os +from datetime import datetime +from google.cloud import storage -class TokenMetricCollector: +class MetricCollector: def __init__(self): + self.request_metrics = [] self.tokens_sent = [] self.tokens_received = [] self.test_time = [] @@ -12,6 +17,7 @@ def __init__(self): self.time_to_first_token_list = [] def add_metric(self, sent, received, test_time, request_succesful_bool, ttft): + 
self.request_metrics.append({"success": request_succesful_bool, "input_tokens": sent, "output_tokens": received, "total_request_time": test_time, "time_to_first_token": ttft}) if request_succesful_bool == 1: self.tokens_sent.append(sent) self.tokens_received.append(received) @@ -22,16 +28,17 @@ def add_metric(self, sent, received, test_time, request_succesful_bool, ttft): else: self.failure_count += 1 - def add_metrics(self, tokens_sent, tokens_received, test_time, success_count, failure_count, ttfts): + def add_metrics(self, tokens_sent, tokens_received, test_time, success_count, failure_count, ttfts, request_metrics): self.tokens_sent = self.tokens_sent + tokens_sent self.tokens_received = self.tokens_received + tokens_received self.test_time = self.test_time + test_time self.success_count += success_count self.failure_count += failure_count self.time_to_first_token_list = self.time_to_first_token_list + ttfts + self.request_metrics = self.request_metrics + request_metrics def share_stats(self): - return self.tokens_sent, self.tokens_received, self.test_time, self.success_count, self.failure_count, self.time_to_first_token_list + return self.tokens_sent, self.tokens_received, self.test_time, self.success_count, self.failure_count, self.time_to_first_token_list, self.request_metrics def calculate_average_tokens(self): if self.tokens_sent and len(self.tokens_sent) > 0: @@ -60,4 +67,16 @@ def json_dump_report(self): "average-time-to-first-token": sum(self.time_to_first_token_list)/max(len(self.time_to_first_token_list),1) } return json.dumps(stats) + + def dump_to_csv(self): + fields = ['success', 'total_request_time', 'time_to_first_token', 'input_tokens', 'output_tokens'] + now = datetime.now() + storage_client = storage.Client() + bucket = storage_client.bucket(os.environ['BUCKET']) + timestamp = now.strftime('metrics%Y-%m-%d%H:%M:%S.csv') + blob = bucket.blob(timestamp) + with blob.open('w') as metricsfile: + writer = csv.DictWriter(metricsfile, 
fieldnames=fields) + writer.writeheader() + writer.writerows(self.request_metrics) diff --git a/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/tasks.py b/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/tasks.py index 99f617ab8..3d6e8ab6b 100644 --- a/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/tasks.py +++ b/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/tasks.py @@ -33,8 +33,8 @@ from grpc_interceptor import ClientInterceptor -from custom_metric_aggregator import TokenMetricCollector -local_metric_collector = TokenMetricCollector() +from custom_metric_aggregator import MetricCollector +local_metric_collector = MetricCollector() logging.basicConfig(level=logging.INFO) grpc_gevent.init_gevent() @@ -208,6 +208,7 @@ def on_test_stop(environment, **kwargs): """on test stop the locust master resets metric collector""" if isinstance(environment.runner, MasterRunner): logging.info(f'dumping metrics before clear: {local_metric_collector.json_dump_report()}') + local_metric_collector.dump_to_csv() logging.info(f'init metric_collector') local_metric_collector.__init__() @@ -224,13 +225,14 @@ def on_report_to_master(client_id, data): to the dict that is being sent, and then we clear the local stats in the worker, so as to avoid sending duplicate data to the master on the next run. 
""" - tokens_sent, tokens_recieved, test_time, success_count, failure_count, ttft = local_metric_collector.share_stats() + tokens_sent, tokens_recieved, test_time, success_count, failure_count, ttft, request_metrics = local_metric_collector.share_stats() data["tokens-sent"] = tokens_sent data["tokens-received"] = tokens_recieved data["test-time"] = test_time data["success-count"] = success_count data["failure-count"] = failure_count data["time_to_first_token"] = ttft + data["request-metrics"] = request_metrics local_metric_collector.__init__ @@ -242,7 +244,7 @@ def on_worker_report(client_id, data): stats dict. """ local_metric_collector.add_metrics( - data["tokens-sent"], data["tokens-received"], data["test-time"], data["success-count"], data["failure-count"], data["time_to_first_token"]) + data["tokens-sent"], data["tokens-received"], data["test-time"], data["success-count"], data["failure-count"], data["time_to_first_token"], data["request-metrics"]) @events.init_command_line_parser.add_listener @@ -339,13 +341,16 @@ def grpc_infer(self): ) logging.info(f"Prompt: {prompt}") #return values format is from the interceptor, which makes the actual call - output, ttft, response_time = self.stub.Decode(request) - logging.info(f"Response: {output}") - - number_of_input_tokens = len(tokenizer.encode(prompt)) - number_of_output_tokens = len(tokenizer.encode(output)) - send_metrics(number_of_input_tokens, number_of_output_tokens, response_time,1, ttft) + try: + output, ttft, response_time = self.stub.Decode(request) + logging.info(f"Response: {output}") + number_of_input_tokens = len(tokenizer.encode(prompt)) + number_of_output_tokens = len(tokenizer.encode(output)) + send_metrics(number_of_input_tokens, number_of_output_tokens, response_time, 1, ttft) + except: + # Capture that a test was ran, but the request threw an exception + send_metrics(-1,-1,-1,0,-1) class LocustInterceptor(ClientInterceptor): def __init__(self, environment, *args, **kwargs): @@ -371,11 +376,11 @@ 
def intercept( # chunk sent back is used to calculate time to first token(TTFT). for response in responses: if ttft == 0: - ttft = time.perf_counter() - start_perf_counter + ttft = (time.perf_counter() - start_perf_counter) * 1000 output += response.response[0] response_length += response.ByteSize() response_time_ms = (time.perf_counter() - start_perf_counter) * 1000 - logging.info(f"response_time {response_time_ms}; ttft:{ttft * 1000}") + logging.info(f"response_time {response_time_ms}; ttft:{ttft}") self.env.events.request.fire( request_type="grpc", name=call_details.method, diff --git a/benchmarks/benchmark/tools/locust-load-inference/main.tf b/benchmarks/benchmark/tools/locust-load-inference/main.tf index 81115fd66..ff557f9ee 100644 --- a/benchmarks/benchmark/tools/locust-load-inference/main.tf +++ b/benchmarks/benchmark/tools/locust-load-inference/main.tf @@ -50,6 +50,7 @@ locals { k8s_hf_secret_list = var.k8s_hf_secret == null ? [] : [var.k8s_hf_secret] stop_timeout = var.stop_timeout request_type = var.request_type + bucket = var.output_bucket })) : data] ]) } diff --git a/benchmarks/benchmark/tools/locust-load-inference/manifest-templates/locust-master-controller.yaml.tpl b/benchmarks/benchmark/tools/locust-load-inference/manifest-templates/locust-master-controller.yaml.tpl index 0b443879a..4f3230a18 100644 --- a/benchmarks/benchmark/tools/locust-load-inference/manifest-templates/locust-master-controller.yaml.tpl +++ b/benchmarks/benchmark/tools/locust-load-inference/manifest-templates/locust-master-controller.yaml.tpl @@ -31,6 +31,8 @@ spec: value: ${stop_timeout} - name: REQUEST_TYPE value: ${request_type} + - name: BUCKET + value: ${bucket} ports: - name: loc-master-web containerPort: 8089 diff --git a/benchmarks/benchmark/tools/locust-load-inference/sample-dashboards/tgi-dashboard.yaml b/benchmarks/benchmark/tools/locust-load-inference/sample-dashboards/tgi-dashboard.yaml new file mode 100644 index 000000000..488e9a425 --- /dev/null +++ 
b/benchmarks/benchmark/tools/locust-load-inference/sample-dashboards/tgi-dashboard.yaml @@ -0,0 +1,694 @@ +displayName: Benchmark +mosaicLayout: + columns: 48 + tiles: + - height: 13 + widget: + title: Locust Request Avg Response Time + xyChart: + chartOptions: + mode: COLOR + dataSets: + - minAlignmentPeriod: 60s + plotType: LINE + targetAxis: Y1 + timeSeriesQuery: + timeSeriesFilter: + aggregation: + alignmentPeriod: 60s + perSeriesAligner: ALIGN_MEAN + filter: metric.type="prometheus.googleapis.com/locust_requests_avg_response_time/gauge" + resource.type="prometheus_target" + yAxis: + scale: LINEAR + width: 11 + xPos: 27 + yPos: 15 + - height: 16 + widget: + title: TGI Queue Size + xyChart: + chartOptions: + mode: COLOR + dataSets: + - minAlignmentPeriod: 60s + plotType: LINE + targetAxis: Y1 + timeSeriesQuery: + timeSeriesFilter: + aggregation: + alignmentPeriod: 60s + perSeriesAligner: ALIGN_MEAN + filter: metric.type="prometheus.googleapis.com/tgi_queue_size/gauge" + resource.type="prometheus_target" + yAxis: + scale: LINEAR + width: 14 + xPos: 17 + yPos: 45 + - height: 14 + widget: + title: Tgi batch current size + xyChart: + chartOptions: + mode: COLOR + dataSets: + - minAlignmentPeriod: 60s + plotType: LINE + targetAxis: Y1 + timeSeriesQuery: + timeSeriesFilter: + aggregation: + alignmentPeriod: 60s + crossSeriesReducer: REDUCE_MEAN + perSeriesAligner: ALIGN_MEAN + filter: metric.type="prometheus.googleapis.com/tgi_batch_current_size/gauge" + resource.type="prometheus_target" + yAxis: + scale: LINEAR + width: 14 + yPos: 76 + - height: 15 + widget: + title: TGI Batch Current Max Tokens + xyChart: + chartOptions: + mode: COLOR + dataSets: + - minAlignmentPeriod: 60s + plotType: LINE + targetAxis: Y1 + timeSeriesQuery: + timeSeriesFilter: + aggregation: + alignmentPeriod: 60s + perSeriesAligner: ALIGN_MEAN + filter: metric.type="prometheus.googleapis.com/tgi_batch_current_max_tokens/gauge" + resource.type="prometheus_target" + yAxis: + scale: LINEAR + width: 
18 + xPos: 30 + yPos: 74 + - height: 15 + widget: + title: TGI Request Mean Time per token (s) + xyChart: + chartOptions: + mode: COLOR + dataSets: + - minAlignmentPeriod: 60s + plotType: LINE + targetAxis: Y1 + timeSeriesQuery: + timeSeriesFilter: + aggregation: + alignmentPeriod: 60s + crossSeriesReducer: REDUCE_PERCENTILE_99 + perSeriesAligner: ALIGN_DELTA + filter: metric.type="prometheus.googleapis.com/tgi_request_mean_time_per_token_duration/histogram" + resource.type="prometheus_target" + - minAlignmentPeriod: 60s + plotType: LINE + targetAxis: Y1 + timeSeriesQuery: + timeSeriesFilter: + aggregation: + alignmentPeriod: 60s + crossSeriesReducer: REDUCE_PERCENTILE_95 + perSeriesAligner: ALIGN_DELTA + filter: metric.type="prometheus.googleapis.com/tgi_request_mean_time_per_token_duration/histogram" + resource.type="prometheus_target" + - minAlignmentPeriod: 60s + plotType: LINE + targetAxis: Y1 + timeSeriesQuery: + timeSeriesFilter: + aggregation: + alignmentPeriod: 60s + crossSeriesReducer: REDUCE_PERCENTILE_50 + perSeriesAligner: ALIGN_DELTA + filter: metric.type="prometheus.googleapis.com/tgi_request_mean_time_per_token_duration/histogram" + resource.type="prometheus_target" + - minAlignmentPeriod: 60s + plotType: LINE + targetAxis: Y1 + timeSeriesQuery: + timeSeriesFilter: + aggregation: + alignmentPeriod: 60s + crossSeriesReducer: REDUCE_PERCENTILE_05 + perSeriesAligner: ALIGN_DELTA + filter: metric.type="prometheus.googleapis.com/tgi_request_mean_time_per_token_duration/histogram" + resource.type="prometheus_target" + yAxis: + scale: LINEAR + width: 15 + xPos: 12 + yPos: 61 + - height: 15 + widget: + title: prometheus/DCGM_FI_DEV_MEM_COPY_UTIL/gauge [MEAN] + xyChart: + chartOptions: + mode: COLOR + dataSets: + - minAlignmentPeriod: 60s + plotType: LINE + targetAxis: Y1 + timeSeriesQuery: + timeSeriesFilter: + aggregation: + alignmentPeriod: 60s + perSeriesAligner: ALIGN_MEAN + filter: metric.type="prometheus.googleapis.com/DCGM_FI_DEV_MEM_COPY_UTIL/gauge" 
+ resource.type="prometheus_target" + yAxis: + scale: LINEAR + width: 10 + xPos: 38 + yPos: 28 + - height: 14 + widget: + title: Locust Requests Max Response Time + xyChart: + chartOptions: + mode: COLOR + dataSets: + - minAlignmentPeriod: 60s + plotType: LINE + targetAxis: Y1 + timeSeriesQuery: + timeSeriesFilter: + aggregation: + alignmentPeriod: 60s + crossSeriesReducer: REDUCE_MEAN + perSeriesAligner: ALIGN_MEAN + filter: metric.type="prometheus.googleapis.com/locust_requests_max_response_time/gauge" + resource.type="prometheus_target" + yAxis: + scale: LINEAR + width: 15 + xPos: 12 + yPos: 15 + - height: 15 + widget: + title: Tgi batch inference duration + xyChart: + chartOptions: + mode: COLOR + dataSets: + - minAlignmentPeriod: 60s + plotType: LINE + targetAxis: Y1 + timeSeriesQuery: + timeSeriesFilter: + aggregation: + alignmentPeriod: 60s + crossSeriesReducer: REDUCE_PERCENTILE_99 + perSeriesAligner: ALIGN_DELTA + filter: metric.type="prometheus.googleapis.com/tgi_batch_inference_duration/histogram" + resource.type="prometheus_target" + - minAlignmentPeriod: 60s + plotType: LINE + targetAxis: Y1 + timeSeriesQuery: + timeSeriesFilter: + aggregation: + alignmentPeriod: 60s + crossSeriesReducer: REDUCE_PERCENTILE_95 + perSeriesAligner: ALIGN_DELTA + filter: metric.type="prometheus.googleapis.com/tgi_batch_inference_duration/histogram" + resource.type="prometheus_target" + - minAlignmentPeriod: 60s + plotType: LINE + targetAxis: Y1 + timeSeriesQuery: + timeSeriesFilter: + aggregation: + alignmentPeriod: 60s + crossSeriesReducer: REDUCE_PERCENTILE_50 + perSeriesAligner: ALIGN_DELTA + filter: metric.type="prometheus.googleapis.com/tgi_batch_inference_duration/histogram" + resource.type="prometheus_target" + - minAlignmentPeriod: 60s + plotType: LINE + targetAxis: Y1 + timeSeriesQuery: + timeSeriesFilter: + aggregation: + alignmentPeriod: 60s + crossSeriesReducer: REDUCE_PERCENTILE_05 + perSeriesAligner: ALIGN_DELTA + filter: 
metric.type="prometheus.googleapis.com/tgi_batch_inference_duration/histogram" + resource.type="prometheus_target" + yAxis: + scale: LINEAR + width: 15 + xPos: 33 + yPos: 59 + - height: 16 + widget: + title: Tgi Request Inference Duration + xyChart: + chartOptions: + mode: COLOR + dataSets: + - minAlignmentPeriod: 60s + plotType: LINE + targetAxis: Y1 + timeSeriesQuery: + timeSeriesFilter: + aggregation: + alignmentPeriod: 60s + crossSeriesReducer: REDUCE_PERCENTILE_99 + perSeriesAligner: ALIGN_DELTA + filter: metric.type="prometheus.googleapis.com/tgi_request_inference_duration/histogram" + resource.type="prometheus_target" + - minAlignmentPeriod: 60s + plotType: LINE + targetAxis: Y1 + timeSeriesQuery: + timeSeriesFilter: + aggregation: + alignmentPeriod: 60s + crossSeriesReducer: REDUCE_PERCENTILE_95 + perSeriesAligner: ALIGN_DELTA + filter: metric.type="prometheus.googleapis.com/tgi_request_inference_duration/histogram" + resource.type="prometheus_target" + - minAlignmentPeriod: 60s + plotType: LINE + targetAxis: Y1 + timeSeriesQuery: + timeSeriesFilter: + aggregation: + alignmentPeriod: 60s + crossSeriesReducer: REDUCE_PERCENTILE_50 + perSeriesAligner: ALIGN_DELTA + filter: metric.type="prometheus.googleapis.com/tgi_request_inference_duration/histogram" + resource.type="prometheus_target" + - minAlignmentPeriod: 60s + plotType: LINE + targetAxis: Y1 + timeSeriesQuery: + timeSeriesFilter: + aggregation: + alignmentPeriod: 60s + crossSeriesReducer: REDUCE_PERCENTILE_05 + perSeriesAligner: ALIGN_DELTA + filter: metric.type="prometheus.googleapis.com/tgi_request_inference_duration/histogram" + resource.type="prometheus_target" + yAxis: + scale: LINEAR + width: 12 + yPos: 90 + - height: 15 + widget: + title: TGI Pod count + xyChart: + chartOptions: + mode: COLOR + dataSets: + - minAlignmentPeriod: 60s + plotType: LINE + targetAxis: Y1 + timeSeriesQuery: + timeSeriesFilter: + aggregation: + alignmentPeriod: 60s + crossSeriesReducer: REDUCE_SUM + groupByFields: + - 
resource.label."pod_name" + perSeriesAligner: ALIGN_MEAN + filter: metric.type="kubernetes.io/container/uptime" resource.type="k8s_container" + resource.label."pod_name"=monitoring.regex.full_match("tgi.*") + secondaryAggregation: + alignmentPeriod: 60s + crossSeriesReducer: REDUCE_COUNT + perSeriesAligner: ALIGN_MEAN + yAxis: + scale: LINEAR + width: 13 + - height: 15 + widget: + title: Tgi request queue duration + xyChart: + chartOptions: + mode: COLOR + dataSets: + - minAlignmentPeriod: 60s + plotType: LINE + targetAxis: Y1 + timeSeriesQuery: + timeSeriesFilter: + aggregation: + alignmentPeriod: 60s + crossSeriesReducer: REDUCE_PERCENTILE_99 + perSeriesAligner: ALIGN_DELTA + filter: metric.type="prometheus.googleapis.com/tgi_request_queue_duration/histogram" + resource.type="prometheus_target" + - minAlignmentPeriod: 60s + plotType: LINE + targetAxis: Y1 + timeSeriesQuery: + timeSeriesFilter: + aggregation: + alignmentPeriod: 60s + crossSeriesReducer: REDUCE_PERCENTILE_95 + perSeriesAligner: ALIGN_DELTA + filter: metric.type="prometheus.googleapis.com/tgi_request_queue_duration/histogram" + resource.type="prometheus_target" + - minAlignmentPeriod: 60s + plotType: LINE + targetAxis: Y1 + timeSeriesQuery: + timeSeriesFilter: + aggregation: + alignmentPeriod: 60s + crossSeriesReducer: REDUCE_PERCENTILE_50 + perSeriesAligner: ALIGN_DELTA + filter: metric.type="prometheus.googleapis.com/tgi_request_queue_duration/histogram" + resource.type="prometheus_target" + - minAlignmentPeriod: 60s + plotType: LINE + targetAxis: Y1 + timeSeriesQuery: + timeSeriesFilter: + aggregation: + alignmentPeriod: 60s + crossSeriesReducer: REDUCE_PERCENTILE_05 + perSeriesAligner: ALIGN_DELTA + filter: metric.type="prometheus.googleapis.com/tgi_request_queue_duration/histogram" + resource.type="prometheus_target" + yAxis: + scale: LINEAR + width: 14 + xPos: 13 + yPos: 90 + - height: 14 + widget: + title: Locust Requests Current RPS + xyChart: + chartOptions: + mode: COLOR + dataSets: + - 
minAlignmentPeriod: 60s + plotType: LINE + targetAxis: Y1 + timeSeriesQuery: + timeSeriesFilter: + aggregation: + alignmentPeriod: 60s + perSeriesAligner: ALIGN_MEAN + filter: metric.type="prometheus.googleapis.com/locust_requests_current_rps/gauge" + resource.type="prometheus_target" + yAxis: + scale: LINEAR + width: 12 + yPos: 15 + - height: 15 + widget: + title: Locust Avg Output Tokens Per Sec + xyChart: + chartOptions: + mode: COLOR + dataSets: + - minAlignmentPeriod: 60s + plotType: LINE + targetAxis: Y1 + timeSeriesQuery: + timeSeriesFilter: + aggregation: + alignmentPeriod: 60s + crossSeriesReducer: REDUCE_SUM + perSeriesAligner: ALIGN_MEAN + filter: metric.type="prometheus.googleapis.com/locust_custom_metrics_avg_tokens_received/gauge" + resource.type="prometheus_target" + yAxis: + scale: LINEAR + width: 13 + xPos: 13 + - height: 16 + widget: + title: Nvidia GPU Utilization Mean + xyChart: + chartOptions: + mode: COLOR + dataSets: + - minAlignmentPeriod: 60s + plotType: LINE + targetAxis: Y1 + timeSeriesQuery: + timeSeriesFilter: + aggregation: + alignmentPeriod: 60s + crossSeriesReducer: REDUCE_MEAN + perSeriesAligner: ALIGN_MEAN + filter: metric.type="prometheus.googleapis.com/DCGM_FI_DEV_GPU_UTIL/gauge" + resource.type="prometheus_target" + yAxis: + scale: LINEAR + width: 13 + yPos: 29 + - height: 16 + widget: + title: prometheus/DCGM_FI_DEV_FB_USED/gauge [MEAN] + xyChart: + chartOptions: + mode: COLOR + dataSets: + - minAlignmentPeriod: 60s + plotType: LINE + targetAxis: Y1 + timeSeriesQuery: + timeSeriesFilter: + aggregation: + alignmentPeriod: 60s + perSeriesAligner: ALIGN_MEAN + filter: metric.type="prometheus.googleapis.com/DCGM_FI_DEV_FB_USED/gauge" + resource.type="prometheus_target" + yAxis: + scale: LINEAR + width: 13 + xPos: 13 + yPos: 29 + - height: 15 + widget: + title: prometheus/DCGM_FI_PROF_PIPE_TENSOR_ACTIVE/gauge [MEAN] + xyChart: + chartOptions: + mode: COLOR + dataSets: + - minAlignmentPeriod: 60s + plotType: LINE + targetAxis: Y1 + 
timeSeriesQuery: + timeSeriesFilter: + aggregation: + alignmentPeriod: 60s + perSeriesAligner: ALIGN_MEAN + filter: metric.type="prometheus.googleapis.com/DCGM_FI_PROF_PIPE_TENSOR_ACTIVE/gauge" + resource.type="prometheus_target" + yAxis: + scale: LINEAR + width: 11 + xPos: 1 + yPos: 61 + - height: 15 + widget: + title: Locust Workers Count + xyChart: + chartOptions: + mode: COLOR + dataSets: + - minAlignmentPeriod: 60s + plotType: LINE + targetAxis: Y1 + timeSeriesQuery: + timeSeriesFilter: + aggregation: + alignmentPeriod: 60s + crossSeriesReducer: REDUCE_SUM + perSeriesAligner: ALIGN_MEAN + filter: metric.type="prometheus.googleapis.com/locust_workers_running_count/gauge" + resource.type="prometheus_target" + yAxis: + scale: LINEAR + width: 9 + xPos: 39 + - height: 15 + widget: + title: Locust Users + xyChart: + chartOptions: + mode: COLOR + dataSets: + - minAlignmentPeriod: 60s + plotType: LINE + targetAxis: Y1 + timeSeriesQuery: + timeSeriesFilter: + aggregation: + alignmentPeriod: 60s + perSeriesAligner: ALIGN_MEAN + filter: metric.type="prometheus.googleapis.com/locust_users/gauge" resource.type="prometheus_target" + yAxis: + scale: LINEAR + width: 13 + xPos: 26 + - height: 16 + widget: + title: TGI Queue Size Mean + xyChart: + chartOptions: + mode: COLOR + dataSets: + - minAlignmentPeriod: 60s + plotType: LINE + targetAxis: Y1 + timeSeriesQuery: + timeSeriesFilter: + aggregation: + alignmentPeriod: 60s + crossSeriesReducer: REDUCE_MEAN + perSeriesAligner: ALIGN_MEAN + filter: metric.type="prometheus.googleapis.com/tgi_queue_size/gauge" + resource.type="prometheus_target" + yAxis: + scale: LINEAR + width: 16 + yPos: 45 + - height: 15 + widget: + title: Tgi request queue duration mean + xyChart: + chartOptions: + mode: COLOR + dataSets: + - minAlignmentPeriod: 60s + plotType: LINE + targetAxis: Y1 + timeSeriesQuery: + timeSeriesFilter: + aggregation: + alignmentPeriod: 60s + crossSeriesReducer: REDUCE_MEAN + perSeriesAligner: ALIGN_DELTA + filter: 
metric.type="prometheus.googleapis.com/tgi_request_queue_duration/histogram" + resource.type="prometheus_target" + yAxis: + scale: LINEAR + width: 14 + xPos: 31 + yPos: 44 + - height: 13 + widget: + title: Locust requests fail ratio + xyChart: + chartOptions: + mode: COLOR + dataSets: + - minAlignmentPeriod: 60s + plotType: LINE + targetAxis: Y1 + timeSeriesQuery: + timeSeriesFilter: + aggregation: + alignmentPeriod: 60s + perSeriesAligner: ALIGN_MEAN + filter: metric.type="prometheus.googleapis.com/locust_requests_fail_ratio/gauge" + resource.type="prometheus_target" + yAxis: + scale: LINEAR + width: 10 + xPos: 38 + yPos: 15 + - height: 15 + widget: + title: Kubernetes Container - Accelerator duty cycle [MEAN] + xyChart: + chartOptions: + mode: COLOR + dataSets: + - minAlignmentPeriod: 60s + plotType: LINE + targetAxis: Y1 + timeSeriesQuery: + timeSeriesFilter: + aggregation: + alignmentPeriod: 60s + perSeriesAligner: ALIGN_MEAN + filter: metric.type="kubernetes.io/container/accelerator/duty_cycle" + resource.type="k8s_container" + yAxis: + scale: LINEAR + width: 12 + xPos: 26 + yPos: 29 + - height: 16 + widget: + title: Nvidia GPU Utilization Per GPU + xyChart: + chartOptions: + mode: COLOR + dataSets: + - minAlignmentPeriod: 60s + plotType: LINE + targetAxis: Y1 + timeSeriesQuery: + timeSeriesFilter: + aggregation: + alignmentPeriod: 60s + perSeriesAligner: ALIGN_MEAN + filter: metric.type="prometheus.googleapis.com/DCGM_FI_DEV_GPU_UTIL/gauge" + resource.type="prometheus_target" + yAxis: + scale: LINEAR + width: 20 + xPos: 28 + yPos: 89 + - height: 14 + widget: + title: Tgi batch current size per pod + xyChart: + chartOptions: + mode: COLOR + dataSets: + - minAlignmentPeriod: 60s + plotType: LINE + targetAxis: Y1 + timeSeriesQuery: + timeSeriesFilter: + aggregation: + alignmentPeriod: 60s + perSeriesAligner: ALIGN_MEAN + filter: metric.type="prometheus.googleapis.com/tgi_batch_current_size/gauge" + resource.type="prometheus_target" + yAxis: + scale: LINEAR + 
width: 14 + xPos: 14 + yPos: 76 + - height: 16 + widget: + title: prometheus/DCGM_FI_DEV_FB_USED/gauge [SUM] + xyChart: + chartOptions: + mode: COLOR + dataSets: + - minAlignmentPeriod: 60s + plotType: LINE + targetAxis: Y1 + timeSeriesQuery: + timeSeriesFilter: + aggregation: + alignmentPeriod: 60s + crossSeriesReducer: REDUCE_SUM + perSeriesAligner: ALIGN_MEAN + filter: metric.type="prometheus.googleapis.com/DCGM_FI_DEV_FB_USED/gauge" + resource.type="prometheus_target" + yAxis: + scale: LINEAR + width: 24 + xPos: 12 + yPos: 105 diff --git a/benchmarks/benchmark/tools/locust-load-inference/sample-tfvars/tgi-sample.tfvars b/benchmarks/benchmark/tools/locust-load-inference/sample-tfvars/tgi-sample.tfvars index 255f97660..9296b20b0 100644 --- a/benchmarks/benchmark/tools/locust-load-inference/sample-tfvars/tgi-sample.tfvars +++ b/benchmarks/benchmark/tools/locust-load-inference/sample-tfvars/tgi-sample.tfvars @@ -2,16 +2,18 @@ credentials_config = { fleet_host = "https://connectgateway.googleapis.com/v1/projects/$PROJECT_NUM/locations/global/gkeMemberships/ai-benchmark" } -project_id = "change-me" +project_id = "$PROJECT_ID" namespace = "benchmark" ksa = "benchmark-ksa" +k8s_hf_secret = "hf-token" + # Locust service configuration artifact_registry = "us-central1-docker.pkg.dev/$PROJECT_ID/ai-benchmark" inference_server_service = "tgi" # inference server service name locust_runner_kubernetes_service_account = "sample-runner-ksa" -output_bucket = "benchmark-output" +output_bucket = "${PROJECT_ID}-benchmark-output" gcs_path = "gs://${PROJECT_ID}-ai-gke-benchmark-fuse/ShareGPT_V3_unfiltered_cleaned_split_filtered_prompts.txt" # Benchmark configuration for Locust Docker accessing inference server diff --git a/benchmarks/inference-server/text-generation-inference/sample-terraform.tfvars b/benchmarks/inference-server/text-generation-inference/sample-terraform.tfvars index c04015e83..f6d7d17a9 100644 --- 
a/benchmarks/inference-server/text-generation-inference/sample-terraform.tfvars +++ b/benchmarks/inference-server/text-generation-inference/sample-terraform.tfvars @@ -2,6 +2,8 @@ credentials_config = { fleet_host = "https://connectgateway.googleapis.com/v1/projects/$PROJECT_NUMBER/locations/global/gkeMemberships/ai-benchmark" } +project_id = "$PROJECT_ID" + namespace = "benchmark" ksa = "benchmark-ksa" model_id = "tiiuae/falcon-7b" @@ -33,5 +35,3 @@ hpa_type = null # Adjust these if you want different min/max values # hpa_min_replicas = 1 # hpa_max_replicas = 5 - -project_id = "" diff --git a/benchmarks/infra/stage-1/sample-tfvars/gpu-sample.tfvars b/benchmarks/infra/stage-1/sample-tfvars/gpu-sample.tfvars index 2557f9c6f..8aae63b68 100644 --- a/benchmarks/infra/stage-1/sample-tfvars/gpu-sample.tfvars +++ b/benchmarks/infra/stage-1/sample-tfvars/gpu-sample.tfvars @@ -1,5 +1,4 @@ -project_id = "change-me" -// TODO: change all instances of clusterName to be ai-gpu-benchmark. +project_id = "$PROJECT_ID" cluster_name = "ai-benchmark" region = "us-central1" gke_location = "us-central1-a" diff --git a/benchmarks/infra/stage-2/modules/gke-setup/modules/nvidia-dcgm/manifest-templates/03-cm-dcgm.yaml b/benchmarks/infra/stage-2/modules/gke-setup/modules/nvidia-dcgm/manifest-templates/03-cm-dcgm.yaml index 17a24ef6f..e4972a3a0 100644 --- a/benchmarks/infra/stage-2/modules/gke-setup/modules/nvidia-dcgm/manifest-templates/03-cm-dcgm.yaml +++ b/benchmarks/infra/stage-2/modules/gke-setup/modules/nvidia-dcgm/manifest-templates/03-cm-dcgm.yaml @@ -23,6 +23,11 @@ data: DCGM_FI_DEV_GPU_UTIL, gauge, GPU utilization (in %). DCGM_FI_DEV_MEM_COPY_UTIL, gauge, Memory utilization (in %). + # Temperature and power usage,, + DCGM_FI_DEV_GPU_TEMP, gauge, Current temperature readings for the device in degrees C. + DCGM_FI_DEV_MEMORY_TEMP, gauge, Memory temperature for the device. + DCGM_FI_DEV_POWER_USAGE, gauge, Power usage for the device in Watts. 
+ # Utilization of IP blocks,, DCGM_FI_PROF_SM_ACTIVE, gauge, The ratio of cycles an SM has at least 1 warp assigned DCGM_FI_PROF_SM_OCCUPANCY, gauge, The fraction of resident warps on a multiprocessor diff --git a/benchmarks/infra/stage-2/sample-tfvars/gpu-sample.tfvars b/benchmarks/infra/stage-2/sample-tfvars/gpu-sample.tfvars index 7900e9b9f..90e965e37 100644 --- a/benchmarks/infra/stage-2/sample-tfvars/gpu-sample.tfvars +++ b/benchmarks/infra/stage-2/sample-tfvars/gpu-sample.tfvars @@ -6,7 +6,7 @@ credentials_config = { # can be obtained from stage-1 by running: # terraform output -json | jq '."project_id".value' -project_id = "change-me" +project_id = "$PROJECT_ID" bucket_name = "${PROJECT_ID}-ai-gke-benchmark-fuse" bucket_location = "US" diff --git a/benchmarks/infra/stage-2/variables.tf b/benchmarks/infra/stage-2/variables.tf index e68ace4b6..ef35b858b 100644 --- a/benchmarks/infra/stage-2/variables.tf +++ b/benchmarks/infra/stage-2/variables.tf @@ -142,5 +142,5 @@ variable "nvidia_dcgm_create" { variable "gcs_fuse_create" { description = "Give the SA object admin privileges" type = bool - default = false + default = true } \ No newline at end of file diff --git a/best-practices/ml-platform/.gitignore b/best-practices/ml-platform/.gitignore new file mode 100644 index 000000000..8e6c23947 --- /dev/null +++ b/best-practices/ml-platform/.gitignore @@ -0,0 +1,2 @@ +test/log/*.log +test/scripts/locks/*.lock diff --git a/best-practices/ml-platform/README.md b/best-practices/ml-platform/README.md index e339a3cff..09e7ff061 100644 --- a/best-practices/ml-platform/README.md +++ b/best-practices/ml-platform/README.md @@ -10,6 +10,8 @@ This reference architecture demonstrates how to build a GKE platform that facili - Platform admins will create a namespace per application and provide the application team member full access to it. 
- The namespace scoped resources will be created by the Application/ML teams either via [Config Sync][config-sync] or through a deployment tool like [Cloud Deploy][cloud-deploy] +For an outline of products and features used in the platform, see the [Platform Products and Features](/best-practices/ml-platform/docs/platform/products-and-features.md) document. + ## Critical User Journeys (CUJs) ### Persona : Platform Admin @@ -60,6 +62,10 @@ This reference architecture demonstrates how to build a GKE platform that facili - [Distributed Data Processing with Ray](examples/use-case/ray/dataprocessing/README.md): Run a distributed data processing job using Ray. +## Resources + +- [Packaging Jupyter notebooks](docs/notebook/packaging.md): Patterns and tools to get your ipynb's ready for deployment in a container runtime. + [gitops]: https://about.gitlab.com/topics/gitops/ [repo-sync]: https://cloud.google.com/anthos-config-management/docs/reference/rootsync-reposync-fields [root-sync]: https://cloud.google.com/anthos-config-management/docs/reference/rootsync-reposync-fields diff --git a/best-practices/ml-platform/docs/images/notebook/dockerfile.png b/best-practices/ml-platform/docs/images/notebook/dockerfile.png new file mode 100644 index 000000000..0047d69ae Binary files /dev/null and b/best-practices/ml-platform/docs/images/notebook/dockerfile.png differ diff --git a/best-practices/ml-platform/docs/images/notebook/jupyter-generate-requirements.png b/best-practices/ml-platform/docs/images/notebook/jupyter-generate-requirements.png new file mode 100644 index 000000000..ef6ebc23a Binary files /dev/null and b/best-practices/ml-platform/docs/images/notebook/jupyter-generate-requirements.png differ diff --git a/best-practices/ml-platform/docs/images/notebook/jupyter-gpt-j-online-ipynb.png b/best-practices/ml-platform/docs/images/notebook/jupyter-gpt-j-online-ipynb.png new file mode 100644 index 000000000..c07172e1b Binary files /dev/null and 
b/best-practices/ml-platform/docs/images/notebook/jupyter-gpt-j-online-ipynb.png differ diff --git a/best-practices/ml-platform/docs/images/notebook/jupyter-gpt-j-online-py.png b/best-practices/ml-platform/docs/images/notebook/jupyter-gpt-j-online-py.png new file mode 100644 index 000000000..25fbf0131 Binary files /dev/null and b/best-practices/ml-platform/docs/images/notebook/jupyter-gpt-j-online-py.png differ diff --git a/best-practices/ml-platform/docs/images/notebook/jupyter-nbconvert.png b/best-practices/ml-platform/docs/images/notebook/jupyter-nbconvert.png new file mode 100644 index 000000000..a65123f1c Binary files /dev/null and b/best-practices/ml-platform/docs/images/notebook/jupyter-nbconvert.png differ diff --git a/best-practices/ml-platform/docs/images/notebook/jupyter-pairing.png b/best-practices/ml-platform/docs/images/notebook/jupyter-pairing.png new file mode 100644 index 000000000..7762ccbd7 Binary files /dev/null and b/best-practices/ml-platform/docs/images/notebook/jupyter-pairing.png differ diff --git a/best-practices/ml-platform/docs/notebook/packaging.md b/best-practices/ml-platform/docs/notebook/packaging.md new file mode 100644 index 000000000..beb6fe181 --- /dev/null +++ b/best-practices/ml-platform/docs/notebook/packaging.md @@ -0,0 +1,93 @@ +# Packaging Jupyter notebook as deployable code + +Jupyter notebook is widely used by data scientists and machine learning experts in their day to day work to interactively and iteratively develop. However, the `ipynb` format is typically not used as a deployable or packagable artifact. There are two scenarios that notebooks are converted to deployable/package artifacts: + 1. Model training tasks needed to convert to batch jobs to scale up with more computational resources + 1. 
Model inference tasks needed to convert to an API server to serve the end-user requests + +In this guide we will showcase two different tools which may help facilitate converting your notebook to a deployable/packageable raw python library. + +This process can also be automated utilizing Continuous Integration (CI) tools such as [Cloud Build](https://cloud.google.com/build/). + +## Use jupytext to convert notebook to raw python and containerize + +1. Update the notebook to `Pair Notebook with Percent Format` + + Jupytext comes with recent jupyter notebook or jupyter-lab. In addition to just converting from `ipynb` to python, it can pair between the formats. This allows for updates made in `ipynb` to be propagated to python and vice versa. + + To pair the notebook, simply use the pair function in the File menu: + + ![jupyter-pairing](../images/notebook/jupyter-pairing.png) + + In this example we use the file [gpt-j-online.ipynb](https://github.com/GoogleCloudPlatform/ai-on-gke/blob/main/ray-on-gke/examples/notebooks/gpt-j-online.ipynb):![jupyter-gpt-j-online-ipynb](../images/notebook/jupyter-gpt-j-online-ipynb.png) + +1. After pairing, we get the generated raw python: + + ![jupyter-gpt-j-online-py](../images/notebook/jupyter-gpt-j-online-py.png) + + **NOTE**: This conversion can also be performed via the `jupytext` cli with the following command: + + ```sh + jupytext --set-formats ipynb,py:percent \ + --to py gpt-j-online.ipynb + ``` + +1. Extract the module dependencies + + In the notebook environment, users typically install required python modules using `pip install` commands, but in the container environment, these dependencies need to be installed into the container prior to executing the python library. + + We can use the `pipreqs` tool to generate the dependencies.
Add the following snippet in a new cell of your notebook and run it: + + ```sh + !pip install pipreqs + !pipreqs --scan-notebooks + ``` + + The following is an example output: + + ![jupyter-generate-requirements](../images/notebook/jupyter-generate-requirements.png) + **NOTE**: (the `!cat requirements.txt` line is an example of the generated `requirements.txt`) + +1. Create the Dockerfile + + To create the docker image of your generated raw python, we need to create a `Dockerfile`, below is an example. Replace `_THE_GENERATED_PYTHON_FILE_` with your generated python file: + + ```Dockerfile + FROM nvidia/cuda:12.2.0-runtime-ubuntu22.04 + + RUN apt-get update && \ + apt-get -y --no-install-recommends install python3-dev gcc python3-pip git && \ + rm -rf /var/lib/apt/lists/* + + COPY requirements.txt _THE_GENERATED_PYTHON_FILE_ /_THE_GENERATED_PYTHON_FILE_ + + RUN pip3 install --no-cache-dir -r requirements.txt + + ENV PYTHONUNBUFFERED 1 + + CMD python3 /_THE_GENERATED_PYTHON_FILE_ + ``` + +1. [Optional] Lint and remove unused code + + Using `pylint` to validate the generated code is a good practice. Pylint can detect unordered `import` statements, unused code and provide code readability suggestions. + + To use `pylint`, create a new cell in your notebook, run the code below and replace `_THE_GENERATED_PYTHON_FILE_` to your filename: + + ```sh + !pip install pylint + !pylint _THE_GENERATED_PYTHON_FILE_ + ``` + +## Use nbconvert to convert notebook to raw python + +We can convert a Jupyter notebook to python script using nbconvert tool. +The nbconvert tool is available inside your Jupyter notebook environment in Google Colab Enterprise. If you are in another environment and it is not available, it can be found [here](https://pypi.org/project/nbconvert/) + +1. Run the nbconvert command in your notebook. In this example, we are using `gsutil` to copy the notebook to the Colab Enterprise notebook. 
+ + ```sh + !jupyter nbconvert --to python Fine-tune-Llama-Google-Colab.ipynb + ``` + + Below is an example of the commands + ![jupyter-nbconvert](../images/notebook/jupyter-nbconvert.png) \ No newline at end of file diff --git a/best-practices/ml-platform/docs/platform/products-and-features.md b/best-practices/ml-platform/docs/platform/products-and-features.md new file mode 100644 index 000000000..27382a9bb --- /dev/null +++ b/best-practices/ml-platform/docs/platform/products-and-features.md @@ -0,0 +1,279 @@ +# Platform Products and Features + +This document outlines the products and features that are used in the platform. + +## Cloud Logging + +Cloud Logging is a real-time log-management system with storage, search, analysis, and monitoring support. Cloud Logging automatically collects logs from Google Cloud resources. You can also collect logs from your applications, on-premise resources, and resources from other cloud providers. You can also configure alerting policies so that Cloud Monitoring notifies you if certain kinds of events are reported in your logs. For regulatory or security reasons, you can determine where your log data is stored. + +For more information see the [Cloud Logging documentation](https://cloud.google.com/logging/docs/overview). + +## Cloud Monitoring + +Cloud Monitoring services can help you to understand the behavior, health, and performance of your applications and of other Google Cloud services. Cloud Monitoring automatically collects and stores performance information for most Google Cloud services. + +For more information see the [Cloud Monitoring documentation](https://cloud.google.com/monitoring/docs/monitoring-overview). + +## Identity-Aware Proxy (IAP) + +IAP lets you establish a central authorization layer for applications accessed by HTTPS, so you can use an application-level access control model instead of relying on network-level firewalls. + +IAP policies scale across your organization. 
You can define access policies centrally and apply them to all of your applications and resources. When you assign a dedicated team to create and enforce policies, you protect your project from incorrect policy definition or implementation in any application. + +For more information see the [Identity-Aware Proxy (IAP) documentation](https://cloud.google.com/iap/docs/concepts-overview). + +## Google Cloud Managed Service for Prometheus + +Google Cloud Managed Service for Prometheus is Google Cloud's fully managed, multi-cloud, cross-project solution for Prometheus metrics. It lets you globally monitor and alert on your workloads, using Prometheus, without having to manually manage and operate Prometheus at scale. + +Managed Service for Prometheus collects metrics from Prometheus exporters and lets you query the data globally using PromQL, meaning that you can keep using any existing Grafana dashboards, PromQL-based alerts, and workflows. It is hybrid- and multi-cloud compatible, can monitor Kubernetes, VMs, and serverless workloads on Cloud Run, retains data for 24 months, and maintains portability by staying compatible with upstream Prometheus. You can also supplement your Prometheus monitoring by querying over 6,500 free metrics in Cloud Monitoring, including free GKE system metrics, using PromQL. + +For more information see the [Google Cloud Managed Service for Prometheus documentation](https://cloud.google.com/stackdriver/docs/managed-prometheus). + +## Google Kubernetes Engine (GKE) + +The Google Kubernetes Engine (GKE) is a managed Kubernetes service that you can use to deploy and operate containerized applications at scale using Google's infrastructure. + +For more information see the [Google Kubernetes Engine (GKE) documentation](https://cloud.google.com/kubernetes-engine). + +### Cloud Storage FUSE CSI driver + +Filesystem in Userspace (FUSE) is an interface used to export a filesystem to the Linux kernel. 
Cloud Storage FUSE allows you to mount Cloud Storage buckets as a file system so that applications can access the objects in a bucket using common File IO operations (e.g. open, read, write, close) rather than using cloud-specific APIs. + +The Cloud Storage FUSE CSI driver lets you use the Kubernetes API to consume pre-existing Cloud Storage buckets as volumes. Your applications can upload and download objects using Cloud Storage FUSE file system semantics. The Cloud Storage FUSE CSI driver provides a fully-managed experience powered by the open source Google Cloud Storage FUSE CSI driver. + +For more information see the [Cloud Storage FUSE CSI driver documentation](https://cloud.google.com/kubernetes-engine/docs/how-to/persistent-volumes/cloud-storage-fuse-csi-driver) + +### Cluster autoscaler + +GKE's cluster autoscaler automatically resizes the number of nodes in a given node pool, based on the demands of your workloads. When demand is low, the cluster autoscaler scales back down to a minimum size that you designate. This can increase the availability of your workloads when you need it, while controlling costs. You don't need to manually add or remove nodes or over-provision your node pools. Instead, you specify a minimum and maximum size for the node pool, and the rest is automatic. + +If resources are deleted or moved when autoscaling your cluster, your workloads might experience transient disruption. For example, if your workload consists of a controller with a single replica, that replica's Pod might be rescheduled onto a different node if its current node is deleted. Before enabling cluster autoscaler, design your workloads to tolerate potential disruption or ensure that critical Pods are not interrupted. + +For more information see the [Cluster autoscaler documentation](https://cloud.google.com/kubernetes-engine/docs/concepts/cluster-autoscaler). 
+ +### Compute Engine persistent disk CSI Driver + +Google Kubernetes Engine (GKE) provides a simple way for you to automatically deploy and manage the Compute Engine persistent disk Container Storage Interface (CSI) Driver in your clusters. + +The Compute Engine persistent disk CSI Driver version is tied to the GKE version numbers and is typically the latest driver available at the time that the GKE version is released. The drivers update automatically when the cluster is upgraded to the latest GKE patch. + +For more information see the [Compute Engine persistent disk CSI Driver documentation](https://cloud.google.com/kubernetes-engine/docs/how-to/persistent-volumes/gce-pd-csi-driver) + +### Dataplane V2 + +GKE Dataplane V2 is a dataplane that is optimized for Kubernetes networking. GKE Dataplane V2 provides: + +- A consistent user experience for networking. +- Real-time visibility of network activity. +- Simpler architecture that makes it easier to manage and troubleshoot clusters. + +GKE Dataplane V2 is implemented using eBPF. As packets arrive at a GKE node, eBPF programs installed in the kernel decide how to route and process the packets. Unlike packet processing with iptables, eBPF programs can use Kubernetes-specific metadata in the packet. This lets GKE Dataplane V2 process network packets in the kernel more efficiently and report annotated actions back to user space for logging. + +For more information see the [Dataplane V2 documentation](https://cloud.google.com/kubernetes-engine/docs/concepts/dataplane-v2) + +### Dataplane V2 observability + +GKE Dataplane V2 observability provides GKE Dataplane V2 metrics and insights into Kubernetes workloads. 
With GKE Dataplane V2 observability, you can: + +- Capture, observe, and alert on network metrics using Google Cloud Managed Service for Prometheus and Cloud Monitoring with Metrics Explorer +- Understand traffic flows for a particular Service in a cluster +- Understand and identify issues with the network health of a Kubernetes workload +- Verify Kubernetes Network Policies + +GKE Dataplane V2 observability offers the following troubleshooting tools: + +- A Kubernetes cluster Network Topology +- A Kubernetes Network Policy verdict table with live traffic flows and connection information +- Command-line tooling for troubleshooting Kubernetes traffic flows + +For more information see the [Dataplane V2 observability documentation](https://cloud.google.com/kubernetes-engine/docs/concepts/about-dpv2-observability) + +### Filestore CSI driver + +The Filestore CSI driver is the primary way to use Filestore instances with GKE. The CSI driver provides a fully-managed experience powered by the open source Google Cloud Filestore CSI driver. + +The CSI driver version is tied to Kubernetes minor version numbers and is typically the latest driver available at the time that the Kubernetes minor version is released. The drivers update automatically when the cluster is upgraded to the latest GKE patch. + +For more information see the [Filestore CSI driver](https://cloud.google.com/kubernetes-engine/docs/how-to/persistent-volumes/filestore-csi-driver) + +### Gateway + +The GKE Gateway controller is Google's implementation of the Kubernetes Gateway API for Cloud Load Balancing. Similar to the GKE Ingress controller, the Gateway controller watches a Kubernetes API for Gateway API resources and reconciles Cloud Load Balancing resources to implement the networking behavior specified by the Gateway resources. + +There are two versions of the GKE Gateway controller: + +- Single-cluster: manages single-cluster Gateways for a single GKE cluster.
+- Multi-cluster: manages multi-cluster Gateways for one or more GKE clusters. + +Both Gateway controllers are Google-hosted controllers that watch the Kubernetes API for GKE clusters. Unlike the GKE Ingress controller, the Gateway controllers are not hosted on GKE control planes or in the user project, enabling them to be more scalable and robust. Both Gateway controllers are Generally Available. + +The Gateway controllers themselves are not a networking data plane and they do not process any traffic. They sit out of band from traffic and manage various data planes that process traffic. + +For more information see the [Gateway documentation](https://cloud.google.com/kubernetes-engine/docs/concepts/gateway-api) + +### Google Virtual NIC (gVNIC) + +Google Virtual NIC (gVNIC) is a virtual network interface designed specifically for Compute Engine. gVNIC is an alternative to the virtIO-based ethernet driver. + +As the next generation network interface which succeeds VirtIO, gVNIC replaces VirtIO-Net as the only supported network interface in Compute Engine for all new machine types (Generation 3 and onwards). Newer machine series and networking features require gVNIC instead of VirtIO. Consuming gVNIC as the modern I/O interface with Compute Engine VMs offers the following advantages: + +- Provides better performance. +- Improves consistency by reducing noisy neighbor problems. +- Introduces new network capabilities beyond what VirtIO is capable of. + +gVNIC is supported and recommended on all machine families, machine types, and generations. 
+ +gVNIC is required to achieve the following maximum bandwidth rates: + +- 50 to 200 Gbps bandwidth with VMs that support per VM Tier_1 networking performance +- 50 to 1,000 Gbps bandwidth with VMs that have attached GPUs + +For more information see the [Google Virtual NIC (gVNIC) documentation](https://cloud.google.com/kubernetes-engine/docs/how-to/using-gvnic) + +### Image streaming + +Image streaming is a method of pulling container images in which GKE streams data from eligible images as requested by your applications. You can use Image streaming to allow your workloads to initialize without waiting for the entire image to download, which leads to significant improvements in initialization times. The shortened pull time provides you with benefits including the following: + +- Faster autoscaling +- Reduced latency when pulling large images +- Faster Pod startup + +With Image streaming, GKE uses a remote filesystem as the root filesystem for any containers that use eligible container images. GKE streams image data from the remote filesystem as needed by your workloads. Without Image streaming, GKE downloads the entire container image onto each node and uses it as the root filesystem for your workloads. + +While streaming the image data, GKE downloads the entire container image onto the local disk in the background and caches it. GKE then serves future data read requests from the cached image. + +When you deploy workloads that need to read specific files in the container image, the Image streaming backend serves only those requested files. + +To use image streaming, your container images must be stored in Artifact Registry. + +For more information see the [Image streaming documentation](https://cloud.google.com/kubernetes-engine/docs/how-to/image-streaming) + +### Node auto-provisioning (NAP) + +Node auto-provisioning automatically manages and scales a set of node pools on the user's behalf. 
Without node auto-provisioning, the GKE cluster autoscaler creates nodes only from user-created node pools. With node auto-provisioning, GKE automatically creates and deletes node pools. + +For more information see the [Node auto-provisioning (NAP) documentation](https://cloud.google.com/kubernetes-engine/docs/concepts/node-auto-provisioning) + +### Observability + +Observability is key to understand the health of your applications and maintain application availability and reliability. + +When you create a GKE cluster, Cloud Logging, Cloud Monitoring and Google Cloud Managed Service for Prometheus provide observability specifically tailored for Kubernetes. + +- Use the built-in dashboards to view default metrics and logs, and to set up recommended alerts. +- Enable additional observability packages to monitor Kubernetes components and objects and use collected data for debugging and troubleshooting. +- Configure data collection for third-party applications running on your clusters. +- Define your own metrics, dashboards, and alerts to meet your needs. + +In addition to the integration with Cloud Logging and Cloud Monitoring, GKE also provides other features to help you observe and maintain the health of your applications. + +For more information see the [Observability for GKE documentation](https://cloud.google.com/kubernetes-engine/docs/concepts/observability) + +### Private cluster + +Private clusters use nodes that don't have external IP addresses. This means that clients on the internet cannot connect to the IP addresses of the nodes. Private clusters are ideal for workloads that require controlled access due to data privacy and security regulations. 
+ +For more information see the [Private cluster documentation](https://cloud.google.com/kubernetes-engine/docs/concepts/private-cluster-concept) + +### Release channels + +Use release channels for Google Kubernetes Engine (GKE) to pick versions for your clusters with your chosen balance between feature availability and stability. + +GKE automatically upgrades all clusters over time, including those not enrolled in a release channel, to ensure that they receive security updates, fixes to known issues, new features, and run a supported Kubernetes version. You can control the timing of upgrades with maintenance windows and exclusions. + +For more information see the [Release channels documentation](https://cloud.google.com/kubernetes-engine/docs/concepts/release-channels) + +### Security posture dashboard + +The security posture dashboard provides insights about your workload security posture at the runtime phase of the software delivery lifecycle. To gain comprehensive coverage of your applications throughout the lifecycle from source control to maintenance, we recommend that you use the dashboard with other security tooling. For more details about the available tooling and for best practices to safeguard your applications from end to end, see [Protect your software supply chain](https://cloud.google.com/software-supply-chain-security/docs/practices). + +For more information see the [Security posture dashboard documentation](https://cloud.google.com/kubernetes-engine/docs/concepts/about-security-posture-dashboard) + +### Shielded GKE nodes + +Shielded GKE Nodes are built on top of [Compute Engine Shielded VMs](https://cloud.google.com/compute/shielded-vm/docs/shielded-vm). Without Shielded GKE Nodes an attacker can exploit a vulnerability in a Pod to exfiltrate bootstrap credentials and impersonate nodes in your cluster, giving the attackers access to cluster secrets.
When Shielded GKE Nodes is enabled, the GKE control plane cryptographically verifies that: + +- Every node in your cluster is a virtual machine running in Google's data center. +- Every node is part of the Managed Instance Group (MIG) provisioned for the cluster. +- The kubelet is being provisioned a certificate for the node on which it is running. + +This limits the ability of an attacker to impersonate a node in your cluster even if they are able to exfiltrate bootstrap credentials of the node. + +For more information see the [Shielded GKE nodes documentation](https://cloud.google.com/kubernetes-engine/docs/how-to/shielded-gke-nodes) + +#### Integrity monitoring + +Integrity monitoring helps you understand and make decisions about the state of your VM instances. + +Integrity monitoring relies on the measurements created by Measured Boot, which use platform configuration registers (PCRs) to store information about the components and component load order of both the integrity policy baseline (a known good boot sequence), and the most recent boot sequence. + +Integrity monitoring compares the most recent boot measurements to the integrity policy baseline and returns a pair of pass/fail results depending on whether they match or not, one for the early boot sequence and one for the late boot sequence. Early boot is the boot sequence from the start of the UEFI firmware until it passes control to the bootloader. Late boot is the boot sequence from the bootloader until it passes control to the operating system kernel. If either part of the most recent boot sequence doesn't match the baseline, you get an integrity validation failure. + +If the failure is expected, for example if you applied a system update on that VM instance, you should update the integrity policy baseline. Updating the integrity policy baseline sets the baseline to the measurements captured from the most recent boot sequence. 
If it is not expected, you should stop that VM instance and investigate the reason for the failure. + +You can view integrity reports in Cloud Monitoring, and set alerts on integrity failures. You can review the details of integrity monitoring results in Cloud Logging. For more information, see Monitoring integrity on Shielded VM instances. + +For more information see the [Integrity monitoring documentation](https://cloud.google.com/compute/shielded-vm/docs/shielded-vm?hl=en#integrity-monitoring) + +#### Secure boot + +Secure Boot helps ensure that the system only runs authentic software by verifying the digital signature of all boot components, and halting the boot process if signature verification fails. + +Shielded VM instances run firmware which is signed and verified using Google's Certificate Authority, ensuring that the instance's firmware is unmodified and establishing the root of trust for Secure Boot. The Unified Extensible Firmware Interface (UEFI) 2.3.1 firmware, securely manages the certificates that contain the keys used by the software manufacturers to sign the system firmware, the system boot loader, and any binaries they load. Shielded VM instances use UEFI firmware. + +On each boot, the UEFI firmware verifies the digital signature of each boot component against the secure store of approved keys. Any boot component that isn't properly signed, or isn't signed at all, isn't allowed to run. + +If this occurs, the VM instance's serial console log will have an entry containing the strings `UEFI: Failed to load image` and `Status: Security Violation`, along with a description of the boot option that failed. 
+ +For more information see the [Secure boot documentation](https://cloud.google.com/compute/shielded-vm/docs/shielded-vm?hl=en#secure-boot) + +### Workload Identity Federation + +Workload Identity Federation for GKE is the recommended way for your workloads running on Google Kubernetes Engine (GKE) to access Google Cloud services in a secure and manageable way. It is available through IAM Workload Identity Federation, which provides identities for workloads that run in environments inside and outside Google Cloud. In GKE, Google Cloud manages the workload identity pool and provider for you and doesn't require an external identity provider. + +For more information see the [Workload Identity Federation for GKE documentation](https://cloud.google.com/kubernetes-engine/docs/concepts/workload-identity) + +### Workload vulnerability scanning + +Workload vulnerability scanning is a set of capabilities in the security posture dashboard that automatically scans for known vulnerabilities in your container images and in specific language packages during the runtime phase of the software delivery lifecycle. If GKE detects vulnerabilities, the security posture dashboard displays details about the issues and provides actionable remediation steps to mitigate the vulnerabilities. + +For more information see the [Workload vulnerability scanning documentation](https://cloud.google.com/kubernetes-engine/docs/concepts/about-workload-vulnerability-scanning) + +## GKE Enterprise + +GKE Enterprise is Google's cloud-centric container platform for running modern apps anywhere consistently at scale. + +For more information see the [GKE Enterprise documentation](https://cloud.google.com/kubernetes-engine/enterprise/docs/concepts/overview) + +### Compliance dashboard + +The GKE Compliance dashboard in the Google Cloud console provides actionable insights to strengthen your security posture. 
+ +For more information see the [GKE Compliance dashboard documentation](https://cloud.google.com/kubernetes-engine/fleet-management/docs/about-compliance-dashboard) + +### Config Sync + +Config Sync is a GitOps service built on an open source core that lets cluster operators and platform administrators deploy configurations from a source of truth. The service has the flexibility to support one or many clusters and any number of repositories per cluster or namespace. The clusters can be in a hybrid or multi-cloud environment. + +For more information see the [Config Sync documentation](https://cloud.google.com/kubernetes-engine/enterprise/config-sync/docs/overview) + +### Connect gateway + +The Connect gateway builds on the power of fleets to let users connect to and run commands against fleet member clusters in a simple, consistent, and secured way, whether the clusters are on Google Cloud, other public clouds, or on premises, and makes it easier to automate DevOps processes across all your clusters. + +By default the Connect gateway uses your Google ID to authenticate to clusters, with support for third party identity providers using workforce identity federation, and with group-based authentication support via GKE Identity Service. + +For more information see the [Connect gateway documentation](https://cloud.google.com/kubernetes-engine/enterprise/multicluster-management/gateway) + +### Fleet Management + +Fleet management offers a set of capabilities that helps you and your organization manage clusters, infrastructure, and workloads, on Google Cloud and across public cloud and on-premises environments. These capabilities are all built around the idea of the `fleet`: a logical grouping of Kubernetes clusters and other resources that can be managed together. Fleets are managed by the Fleet service, also known as the Hub service. 
+ +For more information see the [Fleet management documentation](https://cloud.google.com/kubernetes-engine/fleet-management/docs) + +### Policy Controller + +Policy Controller enables the application and enforcement of programmable policies for your Kubernetes clusters. These policies act as guardrails and can help with best practices, security, and compliance management of your clusters and fleet. Based on the open source Open Policy Agent Gatekeeper project, Policy Controller is fully integrated with Google Cloud, includes a built-in dashboard for observability, and comes with a full library of pre-built policies for common security and compliance controls. + +For more information see the [Policy Controller documentation](https://cloud.google.com/kubernetes-engine/enterprise/policy-controller/docs/overview) diff --git a/best-practices/ml-platform/examples/platform/playground/README.md b/best-practices/ml-platform/examples/platform/playground/README.md index 7799cb03e..d40b57da4 100644 --- a/best-practices/ml-platform/examples/platform/playground/README.md +++ b/best-practices/ml-platform/examples/platform/playground/README.md @@ -8,6 +8,8 @@ This quick-start deployment guide can be used to set up an environment to famili For more information about the architecture, see the [Playground Machine learning platform (MLP) on GKE: Architecture](/best-practices/ml-platform/docs/platform/playground/architecture.md) document. + +For an outline of products and features used in the platform, see the [Platform Products and Features](/best-practices/ml-platform/docs/platform/products-and-features.md) document. + ## Requirements ### Project @@ -101,26 +103,27 @@ The default quota given to a project should be sufficient for this guide.
nano ${HOME}/secrets/mlp-github-token ``` -- Set the GitHub environment variables in Cloud Shell +- Set the Git environment variables in Cloud Shell Replace the following values: - - `` is the GitHub organization or user namespace to use for the repositories - - `` is the GitHub account to use for authentication - - `` is the email address to use for commit + - `` is the GitHub organization or user namespace to use for the repositories + - `` is the email address to use for commit + - `` is the GitHub account to use for authentication ``` - export MLP_GITHUB_ORG="" - export MLP_GITHUB_USER="" - export MLP_GITHUB_EMAIL="" + export MLP_GIT_NAMESPACE="" + export MLP_GIT_USER_EMAIL="" + export MLP_GIT_USER_NAME="" + ``` - Set the configuration variables ``` - sed -i "s/YOUR_GITHUB_EMAIL/${MLP_GITHUB_EMAIL}/g" ${MLP_TYPE_BASE_DIR}/mlp.auto.tfvars - sed -i "s/YOUR_GITHUB_ORG/${MLP_GITHUB_ORG}/g" ${MLP_TYPE_BASE_DIR}/mlp.auto.tfvars - sed -i "s/YOUR_GITHUB_USER/${MLP_GITHUB_USER}/g" ${MLP_TYPE_BASE_DIR}/mlp.auto.tfvars + sed -i "s/YOUR_GIT_NAMESPACE/${MLP_GIT_NAMESPACE}/g" ${MLP_TYPE_BASE_DIR}/mlp.auto.tfvars + sed -i "s/YOUR_GIT_USER_EMAIL/${MLP_GIT_USER_EMAIL}/g" ${MLP_TYPE_BASE_DIR}/mlp.auto.tfvars + sed -i "s/YOUR_GIT_USER_NAME/${MLP_GIT_USER_NAME}/g" ${MLP_TYPE_BASE_DIR}/mlp.auto.tfvars ``` ## Project Configuration @@ -176,7 +179,8 @@ You can now deploy the platform with Terraform in the [next section](#run-terraf ``` ``` - environment_name = "dev" + environment_name = "dev" + iap_support_email = "" project = { billing_account_id = "XXXXXX-XXXXXX-XXXXXX" folder_id = "############" @@ -186,6 +190,7 @@ You can now deploy the platform with Terraform in the [next section](#run-terraf ``` - `environment_name`: the name of the environment + - `iap_support_email`: the email to use as the support contact for the IAP brand - `project.billing_account_id`: the billing account ID - `project.name`: the prefix for the display name of the project, the full name will be `-` @@ 
-230,9 +235,9 @@ For more information on IAP, see the [Identity-Aware Proxy documentation](https: For this guide we will configure a generic OAuth consent screen setup for internal use. Internal use means that only users within your organization can be granted IAM permissions to access the IAP secured applications and resource. -See the [Configuring the OAuth consent screen documenation](https://developers.google.com/workspace/guides/configure-oauth-consent) for additional information +See the [Configuring the OAuth consent screen documentation](https://developers.google.com/workspace/guides/configure-oauth-consent) for additional information -**NOTE: These steps only need to be completed once for a project.** +**NOTE: These steps only need to be completed once for a project. If you are using the Terraform managed project option, this has already been completed for you.** - Go to [APIs & Services](https://console.cloud.google.com/apis/dashboard?) > [OAuth consent screen](https://console.cloud.google.com/apis/credentials/consent) configuration page. 
- Select **Internal** for the **User Type** @@ -254,20 +259,20 @@ For simplicity, in this guide access to the IAP secured applications will be con - Set the IAP allow domain ``` - IAP_DOMAIN=$(gcloud auth list --filter=status:ACTIVE --format="value(account)" | awk -F@ '{print $2}') - echo "IAP_DOMAIN=${IAP_DOMAIN}" + MLP_IAP_DOMAIN=$(gcloud auth list --filter=status:ACTIVE --format="value(account)" | awk -F@ '{print $2}') + echo "MLP_IAP_DOMAIN=${MLP_IAP_DOMAIN}" ``` - **If the domain of the active `gcloud` user is different from the organization that the `MLP_PROJECT_ID` project is in, you will need to manually set `IAP_DOMAIN` environment variable** + **If the domain of the active `gcloud` user is different from the organization that the `MLP_PROJECT_ID` project is in, you will need to manually set `MLP_IAP_DOMAIN` environment variable** ``` - IAP_DOMAIN= + MLP_IAP_DOMAIN= ``` - Set the IAP domain in the configuration file ``` - sed -i '/^iap_domain[[:blank:]]*=/{h;s/=.*/= "'"${IAP_DOMAIN}"'"/};${x;/^$/{s//iap_domain = "'"${IAP_DOMAIN}"'"/;H};x}' ${MLP_TYPE_BASE_DIR}/mlp.auto.tfvars + sed -i '/^iap_domain[[:blank:]]*=/{h;s/=.*/= "'"${MLP_IAP_DOMAIN}"'"/};${x;/^$/{s//iap_domain = "'"${MLP_IAP_DOMAIN}"'"/;H};x}' ${MLP_TYPE_BASE_DIR}/mlp.auto.tfvars ``` ## Create the resources @@ -281,8 +286,9 @@ Before running Terraform, make sure that the Service Usage API is enable. 
- Ensure the endpoint is not in a deleted state ``` + MLP_ENVIRONMENT_NAME=$(grep environment_name ${MLP_TYPE_BASE_DIR}/mlp.auto.tfvars | awk -F"=" '{print $2}' | xargs) MLP_PROJECT_ID=$(grep environment_project_id ${MLP_TYPE_BASE_DIR}/mlp.auto.tfvars | awk -F"=" '{print $2}' | xargs) - gcloud endpoints services undelete ray-dashboard.ml-team.mlp.endpoints.${MLP_PROJECT_ID}.cloud.goog --quiet 2>/dev/null + gcloud endpoints services undelete ray-dashboard.ml-team.mlp-${MLP_ENVIRONMENT_NAME}.endpoints.${MLP_PROJECT_ID}.cloud.goog --quiet 2>/dev/null ``` - Create the resources @@ -290,7 +296,7 @@ Before running Terraform, make sure that the Service Usage API is enable. ``` cd ${MLP_TYPE_BASE_DIR} && \ terraform init && \ - terraform plan -input=false -var github_token="$(tr --delete '\n' < ${HOME}/secrets/mlp-github-token)" -out=tfplan && \ + terraform plan -input=false -var git_token="$(tr --delete '\n' < ${HOME}/secrets/mlp-github-token)" -out=tfplan && \ terraform apply -input=false tfplan && \ rm tfplan ``` @@ -326,7 +332,7 @@ Open Cloud Shell to execute the following commands: ``` Starting to build Gateway kubeconfig... Current project_id: mlops-platform-417609 - A new kubeconfig entry "connectgateway_mlops-platform-417609_global_gke-ml-dev" has been generated and set as the current context. + A new kubeconfig entry "connectgateway_mlops-platform-417609_global_mlp-dev" has been generated and set as the current context. 
``` - Fetch KubeRay operator CRDs @@ -404,8 +410,9 @@ Open Cloud Shell to execute the following commands: - Open the `ml-team` Ray dashboard ``` + MLP_ENVIRONMENT_NAME=$(grep environment_name ${MLP_TYPE_BASE_DIR}/mlp.auto.tfvars | awk -F"=" '{print $2}' | xargs) MLP_PROJECT_ID=$(grep environment_project_id ${MLP_TYPE_BASE_DIR}/mlp.auto.tfvars | awk -F"=" '{print $2}' | xargs) - echo -e "\nml-team Ray dashboard: https://ray-dashboard.ml-team.mlp.endpoints.${MLP_PROJECT_ID}.cloud.goog\n" + echo -e "\nml-team Ray dashboard: https://ray-dashboard.ml-team.mlp-${MLP_ENVIRONMENT_NAME}.endpoints.${MLP_PROJECT_ID}.cloud.goog\n" ``` > If you get `ERR_CONNECTION_CLOSED` or `ERR_CONNECTION_RESET` when trying to go to the Ray dashboard, the [Gateway](https://console.cloud.google.com/kubernetes/gateways) is still being provisioned. Retry in a couple of minutes. @@ -421,7 +428,7 @@ Open Cloud Shell to execute the following commands: ``` cd ${MLP_TYPE_BASE_DIR} && \ terraform init && \ - terraform destroy -auto-approve -var github_token="$(tr --delete '\n' < ${HOME}/secrets/mlp-github-token)" && \ + terraform destroy -auto-approve -var git_token="$(tr --delete '\n' < ${HOME}/secrets/mlp-github-token)" && \ rm -rf .terraform .terraform.lock.hcl ``` @@ -501,7 +508,7 @@ The OAuth Consent screen was not configured, see the [Configure OAuth consent sc --- ``` -│ Error: googleapi: Error 400: Service ray-dashboard.ml-team.mlp.endpoints..cloud.goog has been deleted and +│ Error: googleapi: Error 400: Service ray-dashboard.ml-team.mlp-.endpoints..cloud.goog has been deleted and will be purged after 30 days. To reuse this service, please undelete the service following https://cloud.google.com/service-infrastructure/docs/create-services#undeleting., failedPrecondition │ │ with google_endpoints_service.ray_dashboard_https, @@ -513,8 +520,9 @@ will be purged after 30 days. 
To reuse this service, please undelete the service The endpoint is in a deleted state and needs to be undeleted, run the following command and then rerun the Terraform apply. ``` +MLP_ENVIRONMENT_NAME=$(grep environment_name ${MLP_TYPE_BASE_DIR}/mlp.auto.tfvars | awk -F"=" '{print $2}' | xargs) MLP_PROJECT_ID=$(grep environment_project_id ${MLP_TYPE_BASE_DIR}/mlp.auto.tfvars | awk -F"=" '{print $2}' | xargs) -gcloud endpoints services undelete ray-dashboard.ml-team.mlp.endpoints.${MLP_PROJECT_ID}.cloud.goog --quiet +gcloud endpoints services undelete ray-dashboard.ml-team.mlp-${MLP_ENVIRONMENT_NAME}.endpoints.${MLP_PROJECT_ID}.cloud.goog --quiet ``` --- diff --git a/best-practices/ml-platform/examples/platform/playground/configsync_repository_github.tf b/best-practices/ml-platform/examples/platform/playground/configsync_repository_github.tf new file mode 100644 index 000000000..50d951700 --- /dev/null +++ b/best-practices/ml-platform/examples/platform/playground/configsync_repository_github.tf @@ -0,0 +1,31 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +locals { + configsync_repository = module.configsync_repository + git_repository = replace(local.configsync_repository.html_url, "/https*:\\/\\//", "") +} + +module "configsync_repository" { + source = "../../../terraform/modules/github_repository" + + branches = { + default = "main" + names = ["main"] + } + description = "MLP Config Sync repository for ${var.environment_name} environment" + name = "${var.configsync_repo_name}-${var.environment_name}" + owner = var.git_namespace + token = var.git_token +} diff --git a/best-practices/ml-platform/examples/platform/playground/configsync_repository_gitlab.tf.ignore b/best-practices/ml-platform/examples/platform/playground/configsync_repository_gitlab.tf.ignore new file mode 100644 index 000000000..6a270f4d8 --- /dev/null +++ b/best-practices/ml-platform/examples/platform/playground/configsync_repository_gitlab.tf.ignore @@ -0,0 +1,31 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +locals { + configsync_repository = module.configsync_repository + git_repository = replace(local.configsync_repository.html_url, "/https*:\\/\\//", "") +} + +module "configsync_repository" { + source = "../../../terraform/modules/gitlab_project" + + branches = { + default = "main" + names = ["main"] + } + description = "MLP Config Sync repository for ${var.environment_name} environment" + group_full_path = var.git_namespace + project_name = "${var.configsync_repo_name}-${var.environment_name}" + token = var.git_token +} diff --git a/best-practices/ml-platform/examples/platform/playground/container_cluster.tf b/best-practices/ml-platform/examples/platform/playground/container_cluster.tf new file mode 100644 index 000000000..cf56b98b2 --- /dev/null +++ b/best-practices/ml-platform/examples/platform/playground/container_cluster.tf @@ -0,0 +1,267 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +locals { + # Minimal roles for nodepool SA https://cloud.google.com/kubernetes-engine/docs/how-to/hardening-your-cluster#use_least_privilege_sa + cluster_sa_roles = [ + "roles/monitoring.viewer", + "roles/monitoring.metricWriter", + "roles/logging.logWriter", + "roles/stackdriver.resourceMetadata.writer", + "roles/autoscaling.metricsWriter", + "roles/artifactregistry.reader", + "roles/serviceusage.serviceUsageConsumer" + ] +} + +# Create dedicated service account for node pools +resource "google_service_account" "cluster" { + project = data.google_project.environment.project_id + account_id = "vm-${var.cluster_name}-${var.environment_name}" + display_name = "${var.cluster_name}-${var.environment_name} Service Account" + description = "Terraform-managed service account for cluster ${var.cluster_name}-${var.environment_name}" +} + +# Bind minimum role list + additional roles to nodepool SA on project +resource "google_project_iam_member" "cluster_sa" { + for_each = toset(local.cluster_sa_roles) + project = data.google_project.environment.project_id + member = google_service_account.cluster.member + role = each.value +} + +resource "google_container_cluster" "mlp" { + provider = google-beta + + datapath_provider = "ADVANCED_DATAPATH" + deletion_protection = false + enable_shielded_nodes = true + location = var.subnet_01_region + name = "${var.cluster_name}-${var.environment_name}" + network = module.create-vpc.vpc + project = data.google_project.environment.project_id + remove_default_node_pool = false + subnetwork = module.create-vpc.subnet-1 + + addons_config { + gcp_filestore_csi_driver_config { + enabled = true + } + + gcs_fuse_csi_driver_config { + enabled = true + } + + gce_persistent_disk_csi_driver_config { + enabled = true + } + } + + cluster_autoscaling { + autoscaling_profile = "OPTIMIZE_UTILIZATION" + enabled = true + + auto_provisioning_defaults { + disk_type = "pd-balanced" + oauth_scopes = [ + "https://www.googleapis.com/auth/cloud-platform" + ] + 
service_account = google_service_account.cluster.email + + management { + auto_repair = true + auto_upgrade = true + } + + shielded_instance_config { + enable_integrity_monitoring = true + enable_secure_boot = true + } + + upgrade_settings { + max_surge = 0 + max_unavailable = 1 + strategy = "SURGE" + } + } + + resource_limits { + resource_type = "cpu" + minimum = 4 + maximum = 1024 + } + + resource_limits { + resource_type = "memory" + minimum = 16 + maximum = 4096 + } + + resource_limits { + resource_type = "nvidia-a100-80gb" + maximum = 32 + } + + resource_limits { + resource_type = "nvidia-l4" + maximum = 32 + } + + resource_limits { + resource_type = "nvidia-tesla-t4" + maximum = 256 + } + + resource_limits { + resource_type = "nvidia-tesla-a100" + maximum = 64 + } + + resource_limits { + resource_type = "nvidia-tesla-k80" + maximum = 32 + } + + resource_limits { + resource_type = "nvidia-tesla-p4" + maximum = 32 + } + + resource_limits { + resource_type = "nvidia-tesla-p100" + maximum = 32 + } + + resource_limits { + resource_type = "nvidia-tesla-v100" + maximum = 32 + } + } + + gateway_api_config { + channel = "CHANNEL_STANDARD" + } + + ip_allocation_policy { + } + + lifecycle { + ignore_changes = [ + node_pool + ] + } + + logging_config { + enable_components = [ + "APISERVER", + "CONTROLLER_MANAGER", + "SCHEDULER", + "SYSTEM_COMPONENTS", + "WORKLOADS" + ] + } + + master_authorized_networks_config { + cidr_blocks { + cidr_block = var.subnet_01_ip + display_name = "vpc-cidr" + } + } + + monitoring_config { + advanced_datapath_observability_config { + enable_metrics = true + } + + enable_components = [ + "APISERVER", + "CONTROLLER_MANAGER", + "DAEMONSET", + "DEPLOYMENT", + "HPA", + "POD", + "SCHEDULER", + "STATEFULSET", + "STORAGE", + "SYSTEM_COMPONENTS" + ] + + managed_prometheus { + enabled = true + } + } + + node_pool { + initial_node_count = 1 + name = "system" + + autoscaling { + location_policy = "BALANCED" + total_max_node_count = 32 + 
total_min_node_count = 1 + } + + network_config { + enable_private_nodes = true + } + + node_config { + machine_type = "e2-standard-4" + service_account = google_service_account.cluster.email + oauth_scopes = [ + "https://www.googleapis.com/auth/cloud-platform" + ] + + gcfs_config { + enabled = true + } + + shielded_instance_config { + enable_integrity_monitoring = true + enable_secure_boot = true + } + } + } + + node_pool_defaults { + node_config_defaults { + gcfs_config { + enabled = true + } + } + } + + private_cluster_config { + enable_private_nodes = true + enable_private_endpoint = true + master_ipv4_cidr_block = "172.16.0.32/28" + } + + release_channel { + channel = "RAPID" + } + + secret_manager_config { + enabled = true + } + + security_posture_config { + mode = "BASIC" + vulnerability_mode = "VULNERABILITY_ENTERPRISE" + } + + workload_identity_config { + workload_pool = "${data.google_project.environment.project_id}.svc.id.goog" + } +} diff --git a/best-practices/ml-platform/examples/platform/playground/container_node_pool.tf b/best-practices/ml-platform/examples/platform/playground/container_node_pool.tf new file mode 100644 index 000000000..2abe2e1b8 --- /dev/null +++ b/best-practices/ml-platform/examples/platform/playground/container_node_pool.tf @@ -0,0 +1,625 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# CPU +# Available zones: https://cloud.google.com/compute/docs/regions-zones#available +############################################################################### +resource "google_container_node_pool" "cpu_n4s8" { + depends_on = [google_container_cluster.mlp] + + # Variables + cluster = google_container_cluster.mlp.name + initial_node_count = 1 + location = var.subnet_01_region + name = "cpu-n4s8" + node_locations = [ + "${var.subnet_01_region}-a", + "${var.subnet_01_region}-c" + ] + project = data.google_project.environment.project_id + + # Blocks + autoscaling { + location_policy = "BALANCED" + total_max_node_count = 32 + total_min_node_count = 1 + } + + network_config { + enable_private_nodes = true + } + + node_config { + # Variables + labels = { + "resource-model" : "n4" + "resource-type" : "cpu" + } + machine_type = "n4-standard-8" + service_account = google_service_account.cluster.email + oauth_scopes = [ + "https://www.googleapis.com/auth/cloud-platform" + ] + + # Blocks + gcfs_config { + enabled = true + } + + shielded_instance_config { + enable_integrity_monitoring = true + enable_secure_boot = true + } + + taint { + effect = "NO_SCHEDULE" + key = "on-demand" + value = true + } + } + + lifecycle { + ignore_changes = [ + initial_node_count, + node_config[0].labels, + node_config[0].taint, + ] + } + + timeouts { + create = "30m" + update = "20m" + } +} + + + +# GPU +# Available zones: https://cloud.google.com/compute/docs/gpus/gpu-regions-zones#view-using-table +############################################################################### +resource "google_container_node_pool" "gpu_a100x2_a2h2" { + depends_on = [google_container_cluster.mlp] + + # Variables + cluster = google_container_cluster.mlp.name + location = var.subnet_01_region + name = "gpu-a100x2-a2h2" + node_locations = [ + "${var.subnet_01_region}-a", + "${var.subnet_01_region}-b", + "${var.subnet_01_region}-c", + "${var.subnet_01_region}-f" + ] + project = 
data.google_project.environment.project_id + + # Blocks + autoscaling { + location_policy = "ANY" + total_max_node_count = 1000 + total_min_node_count = 0 + } + + lifecycle { + ignore_changes = [ + node_config[0].labels, + node_config[0].taint, + ] + } + + network_config { + enable_private_nodes = true + } + + node_config { + # Variables + labels = { + "resource-model" : "a100" + "resource-type" : "gpu" + } + machine_type = "a2-highgpu-2g" + service_account = google_service_account.cluster.email + oauth_scopes = [ + "https://www.googleapis.com/auth/cloud-platform" + ] + + # Blocks + gcfs_config { + enabled = true + } + + guest_accelerator { + count = 2 + type = "nvidia-tesla-a100" + + gpu_driver_installation_config { + gpu_driver_version = var.gpu_driver_version + } + } + + gvnic { + enabled = true + } + + reservation_affinity { + consume_reservation_type = "NO_RESERVATION" + } + + shielded_instance_config { + enable_integrity_monitoring = true + enable_secure_boot = true + } + + taint { + effect = "NO_SCHEDULE" + key = "on-demand" + value = true + } + } + + timeouts { + create = "30m" + update = "20m" + } +} + +############################################################################### + +resource "google_container_node_pool" "gpu_a100x2_a2h2_dws" { + depends_on = [google_container_cluster.mlp] + + # Variables + cluster = google_container_cluster.mlp.name + location = var.subnet_01_region + name = "gpu-a100x2-a2h2-dws" + node_locations = [ + "${var.subnet_01_region}-a", + "${var.subnet_01_region}-b", + "${var.subnet_01_region}-c", + "${var.subnet_01_region}-f" + ] + project = data.google_project.environment.project_id + + # Blocks + autoscaling { + location_policy = "ANY" + total_max_node_count = 1000 + total_min_node_count = 0 + } + + lifecycle { + ignore_changes = [ + node_config[0].labels, + node_config[0].taint, + ] + } + + network_config { + enable_private_nodes = true + } + + node_config { + # Variables + labels = { + "resource-model" : "a100" + 
"resource-type" : "gpu" + } + machine_type = "a2-highgpu-2g" + service_account = google_service_account.cluster.email + oauth_scopes = [ + "https://www.googleapis.com/auth/cloud-platform" + ] + + # Blocks + gcfs_config { + enabled = true + } + + guest_accelerator { + count = 2 + type = "nvidia-tesla-a100" + + gpu_driver_installation_config { + gpu_driver_version = var.gpu_driver_version + } + } + + gvnic { + enabled = true + } + + reservation_affinity { + consume_reservation_type = "NO_RESERVATION" + } + + shielded_instance_config { + enable_integrity_monitoring = true + enable_secure_boot = true + } + + taint { + effect = "NO_SCHEDULE" + key = "on-demand" + value = true + } + } + + queued_provisioning { + enabled = true + } + + timeouts { + create = "30m" + update = "20m" + } +} + +############################################################################### + +resource "google_container_node_pool" "gpu_h100x8_a3h8_dws" { + depends_on = [google_container_cluster.mlp] + + # Variables + cluster = google_container_cluster.mlp.name + location = var.subnet_01_region + name = "gpu-h100x8-a3h8-dws" + node_locations = [ + "${var.subnet_01_region}-a", + "${var.subnet_01_region}-c" + ] + project = data.google_project.environment.project_id + + # Blocks + autoscaling { + location_policy = "ANY" + total_max_node_count = 1000 + total_min_node_count = 0 + } + + lifecycle { + ignore_changes = [ + node_config[0].labels, + node_config[0].taint, + ] + } + + network_config { + enable_private_nodes = true + } + + node_config { + # Variables + labels = { + "resource-model" : "h100" + "resource-type" : "gpu" + } + machine_type = "a3-highgpu-8g" + service_account = google_service_account.cluster.email + oauth_scopes = [ + "https://www.googleapis.com/auth/cloud-platform" + ] + + # Blocks + ephemeral_storage_local_ssd_config { + local_ssd_count = 16 + } + + gcfs_config { + enabled = true + } + + guest_accelerator { + count = 8 + type = "nvidia-h100-80gb" + + 
gpu_driver_installation_config { + gpu_driver_version = var.gpu_driver_version + } + } + + gvnic { + enabled = true + } + + reservation_affinity { + consume_reservation_type = "NO_RESERVATION" + } + + shielded_instance_config { + enable_integrity_monitoring = true + enable_secure_boot = true + } + + taint { + effect = "NO_SCHEDULE" + key = "on-demand" + value = true + } + } + + queued_provisioning { + enabled = true + } + + timeouts { + create = "30m" + update = "20m" + } +} + +############################################################################### + +resource "google_container_node_pool" "gpu_l4x2_g2s24" { + depends_on = [google_container_cluster.mlp] + + # Variables + cluster = google_container_cluster.mlp.name + location = var.subnet_01_region + name = "gpu-l4x2-g2s24" + node_locations = [ + "${var.subnet_01_region}-a", + "${var.subnet_01_region}-b", + "${var.subnet_01_region}-c" + ] + project = data.google_project.environment.project_id + + autoscaling { + location_policy = "ANY" + total_max_node_count = 1000 + total_min_node_count = 0 + } + + lifecycle { + ignore_changes = [ + node_config[0].labels, + node_config[0].taint, + ] + } + + network_config { + enable_private_nodes = true + } + + node_config { + # Variables + labels = { + "resource-model" : "l4" + "resource-type" : "gpu" + } + machine_type = "g2-standard-24" + service_account = google_service_account.cluster.email + oauth_scopes = [ + "https://www.googleapis.com/auth/cloud-platform" + ] + + # Blocks + gcfs_config { + enabled = true + } + + guest_accelerator { + count = 2 + type = "nvidia-l4" + + gpu_driver_installation_config { + gpu_driver_version = var.gpu_driver_version + } + } + + gvnic { + enabled = true + } + + reservation_affinity { + consume_reservation_type = "NO_RESERVATION" + } + + shielded_instance_config { + enable_integrity_monitoring = true + enable_secure_boot = true + } + + taint { + effect = "NO_SCHEDULE" + key = "on-demand" + value = true + } + } + + timeouts { + create = 
"30m" + update = "20m" + } +} + +############################################################################### + +resource "google_container_node_pool" "gpu_l4x2_g2s24_dws" { + depends_on = [google_container_cluster.mlp] + + # Variables + cluster = google_container_cluster.mlp.name + location = var.subnet_01_region + name = "gpu-l4x2-g2s24-dws" + node_locations = [ + "${var.subnet_01_region}-a", + "${var.subnet_01_region}-b", + "${var.subnet_01_region}-c" + ] + project = data.google_project.environment.project_id + + autoscaling { + location_policy = "ANY" + total_max_node_count = 1000 + total_min_node_count = 0 + } + + lifecycle { + ignore_changes = [ + node_config[0].labels, + node_config[0].taint, + ] + } + + network_config { + enable_private_nodes = true + } + + node_config { + # Variables + labels = { + "resource-model" : "l4" + "resource-type" : "gpu" + } + machine_type = "g2-standard-24" + service_account = google_service_account.cluster.email + oauth_scopes = [ + "https://www.googleapis.com/auth/cloud-platform" + ] + + # Blocks + gcfs_config { + enabled = true + } + + guest_accelerator { + count = 2 + type = "nvidia-l4" + + gpu_driver_installation_config { + gpu_driver_version = var.gpu_driver_version + } + } + + gvnic { + enabled = true + } + + reservation_affinity { + consume_reservation_type = "NO_RESERVATION" + } + + shielded_instance_config { + enable_integrity_monitoring = true + enable_secure_boot = true + } + + taint { + effect = "NO_SCHEDULE" + key = "on-demand" + value = true + } + } + + queued_provisioning { + enabled = true + } + + timeouts { + create = "30m" + update = "20m" + } +} + +############################################################################### + +resource "google_container_node_pool" "gpu_l4x2_g2s24_spot" { + depends_on = [google_container_cluster.mlp] + + # Variables + cluster = google_container_cluster.mlp.name + location = var.subnet_01_region + name = "gpu-l4x2-g2s24-spot" + node_locations = [ + 
"${var.subnet_01_region}-a", + "${var.subnet_01_region}-b", + "${var.subnet_01_region}-c" + ] + project = data.google_project.environment.project_id + + # Blocks + autoscaling { + location_policy = "ANY" + total_max_node_count = 1000 + total_min_node_count = 0 + } + + lifecycle { + ignore_changes = [ + node_config[0].labels, + node_config[0].taint, + ] + } + + network_config { + enable_private_nodes = true + } + + node_config { + # Variables + labels = { + "resource-model" : "l4" + "resource-type" : "gpu" + } + machine_type = "g2-standard-24" + service_account = google_service_account.cluster.email + oauth_scopes = [ + "https://www.googleapis.com/auth/cloud-platform" + ] + spot = true + + # Blocks + gcfs_config { + enabled = true + } + + guest_accelerator { + count = 2 + type = "nvidia-l4" + + gpu_driver_installation_config { + gpu_driver_version = var.gpu_driver_version + } + } + + gvnic { + enabled = true + } + + reservation_affinity { + consume_reservation_type = "NO_RESERVATION" + } + + shielded_instance_config { + enable_integrity_monitoring = true + enable_secure_boot = true + } + + taint { + effect = "NO_SCHEDULE" + key = "spot" + value = true + } + } + + timeouts { + create = "30m" + update = "20m" + } +} + + + +# TPU +# Available zones: https://cloud.google.com/tpu/docs/regions-zones +############################################################################### diff --git a/best-practices/ml-platform/examples/platform/playground/fleet.tf b/best-practices/ml-platform/examples/platform/playground/fleet.tf new file mode 100644 index 000000000..c60fb36e8 --- /dev/null +++ b/best-practices/ml-platform/examples/platform/playground/fleet.tf @@ -0,0 +1,103 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# +# Removing to allow for multiple environments in the same project +# +# resource "google_gke_hub_feature" "configmanagement" { +# depends_on = [ +# google_project_service.anthos_googleapis_com, +# google_project_service.anthosconfigmanagement_googleapis_com, +# google_project_service.compute_googleapis_com, +# google_project_service.gkeconnect_googleapis_com, +# google_project_service.gkehub_googleapis_com, +# local.configsync_repository +# ] + +# location = "global" +# name = "configmanagement" +# project = data.google_project.environment.project_id +# } + +resource "null_resource" "gke_hub_feature_configmanagement" { + depends_on = [ + google_project_service.anthos_googleapis_com, + google_project_service.anthosconfigmanagement_googleapis_com, + google_project_service.compute_googleapis_com, + google_project_service.gkeconnect_googleapis_com, + google_project_service.gkehub_googleapis_com, + local.configsync_repository + ] + + provisioner "local-exec" { + command = "gcloud beta container fleet config-management enable --project ${var.environment_project_id}" + } +} + +resource "google_gke_hub_membership" "cluster" { + depends_on = [ + google_project_service.gkeconnect_googleapis_com, + google_project_service.gkehub_googleapis_com + ] + + membership_id = google_container_cluster.mlp.name + project = data.google_project.environment.project_id + + endpoint { + gke_cluster { + resource_link = "//container.googleapis.com/${google_container_cluster.mlp.id}" + } + } +} + +resource "google_gke_hub_feature_membership" "cluster_configmanagement" { + depends_on 
= [ + google_container_cluster.mlp, + #google_gke_hub_feature" "configmanagement, + google_project_service.anthos_googleapis_com, + google_project_service.anthosconfigmanagement_googleapis_com, + google_project_service.gkeconnect_googleapis_com, + google_project_service.gkehub_googleapis_com, + local.configsync_repository, + module.cloud-nat, + null_resource.gke_hub_feature_configmanagement + ] + + feature = "configmanagement" + location = "global" + membership = google_gke_hub_membership.cluster.membership_id + project = data.google_project.environment.project_id + + configmanagement { + version = var.config_management_version + + config_sync { + source_format = "unstructured" + + git { + policy_dir = "manifests/clusters" + secret_type = "token" + sync_branch = local.configsync_repository.default_branch + sync_repo = local.configsync_repository.http_clone_url + } + } + + policy_controller { + enabled = true + referential_rules_enabled = true + template_library_installed = true + + } + } +} diff --git a/best-practices/ml-platform/examples/platform/playground/gateway.tf b/best-practices/ml-platform/examples/platform/playground/gateway.tf index a3d75056f..1798cec2b 100644 --- a/best-practices/ml-platform/examples/platform/playground/gateway.tf +++ b/best-practices/ml-platform/examples/platform/playground/gateway.tf @@ -12,19 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-data "google_client_config" "default" {} - -data "google_client_openid_userinfo" "identity" {} - locals { hostname_suffix = "endpoints.${data.google_project.environment.project_id}.cloud.goog" - gateway_manifests_directory = "${path.module}/manifests/ml-team/gateway" + gateway_manifests_directory = "${path.module}/manifests/${var.environment_name}/${var.namespace}/gateway" gateway_name = "external-https" - ray_head_service_name = "ray-cluster-kuberay-head-svc" - ray_dashboard_endpoint = "ray-dashboard.${data.kubernetes_namespace_v1.team.metadata[0].name}.mlp.${local.hostname_suffix}" - ray_dashboard_port = 8265 iap_domain = var.iap_domain != null ? var.iap_domain : split("@", trimspace(data.google_client_openid_userinfo.identity.email))[1] iap_oath_brand = "projects/${data.google_project.environment.number}/brands/${data.google_project.environment.number}" + ray_head_service_name = "ray-cluster-kuberay-head-svc" + ray_dashboard_endpoint = "ray-dashboard.${data.kubernetes_namespace_v1.team.metadata[0].name}.mlp-${var.environment_name}.${local.hostname_suffix}" + ray_dashboard_port = 8265 } ############################################################################### @@ -42,7 +38,7 @@ resource "google_compute_managed_ssl_certificate" "external_gateway" { google_project_service.certificatemanager_googleapis_com, ] - name = "${var.namespace}-external-gateway" + name = "${var.environment_name}-${var.namespace}-external-gateway" project = data.google_project.environment.project_id managed { @@ -55,7 +51,7 @@ resource "google_compute_global_address" "external_gateway_https" { google_project_service.compute_googleapis_com ] - name = "${data.kubernetes_namespace_v1.team.metadata[0].name}-external-gateway-https" + name = "${var.environment_name}-${data.kubernetes_namespace_v1.team.metadata[0].name}-external-gateway-https" project = data.google_project.environment.project_id } @@ -102,7 +98,7 @@ resource "local_file" "route_ray_dashboard_https_yaml" { 
############################################################################### resource "google_project_service" "iap_googleapis_com" { disable_dependent_services = false - disable_on_destroy = true + disable_on_destroy = false project = data.google_project.environment.project_id service = "iap.googleapis.com" } @@ -113,7 +109,7 @@ resource "google_iap_client" "ray_head_client" { ] brand = local.iap_oath_brand - display_name = "IAP-gkegw-${data.kubernetes_namespace_v1.team.metadata[0].name}-ray-head-dashboard" + display_name = "IAP-gkegw-${var.environment_name}-${data.kubernetes_namespace_v1.team.metadata[0].name}-ray-head-dashboard" } # TODO: Look at possibly converting to google_iap_web_backend_service_iam_member, but would need the gateway to be created first. @@ -128,119 +124,3 @@ resource "google_iap_web_iam_member" "domain_iap_https_resource_accessor" { member = "domain:${local.iap_domain}" role = "roles/iap.httpsResourceAccessor" } - -resource "kubernetes_secret_v1" "ray_head_client" { - data = { - secret = google_iap_client.ray_head_client.secret - } - - metadata { - name = "ray-head-client" - namespace = data.kubernetes_namespace_v1.team.metadata[0].name - } -} - -resource "local_file" "policy_iap_ray_head_yaml" { - content = templatefile( - "${path.module}/templates/gateway/gcp-backend-policy-iap-service.tftpl.yaml", - { - oauth_client_id = google_iap_client.ray_head_client.client_id - oauth_client_secret_name = kubernetes_secret_v1.ray_head_client.metadata[0].name - policy_name = "ray-head" - service_name = local.ray_head_service_name - } - ) - filename = "${local.gateway_manifests_directory}/policy-iap-ray-head.yaml" -} - -############################################################################### -# CONFIG SYNC -############################################################################### -resource "local_file" "gateway_kustomization_yaml" { - content = templatefile( - "${path.module}/templates/kustomize/kustomization.tftpl.yaml", - { - 
namespace = data.kubernetes_namespace_v1.team.metadata[0].name - resources = [ - basename(local_file.gateway_external_https_yaml.filename), - basename(local_file.policy_iap_ray_head_yaml.filename), - basename(local_file.route_ray_dashboard_https_yaml.filename), - ] - } - ) - filename = "${local.gateway_manifests_directory}/kustomization.yaml" -} - -resource "null_resource" "gateway_manifests" { - depends_on = [ - google_compute_managed_ssl_certificate.external_gateway, - google_endpoints_service.ray_dashboard_https, - kubernetes_secret_v1.ray_head_client, - local.configsync_repository, - module.gke - ] - - provisioner "local-exec" { - command = "scripts/gateway_manifests.sh" - environment = { - GIT_EMAIL = self.triggers.github_email - GIT_REPOSITORY = self.triggers.git_repository - GIT_TOKEN = self.triggers.github_token - GIT_USERNAME = self.triggers.github_user - KUBECONFIG = self.triggers.kubeconfig - K8S_NAMESPACE = self.triggers.namespace - REPO_SYNC_NAME = self.triggers.repo_sync_name - REPO_SYNC_NAMESPACE = self.triggers.repo_sync_namespace - } - interpreter = ["bash", "-c"] - working_dir = path.module - } - - provisioner "local-exec" { - command = "scripts/gateway_cleanup.sh" - environment = { - GIT_EMAIL = self.triggers.github_email - GIT_REPOSITORY = self.triggers.git_repository - GIT_TOKEN = self.triggers.github_token - GIT_USERNAME = self.triggers.github_user - K8S_NAMESPACE = self.triggers.namespace - KUBECONFIG = self.triggers.kubeconfig - REPO_SYNC_NAME = self.triggers.repo_sync_name - REPO_SYNC_NAMESPACE = self.triggers.repo_sync_namespace - } - interpreter = ["bash", "-c"] - when = destroy - working_dir = path.module - } - - triggers = { - gateway_name = local.gateway_name - git_repository = local.git_repository - github_email = var.github_email - github_token = var.github_token - github_user = var.github_user - kubeconfig = "${local.kubeconfig_dir}/${data.google_project.environment.project_id}_${google_gke_hub_membership.cluster.membership_id}" - 
md5_script = filemd5("${path.module}/scripts/gateway_manifests.sh") - md5_files = md5(join("", [ - local_file.gateway_external_https_yaml.content_md5, - local_file.policy_iap_ray_head_yaml.content_md5, - local_file.route_ray_dashboard_https_yaml.content_md5, - local_file.gateway_kustomization_yaml.content_md5 - ])) - namespace = data.kubernetes_namespace_v1.team.metadata[0].name - repo_sync_name = "${var.environment_name}-${data.kubernetes_namespace_v1.team.metadata[0].name}" - repo_sync_namespace = data.kubernetes_namespace_v1.team.metadata[0].name - - } -} - -############################################################################### -# OUTPUT -############################################################################### -output "iap_domain" { - value = local.iap_domain -} - -output "ray_dashboard_url_https" { - value = "https://${local.ray_dashboard_endpoint}" -} diff --git a/best-practices/ml-platform/examples/platform/playground/gitops_configsync.tf b/best-practices/ml-platform/examples/platform/playground/gitops_configsync.tf new file mode 100644 index 000000000..5414f4b1f --- /dev/null +++ b/best-practices/ml-platform/examples/platform/playground/gitops_configsync.tf @@ -0,0 +1,457 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +locals { + namespace_default_kubernetes_service_account = "default" + ray_head_kubernetes_service_account = "ray-head" + ray_worker_kubernetes_service_account = "ray-worker" +} + + + +# TEMPLATE MANIFESTS +############################################################################### +resource "null_resource" "template_manifests" { + depends_on = [ + google_gke_hub_feature_membership.cluster_configmanagement + ] + + provisioner "local-exec" { + command = "${path.module}/scripts/template_manifests.sh" + environment = { + GIT_EMAIL = var.git_user_email + GIT_REPOSITORY = local.git_repository + GIT_TOKEN = var.git_token + GIT_USERNAME = var.git_user_name + } + } + + triggers = { + md5_files = md5(join("", [for f in fileset("${path.module}/templates/configsync", "**") : md5("${path.module}/templates/configsync/${f}")])) + md5_script = filemd5("${path.module}/scripts/template_manifests.sh") + } +} + + + +# CLUSTER MANIFESTS +############################################################################### +resource "null_resource" "cluster_manifests" { + depends_on = [ + google_gke_hub_feature_membership.cluster_configmanagement, + null_resource.template_manifests + ] + + provisioner "local-exec" { + command = "${path.module}/scripts/cluster_manifests.sh" + environment = { + CLUSTER_ENV = var.environment_name + CLUSTER_NAME = google_container_cluster.mlp.name + GIT_EMAIL = var.git_user_email + GIT_REPOSITORY = local.git_repository + GIT_TOKEN = var.git_token + GIT_USERNAME = var.git_user_name + } + } + + triggers = { + md5_files = md5(join("", [for f in fileset("${path.module}/templates/configsync/templates/_cluster_template", "**") : md5("${path.module}/templates/configsync/templates/_cluster_template${f}")])) + md5_script = filemd5("${path.module}/scripts/cluster_manifests.sh") + } +} + + + +# GIT CREDENTIALS SECRET CONFIGSYNC +############################################################################### +resource "null_resource" "git_cred_secret_cms" { + 
depends_on = [ + google_gke_hub_feature_membership.cluster_configmanagement, + null_resource.connect_gateway_kubeconfig + ] + + provisioner "local-exec" { + command = "${path.module}/scripts/git_cred_secret.sh" + environment = { + GIT_EMAIL = var.git_user_email + GIT_REPOSITORY = local.git_repository + GIT_TOKEN = var.git_token + GIT_USERNAME = var.git_user_name + K8S_NAMESPACE = "config-management-system" + KUBECONFIG = "${local.kubeconfig_dir}/${data.google_project.environment.project_id}_${google_gke_hub_membership.cluster.membership_id}" + } + } + + triggers = { + md5_credentials = md5(join("", [var.git_user_name, var.git_token])) + md5_script = filemd5("${path.module}/scripts/git_cred_secret.sh") + } +} + + + +# KUEUE +############################################################################### +resource "null_resource" "kueue" { + depends_on = [ + google_gke_hub_feature_membership.cluster_configmanagement, + null_resource.cluster_manifests + ] + + provisioner "local-exec" { + command = "${path.module}/scripts/kueue_manifests.sh" + environment = { + GIT_EMAIL = var.git_user_email + GIT_REPOSITORY = local.git_repository + GIT_TOKEN = var.git_token + GIT_USERNAME = var.git_user_name + } + } + + triggers = { + md5_files = md5(join("", [for f in fileset("${path.module}/templates/configsync/templates/_cluster_template", "**") : md5("${path.module}/templates/configsync/templates/_cluster_template/${f}")])) + md5_script = filemd5("${path.module}/scripts/kueue_manifests.sh") + } +} + + + +# NVIDIA DCGM +############################################################################### +resource "null_resource" "nvidia_dcgm" { + depends_on = [ + google_gke_hub_feature_membership.cluster_configmanagement, + null_resource.kueue + ] + + provisioner "local-exec" { + command = "${path.module}/scripts/nvidia_dcgm_manifests.sh" + environment = { + GIT_EMAIL = var.git_user_email + GIT_REPOSITORY = local.git_repository + GIT_TOKEN = var.git_token + GIT_USERNAME = 
var.git_user_name + } + } + + triggers = { + md5_files = md5(join("", [for f in fileset("${path.module}/templates/configsync/templates/_cluster_template/gmp-public/nvidia-dcgm", "**") : md5("${path.module}/templates/configsync/templates/_cluster_template/gmp-public/nvidia-dcgm/${f}")])) + md5_script = filemd5("${path.module}/scripts/nvidia_dcgm_manifests.sh") + } +} + + + +# KUBERAY MANIFESTS +############################################################################### +resource "null_resource" "kuberay_manifests" { + depends_on = [ + google_gke_hub_feature_membership.cluster_configmanagement, + null_resource.nvidia_dcgm, + ] + + provisioner "local-exec" { + command = "${path.module}/scripts/kuberay_manifests.sh" + environment = { + GIT_EMAIL = var.git_user_email + GIT_REPOSITORY = local.git_repository + GIT_TOKEN = var.git_token + GIT_USERNAME = var.git_user_name + K8S_NAMESPACE = var.namespace + } + } + + triggers = { + md5_files = md5(join("", [for f in fileset("${path.module}/templates/configsync/templates/_cluster_template/kuberay", "**") : md5("${path.module}/templates/configsync/templates/_cluster_template/kuberay/${f}")])) + md5_script = filemd5("${path.module}/scripts/kuberay_manifests.sh") + } +} + + + +# NAMESPACE +############################################################################### +resource "null_resource" "namespace_manifests" { + depends_on = [ + google_gke_hub_feature_membership.cluster_configmanagement, + null_resource.connect_gateway_kubeconfig, + null_resource.kuberay_manifests + ] + + provisioner "local-exec" { + command = "${path.module}/scripts/namespace_manifests.sh" + environment = { + CLUSTER_ENV = var.environment_name + CLUSTER_NAME = google_container_cluster.mlp.name + GIT_EMAIL = var.git_user_email + GIT_REPOSITORY = local.git_repository + GIT_TOKEN = var.git_token + GIT_USERNAME = var.git_user_name + K8S_NAMESPACE = self.triggers.namespace + } + } + + provisioner "local-exec" { + command = "scripts/namespace_cleanup.sh" + 
environment = { + ENVIRONMENT_NAME = self.triggers.environment_name + GIT_EMAIL = self.triggers.github_email + GIT_REPOSITORY = self.triggers.git_repository + GIT_TOKEN = self.triggers.github_token + GIT_USERNAME = self.triggers.github_user + KUBECONFIG = self.triggers.kubeconfig + K8S_NAMESPACE = self.triggers.namespace + PROJECT_ID = self.triggers.project_id + REPO_SYNC_NAME = self.triggers.repo_sync_name + REPO_SYNC_NAMESPACE = self.triggers.repo_sync_namespace + ROOT_SYNC_NAME = self.triggers.root_sync_name + } + when = destroy + working_dir = path.module + } + + triggers = { + environment_name = var.environment_name + git_repository = local.git_repository + github_email = var.git_user_email + github_token = var.git_token + github_user = var.git_user_name + kubeconfig = "${local.kubeconfig_dir}/${data.google_project.environment.project_id}_${google_gke_hub_membership.cluster.membership_id}" + project_id = data.google_project.environment.project_id + md5_files = md5(join("", [for f in fileset("${path.module}/templates/configsync/templates/_cluster_template/team", "**") : md5("${path.module}/templates/configsync/templates/_cluster_template/team/${f}")])) + md5_script = filemd5("${path.module}/scripts/namespace_manifests.sh") + namespace = var.namespace + repo_sync_name = "${var.environment_name}-${var.namespace}" + repo_sync_namespace = var.namespace + root_sync_name = "root-sync" + } +} + + + +# GIT CREDENTIALS SECRET NAMESPACE +############################################################################### +resource "null_resource" "git_cred_secret_ns" { + depends_on = [ + null_resource.connect_gateway_kubeconfig, + null_resource.namespace_manifests + ] + + provisioner "local-exec" { + command = "${path.module}/scripts/git_cred_secret.sh" + environment = { + GIT_TOKEN = var.git_token + GIT_USERNAME = var.git_user_name + K8S_NAMESPACE = var.namespace + KUBECONFIG = 
"${local.kubeconfig_dir}/${data.google_project.environment.project_id}_${google_gke_hub_membership.cluster.membership_id}" + } + } + + triggers = { + md5_credentials = md5(join("", [var.git_user_name, var.git_token])) + md5_script = filemd5("${path.module}/scripts/git_cred_secret.sh") + } +} + + + +# KUBERAY WATCH NAMESPACE MANIFESTS +############################################################################### +resource "null_resource" "kuberay_watch_namespace_manifests" { + depends_on = [ + google_gke_hub_feature_membership.cluster_configmanagement, + null_resource.namespace_manifests + ] + + provisioner "local-exec" { + command = "${path.module}/scripts/kuberay_watch_namespace_manifests.sh" + environment = { + GIT_EMAIL = var.git_user_email + GIT_REPOSITORY = local.git_repository + GIT_TOKEN = var.git_token + GIT_USERNAME = var.git_user_name + K8S_NAMESPACE = var.namespace + } + } + + triggers = { + md5_script = filemd5("${path.module}/scripts/kuberay_watch_namespace_manifests.sh") + } +} + + + +# RAY CLUSTER IN NAMESPACE +############################################################################### +resource "null_resource" "ray_cluster_namespace_manifests" { + depends_on = [ + google_gke_hub_feature_membership.cluster_configmanagement, + null_resource.kuberay_watch_namespace_manifests + ] + + provisioner "local-exec" { + command = "${path.module}/scripts/ray_cluster_namespace_manifests.sh" + environment = { + GIT_EMAIL = var.git_user_email + GIT_REPOSITORY = local.git_repository + GIT_TOKEN = var.git_token + GIT_USERNAME = var.git_user_name + K8S_NAMESPACE = var.namespace + K8S_SERVICE_ACCOUNT_HEAD = local.ray_head_kubernetes_service_account + K8S_SERVICE_ACCOUNT_WORKER = local.ray_worker_kubernetes_service_account + } + } + + triggers = { + md5_files = md5(join("", [for f in fileset("${path.module}/templates/configsync/templates/_namespace_template/app", "**") : md5("${path.module}/templates/configsync/templates/_namespace_template/app/${f}")])) + 
md5_script = filemd5("${path.module}/scripts/ray_cluster_namespace_manifests.sh") + } +} + + + +# GATEWAY +############################################################################### +resource "kubernetes_secret_v1" "ray_head_client" { + data = { + secret = google_iap_client.ray_head_client.secret + } + + metadata { + name = "ray-head-client" + namespace = data.kubernetes_namespace_v1.team.metadata[0].name + } +} + +resource "local_file" "policy_iap_ray_head_yaml" { + content = templatefile( + "${path.module}/templates/gateway/gcp-backend-policy-iap-service.tftpl.yaml", + { + oauth_client_id = google_iap_client.ray_head_client.client_id + oauth_client_secret_name = kubernetes_secret_v1.ray_head_client.metadata[0].name + policy_name = "ray-head" + service_name = local.ray_head_service_name + } + ) + filename = "${local.gateway_manifests_directory}/policy-iap-ray-head.yaml" +} + +resource "local_file" "gateway_kustomization_yaml" { + content = templatefile( + "${path.module}/templates/kustomize/kustomization.tftpl.yaml", + { + namespace = data.kubernetes_namespace_v1.team.metadata[0].name + resources = [ + basename(local_file.gateway_external_https_yaml.filename), + basename(local_file.policy_iap_ray_head_yaml.filename), + basename(local_file.route_ray_dashboard_https_yaml.filename), + ] + } + ) + filename = "${local.gateway_manifests_directory}/kustomization.yaml" +} + +resource "null_resource" "gateway_manifests" { + depends_on = [ + google_compute_managed_ssl_certificate.external_gateway, + google_endpoints_service.ray_dashboard_https, + google_gke_hub_feature_membership.cluster_configmanagement, + kubernetes_secret_v1.ray_head_client, + null_resource.ray_cluster_namespace_manifests + ] + + provisioner "local-exec" { + command = "scripts/gateway_manifests.sh" + environment = { + ENVIRONMENT_NAME = self.triggers.environment_name + GIT_EMAIL = self.triggers.github_email + GIT_REPOSITORY = self.triggers.git_repository + GIT_TOKEN = self.triggers.github_token + 
GIT_USERNAME = self.triggers.github_user + KUBECONFIG = self.triggers.kubeconfig + K8S_NAMESPACE = self.triggers.namespace + REPO_SYNC_NAME = self.triggers.repo_sync_name + REPO_SYNC_NAMESPACE = self.triggers.repo_sync_namespace + } + interpreter = ["bash", "-c"] + working_dir = path.module + } + + provisioner "local-exec" { + command = "scripts/gateway_cleanup.sh" + environment = { + GIT_EMAIL = self.triggers.github_email + GIT_REPOSITORY = self.triggers.git_repository + GIT_TOKEN = self.triggers.github_token + GIT_USERNAME = self.triggers.github_user + K8S_NAMESPACE = self.triggers.namespace + KUBECONFIG = self.triggers.kubeconfig + REPO_SYNC_NAME = self.triggers.repo_sync_name + REPO_SYNC_NAMESPACE = self.triggers.repo_sync_namespace + } + interpreter = ["bash", "-c"] + when = destroy + working_dir = path.module + } + + triggers = { + environment_name = var.environment_name + gateway_name = local.gateway_name + git_repository = local.git_repository + github_email = var.git_user_email + github_token = var.git_token + github_user = var.git_user_name + kubeconfig = "${local.kubeconfig_dir}/${data.google_project.environment.project_id}_${google_gke_hub_membership.cluster.membership_id}" + md5_script = filemd5("${path.module}/scripts/gateway_manifests.sh") + md5_files = md5(join("", [ + local_file.gateway_external_https_yaml.content_md5, + local_file.policy_iap_ray_head_yaml.content_md5, + local_file.route_ray_dashboard_https_yaml.content_md5, + local_file.gateway_kustomization_yaml.content_md5 + ])) + namespace = data.kubernetes_namespace_v1.team.metadata[0].name + repo_sync_name = "${var.environment_name}-${data.kubernetes_namespace_v1.team.metadata[0].name}" + repo_sync_namespace = data.kubernetes_namespace_v1.team.metadata[0].name + + } +} + + + +# WAIT FOR CONFIGSYNC +############################################################################### +resource "null_resource" "wait_for_configsync" { + depends_on = [ + 
google_gke_hub_feature_membership.cluster_configmanagement, + null_resource.gateway_manifests + ] + + provisioner "local-exec" { + command = "${path.module}/scripts/wait_for_configsync.sh" + environment = { + GIT_EMAIL = var.git_user_email + GIT_REPOSITORY = local.git_repository + GIT_TOKEN = var.git_token + GIT_USERNAME = var.git_user_name + KUBECONFIG = "${local.kubeconfig_dir}/${data.google_project.environment.project_id}_${google_gke_hub_membership.cluster.membership_id}" + REPO_SYNC_NAME = "${var.environment_name}-${data.kubernetes_namespace_v1.team.metadata[0].name}" + REPO_SYNC_NAMESPACE = data.kubernetes_namespace_v1.team.metadata[0].name + ROOT_SYNC_NAME = "root-sync" + } + } + + triggers = { + md5_script = filemd5("${path.module}/scripts/wait_for_configsync.sh") + } +} diff --git a/best-practices/ml-platform/examples/platform/playground/google_client.tf b/best-practices/ml-platform/examples/platform/playground/google_client.tf new file mode 100644 index 000000000..8656e63a7 --- /dev/null +++ b/best-practices/ml-platform/examples/platform/playground/google_client.tf @@ -0,0 +1,17 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +data "google_client_config" "default" {} + +data "google_client_openid_userinfo" "identity" {} \ No newline at end of file diff --git a/best-practices/ml-platform/examples/platform/playground/kubernetes.tf b/best-practices/ml-platform/examples/platform/playground/kubernetes.tf new file mode 100644 index 000000000..2ab02e159 --- /dev/null +++ b/best-practices/ml-platform/examples/platform/playground/kubernetes.tf @@ -0,0 +1,60 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +locals { + # https://github.com/hashicorp/terraform-provider-google/issues/13325 + connect_gateway_host_url = "https://connectgateway.googleapis.com/v1/projects/${data.google_project.environment.number}/locations/global/gkeMemberships/${google_container_cluster.mlp.name}" + kubeconfig_dir = abspath("${path.module}/kubeconfig") +} + +provider "kubernetes" { + host = local.connect_gateway_host_url + token = data.google_client_config.default.access_token +} + +resource "null_resource" "connect_gateway_kubeconfig" { + provisioner "local-exec" { + command = </dev/null 2>&1 + pwd -P +)" + +source ${SCRIPT_PATH}/helpers/clone_git_repo.sh + +# Set directory and path variables +clusters_directory="manifests/clusters" +clusters_path="${GIT_REPOSITORY_PATH}/${clusters_directory}" +cluster_template_directory="templates/_cluster_template" +cluster_template_path="${GIT_REPOSITORY_PATH}/${cluster_template_directory}" + +cd ${clusters_path} || { + echo "Clusters folder is missing" + exit 100 +} + +cp ${cluster_template_path}/cluster.yaml ${clusters_path}/${CLUSTER_NAME}-cluster.yaml +cp ${cluster_template_path}/selector.yaml ${clusters_path}/${CLUSTER_ENV}-selector.yaml +cp ${cluster_template_path}/network-logging.yaml ${clusters_path}/network-logging.yaml + +cd ${clusters_path} +find . -type f -name ${CLUSTER_NAME}-cluster.yaml -exec sed -i "s/CLUSTER_NAME/${CLUSTER_NAME}/g" {} + +find . -type f -name ${CLUSTER_NAME}-cluster.yaml -exec sed -i "s/ENV/${CLUSTER_ENV}/g" {} + +find . -type f -name ${CLUSTER_ENV}-selector.yaml -exec sed -i "s/ENV/${CLUSTER_ENV}/g" {} + + +if [ ! 
-d "${clusters_path}/kustomization.yaml" ]; then + cp ${cluster_template_path}/kustomization.yaml ${clusters_path}/ +fi + +# Added entries to the kustomization file +export resources=("${clusters_path}/${CLUSTER_NAME}-cluster.yaml ${clusters_path}/${CLUSTER_ENV}-selector.yaml ${clusters_path}/network-logging.yaml") +export kustomization_file="${clusters_path}/kustomization.yaml" +source ${SCRIPT_PATH}/helpers/add_to_kustomization.sh + +# Add, commit, and push changes to the repository +cd ${GIT_REPOSITORY_PATH} +git add . +git commit -m "Manifests for '${CLUSTER_NAME}' cluster in the ${CLUSTER_ENV} environment." +git push origin diff --git a/best-practices/ml-platform/examples/platform/playground/scripts/create_cluster_yamls.sh b/best-practices/ml-platform/examples/platform/playground/scripts/create_cluster_yamls.sh deleted file mode 100755 index 5cc61fc1f..000000000 --- a/best-practices/ml-platform/examples/platform/playground/scripts/create_cluster_yamls.sh +++ /dev/null @@ -1,45 +0,0 @@ -#!/bin/bash -# -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -u - -SCRIPT_PATH="$( - cd "$(dirname "$0")" >/dev/null 2>&1 - pwd -P -)" - -source ${SCRIPT_PATH}/helpers/clone_git_repo.sh - -if [ ! -d "${GIT_REPOSITORY_PATH}/manifests" ] && [ ! -d "${GIT_REPOSITORY_PATH}/templates" ]; then - echo "Copying template files..." 
- cp -r templates/acm-template/* ${GIT_REPOSITORY_PATH}/ -fi - -cd ${GIT_REPOSITORY_PATH}/manifests/clusters || { - echo "Failed to copy template files" - exit 1 -} - -cp ../../templates/_cluster_template/cluster.yaml ./${CLUSTER_NAME}-cluster.yaml -cp ../../templates/_cluster_template/network-logging.yaml ./network-logging.yaml -cp ../../templates/_cluster_template/selector.yaml ./${CLUSTER_ENV}-selector.yaml - -find . -type f -name ${CLUSTER_NAME}-cluster.yaml -exec sed -i "s/CLUSTER_NAME/${CLUSTER_NAME}/g" {} + -find . -type f -name ${CLUSTER_NAME}-cluster.yaml -exec sed -i "s/ENV/${CLUSTER_ENV}/g" {} + -find . -type f -name ${CLUSTER_ENV}-selector.yaml -exec sed -i "s/ENV/${CLUSTER_ENV}/g" {} + - -git add ../../. -git commit -m "Added '${CLUSTER_NAME} 'cluster to the ${CLUSTER_ENV} environment." -git push origin diff --git a/best-practices/ml-platform/examples/platform/playground/scripts/create_namespace.sh b/best-practices/ml-platform/examples/platform/playground/scripts/create_namespace.sh deleted file mode 100755 index c40dd541b..000000000 --- a/best-practices/ml-platform/examples/platform/playground/scripts/create_namespace.sh +++ /dev/null @@ -1,53 +0,0 @@ -#!/bin/bash -# -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-set -u - -SCRIPT_PATH="$( - cd "$(dirname "$0")" >/dev/null 2>&1 - pwd -P -)" - -source ${SCRIPT_PATH}/helpers/clone_git_repo.sh - -cd ${GIT_REPOSITORY_PATH}/manifests/clusters - -if [ -d "${K8S_NAMESPACE}" ]; then - exit 0 -fi - -#TODO: This most likely needs to be fixed for multiple environments -chars_in_namespace=$(echo -n ${K8S_NAMESPACE} | wc -c) -chars_in_cluster_env=$(echo -n ${CLUSTER_ENV} | wc -c) -chars_in_reposync_name=$(expr ${chars_in_namespace} + ${chars_in_cluster_env} + 1) -mkdir ${K8S_NAMESPACE} || exit 1 -cp -r ../../templates/_cluster_template/team/* ${K8S_NAMESPACE} -sed -i "s?NAMESPACE?${K8S_NAMESPACE}?g" ${K8S_NAMESPACE}/* -sed -ni '/#END OF SINGLE ENV DECLARATION/q;p' ${K8S_NAMESPACE}/reposync.yaml -sed -i "s?ENV?${CLUSTER_ENV}?g" ${K8S_NAMESPACE}/reposync.yaml -sed -i "s?GIT_REPO?https://${GIT_REPOSITORY}?g" ${K8S_NAMESPACE}/reposync.yaml -sed -i "s??${chars_in_reposync_name}?g" ${K8S_NAMESPACE}/reposync.yaml - -mkdir ../apps/${K8S_NAMESPACE} -touch ../apps/${K8S_NAMESPACE}/.gitkeep - -cat <>kustomization.yaml -- ./${K8S_NAMESPACE} -EOF - -cd .. -git add . 
-git commit -m "Added manifests to create '${K8S_NAMESPACE}' namespace" -git push origin diff --git a/best-practices/ml-platform/examples/platform/playground/scripts/gateway_cleanup.sh b/best-practices/ml-platform/examples/platform/playground/scripts/gateway_cleanup.sh index 580ce292f..892da3c64 100755 --- a/best-practices/ml-platform/examples/platform/playground/scripts/gateway_cleanup.sh +++ b/best-practices/ml-platform/examples/platform/playground/scripts/gateway_cleanup.sh @@ -22,17 +22,20 @@ SCRIPT_PATH="$( source ${SCRIPT_PATH}/helpers/clone_git_repo.sh -team_namespace_directory="manifests/apps/${K8S_NAMESPACE}" -team_namespace_path="${GIT_REPOSITORY_PATH}/${team_namespace_directory}" +# Set directory and path variables +namespace_directory="manifests/apps/${K8S_NAMESPACE}" +namespace_path="${GIT_REPOSITORY_PATH}/${namespace_directory}" -cd "${team_namespace_path}" || { - echo "Team namespace directory '${team_namespace_directory}' does not exist" - exit 2 +cd "${namespace_path}" || { + echo "Namespace directory '${namespace_directory}' does not exist" + exit 100 } -git rm -rf gateway -sed -i '/- .\/gateway/d' kustomization.yaml +git rm -rf ${namespace_path}/gateway +sed -i '/- .\/gateway/d' ${namespace_path}/kustomization.yaml +# Add, commit, and push changes to the repository +cd ${GIT_REPOSITORY_PATH} git add . 
git commit -m "Removed manifests for '${K8S_NAMESPACE}' gateway" git push origin diff --git a/best-practices/ml-platform/examples/platform/playground/scripts/gateway_manifests.sh b/best-practices/ml-platform/examples/platform/playground/scripts/gateway_manifests.sh index 0d45c8836..feaa54143 100755 --- a/best-practices/ml-platform/examples/platform/playground/scripts/gateway_manifests.sh +++ b/best-practices/ml-platform/examples/platform/playground/scripts/gateway_manifests.sh @@ -22,36 +22,27 @@ SCRIPT_PATH="$( source ${SCRIPT_PATH}/helpers/clone_git_repo.sh -team_namespace_directory="manifests/apps/${K8S_NAMESPACE}" -team_namespace_path="${GIT_REPOSITORY_PATH}/${team_namespace_directory}" +# Set directory and path variables +namespace_directory="manifests/apps/${K8S_NAMESPACE}" +namespace_path="${GIT_REPOSITORY_PATH}/${namespace_directory}" -cd "${team_namespace_path}" || { - echo "Team namespace directory '${team_namespace_directory}' does not exist" - exit 2 +cd "${namespace_path}" || { + echo "Namespace directory '${namespace_directory}' does not exist" + exit 100 } -generated_manifests_directory="${SCRIPT_PATH}/../manifests/${K8S_NAMESPACE}/gateway" -cp -pr ${generated_manifests_directory} ${team_namespace_path}/ +generated_manifests_directory="${SCRIPT_PATH}/../manifests/${ENVIRONMENT_NAME}/${K8S_NAMESPACE}/gateway" +cp -pr ${generated_manifests_directory} ${namespace_path}/ -resources=$(find ${team_namespace_path} -maxdepth 1 -mindepth 1 -type d) +# Added entries to the kustomization file +resources=$(find ${namespace_path} -maxdepth 1 -mindepth 1 -type d | sort) resources+=" " -resources+=$(find ${team_namespace_path} -maxdepth 1 -type f -name "*.yaml" ! -name "kustomization.yaml" ! -name "*values.yaml") -for resource in ${resources}; do - resource_basename=$(basename ${resource}) +export resources+=$(find ${namespace_path} -maxdepth 1 -type f -name "*.yaml" ! -name "kustomization.yaml" ! 
-name "*values.yaml" | sort) +export kustomization_file=${namespace_path}/kustomization.yaml +source ${SCRIPT_PATH}/helpers/add_to_kustomization.sh - if [ -d "${resource}" ]; then - resource_entry="./${resource_basename}" - elif [ -f "${resource}" ]; then - resource_entry="${resource_basename}" - else - echo "${resource} is not a directory or file" - exit 3 - fi - - grep -qx "\- ${resource_entry}" ${team_namespace_path}/kustomization.yaml || echo "- ${resource_entry}" >>${team_namespace_path}/kustomization.yaml -done - -cd "${team_namespace_path}" +# Add, commit, and push changes to the repository +cd ${GIT_REPOSITORY_PATH} git add . git commit -m "Manifests for ${K8S_NAMESPACE} gateway" git push origin diff --git a/best-practices/ml-platform/examples/platform/playground/scripts/create_git_cred.sh b/best-practices/ml-platform/examples/platform/playground/scripts/git_cred_secret.sh similarity index 100% rename from best-practices/ml-platform/examples/platform/playground/scripts/create_git_cred.sh rename to best-practices/ml-platform/examples/platform/playground/scripts/git_cred_secret.sh diff --git a/best-practices/ml-platform/examples/platform/playground/scripts/helpers/add_to_kustomization.sh b/best-practices/ml-platform/examples/platform/playground/scripts/helpers/add_to_kustomization.sh new file mode 100644 index 000000000..8f4e60d25 --- /dev/null +++ b/best-practices/ml-platform/examples/platform/playground/scripts/helpers/add_to_kustomization.sh @@ -0,0 +1,36 @@ +#!/bin/bash +# +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +set -u + +if [ ! -f ${kustomization_file} ]; then + echo "Kustomization file '${kustomization_file}' not found" + exit 2 +fi + +for resource in ${resources}; do + resource_basename=$(basename ${resource}) + + if [ -d "${resource}" ]; then + resource_entry="./${resource_basename}" + elif [ -f "${resource}" ]; then + resource_entry="${resource_basename}" + else + echo "${resource} is not a directory or file" + exit 3 + fi + + grep -qx "\- ${resource_entry}" ${kustomization_file} || echo "- ${resource_entry}" >>${kustomization_file} +done diff --git a/best-practices/ml-platform/examples/platform/playground/scripts/kuberay_manifests.sh b/best-practices/ml-platform/examples/platform/playground/scripts/kuberay_manifests.sh new file mode 100755 index 000000000..739db03bb --- /dev/null +++ b/best-practices/ml-platform/examples/platform/playground/scripts/kuberay_manifests.sh @@ -0,0 +1,42 @@ +#!/bin/bash +# +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+set -u + +SCRIPT_PATH="$( + cd "$(dirname "$0")" >/dev/null 2>&1 + pwd -P +)" + +source ${SCRIPT_PATH}/helpers/clone_git_repo.sh + +# Set directory and path variables +clusters_directory="manifests/clusters" +clusters_path="${GIT_REPOSITORY_PATH}/${clusters_directory}" +template_directory="templates/_cluster_template" +template_path="${GIT_REPOSITORY_PATH}/${template_directory}" + +cp -r ${template_path}/kuberay ${clusters_path}/ + +# Added entries to the kustomization file +export resources="${clusters_path}/kuberay" +export kustomization_file="${clusters_path}/kustomization.yaml" +source ${SCRIPT_PATH}/helpers/add_to_kustomization.sh + +# Add, commit, and push changes to the repository +cd ${GIT_REPOSITORY_PATH} +git add . +git commit -m "Manifests for KubeRay operator" +git push origin diff --git a/best-practices/ml-platform/examples/platform/playground/scripts/kuberay_watch_namespace_manifests.sh b/best-practices/ml-platform/examples/platform/playground/scripts/kuberay_watch_namespace_manifests.sh new file mode 100755 index 000000000..9937507a7 --- /dev/null +++ b/best-practices/ml-platform/examples/platform/playground/scripts/kuberay_watch_namespace_manifests.sh @@ -0,0 +1,56 @@ +#!/bin/bash +# +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+set -u + +SCRIPT_PATH="$( + cd "$(dirname "$0")" >/dev/null 2>&1 + pwd -P +)" + +KUBERAY_NAMESPACE=${KUBERAY_NAMESPACE:-"default"} + +source ${SCRIPT_PATH}/helpers/clone_git_repo.sh + +# Set directory and path variables +clusters_directory="manifests/clusters" +clusters_path="${GIT_REPOSITORY_PATH}/${clusters_directory}" +clusters_namespace_directory="${clusters_directory}/${K8S_NAMESPACE}" +clusters_namespace_path="${GIT_REPOSITORY_PATH}/${clusters_namespace_directory}" + +ns_exists=$(grep ${K8S_NAMESPACE} ${clusters_path}/kuberay/values.yaml | wc -l) +if [ "${ns_exists}" -ne 0 ]; then + echo "namespace '${K8S_NAMESPACE}' already present in values.yaml" + exit 0 +fi + +# TODO: this will need to be fixed for multiple namespaces +sed -i "s/watchNamespace:/watchNamespace:\n - ${K8S_NAMESPACE}/g" ${clusters_path}/kuberay/values.yaml + +cat <>${clusters_namespace_path}/network-policy.yaml + # Allow KubeRay Operator + - namespaceSelector: + matchLabels: + kubernetes.io/metadata.name: ${KUBERAY_NAMESPACE} + podSelector: + matchLabels: + app.kubernetes.io/name: kuberay-operator +EOF + +# Add, commit, and push changes to the repository +cd ${GIT_REPOSITORY_PATH} +git add . +git commit -m "Configured KubeRay operator to watch '${K8S_NAMESPACE}' namespace" +git push origin diff --git a/best-practices/ml-platform/examples/platform/playground/scripts/kueue_manifests.sh b/best-practices/ml-platform/examples/platform/playground/scripts/kueue_manifests.sh new file mode 100755 index 000000000..fe99dccee --- /dev/null +++ b/best-practices/ml-platform/examples/platform/playground/scripts/kueue_manifests.sh @@ -0,0 +1,50 @@ +#!/bin/bash +# +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +set -u + +SCRIPT_PATH="$( + cd "$(dirname "$0")" >/dev/null 2>&1 + pwd -P +)" + +source ${SCRIPT_PATH}/helpers/clone_git_repo.sh + +# Set directory and path variables +clusters_directory="manifests/clusters" +clusters_path="${GIT_REPOSITORY_PATH}/${clusters_directory}" +cluster_template_directory="templates/_cluster_template" +cluster_template_path="${GIT_REPOSITORY_PATH}/${cluster_template_directory}" + +cd "${clusters_path}" || { + echo "Clusters directory '${clusters_directory}' does not exist" + exit 100 +} + +cp -pr ${cluster_template_path}/kueue-system ${clusters_path}/ +cp -pr ${cluster_template_path}/namespace-kueue-system.yaml ${clusters_path}/ + +# Added entries to the kustomization file +resources=$(find ${clusters_path} -maxdepth 1 -mindepth 1 -type d | sort) +resources+=" " +export resources+=$(find ${clusters_path} -maxdepth 1 -type f -name "*.yaml" ! -name "kustomization.yaml" ! -name "*values.yaml" | sort) +export kustomization_file="${clusters_path}/kustomization.yaml" +source ${SCRIPT_PATH}/helpers/add_to_kustomization.sh + +# Add, commit, and push changes to the repository +cd ${GIT_REPOSITORY_PATH} +git add . 
+git commit -m "Manifests for Kueue" +git push origin diff --git a/best-practices/ml-platform/examples/platform/playground/scripts/namespace_cleanup.sh b/best-practices/ml-platform/examples/platform/playground/scripts/namespace_cleanup.sh index 325d7e2a9..c47d239fb 100755 --- a/best-practices/ml-platform/examples/platform/playground/scripts/namespace_cleanup.sh +++ b/best-practices/ml-platform/examples/platform/playground/scripts/namespace_cleanup.sh @@ -22,31 +22,28 @@ SCRIPT_PATH="$( source ${SCRIPT_PATH}/helpers/clone_git_repo.sh -cd ${GIT_REPOSITORY_PATH} - -git config user.name "${GIT_USERNAME}" -git config user.email "${GIT_EMAIL}" +# Set directory and path variables +clusters_directory="manifests/clusters" +clusters_path="${GIT_REPOSITORY_PATH}/${clusters_directory}" +namespace_directory="manifests/apps/${K8S_NAMESPACE}" +namespace_path="${GIT_REPOSITORY_PATH}/${namespace_directory}" -team_namespace_directory="manifests/apps/${K8S_NAMESPACE}" -team_namespace_path="${GIT_REPOSITORY_PATH}/${team_namespace_directory}" - -cd "${team_namespace_path}/.." || { - echo "Team namespace directory '${team_namespace_directory}' does not exist" +cd "${namespace_path}/.." 
|| { + echo "Team namespace directory '${namespace_directory}' does not exist" } -git rm -rf ${team_namespace_path} - -cluster_directory="manifests/clusters" -cluster_path="${GIT_REPOSITORY_PATH}/${cluster_directory}" +git rm -rf ${namespace_path} -cd "${cluster_path}" || { - echo "Cluster directory '${cluster_directory}' does not exist" +cd "${clusters_path}" || { + echo "Clusters directory '${clusters_directory}' does not exist" } -git rm -rf ${cluster_path}/${K8S_NAMESPACE}/* -sed -i "/- .\/${K8S_NAMESPACE}/d" ${cluster_path}/kustomization.yaml -sed -i "/ - ${K8S_NAMESPACE}/d" ${cluster_path}/kuberay/values.yaml +git rm -rf ${clusters_path}/${K8S_NAMESPACE}/* +sed -i "/- .\/${K8S_NAMESPACE}/d" ${clusters_path}/kustomization.yaml +sed -i "/ - ${K8S_NAMESPACE}/d" ${clusters_path}/kuberay/values.yaml +# Add, commit, and push changes to the repository +cd ${GIT_REPOSITORY_PATH} git add . git commit -m "Removed manifests for '${K8S_NAMESPACE}' namespace" git push origin @@ -57,3 +54,13 @@ echo "Deleteing the namespace '${K8S_NAMESPACE}'..." kubectl --namespace ${K8S_NAMESPACE} delete all --all kubectl delete namespace ${K8S_NAMESPACE} echo "Namespace '${K8S_NAMESPACE}', deleted" + +echo "Cleaning up network endpoint groups..." +negs=$(gcloud compute network-endpoint-groups list --filter="name~'k8s.*-${K8S_NAMESPACE}-.*' AND network~'.*-${ENVIRONMENT_NAME}$'" --format='value(format("{0},{1}", name, zone.basename()))' --project=${PROJECT_ID}) +for neg in ${negs}; do + name="${neg%,*}" + zone="${neg#*,}" + + echo "Deleting '${name}' network endpoint group in ${zone}..." 
+ gcloud compute network-endpoint-groups delete ${name} --project=${PROJECT_ID} --quiet --zone=${zone} +done diff --git a/best-practices/ml-platform/examples/platform/playground/scripts/namespace_manifests.sh b/best-practices/ml-platform/examples/platform/playground/scripts/namespace_manifests.sh new file mode 100755 index 000000000..bfdf1032b --- /dev/null +++ b/best-practices/ml-platform/examples/platform/playground/scripts/namespace_manifests.sh @@ -0,0 +1,65 @@ +#!/bin/bash +# +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+set -u + +SCRIPT_PATH="$( + cd "$(dirname "$0")" >/dev/null 2>&1 + pwd -P +)" + +source ${SCRIPT_PATH}/helpers/clone_git_repo.sh + +# Set directory and path variables +clusters_directory="manifests/clusters" +clusters_path="${GIT_REPOSITORY_PATH}/${clusters_directory}" +clusters_namespace_directory="${clusters_directory}/${K8S_NAMESPACE}" +clusters_namespace_path="${GIT_REPOSITORY_PATH}/${clusters_namespace_directory}" +namespace_directory="manifests/apps/${K8S_NAMESPACE}" +namespace_path="${GIT_REPOSITORY_PATH}/${namespace_directory}" +cluster_template_directory="templates/_cluster_template" +cluster_template_path="${GIT_REPOSITORY_PATH}/${cluster_template_directory}" + +chars_in_namespace=$(echo -n ${K8S_NAMESPACE} | wc -c) +chars_in_cluster_env=$(echo -n ${CLUSTER_ENV} | wc -c) +chars_in_reposync_name=$(expr ${chars_in_namespace} + ${chars_in_cluster_env} + 1) + +# Create clusters namespace directory +mkdir ${clusters_namespace_path} + +# Copy template files to clusters namespace directory +cp -r ${cluster_template_path}/team/* ${clusters_namespace_path} + +# Configure template files in clusters namespace directory +sed -i "s?NAMESPACE?${K8S_NAMESPACE}?g" ${clusters_namespace_path}/* +sed -ni '/#END OF SINGLE ENV DECLARATION/q;p' ${clusters_namespace_path}/reposync.yaml +sed -i "s?ENV?${CLUSTER_ENV}?g" ${clusters_namespace_path}/reposync.yaml +sed -i "s?GIT_REPO?https://${GIT_REPOSITORY}?g" ${clusters_namespace_path}/reposync.yaml +sed -i "s??${chars_in_reposync_name}?g" ${clusters_namespace_path}/reposync.yaml + +# Create the namespace directory +mkdir ${namespace_path} +touch ${namespace_path}/.gitkeep + +# Added entries to the kustomization file +export resources=("${clusters_namespace_path}") +export kustomization_file="${clusters_path}/kustomization.yaml" +source ${SCRIPT_PATH}/helpers/add_to_kustomization.sh + +# Add, commit, and push changes to the repository +cd ${GIT_REPOSITORY_PATH} +git add . 
+git commit -m "Manifests for '${K8S_NAMESPACE}' namespace" +git push origin diff --git a/best-practices/ml-platform/examples/platform/playground/scripts/nvidia_dcgm_manifests.sh b/best-practices/ml-platform/examples/platform/playground/scripts/nvidia_dcgm_manifests.sh new file mode 100755 index 000000000..11e27f742 --- /dev/null +++ b/best-practices/ml-platform/examples/platform/playground/scripts/nvidia_dcgm_manifests.sh @@ -0,0 +1,63 @@ +#!/bin/bash +# +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +set -u + +SCRIPT_PATH="$( + cd "$(dirname "$0")" >/dev/null 2>&1 + pwd -P +)" + +source ${SCRIPT_PATH}/helpers/clone_git_repo.sh + +# Set directory and path variables +clusters_directory="manifests/clusters" +clusters_path="${GIT_REPOSITORY_PATH}/${clusters_directory}" +cluster_template_directory="templates/_cluster_template" +cluster_template_path="${GIT_REPOSITORY_PATH}/${cluster_template_directory}" + +cd "${clusters_path}" || { + echo "Clusters directory '${clusters_directory}' does not exist" + exit 100 +} + +mkdir -p ${clusters_path}/gmp-public +cp -pr ${cluster_template_path}/gmp-public/nvidia-dcgm ${clusters_path}/gmp-public/ + +# Added entries to the clusters/gmp-public kustomization file +resources=$(find ${clusters_path}/gmp-public -maxdepth 1 -mindepth 1 -type d | sort) +resources+=" " +export resources+=$(find ${clusters_path}/gmp-public -maxdepth 1 -type f -name "*.yaml" ! -name "kustomization.yaml" ! 
-name "*values.yaml" | sort) +export kustomization_file="${clusters_path}/gmp-public/kustomization.yaml" + +if [ ! -f ${kustomization_file} ]; then + cp -pr ${cluster_template_path}/gmp-public/kustomization.yaml ${clusters_path}/gmp-public/ +fi + +source ${SCRIPT_PATH}/helpers/add_to_kustomization.sh + +# Added entries to the clusters kustomization file +resources=$(find ${clusters_path} -maxdepth 1 -mindepth 1 -type d | sort) +resources+=" " +export resources+=$(find ${clusters_path} -maxdepth 1 -type f -name "*.yaml" ! -name "kustomization.yaml" ! -name "*values.yaml" | sort) +export kustomization_file="${clusters_path}/kustomization.yaml" + +source ${SCRIPT_PATH}/helpers/add_to_kustomization.sh + +# Add, commit, and push changes to the repository +cd ${GIT_REPOSITORY_PATH} +git add . +git commit -m "Manifests for NVIDIA DCGM" +git push origin diff --git a/best-practices/ml-platform/examples/platform/playground/scripts/install_ray_cluster.sh b/best-practices/ml-platform/examples/platform/playground/scripts/ray_cluster_namespace_manifests.sh similarity index 57% rename from best-practices/ml-platform/examples/platform/playground/scripts/install_ray_cluster.sh rename to best-practices/ml-platform/examples/platform/playground/scripts/ray_cluster_namespace_manifests.sh index dc8b3a487..ec2edd6fe 100755 --- a/best-practices/ml-platform/examples/platform/playground/scripts/install_ray_cluster.sh +++ b/best-practices/ml-platform/examples/platform/playground/scripts/ray_cluster_namespace_manifests.sh @@ -22,24 +22,29 @@ SCRIPT_PATH="$( source ${SCRIPT_PATH}/helpers/clone_git_repo.sh -cd ${GIT_REPOSITORY_PATH}/manifests/apps -if [ ! -d "${K8S_NAMESPACE}" ]; then +# Set directory and path variables +namespace_directory="manifests/apps/${K8S_NAMESPACE}" +namespace_path="${GIT_REPOSITORY_PATH}/${namespace_directory}" +namespace_template_directory="templates/_namespace_template" +namespace_template_path="${GIT_REPOSITORY_PATH}/${namespace_template_directory}" + +if [ ! 
-d "${namespace_path}" ]; then echo "${K8S_NAMESPACE} folder doesnt exist in the configsync repo" - exit 1 + exit 100 fi -if [ -f "${K8S_NAMESPACE}/kustomization.yaml" ]; then +if [ -f "${namespace_path}/kustomization.yaml" ]; then echo "${K8S_NAMESPACE} is already set up" exit 0 fi -cp -r ../../templates/_namespace_template/app/* ${K8S_NAMESPACE}/ -sed -i "s?NAMESPACE?${K8S_NAMESPACE}?g" ${K8S_NAMESPACE}/* -sed -i "s?GOOGLE_SERVICE_ACCOUNT_RAY_HEAD?${GOOGLE_SERVICE_ACCOUNT_HEAD}?g" ${K8S_NAMESPACE}/* -sed -i "s?KUBERNETES_SERVICE_ACCOUNT_RAY_HEAD?${K8S_SERVICE_ACCOUNT_HEAD}?g" ${K8S_NAMESPACE}/* -sed -i "s?GOOGLE_SERVICE_ACCOUNT_RAY_WORKER?${GOOGLE_SERVICE_ACCOUNT_WORKER}?g" ${K8S_NAMESPACE}/* -sed -i "s?KUBERNETES_SERVICE_ACCOUNT_RAY_WORKER?${K8S_SERVICE_ACCOUNT_WORKER}?g" ${K8S_NAMESPACE}/* +cp -r ${namespace_template_path}/app/* ${namespace_path}/ +sed -i "s?NAMESPACE?${K8S_NAMESPACE}?g" ${namespace_path}/* +sed -i "s?KUBERNETES_SERVICE_ACCOUNT_RAY_HEAD?${K8S_SERVICE_ACCOUNT_HEAD}?g" ${namespace_path}/* +sed -i "s?KUBERNETES_SERVICE_ACCOUNT_RAY_WORKER?${K8S_SERVICE_ACCOUNT_WORKER}?g" ${namespace_path}/* +# Add, commit, and push changes to the repository +cd ${GIT_REPOSITORY_PATH} git add . -git commit -m "Added a RayCluster in '${K8S_NAMESPACE}' namespace." 
+git commit -m "Manifests for RayCluster in '${K8S_NAMESPACE}' namespace" git push origin diff --git a/best-practices/ml-platform/examples/platform/playground/scripts/install_kuberay_operator.sh b/best-practices/ml-platform/examples/platform/playground/scripts/template_manifests.sh similarity index 61% rename from best-practices/ml-platform/examples/platform/playground/scripts/install_kuberay_operator.sh rename to best-practices/ml-platform/examples/platform/playground/scripts/template_manifests.sh index da3a79cae..54e8fcac6 100755 --- a/best-practices/ml-platform/examples/platform/playground/scripts/install_kuberay_operator.sh +++ b/best-practices/ml-platform/examples/platform/playground/scripts/template_manifests.sh @@ -22,20 +22,21 @@ SCRIPT_PATH="$( source ${SCRIPT_PATH}/helpers/clone_git_repo.sh -cd ${GIT_REPOSITORY_PATH}/manifests/clusters -if [ -f "kustomization.yaml" ]; then - exit 0 -fi +# Set directory and path variables +clusters_directory="manifests/clusters" +clusters_path="${GIT_REPOSITORY_PATH}/${clusters_directory}" +templates_directory="templates/configsync" -yamlfiles=$(find . -type f -name "*.yaml") -cp ../../templates/_cluster_template/kustomization.yaml . -for yamlfile in $(echo ${yamlfiles}); do - cat <>kustomization.yaml -- ${yamlfile} -EOF -done +echo "Copying template files..." +cp -r ${templates_directory}/* ${GIT_REPOSITORY_PATH}/ -cp -r ../../templates/_cluster_template/kuberay . +cd ${clusters_path} || { + echo "Failed to copy template files" + exit 100 +} + +# Add, commit, and push changes to the repository +cd ${GIT_REPOSITORY_PATH} git add . -git commit -m "Added manifests to install kuberay operator." 
+git commit -m "Added templates and scaffolding" git push origin diff --git a/best-practices/ml-platform/examples/platform/playground/scripts/manage_ray_ns.sh b/best-practices/ml-platform/examples/platform/playground/scripts/wait_for_configsync.sh similarity index 63% rename from best-practices/ml-platform/examples/platform/playground/scripts/manage_ray_ns.sh rename to best-practices/ml-platform/examples/platform/playground/scripts/wait_for_configsync.sh index 1fdd2d712..5f081df24 100755 --- a/best-practices/ml-platform/examples/platform/playground/scripts/manage_ray_ns.sh +++ b/best-practices/ml-platform/examples/platform/playground/scripts/wait_for_configsync.sh @@ -22,16 +22,8 @@ SCRIPT_PATH="$( source ${SCRIPT_PATH}/helpers/clone_git_repo.sh -cd ${GIT_REPOSITORY_PATH}/manifests/clusters/kuberay +cd ${GIT_REPOSITORY_PATH} +commit_hash=$(git rev-parse HEAD) -ns_exists=$(grep ${K8S_NAMESPACE} values.yaml | wc -l) -if [ "${ns_exists}" -ne 0 ]; then - echo "namespace '${K8S_NAMESPACE}' already present in values.yaml" - exit 0 -fi - -sed -i "s/watchNamespace:/watchNamespace:\n - ${K8S_NAMESPACE}/g" values.yaml - -git add . 
-git commit -m "Configured KubeRay operator to watch '${K8S_NAMESPACE}' namespace" -git push origin +${SCRIPT_PATH}/helpers/wait_for_repo_sync.sh ${commit_hash} +${SCRIPT_PATH}/helpers/wait_for_root_sync.sh ${commit_hash} diff --git a/best-practices/ml-platform/examples/platform/playground/templates/acm-template/manifests/apps/.gitkeep b/best-practices/ml-platform/examples/platform/playground/templates/configsync/manifests/apps/.gitkeep similarity index 100% rename from best-practices/ml-platform/examples/platform/playground/templates/acm-template/manifests/apps/.gitkeep rename to best-practices/ml-platform/examples/platform/playground/templates/configsync/manifests/apps/.gitkeep diff --git a/best-practices/ml-platform/examples/platform/playground/templates/acm-template/manifests/clusters/.gitkeep b/best-practices/ml-platform/examples/platform/playground/templates/configsync/manifests/clusters/.gitkeep similarity index 100% rename from best-practices/ml-platform/examples/platform/playground/templates/acm-template/manifests/clusters/.gitkeep rename to best-practices/ml-platform/examples/platform/playground/templates/configsync/manifests/clusters/.gitkeep diff --git a/best-practices/ml-platform/examples/platform/playground/templates/acm-template/templates/_cluster_template/cluster.yaml b/best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_cluster_template/cluster.yaml similarity index 100% rename from best-practices/ml-platform/examples/platform/playground/templates/acm-template/templates/_cluster_template/cluster.yaml rename to best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_cluster_template/cluster.yaml diff --git a/best-practices/ml-platform/examples/platform/playground/templates/acm-template/templates/_cluster_template/config-selector.yaml b/best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_cluster_template/config-selector.yaml similarity index 100% 
rename from best-practices/ml-platform/examples/platform/playground/templates/acm-template/templates/_cluster_template/config-selector.yaml rename to best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_cluster_template/config-selector.yaml diff --git a/best-practices/ml-platform/examples/platform/playground/templates/acm-template/templates/_cluster_template/kustomization.yaml b/best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_cluster_template/gmp-public/kustomization.yaml similarity index 98% rename from best-practices/ml-platform/examples/platform/playground/templates/acm-template/templates/_cluster_template/kustomization.yaml rename to best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_cluster_template/gmp-public/kustomization.yaml index 27776353a..361030e28 100644 --- a/best-practices/ml-platform/examples/platform/playground/templates/acm-template/templates/_cluster_template/kustomization.yaml +++ b/best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_cluster_template/gmp-public/kustomization.yaml @@ -15,4 +15,3 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization resources: -- ./kuberay diff --git a/best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_cluster_template/gmp-public/nvidia-dcgm/dcgm-daemons-set.yaml b/best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_cluster_template/gmp-public/nvidia-dcgm/dcgm-daemons-set.yaml new file mode 100644 index 000000000..f2363eda3 --- /dev/null +++ b/best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_cluster_template/gmp-public/nvidia-dcgm/dcgm-daemons-set.yaml @@ -0,0 +1,61 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Reference: https://cloud.google.com/stackdriver/docs/managed-prometheus/exporters/nvidia-dcgm +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: nvidia-dcgm + namespace: gmp-public + labels: + app: nvidia-dcgm +spec: + selector: + matchLabels: + app: nvidia-dcgm + updateStrategy: + type: RollingUpdate + template: + metadata: + labels: + name: nvidia-dcgm + app: nvidia-dcgm + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-accelerator + operator: Exists + tolerations: + - operator: "Exists" + volumes: + - name: nvidia-install-dir-host + hostPath: + path: /home/kubernetes/bin/nvidia + type: Directory + containers: + - name: nvidia-dcgm + # https://hub.docker.com/r/nvidia/dcgm/tags + image: "nvcr.io/nvidia/cloud-native/dcgm:3.3.0-1-ubuntu22.04" + command: ["nv-hostengine", "-n", "-b", "ALL"] + ports: + - containerPort: 5555 + hostPort: 5555 + securityContext: + privileged: true + volumeMounts: + - name: nvidia-install-dir-host + mountPath: /usr/local/nvidia diff --git a/best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_cluster_template/gmp-public/nvidia-dcgm/dcgm-exporter-cluster-pod-monitoring.yaml b/best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_cluster_template/gmp-public/nvidia-dcgm/dcgm-exporter-cluster-pod-monitoring.yaml new file mode 100644 index 000000000..433f63241 --- /dev/null +++ 
b/best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_cluster_template/gmp-public/nvidia-dcgm/dcgm-exporter-cluster-pod-monitoring.yaml @@ -0,0 +1,32 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Reference: https://cloud.google.com/stackdriver/docs/managed-prometheus/exporters/nvidia-dcgm +apiVersion: monitoring.googleapis.com/v1 +kind: ClusterPodMonitoring +metadata: + name: nvidia-dcgm-exporter + labels: + app.kubernetes.io/name: nvidia-dcgm-exporter + app.kubernetes.io/part-of: google-cloud-managed-prometheus +spec: + selector: + matchLabels: + app.kubernetes.io/name: nvidia-dcgm-exporter + endpoints: + - port: metrics + interval: 30s + targetLabels: + metadata: [] + diff --git a/best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_cluster_template/gmp-public/nvidia-dcgm/dcgm-exporter-daemon-set.yaml b/best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_cluster_template/gmp-public/nvidia-dcgm/dcgm-exporter-daemon-set.yaml new file mode 100644 index 000000000..e0ab23ac8 --- /dev/null +++ b/best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_cluster_template/gmp-public/nvidia-dcgm/dcgm-exporter-daemon-set.yaml @@ -0,0 +1,90 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the 
License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Reference: https://cloud.google.com/stackdriver/docs/managed-prometheus/exporters/nvidia-dcgm +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: nvidia-dcgm-exporter + namespace: gmp-public + labels: + app.kubernetes.io/name: nvidia-dcgm-exporter +spec: + selector: + matchLabels: + app.kubernetes.io/name: nvidia-dcgm-exporter + updateStrategy: + type: RollingUpdate + template: + metadata: + labels: + app.kubernetes.io/name: nvidia-dcgm-exporter + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-accelerator + operator: Exists + tolerations: + - operator: "Exists" + volumes: + - name: nvidia-dcgm-exporter-metrics + configMap: + name: nvidia-dcgm-exporter-metrics + - name: nvidia-install-dir-host + hostPath: + path: /home/kubernetes/bin/nvidia + type: Directory + - name: pod-resources + hostPath: + path: /var/lib/kubelet/pod-resources + containers: + - name: nvidia-dcgm-exporter + # https://hub.docker.com/r/nvidia/dcgm-exporter/tags + image: nvcr.io/nvidia/k8s/dcgm-exporter:3.3.6-3.4.2-ubuntu22.04 + command: ["/bin/bash", "-c"] + args: + - hostname $NODE_NAME; dcgm-exporter --remote-hostengine-info $(NODE_IP) --collectors /etc/dcgm-exporter/counters.csv + ports: + - name: metrics + containerPort: 9400 + securityContext: + privileged: true + env: + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + - name: "DCGM_EXPORTER_KUBERNETES_GPU_ID_TYPE" + value: "device-name" + - name: LD_LIBRARY_PATH + value: 
/usr/local/nvidia/lib64 + - name: NODE_IP + valueFrom: + fieldRef: + fieldPath: status.hostIP + - name: DCGM_EXPORTER_KUBERNETES + value: 'true' + - name: DCGM_EXPORTER_LISTEN + value: ':9400' + volumeMounts: + - name: nvidia-dcgm-exporter-metrics + mountPath: "/etc/dcgm-exporter" + readOnly: true + - name: nvidia-install-dir-host + mountPath: /usr/local/nvidia + - name: pod-resources + mountPath: /var/lib/kubelet/pod-resources diff --git a/best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_cluster_template/gmp-public/nvidia-dcgm/dcgm-exporter-metrics-config-map.yaml b/best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_cluster_template/gmp-public/nvidia-dcgm/dcgm-exporter-metrics-config-map.yaml new file mode 100644 index 000000000..cbac3dfe2 --- /dev/null +++ b/best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_cluster_template/gmp-public/nvidia-dcgm/dcgm-exporter-metrics-config-map.yaml @@ -0,0 +1,107 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Reference: https://cloud.google.com/stackdriver/docs/managed-prometheus/exporters/nvidia-dcgm +apiVersion: v1 +kind: ConfigMap +metadata: + name: nvidia-dcgm-exporter-metrics + namespace: gmp-public +data: + counters.csv: | + # DCGM FIELDS: + # - https://docs.nvidia.com/datacenter/dcgm/latest/dcgm-api/dcgm-api-field-ids.html + # - https://docs.nvidia.com/datacenter/dcgm/2.3/dcgm-api/group__dcgmFieldIdentifiers.html + + # Format + # If line starts with a '#' it is considered a comment + # DCGM FIELD, Prometheus metric type, help message + + # Clocks + DCGM_FI_DEV_SM_CLOCK, gauge, SM clock frequency (in MHz). + DCGM_FI_DEV_MEM_CLOCK, gauge, Memory clock frequency (in MHz). + + # Temperature + DCGM_FI_DEV_MEMORY_TEMP, gauge, Memory temperature (in C). + DCGM_FI_DEV_GPU_TEMP, gauge, GPU temperature (in C). + + # Power + DCGM_FI_DEV_POWER_USAGE, gauge, Power draw (in W). + DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION, counter, Total energy consumption since boot (in mJ). + + # PCIE + # DCGM_FI_DEV_PCIE_TX_THROUGHPUT, counter, Total number of bytes transmitted through PCIe TX (in KB) via NVML. + # DCGM_FI_DEV_PCIE_RX_THROUGHPUT, counter, Total number of bytes received through PCIe RX (in KB) via NVML. + DCGM_FI_DEV_PCIE_REPLAY_COUNTER, counter, Total number of PCIe retries. + + # Utilization (the sample period varies depending on the product) + DCGM_FI_DEV_GPU_UTIL, gauge, GPU utilization (in %). + DCGM_FI_DEV_MEM_COPY_UTIL, gauge, Memory utilization (in %). + DCGM_FI_DEV_ENC_UTIL, gauge, Encoder utilization (in %). + DCGM_FI_DEV_DEC_UTIL , gauge, Decoder utilization (in %). + + # Errors and violations + DCGM_FI_DEV_XID_ERRORS, gauge, Value of the last XID error encountered. + # DCGM_FI_DEV_POWER_VIOLATION, counter, Throttling duration due to power constraints (in us). + # DCGM_FI_DEV_THERMAL_VIOLATION, counter, Throttling duration due to thermal constraints (in us). + # DCGM_FI_DEV_SYNC_BOOST_VIOLATION, counter, Throttling duration due to sync-boost constraints (in us). 
+ # DCGM_FI_DEV_BOARD_LIMIT_VIOLATION, counter, Throttling duration due to board limit constraints (in us). + # DCGM_FI_DEV_LOW_UTIL_VIOLATION, counter, Throttling duration due to low utilization (in us). + # DCGM_FI_DEV_RELIABILITY_VIOLATION, counter, Throttling duration due to reliability constraints (in us). + + # Memory usage + DCGM_FI_DEV_FB_FREE, gauge, Framebuffer memory free (in MiB). + DCGM_FI_DEV_FB_USED, gauge, Framebuffer memory used (in MiB). + DCGM_FI_DEV_FB_TOTAL, gauge, Framebuffer memory total (in MiB). + + # ECC + # DCGM_FI_DEV_ECC_SBE_VOL_TOTAL, counter, Total number of single-bit volatile ECC errors. + # DCGM_FI_DEV_ECC_DBE_VOL_TOTAL, counter, Total number of double-bit volatile ECC errors. + # DCGM_FI_DEV_ECC_SBE_AGG_TOTAL, counter, Total number of single-bit persistent ECC errors. + # DCGM_FI_DEV_ECC_DBE_AGG_TOTAL, counter, Total number of double-bit persistent ECC errors. + + # Retired pages + # DCGM_FI_DEV_RETIRED_SBE, counter, Total number of retired pages due to single-bit errors. + # DCGM_FI_DEV_RETIRED_DBE, counter, Total number of retired pages due to double-bit errors. + # DCGM_FI_DEV_RETIRED_PENDING, counter, Total number of pages pending retirement. + + # NVLink + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL, counter, Total number of NVLink flow-control CRC errors. + # DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL, counter, Total number of NVLink data CRC errors. + # DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL, counter, Total number of NVLink retries. + # DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL, counter, Total number of NVLink recovery errors. + DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, counter, Total number of NVLink bandwidth counters for all lanes. + # DCGM_FI_DEV_NVLINK_BANDWIDTH_L0, counter, The number of bytes of active NVLink rx or tx data including both header and payload. 
+ + # VGPU License status + DCGM_FI_DEV_VGPU_LICENSE_STATUS, gauge, vGPU License status + + # Remapped rows + DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for uncorrectable errors + DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for correctable errors + DCGM_FI_DEV_ROW_REMAP_FAILURE, gauge, Whether remapping of rows has failed + + # DCP metrics + DCGM_FI_PROF_GR_ENGINE_ACTIVE, gauge, Ratio of time the graphics engine is active. + DCGM_FI_PROF_SM_ACTIVE, gauge, The ratio of cycles an SM has at least 1 warp assigned. + DCGM_FI_PROF_SM_OCCUPANCY, gauge, The ratio of number of warps resident on an SM. + DCGM_FI_PROF_PIPE_TENSOR_ACTIVE, gauge, Ratio of cycles the tensor (HMMA) pipe is active. + DCGM_FI_PROF_DRAM_ACTIVE, gauge, Ratio of cycles the device memory interface is active sending or receiving data. + DCGM_FI_PROF_PIPE_FP64_ACTIVE, gauge, Ratio of cycles the fp64 pipes are active. + DCGM_FI_PROF_PIPE_FP32_ACTIVE, gauge, Ratio of cycles the fp32 pipes are active. + DCGM_FI_PROF_PIPE_FP16_ACTIVE, gauge, Ratio of cycles the fp16 pipes are active. + DCGM_FI_PROF_PCIE_TX_BYTES, counter, The number of bytes of active pcie tx data including both header and payload. + DCGM_FI_PROF_PCIE_RX_BYTES, counter, The number of bytes of active pcie rx data including both header and payload. + DCGM_FI_PROF_NVLINK_TX_BYTES, gauge, The number of bytes of active NvLink tx (transmit) data including both header and payload. + DCGM_FI_PROF_NVLINK_RX_BYTES, gauge, The number of bytes of active NvLink rx (read) data including both header and payload. 
diff --git a/best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_cluster_template/gmp-public/nvidia-dcgm/kustomization.yaml b/best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_cluster_template/gmp-public/nvidia-dcgm/kustomization.yaml new file mode 100644 index 000000000..7857cc984 --- /dev/null +++ b/best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_cluster_template/gmp-public/nvidia-dcgm/kustomization.yaml @@ -0,0 +1,31 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +# Another option is to use the NVIDIA Helm chart +# helmCharts: +# - name: dcgm-exporter +# repo: https://nvidia.github.io/dcgm-exporter/helm-charts/ +# version: 3.4.2 +# releaseName: dcgm-exporter +# includeCRDs: true +# valuesFile: dcgm-exporter-values.yaml + +resources: +# - dcgm-daemons-set.yaml # NOTE: no such file in this directory; kustomize build fails on it +- dcgm-exporter-cluster-pod-monitoring.yaml +- dcgm-exporter-daemon-set.yaml +- dcgm-exporter-metrics-config-map.yaml diff --git a/best-practices/ml-platform/examples/platform/playground/templates/acm-template/templates/_cluster_template/kuberay/kustomization.yaml b/best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_cluster_template/kuberay/kustomization.yaml similarity index 100% rename from best-practices/ml-platform/examples/platform/playground/templates/acm-template/templates/_cluster_template/kuberay/kustomization.yaml rename to best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_cluster_template/kuberay/kustomization.yaml diff --git a/best-practices/ml-platform/examples/platform/playground/templates/acm-template/templates/_cluster_template/kuberay/rayclusters.yaml b/best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_cluster_template/kuberay/rayclusters.yaml similarity index 100% rename from best-practices/ml-platform/examples/platform/playground/templates/acm-template/templates/_cluster_template/kuberay/rayclusters.yaml rename to best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_cluster_template/kuberay/rayclusters.yaml diff --git a/best-practices/ml-platform/examples/platform/playground/templates/acm-template/templates/_cluster_template/kuberay/rayjobs.yaml b/best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_cluster_template/kuberay/rayjobs.yaml similarity index 100% rename from 
best-practices/ml-platform/examples/platform/playground/templates/acm-template/templates/_cluster_template/kuberay/rayjobs.yaml rename to best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_cluster_template/kuberay/rayjobs.yaml diff --git a/best-practices/ml-platform/examples/platform/playground/templates/acm-template/templates/_cluster_template/kuberay/rayservices.yaml b/best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_cluster_template/kuberay/rayservices.yaml similarity index 100% rename from best-practices/ml-platform/examples/platform/playground/templates/acm-template/templates/_cluster_template/kuberay/rayservices.yaml rename to best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_cluster_template/kuberay/rayservices.yaml diff --git a/best-practices/ml-platform/examples/platform/playground/templates/acm-template/templates/_cluster_template/kuberay/rbac.yaml b/best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_cluster_template/kuberay/rbac.yaml similarity index 100% rename from best-practices/ml-platform/examples/platform/playground/templates/acm-template/templates/_cluster_template/kuberay/rbac.yaml rename to best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_cluster_template/kuberay/rbac.yaml diff --git a/best-practices/ml-platform/examples/platform/playground/templates/acm-template/templates/_cluster_template/kuberay/values.yaml b/best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_cluster_template/kuberay/values.yaml similarity index 100% rename from best-practices/ml-platform/examples/platform/playground/templates/acm-template/templates/_cluster_template/kuberay/values.yaml rename to best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_cluster_template/kuberay/values.yaml diff --git 
a/tutorials-and-examples/workflow-orchestration/dws-examples/kueue-manifests.yaml b/best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_cluster_template/kueue-system/kueue.yaml similarity index 51% rename from tutorials-and-examples/workflow-orchestration/dws-examples/kueue-manifests.yaml rename to best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_cluster_template/kueue-system/kueue.yaml index a259628dd..25c2e0caf 100644 --- a/tutorials-and-examples/workflow-orchestration/dws-examples/kueue-manifests.yaml +++ b/best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_cluster_template/kueue-system/kueue.yaml @@ -1,15 +1,12 @@ -apiVersion: v1 -kind: Namespace -metadata: - labels: - control-plane: controller-manager - name: kueue-system ---- apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: controller-gen.kubebuilder.io/version: v0.14.0 + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager name: admissionchecks.kueue.x-k8s.io spec: group: kueue.x-k8s.io @@ -26,14 +23,19 @@ spec: description: AdmissionCheck is the Schema for the admissionchecks API properties: apiVersion: - description: 'APIVersion defines the versioned schema of this representation - of an object. Servers should convert recognized schemas to the latest - internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. 
+ More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources type: string kind: - description: 'Kind is a string value representing the REST resource this - object represents. Servers may infer this from the endpoint the client - submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds type: string metadata: type: object @@ -41,22 +43,32 @@ spec: description: AdmissionCheckSpec defines the desired state of AdmissionCheck properties: controllerName: - description: controllerName is name of the controller which will actually - perform the checks. This is the name with which controller identifies - with, not necessarily a K8S Pod or Deployment name. Cannot be empty. + description: |- + controllerName is name of the controller which will actually perform + the checks. This is the name with which controller identifies with, + not necessarily a K8S Pod or Deployment name. Cannot be empty. type: string + x-kubernetes-validations: + - message: field is immutable + rule: self == oldSelf parameters: description: Parameters identifies the resource providing additional check parameters. properties: apiGroup: description: ApiGroup is the group for the resource being referenced. + maxLength: 253 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ type: string kind: description: Kind is the type of the resource being referenced. + maxLength: 63 + pattern: ^(?i)[a-z]([-a-z0-9]*[a-z0-9])?$ type: string name: description: Name is the name of the resource being referenced. 
+ maxLength: 63 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?$ type: string required: - apiGroup @@ -65,10 +77,11 @@ spec: type: object retryDelayMinutes: default: 15 - description: RetryDelayMinutes specifies how long to keep the workload - suspended after a failed check (after it transitioned to False). - After that the check state goes to "Unknown". The default is 15 - min. + description: |- + RetryDelayMinutes specifies how long to keep the workload suspended + after a failed check (after it transitioned to False). + After that the check state goes to "Unknown". + The default is 15 min. format: int64 type: integer required: @@ -78,46 +91,47 @@ spec: description: AdmissionCheckStatus defines the observed state of AdmissionCheck properties: conditions: - description: conditions hold the latest available observations of - the AdmissionCheck current state. + description: |- + conditions hold the latest available observations of the AdmissionCheck + current state. items: description: "Condition contains details for one aspect of the current - state of this API Resource. --- This struct is intended for direct - use as an array at the field path .status.conditions. For example, - \n type FooStatus struct{ // Represents the observations of a - foo's current state. // Known .status.conditions.type are: \"Available\", - \"Progressing\", and \"Degraded\" // +patchMergeKey=type // +patchStrategy=merge - // +listType=map // +listMapKey=type Conditions []metav1.Condition - `json:\"conditions,omitempty\" patchStrategy:\"merge\" patchMergeKey:\"type\" - protobuf:\"bytes,1,rep,name=conditions\"` \n // other fields }" + state of this API Resource.\n---\nThis struct is intended for + direct use as an array at the field path .status.conditions. 
For + example,\n\n\n\ttype FooStatus struct{\n\t // Represents the + observations of a foo's current state.\n\t // Known .status.conditions.type + are: \"Available\", \"Progressing\", and \"Degraded\"\n\t // + +patchMergeKey=type\n\t // +patchStrategy=merge\n\t // +listType=map\n\t + \ // +listMapKey=type\n\t Conditions []metav1.Condition `json:\"conditions,omitempty\" + patchStrategy:\"merge\" patchMergeKey:\"type\" protobuf:\"bytes,1,rep,name=conditions\"`\n\n\n\t + \ // other fields\n\t}" properties: lastTransitionTime: - description: lastTransitionTime is the last time the condition - transitioned from one status to another. This should be when - the underlying condition changed. If that is not known, then - using the time when the API field changed is acceptable. + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. format: date-time type: string message: - description: message is a human readable message indicating - details about the transition. This may be an empty string. + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. maxLength: 32768 type: string observedGeneration: - description: observedGeneration represents the .metadata.generation - that the condition was set based upon. For instance, if .metadata.generation - is currently 12, but the .status.conditions[x].observedGeneration - is 9, the condition is out of date with respect to the current - state of the instance. + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. 
format: int64 minimum: 0 type: integer reason: - description: reason contains a programmatic identifier indicating - the reason for the condition's last transition. Producers - of specific condition types may define expected values and - meanings for this field, and whether the values are considered - a guaranteed API. The value should be a CamelCase string. + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. This field may not be empty. maxLength: 1024 minLength: 1 @@ -131,11 +145,12 @@ spec: - Unknown type: string type: - description: type of condition in CamelCase or in foo.example.com/CamelCase. - --- Many .condition.type values are consistent across resources - like Available, but because arbitrary conditions can be useful - (see .node.status.conditions), the ability to deconflict is - important. The regex it matches is (dns1123SubdomainFmt/)?(qualifiedNameFmt) + description: |- + type of condition in CamelCase or in foo.example.com/CamelCase. + --- + Many .condition.type values are consistent across resources like Available, but because arbitrary conditions can be + useful (see .node.status.conditions), the ability to deconflict is important. 
+ The regex it matches is (dns1123SubdomainFmt/)?(qualifiedNameFmt) maxLength: 316 pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ type: string @@ -162,6 +177,10 @@ kind: CustomResourceDefinition metadata: annotations: controller-gen.kubebuilder.io/version: v0.14.0 + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager name: clusterqueues.kueue.x-k8s.io spec: group: kueue.x-k8s.io @@ -169,6 +188,8 @@ spec: kind: ClusterQueue listKind: ClusterQueueList plural: clusterqueues + shortNames: + - cq singular: clusterqueue scope: Cluster versions: @@ -197,14 +218,19 @@ spec: description: ClusterQueue is the Schema for the clusterQueue API. properties: apiVersion: - description: 'APIVersion defines the versioned schema of this representation - of an object. Servers should convert recognized schemas to the latest - internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources type: string kind: - description: 'Kind is a string value representing the REST resource this - object represents. Servers may infer this from the endpoint the client - submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. 
+ More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds type: string metadata: type: object @@ -212,116 +238,198 @@ spec: description: ClusterQueueSpec defines the desired state of ClusterQueue properties: admissionChecks: - description: admissionChecks lists the AdmissionChecks required by - this ClusterQueue + description: |- + admissionChecks lists the AdmissionChecks required by this ClusterQueue. + Cannot be used along with AdmissionCheckStrategy. items: type: string type: array + admissionChecksStrategy: + description: |- + admissionCheckStrategy defines a list of strategies to determine which ResourceFlavors require AdmissionChecks. + This property cannot be used in conjunction with the 'admissionChecks' property. + properties: + admissionChecks: + description: admissionChecks is a list of strategies for AdmissionChecks + items: + description: AdmissionCheckStrategyRule defines rules for a + single AdmissionCheck + properties: + name: + description: name is an AdmissionCheck's name. + type: string + onFlavors: + description: |- + onFlavors is a list of ResourceFlavors' names that this AdmissionCheck should run for. + If empty, the AdmissionCheck will run for all workloads submitted to the ClusterQueue. + items: + description: ResourceFlavorReference is the name of the + ResourceFlavor. + maxLength: 253 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ + type: string + type: array + required: + - name + type: object + type: array + type: object cohort: - description: "cohort that this ClusterQueue belongs to. CQs that belong - to the same cohort can borrow unused resources from each other. - \n A CQ can be a member of a single borrowing cohort. A workload - submitted to a queue referencing this CQ can borrow quota from any - CQ in the cohort. Only quota for the [resource, flavor] pairs listed - in the CQ can be borrowed. 
If empty, this ClusterQueue cannot borrow - from any other ClusterQueue and vice versa. \n A cohort is a name - that links CQs together, but it doesn't reference any object. \n + description: |- + cohort that this ClusterQueue belongs to. CQs that belong to the + same cohort can borrow unused resources from each other. + + + A CQ can be a member of a single borrowing cohort. A workload submitted + to a queue referencing this CQ can borrow quota from any CQ in the cohort. + Only quota for the [resource, flavor] pairs listed in the CQ can be + borrowed. + If empty, this ClusterQueue cannot borrow from any other ClusterQueue and + vice versa. + + + A cohort is a name that links CQs together, but it doesn't reference any + object. + + Validation of a cohort name is equivalent to that of object names: - subdomain in DNS (RFC 1123)." + subdomain in DNS (RFC 1123). + maxLength: 253 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ type: string + fairSharing: + description: |- + fairSharing defines the properties of the ClusterQueue when participating in fair sharing. + The values are only relevant if fair sharing is enabled in the Kueue configuration. + properties: + weight: + anyOf: + - type: integer + - type: string + default: 1 + description: |- + weight gives a comparative advantage to this ClusterQueue when competing for unused + resources in the cohort against other ClusterQueues. + The share of a ClusterQueue is based on the dominant resource usage above nominal + quotas for each resource, divided by the weight. + Admission prioritizes scheduling workloads from ClusterQueues with the lowest share + and preempting workloads from the ClusterQueues with the highest share. + A zero weight implies infinite share value, meaning that this ClusterQueue will always + be at disadvantage against other ClusterQueues. 
+ pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: object flavorFungibility: - description: flavorFungibility defines whether a workload should try - the next flavor before borrowing or preempting in the flavor being - evaluated. + default: {} + description: |- + flavorFungibility defines whether a workload should try the next flavor + before borrowing or preempting in the flavor being evaluated. properties: whenCanBorrow: default: Borrow - description: "whenCanBorrow determines whether a workload should - try the next flavor before borrowing in current flavor. The - possible values are: \n - `Borrow` (default): allocate in current - flavor if borrowing is possible. - `TryNextFlavor`: try next - flavor even if the current flavor has enough resources to borrow." + description: |- + whenCanBorrow determines whether a workload should try the next flavor + before borrowing in current flavor. The possible values are: + + + - `Borrow` (default): allocate in current flavor if borrowing + is possible. + - `TryNextFlavor`: try next flavor even if the current + flavor has enough resources to borrow. enum: - Borrow - TryNextFlavor type: string whenCanPreempt: default: TryNextFlavor - description: "whenCanPreempt determines whether a workload should - try the next flavor before borrowing in current flavor. The - possible values are: \n - `Preempt`: allocate in current flavor - if it's possible to preempt some workloads. - `TryNextFlavor` - (default): try next flavor even if there are enough candidates - for preemption in the current flavor." + description: |- + whenCanPreempt determines whether a workload should try the next flavor + before borrowing in current flavor. The possible values are: + + + - `Preempt`: allocate in current flavor if it's possible to preempt some workloads. 
+ - `TryNextFlavor` (default): try next flavor even if there are enough + candidates for preemption in the current flavor. enum: - Preempt - TryNextFlavor type: string type: object namespaceSelector: - description: namespaceSelector defines which namespaces are allowed - to submit workloads to this clusterQueue. Beyond this basic support - for policy, an policy agent like Gatekeeper should be used to enforce - more advanced policies. Defaults to null which is a nothing selector - (no namespaces eligible). If set to an empty selector `{}`, then - all namespaces are eligible. + description: |- + namespaceSelector defines which namespaces are allowed to submit workloads to + this clusterQueue. Beyond this basic support for policy, a policy agent like + Gatekeeper should be used to enforce more advanced policies. + Defaults to null which is a nothing selector (no namespaces eligible). + If set to an empty selector `{}`, then all namespaces are eligible. properties: matchExpressions: description: matchExpressions is a list of label selector requirements. The requirements are ANDed. items: - description: A label selector requirement is a selector that - contains values, a key, and an operator that relates the key - and values. + description: |- + A label selector requirement is a selector that contains values, a key, and an operator that + relates the key and values. properties: key: description: key is the label key that the selector applies to. type: string operator: - description: operator represents a key's relationship to - a set of values. Valid operators are In, NotIn, Exists - and DoesNotExist. + description: |- + operator represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists and DoesNotExist. type: string values: - description: values is an array of string values. If the - operator is In or NotIn, the values array must be non-empty. - If the operator is Exists or DoesNotExist, the values - array must be empty. 
This array is replaced during a strategic + description: |- + values is an array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. This array is replaced during a strategic merge patch. items: type: string type: array + x-kubernetes-list-type: atomic required: - key - operator type: object type: array + x-kubernetes-list-type: atomic matchLabels: additionalProperties: type: string - description: matchLabels is a map of {key,value} pairs. A single - {key,value} in the matchLabels map is equivalent to an element - of matchExpressions, whose key field is "key", the operator - is "In", and the values array contains only "value". The requirements - are ANDed. + description: |- + matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels + map is equivalent to an element of matchExpressions, whose key field is "key", the + operator is "In", and the values array contains only "value". The requirements are ANDed. type: object type: object x-kubernetes-map-type: atomic preemption: - description: "preemption describes policies to preempt Workloads from - this ClusterQueue or the ClusterQueue's cohort. \n Preemption can - happen in two scenarios: \n - When a Workload fits within the nominal - quota of the ClusterQueue, but the quota is currently borrowed by - other ClusterQueues in the cohort. Preempting Workloads in other - ClusterQueues allows this ClusterQueue to reclaim its nominal quota. + default: {} + description: |- + preemption describes policies to preempt Workloads from this ClusterQueue + or the ClusterQueue's cohort. + + + Preemption can happen in two scenarios: + + + - When a Workload fits within the nominal quota of the ClusterQueue, but + the quota is currently borrowed by other ClusterQueues in the cohort. + Preempting Workloads in other ClusterQueues allows this ClusterQueue to + reclaim its nominal quota. 
 - When a Workload doesn't fit within the nominal quota of the ClusterQueue - and there are admitted Workloads in the ClusterQueue with lower - priority. \n The preemption algorithm tries to find a minimal set - of Workloads to preempt to accomomdate the pending Workload, preempting - Workloads with lower priority first." + and there are admitted Workloads in the ClusterQueue with lower priority. + + + The preemption algorithm tries to find a minimal set of Workloads to + preempt to accommodate the pending Workload, preempting Workloads with + lower priority first. properties: borrowWithinCohort: + default: {} description: |- borrowWithinCohort provides configuration to allow preemption within cohort while borrowing. @@ -352,16 +460,19 @@ spec: type: object reclaimWithinCohort: default: Never - description: "reclaimWithinCohort determines whether a pending - Workload can preempt Workloads from other ClusterQueues in the - cohort that are using more than their nominal quota. The possible - values are: \n - `Never` (default): do not preempt Workloads - in the cohort. - `LowerPriority`: if the pending Workload fits - within the nominal quota of its ClusterQueue, only preempt Workloads - in the cohort that have lower priority than the pending Workload. - - `Any`: if the pending Workload fits within the nominal quota - of its ClusterQueue, preempt any Workload in the cohort, irrespective - of priority." + description: |- + reclaimWithinCohort determines whether a pending Workload can preempt + Workloads from other ClusterQueues in the cohort that are using more than + their nominal quota. The possible values are: + + + - `Never` (default): do not preempt Workloads in the cohort. + - `LowerPriority`: if the pending Workload fits within the nominal + quota of its ClusterQueue, only preempt Workloads in the cohort that have + lower priority than the pending Workload. 
+ - `Any`: if the pending Workload fits within the nominal quota of its + ClusterQueue, preempt any Workload in the cohort, irrespective of + priority. enum: - Never - LowerPriority @@ -369,47 +480,61 @@ spec: type: string withinClusterQueue: default: Never - description: "withinClusterQueue determines whether a pending - Workload that doesn't fit within the nominal quota for its ClusterQueue, - can preempt active Workloads in the ClusterQueue. The possible - values are: \n - `Never` (default): do not preempt Workloads - in the ClusterQueue. - `LowerPriority`: only preempt Workloads - in the ClusterQueue that have lower priority than the pending - Workload. - `LowerOrNewerEqualPriority`: only preempt Workloads - in the ClusterQueue that either have a lower priority than the - pending workload or equal priority and are newer than the pending - workload." + description: |- + withinClusterQueue determines whether a pending Workload that doesn't fit + within the nominal quota for its ClusterQueue, can preempt active Workloads in + the ClusterQueue. The possible values are: + + + - `Never` (default): do not preempt Workloads in the ClusterQueue. + - `LowerPriority`: only preempt Workloads in the ClusterQueue that have + lower priority than the pending Workload. + - `LowerOrNewerEqualPriority`: only preempt Workloads in the ClusterQueue that + either have a lower priority than the pending workload or equal priority + and are newer than the pending workload. enum: - Never - LowerPriority - LowerOrNewerEqualPriority type: string type: object + x-kubernetes-validations: + - message: reclaimWithinCohort=Never and borrowWithinCohort.Policy!=Never + rule: '!(self.reclaimWithinCohort == ''Never'' && has(self.borrowWithinCohort) + && self.borrowWithinCohort.policy != ''Never'')' queueingStrategy: default: BestEffortFIFO - description: "QueueingStrategy indicates the queueing strategy of - the workloads across the queues in this ClusterQueue. This field - is immutable. 
Current Supported Strategies: \n - StrictFIFO: workloads - are ordered strictly by creation time. Older workloads that can't - be admitted will block admitting newer workloads even if they fit - available quota. - BestEffortFIFO: workloads are ordered by creation - time, however older workloads that can't be admitted will not block - admitting newer workloads that fit existing quota." + description: |- + QueueingStrategy indicates the queueing strategy of the workloads + across the queues in this ClusterQueue. + Current Supported Strategies: + + + - StrictFIFO: workloads are ordered strictly by creation time. + Older workloads that can't be admitted will block admitting newer + workloads even if they fit available quota. + - BestEffortFIFO: workloads are ordered by creation time, + however older workloads that can't be admitted will not block + admitting newer workloads that fit existing quota. enum: - StrictFIFO - BestEffortFIFO type: string resourceGroups: - description: resourceGroups describes groups of resources. Each resource - group defines the list of resources and a list of flavors that provide - quotas for these resources. Each resource and each flavor can only - form part of one resource group. resourceGroups can be up to 16. + description: |- + resourceGroups describes groups of resources. + Each resource group defines the list of resources and a list of flavors + that provide quotas for these resources. + Each resource and each flavor can only form part of one resource group. + resourceGroups can be up to 16. items: properties: coveredResources: - description: 'coveredResources is the list of resources covered - by the flavors in this group. Examples: cpu, memory, vendor.com/gpu. - The list cannot be empty and it can contain up to 16 resources.' + description: |- + coveredResources is the list of resources covered by the flavors in this + group. + Examples: cpu, memory, vendor.com/gpu. + The list cannot be empty and it can contain up to 16 resources. 
items: description: ResourceName is the name identifying various resources in a ResourceList. @@ -418,42 +543,44 @@ spec: minItems: 1 type: array flavors: - description: flavors is the list of flavors that provide the - resources of this group. Typically, different flavors represent - different hardware models (e.g., gpu models, cpu architectures) - or pricing models (on-demand vs spot cpus). Each flavor MUST - list all the resources listed for this group in the same order - as the .resources field. The list cannot be empty and it can - contain up to 16 flavors. + description: |- + flavors is the list of flavors that provide the resources of this group. + Typically, different flavors represent different hardware models + (e.g., gpu models, cpu architectures) or pricing models (on-demand vs spot + cpus). + Each flavor MUST list all the resources listed for this group in the same + order as the .resources field. + The list cannot be empty and it can contain up to 16 flavors. items: properties: name: - description: name of this flavor. The name should match - the .metadata.name of a ResourceFlavor. If a matching - ResourceFlavor does not exist, the ClusterQueue will - have an Active condition set to False. + description: |- + name of this flavor. The name should match the .metadata.name of a + ResourceFlavor. If a matching ResourceFlavor does not exist, the + ClusterQueue will have an Active condition set to False. + maxLength: 253 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ type: string resources: - description: resources is the list of quotas for this - flavor per resource. There could be up to 16 resources. + description: |- + resources is the list of quotas for this flavor per resource. + There could be up to 16 resources. 
items: properties: borrowingLimit: anyOf: - type: integer - type: string - description: borrowingLimit is the maximum amount - of quota for the [flavor, resource] combination - that this ClusterQueue is allowed to borrow from - the unused quota of other ClusterQueues in the - same cohort. In total, at a given time, Workloads - in a ClusterQueue can consume a quantity of quota - equal to nominalQuota+borrowingLimit, assuming - the other ClusterQueues in the cohort have enough - unused quota. If null, it means that there is - no borrowing limit. If not null, it must be non-negative. - borrowingLimit must be null if spec.cohort is - empty. + description: |- + borrowingLimit is the maximum amount of quota for the [flavor, resource] + combination that this ClusterQueue is allowed to borrow from the unused + quota of other ClusterQueues in the same cohort. + In total, at a given time, Workloads in a ClusterQueue can consume a + quantity of quota equal to nominalQuota+borrowingLimit, assuming the other + ClusterQueues in the cohort have enough unused quota. + If null, it means that there is no borrowing limit. + If not null, it must be non-negative. + borrowingLimit must be null if spec.cohort is empty. pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true lendingLimit: @@ -480,20 +607,20 @@ spec: anyOf: - type: integer - type: string - description: "nominalQuota is the quantity of this - resource that is available for Workloads admitted - by this ClusterQueue at a point in time. The nominalQuota - must be non-negative. nominalQuota should represent - the resources in the cluster available for running - jobs (after discounting resources consumed by - system components and pods not managed by kueue). - In an autoscaled cluster, nominalQuota should - account for resources that can be provided by - a component such as Kubernetes cluster-autoscaler. 
- \n If the ClusterQueue belongs to a cohort, the - sum of the quotas for each (flavor, resource) - combination defines the maximum quantity that - can be allocated by a ClusterQueue in the cohort." + description: |- + nominalQuota is the quantity of this resource that is available for + Workloads admitted by this ClusterQueue at a point in time. + The nominalQuota must be non-negative. + nominalQuota should represent the resources in the cluster available for + running jobs (after discounting resources consumed by system components + and pods not managed by kueue). In an autoscaled cluster, nominalQuota + should account for resources that can be provided by a component such as + Kubernetes cluster-autoscaler. + + + If the ClusterQueue belongs to a cohort, the sum of the quotas for each + (flavor, resource) combination defines the maximum quantity that can be + allocated by a ClusterQueue in the cohort. pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true required: @@ -520,6 +647,10 @@ spec: - coveredResources - flavors type: object + x-kubernetes-validations: + - message: flavors must have the same number of resources as the + coveredResources + rule: self.flavors.all(x, size(x.resources) == size(self.coveredResources)) maxItems: 16 type: array x-kubernetes-list-type: atomic @@ -542,55 +673,61 @@ spec: - HoldAndDrain type: string type: object + x-kubernetes-validations: + - message: borrowingLimit must be nil when cohort is empty + rule: '!has(self.cohort) && has(self.resourceGroups) ? self.resourceGroups.all(rg, + rg.flavors.all(f, f.resources.all(r, !has(r.borrowingLimit)))) : true' status: description: ClusterQueueStatus defines the observed state of ClusterQueue properties: admittedWorkloads: - description: admittedWorkloads is the number of workloads currently - admitted to this clusterQueue and haven't finished yet. 
+ description: |- + admittedWorkloads is the number of workloads currently admitted to this + clusterQueue and haven't finished yet. format: int32 type: integer conditions: - description: conditions hold the latest available observations of - the ClusterQueue current state. + description: |- + conditions hold the latest available observations of the ClusterQueue + current state. items: description: "Condition contains details for one aspect of the current - state of this API Resource. --- This struct is intended for direct - use as an array at the field path .status.conditions. For example, - \n type FooStatus struct{ // Represents the observations of a - foo's current state. // Known .status.conditions.type are: \"Available\", - \"Progressing\", and \"Degraded\" // +patchMergeKey=type // +patchStrategy=merge - // +listType=map // +listMapKey=type Conditions []metav1.Condition - `json:\"conditions,omitempty\" patchStrategy:\"merge\" patchMergeKey:\"type\" - protobuf:\"bytes,1,rep,name=conditions\"` \n // other fields }" + state of this API Resource.\n---\nThis struct is intended for + direct use as an array at the field path .status.conditions. For + example,\n\n\n\ttype FooStatus struct{\n\t // Represents the + observations of a foo's current state.\n\t // Known .status.conditions.type + are: \"Available\", \"Progressing\", and \"Degraded\"\n\t // + +patchMergeKey=type\n\t // +patchStrategy=merge\n\t // +listType=map\n\t + \ // +listMapKey=type\n\t Conditions []metav1.Condition `json:\"conditions,omitempty\" + patchStrategy:\"merge\" patchMergeKey:\"type\" protobuf:\"bytes,1,rep,name=conditions\"`\n\n\n\t + \ // other fields\n\t}" properties: lastTransitionTime: - description: lastTransitionTime is the last time the condition - transitioned from one status to another. This should be when - the underlying condition changed. If that is not known, then - using the time when the API field changed is acceptable. 
+ description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. format: date-time type: string message: - description: message is a human readable message indicating - details about the transition. This may be an empty string. + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. maxLength: 32768 type: string observedGeneration: - description: observedGeneration represents the .metadata.generation - that the condition was set based upon. For instance, if .metadata.generation - is currently 12, but the .status.conditions[x].observedGeneration - is 9, the condition is out of date with respect to the current - state of the instance. + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. format: int64 minimum: 0 type: integer reason: - description: reason contains a programmatic identifier indicating - the reason for the condition's last transition. Producers - of specific condition types may define expected values and - meanings for this field, and whether the values are considered - a guaranteed API. The value should be a CamelCase string. + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. This field may not be empty. 
maxLength: 1024 minLength: 1 @@ -604,11 +741,12 @@ spec: - Unknown type: string type: - description: type of condition in CamelCase or in foo.example.com/CamelCase. - --- Many .condition.type values are consistent across resources - like Available, but because arbitrary conditions can be useful - (see .node.status.conditions), the ability to deconflict is - important. The regex it matches is (dns1123SubdomainFmt/)?(qualifiedNameFmt) + description: |- + type of condition in CamelCase or in foo.example.com/CamelCase. + --- + Many .condition.type values are consistent across resources like Available, but because arbitrary conditions can be + useful (see .node.status.conditions), the ability to deconflict is important. + The regex it matches is (dns1123SubdomainFmt/)?(qualifiedNameFmt) maxLength: 316 pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ type: string @@ -623,13 +761,31 @@ spec: x-kubernetes-list-map-keys: - type x-kubernetes-list-type: map + fairSharing: + description: FairSharing contains the information about the current + status of fair sharing. + properties: + weightedShare: + description: |- + WeightedShare represent the maximum of the ratios of usage above nominal + quota to the lendable resources in the cohort, among all the resources + provided by the ClusterQueue, and divided by the weight. + If zero, it means that the usage of the ClusterQueue is below the nominal quota. + If the ClusterQueue has a weight of zero, this will return 9223372036854775807, + the maximum possible share value. + format: int64 + type: integer + type: object flavorsReservation: - description: flavorsReservation are the reserved quotas, by flavor, - currently in use by the workloads assigned to this ClusterQueue. + description: |- + flavorsReservation are the reserved quotas, by flavor, currently in use by the + workloads assigned to this ClusterQueue. 
items: properties: name: description: name of the flavor. + maxLength: 253 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ type: string resources: description: resources lists the quota usage for the resources @@ -640,9 +796,9 @@ spec: anyOf: - type: integer - type: string - description: Borrowed is quantity of quota that is borrowed - from the cohort. In other words, it's the used quota - that is over the nominalQuota. + description: |- + Borrowed is quantity of quota that is borrowed from the cohort. In other + words, it's the used quota that is over the nominalQuota. pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true name: @@ -652,8 +808,9 @@ spec: anyOf: - type: integer - type: string - description: total is the total quantity of used quota, - including the amount borrowed from the cohort. + description: |- + total is the total quantity of used quota, including the amount borrowed + from the cohort. pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true required: @@ -674,12 +831,15 @@ spec: - name x-kubernetes-list-type: map flavorsUsage: - description: flavorsUsage are the used quotas, by flavor, currently - in use by the workloads admitted in this ClusterQueue. + description: |- + flavorsUsage are the used quotas, by flavor, currently in use by the + workloads admitted in this ClusterQueue. items: properties: name: description: name of the flavor. + maxLength: 253 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ type: string resources: description: resources lists the quota usage for the resources @@ -690,9 +850,9 @@ spec: anyOf: - type: integer - type: string - description: Borrowed is quantity of quota that is borrowed - from the cohort. 
In other words, it's the used quota - that is over the nominalQuota. + description: |- + Borrowed is quantity of quota that is borrowed from the cohort. In other + words, it's the used quota that is over the nominalQuota. pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true name: @@ -702,8 +862,9 @@ spec: anyOf: - type: integer - type: string - description: total is the total quantity of used quota, - including the amount borrowed from the cohort. + description: |- + total is the total quantity of used quota, including the amount borrowed + from the cohort. pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true required: @@ -724,20 +885,22 @@ spec: - name x-kubernetes-list-type: map pendingWorkloads: - description: pendingWorkloads is the number of workloads currently - waiting to be admitted to this clusterQueue. + description: |- + pendingWorkloads is the number of workloads currently waiting to be + admitted to this clusterQueue. format: int32 type: integer pendingWorkloadsStatus: - description: PendingWorkloadsStatus contains the information exposed - about the current status of the pending workloads in the cluster - queue. + description: |- + PendingWorkloadsStatus contains the information exposed about the current + status of the pending workloads in the cluster queue. properties: clusterQueuePendingWorkload: description: Head contains the list of top pending workloads. items: - description: ClusterQueuePendingWorkload contains the information - identifying a pending workload in the cluster queue. + description: |- + ClusterQueuePendingWorkload contains the information identifying a pending workload + in the cluster queue. properties: name: description: Name indicates the name of the pending workload. 
@@ -761,8 +924,9 @@ spec: - lastChangeTime type: object reservingWorkloads: - description: reservingWorkloads is the number of workloads currently - reserving quota in this clusterQueue. + description: |- + reservingWorkloads is the number of workloads currently reserving quota in this + clusterQueue. format: int32 type: integer type: object @@ -778,6 +942,10 @@ metadata: annotations: cert-manager.io/inject-ca-from: $(CERTIFICATE_NAMESPACE)/$(CERTIFICATE_NAME) controller-gen.kubebuilder.io/version: v0.14.0 + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager name: localqueues.kueue.x-k8s.io spec: group: kueue.x-k8s.io @@ -788,6 +956,7 @@ spec: shortNames: - queue - queues + - lq singular: localqueue scope: Namespaced versions: @@ -810,14 +979,19 @@ spec: description: LocalQueue is the Schema for the localQueues API properties: apiVersion: - description: 'APIVersion defines the versioned schema of this representation - of an object. Servers should convert recognized schemas to the latest - internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources type: string kind: - description: 'Kind is a string value representing the REST resource this - object represents. Servers may infer this from the endpoint the client - submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' + description: |- + Kind is a string value representing the REST resource this object represents. 
+ Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds type: string metadata: type: object @@ -827,58 +1001,64 @@ spec: clusterQueue: description: clusterQueue is a reference to a clusterQueue that backs this localQueue. + maxLength: 253 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ type: string + x-kubernetes-validations: + - message: field is immutable + rule: self == oldSelf type: object status: description: LocalQueueStatus defines the observed state of LocalQueue properties: admittedWorkloads: - description: admittedWorkloads is the number of workloads in this - LocalQueue admitted to a ClusterQueue and that haven't finished - yet. + description: |- + admittedWorkloads is the number of workloads in this LocalQueue + admitted to a ClusterQueue and that haven't finished yet. format: int32 type: integer conditions: - description: Conditions hold the latest available observations of - the LocalQueue current state. + description: |- + Conditions hold the latest available observations of the LocalQueue + current state. items: description: "Condition contains details for one aspect of the current - state of this API Resource. --- This struct is intended for direct - use as an array at the field path .status.conditions. For example, - \n type FooStatus struct{ // Represents the observations of a - foo's current state. 
// Known .status.conditions.type are: \"Available\", - \"Progressing\", and \"Degraded\" // +patchMergeKey=type // +patchStrategy=merge - // +listType=map // +listMapKey=type Conditions []metav1.Condition - `json:\"conditions,omitempty\" patchStrategy:\"merge\" patchMergeKey:\"type\" - protobuf:\"bytes,1,rep,name=conditions\"` \n // other fields }" + state of this API Resource.\n---\nThis struct is intended for + direct use as an array at the field path .status.conditions. For + example,\n\n\n\ttype FooStatus struct{\n\t // Represents the + observations of a foo's current state.\n\t // Known .status.conditions.type + are: \"Available\", \"Progressing\", and \"Degraded\"\n\t // + +patchMergeKey=type\n\t // +patchStrategy=merge\n\t // +listType=map\n\t + \ // +listMapKey=type\n\t Conditions []metav1.Condition `json:\"conditions,omitempty\" + patchStrategy:\"merge\" patchMergeKey:\"type\" protobuf:\"bytes,1,rep,name=conditions\"`\n\n\n\t + \ // other fields\n\t}" properties: lastTransitionTime: - description: lastTransitionTime is the last time the condition - transitioned from one status to another. This should be when - the underlying condition changed. If that is not known, then - using the time when the API field changed is acceptable. + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. format: date-time type: string message: - description: message is a human readable message indicating - details about the transition. This may be an empty string. + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. maxLength: 32768 type: string observedGeneration: - description: observedGeneration represents the .metadata.generation - that the condition was set based upon. 
For instance, if .metadata.generation - is currently 12, but the .status.conditions[x].observedGeneration - is 9, the condition is out of date with respect to the current - state of the instance. + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. format: int64 minimum: 0 type: integer reason: - description: reason contains a programmatic identifier indicating - the reason for the condition's last transition. Producers - of specific condition types may define expected values and - meanings for this field, and whether the values are considered - a guaranteed API. The value should be a CamelCase string. + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. This field may not be empty. maxLength: 1024 minLength: 1 @@ -892,11 +1072,12 @@ spec: - Unknown type: string type: - description: type of condition in CamelCase or in foo.example.com/CamelCase. - --- Many .condition.type values are consistent across resources - like Available, but because arbitrary conditions can be useful - (see .node.status.conditions), the ability to deconflict is - important. The regex it matches is (dns1123SubdomainFmt/)?(qualifiedNameFmt) + description: |- + type of condition in CamelCase or in foo.example.com/CamelCase. + --- + Many .condition.type values are consistent across resources like Available, but because arbitrary conditions can be + useful (see .node.status.conditions), the ability to deconflict is important. 
+ The regex it matches is (dns1123SubdomainFmt/)?(qualifiedNameFmt) maxLength: 316 pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ type: string @@ -912,12 +1093,15 @@ spec: - type x-kubernetes-list-type: map flavorUsage: - description: flavorsUsage are the used quotas, by flavor currently - in use by the workloads assigned to this LocalQueue. + description: |- + flavorsUsage are the used quotas, by flavor currently in use by the + workloads assigned to this LocalQueue. items: properties: name: description: name of the flavor. + maxLength: 253 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ type: string resources: description: resources lists the quota usage for the resources @@ -952,12 +1136,15 @@ spec: - name x-kubernetes-list-type: map flavorsReservation: - description: flavorsReservation are the reserved quotas, by flavor - currently in use by the workloads assigned to this LocalQueue. + description: |- + flavorsReservation are the reserved quotas, by flavor currently in use by the + workloads assigned to this LocalQueue. items: properties: name: description: name of the flavor. + maxLength: 253 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ type: string resources: description: resources lists the quota usage for the resources @@ -997,9 +1184,9 @@ spec: format: int32 type: integer reservingWorkloads: - description: reservingWorkloads is the number of workloads in this - LocalQueue reserving quota in a ClusterQueue and that haven't finished - yet. + description: |- + reservingWorkloads is the number of workloads in this LocalQueue + reserving quota in a ClusterQueue and that haven't finished yet. 
format: int32 type: integer type: object @@ -1014,6 +1201,10 @@ kind: CustomResourceDefinition metadata: annotations: controller-gen.kubebuilder.io/version: v0.14.0 + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager name: multikueueclusters.kueue.x-k8s.io spec: group: kueue.x-k8s.io @@ -1159,6 +1350,10 @@ kind: CustomResourceDefinition metadata: annotations: controller-gen.kubebuilder.io/version: v0.14.0 + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager name: multikueueconfigs.kueue.x-k8s.io spec: group: kueue.x-k8s.io @@ -1215,6 +1410,10 @@ kind: CustomResourceDefinition metadata: annotations: controller-gen.kubebuilder.io/version: v0.14.0 + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager name: provisioningrequestconfigs.kueue.x-k8s.io spec: group: kueue.x-k8s.io @@ -1232,14 +1431,19 @@ spec: API properties: apiVersion: - description: 'APIVersion defines the versioned schema of this representation - of an object. Servers should convert recognized schemas to the latest - internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources type: string kind: - description: 'Kind is a string value representing the REST resource this - object represents. Servers may infer this from the endpoint the client - submits requests to. Cannot be updated. In CamelCase. 
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds type: string metadata: type: object @@ -1248,12 +1452,19 @@ spec: ProvisioningRequestConfig properties: managedResources: - description: "managedResources contains the list of resources managed - by the autoscaling. \n If empty, all resources are considered managed. - \n If not empty, the ProvisioningRequest will contain only the podsets - that are requesting at least one of them. \n If none of the workloads - podsets is requesting at least a managed resource, the workload - is considered ready." + description: |- + managedResources contains the list of resources managed by the autoscaling. + + + If empty, all resources are considered managed. + + + If not empty, the ProvisioningRequest will contain only the podsets that are + requesting at least one of them. + + + If none of the workloads podsets is requesting at least a managed resource, + the workload is considered ready. items: description: ResourceName is the name identifying various resources in a ResourceList. @@ -1271,9 +1482,9 @@ spec: maxProperties: 100 type: object provisioningClassName: - description: ProvisioningClassName describes the different modes of - provisioning the resources. Check autoscaling.x-k8s.io ProvisioningRequestSpec.ProvisioningClassName - for details. + description: |- + ProvisioningClassName describes the different modes of provisioning the resources. + Check autoscaling.x-k8s.io ProvisioningRequestSpec.ProvisioningClassName for details. 
maxLength: 253 pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ type: string @@ -1289,6 +1500,10 @@ kind: CustomResourceDefinition metadata: annotations: controller-gen.kubebuilder.io/version: v0.14.0 + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager name: resourceflavors.kueue.x-k8s.io spec: group: kueue.x-k8s.io @@ -1308,14 +1523,19 @@ spec: description: ResourceFlavor is the Schema for the resourceflavors API. properties: apiVersion: - description: 'APIVersion defines the versioned schema of this representation - of an object. Servers should convert recognized schemas to the latest - internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources type: string kind: - description: 'Kind is a string value representing the REST resource this - object represents. Servers may infer this from the endpoint the client - submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. 
+ More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds type: string metadata: type: object @@ -1325,38 +1545,52 @@ spec: nodeLabels: additionalProperties: type: string - description: "nodeLabels are labels that associate the ResourceFlavor - with Nodes that have the same labels. When a Workload is admitted, - its podsets can only get assigned ResourceFlavors whose nodeLabels - match the nodeSelector and nodeAffinity fields. Once a ResourceFlavor - is assigned to a podSet, the ResourceFlavor's nodeLabels should - be injected into the pods of the Workload by the controller that - integrates with the Workload object. \n nodeLabels can be up to - 8 elements." + description: |- + nodeLabels are labels that associate the ResourceFlavor with Nodes that + have the same labels. + When a Workload is admitted, its podsets can only get assigned + ResourceFlavors whose nodeLabels match the nodeSelector and nodeAffinity + fields. + Once a ResourceFlavor is assigned to a podSet, the ResourceFlavor's + nodeLabels should be injected into the pods of the Workload by the + controller that integrates with the Workload object. + + + nodeLabels can be up to 8 elements. maxProperties: 8 type: object x-kubernetes-map-type: atomic nodeTaints: - description: "nodeTaints are taints that the nodes associated with - this ResourceFlavor have. Workloads' podsets must have tolerations - for these nodeTaints in order to get assigned this ResourceFlavor - during admission. \n An example of a nodeTaint is cloud.provider.com/preemptible=\"true\":NoSchedule - \n nodeTaints can be up to 8 elements." + description: |- + nodeTaints are taints that the nodes associated with this ResourceFlavor + have. + Workloads' podsets must have tolerations for these nodeTaints in order to + get assigned this ResourceFlavor during admission. 
+ + + An example of a nodeTaint is + cloud.provider.com/preemptible="true":NoSchedule + + + nodeTaints can be up to 8 elements. items: - description: The node this Taint is attached to has the "effect" - on any pod that does not tolerate the Taint. + description: |- + The node this Taint is attached to has the "effect" on + any pod that does not tolerate the Taint. properties: effect: - description: Required. The effect of the taint on pods that - do not tolerate the taint. Valid effects are NoSchedule, PreferNoSchedule - and NoExecute. + description: |- + Required. The effect of the taint on pods + that do not tolerate the taint. + Valid effects are NoSchedule, PreferNoSchedule and NoExecute. type: string key: description: Required. The taint key to be applied to a node. type: string timeAdded: - description: TimeAdded represents the time at which the taint - was added. It is only written for NoExecute taints. + description: |- + TimeAdded represents the time at which the taint was added. + It is only written for NoExecute taints. format: date-time type: string value: @@ -1369,51 +1603,78 @@ spec: maxItems: 8 type: array x-kubernetes-list-type: atomic + x-kubernetes-validations: + - message: 'supported taint effect values: ''NoSchedule'', ''PreferNoSchedule'', + ''NoExecute''' + rule: self.all(x, x.effect in ['NoSchedule', 'PreferNoSchedule', + 'NoExecute']) tolerations: - description: "tolerations are extra tolerations that will be added - to the pods admitted in the quota associated with this resource - flavor. \n An example of a toleration is cloud.provider.com/preemptible=\"true\":NoSchedule - \n tolerations can be up to 8 elements." + description: |- + tolerations are extra tolerations that will be added to the pods admitted in + the quota associated with this resource flavor. + + + An example of a toleration is + cloud.provider.com/preemptible="true":NoSchedule + + + tolerations can be up to 8 elements. 
items: - description: The pod this Toleration is attached to tolerates any - taint that matches the triple using the matching - operator . + description: |- + The pod this Toleration is attached to tolerates any taint that matches + the triple using the matching operator . properties: effect: - description: Effect indicates the taint effect to match. Empty - means match all taint effects. When specified, allowed values - are NoSchedule, PreferNoSchedule and NoExecute. + description: |- + Effect indicates the taint effect to match. Empty means match all taint effects. + When specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute. type: string key: - description: Key is the taint key that the toleration applies - to. Empty means match all taint keys. If the key is empty, - operator must be Exists; this combination means to match all - values and all keys. + description: |- + Key is the taint key that the toleration applies to. Empty means match all taint keys. + If the key is empty, operator must be Exists; this combination means to match all values and all keys. type: string operator: - description: Operator represents a key's relationship to the - value. Valid operators are Exists and Equal. Defaults to Equal. - Exists is equivalent to wildcard for value, so that a pod - can tolerate all taints of a particular category. + description: |- + Operator represents a key's relationship to the value. + Valid operators are Exists and Equal. Defaults to Equal. + Exists is equivalent to wildcard for value, so that a pod can + tolerate all taints of a particular category. type: string tolerationSeconds: - description: TolerationSeconds represents the period of time - the toleration (which must be of effect NoExecute, otherwise - this field is ignored) tolerates the taint. By default, it - is not set, which means tolerate the taint forever (do not - evict). Zero and negative values will be treated as 0 (evict - immediately) by the system. 
+ description: |- + TolerationSeconds represents the period of time the toleration (which must be + of effect NoExecute, otherwise this field is ignored) tolerates the taint. By default, + it is not set, which means tolerate the taint forever (do not evict). Zero and + negative values will be treated as 0 (evict immediately) by the system. format: int64 type: integer value: - description: Value is the taint value the toleration matches - to. If the operator is Exists, the value should be empty, - otherwise just a regular string. + description: |- + Value is the taint value the toleration matches to. + If the operator is Exists, the value should be empty, otherwise just a regular string. type: string type: object maxItems: 8 type: array x-kubernetes-list-type: atomic + x-kubernetes-validations: + - message: operator must be Exists when 'key' is empty, which means + 'match all values and all keys' + rule: 'self.all(x, !has(x.key) ? x.operator == ''Exists'' : true)' + - message: effect must be 'NoExecute' when 'tolerationSeconds' is + set + rule: 'self.all(x, has(x.tolerationSeconds) ? x.effect == ''NoExecute'' + : true)' + - message: 'supported toleration values: ''Equal''(default), ''Exists''' + rule: self.all(x, !has(x.operator) || x.operator in ['Equal', 'Exists']) + - message: a value must be empty when 'operator' is 'Exists' + rule: 'self.all(x, has(x.operator) && x.operator == ''Exists'' ? 
+ !has(x.value) : true)' + - message: 'supported taint effect values: ''NoSchedule'', ''PreferNoSchedule'', + ''NoExecute''' + rule: self.all(x, !has(x.effect) || x.effect in ['NoSchedule', 'PreferNoSchedule', + 'NoExecute']) type: object type: object served: true @@ -1424,6 +1685,10 @@ kind: CustomResourceDefinition metadata: annotations: controller-gen.kubebuilder.io/version: v0.14.0 + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager name: workloadpriorityclasses.kueue.x-k8s.io spec: group: kueue.x-k8s.io @@ -1446,27 +1711,32 @@ spec: API properties: apiVersion: - description: 'APIVersion defines the versioned schema of this representation - of an object. Servers should convert recognized schemas to the latest - internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources type: string description: - description: description is an arbitrary string that usually provides - guidelines on when this workloadPriorityClass should be used. + description: |- + description is an arbitrary string that usually provides guidelines on + when this workloadPriorityClass should be used. type: string kind: - description: 'Kind is a string value representing the REST resource this - object represents. Servers may infer this from the endpoint the client - submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' + description: |- + Kind is a string value representing the REST resource this object represents. 
+ Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds type: string metadata: type: object value: - description: value represents the integer value of this workloadPriorityClass. - This is the actual priority that workloads receive when jobs have the - name of this class in their workloadPriorityClass label. Changing the - value of workloadPriorityClass doesn't affect the priority of workloads - that were already created. + description: |- + value represents the integer value of this workloadPriorityClass. This is the actual priority that workloads + receive when jobs have the name of this class in their workloadPriorityClass label. + Changing the value of workloadPriorityClass doesn't affect the priority of workloads that were already created. format: int32 type: integer required: @@ -1482,6 +1752,10 @@ metadata: annotations: cert-manager.io/inject-ca-from: $(CERTIFICATE_NAMESPACE)/$(CERTIFICATE_NAME) controller-gen.kubebuilder.io/version: v0.14.0 + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager name: workloads.kueue.x-k8s.io spec: group: kueue.x-k8s.io @@ -1499,9 +1773,13 @@ spec: jsonPath: .spec.queueName name: Queue type: string - - description: Name of the ClusterQueue that admitted this workload + - description: Name of the ClusterQueue where the workload is reserving quota jsonPath: .status.admission.clusterQueue - name: Admitted by + name: Reserved in + type: string + - description: Admission status + jsonPath: .status.conditions[?(@.type=='Admitted')].status + name: Admitted type: string - description: Time this workload was created jsonPath: .metadata.creationTimestamp @@ -1513,56 +1791,99 @@ spec: description: Workload is the Schema for the workloads API properties: apiVersion: - description: 'APIVersion defines the 
versioned schema of this representation - of an object. Servers should convert recognized schemas to the latest - internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources type: string kind: - description: 'Kind is a string value representing the REST resource this - object represents. Servers may infer this from the endpoint the client - submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds type: string metadata: type: object spec: description: WorkloadSpec defines the desired state of Workload properties: + active: + default: true + description: |- + Active determines if a workload can be admitted into a queue. + Changing active from true to false will evict any running workloads. + Possible values are: + + + - false: indicates that a workload should never be admitted and evicts running workloads + - true: indicates that a workload can be evaluated for admission into it's respective queue. + + + Defaults to true + type: boolean podSets: - description: podSets is a list of sets of homogeneous pods, each described - by a Pod spec and a count. There must be at least one element and - at most 8. podSets cannot be changed. 
+ description: |- + podSets is a list of sets of homogeneous pods, each described by a Pod spec + and a count. + There must be at least one element and at most 8. + podSets cannot be changed. items: properties: count: + default: 1 description: count is the number of pods for the spec. format: int32 - minimum: 1 + minimum: 0 type: integer minCount: - description: "minCount is the minimum number of pods for the - spec acceptable if the workload supports partial admission. - \n If not provided, partial admission for the current PodSet - is not enabled. \n Only one podSet within the workload can - use this. \n This is an alpha field and requires enabling - PartialAdmission feature gate." + description: |- + minCount is the minimum number of pods for the spec acceptable + if the workload supports partial admission. + + + If not provided, partial admission for the current PodSet is not + enabled. + + + Only one podSet within the workload can use this. + + + This is an alpha field and requires enabling PartialAdmission feature gate. format: int32 + minimum: 1 type: integer name: + default: main description: name is the PodSet name. + maxLength: 63 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?$ type: string template: - description: "template is the Pod template. \n The only allowed - fields in template.metadata are labels and annotations. \n + description: |- + template is the Pod template. + + + The only allowed fields in template.metadata are labels and annotations. + + If requests are omitted for a container or initContainer, - they default to the limits if they are explicitly specified - for the container or initContainer. \n During admission, the - rules in nodeSelector and nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution - that match the keys in the nodeLabels from the ResourceFlavors - considered for this Workload are used to filter the ResourceFlavors - that can be assigned to this podSet." 
+ they default to the limits if they are explicitly specified for the + container or initContainer. + + + During admission, the rules in nodeSelector and + nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution that match + the keys in the nodeLabels from the ResourceFlavors considered for this + Workload are used to filter the ResourceFlavors that can be assigned to + this podSet. properties: metadata: - description: 'Standard object''s metadata. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#metadata' + description: |- + Standard object's metadata. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#metadata properties: annotations: additionalProperties: @@ -1582,15 +1903,15 @@ spec: type: string type: object spec: - description: 'Specification of the desired behavior of the - pod. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#spec-and-status' + description: |- + Specification of the desired behavior of the pod. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#spec-and-status properties: activeDeadlineSeconds: - description: Optional duration in seconds the pod may - be active on the node relative to StartTime before - the system will actively try to mark it failed and - kill associated containers. Value must be a positive - integer. + description: |- + Optional duration in seconds the pod may be active on the node relative to + StartTime before the system will actively try to mark it failed and kill associated containers. + Value must be a positive integer. format: int64 type: integer affinity: @@ -1601,26 +1922,20 @@ spec: rules for the pod. 
properties: preferredDuringSchedulingIgnoredDuringExecution: - description: The scheduler will prefer to schedule - pods to nodes that satisfy the affinity expressions - specified by this field, but it may choose - a node that violates one or more of the expressions. - The node that is most preferred is the one - with the greatest sum of weights, i.e. for - each node that meets all of the scheduling - requirements (resource request, requiredDuringScheduling - affinity expressions, etc.), compute a sum - by iterating through the elements of this - field and adding "weight" to the sum if the - node matches the corresponding matchExpressions; - the node(s) with the highest sum are the most - preferred. + description: |- + The scheduler will prefer to schedule pods to nodes that satisfy + the affinity expressions specified by this field, but it may choose + a node that violates one or more of the expressions. The node that is + most preferred is the one with the greatest sum of weights, i.e. + for each node that meets all of the scheduling requirements (resource + request, requiredDuringScheduling affinity expressions, etc.), + compute a sum by iterating through the elements of this field and adding + "weight" to the sum if the node matches the corresponding matchExpressions; the + node(s) with the highest sum are the most preferred. items: - description: An empty preferred scheduling - term matches all objects with implicit weight - 0 (i.e. it's a no-op). A null preferred - scheduling term matches no objects (i.e. - is also a no-op). + description: |- + An empty preferred scheduling term matches all objects with implicit weight 0 + (i.e. it's a no-op). A null preferred scheduling term matches no objects (i.e. is also a no-op). properties: preference: description: A node selector term, associated @@ -1630,84 +1945,70 @@ spec: description: A list of node selector requirements by node's labels. 
items: - description: A node selector requirement - is a selector that contains values, - a key, and an operator that relates - the key and values. + description: |- + A node selector requirement is a selector that contains values, a key, and an operator + that relates the key and values. properties: key: description: The label key that the selector applies to. type: string operator: - description: Represents a key's - relationship to a set of values. - Valid operators are In, NotIn, - Exists, DoesNotExist. Gt, - and Lt. + description: |- + Represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt. type: string values: - description: An array of string - values. If the operator is - In or NotIn, the values array - must be non-empty. If the - operator is Exists or DoesNotExist, - the values array must be empty. - If the operator is Gt or Lt, - the values array must have - a single element, which will - be interpreted as an integer. - This array is replaced during - a strategic merge patch. + description: |- + An array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. If the operator is Gt or Lt, the values + array must have a single element, which will be interpreted as an integer. + This array is replaced during a strategic merge patch. items: type: string type: array + x-kubernetes-list-type: atomic required: - key - operator type: object type: array + x-kubernetes-list-type: atomic matchFields: description: A list of node selector requirements by node's fields. items: - description: A node selector requirement - is a selector that contains values, - a key, and an operator that relates - the key and values. + description: |- + A node selector requirement is a selector that contains values, a key, and an operator + that relates the key and values. 
properties: key: description: The label key that the selector applies to. type: string operator: - description: Represents a key's - relationship to a set of values. - Valid operators are In, NotIn, - Exists, DoesNotExist. Gt, - and Lt. + description: |- + Represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt. type: string values: - description: An array of string - values. If the operator is - In or NotIn, the values array - must be non-empty. If the - operator is Exists or DoesNotExist, - the values array must be empty. - If the operator is Gt or Lt, - the values array must have - a single element, which will - be interpreted as an integer. - This array is replaced during - a strategic merge patch. + description: |- + An array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. If the operator is Gt or Lt, the values + array must have a single element, which will be interpreted as an integer. + This array is replaced during a strategic merge patch. items: type: string type: array + x-kubernetes-list-type: atomic required: - key - operator type: object type: array + x-kubernetes-list-type: atomic type: object x-kubernetes-map-type: atomic weight: @@ -1721,110 +2022,96 @@ spec: - weight type: object type: array + x-kubernetes-list-type: atomic requiredDuringSchedulingIgnoredDuringExecution: - description: If the affinity requirements specified - by this field are not met at scheduling time, - the pod will not be scheduled onto the node. - If the affinity requirements specified by - this field cease to be met at some point during - pod execution (e.g. due to an update), the - system may or may not try to eventually evict - the pod from its node. 
+ description: |- + If the affinity requirements specified by this field are not met at + scheduling time, the pod will not be scheduled onto the node. + If the affinity requirements specified by this field cease to be met + at some point during pod execution (e.g. due to an update), the system + may or may not try to eventually evict the pod from its node. properties: nodeSelectorTerms: description: Required. A list of node selector terms. The terms are ORed. items: - description: A null or empty node selector - term matches no objects. The requirements - of them are ANDed. The TopologySelectorTerm - type implements a subset of the NodeSelectorTerm. + description: |- + A null or empty node selector term matches no objects. The requirements of + them are ANDed. + The TopologySelectorTerm type implements a subset of the NodeSelectorTerm. properties: matchExpressions: description: A list of node selector requirements by node's labels. items: - description: A node selector requirement - is a selector that contains values, - a key, and an operator that relates - the key and values. + description: |- + A node selector requirement is a selector that contains values, a key, and an operator + that relates the key and values. properties: key: description: The label key that the selector applies to. type: string operator: - description: Represents a key's - relationship to a set of values. - Valid operators are In, NotIn, - Exists, DoesNotExist. Gt, - and Lt. + description: |- + Represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt. type: string values: - description: An array of string - values. If the operator is - In or NotIn, the values array - must be non-empty. If the - operator is Exists or DoesNotExist, - the values array must be empty. - If the operator is Gt or Lt, - the values array must have - a single element, which will - be interpreted as an integer. 
- This array is replaced during - a strategic merge patch. + description: |- + An array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. If the operator is Gt or Lt, the values + array must have a single element, which will be interpreted as an integer. + This array is replaced during a strategic merge patch. items: type: string type: array + x-kubernetes-list-type: atomic required: - key - operator type: object type: array + x-kubernetes-list-type: atomic matchFields: description: A list of node selector requirements by node's fields. items: - description: A node selector requirement - is a selector that contains values, - a key, and an operator that relates - the key and values. + description: |- + A node selector requirement is a selector that contains values, a key, and an operator + that relates the key and values. properties: key: description: The label key that the selector applies to. type: string operator: - description: Represents a key's - relationship to a set of values. - Valid operators are In, NotIn, - Exists, DoesNotExist. Gt, - and Lt. + description: |- + Represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt. type: string values: - description: An array of string - values. If the operator is - In or NotIn, the values array - must be non-empty. If the - operator is Exists or DoesNotExist, - the values array must be empty. - If the operator is Gt or Lt, - the values array must have - a single element, which will - be interpreted as an integer. - This array is replaced during - a strategic merge patch. + description: |- + An array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. 
If the operator is Gt or Lt, the values + array must have a single element, which will be interpreted as an integer. + This array is replaced during a strategic merge patch. items: type: string type: array + x-kubernetes-list-type: atomic required: - key - operator type: object type: array + x-kubernetes-list-type: atomic type: object x-kubernetes-map-type: atomic type: array + x-kubernetes-list-type: atomic required: - nodeSelectorTerms type: object @@ -1836,20 +2123,16 @@ spec: etc. as some other pod(s)). properties: preferredDuringSchedulingIgnoredDuringExecution: - description: The scheduler will prefer to schedule - pods to nodes that satisfy the affinity expressions - specified by this field, but it may choose - a node that violates one or more of the expressions. - The node that is most preferred is the one - with the greatest sum of weights, i.e. for - each node that meets all of the scheduling - requirements (resource request, requiredDuringScheduling - affinity expressions, etc.), compute a sum - by iterating through the elements of this - field and adding "weight" to the sum if the - node has pods which matches the corresponding - podAffinityTerm; the node(s) with the highest - sum are the most preferred. + description: |- + The scheduler will prefer to schedule pods to nodes that satisfy + the affinity expressions specified by this field, but it may choose + a node that violates one or more of the expressions. The node that is + most preferred is the one with the greatest sum of weights, i.e. + for each node that meets all of the scheduling requirements (resource + request, requiredDuringScheduling affinity expressions, etc.), + compute a sum by iterating through the elements of this field and adding + "weight" to the sum if the node has pods which matches the corresponding podAffinityTerm; the + node(s) with the highest sum are the most preferred. 
items: description: The weights of all of the matched WeightedPodAffinityTerm fields are added @@ -1861,8 +2144,9 @@ spec: weight. properties: labelSelector: - description: A label query over a - set of resources, in this case pods. + description: |- + A label query over a set of resources, in this case pods. + If it's null, this PodAffinityTerm matches with no Pods. properties: matchExpressions: description: matchExpressions @@ -1870,11 +2154,9 @@ spec: requirements. The requirements are ANDed. items: - description: A label selector - requirement is a selector - that contains values, a key, - and an operator that relates - the key and values. + description: |- + A label selector requirement is a selector that contains values, a key, and an operator that + relates the key and values. properties: key: description: key is the @@ -1882,43 +2164,33 @@ spec: applies to. type: string operator: - description: operator represents - a key's relationship to - a set of values. Valid - operators are In, NotIn, - Exists and DoesNotExist. + description: |- + operator represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists and DoesNotExist. type: string values: - description: values is an - array of string values. - If the operator is In - or NotIn, the values array - must be non-empty. If - the operator is Exists - or DoesNotExist, the values - array must be empty. This - array is replaced during - a strategic merge patch. + description: |- + values is an array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. This array is replaced during a strategic + merge patch. items: type: string type: array + x-kubernetes-list-type: atomic required: - key - operator type: object type: array + x-kubernetes-list-type: atomic matchLabels: additionalProperties: type: string - description: matchLabels is a - map of {key,value} pairs. 
A - single {key,value} in the matchLabels - map is equivalent to an element - of matchExpressions, whose key - field is "key", the operator - is "In", and the values array - contains only "value". The requirements - are ANDed. + description: |- + matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels + map is equivalent to an element of matchExpressions, whose key field is "key", the + operator is "In", and the values array contains only "value". The requirements are ANDed. type: object type: object x-kubernetes-map-type: atomic @@ -1926,12 +2198,12 @@ spec: description: |- MatchLabelKeys is a set of pod label keys to select which pods will be taken into consideration. The keys are used to lookup values from the - incoming pod labels, those key-value labels are merged with `LabelSelector` as `key in (value)` + incoming pod labels, those key-value labels are merged with `labelSelector` as `key in (value)` to select the group of existing pods which pods will be taken into consideration for the incoming pod's pod (anti) affinity. Keys that don't exist in the incoming pod labels will be ignored. The default value is empty. - The same key is forbidden to exist in both MatchLabelKeys and LabelSelector. - Also, MatchLabelKeys cannot be set when LabelSelector isn't set. + The same key is forbidden to exist in both matchLabelKeys and labelSelector. + Also, matchLabelKeys cannot be set when labelSelector isn't set. This is an alpha field and requires enabling MatchLabelKeysInPodAffinity feature gate. items: type: string @@ -1941,28 +2213,24 @@ spec: description: |- MismatchLabelKeys is a set of pod label keys to select which pods will be taken into consideration. 
The keys are used to lookup values from the - incoming pod labels, those key-value labels are merged with `LabelSelector` as `key notin (value)` + incoming pod labels, those key-value labels are merged with `labelSelector` as `key notin (value)` to select the group of existing pods which pods will be taken into consideration for the incoming pod's pod (anti) affinity. Keys that don't exist in the incoming pod labels will be ignored. The default value is empty. - The same key is forbidden to exist in both MismatchLabelKeys and LabelSelector. - Also, MismatchLabelKeys cannot be set when LabelSelector isn't set. + The same key is forbidden to exist in both mismatchLabelKeys and labelSelector. + Also, mismatchLabelKeys cannot be set when labelSelector isn't set. This is an alpha field and requires enabling MatchLabelKeysInPodAffinity feature gate. items: type: string type: array x-kubernetes-list-type: atomic namespaceSelector: - description: A label query over the - set of namespaces that the term - applies to. The term is applied - to the union of the namespaces selected - by this field and the ones listed - in the namespaces field. null selector - and null or empty namespaces list - means "this pod's namespace". An - empty selector ({}) matches all - namespaces. + description: |- + A label query over the set of namespaces that the term applies to. + The term is applied to the union of the namespaces selected by this field + and the ones listed in the namespaces field. + null selector and null or empty namespaces list means "this pod's namespace". + An empty selector ({}) matches all namespaces. properties: matchExpressions: description: matchExpressions @@ -1970,11 +2238,9 @@ spec: requirements. The requirements are ANDed. items: - description: A label selector - requirement is a selector - that contains values, a key, - and an operator that relates - the key and values. 
+ description: |- + A label selector requirement is a selector that contains values, a key, and an operator that + relates the key and values. properties: key: description: key is the @@ -1982,78 +2248,61 @@ spec: applies to. type: string operator: - description: operator represents - a key's relationship to - a set of values. Valid - operators are In, NotIn, - Exists and DoesNotExist. + description: |- + operator represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists and DoesNotExist. type: string values: - description: values is an - array of string values. - If the operator is In - or NotIn, the values array - must be non-empty. If - the operator is Exists - or DoesNotExist, the values - array must be empty. This - array is replaced during - a strategic merge patch. + description: |- + values is an array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. This array is replaced during a strategic + merge patch. items: type: string type: array + x-kubernetes-list-type: atomic required: - key - operator type: object type: array + x-kubernetes-list-type: atomic matchLabels: additionalProperties: type: string - description: matchLabels is a - map of {key,value} pairs. A - single {key,value} in the matchLabels - map is equivalent to an element - of matchExpressions, whose key - field is "key", the operator - is "In", and the values array - contains only "value". The requirements - are ANDed. + description: |- + matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels + map is equivalent to an element of matchExpressions, whose key field is "key", the + operator is "In", and the values array contains only "value". The requirements are ANDed. 
type: object type: object x-kubernetes-map-type: atomic namespaces: - description: namespaces specifies - a static list of namespace names - that the term applies to. The term - is applied to the union of the namespaces - listed in this field and the ones - selected by namespaceSelector. null - or empty namespaces list and null - namespaceSelector means "this pod's - namespace". + description: |- + namespaces specifies a static list of namespace names that the term applies to. + The term is applied to the union of the namespaces listed in this field + and the ones selected by namespaceSelector. + null or empty namespaces list and null namespaceSelector means "this pod's namespace". items: type: string type: array + x-kubernetes-list-type: atomic topologyKey: - description: This pod should be co-located - (affinity) or not co-located (anti-affinity) - with the pods matching the labelSelector - in the specified namespaces, where - co-located is defined as running - on a node whose value of the label - with key topologyKey matches that - of any node on which any of the - selected pods is running. Empty - topologyKey is not allowed. + description: |- + This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching + the labelSelector in the specified namespaces, where co-located is defined as running on a node + whose value of the label with key topologyKey matches that of any node on which any of the + selected pods is running. + Empty topologyKey is not allowed. type: string required: - topologyKey type: object weight: - description: weight associated with matching - the corresponding podAffinityTerm, in - the range 1-100. + description: |- + weight associated with matching the corresponding podAffinityTerm, + in the range 1-100. 
format: int32 type: integer required: @@ -2061,42 +2310,38 @@ spec: - weight type: object type: array + x-kubernetes-list-type: atomic requiredDuringSchedulingIgnoredDuringExecution: - description: If the affinity requirements specified - by this field are not met at scheduling time, - the pod will not be scheduled onto the node. - If the affinity requirements specified by - this field cease to be met at some point during - pod execution (e.g. due to a pod label update), - the system may or may not try to eventually - evict the pod from its node. When there are - multiple elements, the lists of nodes corresponding - to each podAffinityTerm are intersected, i.e. - all terms must be satisfied. + description: |- + If the affinity requirements specified by this field are not met at + scheduling time, the pod will not be scheduled onto the node. + If the affinity requirements specified by this field cease to be met + at some point during pod execution (e.g. due to a pod label update), the + system may or may not try to eventually evict the pod from its node. + When there are multiple elements, the lists of nodes corresponding to each + podAffinityTerm are intersected, i.e. all terms must be satisfied. 
items: - description: Defines a set of pods (namely - those matching the labelSelector relative - to the given namespace(s)) that this pod - should be co-located (affinity) or not co-located - (anti-affinity) with, where co-located is - defined as running on a node whose value - of the label with key matches - that of any node on which a pod of the set - of pods is running + description: |- + Defines a set of pods (namely those matching the labelSelector + relative to the given namespace(s)) that this pod should be + co-located (affinity) or not co-located (anti-affinity) with, + where co-located is defined as running on a node whose value of + the label with key matches that of any node on which + a pod of the set of pods is running properties: labelSelector: - description: A label query over a set - of resources, in this case pods. + description: |- + A label query over a set of resources, in this case pods. + If it's null, this PodAffinityTerm matches with no Pods. properties: matchExpressions: description: matchExpressions is a list of label selector requirements. The requirements are ANDed. items: - description: A label selector requirement - is a selector that contains values, - a key, and an operator that relates - the key and values. + description: |- + A label selector requirement is a selector that contains values, a key, and an operator that + relates the key and values. properties: key: description: key is the label @@ -2104,41 +2349,33 @@ spec: to. type: string operator: - description: operator represents - a key's relationship to a - set of values. Valid operators - are In, NotIn, Exists and - DoesNotExist. + description: |- + operator represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists and DoesNotExist. type: string values: - description: values is an array - of string values. If the operator - is In or NotIn, the values - array must be non-empty. 
If - the operator is Exists or - DoesNotExist, the values array - must be empty. This array - is replaced during a strategic + description: |- + values is an array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. This array is replaced during a strategic merge patch. items: type: string type: array + x-kubernetes-list-type: atomic required: - key - operator type: object type: array + x-kubernetes-list-type: atomic matchLabels: additionalProperties: type: string - description: matchLabels is a map - of {key,value} pairs. A single {key,value} - in the matchLabels map is equivalent - to an element of matchExpressions, - whose key field is "key", the operator - is "In", and the values array contains - only "value". The requirements are - ANDed. + description: |- + matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels + map is equivalent to an element of matchExpressions, whose key field is "key", the + operator is "In", and the values array contains only "value". The requirements are ANDed. type: object type: object x-kubernetes-map-type: atomic @@ -2146,12 +2383,12 @@ spec: description: |- MatchLabelKeys is a set of pod label keys to select which pods will be taken into consideration. The keys are used to lookup values from the - incoming pod labels, those key-value labels are merged with `LabelSelector` as `key in (value)` + incoming pod labels, those key-value labels are merged with `labelSelector` as `key in (value)` to select the group of existing pods which pods will be taken into consideration for the incoming pod's pod (anti) affinity. Keys that don't exist in the incoming pod labels will be ignored. The default value is empty. - The same key is forbidden to exist in both MatchLabelKeys and LabelSelector. - Also, MatchLabelKeys cannot be set when LabelSelector isn't set. 
+ The same key is forbidden to exist in both matchLabelKeys and labelSelector. + Also, matchLabelKeys cannot be set when labelSelector isn't set. This is an alpha field and requires enabling MatchLabelKeysInPodAffinity feature gate. items: type: string @@ -2161,25 +2398,23 @@ spec: description: |- MismatchLabelKeys is a set of pod label keys to select which pods will be taken into consideration. The keys are used to lookup values from the - incoming pod labels, those key-value labels are merged with `LabelSelector` as `key notin (value)` + incoming pod labels, those key-value labels are merged with `labelSelector` as `key notin (value)` to select the group of existing pods which pods will be taken into consideration for the incoming pod's pod (anti) affinity. Keys that don't exist in the incoming pod labels will be ignored. The default value is empty. - The same key is forbidden to exist in both MismatchLabelKeys and LabelSelector. - Also, MismatchLabelKeys cannot be set when LabelSelector isn't set. + The same key is forbidden to exist in both mismatchLabelKeys and labelSelector. + Also, mismatchLabelKeys cannot be set when labelSelector isn't set. This is an alpha field and requires enabling MatchLabelKeysInPodAffinity feature gate. items: type: string type: array x-kubernetes-list-type: atomic namespaceSelector: - description: A label query over the set - of namespaces that the term applies - to. The term is applied to the union - of the namespaces selected by this field - and the ones listed in the namespaces - field. null selector and null or empty - namespaces list means "this pod's namespace". + description: |- + A label query over the set of namespaces that the term applies to. + The term is applied to the union of the namespaces selected by this field + and the ones listed in the namespaces field. + null selector and null or empty namespaces list means "this pod's namespace". An empty selector ({}) matches all namespaces. 
properties: matchExpressions: @@ -2187,10 +2422,9 @@ spec: list of label selector requirements. The requirements are ANDed. items: - description: A label selector requirement - is a selector that contains values, - a key, and an operator that relates - the key and values. + description: |- + A label selector requirement is a selector that contains values, a key, and an operator that + relates the key and values. properties: key: description: key is the label @@ -2198,71 +2432,59 @@ spec: to. type: string operator: - description: operator represents - a key's relationship to a - set of values. Valid operators - are In, NotIn, Exists and - DoesNotExist. + description: |- + operator represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists and DoesNotExist. type: string values: - description: values is an array - of string values. If the operator - is In or NotIn, the values - array must be non-empty. If - the operator is Exists or - DoesNotExist, the values array - must be empty. This array - is replaced during a strategic + description: |- + values is an array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. This array is replaced during a strategic merge patch. items: type: string type: array + x-kubernetes-list-type: atomic required: - key - operator type: object type: array + x-kubernetes-list-type: atomic matchLabels: additionalProperties: type: string - description: matchLabels is a map - of {key,value} pairs. A single {key,value} - in the matchLabels map is equivalent - to an element of matchExpressions, - whose key field is "key", the operator - is "In", and the values array contains - only "value". The requirements are - ANDed. + description: |- + matchLabels is a map of {key,value} pairs. 
A single {key,value} in the matchLabels + map is equivalent to an element of matchExpressions, whose key field is "key", the + operator is "In", and the values array contains only "value". The requirements are ANDed. type: object type: object x-kubernetes-map-type: atomic namespaces: - description: namespaces specifies a static - list of namespace names that the term - applies to. The term is applied to the - union of the namespaces listed in this - field and the ones selected by namespaceSelector. - null or empty namespaces list and null - namespaceSelector means "this pod's - namespace". + description: |- + namespaces specifies a static list of namespace names that the term applies to. + The term is applied to the union of the namespaces listed in this field + and the ones selected by namespaceSelector. + null or empty namespaces list and null namespaceSelector means "this pod's namespace". items: type: string type: array + x-kubernetes-list-type: atomic topologyKey: - description: This pod should be co-located - (affinity) or not co-located (anti-affinity) - with the pods matching the labelSelector - in the specified namespaces, where co-located - is defined as running on a node whose - value of the label with key topologyKey - matches that of any node on which any - of the selected pods is running. Empty - topologyKey is not allowed. + description: |- + This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching + the labelSelector in the specified namespaces, where co-located is defined as running on a node + whose value of the label with key topologyKey matches that of any node on which any of the + selected pods is running. + Empty topologyKey is not allowed. type: string required: - topologyKey type: object type: array + x-kubernetes-list-type: atomic type: object podAntiAffinity: description: Describes pod anti-affinity scheduling @@ -2270,21 +2492,16 @@ spec: node, zone, etc. as some other pod(s)). 
properties: preferredDuringSchedulingIgnoredDuringExecution: - description: The scheduler will prefer to schedule - pods to nodes that satisfy the anti-affinity - expressions specified by this field, but it - may choose a node that violates one or more - of the expressions. The node that is most - preferred is the one with the greatest sum - of weights, i.e. for each node that meets - all of the scheduling requirements (resource - request, requiredDuringScheduling anti-affinity - expressions, etc.), compute a sum by iterating - through the elements of this field and adding - "weight" to the sum if the node has pods which - matches the corresponding podAffinityTerm; - the node(s) with the highest sum are the most - preferred. + description: |- + The scheduler will prefer to schedule pods to nodes that satisfy + the anti-affinity expressions specified by this field, but it may choose + a node that violates one or more of the expressions. The node that is + most preferred is the one with the greatest sum of weights, i.e. + for each node that meets all of the scheduling requirements (resource + request, requiredDuringScheduling anti-affinity expressions, etc.), + compute a sum by iterating through the elements of this field and adding + "weight" to the sum if the node has pods which matches the corresponding podAffinityTerm; the + node(s) with the highest sum are the most preferred. items: description: The weights of all of the matched WeightedPodAffinityTerm fields are added @@ -2296,8 +2513,9 @@ spec: weight. properties: labelSelector: - description: A label query over a - set of resources, in this case pods. + description: |- + A label query over a set of resources, in this case pods. + If it's null, this PodAffinityTerm matches with no Pods. properties: matchExpressions: description: matchExpressions @@ -2305,11 +2523,9 @@ spec: requirements. The requirements are ANDed. 
items: - description: A label selector - requirement is a selector - that contains values, a key, - and an operator that relates - the key and values. + description: |- + A label selector requirement is a selector that contains values, a key, and an operator that + relates the key and values. properties: key: description: key is the @@ -2317,43 +2533,33 @@ spec: applies to. type: string operator: - description: operator represents - a key's relationship to - a set of values. Valid - operators are In, NotIn, - Exists and DoesNotExist. + description: |- + operator represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists and DoesNotExist. type: string values: - description: values is an - array of string values. - If the operator is In - or NotIn, the values array - must be non-empty. If - the operator is Exists - or DoesNotExist, the values - array must be empty. This - array is replaced during - a strategic merge patch. + description: |- + values is an array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. This array is replaced during a strategic + merge patch. items: type: string type: array + x-kubernetes-list-type: atomic required: - key - operator type: object type: array + x-kubernetes-list-type: atomic matchLabels: additionalProperties: type: string - description: matchLabels is a - map of {key,value} pairs. A - single {key,value} in the matchLabels - map is equivalent to an element - of matchExpressions, whose key - field is "key", the operator - is "In", and the values array - contains only "value". The requirements - are ANDed. + description: |- + matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels + map is equivalent to an element of matchExpressions, whose key field is "key", the + operator is "In", and the values array contains only "value". The requirements are ANDed. 
type: object type: object x-kubernetes-map-type: atomic @@ -2361,12 +2567,12 @@ spec: description: |- MatchLabelKeys is a set of pod label keys to select which pods will be taken into consideration. The keys are used to lookup values from the - incoming pod labels, those key-value labels are merged with `LabelSelector` as `key in (value)` + incoming pod labels, those key-value labels are merged with `labelSelector` as `key in (value)` to select the group of existing pods which pods will be taken into consideration for the incoming pod's pod (anti) affinity. Keys that don't exist in the incoming pod labels will be ignored. The default value is empty. - The same key is forbidden to exist in both MatchLabelKeys and LabelSelector. - Also, MatchLabelKeys cannot be set when LabelSelector isn't set. + The same key is forbidden to exist in both matchLabelKeys and labelSelector. + Also, matchLabelKeys cannot be set when labelSelector isn't set. This is an alpha field and requires enabling MatchLabelKeysInPodAffinity feature gate. items: type: string @@ -2376,28 +2582,24 @@ spec: description: |- MismatchLabelKeys is a set of pod label keys to select which pods will be taken into consideration. The keys are used to lookup values from the - incoming pod labels, those key-value labels are merged with `LabelSelector` as `key notin (value)` + incoming pod labels, those key-value labels are merged with `labelSelector` as `key notin (value)` to select the group of existing pods which pods will be taken into consideration for the incoming pod's pod (anti) affinity. Keys that don't exist in the incoming pod labels will be ignored. The default value is empty. - The same key is forbidden to exist in both MismatchLabelKeys and LabelSelector. - Also, MismatchLabelKeys cannot be set when LabelSelector isn't set. + The same key is forbidden to exist in both mismatchLabelKeys and labelSelector. + Also, mismatchLabelKeys cannot be set when labelSelector isn't set. 
This is an alpha field and requires enabling MatchLabelKeysInPodAffinity feature gate. items: type: string type: array x-kubernetes-list-type: atomic namespaceSelector: - description: A label query over the - set of namespaces that the term - applies to. The term is applied - to the union of the namespaces selected - by this field and the ones listed - in the namespaces field. null selector - and null or empty namespaces list - means "this pod's namespace". An - empty selector ({}) matches all - namespaces. + description: |- + A label query over the set of namespaces that the term applies to. + The term is applied to the union of the namespaces selected by this field + and the ones listed in the namespaces field. + null selector and null or empty namespaces list means "this pod's namespace". + An empty selector ({}) matches all namespaces. properties: matchExpressions: description: matchExpressions @@ -2405,11 +2607,9 @@ spec: requirements. The requirements are ANDed. items: - description: A label selector - requirement is a selector - that contains values, a key, - and an operator that relates - the key and values. + description: |- + A label selector requirement is a selector that contains values, a key, and an operator that + relates the key and values. properties: key: description: key is the @@ -2417,78 +2617,61 @@ spec: applies to. type: string operator: - description: operator represents - a key's relationship to - a set of values. Valid - operators are In, NotIn, - Exists and DoesNotExist. + description: |- + operator represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists and DoesNotExist. type: string values: - description: values is an - array of string values. - If the operator is In - or NotIn, the values array - must be non-empty. If - the operator is Exists - or DoesNotExist, the values - array must be empty. This - array is replaced during - a strategic merge patch. 
+ description: |- + values is an array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. This array is replaced during a strategic + merge patch. items: type: string type: array + x-kubernetes-list-type: atomic required: - key - operator type: object type: array + x-kubernetes-list-type: atomic matchLabels: additionalProperties: type: string - description: matchLabels is a - map of {key,value} pairs. A - single {key,value} in the matchLabels - map is equivalent to an element - of matchExpressions, whose key - field is "key", the operator - is "In", and the values array - contains only "value". The requirements - are ANDed. + description: |- + matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels + map is equivalent to an element of matchExpressions, whose key field is "key", the + operator is "In", and the values array contains only "value". The requirements are ANDed. type: object type: object x-kubernetes-map-type: atomic namespaces: - description: namespaces specifies - a static list of namespace names - that the term applies to. The term - is applied to the union of the namespaces - listed in this field and the ones - selected by namespaceSelector. null - or empty namespaces list and null - namespaceSelector means "this pod's - namespace". + description: |- + namespaces specifies a static list of namespace names that the term applies to. + The term is applied to the union of the namespaces listed in this field + and the ones selected by namespaceSelector. + null or empty namespaces list and null namespaceSelector means "this pod's namespace". 
items: type: string type: array + x-kubernetes-list-type: atomic topologyKey: - description: This pod should be co-located - (affinity) or not co-located (anti-affinity) - with the pods matching the labelSelector - in the specified namespaces, where - co-located is defined as running - on a node whose value of the label - with key topologyKey matches that - of any node on which any of the - selected pods is running. Empty - topologyKey is not allowed. + description: |- + This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching + the labelSelector in the specified namespaces, where co-located is defined as running on a node + whose value of the label with key topologyKey matches that of any node on which any of the + selected pods is running. + Empty topologyKey is not allowed. type: string required: - topologyKey type: object weight: - description: weight associated with matching - the corresponding podAffinityTerm, in - the range 1-100. + description: |- + weight associated with matching the corresponding podAffinityTerm, + in the range 1-100. format: int32 type: integer required: @@ -2496,42 +2679,38 @@ spec: - weight type: object type: array + x-kubernetes-list-type: atomic requiredDuringSchedulingIgnoredDuringExecution: - description: If the anti-affinity requirements - specified by this field are not met at scheduling - time, the pod will not be scheduled onto the - node. If the anti-affinity requirements specified - by this field cease to be met at some point - during pod execution (e.g. due to a pod label - update), the system may or may not try to - eventually evict the pod from its node. When - there are multiple elements, the lists of - nodes corresponding to each podAffinityTerm - are intersected, i.e. all terms must be satisfied. + description: |- + If the anti-affinity requirements specified by this field are not met at + scheduling time, the pod will not be scheduled onto the node. 
+ If the anti-affinity requirements specified by this field cease to be met + at some point during pod execution (e.g. due to a pod label update), the + system may or may not try to eventually evict the pod from its node. + When there are multiple elements, the lists of nodes corresponding to each + podAffinityTerm are intersected, i.e. all terms must be satisfied. items: - description: Defines a set of pods (namely - those matching the labelSelector relative - to the given namespace(s)) that this pod - should be co-located (affinity) or not co-located - (anti-affinity) with, where co-located is - defined as running on a node whose value - of the label with key matches - that of any node on which a pod of the set - of pods is running + description: |- + Defines a set of pods (namely those matching the labelSelector + relative to the given namespace(s)) that this pod should be + co-located (affinity) or not co-located (anti-affinity) with, + where co-located is defined as running on a node whose value of + the label with key matches that of any node on which + a pod of the set of pods is running properties: labelSelector: - description: A label query over a set - of resources, in this case pods. + description: |- + A label query over a set of resources, in this case pods. + If it's null, this PodAffinityTerm matches with no Pods. properties: matchExpressions: description: matchExpressions is a list of label selector requirements. The requirements are ANDed. items: - description: A label selector requirement - is a selector that contains values, - a key, and an operator that relates - the key and values. + description: |- + A label selector requirement is a selector that contains values, a key, and an operator that + relates the key and values. properties: key: description: key is the label @@ -2539,41 +2718,33 @@ spec: to. type: string operator: - description: operator represents - a key's relationship to a - set of values. 
Valid operators - are In, NotIn, Exists and - DoesNotExist. + description: |- + operator represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists and DoesNotExist. type: string values: - description: values is an array - of string values. If the operator - is In or NotIn, the values - array must be non-empty. If - the operator is Exists or - DoesNotExist, the values array - must be empty. This array - is replaced during a strategic + description: |- + values is an array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. This array is replaced during a strategic merge patch. items: type: string type: array + x-kubernetes-list-type: atomic required: - key - operator type: object type: array + x-kubernetes-list-type: atomic matchLabels: additionalProperties: type: string - description: matchLabels is a map - of {key,value} pairs. A single {key,value} - in the matchLabels map is equivalent - to an element of matchExpressions, - whose key field is "key", the operator - is "In", and the values array contains - only "value". The requirements are - ANDed. + description: |- + matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels + map is equivalent to an element of matchExpressions, whose key field is "key", the + operator is "In", and the values array contains only "value". The requirements are ANDed. type: object type: object x-kubernetes-map-type: atomic @@ -2581,12 +2752,12 @@ spec: description: |- MatchLabelKeys is a set of pod label keys to select which pods will be taken into consideration. 
The keys are used to lookup values from the - incoming pod labels, those key-value labels are merged with `LabelSelector` as `key in (value)` + incoming pod labels, those key-value labels are merged with `labelSelector` as `key in (value)` to select the group of existing pods which pods will be taken into consideration for the incoming pod's pod (anti) affinity. Keys that don't exist in the incoming pod labels will be ignored. The default value is empty. - The same key is forbidden to exist in both MatchLabelKeys and LabelSelector. - Also, MatchLabelKeys cannot be set when LabelSelector isn't set. + The same key is forbidden to exist in both matchLabelKeys and labelSelector. + Also, matchLabelKeys cannot be set when labelSelector isn't set. This is an alpha field and requires enabling MatchLabelKeysInPodAffinity feature gate. items: type: string @@ -2596,25 +2767,23 @@ spec: description: |- MismatchLabelKeys is a set of pod label keys to select which pods will be taken into consideration. The keys are used to lookup values from the - incoming pod labels, those key-value labels are merged with `LabelSelector` as `key notin (value)` + incoming pod labels, those key-value labels are merged with `labelSelector` as `key notin (value)` to select the group of existing pods which pods will be taken into consideration for the incoming pod's pod (anti) affinity. Keys that don't exist in the incoming pod labels will be ignored. The default value is empty. - The same key is forbidden to exist in both MismatchLabelKeys and LabelSelector. - Also, MismatchLabelKeys cannot be set when LabelSelector isn't set. + The same key is forbidden to exist in both mismatchLabelKeys and labelSelector. + Also, mismatchLabelKeys cannot be set when labelSelector isn't set. This is an alpha field and requires enabling MatchLabelKeysInPodAffinity feature gate. 
items: type: string type: array x-kubernetes-list-type: atomic namespaceSelector: - description: A label query over the set - of namespaces that the term applies - to. The term is applied to the union - of the namespaces selected by this field - and the ones listed in the namespaces - field. null selector and null or empty - namespaces list means "this pod's namespace". + description: |- + A label query over the set of namespaces that the term applies to. + The term is applied to the union of the namespaces selected by this field + and the ones listed in the namespaces field. + null selector and null or empty namespaces list means "this pod's namespace". An empty selector ({}) matches all namespaces. properties: matchExpressions: @@ -2622,10 +2791,9 @@ spec: list of label selector requirements. The requirements are ANDed. items: - description: A label selector requirement - is a selector that contains values, - a key, and an operator that relates - the key and values. + description: |- + A label selector requirement is a selector that contains values, a key, and an operator that + relates the key and values. properties: key: description: key is the label @@ -2633,71 +2801,59 @@ spec: to. type: string operator: - description: operator represents - a key's relationship to a - set of values. Valid operators - are In, NotIn, Exists and - DoesNotExist. + description: |- + operator represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists and DoesNotExist. type: string values: - description: values is an array - of string values. If the operator - is In or NotIn, the values - array must be non-empty. If - the operator is Exists or - DoesNotExist, the values array - must be empty. This array - is replaced during a strategic + description: |- + values is an array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. 
This array is replaced during a strategic merge patch. items: type: string type: array + x-kubernetes-list-type: atomic required: - key - operator type: object type: array + x-kubernetes-list-type: atomic matchLabels: additionalProperties: type: string - description: matchLabels is a map - of {key,value} pairs. A single {key,value} - in the matchLabels map is equivalent - to an element of matchExpressions, - whose key field is "key", the operator - is "In", and the values array contains - only "value". The requirements are - ANDed. + description: |- + matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels + map is equivalent to an element of matchExpressions, whose key field is "key", the + operator is "In", and the values array contains only "value". The requirements are ANDed. type: object type: object x-kubernetes-map-type: atomic namespaces: - description: namespaces specifies a static - list of namespace names that the term - applies to. The term is applied to the - union of the namespaces listed in this - field and the ones selected by namespaceSelector. - null or empty namespaces list and null - namespaceSelector means "this pod's - namespace". + description: |- + namespaces specifies a static list of namespace names that the term applies to. + The term is applied to the union of the namespaces listed in this field + and the ones selected by namespaceSelector. + null or empty namespaces list and null namespaceSelector means "this pod's namespace". items: type: string type: array + x-kubernetes-list-type: atomic topologyKey: - description: This pod should be co-located - (affinity) or not co-located (anti-affinity) - with the pods matching the labelSelector - in the specified namespaces, where co-located - is defined as running on a node whose - value of the label with key topologyKey - matches that of any node on which any - of the selected pods is running. Empty - topologyKey is not allowed. 
+ description: |- + This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching + the labelSelector in the specified namespaces, where co-located is defined as running on a node + whose value of the label with key topologyKey matches that of any node on which any of the + selected pods is running. + Empty topologyKey is not allowed. type: string required: - topologyKey type: object type: array + x-kubernetes-list-type: atomic type: object type: object automountServiceAccountToken: @@ -2706,49 +2862,47 @@ spec: mounted. type: boolean containers: - description: List of containers belonging to the pod. - Containers cannot currently be added or removed. There - must be at least one container in a Pod. Cannot be - updated. + description: |- + List of containers belonging to the pod. + Containers cannot currently be added or removed. + There must be at least one container in a Pod. + Cannot be updated. items: description: A single application container that you want to run within a pod. properties: args: - description: 'Arguments to the entrypoint. The - container image''s CMD is used if this is not - provided. Variable references $(VAR_NAME) are - expanded using the container''s environment. - If a variable cannot be resolved, the reference - in the input string will be unchanged. Double - $$ are reduced to a single $, which allows for - escaping the $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" - will produce the string literal "$(VAR_NAME)". - Escaped references will never be expanded, regardless - of whether the variable exists or not. Cannot - be updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell' + description: |- + Arguments to the entrypoint. + The container image's CMD is used if this is not provided. + Variable references $(VAR_NAME) are expanded using the container's environment. 
If a variable + cannot be resolved, the reference in the input string will be unchanged. Double $$ are reduced + to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" will + produce the string literal "$(VAR_NAME)". Escaped references will never be expanded, regardless + of whether the variable exists or not. Cannot be updated. + More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell items: type: string type: array + x-kubernetes-list-type: atomic command: - description: 'Entrypoint array. Not executed within - a shell. The container image''s ENTRYPOINT is - used if this is not provided. Variable references - $(VAR_NAME) are expanded using the container''s - environment. If a variable cannot be resolved, - the reference in the input string will be unchanged. - Double $$ are reduced to a single $, which allows - for escaping the $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" - will produce the string literal "$(VAR_NAME)". - Escaped references will never be expanded, regardless - of whether the variable exists or not. Cannot - be updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell' + description: |- + Entrypoint array. Not executed within a shell. + The container image's ENTRYPOINT is used if this is not provided. + Variable references $(VAR_NAME) are expanded using the container's environment. If a variable + cannot be resolved, the reference in the input string will be unchanged. Double $$ are reduced + to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" will + produce the string literal "$(VAR_NAME)". Escaped references will never be expanded, regardless + of whether the variable exists or not. Cannot be updated. 
+ More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell items: type: string type: array + x-kubernetes-list-type: atomic env: - description: List of environment variables to - set in the container. Cannot be updated. + description: |- + List of environment variables to set in the container. + Cannot be updated. items: description: EnvVar represents an environment variable present in a Container. @@ -2758,19 +2912,16 @@ spec: Must be a C_IDENTIFIER. type: string value: - description: 'Variable references $(VAR_NAME) - are expanded using the previously defined - environment variables in the container - and any service environment variables. - If a variable cannot be resolved, the - reference in the input string will be - unchanged. Double $$ are reduced to a - single $, which allows for escaping the - $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" - will produce the string literal "$(VAR_NAME)". - Escaped references will never be expanded, - regardless of whether the variable exists - or not. Defaults to "".' + description: |- + Variable references $(VAR_NAME) are expanded + using the previously defined environment variables in the container and + any service environment variables. If a variable cannot be resolved, + the reference in the input string will be unchanged. Double $$ are reduced + to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. + "$$(VAR_NAME)" will produce the string literal "$(VAR_NAME)". + Escaped references will never be expanded, regardless of whether the variable + exists or not. + Defaults to "". type: string valueFrom: description: Source for the environment @@ -2784,10 +2935,15 @@ spec: description: The key to select. type: string name: - description: 'Name of the referent. + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. 
Instances of this type with an empty value here are + almost certainly wrong. + TODO: Add other useful fields. apiVersion, kind, uid? More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Add other useful fields. - apiVersion, kind, uid?' + TODO: Drop `kubebuilder:default` when controller-gen doesn't need it https://github.com/kubernetes-sigs/kubebuilder/issues/3896. type: string optional: description: Specify whether the @@ -2798,11 +2954,9 @@ spec: type: object x-kubernetes-map-type: atomic fieldRef: - description: 'Selects a field of the - pod: supports metadata.name, metadata.namespace, - `metadata.labels['''']`, `metadata.annotations['''']`, - spec.nodeName, spec.serviceAccountName, - status.hostIP, status.podIP, status.podIPs.' + description: |- + Selects a field of the pod: supports metadata.name, metadata.namespace, `metadata.labels['']`, `metadata.annotations['']`, + spec.nodeName, spec.serviceAccountName, status.hostIP, status.podIP, status.podIPs. properties: apiVersion: description: Version of the schema @@ -2818,12 +2972,9 @@ spec: type: object x-kubernetes-map-type: atomic resourceFieldRef: - description: 'Selects a resource of - the container: only resources limits - and requests (limits.cpu, limits.memory, - limits.ephemeral-storage, requests.cpu, - requests.memory and requests.ephemeral-storage) - are currently supported.' + description: |- + Selects a resource of the container: only resources limits and requests + (limits.cpu, limits.memory, limits.ephemeral-storage, requests.cpu, requests.memory and requests.ephemeral-storage) are currently supported. properties: containerName: description: 'Container name: required @@ -2857,10 +3008,15 @@ spec: secret key. type: string name: - description: 'Name of the referent. + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. 
Instances of this type with an empty value here are + almost certainly wrong. + TODO: Add other useful fields. apiVersion, kind, uid? More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Add other useful fields. - apiVersion, kind, uid?' + TODO: Drop `kubebuilder:default` when controller-gen doesn't need it https://github.com/kubernetes-sigs/kubebuilder/issues/3896. type: string optional: description: Specify whether the @@ -2875,16 +3031,17 @@ spec: - name type: object type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map envFrom: - description: List of sources to populate environment - variables in the container. The keys defined - within a source must be a C_IDENTIFIER. All - invalid keys will be reported as an event when - the container is starting. When a key exists - in multiple sources, the value associated with - the last source will take precedence. Values - defined by an Env with a duplicate key will - take precedence. Cannot be updated. + description: |- + List of sources to populate environment variables in the container. + The keys defined within a source must be a C_IDENTIFIER. All invalid keys + will be reported as an event when the container is starting. When a key exists in multiple + sources, the value associated with the last source will take precedence. + Values defined by an Env with a duplicate key will take precedence. + Cannot be updated. items: description: EnvFromSource represents the source of a set of ConfigMaps @@ -2893,10 +3050,15 @@ spec: description: The ConfigMap to select from properties: name: - description: 'Name of the referent. + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + TODO: Add other useful fields. apiVersion, kind, uid? 
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Add other useful fields. apiVersion, - kind, uid?' + TODO: Drop `kubebuilder:default` when controller-gen doesn't need it https://github.com/kubernetes-sigs/kubebuilder/issues/3896. type: string optional: description: Specify whether the ConfigMap @@ -2913,10 +3075,15 @@ spec: description: The Secret to select from properties: name: - description: 'Name of the referent. + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + TODO: Add other useful fields. apiVersion, kind, uid? More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Add other useful fields. apiVersion, - kind, uid?' + TODO: Drop `kubebuilder:default` when controller-gen doesn't need it https://github.com/kubernetes-sigs/kubebuilder/issues/3896. type: string optional: description: Specify whether the Secret @@ -2926,62 +3093,58 @@ spec: x-kubernetes-map-type: atomic type: object type: array + x-kubernetes-list-type: atomic image: - description: 'Container image name. More info: - https://kubernetes.io/docs/concepts/containers/images - This field is optional to allow higher level - config management to default or override container - images in workload controllers like Deployments - and StatefulSets.' + description: |- + Container image name. + More info: https://kubernetes.io/docs/concepts/containers/images + This field is optional to allow higher level config management to default or override + container images in workload controllers like Deployments and StatefulSets. type: string imagePullPolicy: - description: 'Image pull policy. One of Always, - Never, IfNotPresent. Defaults to Always if :latest - tag is specified, or IfNotPresent otherwise. - Cannot be updated. 
More info: https://kubernetes.io/docs/concepts/containers/images#updating-images' + description: |- + Image pull policy. + One of Always, Never, IfNotPresent. + Defaults to Always if :latest tag is specified, or IfNotPresent otherwise. + Cannot be updated. + More info: https://kubernetes.io/docs/concepts/containers/images#updating-images type: string lifecycle: - description: Actions that the management system - should take in response to container lifecycle - events. Cannot be updated. + description: |- + Actions that the management system should take in response to container lifecycle events. + Cannot be updated. properties: postStart: - description: 'PostStart is called immediately - after a container is created. If the handler - fails, the container is terminated and restarted - according to its restart policy. Other management - of the container blocks until the hook completes. - More info: https://kubernetes.io/docs/concepts/containers/container-lifecycle-hooks/#container-hooks' + description: |- + PostStart is called immediately after a container is created. If the handler fails, + the container is terminated and restarted according to its restart policy. + Other management of the container blocks until the hook completes. + More info: https://kubernetes.io/docs/concepts/containers/container-lifecycle-hooks/#container-hooks properties: exec: description: Exec specifies the action to take. properties: command: - description: Command is the command - line to execute inside the container, - the working directory for the command is - root ('/') in the container's filesystem. - The command is simply exec'd, it - is not run inside a shell, so traditional - shell instructions ('|', etc) won't - work. To use a shell, you need to - explicitly call out to that shell. - Exit status of 0 is treated as live/healthy - and non-zero is unhealthy. 
+ description: |- + Command is the command line to execute inside the container, the working directory for the + command is root ('/') in the container's filesystem. The command is simply exec'd, it is + not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use + a shell, you need to explicitly call out to that shell. + Exit status of 0 is treated as live/healthy and non-zero is unhealthy. items: type: string type: array + x-kubernetes-list-type: atomic type: object httpGet: description: HTTPGet specifies the http request to perform. properties: host: - description: Host name to connect - to, defaults to the pod IP. You - probably want to set "Host" in httpHeaders - instead. + description: |- + Host name to connect to, defaults to the pod IP. You probably want to set + "Host" in httpHeaders instead. type: string httpHeaders: description: Custom headers to set @@ -2993,11 +3156,9 @@ spec: HTTP probes properties: name: - description: The header field - name. This will be canonicalized - upon output, so case-variant - names will be understood as - the same header. + description: |- + The header field name. + This will be canonicalized upon output, so case-variant names will be understood as the same header. type: string value: description: The header field @@ -3008,6 +3169,7 @@ spec: - value type: object type: array + x-kubernetes-list-type: atomic path: description: Path to access on the HTTP server. @@ -3016,14 +3178,15 @@ spec: anyOf: - type: integer - type: string - description: Name or number of the - port to access on the container. - Number must be in the range 1 to - 65535. Name must be an IANA_SVC_NAME. + description: |- + Name or number of the port to access on the container. + Number must be in the range 1 to 65535. + Name must be an IANA_SVC_NAME. x-kubernetes-int-or-string: true scheme: - description: Scheme to use for connecting - to the host. Defaults to HTTP. + description: |- + Scheme to use for connecting to the host. 
+ Defaults to HTTP. type: string required: - port @@ -3042,12 +3205,10 @@ spec: - seconds type: object tcpSocket: - description: Deprecated. TCPSocket is - NOT supported as a LifecycleHandler - and kept for the backward compatibility. - There are no validation of this field - and lifecycle hooks will fail in runtime - when tcp handler is specified. + description: |- + Deprecated. TCPSocket is NOT supported as a LifecycleHandler and kept + for the backward compatibility. There are no validation of this field and + lifecycle hooks will fail in runtime when tcp handler is specified. properties: host: description: 'Optional: Host name @@ -3058,61 +3219,51 @@ spec: anyOf: - type: integer - type: string - description: Number or name of the - port to access on the container. - Number must be in the range 1 to - 65535. Name must be an IANA_SVC_NAME. + description: |- + Number or name of the port to access on the container. + Number must be in the range 1 to 65535. + Name must be an IANA_SVC_NAME. x-kubernetes-int-or-string: true required: - port type: object type: object preStop: - description: 'PreStop is called immediately - before a container is terminated due to - an API request or management event such - as liveness/startup probe failure, preemption, - resource contention, etc. The handler is - not called if the container crashes or exits. - The Pod''s termination grace period countdown - begins before the PreStop hook is executed. - Regardless of the outcome of the handler, - the container will eventually terminate - within the Pod''s termination grace period - (unless delayed by finalizers). Other management - of the container blocks until the hook completes - or until the termination grace period is - reached. 
More info: https://kubernetes.io/docs/concepts/containers/container-lifecycle-hooks/#container-hooks' + description: |- + PreStop is called immediately before a container is terminated due to an + API request or management event such as liveness/startup probe failure, + preemption, resource contention, etc. The handler is not called if the + container crashes or exits. The Pod's termination grace period countdown begins before the + PreStop hook is executed. Regardless of the outcome of the handler, the + container will eventually terminate within the Pod's termination grace + period (unless delayed by finalizers). Other management of the container blocks until the hook completes + or until the termination grace period is reached. + More info: https://kubernetes.io/docs/concepts/containers/container-lifecycle-hooks/#container-hooks properties: exec: description: Exec specifies the action to take. properties: command: - description: Command is the command - line to execute inside the container, - the working directory for the command is - root ('/') in the container's filesystem. - The command is simply exec'd, it - is not run inside a shell, so traditional - shell instructions ('|', etc) won't - work. To use a shell, you need to - explicitly call out to that shell. - Exit status of 0 is treated as live/healthy - and non-zero is unhealthy. + description: |- + Command is the command line to execute inside the container, the working directory for the + command is root ('/') in the container's filesystem. The command is simply exec'd, it is + not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use + a shell, you need to explicitly call out to that shell. + Exit status of 0 is treated as live/healthy and non-zero is unhealthy. items: type: string type: array + x-kubernetes-list-type: atomic type: object httpGet: description: HTTPGet specifies the http request to perform. 
properties: host: - description: Host name to connect - to, defaults to the pod IP. You - probably want to set "Host" in httpHeaders - instead. + description: |- + Host name to connect to, defaults to the pod IP. You probably want to set + "Host" in httpHeaders instead. type: string httpHeaders: description: Custom headers to set @@ -3124,11 +3275,9 @@ spec: HTTP probes properties: name: - description: The header field - name. This will be canonicalized - upon output, so case-variant - names will be understood as - the same header. + description: |- + The header field name. + This will be canonicalized upon output, so case-variant names will be understood as the same header. type: string value: description: The header field @@ -3139,6 +3288,7 @@ spec: - value type: object type: array + x-kubernetes-list-type: atomic path: description: Path to access on the HTTP server. @@ -3147,14 +3297,15 @@ spec: anyOf: - type: integer - type: string - description: Name or number of the - port to access on the container. - Number must be in the range 1 to - 65535. Name must be an IANA_SVC_NAME. + description: |- + Name or number of the port to access on the container. + Number must be in the range 1 to 65535. + Name must be an IANA_SVC_NAME. x-kubernetes-int-or-string: true scheme: - description: Scheme to use for connecting - to the host. Defaults to HTTP. + description: |- + Scheme to use for connecting to the host. + Defaults to HTTP. type: string required: - port @@ -3173,12 +3324,10 @@ spec: - seconds type: object tcpSocket: - description: Deprecated. TCPSocket is - NOT supported as a LifecycleHandler - and kept for the backward compatibility. - There are no validation of this field - and lifecycle hooks will fail in runtime - when tcp handler is specified. + description: |- + Deprecated. TCPSocket is NOT supported as a LifecycleHandler and kept + for the backward compatibility. 
There are no validation of this field and + lifecycle hooks will fail in runtime when tcp handler is specified. properties: host: description: 'Optional: Host name @@ -3189,10 +3338,10 @@ spec: anyOf: - type: integer - type: string - description: Number or name of the - port to access on the container. - Number must be in the range 1 to - 65535. Name must be an IANA_SVC_NAME. + description: |- + Number or name of the port to access on the container. + Number must be in the range 1 to 65535. + Name must be an IANA_SVC_NAME. x-kubernetes-int-or-string: true required: - port @@ -3200,35 +3349,32 @@ spec: type: object type: object livenessProbe: - description: 'Periodic probe of container liveness. + description: |- + Periodic probe of container liveness. Container will be restarted if the probe fails. - Cannot be updated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + Cannot be updated. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes properties: exec: description: Exec specifies the action to take. properties: command: - description: Command is the command line - to execute inside the container, the - working directory for the command is - root ('/') in the container's filesystem. - The command is simply exec'd, it is - not run inside a shell, so traditional - shell instructions ('|', etc) won't - work. To use a shell, you need to explicitly - call out to that shell. Exit status - of 0 is treated as live/healthy and - non-zero is unhealthy. + description: |- + Command is the command line to execute inside the container, the working directory for the + command is root ('/') in the container's filesystem. The command is simply exec'd, it is + not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use + a shell, you need to explicitly call out to that shell. + Exit status of 0 is treated as live/healthy and non-zero is unhealthy. 
items: type: string type: array + x-kubernetes-list-type: atomic type: object failureThreshold: - description: Minimum consecutive failures - for the probe to be considered failed after - having succeeded. Defaults to 3. Minimum - value is 1. + description: |- + Minimum consecutive failures for the probe to be considered failed after having succeeded. + Defaults to 3. Minimum value is 1. format: int32 type: integer grpc: @@ -3241,11 +3387,12 @@ spec: format: int32 type: integer service: - description: "Service is the name of the - service to place in the gRPC HealthCheckRequest + description: |- + Service is the name of the service to place in the gRPC HealthCheckRequest (see https://github.com/grpc/grpc/blob/master/doc/health-checking.md). - \n If this is not specified, the default - behavior is defined by gRPC." + + + If this is not specified, the default behavior is defined by gRPC. type: string required: - port @@ -3255,9 +3402,9 @@ spec: to perform. properties: host: - description: Host name to connect to, - defaults to the pod IP. You probably - want to set "Host" in httpHeaders instead. + description: |- + Host name to connect to, defaults to the pod IP. You probably want to set + "Host" in httpHeaders instead. type: string httpHeaders: description: Custom headers to set in @@ -3267,11 +3414,9 @@ spec: custom header to be used in HTTP probes properties: name: - description: The header field name. - This will be canonicalized upon - output, so case-variant names - will be understood as the same - header. + description: |- + The header field name. + This will be canonicalized upon output, so case-variant names will be understood as the same header. type: string value: description: The header field value @@ -3281,6 +3426,7 @@ spec: - value type: object type: array + x-kubernetes-list-type: atomic path: description: Path to access on the HTTP server. 
@@ -3289,36 +3435,35 @@ spec: anyOf: - type: integer - type: string - description: Name or number of the port - to access on the container. Number must - be in the range 1 to 65535. Name must - be an IANA_SVC_NAME. + description: |- + Name or number of the port to access on the container. + Number must be in the range 1 to 65535. + Name must be an IANA_SVC_NAME. x-kubernetes-int-or-string: true scheme: - description: Scheme to use for connecting - to the host. Defaults to HTTP. + description: |- + Scheme to use for connecting to the host. + Defaults to HTTP. type: string required: - port type: object initialDelaySeconds: - description: 'Number of seconds after the - container has started before liveness probes - are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + description: |- + Number of seconds after the container has started before liveness probes are initiated. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes format: int32 type: integer periodSeconds: - description: How often (in seconds) to perform - the probe. Default to 10 seconds. Minimum - value is 1. + description: |- + How often (in seconds) to perform the probe. + Default to 10 seconds. Minimum value is 1. format: int32 type: integer successThreshold: - description: Minimum consecutive successes - for the probe to be considered successful - after having failed. Defaults to 1. Must - be 1 for liveness and startup. Minimum value - is 1. + description: |- + Minimum consecutive successes for the probe to be considered successful after having failed. + Defaults to 1. Must be 1 for liveness and startup. Minimum value is 1. format: int32 type: integer tcpSocket: @@ -3333,56 +3478,49 @@ spec: anyOf: - type: integer - type: string - description: Number or name of the port - to access on the container. Number must - be in the range 1 to 65535. Name must - be an IANA_SVC_NAME. 
+ description: |- + Number or name of the port to access on the container. + Number must be in the range 1 to 65535. + Name must be an IANA_SVC_NAME. x-kubernetes-int-or-string: true required: - port type: object terminationGracePeriodSeconds: - description: Optional duration in seconds - the pod needs to terminate gracefully upon - probe failure. The grace period is the duration - in seconds after the processes running in - the pod are sent a termination signal and - the time when the processes are forcibly - halted with a kill signal. Set this value - longer than the expected cleanup time for - your process. If this value is nil, the - pod's terminationGracePeriodSeconds will - be used. Otherwise, this value overrides - the value provided by the pod spec. Value - must be non-negative integer. The value - zero indicates stop immediately via the - kill signal (no opportunity to shut down). - This is a beta field and requires enabling - ProbeTerminationGracePeriod feature gate. - Minimum value is 1. spec.terminationGracePeriodSeconds - is used if unset. + description: |- + Optional duration in seconds the pod needs to terminate gracefully upon probe failure. + The grace period is the duration in seconds after the processes running in the pod are sent + a termination signal and the time when the processes are forcibly halted with a kill signal. + Set this value longer than the expected cleanup time for your process. + If this value is nil, the pod's terminationGracePeriodSeconds will be used. Otherwise, this + value overrides the value provided by the pod spec. + Value must be non-negative integer. The value zero indicates stop immediately via + the kill signal (no opportunity to shut down). + This is a beta field and requires enabling ProbeTerminationGracePeriod feature gate. + Minimum value is 1. spec.terminationGracePeriodSeconds is used if unset. format: int64 type: integer timeoutSeconds: - description: 'Number of seconds after which - the probe times out. 
Defaults to 1 second. - Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + description: |- + Number of seconds after which the probe times out. + Defaults to 1 second. Minimum value is 1. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes format: int32 type: integer type: object name: - description: Name of the container specified as - a DNS_LABEL. Each container in a pod must have - a unique name (DNS_LABEL). Cannot be updated. + description: |- + Name of the container specified as a DNS_LABEL. + Each container in a pod must have a unique name (DNS_LABEL). + Cannot be updated. type: string ports: - description: List of ports to expose from the - container. Not specifying a port here DOES NOT - prevent that port from being exposed. Any port - which is listening on the default "0.0.0.0" - address inside a container will be accessible - from the network. Modifying this array with - strategic merge patch may corrupt the data. + description: |- + List of ports to expose from the container. Not specifying a port here + DOES NOT prevent that port from being exposed. Any port which is + listening on the default "0.0.0.0" address inside a container will be + accessible from the network. + Modifying this array with strategic merge patch may corrupt the data. For more information See https://github.com/kubernetes/kubernetes/issues/108255. Cannot be updated. items: @@ -3390,9 +3528,9 @@ spec: port in a single container. properties: containerPort: - description: Number of port to expose on - the pod's IP address. This must be a valid - port number, 0 < x < 65536. + description: |- + Number of port to expose on the pod's IP address. + This must be a valid port number, 0 < x < 65536. format: int32 type: integer hostIP: @@ -3400,24 +3538,24 @@ spec: port to. type: string hostPort: - description: Number of port to expose on - the host. 
If specified, this must be a - valid port number, 0 < x < 65536. If HostNetwork - is specified, this must match ContainerPort. + description: |- + Number of port to expose on the host. + If specified, this must be a valid port number, 0 < x < 65536. + If HostNetwork is specified, this must match ContainerPort. Most containers do not need this. format: int32 type: integer name: - description: If specified, this must be - an IANA_SVC_NAME and unique within the - pod. Each named port in a pod must have - a unique name. Name for the port that - can be referred to by services. + description: |- + If specified, this must be an IANA_SVC_NAME and unique within the pod. Each + named port in a pod must have a unique name. Name for the port that can be + referred to by services. type: string protocol: default: TCP - description: Protocol for port. Must be - UDP, TCP, or SCTP. Defaults to "TCP". + description: |- + Protocol for port. Must be UDP, TCP, or SCTP. + Defaults to "TCP". type: string required: - containerPort @@ -3428,36 +3566,32 @@ spec: - protocol x-kubernetes-list-type: map readinessProbe: - description: 'Periodic probe of container service - readiness. Container will be removed from service - endpoints if the probe fails. Cannot be updated. - More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + description: |- + Periodic probe of container service readiness. + Container will be removed from service endpoints if the probe fails. + Cannot be updated. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes properties: exec: description: Exec specifies the action to take. properties: command: - description: Command is the command line - to execute inside the container, the - working directory for the command is - root ('/') in the container's filesystem. - The command is simply exec'd, it is - not run inside a shell, so traditional - shell instructions ('|', etc) won't - work. 
To use a shell, you need to explicitly - call out to that shell. Exit status - of 0 is treated as live/healthy and - non-zero is unhealthy. + description: |- + Command is the command line to execute inside the container, the working directory for the + command is root ('/') in the container's filesystem. The command is simply exec'd, it is + not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use + a shell, you need to explicitly call out to that shell. + Exit status of 0 is treated as live/healthy and non-zero is unhealthy. items: type: string type: array + x-kubernetes-list-type: atomic type: object failureThreshold: - description: Minimum consecutive failures - for the probe to be considered failed after - having succeeded. Defaults to 3. Minimum - value is 1. + description: |- + Minimum consecutive failures for the probe to be considered failed after having succeeded. + Defaults to 3. Minimum value is 1. format: int32 type: integer grpc: @@ -3470,11 +3604,12 @@ spec: format: int32 type: integer service: - description: "Service is the name of the - service to place in the gRPC HealthCheckRequest + description: |- + Service is the name of the service to place in the gRPC HealthCheckRequest (see https://github.com/grpc/grpc/blob/master/doc/health-checking.md). - \n If this is not specified, the default - behavior is defined by gRPC." + + + If this is not specified, the default behavior is defined by gRPC. type: string required: - port @@ -3484,9 +3619,9 @@ spec: to perform. properties: host: - description: Host name to connect to, - defaults to the pod IP. You probably - want to set "Host" in httpHeaders instead. + description: |- + Host name to connect to, defaults to the pod IP. You probably want to set + "Host" in httpHeaders instead. type: string httpHeaders: description: Custom headers to set in @@ -3496,11 +3631,9 @@ spec: custom header to be used in HTTP probes properties: name: - description: The header field name. 
- This will be canonicalized upon - output, so case-variant names - will be understood as the same - header. + description: |- + The header field name. + This will be canonicalized upon output, so case-variant names will be understood as the same header. type: string value: description: The header field value @@ -3510,6 +3643,7 @@ spec: - value type: object type: array + x-kubernetes-list-type: atomic path: description: Path to access on the HTTP server. @@ -3518,36 +3652,35 @@ spec: anyOf: - type: integer - type: string - description: Name or number of the port - to access on the container. Number must - be in the range 1 to 65535. Name must - be an IANA_SVC_NAME. + description: |- + Name or number of the port to access on the container. + Number must be in the range 1 to 65535. + Name must be an IANA_SVC_NAME. x-kubernetes-int-or-string: true scheme: - description: Scheme to use for connecting - to the host. Defaults to HTTP. + description: |- + Scheme to use for connecting to the host. + Defaults to HTTP. type: string required: - port type: object initialDelaySeconds: - description: 'Number of seconds after the - container has started before liveness probes - are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + description: |- + Number of seconds after the container has started before liveness probes are initiated. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes format: int32 type: integer periodSeconds: - description: How often (in seconds) to perform - the probe. Default to 10 seconds. Minimum - value is 1. + description: |- + How often (in seconds) to perform the probe. + Default to 10 seconds. Minimum value is 1. format: int32 type: integer successThreshold: - description: Minimum consecutive successes - for the probe to be considered successful - after having failed. Defaults to 1. Must - be 1 for liveness and startup. Minimum value - is 1. 
+ description: |- + Minimum consecutive successes for the probe to be considered successful after having failed. + Defaults to 1. Must be 1 for liveness and startup. Minimum value is 1. format: int32 type: integer tcpSocket: @@ -3562,40 +3695,33 @@ spec: anyOf: - type: integer - type: string - description: Number or name of the port - to access on the container. Number must - be in the range 1 to 65535. Name must - be an IANA_SVC_NAME. + description: |- + Number or name of the port to access on the container. + Number must be in the range 1 to 65535. + Name must be an IANA_SVC_NAME. x-kubernetes-int-or-string: true required: - port type: object terminationGracePeriodSeconds: - description: Optional duration in seconds - the pod needs to terminate gracefully upon - probe failure. The grace period is the duration - in seconds after the processes running in - the pod are sent a termination signal and - the time when the processes are forcibly - halted with a kill signal. Set this value - longer than the expected cleanup time for - your process. If this value is nil, the - pod's terminationGracePeriodSeconds will - be used. Otherwise, this value overrides - the value provided by the pod spec. Value - must be non-negative integer. The value - zero indicates stop immediately via the - kill signal (no opportunity to shut down). - This is a beta field and requires enabling - ProbeTerminationGracePeriod feature gate. - Minimum value is 1. spec.terminationGracePeriodSeconds - is used if unset. + description: |- + Optional duration in seconds the pod needs to terminate gracefully upon probe failure. + The grace period is the duration in seconds after the processes running in the pod are sent + a termination signal and the time when the processes are forcibly halted with a kill signal. + Set this value longer than the expected cleanup time for your process. + If this value is nil, the pod's terminationGracePeriodSeconds will be used. 
Otherwise, this + value overrides the value provided by the pod spec. + Value must be non-negative integer. The value zero indicates stop immediately via + the kill signal (no opportunity to shut down). + This is a beta field and requires enabling ProbeTerminationGracePeriod feature gate. + Minimum value is 1. spec.terminationGracePeriodSeconds is used if unset. format: int64 type: integer timeoutSeconds: - description: 'Number of seconds after which - the probe times out. Defaults to 1 second. - Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + description: |- + Number of seconds after which the probe times out. + Defaults to 1 second. Minimum value is 1. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes format: int32 type: integer type: object @@ -3606,14 +3732,14 @@ spec: resource resize policy for the container. properties: resourceName: - description: 'Name of the resource to which - this resource resize policy applies. Supported - values: cpu, memory.' + description: |- + Name of the resource to which this resource resize policy applies. + Supported values: cpu, memory. type: string restartPolicy: - description: Restart policy to apply when - specified resource is resized. If not - specified, it defaults to NotRequired. + description: |- + Restart policy to apply when specified resource is resized. + If not specified, it defaults to NotRequired. type: string required: - resourceName @@ -3622,26 +3748,31 @@ spec: type: array x-kubernetes-list-type: atomic resources: - description: 'Compute Resources required by this - container. Cannot be updated. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/' + description: |- + Compute Resources required by this container. + Cannot be updated. 
+ More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ properties: claims: - description: "Claims lists the names of resources, - defined in spec.resourceClaims, that are - used by this container. \n This is an alpha - field and requires enabling the DynamicResourceAllocation - feature gate. \n This field is immutable. - It can only be set for containers." + description: |- + Claims lists the names of resources, defined in spec.resourceClaims, + that are used by this container. + + + This is an alpha field and requires enabling the + DynamicResourceAllocation feature gate. + + + This field is immutable. It can only be set for containers. items: description: ResourceClaim references one entry in PodSpec.ResourceClaims. properties: name: - description: Name must match the name - of one entry in pod.spec.resourceClaims - of the Pod where this field is used. - It makes that resource available inside - a container. + description: |- + Name must match the name of one entry in pod.spec.resourceClaims of + the Pod where this field is used. It makes that resource available + inside a container. type: string required: - name @@ -3657,9 +3788,9 @@ spec: - type: string pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true - description: 'Limits describes the maximum - amount of compute resources allowed. More - info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/' + description: |- + Limits describes the maximum amount of compute resources allowed. 
+ More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ type: object requests: additionalProperties: @@ -3668,64 +3799,76 @@ spec: - type: string pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true - description: 'Requests describes the minimum - amount of compute resources required. If - Requests is omitted for a container, it - defaults to Limits if that is explicitly - specified, otherwise to an implementation-defined - value. Requests cannot exceed Limits. More - info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/' + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ type: object type: object restartPolicy: - description: 'RestartPolicy defines the restart - behavior of individual containers in a pod. - This field may only be set for init containers, - and the only allowed value is "Always". For - non-init containers or when this field is not - specified, the restart behavior is defined by - the Pod''s restart policy and the container - type. Setting the RestartPolicy as "Always" - for the init container will have the following - effect: this init container will be continually - restarted on exit until all regular containers - have terminated. Once all regular containers - have completed, all init containers with restartPolicy - "Always" will be shut down. This lifecycle differs - from normal init containers and is often referred - to as a "sidecar" container. 
Although this init - container still starts in the init container - sequence, it does not wait for the container - to complete before proceeding to the next init - container. Instead, the next init container - starts immediately after this init container - is started, or after any startupProbe has successfully - completed.' + description: |- + RestartPolicy defines the restart behavior of individual containers in a pod. + This field may only be set for init containers, and the only allowed value is "Always". + For non-init containers or when this field is not specified, + the restart behavior is defined by the Pod's restart policy and the container type. + Setting the RestartPolicy as "Always" for the init container will have the following effect: + this init container will be continually restarted on + exit until all regular containers have terminated. Once all regular + containers have completed, all init containers with restartPolicy "Always" + will be shut down. This lifecycle differs from normal init containers and + is often referred to as a "sidecar" container. Although this init + container still starts in the init container sequence, it does not wait + for the container to complete before proceeding to the next init + container. Instead, the next init container starts immediately after this + init container is started, or after any startupProbe has successfully + completed. type: string securityContext: - description: 'SecurityContext defines the security - options the container should be run with. If - set, the fields of SecurityContext override - the equivalent fields of PodSecurityContext. - More info: https://kubernetes.io/docs/tasks/configure-pod-container/security-context/' + description: |- + SecurityContext defines the security options the container should be run with. + If set, the fields of SecurityContext override the equivalent fields of PodSecurityContext. 
+ More info: https://kubernetes.io/docs/tasks/configure-pod-container/security-context/ properties: allowPrivilegeEscalation: - description: 'AllowPrivilegeEscalation controls - whether a process can gain more privileges - than its parent process. This bool directly - controls if the no_new_privs flag will be - set on the container process. AllowPrivilegeEscalation - is true always when the container is: 1) - run as Privileged 2) has CAP_SYS_ADMIN Note - that this field cannot be set when spec.os.name - is windows.' + description: |- + AllowPrivilegeEscalation controls whether a process can gain more + privileges than its parent process. This bool directly controls if + the no_new_privs flag will be set on the container process. + AllowPrivilegeEscalation is true always when the container is: + 1) run as Privileged + 2) has CAP_SYS_ADMIN + Note that this field cannot be set when spec.os.name is windows. type: boolean + appArmorProfile: + description: |- + appArmorProfile is the AppArmor options to use by this container. If set, this profile + overrides the pod's appArmorProfile. + Note that this field cannot be set when spec.os.name is windows. + properties: + localhostProfile: + description: |- + localhostProfile indicates a profile loaded on the node that should be used. + The profile must be preconfigured on the node to work. + Must match the loaded name of the profile. + Must be set if and only if type is "Localhost". + type: string + type: + description: |- + type indicates which kind of AppArmor profile will be applied. + Valid options are: + Localhost - a profile pre-loaded on the node. + RuntimeDefault - the container runtime's default profile. + Unconfined - no AppArmor enforcement. + type: string + required: + - type + type: object capabilities: - description: The capabilities to add/drop - when running containers. Defaults to the - default set of capabilities granted by the - container runtime. 
Note that this field - cannot be set when spec.os.name is windows. + description: |- + The capabilities to add/drop when running containers. + Defaults to the default set of capabilities granted by the container runtime. + Note that this field cannot be set when spec.os.name is windows. properties: add: description: Added capabilities @@ -3734,6 +3877,7 @@ spec: capabilities type type: string type: array + x-kubernetes-list-type: atomic drop: description: Removed capabilities items: @@ -3741,73 +3885,63 @@ spec: capabilities type type: string type: array + x-kubernetes-list-type: atomic type: object privileged: - description: Run container in privileged mode. - Processes in privileged containers are essentially - equivalent to root on the host. Defaults - to false. Note that this field cannot be - set when spec.os.name is windows. + description: |- + Run container in privileged mode. + Processes in privileged containers are essentially equivalent to root on the host. + Defaults to false. + Note that this field cannot be set when spec.os.name is windows. type: boolean procMount: - description: procMount denotes the type of - proc mount to use for the containers. The - default is DefaultProcMount which uses the - container runtime defaults for readonly - paths and masked paths. This requires the - ProcMountType feature flag to be enabled. - Note that this field cannot be set when - spec.os.name is windows. + description: |- + procMount denotes the type of proc mount to use for the containers. + The default is DefaultProcMount which uses the container runtime defaults for + readonly paths and masked paths. + This requires the ProcMountType feature flag to be enabled. + Note that this field cannot be set when spec.os.name is windows. type: string readOnlyRootFilesystem: - description: Whether this container has a - read-only root filesystem. Default is false. - Note that this field cannot be set when - spec.os.name is windows. 
+ description: |- + Whether this container has a read-only root filesystem. + Default is false. + Note that this field cannot be set when spec.os.name is windows. type: boolean runAsGroup: - description: The GID to run the entrypoint - of the container process. Uses runtime default - if unset. May also be set in PodSecurityContext. If - set in both SecurityContext and PodSecurityContext, - the value specified in SecurityContext takes - precedence. Note that this field cannot - be set when spec.os.name is windows. + description: |- + The GID to run the entrypoint of the container process. + Uses runtime default if unset. + May also be set in PodSecurityContext. If set in both SecurityContext and + PodSecurityContext, the value specified in SecurityContext takes precedence. + Note that this field cannot be set when spec.os.name is windows. format: int64 type: integer runAsNonRoot: - description: Indicates that the container - must run as a non-root user. If true, the - Kubelet will validate the image at runtime - to ensure that it does not run as UID 0 - (root) and fail to start the container if - it does. If unset or false, no such validation - will be performed. May also be set in PodSecurityContext. If - set in both SecurityContext and PodSecurityContext, - the value specified in SecurityContext takes - precedence. + description: |- + Indicates that the container must run as a non-root user. + If true, the Kubelet will validate the image at runtime to ensure that it + does not run as UID 0 (root) and fail to start the container if it does. + If unset or false, no such validation will be performed. + May also be set in PodSecurityContext. If set in both SecurityContext and + PodSecurityContext, the value specified in SecurityContext takes precedence. type: boolean runAsUser: - description: The UID to run the entrypoint - of the container process. Defaults to user - specified in image metadata if unspecified. - May also be set in PodSecurityContext. 
If - set in both SecurityContext and PodSecurityContext, - the value specified in SecurityContext takes - precedence. Note that this field cannot - be set when spec.os.name is windows. + description: |- + The UID to run the entrypoint of the container process. + Defaults to user specified in image metadata if unspecified. + May also be set in PodSecurityContext. If set in both SecurityContext and + PodSecurityContext, the value specified in SecurityContext takes precedence. + Note that this field cannot be set when spec.os.name is windows. format: int64 type: integer seLinuxOptions: - description: The SELinux context to be applied - to the container. If unspecified, the container - runtime will allocate a random SELinux context - for each container. May also be set in - PodSecurityContext. If set in both SecurityContext - and PodSecurityContext, the value specified - in SecurityContext takes precedence. Note - that this field cannot be set when spec.os.name - is windows. + description: |- + The SELinux context to be applied to the container. + If unspecified, the container runtime will allocate a random SELinux context for each + container. May also be set in PodSecurityContext. If set in both SecurityContext and + PodSecurityContext, the value specified in SecurityContext takes precedence. + Note that this field cannot be set when spec.os.name is windows. properties: level: description: Level is SELinux level label @@ -3827,52 +3961,44 @@ spec: type: string type: object seccompProfile: - description: The seccomp options to use by - this container. If seccomp options are provided - at both the pod & container level, the container - options override the pod options. Note that - this field cannot be set when spec.os.name - is windows. + description: |- + The seccomp options to use by this container. If seccomp options are + provided at both the pod & container level, the container options + override the pod options. 
+ Note that this field cannot be set when spec.os.name is windows. properties: localhostProfile: - description: localhostProfile indicates - a profile defined in a file on the node - should be used. The profile must be - preconfigured on the node to work. Must - be a descending path, relative to the - kubelet's configured seccomp profile - location. Must be set if type is "Localhost". - Must NOT be set for any other type. + description: |- + localhostProfile indicates a profile defined in a file on the node should be used. + The profile must be preconfigured on the node to work. + Must be a descending path, relative to the kubelet's configured seccomp profile location. + Must be set if type is "Localhost". Must NOT be set for any other type. type: string type: - description: "type indicates which kind - of seccomp profile will be applied. - Valid options are: \n Localhost - a - profile defined in a file on the node - should be used. RuntimeDefault - the - container runtime default profile should - be used. Unconfined - no profile should - be applied." + description: |- + type indicates which kind of seccomp profile will be applied. + Valid options are: + + + Localhost - a profile defined in a file on the node should be used. + RuntimeDefault - the container runtime default profile should be used. + Unconfined - no profile should be applied. type: string required: - type type: object windowsOptions: - description: The Windows specific settings - applied to all containers. If unspecified, - the options from the PodSecurityContext - will be used. If set in both SecurityContext - and PodSecurityContext, the value specified - in SecurityContext takes precedence. Note - that this field cannot be set when spec.os.name - is linux. + description: |- + The Windows specific settings applied to all containers. + If unspecified, the options from the PodSecurityContext will be used. 
+ If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence. + Note that this field cannot be set when spec.os.name is linux. properties: gmsaCredentialSpec: - description: GMSACredentialSpec is where - the GMSA admission webhook (https://github.com/kubernetes-sigs/windows-gmsa) - inlines the contents of the GMSA credential - spec named by the GMSACredentialSpecName - field. + description: |- + GMSACredentialSpec is where the GMSA admission webhook + (https://github.com/kubernetes-sigs/windows-gmsa) inlines the contents of the + GMSA credential spec named by the GMSACredentialSpecName field. type: string gmsaCredentialSpecName: description: GMSACredentialSpecName is @@ -3880,65 +4006,51 @@ spec: to use. type: string hostProcess: - description: HostProcess determines if - a container should be run as a 'Host - Process' container. All of a Pod's containers - must have the same effective HostProcess - value (it is not allowed to have a mix - of HostProcess containers and non-HostProcess - containers). In addition, if HostProcess - is true then HostNetwork must also be - set to true. + description: |- + HostProcess determines if a container should be run as a 'Host Process' container. + All of a Pod's containers must have the same effective HostProcess value + (it is not allowed to have a mix of HostProcess containers and non-HostProcess containers). + In addition, if HostProcess is true then HostNetwork must also be set to true. type: boolean runAsUserName: - description: The UserName in Windows to - run the entrypoint of the container - process. Defaults to the user specified - in image metadata if unspecified. May - also be set in PodSecurityContext. If - set in both SecurityContext and PodSecurityContext, - the value specified in SecurityContext - takes precedence. + description: |- + The UserName in Windows to run the entrypoint of the container process. 
+ Defaults to the user specified in image metadata if unspecified. + May also be set in PodSecurityContext. If set in both SecurityContext and + PodSecurityContext, the value specified in SecurityContext takes precedence. type: string type: object type: object startupProbe: - description: 'StartupProbe indicates that the - Pod has successfully initialized. If specified, - no other probes are executed until this completes - successfully. If this probe fails, the Pod will - be restarted, just as if the livenessProbe failed. - This can be used to provide different probe - parameters at the beginning of a Pod''s lifecycle, - when it might take a long time to load data - or warm a cache, than during steady-state operation. - This cannot be updated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + description: |- + StartupProbe indicates that the Pod has successfully initialized. + If specified, no other probes are executed until this completes successfully. + If this probe fails, the Pod will be restarted, just as if the livenessProbe failed. + This can be used to provide different probe parameters at the beginning of a Pod's lifecycle, + when it might take a long time to load data or warm a cache, than during steady-state operation. + This cannot be updated. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes properties: exec: description: Exec specifies the action to take. properties: command: - description: Command is the command line - to execute inside the container, the - working directory for the command is - root ('/') in the container's filesystem. - The command is simply exec'd, it is - not run inside a shell, so traditional - shell instructions ('|', etc) won't - work. To use a shell, you need to explicitly - call out to that shell. Exit status - of 0 is treated as live/healthy and - non-zero is unhealthy. 
+ description: |- + Command is the command line to execute inside the container, the working directory for the + command is root ('/') in the container's filesystem. The command is simply exec'd, it is + not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use + a shell, you need to explicitly call out to that shell. + Exit status of 0 is treated as live/healthy and non-zero is unhealthy. items: type: string type: array + x-kubernetes-list-type: atomic type: object failureThreshold: - description: Minimum consecutive failures - for the probe to be considered failed after - having succeeded. Defaults to 3. Minimum - value is 1. + description: |- + Minimum consecutive failures for the probe to be considered failed after having succeeded. + Defaults to 3. Minimum value is 1. format: int32 type: integer grpc: @@ -3951,11 +4063,12 @@ spec: format: int32 type: integer service: - description: "Service is the name of the - service to place in the gRPC HealthCheckRequest + description: |- + Service is the name of the service to place in the gRPC HealthCheckRequest (see https://github.com/grpc/grpc/blob/master/doc/health-checking.md). - \n If this is not specified, the default - behavior is defined by gRPC." + + + If this is not specified, the default behavior is defined by gRPC. type: string required: - port @@ -3965,9 +4078,9 @@ spec: to perform. properties: host: - description: Host name to connect to, - defaults to the pod IP. You probably - want to set "Host" in httpHeaders instead. + description: |- + Host name to connect to, defaults to the pod IP. You probably want to set + "Host" in httpHeaders instead. type: string httpHeaders: description: Custom headers to set in @@ -3977,11 +4090,9 @@ spec: custom header to be used in HTTP probes properties: name: - description: The header field name. - This will be canonicalized upon - output, so case-variant names - will be understood as the same - header. 
+ description: |- + The header field name. + This will be canonicalized upon output, so case-variant names will be understood as the same header. type: string value: description: The header field value @@ -3991,6 +4102,7 @@ spec: - value type: object type: array + x-kubernetes-list-type: atomic path: description: Path to access on the HTTP server. @@ -3999,36 +4111,35 @@ spec: anyOf: - type: integer - type: string - description: Name or number of the port - to access on the container. Number must - be in the range 1 to 65535. Name must - be an IANA_SVC_NAME. + description: |- + Name or number of the port to access on the container. + Number must be in the range 1 to 65535. + Name must be an IANA_SVC_NAME. x-kubernetes-int-or-string: true scheme: - description: Scheme to use for connecting - to the host. Defaults to HTTP. + description: |- + Scheme to use for connecting to the host. + Defaults to HTTP. type: string required: - port type: object initialDelaySeconds: - description: 'Number of seconds after the - container has started before liveness probes - are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + description: |- + Number of seconds after the container has started before liveness probes are initiated. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes format: int32 type: integer periodSeconds: - description: How often (in seconds) to perform - the probe. Default to 10 seconds. Minimum - value is 1. + description: |- + How often (in seconds) to perform the probe. + Default to 10 seconds. Minimum value is 1. format: int32 type: integer successThreshold: - description: Minimum consecutive successes - for the probe to be considered successful - after having failed. Defaults to 1. Must - be 1 for liveness and startup. Minimum value - is 1. + description: |- + Minimum consecutive successes for the probe to be considered successful after having failed. 
+ Defaults to 1. Must be 1 for liveness and startup. Minimum value is 1. format: int32 type: integer tcpSocket: @@ -4043,92 +4154,76 @@ spec: anyOf: - type: integer - type: string - description: Number or name of the port - to access on the container. Number must - be in the range 1 to 65535. Name must - be an IANA_SVC_NAME. + description: |- + Number or name of the port to access on the container. + Number must be in the range 1 to 65535. + Name must be an IANA_SVC_NAME. x-kubernetes-int-or-string: true required: - port type: object terminationGracePeriodSeconds: - description: Optional duration in seconds - the pod needs to terminate gracefully upon - probe failure. The grace period is the duration - in seconds after the processes running in - the pod are sent a termination signal and - the time when the processes are forcibly - halted with a kill signal. Set this value - longer than the expected cleanup time for - your process. If this value is nil, the - pod's terminationGracePeriodSeconds will - be used. Otherwise, this value overrides - the value provided by the pod spec. Value - must be non-negative integer. The value - zero indicates stop immediately via the - kill signal (no opportunity to shut down). - This is a beta field and requires enabling - ProbeTerminationGracePeriod feature gate. - Minimum value is 1. spec.terminationGracePeriodSeconds - is used if unset. + description: |- + Optional duration in seconds the pod needs to terminate gracefully upon probe failure. + The grace period is the duration in seconds after the processes running in the pod are sent + a termination signal and the time when the processes are forcibly halted with a kill signal. + Set this value longer than the expected cleanup time for your process. + If this value is nil, the pod's terminationGracePeriodSeconds will be used. Otherwise, this + value overrides the value provided by the pod spec. + Value must be non-negative integer. 
The value zero indicates stop immediately via + the kill signal (no opportunity to shut down). + This is a beta field and requires enabling ProbeTerminationGracePeriod feature gate. + Minimum value is 1. spec.terminationGracePeriodSeconds is used if unset. format: int64 type: integer timeoutSeconds: - description: 'Number of seconds after which - the probe times out. Defaults to 1 second. - Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + description: |- + Number of seconds after which the probe times out. + Defaults to 1 second. Minimum value is 1. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes format: int32 type: integer type: object stdin: - description: Whether this container should allocate - a buffer for stdin in the container runtime. - If this is not set, reads from stdin in the - container will always result in EOF. Default - is false. + description: |- + Whether this container should allocate a buffer for stdin in the container runtime. If this + is not set, reads from stdin in the container will always result in EOF. + Default is false. type: boolean stdinOnce: - description: Whether the container runtime should - close the stdin channel after it has been opened - by a single attach. When stdin is true the stdin - stream will remain open across multiple attach - sessions. If stdinOnce is set to true, stdin - is opened on container start, is empty until - the first client attaches to stdin, and then - remains open and accepts data until the client - disconnects, at which time stdin is closed and - remains closed until the container is restarted. - If this flag is false, a container processes - that reads from stdin will never receive an - EOF. Default is false + description: |- + Whether the container runtime should close the stdin channel after it has been opened by + a single attach. 
When stdin is true the stdin stream will remain open across multiple attach + sessions. If stdinOnce is set to true, stdin is opened on container start, is empty until the + first client attaches to stdin, and then remains open and accepts data until the client disconnects, + at which time stdin is closed and remains closed until the container is restarted. If this + flag is false, a container processes that reads from stdin will never receive an EOF. + Default is false type: boolean terminationMessagePath: - description: 'Optional: Path at which the file - to which the container''s termination message - will be written is mounted into the container''s - filesystem. Message written is intended to be - brief final status, such as an assertion failure - message. Will be truncated by the node if greater - than 4096 bytes. The total message length across - all containers will be limited to 12kb. Defaults - to /dev/termination-log. Cannot be updated.' + description: |- + Optional: Path at which the file to which the container's termination message + will be written is mounted into the container's filesystem. + Message written is intended to be brief final status, such as an assertion failure message. + Will be truncated by the node if greater than 4096 bytes. The total message length across + all containers will be limited to 12kb. + Defaults to /dev/termination-log. + Cannot be updated. type: string terminationMessagePolicy: - description: Indicate how the termination message - should be populated. File will use the contents - of terminationMessagePath to populate the container - status message on both success and failure. - FallbackToLogsOnError will use the last chunk - of container log output if the termination message - file is empty and the container exited with - an error. The log output is limited to 2048 - bytes or 80 lines, whichever is smaller. Defaults - to File. Cannot be updated. + description: |- + Indicate how the termination message should be populated. 
File will use the contents of + terminationMessagePath to populate the container status message on both success and failure. + FallbackToLogsOnError will use the last chunk of container log output if the termination + message file is empty and the container exited with an error. + The log output is limited to 2048 bytes or 80 lines, whichever is smaller. + Defaults to File. + Cannot be updated. type: string tty: - description: Whether this container should allocate - a TTY for itself, also requires 'stdin' to be - true. Default is false. + description: |- + Whether this container should allocate a TTY for itself, also requires 'stdin' to be true. + Default is false. type: boolean volumeDevices: description: volumeDevices is the list of block @@ -4151,82 +4246,118 @@ spec: - name type: object type: array + x-kubernetes-list-map-keys: + - devicePath + x-kubernetes-list-type: map volumeMounts: - description: Pod volumes to mount into the container's - filesystem. Cannot be updated. + description: |- + Pod volumes to mount into the container's filesystem. + Cannot be updated. items: description: VolumeMount describes a mounting of a Volume within a container. properties: mountPath: - description: Path within the container at - which the volume should be mounted. Must + description: |- + Path within the container at which the volume should be mounted. Must not contain ':'. type: string mountPropagation: - description: mountPropagation determines - how mounts are propagated from the host + description: |- + mountPropagation determines how mounts are propagated from the host to container and the other way around. - When not set, MountPropagationNone is - used. This field is beta in 1.10. + When not set, MountPropagationNone is used. + This field is beta in 1.10. + When RecursiveReadOnly is set to IfPossible or to Enabled, MountPropagation must be None or unspecified + (which defaults to None). type: string name: description: This must match the Name of a Volume. 
type: string readOnly: - description: Mounted read-only if true, - read-write otherwise (false or unspecified). + description: |- + Mounted read-only if true, read-write otherwise (false or unspecified). Defaults to false. type: boolean + recursiveReadOnly: + description: |- + RecursiveReadOnly specifies whether read-only mounts should be handled + recursively. + + + If ReadOnly is false, this field has no meaning and must be unspecified. + + + If ReadOnly is true, and this field is set to Disabled, the mount is not made + recursively read-only. If this field is set to IfPossible, the mount is made + recursively read-only, if it is supported by the container runtime. If this + field is set to Enabled, the mount is made recursively read-only if it is + supported by the container runtime, otherwise the pod will not be started and + an error will be generated to indicate the reason. + + + If this field is set to IfPossible or Enabled, MountPropagation must be set to + None (or be unspecified, which defaults to None). + + + If this field is not specified, it is treated as an equivalent of Disabled. + type: string subPath: - description: Path within the volume from - which the container's volume should be - mounted. Defaults to "" (volume's root). + description: |- + Path within the volume from which the container's volume should be mounted. + Defaults to "" (volume's root). type: string subPathExpr: - description: Expanded path within the volume - from which the container's volume should - be mounted. Behaves similarly to SubPath - but environment variable references $(VAR_NAME) - are expanded using the container's environment. - Defaults to "" (volume's root). SubPathExpr - and SubPath are mutually exclusive. + description: |- + Expanded path within the volume from which the container's volume should be mounted. + Behaves similarly to SubPath but environment variable references $(VAR_NAME) are expanded using the container's environment. 
+ Defaults to "" (volume's root). + SubPathExpr and SubPath are mutually exclusive. type: string required: - mountPath - name type: object type: array + x-kubernetes-list-map-keys: + - mountPath + x-kubernetes-list-type: map workingDir: - description: Container's working directory. If - not specified, the container runtime's default - will be used, which might be configured in the - container image. Cannot be updated. + description: |- + Container's working directory. + If not specified, the container runtime's default will be used, which + might be configured in the container image. + Cannot be updated. type: string required: - name type: object type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map dnsConfig: - description: Specifies the DNS parameters of a pod. - Parameters specified here will be merged to the generated - DNS configuration based on DNSPolicy. + description: |- + Specifies the DNS parameters of a pod. + Parameters specified here will be merged to the generated DNS + configuration based on DNSPolicy. properties: nameservers: - description: A list of DNS name server IP addresses. - This will be appended to the base nameservers - generated from DNSPolicy. Duplicated nameservers - will be removed. + description: |- + A list of DNS name server IP addresses. + This will be appended to the base nameservers generated from DNSPolicy. + Duplicated nameservers will be removed. items: type: string type: array + x-kubernetes-list-type: atomic options: - description: A list of DNS resolver options. This - will be merged with the base options generated - from DNSPolicy. Duplicated entries will be removed. - Resolution options given in Options will override - those that appear in the base DNSPolicy. + description: |- + A list of DNS resolver options. + This will be merged with the base options generated from DNSPolicy. + Duplicated entries will be removed. 
Resolution options given in Options + will override those that appear in the base DNSPolicy. items: description: PodDNSConfigOption defines DNS resolver options of a pod. @@ -4238,86 +4369,82 @@ spec: type: string type: object type: array + x-kubernetes-list-type: atomic searches: - description: A list of DNS search domains for host-name - lookup. This will be appended to the base search - paths generated from DNSPolicy. Duplicated search - paths will be removed. + description: |- + A list of DNS search domains for host-name lookup. + This will be appended to the base search paths generated from DNSPolicy. + Duplicated search paths will be removed. items: type: string type: array + x-kubernetes-list-type: atomic type: object dnsPolicy: - description: Set DNS policy for the pod. Defaults to - "ClusterFirst". Valid values are 'ClusterFirstWithHostNet', - 'ClusterFirst', 'Default' or 'None'. DNS parameters - given in DNSConfig will be merged with the policy - selected with DNSPolicy. To have DNS options set along - with hostNetwork, you have to specify DNS policy explicitly - to 'ClusterFirstWithHostNet'. + description: |- + Set DNS policy for the pod. + Defaults to "ClusterFirst". + Valid values are 'ClusterFirstWithHostNet', 'ClusterFirst', 'Default' or 'None'. + DNS parameters given in DNSConfig will be merged with the policy selected with DNSPolicy. + To have DNS options set along with hostNetwork, you have to specify DNS policy + explicitly to 'ClusterFirstWithHostNet'. type: string enableServiceLinks: - description: 'EnableServiceLinks indicates whether information - about services should be injected into pod''s environment - variables, matching the syntax of Docker links. Optional: - Defaults to true.' + description: |- + EnableServiceLinks indicates whether information about services should be injected into pod's + environment variables, matching the syntax of Docker links. + Optional: Defaults to true. 
type: boolean ephemeralContainers: - description: List of ephemeral containers run in this - pod. Ephemeral containers may be run in an existing - pod to perform user-initiated actions such as debugging. - This list cannot be specified when creating a pod, - and it cannot be modified by updating the pod spec. - In order to add an ephemeral container to an existing - pod, use the pod's ephemeralcontainers subresource. + description: |- + List of ephemeral containers run in this pod. Ephemeral containers may be run in an existing + pod to perform user-initiated actions such as debugging. This list cannot be specified when + creating a pod, and it cannot be modified by updating the pod spec. In order to add an + ephemeral container to an existing pod, use the pod's ephemeralcontainers subresource. items: - description: "An EphemeralContainer is a temporary - container that you may add to an existing Pod for - user-initiated activities such as debugging. Ephemeral - containers have no resource or scheduling guarantees, - and they will not be restarted when they exit or - when a Pod is removed or restarted. The kubelet - may evict a Pod if an ephemeral container causes - the Pod to exceed its resource allocation. \n To - add an ephemeral container, use the ephemeralcontainers - subresource of an existing Pod. Ephemeral containers - may not be removed or restarted." + description: |- + An EphemeralContainer is a temporary container that you may add to an existing Pod for + user-initiated activities such as debugging. Ephemeral containers have no resource or + scheduling guarantees, and they will not be restarted when they exit or when a Pod is + removed or restarted. The kubelet may evict a Pod if an ephemeral container causes the + Pod to exceed its resource allocation. + + + To add an ephemeral container, use the ephemeralcontainers subresource of an existing + Pod. Ephemeral containers may not be removed or restarted. 
properties: args: - description: 'Arguments to the entrypoint. The - image''s CMD is used if this is not provided. - Variable references $(VAR_NAME) are expanded - using the container''s environment. If a variable - cannot be resolved, the reference in the input - string will be unchanged. Double $$ are reduced - to a single $, which allows for escaping the - $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" will - produce the string literal "$(VAR_NAME)". Escaped - references will never be expanded, regardless - of whether the variable exists or not. Cannot - be updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell' + description: |- + Arguments to the entrypoint. + The image's CMD is used if this is not provided. + Variable references $(VAR_NAME) are expanded using the container's environment. If a variable + cannot be resolved, the reference in the input string will be unchanged. Double $$ are reduced + to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" will + produce the string literal "$(VAR_NAME)". Escaped references will never be expanded, regardless + of whether the variable exists or not. Cannot be updated. + More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell items: type: string type: array + x-kubernetes-list-type: atomic command: - description: 'Entrypoint array. Not executed within - a shell. The image''s ENTRYPOINT is used if - this is not provided. Variable references $(VAR_NAME) - are expanded using the container''s environment. - If a variable cannot be resolved, the reference - in the input string will be unchanged. Double - $$ are reduced to a single $, which allows for - escaping the $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" - will produce the string literal "$(VAR_NAME)". 
- Escaped references will never be expanded, regardless - of whether the variable exists or not. Cannot - be updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell' + description: |- + Entrypoint array. Not executed within a shell. + The image's ENTRYPOINT is used if this is not provided. + Variable references $(VAR_NAME) are expanded using the container's environment. If a variable + cannot be resolved, the reference in the input string will be unchanged. Double $$ are reduced + to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" will + produce the string literal "$(VAR_NAME)". Escaped references will never be expanded, regardless + of whether the variable exists or not. Cannot be updated. + More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell items: type: string type: array + x-kubernetes-list-type: atomic env: - description: List of environment variables to - set in the container. Cannot be updated. + description: |- + List of environment variables to set in the container. + Cannot be updated. items: description: EnvVar represents an environment variable present in a Container. @@ -4327,19 +4454,16 @@ spec: Must be a C_IDENTIFIER. type: string value: - description: 'Variable references $(VAR_NAME) - are expanded using the previously defined - environment variables in the container - and any service environment variables. - If a variable cannot be resolved, the - reference in the input string will be - unchanged. Double $$ are reduced to a - single $, which allows for escaping the - $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" - will produce the string literal "$(VAR_NAME)". - Escaped references will never be expanded, - regardless of whether the variable exists - or not. Defaults to "".' 
+ description: |- + Variable references $(VAR_NAME) are expanded + using the previously defined environment variables in the container and + any service environment variables. If a variable cannot be resolved, + the reference in the input string will be unchanged. Double $$ are reduced + to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. + "$$(VAR_NAME)" will produce the string literal "$(VAR_NAME)". + Escaped references will never be expanded, regardless of whether the variable + exists or not. + Defaults to "". type: string valueFrom: description: Source for the environment @@ -4353,10 +4477,15 @@ spec: description: The key to select. type: string name: - description: 'Name of the referent. + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + TODO: Add other useful fields. apiVersion, kind, uid? More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Add other useful fields. - apiVersion, kind, uid?' + TODO: Drop `kubebuilder:default` when controller-gen doesn't need it https://github.com/kubernetes-sigs/kubebuilder/issues/3896. type: string optional: description: Specify whether the @@ -4367,11 +4496,9 @@ spec: type: object x-kubernetes-map-type: atomic fieldRef: - description: 'Selects a field of the - pod: supports metadata.name, metadata.namespace, - `metadata.labels['''']`, `metadata.annotations['''']`, - spec.nodeName, spec.serviceAccountName, - status.hostIP, status.podIP, status.podIPs.' + description: |- + Selects a field of the pod: supports metadata.name, metadata.namespace, `metadata.labels['']`, `metadata.annotations['']`, + spec.nodeName, spec.serviceAccountName, status.hostIP, status.podIP, status.podIPs. 
properties: apiVersion: description: Version of the schema @@ -4387,12 +4514,9 @@ spec: type: object x-kubernetes-map-type: atomic resourceFieldRef: - description: 'Selects a resource of - the container: only resources limits - and requests (limits.cpu, limits.memory, - limits.ephemeral-storage, requests.cpu, - requests.memory and requests.ephemeral-storage) - are currently supported.' + description: |- + Selects a resource of the container: only resources limits and requests + (limits.cpu, limits.memory, limits.ephemeral-storage, requests.cpu, requests.memory and requests.ephemeral-storage) are currently supported. properties: containerName: description: 'Container name: required @@ -4426,10 +4550,15 @@ spec: secret key. type: string name: - description: 'Name of the referent. + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + TODO: Add other useful fields. apiVersion, kind, uid? More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Add other useful fields. - apiVersion, kind, uid?' + TODO: Drop `kubebuilder:default` when controller-gen doesn't need it https://github.com/kubernetes-sigs/kubebuilder/issues/3896. type: string optional: description: Specify whether the @@ -4444,16 +4573,17 @@ spec: - name type: object type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map envFrom: - description: List of sources to populate environment - variables in the container. The keys defined - within a source must be a C_IDENTIFIER. All - invalid keys will be reported as an event when - the container is starting. When a key exists - in multiple sources, the value associated with - the last source will take precedence. Values - defined by an Env with a duplicate key will - take precedence. Cannot be updated. 
+ description: |- + List of sources to populate environment variables in the container. + The keys defined within a source must be a C_IDENTIFIER. All invalid keys + will be reported as an event when the container is starting. When a key exists in multiple + sources, the value associated with the last source will take precedence. + Values defined by an Env with a duplicate key will take precedence. + Cannot be updated. items: description: EnvFromSource represents the source of a set of ConfigMaps @@ -4462,10 +4592,15 @@ spec: description: The ConfigMap to select from properties: name: - description: 'Name of the referent. + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + TODO: Add other useful fields. apiVersion, kind, uid? More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Add other useful fields. apiVersion, - kind, uid?' + TODO: Drop `kubebuilder:default` when controller-gen doesn't need it https://github.com/kubernetes-sigs/kubebuilder/issues/3896. type: string optional: description: Specify whether the ConfigMap @@ -4482,10 +4617,15 @@ spec: description: The Secret to select from properties: name: - description: 'Name of the referent. + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + TODO: Add other useful fields. apiVersion, kind, uid? More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Add other useful fields. apiVersion, - kind, uid?' + TODO: Drop `kubebuilder:default` when controller-gen doesn't need it https://github.com/kubernetes-sigs/kubebuilder/issues/3896. 
type: string optional: description: Specify whether the Secret @@ -4495,57 +4635,55 @@ spec: x-kubernetes-map-type: atomic type: object type: array + x-kubernetes-list-type: atomic image: - description: 'Container image name. More info: - https://kubernetes.io/docs/concepts/containers/images' + description: |- + Container image name. + More info: https://kubernetes.io/docs/concepts/containers/images type: string imagePullPolicy: - description: 'Image pull policy. One of Always, - Never, IfNotPresent. Defaults to Always if :latest - tag is specified, or IfNotPresent otherwise. - Cannot be updated. More info: https://kubernetes.io/docs/concepts/containers/images#updating-images' + description: |- + Image pull policy. + One of Always, Never, IfNotPresent. + Defaults to Always if :latest tag is specified, or IfNotPresent otherwise. + Cannot be updated. + More info: https://kubernetes.io/docs/concepts/containers/images#updating-images type: string lifecycle: description: Lifecycle is not allowed for ephemeral containers. properties: postStart: - description: 'PostStart is called immediately - after a container is created. If the handler - fails, the container is terminated and restarted - according to its restart policy. Other management - of the container blocks until the hook completes. - More info: https://kubernetes.io/docs/concepts/containers/container-lifecycle-hooks/#container-hooks' + description: |- + PostStart is called immediately after a container is created. If the handler fails, + the container is terminated and restarted according to its restart policy. + Other management of the container blocks until the hook completes. + More info: https://kubernetes.io/docs/concepts/containers/container-lifecycle-hooks/#container-hooks properties: exec: description: Exec specifies the action to take. 
properties: command: - description: Command is the command - line to execute inside the container, - the working directory for the command is - root ('/') in the container's filesystem. - The command is simply exec'd, it - is not run inside a shell, so traditional - shell instructions ('|', etc) won't - work. To use a shell, you need to - explicitly call out to that shell. - Exit status of 0 is treated as live/healthy - and non-zero is unhealthy. + description: |- + Command is the command line to execute inside the container, the working directory for the + command is root ('/') in the container's filesystem. The command is simply exec'd, it is + not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use + a shell, you need to explicitly call out to that shell. + Exit status of 0 is treated as live/healthy and non-zero is unhealthy. items: type: string type: array + x-kubernetes-list-type: atomic type: object httpGet: description: HTTPGet specifies the http request to perform. properties: host: - description: Host name to connect - to, defaults to the pod IP. You - probably want to set "Host" in httpHeaders - instead. + description: |- + Host name to connect to, defaults to the pod IP. You probably want to set + "Host" in httpHeaders instead. type: string httpHeaders: description: Custom headers to set @@ -4557,11 +4695,9 @@ spec: HTTP probes properties: name: - description: The header field - name. This will be canonicalized - upon output, so case-variant - names will be understood as - the same header. + description: |- + The header field name. + This will be canonicalized upon output, so case-variant names will be understood as the same header. type: string value: description: The header field @@ -4572,6 +4708,7 @@ spec: - value type: object type: array + x-kubernetes-list-type: atomic path: description: Path to access on the HTTP server. 
@@ -4580,14 +4717,15 @@ spec: anyOf: - type: integer - type: string - description: Name or number of the - port to access on the container. - Number must be in the range 1 to - 65535. Name must be an IANA_SVC_NAME. + description: |- + Name or number of the port to access on the container. + Number must be in the range 1 to 65535. + Name must be an IANA_SVC_NAME. x-kubernetes-int-or-string: true scheme: - description: Scheme to use for connecting - to the host. Defaults to HTTP. + description: |- + Scheme to use for connecting to the host. + Defaults to HTTP. type: string required: - port @@ -4606,12 +4744,10 @@ spec: - seconds type: object tcpSocket: - description: Deprecated. TCPSocket is - NOT supported as a LifecycleHandler - and kept for the backward compatibility. - There are no validation of this field - and lifecycle hooks will fail in runtime - when tcp handler is specified. + description: |- + Deprecated. TCPSocket is NOT supported as a LifecycleHandler and kept + for the backward compatibility. There are no validation of this field and + lifecycle hooks will fail in runtime when tcp handler is specified. properties: host: description: 'Optional: Host name @@ -4622,61 +4758,51 @@ spec: anyOf: - type: integer - type: string - description: Number or name of the - port to access on the container. - Number must be in the range 1 to - 65535. Name must be an IANA_SVC_NAME. + description: |- + Number or name of the port to access on the container. + Number must be in the range 1 to 65535. + Name must be an IANA_SVC_NAME. x-kubernetes-int-or-string: true required: - port type: object type: object preStop: - description: 'PreStop is called immediately - before a container is terminated due to - an API request or management event such - as liveness/startup probe failure, preemption, - resource contention, etc. The handler is - not called if the container crashes or exits. - The Pod''s termination grace period countdown - begins before the PreStop hook is executed. 
- Regardless of the outcome of the handler, - the container will eventually terminate - within the Pod''s termination grace period - (unless delayed by finalizers). Other management - of the container blocks until the hook completes - or until the termination grace period is - reached. More info: https://kubernetes.io/docs/concepts/containers/container-lifecycle-hooks/#container-hooks' + description: |- + PreStop is called immediately before a container is terminated due to an + API request or management event such as liveness/startup probe failure, + preemption, resource contention, etc. The handler is not called if the + container crashes or exits. The Pod's termination grace period countdown begins before the + PreStop hook is executed. Regardless of the outcome of the handler, the + container will eventually terminate within the Pod's termination grace + period (unless delayed by finalizers). Other management of the container blocks until the hook completes + or until the termination grace period is reached. + More info: https://kubernetes.io/docs/concepts/containers/container-lifecycle-hooks/#container-hooks properties: exec: description: Exec specifies the action to take. properties: command: - description: Command is the command - line to execute inside the container, - the working directory for the command is - root ('/') in the container's filesystem. - The command is simply exec'd, it - is not run inside a shell, so traditional - shell instructions ('|', etc) won't - work. To use a shell, you need to - explicitly call out to that shell. - Exit status of 0 is treated as live/healthy - and non-zero is unhealthy. + description: |- + Command is the command line to execute inside the container, the working directory for the + command is root ('/') in the container's filesystem. The command is simply exec'd, it is + not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use + a shell, you need to explicitly call out to that shell. 
+ Exit status of 0 is treated as live/healthy and non-zero is unhealthy. items: type: string type: array + x-kubernetes-list-type: atomic type: object httpGet: description: HTTPGet specifies the http request to perform. properties: host: - description: Host name to connect - to, defaults to the pod IP. You - probably want to set "Host" in httpHeaders - instead. + description: |- + Host name to connect to, defaults to the pod IP. You probably want to set + "Host" in httpHeaders instead. type: string httpHeaders: description: Custom headers to set @@ -4688,11 +4814,9 @@ spec: HTTP probes properties: name: - description: The header field - name. This will be canonicalized - upon output, so case-variant - names will be understood as - the same header. + description: |- + The header field name. + This will be canonicalized upon output, so case-variant names will be understood as the same header. type: string value: description: The header field @@ -4703,6 +4827,7 @@ spec: - value type: object type: array + x-kubernetes-list-type: atomic path: description: Path to access on the HTTP server. @@ -4711,14 +4836,15 @@ spec: anyOf: - type: integer - type: string - description: Name or number of the - port to access on the container. - Number must be in the range 1 to - 65535. Name must be an IANA_SVC_NAME. + description: |- + Name or number of the port to access on the container. + Number must be in the range 1 to 65535. + Name must be an IANA_SVC_NAME. x-kubernetes-int-or-string: true scheme: - description: Scheme to use for connecting - to the host. Defaults to HTTP. + description: |- + Scheme to use for connecting to the host. + Defaults to HTTP. type: string required: - port @@ -4737,12 +4863,10 @@ spec: - seconds type: object tcpSocket: - description: Deprecated. TCPSocket is - NOT supported as a LifecycleHandler - and kept for the backward compatibility. - There are no validation of this field - and lifecycle hooks will fail in runtime - when tcp handler is specified. 
+ description: |- + Deprecated. TCPSocket is NOT supported as a LifecycleHandler and kept + for the backward compatibility. There are no validation of this field and + lifecycle hooks will fail in runtime when tcp handler is specified. properties: host: description: 'Optional: Host name @@ -4753,10 +4877,10 @@ spec: anyOf: - type: integer - type: string - description: Number or name of the - port to access on the container. - Number must be in the range 1 to - 65535. Name must be an IANA_SVC_NAME. + description: |- + Number or name of the port to access on the container. + Number must be in the range 1 to 65535. + Name must be an IANA_SVC_NAME. x-kubernetes-int-or-string: true required: - port @@ -4772,26 +4896,21 @@ spec: take. properties: command: - description: Command is the command line - to execute inside the container, the - working directory for the command is - root ('/') in the container's filesystem. - The command is simply exec'd, it is - not run inside a shell, so traditional - shell instructions ('|', etc) won't - work. To use a shell, you need to explicitly - call out to that shell. Exit status - of 0 is treated as live/healthy and - non-zero is unhealthy. + description: |- + Command is the command line to execute inside the container, the working directory for the + command is root ('/') in the container's filesystem. The command is simply exec'd, it is + not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use + a shell, you need to explicitly call out to that shell. + Exit status of 0 is treated as live/healthy and non-zero is unhealthy. items: type: string type: array + x-kubernetes-list-type: atomic type: object failureThreshold: - description: Minimum consecutive failures - for the probe to be considered failed after - having succeeded. Defaults to 3. Minimum - value is 1. + description: |- + Minimum consecutive failures for the probe to be considered failed after having succeeded. + Defaults to 3. 
Minimum value is 1. format: int32 type: integer grpc: @@ -4804,11 +4923,12 @@ spec: format: int32 type: integer service: - description: "Service is the name of the - service to place in the gRPC HealthCheckRequest + description: |- + Service is the name of the service to place in the gRPC HealthCheckRequest (see https://github.com/grpc/grpc/blob/master/doc/health-checking.md). - \n If this is not specified, the default - behavior is defined by gRPC." + + + If this is not specified, the default behavior is defined by gRPC. type: string required: - port @@ -4818,9 +4938,9 @@ spec: to perform. properties: host: - description: Host name to connect to, - defaults to the pod IP. You probably - want to set "Host" in httpHeaders instead. + description: |- + Host name to connect to, defaults to the pod IP. You probably want to set + "Host" in httpHeaders instead. type: string httpHeaders: description: Custom headers to set in @@ -4830,11 +4950,9 @@ spec: custom header to be used in HTTP probes properties: name: - description: The header field name. - This will be canonicalized upon - output, so case-variant names - will be understood as the same - header. + description: |- + The header field name. + This will be canonicalized upon output, so case-variant names will be understood as the same header. type: string value: description: The header field value @@ -4844,6 +4962,7 @@ spec: - value type: object type: array + x-kubernetes-list-type: atomic path: description: Path to access on the HTTP server. @@ -4852,36 +4971,35 @@ spec: anyOf: - type: integer - type: string - description: Name or number of the port - to access on the container. Number must - be in the range 1 to 65535. Name must - be an IANA_SVC_NAME. + description: |- + Name or number of the port to access on the container. + Number must be in the range 1 to 65535. + Name must be an IANA_SVC_NAME. x-kubernetes-int-or-string: true scheme: - description: Scheme to use for connecting - to the host. Defaults to HTTP. 
+ description: |- + Scheme to use for connecting to the host. + Defaults to HTTP. type: string required: - port type: object initialDelaySeconds: - description: 'Number of seconds after the - container has started before liveness probes - are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + description: |- + Number of seconds after the container has started before liveness probes are initiated. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes format: int32 type: integer periodSeconds: - description: How often (in seconds) to perform - the probe. Default to 10 seconds. Minimum - value is 1. + description: |- + How often (in seconds) to perform the probe. + Default to 10 seconds. Minimum value is 1. format: int32 type: integer successThreshold: - description: Minimum consecutive successes - for the probe to be considered successful - after having failed. Defaults to 1. Must - be 1 for liveness and startup. Minimum value - is 1. + description: |- + Minimum consecutive successes for the probe to be considered successful after having failed. + Defaults to 1. Must be 1 for liveness and startup. Minimum value is 1. format: int32 type: integer tcpSocket: @@ -4896,48 +5014,40 @@ spec: anyOf: - type: integer - type: string - description: Number or name of the port - to access on the container. Number must - be in the range 1 to 65535. Name must - be an IANA_SVC_NAME. + description: |- + Number or name of the port to access on the container. + Number must be in the range 1 to 65535. + Name must be an IANA_SVC_NAME. x-kubernetes-int-or-string: true required: - port type: object terminationGracePeriodSeconds: - description: Optional duration in seconds - the pod needs to terminate gracefully upon - probe failure. 
The grace period is the duration - in seconds after the processes running in - the pod are sent a termination signal and - the time when the processes are forcibly - halted with a kill signal. Set this value - longer than the expected cleanup time for - your process. If this value is nil, the - pod's terminationGracePeriodSeconds will - be used. Otherwise, this value overrides - the value provided by the pod spec. Value - must be non-negative integer. The value - zero indicates stop immediately via the - kill signal (no opportunity to shut down). - This is a beta field and requires enabling - ProbeTerminationGracePeriod feature gate. - Minimum value is 1. spec.terminationGracePeriodSeconds - is used if unset. + description: |- + Optional duration in seconds the pod needs to terminate gracefully upon probe failure. + The grace period is the duration in seconds after the processes running in the pod are sent + a termination signal and the time when the processes are forcibly halted with a kill signal. + Set this value longer than the expected cleanup time for your process. + If this value is nil, the pod's terminationGracePeriodSeconds will be used. Otherwise, this + value overrides the value provided by the pod spec. + Value must be non-negative integer. The value zero indicates stop immediately via + the kill signal (no opportunity to shut down). + This is a beta field and requires enabling ProbeTerminationGracePeriod feature gate. + Minimum value is 1. spec.terminationGracePeriodSeconds is used if unset. format: int64 type: integer timeoutSeconds: - description: 'Number of seconds after which - the probe times out. Defaults to 1 second. - Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + description: |- + Number of seconds after which the probe times out. + Defaults to 1 second. Minimum value is 1. 
+ More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes format: int32 type: integer type: object name: - description: Name of the ephemeral container specified - as a DNS_LABEL. This name must be unique among - all containers, init containers and ephemeral - containers. + description: |- + Name of the ephemeral container specified as a DNS_LABEL. + This name must be unique among all containers, init containers and ephemeral containers. type: string ports: description: Ports are not allowed for ephemeral @@ -4947,9 +5057,9 @@ spec: port in a single container. properties: containerPort: - description: Number of port to expose on - the pod's IP address. This must be a valid - port number, 0 < x < 65536. + description: |- + Number of port to expose on the pod's IP address. + This must be a valid port number, 0 < x < 65536. format: int32 type: integer hostIP: @@ -4957,24 +5067,24 @@ spec: port to. type: string hostPort: - description: Number of port to expose on - the host. If specified, this must be a - valid port number, 0 < x < 65536. If HostNetwork - is specified, this must match ContainerPort. + description: |- + Number of port to expose on the host. + If specified, this must be a valid port number, 0 < x < 65536. + If HostNetwork is specified, this must match ContainerPort. Most containers do not need this. format: int32 type: integer name: - description: If specified, this must be - an IANA_SVC_NAME and unique within the - pod. Each named port in a pod must have - a unique name. Name for the port that - can be referred to by services. + description: |- + If specified, this must be an IANA_SVC_NAME and unique within the pod. Each + named port in a pod must have a unique name. Name for the port that can be + referred to by services. type: string protocol: default: TCP - description: Protocol for port. Must be - UDP, TCP, or SCTP. Defaults to "TCP". + description: |- + Protocol for port. Must be UDP, TCP, or SCTP. 
+ Defaults to "TCP". type: string required: - containerPort @@ -4993,26 +5103,21 @@ spec: take. properties: command: - description: Command is the command line - to execute inside the container, the - working directory for the command is - root ('/') in the container's filesystem. - The command is simply exec'd, it is - not run inside a shell, so traditional - shell instructions ('|', etc) won't - work. To use a shell, you need to explicitly - call out to that shell. Exit status - of 0 is treated as live/healthy and - non-zero is unhealthy. + description: |- + Command is the command line to execute inside the container, the working directory for the + command is root ('/') in the container's filesystem. The command is simply exec'd, it is + not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use + a shell, you need to explicitly call out to that shell. + Exit status of 0 is treated as live/healthy and non-zero is unhealthy. items: type: string type: array + x-kubernetes-list-type: atomic type: object failureThreshold: - description: Minimum consecutive failures - for the probe to be considered failed after - having succeeded. Defaults to 3. Minimum - value is 1. + description: |- + Minimum consecutive failures for the probe to be considered failed after having succeeded. + Defaults to 3. Minimum value is 1. format: int32 type: integer grpc: @@ -5025,11 +5130,12 @@ spec: format: int32 type: integer service: - description: "Service is the name of the - service to place in the gRPC HealthCheckRequest + description: |- + Service is the name of the service to place in the gRPC HealthCheckRequest (see https://github.com/grpc/grpc/blob/master/doc/health-checking.md). - \n If this is not specified, the default - behavior is defined by gRPC." + + + If this is not specified, the default behavior is defined by gRPC. type: string required: - port @@ -5039,9 +5145,9 @@ spec: to perform. 
properties: host: - description: Host name to connect to, - defaults to the pod IP. You probably - want to set "Host" in httpHeaders instead. + description: |- + Host name to connect to, defaults to the pod IP. You probably want to set + "Host" in httpHeaders instead. type: string httpHeaders: description: Custom headers to set in @@ -5051,11 +5157,9 @@ spec: custom header to be used in HTTP probes properties: name: - description: The header field name. - This will be canonicalized upon - output, so case-variant names - will be understood as the same - header. + description: |- + The header field name. + This will be canonicalized upon output, so case-variant names will be understood as the same header. type: string value: description: The header field value @@ -5065,6 +5169,7 @@ spec: - value type: object type: array + x-kubernetes-list-type: atomic path: description: Path to access on the HTTP server. @@ -5073,36 +5178,35 @@ spec: anyOf: - type: integer - type: string - description: Name or number of the port - to access on the container. Number must - be in the range 1 to 65535. Name must - be an IANA_SVC_NAME. + description: |- + Name or number of the port to access on the container. + Number must be in the range 1 to 65535. + Name must be an IANA_SVC_NAME. x-kubernetes-int-or-string: true scheme: - description: Scheme to use for connecting - to the host. Defaults to HTTP. + description: |- + Scheme to use for connecting to the host. + Defaults to HTTP. type: string required: - port type: object initialDelaySeconds: - description: 'Number of seconds after the - container has started before liveness probes - are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + description: |- + Number of seconds after the container has started before liveness probes are initiated. 
+ More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes format: int32 type: integer periodSeconds: - description: How often (in seconds) to perform - the probe. Default to 10 seconds. Minimum - value is 1. + description: |- + How often (in seconds) to perform the probe. + Default to 10 seconds. Minimum value is 1. format: int32 type: integer successThreshold: - description: Minimum consecutive successes - for the probe to be considered successful - after having failed. Defaults to 1. Must - be 1 for liveness and startup. Minimum value - is 1. + description: |- + Minimum consecutive successes for the probe to be considered successful after having failed. + Defaults to 1. Must be 1 for liveness and startup. Minimum value is 1. format: int32 type: integer tcpSocket: @@ -5117,40 +5221,33 @@ spec: anyOf: - type: integer - type: string - description: Number or name of the port - to access on the container. Number must - be in the range 1 to 65535. Name must - be an IANA_SVC_NAME. + description: |- + Number or name of the port to access on the container. + Number must be in the range 1 to 65535. + Name must be an IANA_SVC_NAME. x-kubernetes-int-or-string: true required: - port type: object terminationGracePeriodSeconds: - description: Optional duration in seconds - the pod needs to terminate gracefully upon - probe failure. The grace period is the duration - in seconds after the processes running in - the pod are sent a termination signal and - the time when the processes are forcibly - halted with a kill signal. Set this value - longer than the expected cleanup time for - your process. If this value is nil, the - pod's terminationGracePeriodSeconds will - be used. Otherwise, this value overrides - the value provided by the pod spec. Value - must be non-negative integer. The value - zero indicates stop immediately via the - kill signal (no opportunity to shut down). 
- This is a beta field and requires enabling - ProbeTerminationGracePeriod feature gate. - Minimum value is 1. spec.terminationGracePeriodSeconds - is used if unset. + description: |- + Optional duration in seconds the pod needs to terminate gracefully upon probe failure. + The grace period is the duration in seconds after the processes running in the pod are sent + a termination signal and the time when the processes are forcibly halted with a kill signal. + Set this value longer than the expected cleanup time for your process. + If this value is nil, the pod's terminationGracePeriodSeconds will be used. Otherwise, this + value overrides the value provided by the pod spec. + Value must be non-negative integer. The value zero indicates stop immediately via + the kill signal (no opportunity to shut down). + This is a beta field and requires enabling ProbeTerminationGracePeriod feature gate. + Minimum value is 1. spec.terminationGracePeriodSeconds is used if unset. format: int64 type: integer timeoutSeconds: - description: 'Number of seconds after which - the probe times out. Defaults to 1 second. - Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + description: |- + Number of seconds after which the probe times out. + Defaults to 1 second. Minimum value is 1. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes format: int32 type: integer type: object @@ -5161,14 +5258,14 @@ spec: resource resize policy for the container. properties: resourceName: - description: 'Name of the resource to which - this resource resize policy applies. Supported - values: cpu, memory.' + description: |- + Name of the resource to which this resource resize policy applies. + Supported values: cpu, memory. type: string restartPolicy: - description: Restart policy to apply when - specified resource is resized. If not - specified, it defaults to NotRequired. 
+ description: |- + Restart policy to apply when specified resource is resized. + If not specified, it defaults to NotRequired. type: string required: - resourceName @@ -5177,27 +5274,30 @@ spec: type: array x-kubernetes-list-type: atomic resources: - description: Resources are not allowed for ephemeral - containers. Ephemeral containers use spare resources + description: |- + Resources are not allowed for ephemeral containers. Ephemeral containers use spare resources already allocated to the pod. properties: claims: - description: "Claims lists the names of resources, - defined in spec.resourceClaims, that are - used by this container. \n This is an alpha - field and requires enabling the DynamicResourceAllocation - feature gate. \n This field is immutable. - It can only be set for containers." + description: |- + Claims lists the names of resources, defined in spec.resourceClaims, + that are used by this container. + + + This is an alpha field and requires enabling the + DynamicResourceAllocation feature gate. + + + This field is immutable. It can only be set for containers. items: description: ResourceClaim references one entry in PodSpec.ResourceClaims. properties: name: - description: Name must match the name - of one entry in pod.spec.resourceClaims - of the Pod where this field is used. - It makes that resource available inside - a container. + description: |- + Name must match the name of one entry in pod.spec.resourceClaims of + the Pod where this field is used. It makes that resource available + inside a container. type: string required: - name @@ -5213,9 +5313,9 @@ spec: - type: string pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true - description: 'Limits describes the maximum - amount of compute resources allowed. 
More - info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/' + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ type: object requests: additionalProperties: @@ -5224,45 +5324,64 @@ spec: - type: string pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true - description: 'Requests describes the minimum - amount of compute resources required. If - Requests is omitted for a container, it - defaults to Limits if that is explicitly - specified, otherwise to an implementation-defined - value. Requests cannot exceed Limits. More - info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/' + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ type: object type: object restartPolicy: - description: Restart policy for the container - to manage the restart behavior of each container - within a pod. This may only be set for init - containers. You cannot set this field on ephemeral - containers. + description: |- + Restart policy for the container to manage the restart behavior of each + container within a pod. + This may only be set for init containers. You cannot set this field on + ephemeral containers. type: string securityContext: - description: 'Optional: SecurityContext defines - the security options the ephemeral container - should be run with. If set, the fields of SecurityContext - override the equivalent fields of PodSecurityContext.' 
+ description: |- + Optional: SecurityContext defines the security options the ephemeral container should be run with. + If set, the fields of SecurityContext override the equivalent fields of PodSecurityContext. properties: allowPrivilegeEscalation: - description: 'AllowPrivilegeEscalation controls - whether a process can gain more privileges - than its parent process. This bool directly - controls if the no_new_privs flag will be - set on the container process. AllowPrivilegeEscalation - is true always when the container is: 1) - run as Privileged 2) has CAP_SYS_ADMIN Note - that this field cannot be set when spec.os.name - is windows.' + description: |- + AllowPrivilegeEscalation controls whether a process can gain more + privileges than its parent process. This bool directly controls if + the no_new_privs flag will be set on the container process. + AllowPrivilegeEscalation is true always when the container is: + 1) run as Privileged + 2) has CAP_SYS_ADMIN + Note that this field cannot be set when spec.os.name is windows. type: boolean + appArmorProfile: + description: |- + appArmorProfile is the AppArmor options to use by this container. If set, this profile + overrides the pod's appArmorProfile. + Note that this field cannot be set when spec.os.name is windows. + properties: + localhostProfile: + description: |- + localhostProfile indicates a profile loaded on the node that should be used. + The profile must be preconfigured on the node to work. + Must match the loaded name of the profile. + Must be set if and only if type is "Localhost". + type: string + type: + description: |- + type indicates which kind of AppArmor profile will be applied. + Valid options are: + Localhost - a profile pre-loaded on the node. + RuntimeDefault - the container runtime's default profile. + Unconfined - no AppArmor enforcement. + type: string + required: + - type + type: object capabilities: - description: The capabilities to add/drop - when running containers. 
Defaults to the - default set of capabilities granted by the - container runtime. Note that this field - cannot be set when spec.os.name is windows. + description: |- + The capabilities to add/drop when running containers. + Defaults to the default set of capabilities granted by the container runtime. + Note that this field cannot be set when spec.os.name is windows. properties: add: description: Added capabilities @@ -5271,6 +5390,7 @@ spec: capabilities type type: string type: array + x-kubernetes-list-type: atomic drop: description: Removed capabilities items: @@ -5278,73 +5398,63 @@ spec: capabilities type type: string type: array + x-kubernetes-list-type: atomic type: object privileged: - description: Run container in privileged mode. - Processes in privileged containers are essentially - equivalent to root on the host. Defaults - to false. Note that this field cannot be - set when spec.os.name is windows. + description: |- + Run container in privileged mode. + Processes in privileged containers are essentially equivalent to root on the host. + Defaults to false. + Note that this field cannot be set when spec.os.name is windows. type: boolean procMount: - description: procMount denotes the type of - proc mount to use for the containers. The - default is DefaultProcMount which uses the - container runtime defaults for readonly - paths and masked paths. This requires the - ProcMountType feature flag to be enabled. - Note that this field cannot be set when - spec.os.name is windows. + description: |- + procMount denotes the type of proc mount to use for the containers. + The default is DefaultProcMount which uses the container runtime defaults for + readonly paths and masked paths. + This requires the ProcMountType feature flag to be enabled. + Note that this field cannot be set when spec.os.name is windows. type: string readOnlyRootFilesystem: - description: Whether this container has a - read-only root filesystem. Default is false. 
- Note that this field cannot be set when - spec.os.name is windows. + description: |- + Whether this container has a read-only root filesystem. + Default is false. + Note that this field cannot be set when spec.os.name is windows. type: boolean runAsGroup: - description: The GID to run the entrypoint - of the container process. Uses runtime default - if unset. May also be set in PodSecurityContext. If - set in both SecurityContext and PodSecurityContext, - the value specified in SecurityContext takes - precedence. Note that this field cannot - be set when spec.os.name is windows. + description: |- + The GID to run the entrypoint of the container process. + Uses runtime default if unset. + May also be set in PodSecurityContext. If set in both SecurityContext and + PodSecurityContext, the value specified in SecurityContext takes precedence. + Note that this field cannot be set when spec.os.name is windows. format: int64 type: integer runAsNonRoot: - description: Indicates that the container - must run as a non-root user. If true, the - Kubelet will validate the image at runtime - to ensure that it does not run as UID 0 - (root) and fail to start the container if - it does. If unset or false, no such validation - will be performed. May also be set in PodSecurityContext. If - set in both SecurityContext and PodSecurityContext, - the value specified in SecurityContext takes - precedence. + description: |- + Indicates that the container must run as a non-root user. + If true, the Kubelet will validate the image at runtime to ensure that it + does not run as UID 0 (root) and fail to start the container if it does. + If unset or false, no such validation will be performed. + May also be set in PodSecurityContext. If set in both SecurityContext and + PodSecurityContext, the value specified in SecurityContext takes precedence. type: boolean runAsUser: - description: The UID to run the entrypoint - of the container process. 
Defaults to user - specified in image metadata if unspecified. - May also be set in PodSecurityContext. If - set in both SecurityContext and PodSecurityContext, - the value specified in SecurityContext takes - precedence. Note that this field cannot - be set when spec.os.name is windows. + description: |- + The UID to run the entrypoint of the container process. + Defaults to user specified in image metadata if unspecified. + May also be set in PodSecurityContext. If set in both SecurityContext and + PodSecurityContext, the value specified in SecurityContext takes precedence. + Note that this field cannot be set when spec.os.name is windows. format: int64 type: integer seLinuxOptions: - description: The SELinux context to be applied - to the container. If unspecified, the container - runtime will allocate a random SELinux context - for each container. May also be set in - PodSecurityContext. If set in both SecurityContext - and PodSecurityContext, the value specified - in SecurityContext takes precedence. Note - that this field cannot be set when spec.os.name - is windows. + description: |- + The SELinux context to be applied to the container. + If unspecified, the container runtime will allocate a random SELinux context for each + container. May also be set in PodSecurityContext. If set in both SecurityContext and + PodSecurityContext, the value specified in SecurityContext takes precedence. + Note that this field cannot be set when spec.os.name is windows. properties: level: description: Level is SELinux level label @@ -5364,52 +5474,44 @@ spec: type: string type: object seccompProfile: - description: The seccomp options to use by - this container. If seccomp options are provided - at both the pod & container level, the container - options override the pod options. Note that - this field cannot be set when spec.os.name - is windows. + description: |- + The seccomp options to use by this container. 
If seccomp options are + provided at both the pod & container level, the container options + override the pod options. + Note that this field cannot be set when spec.os.name is windows. properties: localhostProfile: - description: localhostProfile indicates - a profile defined in a file on the node - should be used. The profile must be - preconfigured on the node to work. Must - be a descending path, relative to the - kubelet's configured seccomp profile - location. Must be set if type is "Localhost". - Must NOT be set for any other type. + description: |- + localhostProfile indicates a profile defined in a file on the node should be used. + The profile must be preconfigured on the node to work. + Must be a descending path, relative to the kubelet's configured seccomp profile location. + Must be set if type is "Localhost". Must NOT be set for any other type. type: string type: - description: "type indicates which kind - of seccomp profile will be applied. - Valid options are: \n Localhost - a - profile defined in a file on the node - should be used. RuntimeDefault - the - container runtime default profile should - be used. Unconfined - no profile should - be applied." + description: |- + type indicates which kind of seccomp profile will be applied. + Valid options are: + + + Localhost - a profile defined in a file on the node should be used. + RuntimeDefault - the container runtime default profile should be used. + Unconfined - no profile should be applied. type: string required: - type type: object windowsOptions: - description: The Windows specific settings - applied to all containers. If unspecified, - the options from the PodSecurityContext - will be used. If set in both SecurityContext - and PodSecurityContext, the value specified - in SecurityContext takes precedence. Note - that this field cannot be set when spec.os.name - is linux. + description: |- + The Windows specific settings applied to all containers. 
+ If unspecified, the options from the PodSecurityContext will be used. + If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence. + Note that this field cannot be set when spec.os.name is linux. properties: gmsaCredentialSpec: - description: GMSACredentialSpec is where - the GMSA admission webhook (https://github.com/kubernetes-sigs/windows-gmsa) - inlines the contents of the GMSA credential - spec named by the GMSACredentialSpecName - field. + description: |- + GMSACredentialSpec is where the GMSA admission webhook + (https://github.com/kubernetes-sigs/windows-gmsa) inlines the contents of the + GMSA credential spec named by the GMSACredentialSpecName field. type: string gmsaCredentialSpecName: description: GMSACredentialSpecName is @@ -5417,25 +5519,18 @@ spec: to use. type: string hostProcess: - description: HostProcess determines if - a container should be run as a 'Host - Process' container. All of a Pod's containers - must have the same effective HostProcess - value (it is not allowed to have a mix - of HostProcess containers and non-HostProcess - containers). In addition, if HostProcess - is true then HostNetwork must also be - set to true. + description: |- + HostProcess determines if a container should be run as a 'Host Process' container. + All of a Pod's containers must have the same effective HostProcess value + (it is not allowed to have a mix of HostProcess containers and non-HostProcess containers). + In addition, if HostProcess is true then HostNetwork must also be set to true. type: boolean runAsUserName: - description: The UserName in Windows to - run the entrypoint of the container - process. Defaults to the user specified - in image metadata if unspecified. May - also be set in PodSecurityContext. If - set in both SecurityContext and PodSecurityContext, - the value specified in SecurityContext - takes precedence. 
+ description: |- + The UserName in Windows to run the entrypoint of the container process. + Defaults to the user specified in image metadata if unspecified. + May also be set in PodSecurityContext. If set in both SecurityContext and + PodSecurityContext, the value specified in SecurityContext takes precedence. type: string type: object type: object @@ -5448,26 +5543,21 @@ spec: take. properties: command: - description: Command is the command line - to execute inside the container, the - working directory for the command is - root ('/') in the container's filesystem. - The command is simply exec'd, it is - not run inside a shell, so traditional - shell instructions ('|', etc) won't - work. To use a shell, you need to explicitly - call out to that shell. Exit status - of 0 is treated as live/healthy and - non-zero is unhealthy. + description: |- + Command is the command line to execute inside the container, the working directory for the + command is root ('/') in the container's filesystem. The command is simply exec'd, it is + not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use + a shell, you need to explicitly call out to that shell. + Exit status of 0 is treated as live/healthy and non-zero is unhealthy. items: type: string type: array + x-kubernetes-list-type: atomic type: object failureThreshold: - description: Minimum consecutive failures - for the probe to be considered failed after - having succeeded. Defaults to 3. Minimum - value is 1. + description: |- + Minimum consecutive failures for the probe to be considered failed after having succeeded. + Defaults to 3. Minimum value is 1. 
format: int32 type: integer grpc: @@ -5480,11 +5570,12 @@ spec: format: int32 type: integer service: - description: "Service is the name of the - service to place in the gRPC HealthCheckRequest + description: |- + Service is the name of the service to place in the gRPC HealthCheckRequest (see https://github.com/grpc/grpc/blob/master/doc/health-checking.md). - \n If this is not specified, the default - behavior is defined by gRPC." + + + If this is not specified, the default behavior is defined by gRPC. type: string required: - port @@ -5494,9 +5585,9 @@ spec: to perform. properties: host: - description: Host name to connect to, - defaults to the pod IP. You probably - want to set "Host" in httpHeaders instead. + description: |- + Host name to connect to, defaults to the pod IP. You probably want to set + "Host" in httpHeaders instead. type: string httpHeaders: description: Custom headers to set in @@ -5506,11 +5597,9 @@ spec: custom header to be used in HTTP probes properties: name: - description: The header field name. - This will be canonicalized upon - output, so case-variant names - will be understood as the same - header. + description: |- + The header field name. + This will be canonicalized upon output, so case-variant names will be understood as the same header. type: string value: description: The header field value @@ -5520,6 +5609,7 @@ spec: - value type: object type: array + x-kubernetes-list-type: atomic path: description: Path to access on the HTTP server. @@ -5528,36 +5618,35 @@ spec: anyOf: - type: integer - type: string - description: Name or number of the port - to access on the container. Number must - be in the range 1 to 65535. Name must - be an IANA_SVC_NAME. + description: |- + Name or number of the port to access on the container. + Number must be in the range 1 to 65535. + Name must be an IANA_SVC_NAME. x-kubernetes-int-or-string: true scheme: - description: Scheme to use for connecting - to the host. Defaults to HTTP. 
+ description: |- + Scheme to use for connecting to the host. + Defaults to HTTP. type: string required: - port type: object initialDelaySeconds: - description: 'Number of seconds after the - container has started before liveness probes - are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + description: |- + Number of seconds after the container has started before liveness probes are initiated. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes format: int32 type: integer periodSeconds: - description: How often (in seconds) to perform - the probe. Default to 10 seconds. Minimum - value is 1. + description: |- + How often (in seconds) to perform the probe. + Default to 10 seconds. Minimum value is 1. format: int32 type: integer successThreshold: - description: Minimum consecutive successes - for the probe to be considered successful - after having failed. Defaults to 1. Must - be 1 for liveness and startup. Minimum value - is 1. + description: |- + Minimum consecutive successes for the probe to be considered successful after having failed. + Defaults to 1. Must be 1 for liveness and startup. Minimum value is 1. format: int32 type: integer tcpSocket: @@ -5572,103 +5661,86 @@ spec: anyOf: - type: integer - type: string - description: Number or name of the port - to access on the container. Number must - be in the range 1 to 65535. Name must - be an IANA_SVC_NAME. + description: |- + Number or name of the port to access on the container. + Number must be in the range 1 to 65535. + Name must be an IANA_SVC_NAME. x-kubernetes-int-or-string: true required: - port type: object terminationGracePeriodSeconds: - description: Optional duration in seconds - the pod needs to terminate gracefully upon - probe failure. 
The grace period is the duration - in seconds after the processes running in - the pod are sent a termination signal and - the time when the processes are forcibly - halted with a kill signal. Set this value - longer than the expected cleanup time for - your process. If this value is nil, the - pod's terminationGracePeriodSeconds will - be used. Otherwise, this value overrides - the value provided by the pod spec. Value - must be non-negative integer. The value - zero indicates stop immediately via the - kill signal (no opportunity to shut down). - This is a beta field and requires enabling - ProbeTerminationGracePeriod feature gate. - Minimum value is 1. spec.terminationGracePeriodSeconds - is used if unset. + description: |- + Optional duration in seconds the pod needs to terminate gracefully upon probe failure. + The grace period is the duration in seconds after the processes running in the pod are sent + a termination signal and the time when the processes are forcibly halted with a kill signal. + Set this value longer than the expected cleanup time for your process. + If this value is nil, the pod's terminationGracePeriodSeconds will be used. Otherwise, this + value overrides the value provided by the pod spec. + Value must be non-negative integer. The value zero indicates stop immediately via + the kill signal (no opportunity to shut down). + This is a beta field and requires enabling ProbeTerminationGracePeriod feature gate. + Minimum value is 1. spec.terminationGracePeriodSeconds is used if unset. format: int64 type: integer timeoutSeconds: - description: 'Number of seconds after which - the probe times out. Defaults to 1 second. - Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + description: |- + Number of seconds after which the probe times out. + Defaults to 1 second. Minimum value is 1. 
+ More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes format: int32 type: integer type: object stdin: - description: Whether this container should allocate - a buffer for stdin in the container runtime. - If this is not set, reads from stdin in the - container will always result in EOF. Default - is false. + description: |- + Whether this container should allocate a buffer for stdin in the container runtime. If this + is not set, reads from stdin in the container will always result in EOF. + Default is false. type: boolean stdinOnce: - description: Whether the container runtime should - close the stdin channel after it has been opened - by a single attach. When stdin is true the stdin - stream will remain open across multiple attach - sessions. If stdinOnce is set to true, stdin - is opened on container start, is empty until - the first client attaches to stdin, and then - remains open and accepts data until the client - disconnects, at which time stdin is closed and - remains closed until the container is restarted. - If this flag is false, a container processes - that reads from stdin will never receive an - EOF. Default is false + description: |- + Whether the container runtime should close the stdin channel after it has been opened by + a single attach. When stdin is true the stdin stream will remain open across multiple attach + sessions. If stdinOnce is set to true, stdin is opened on container start, is empty until the + first client attaches to stdin, and then remains open and accepts data until the client disconnects, + at which time stdin is closed and remains closed until the container is restarted. If this + flag is false, a container processes that reads from stdin will never receive an EOF. + Default is false type: boolean targetContainerName: - description: "If set, the name of the container - from PodSpec that this ephemeral container targets. 
- The ephemeral container will be run in the namespaces - (IPC, PID, etc) of this container. If not set - then the ephemeral container uses the namespaces - configured in the Pod spec. \n The container - runtime must implement support for this feature. - If the runtime does not support namespace targeting - then the result of setting this field is undefined." + description: |- + If set, the name of the container from PodSpec that this ephemeral container targets. + The ephemeral container will be run in the namespaces (IPC, PID, etc) of this container. + If not set then the ephemeral container uses the namespaces configured in the Pod spec. + + + The container runtime must implement support for this feature. If the runtime does not + support namespace targeting then the result of setting this field is undefined. type: string terminationMessagePath: - description: 'Optional: Path at which the file - to which the container''s termination message - will be written is mounted into the container''s - filesystem. Message written is intended to be - brief final status, such as an assertion failure - message. Will be truncated by the node if greater - than 4096 bytes. The total message length across - all containers will be limited to 12kb. Defaults - to /dev/termination-log. Cannot be updated.' + description: |- + Optional: Path at which the file to which the container's termination message + will be written is mounted into the container's filesystem. + Message written is intended to be brief final status, such as an assertion failure message. + Will be truncated by the node if greater than 4096 bytes. The total message length across + all containers will be limited to 12kb. + Defaults to /dev/termination-log. + Cannot be updated. type: string terminationMessagePolicy: - description: Indicate how the termination message - should be populated. File will use the contents - of terminationMessagePath to populate the container - status message on both success and failure. 
- FallbackToLogsOnError will use the last chunk - of container log output if the termination message - file is empty and the container exited with - an error. The log output is limited to 2048 - bytes or 80 lines, whichever is smaller. Defaults - to File. Cannot be updated. + description: |- + Indicate how the termination message should be populated. File will use the contents of + terminationMessagePath to populate the container status message on both success and failure. + FallbackToLogsOnError will use the last chunk of container log output if the termination + message file is empty and the container exited with an error. + The log output is limited to 2048 bytes or 80 lines, whichever is smaller. + Defaults to File. + Cannot be updated. type: string tty: - description: Whether this container should allocate - a TTY for itself, also requires 'stdin' to be - true. Default is false. + description: |- + Whether this container should allocate a TTY for itself, also requires 'stdin' to be true. + Default is false. type: boolean volumeDevices: description: volumeDevices is the list of block @@ -5691,194 +5763,233 @@ spec: - name type: object type: array + x-kubernetes-list-map-keys: + - devicePath + x-kubernetes-list-type: map volumeMounts: - description: Pod volumes to mount into the container's - filesystem. Subpath mounts are not allowed for - ephemeral containers. Cannot be updated. + description: |- + Pod volumes to mount into the container's filesystem. Subpath mounts are not allowed for ephemeral containers. + Cannot be updated. items: description: VolumeMount describes a mounting of a Volume within a container. properties: mountPath: - description: Path within the container at - which the volume should be mounted. Must + description: |- + Path within the container at which the volume should be mounted. Must not contain ':'. 
type: string mountPropagation: - description: mountPropagation determines - how mounts are propagated from the host + description: |- + mountPropagation determines how mounts are propagated from the host to container and the other way around. - When not set, MountPropagationNone is - used. This field is beta in 1.10. + When not set, MountPropagationNone is used. + This field is beta in 1.10. + When RecursiveReadOnly is set to IfPossible or to Enabled, MountPropagation must be None or unspecified + (which defaults to None). type: string name: description: This must match the Name of a Volume. type: string readOnly: - description: Mounted read-only if true, - read-write otherwise (false or unspecified). + description: |- + Mounted read-only if true, read-write otherwise (false or unspecified). Defaults to false. type: boolean + recursiveReadOnly: + description: |- + RecursiveReadOnly specifies whether read-only mounts should be handled + recursively. + + + If ReadOnly is false, this field has no meaning and must be unspecified. + + + If ReadOnly is true, and this field is set to Disabled, the mount is not made + recursively read-only. If this field is set to IfPossible, the mount is made + recursively read-only, if it is supported by the container runtime. If this + field is set to Enabled, the mount is made recursively read-only if it is + supported by the container runtime, otherwise the pod will not be started and + an error will be generated to indicate the reason. + + + If this field is set to IfPossible or Enabled, MountPropagation must be set to + None (or be unspecified, which defaults to None). + + + If this field is not specified, it is treated as an equivalent of Disabled. + type: string subPath: - description: Path within the volume from - which the container's volume should be - mounted. Defaults to "" (volume's root). + description: |- + Path within the volume from which the container's volume should be mounted. + Defaults to "" (volume's root). 
type: string subPathExpr: - description: Expanded path within the volume - from which the container's volume should - be mounted. Behaves similarly to SubPath - but environment variable references $(VAR_NAME) - are expanded using the container's environment. - Defaults to "" (volume's root). SubPathExpr - and SubPath are mutually exclusive. + description: |- + Expanded path within the volume from which the container's volume should be mounted. + Behaves similarly to SubPath but environment variable references $(VAR_NAME) are expanded using the container's environment. + Defaults to "" (volume's root). + SubPathExpr and SubPath are mutually exclusive. type: string required: - mountPath - name type: object type: array + x-kubernetes-list-map-keys: + - mountPath + x-kubernetes-list-type: map workingDir: - description: Container's working directory. If - not specified, the container runtime's default - will be used, which might be configured in the - container image. Cannot be updated. + description: |- + Container's working directory. + If not specified, the container runtime's default will be used, which + might be configured in the container image. + Cannot be updated. type: string required: - name type: object type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map hostAliases: - description: HostAliases is an optional list of hosts - and IPs that will be injected into the pod's hosts - file if specified. This is only valid for non-hostNetwork - pods. + description: |- + HostAliases is an optional list of hosts and IPs that will be injected into the pod's hosts + file if specified. items: - description: HostAlias holds the mapping between IP - and hostnames that will be injected as an entry - in the pod's hosts file. + description: |- + HostAlias holds the mapping between IP and hostnames that will be injected as an entry in the + pod's hosts file. properties: hostnames: description: Hostnames for the above IP address. 
items: type: string type: array + x-kubernetes-list-type: atomic ip: description: IP address of the host file entry. type: string + required: + - ip type: object type: array + x-kubernetes-list-map-keys: + - ip + x-kubernetes-list-type: map hostIPC: - description: 'Use the host''s ipc namespace. Optional: - Default to false.' + description: |- + Use the host's ipc namespace. + Optional: Default to false. type: boolean hostNetwork: - description: Host networking requested for this pod. - Use the host's network namespace. If this option is - set, the ports that will be used must be specified. + description: |- + Host networking requested for this pod. Use the host's network namespace. + If this option is set, the ports that will be used must be specified. Default to false. type: boolean hostPID: - description: 'Use the host''s pid namespace. Optional: - Default to false.' + description: |- + Use the host's pid namespace. + Optional: Default to false. type: boolean hostUsers: - description: 'Use the host''s user namespace. Optional: - Default to true. If set to true or not present, the - pod will be run in the host user namespace, useful - for when the pod needs a feature only available to - the host user namespace, such as loading a kernel - module with CAP_SYS_MODULE. When set to false, a new - userns is created for the pod. Setting false is useful - for mitigating container breakout vulnerabilities - even allowing users to run their containers as root - without actually having root privileges on the host. - This field is alpha-level and is only honored by servers - that enable the UserNamespacesSupport feature.' + description: |- + Use the host's user namespace. + Optional: Default to true. + If set to true or not present, the pod will be run in the host user namespace, useful + for when the pod needs a feature only available to the host user namespace, such as + loading a kernel module with CAP_SYS_MODULE. + When set to false, a new userns is created for the pod. 
Setting false is useful for + mitigating container breakout vulnerabilities even allowing users to run their + containers as root without actually having root privileges on the host. + This field is alpha-level and is only honored by servers that enable the UserNamespacesSupport feature. type: boolean hostname: - description: Specifies the hostname of the Pod If not - specified, the pod's hostname will be set to a system-defined - value. + description: |- + Specifies the hostname of the Pod + If not specified, the pod's hostname will be set to a system-defined value. type: string imagePullSecrets: - description: 'ImagePullSecrets is an optional list of - references to secrets in the same namespace to use - for pulling any of the images used by this PodSpec. - If specified, these secrets will be passed to individual - puller implementations for them to use. More info: - https://kubernetes.io/docs/concepts/containers/images#specifying-imagepullsecrets-on-a-pod' + description: |- + ImagePullSecrets is an optional list of references to secrets in the same namespace to use for pulling any of the images used by this PodSpec. + If specified, these secrets will be passed to individual puller implementations for them to use. + More info: https://kubernetes.io/docs/concepts/containers/images#specifying-imagepullsecrets-on-a-pod items: - description: LocalObjectReference contains enough - information to let you locate the referenced object - inside the same namespace. + description: |- + LocalObjectReference contains enough information to let you locate the + referenced object inside the same namespace. properties: name: - description: 'Name of the referent. More info: - https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Add other useful fields. apiVersion, kind, - uid?' + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. 
Instances of this type with an empty value here are + almost certainly wrong. + TODO: Add other useful fields. apiVersion, kind, uid? + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Drop `kubebuilder:default` when controller-gen doesn't need it https://github.com/kubernetes-sigs/kubebuilder/issues/3896. type: string type: object x-kubernetes-map-type: atomic type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map initContainers: - description: 'List of initialization containers belonging - to the pod. Init containers are executed in order - prior to containers being started. If any init container - fails, the pod is considered to have failed and is - handled according to its restartPolicy. The name for - an init container or normal container must be unique - among all containers. Init containers may not have - Lifecycle actions, Readiness probes, Liveness probes, - or Startup probes. The resourceRequirements of an - init container are taken into account during scheduling - by finding the highest request/limit for each resource - type, and then using the max of of that value or the - sum of the normal containers. Limits are applied to - init containers in a similar fashion. Init containers - cannot currently be added or removed. Cannot be updated. - More info: https://kubernetes.io/docs/concepts/workloads/pods/init-containers/' + description: |- + List of initialization containers belonging to the pod. + Init containers are executed in order prior to containers being started. If any + init container fails, the pod is considered to have failed and is handled according + to its restartPolicy. The name for an init container or normal container must be + unique among all containers. + Init containers may not have Lifecycle actions, Readiness probes, Liveness probes, or Startup probes. 
+ The resourceRequirements of an init container are taken into account during scheduling + by finding the highest request/limit for each resource type, and then using the max of + of that value or the sum of the normal containers. Limits are applied to init containers + in a similar fashion. + Init containers cannot currently be added or removed. + Cannot be updated. + More info: https://kubernetes.io/docs/concepts/workloads/pods/init-containers/ items: description: A single application container that you want to run within a pod. properties: args: - description: 'Arguments to the entrypoint. The - container image''s CMD is used if this is not - provided. Variable references $(VAR_NAME) are - expanded using the container''s environment. - If a variable cannot be resolved, the reference - in the input string will be unchanged. Double - $$ are reduced to a single $, which allows for - escaping the $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" - will produce the string literal "$(VAR_NAME)". - Escaped references will never be expanded, regardless - of whether the variable exists or not. Cannot - be updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell' + description: |- + Arguments to the entrypoint. + The container image's CMD is used if this is not provided. + Variable references $(VAR_NAME) are expanded using the container's environment. If a variable + cannot be resolved, the reference in the input string will be unchanged. Double $$ are reduced + to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" will + produce the string literal "$(VAR_NAME)". Escaped references will never be expanded, regardless + of whether the variable exists or not. Cannot be updated. 
+ More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell items: type: string type: array + x-kubernetes-list-type: atomic command: - description: 'Entrypoint array. Not executed within - a shell. The container image''s ENTRYPOINT is - used if this is not provided. Variable references - $(VAR_NAME) are expanded using the container''s - environment. If a variable cannot be resolved, - the reference in the input string will be unchanged. - Double $$ are reduced to a single $, which allows - for escaping the $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" - will produce the string literal "$(VAR_NAME)". - Escaped references will never be expanded, regardless - of whether the variable exists or not. Cannot - be updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell' + description: |- + Entrypoint array. Not executed within a shell. + The container image's ENTRYPOINT is used if this is not provided. + Variable references $(VAR_NAME) are expanded using the container's environment. If a variable + cannot be resolved, the reference in the input string will be unchanged. Double $$ are reduced + to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" will + produce the string literal "$(VAR_NAME)". Escaped references will never be expanded, regardless + of whether the variable exists or not. Cannot be updated. + More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell items: type: string type: array + x-kubernetes-list-type: atomic env: - description: List of environment variables to - set in the container. Cannot be updated. + description: |- + List of environment variables to set in the container. + Cannot be updated. items: description: EnvVar represents an environment variable present in a Container. 
@@ -5888,19 +5999,16 @@ spec: Must be a C_IDENTIFIER. type: string value: - description: 'Variable references $(VAR_NAME) - are expanded using the previously defined - environment variables in the container - and any service environment variables. - If a variable cannot be resolved, the - reference in the input string will be - unchanged. Double $$ are reduced to a - single $, which allows for escaping the - $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" - will produce the string literal "$(VAR_NAME)". - Escaped references will never be expanded, - regardless of whether the variable exists - or not. Defaults to "".' + description: |- + Variable references $(VAR_NAME) are expanded + using the previously defined environment variables in the container and + any service environment variables. If a variable cannot be resolved, + the reference in the input string will be unchanged. Double $$ are reduced + to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. + "$$(VAR_NAME)" will produce the string literal "$(VAR_NAME)". + Escaped references will never be expanded, regardless of whether the variable + exists or not. + Defaults to "". type: string valueFrom: description: Source for the environment @@ -5914,10 +6022,15 @@ spec: description: The key to select. type: string name: - description: 'Name of the referent. + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + TODO: Add other useful fields. apiVersion, kind, uid? More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Add other useful fields. - apiVersion, kind, uid?' + TODO: Drop `kubebuilder:default` when controller-gen doesn't need it https://github.com/kubernetes-sigs/kubebuilder/issues/3896. 
type: string optional: description: Specify whether the @@ -5928,11 +6041,9 @@ spec: type: object x-kubernetes-map-type: atomic fieldRef: - description: 'Selects a field of the - pod: supports metadata.name, metadata.namespace, - `metadata.labels['''']`, `metadata.annotations['''']`, - spec.nodeName, spec.serviceAccountName, - status.hostIP, status.podIP, status.podIPs.' + description: |- + Selects a field of the pod: supports metadata.name, metadata.namespace, `metadata.labels['']`, `metadata.annotations['']`, + spec.nodeName, spec.serviceAccountName, status.hostIP, status.podIP, status.podIPs. properties: apiVersion: description: Version of the schema @@ -5948,12 +6059,9 @@ spec: type: object x-kubernetes-map-type: atomic resourceFieldRef: - description: 'Selects a resource of - the container: only resources limits - and requests (limits.cpu, limits.memory, - limits.ephemeral-storage, requests.cpu, - requests.memory and requests.ephemeral-storage) - are currently supported.' + description: |- + Selects a resource of the container: only resources limits and requests + (limits.cpu, limits.memory, limits.ephemeral-storage, requests.cpu, requests.memory and requests.ephemeral-storage) are currently supported. properties: containerName: description: 'Container name: required @@ -5987,10 +6095,15 @@ spec: secret key. type: string name: - description: 'Name of the referent. + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + TODO: Add other useful fields. apiVersion, kind, uid? More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Add other useful fields. - apiVersion, kind, uid?' + TODO: Drop `kubebuilder:default` when controller-gen doesn't need it https://github.com/kubernetes-sigs/kubebuilder/issues/3896. 
type: string optional: description: Specify whether the @@ -6005,16 +6118,17 @@ spec: - name type: object type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map envFrom: - description: List of sources to populate environment - variables in the container. The keys defined - within a source must be a C_IDENTIFIER. All - invalid keys will be reported as an event when - the container is starting. When a key exists - in multiple sources, the value associated with - the last source will take precedence. Values - defined by an Env with a duplicate key will - take precedence. Cannot be updated. + description: |- + List of sources to populate environment variables in the container. + The keys defined within a source must be a C_IDENTIFIER. All invalid keys + will be reported as an event when the container is starting. When a key exists in multiple + sources, the value associated with the last source will take precedence. + Values defined by an Env with a duplicate key will take precedence. + Cannot be updated. items: description: EnvFromSource represents the source of a set of ConfigMaps @@ -6023,10 +6137,15 @@ spec: description: The ConfigMap to select from properties: name: - description: 'Name of the referent. + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + TODO: Add other useful fields. apiVersion, kind, uid? More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Add other useful fields. apiVersion, - kind, uid?' + TODO: Drop `kubebuilder:default` when controller-gen doesn't need it https://github.com/kubernetes-sigs/kubebuilder/issues/3896. 
type: string optional: description: Specify whether the ConfigMap @@ -6043,10 +6162,15 @@ spec: description: The Secret to select from properties: name: - description: 'Name of the referent. + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + TODO: Add other useful fields. apiVersion, kind, uid? More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Add other useful fields. apiVersion, - kind, uid?' + TODO: Drop `kubebuilder:default` when controller-gen doesn't need it https://github.com/kubernetes-sigs/kubebuilder/issues/3896. type: string optional: description: Specify whether the Secret @@ -6056,62 +6180,58 @@ spec: x-kubernetes-map-type: atomic type: object type: array + x-kubernetes-list-type: atomic image: - description: 'Container image name. More info: - https://kubernetes.io/docs/concepts/containers/images - This field is optional to allow higher level - config management to default or override container - images in workload controllers like Deployments - and StatefulSets.' + description: |- + Container image name. + More info: https://kubernetes.io/docs/concepts/containers/images + This field is optional to allow higher level config management to default or override + container images in workload controllers like Deployments and StatefulSets. type: string imagePullPolicy: - description: 'Image pull policy. One of Always, - Never, IfNotPresent. Defaults to Always if :latest - tag is specified, or IfNotPresent otherwise. - Cannot be updated. More info: https://kubernetes.io/docs/concepts/containers/images#updating-images' + description: |- + Image pull policy. + One of Always, Never, IfNotPresent. + Defaults to Always if :latest tag is specified, or IfNotPresent otherwise. + Cannot be updated. 
+ More info: https://kubernetes.io/docs/concepts/containers/images#updating-images type: string lifecycle: - description: Actions that the management system - should take in response to container lifecycle - events. Cannot be updated. + description: |- + Actions that the management system should take in response to container lifecycle events. + Cannot be updated. properties: postStart: - description: 'PostStart is called immediately - after a container is created. If the handler - fails, the container is terminated and restarted - according to its restart policy. Other management - of the container blocks until the hook completes. - More info: https://kubernetes.io/docs/concepts/containers/container-lifecycle-hooks/#container-hooks' + description: |- + PostStart is called immediately after a container is created. If the handler fails, + the container is terminated and restarted according to its restart policy. + Other management of the container blocks until the hook completes. + More info: https://kubernetes.io/docs/concepts/containers/container-lifecycle-hooks/#container-hooks properties: exec: description: Exec specifies the action to take. properties: command: - description: Command is the command - line to execute inside the container, - the working directory for the command is - root ('/') in the container's filesystem. - The command is simply exec'd, it - is not run inside a shell, so traditional - shell instructions ('|', etc) won't - work. To use a shell, you need to - explicitly call out to that shell. - Exit status of 0 is treated as live/healthy - and non-zero is unhealthy. + description: |- + Command is the command line to execute inside the container, the working directory for the + command is root ('/') in the container's filesystem. The command is simply exec'd, it is + not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use + a shell, you need to explicitly call out to that shell. 
+ Exit status of 0 is treated as live/healthy and non-zero is unhealthy. items: type: string type: array + x-kubernetes-list-type: atomic type: object httpGet: description: HTTPGet specifies the http request to perform. properties: host: - description: Host name to connect - to, defaults to the pod IP. You - probably want to set "Host" in httpHeaders - instead. + description: |- + Host name to connect to, defaults to the pod IP. You probably want to set + "Host" in httpHeaders instead. type: string httpHeaders: description: Custom headers to set @@ -6123,11 +6243,9 @@ spec: HTTP probes properties: name: - description: The header field - name. This will be canonicalized - upon output, so case-variant - names will be understood as - the same header. + description: |- + The header field name. + This will be canonicalized upon output, so case-variant names will be understood as the same header. type: string value: description: The header field @@ -6138,6 +6256,7 @@ spec: - value type: object type: array + x-kubernetes-list-type: atomic path: description: Path to access on the HTTP server. @@ -6146,14 +6265,15 @@ spec: anyOf: - type: integer - type: string - description: Name or number of the - port to access on the container. - Number must be in the range 1 to - 65535. Name must be an IANA_SVC_NAME. + description: |- + Name or number of the port to access on the container. + Number must be in the range 1 to 65535. + Name must be an IANA_SVC_NAME. x-kubernetes-int-or-string: true scheme: - description: Scheme to use for connecting - to the host. Defaults to HTTP. + description: |- + Scheme to use for connecting to the host. + Defaults to HTTP. type: string required: - port @@ -6172,12 +6292,10 @@ spec: - seconds type: object tcpSocket: - description: Deprecated. TCPSocket is - NOT supported as a LifecycleHandler - and kept for the backward compatibility. - There are no validation of this field - and lifecycle hooks will fail in runtime - when tcp handler is specified. 
+ description: |- + Deprecated. TCPSocket is NOT supported as a LifecycleHandler and kept + for the backward compatibility. There are no validation of this field and + lifecycle hooks will fail in runtime when tcp handler is specified. properties: host: description: 'Optional: Host name @@ -6188,61 +6306,51 @@ spec: anyOf: - type: integer - type: string - description: Number or name of the - port to access on the container. - Number must be in the range 1 to - 65535. Name must be an IANA_SVC_NAME. + description: |- + Number or name of the port to access on the container. + Number must be in the range 1 to 65535. + Name must be an IANA_SVC_NAME. x-kubernetes-int-or-string: true required: - port type: object type: object preStop: - description: 'PreStop is called immediately - before a container is terminated due to - an API request or management event such - as liveness/startup probe failure, preemption, - resource contention, etc. The handler is - not called if the container crashes or exits. - The Pod''s termination grace period countdown - begins before the PreStop hook is executed. - Regardless of the outcome of the handler, - the container will eventually terminate - within the Pod''s termination grace period - (unless delayed by finalizers). Other management - of the container blocks until the hook completes - or until the termination grace period is - reached. More info: https://kubernetes.io/docs/concepts/containers/container-lifecycle-hooks/#container-hooks' + description: |- + PreStop is called immediately before a container is terminated due to an + API request or management event such as liveness/startup probe failure, + preemption, resource contention, etc. The handler is not called if the + container crashes or exits. The Pod's termination grace period countdown begins before the + PreStop hook is executed. 
Regardless of the outcome of the handler, the + container will eventually terminate within the Pod's termination grace + period (unless delayed by finalizers). Other management of the container blocks until the hook completes + or until the termination grace period is reached. + More info: https://kubernetes.io/docs/concepts/containers/container-lifecycle-hooks/#container-hooks properties: exec: description: Exec specifies the action to take. properties: command: - description: Command is the command - line to execute inside the container, - the working directory for the command is - root ('/') in the container's filesystem. - The command is simply exec'd, it - is not run inside a shell, so traditional - shell instructions ('|', etc) won't - work. To use a shell, you need to - explicitly call out to that shell. - Exit status of 0 is treated as live/healthy - and non-zero is unhealthy. + description: |- + Command is the command line to execute inside the container, the working directory for the + command is root ('/') in the container's filesystem. The command is simply exec'd, it is + not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use + a shell, you need to explicitly call out to that shell. + Exit status of 0 is treated as live/healthy and non-zero is unhealthy. items: type: string type: array + x-kubernetes-list-type: atomic type: object httpGet: description: HTTPGet specifies the http request to perform. properties: host: - description: Host name to connect - to, defaults to the pod IP. You - probably want to set "Host" in httpHeaders - instead. + description: |- + Host name to connect to, defaults to the pod IP. You probably want to set + "Host" in httpHeaders instead. type: string httpHeaders: description: Custom headers to set @@ -6254,11 +6362,9 @@ spec: HTTP probes properties: name: - description: The header field - name. 
This will be canonicalized - upon output, so case-variant - names will be understood as - the same header. + description: |- + The header field name. + This will be canonicalized upon output, so case-variant names will be understood as the same header. type: string value: description: The header field @@ -6269,6 +6375,7 @@ spec: - value type: object type: array + x-kubernetes-list-type: atomic path: description: Path to access on the HTTP server. @@ -6277,14 +6384,15 @@ spec: anyOf: - type: integer - type: string - description: Name or number of the - port to access on the container. - Number must be in the range 1 to - 65535. Name must be an IANA_SVC_NAME. + description: |- + Name or number of the port to access on the container. + Number must be in the range 1 to 65535. + Name must be an IANA_SVC_NAME. x-kubernetes-int-or-string: true scheme: - description: Scheme to use for connecting - to the host. Defaults to HTTP. + description: |- + Scheme to use for connecting to the host. + Defaults to HTTP. type: string required: - port @@ -6303,12 +6411,10 @@ spec: - seconds type: object tcpSocket: - description: Deprecated. TCPSocket is - NOT supported as a LifecycleHandler - and kept for the backward compatibility. - There are no validation of this field - and lifecycle hooks will fail in runtime - when tcp handler is specified. + description: |- + Deprecated. TCPSocket is NOT supported as a LifecycleHandler and kept + for the backward compatibility. There are no validation of this field and + lifecycle hooks will fail in runtime when tcp handler is specified. properties: host: description: 'Optional: Host name @@ -6319,10 +6425,10 @@ spec: anyOf: - type: integer - type: string - description: Number or name of the - port to access on the container. - Number must be in the range 1 to - 65535. Name must be an IANA_SVC_NAME. + description: |- + Number or name of the port to access on the container. + Number must be in the range 1 to 65535. + Name must be an IANA_SVC_NAME. 
x-kubernetes-int-or-string: true required: - port @@ -6330,35 +6436,32 @@ spec: type: object type: object livenessProbe: - description: 'Periodic probe of container liveness. + description: |- + Periodic probe of container liveness. Container will be restarted if the probe fails. - Cannot be updated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + Cannot be updated. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes properties: exec: description: Exec specifies the action to take. properties: command: - description: Command is the command line - to execute inside the container, the - working directory for the command is - root ('/') in the container's filesystem. - The command is simply exec'd, it is - not run inside a shell, so traditional - shell instructions ('|', etc) won't - work. To use a shell, you need to explicitly - call out to that shell. Exit status - of 0 is treated as live/healthy and - non-zero is unhealthy. + description: |- + Command is the command line to execute inside the container, the working directory for the + command is root ('/') in the container's filesystem. The command is simply exec'd, it is + not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use + a shell, you need to explicitly call out to that shell. + Exit status of 0 is treated as live/healthy and non-zero is unhealthy. items: type: string type: array + x-kubernetes-list-type: atomic type: object failureThreshold: - description: Minimum consecutive failures - for the probe to be considered failed after - having succeeded. Defaults to 3. Minimum - value is 1. + description: |- + Minimum consecutive failures for the probe to be considered failed after having succeeded. + Defaults to 3. Minimum value is 1. 
format: int32 type: integer grpc: @@ -6371,11 +6474,12 @@ spec: format: int32 type: integer service: - description: "Service is the name of the - service to place in the gRPC HealthCheckRequest + description: |- + Service is the name of the service to place in the gRPC HealthCheckRequest (see https://github.com/grpc/grpc/blob/master/doc/health-checking.md). - \n If this is not specified, the default - behavior is defined by gRPC." + + + If this is not specified, the default behavior is defined by gRPC. type: string required: - port @@ -6385,9 +6489,9 @@ spec: to perform. properties: host: - description: Host name to connect to, - defaults to the pod IP. You probably - want to set "Host" in httpHeaders instead. + description: |- + Host name to connect to, defaults to the pod IP. You probably want to set + "Host" in httpHeaders instead. type: string httpHeaders: description: Custom headers to set in @@ -6397,11 +6501,9 @@ spec: custom header to be used in HTTP probes properties: name: - description: The header field name. - This will be canonicalized upon - output, so case-variant names - will be understood as the same - header. + description: |- + The header field name. + This will be canonicalized upon output, so case-variant names will be understood as the same header. type: string value: description: The header field value @@ -6411,6 +6513,7 @@ spec: - value type: object type: array + x-kubernetes-list-type: atomic path: description: Path to access on the HTTP server. @@ -6419,36 +6522,35 @@ spec: anyOf: - type: integer - type: string - description: Name or number of the port - to access on the container. Number must - be in the range 1 to 65535. Name must - be an IANA_SVC_NAME. + description: |- + Name or number of the port to access on the container. + Number must be in the range 1 to 65535. + Name must be an IANA_SVC_NAME. x-kubernetes-int-or-string: true scheme: - description: Scheme to use for connecting - to the host. Defaults to HTTP. 
+ description: |- + Scheme to use for connecting to the host. + Defaults to HTTP. type: string required: - port type: object initialDelaySeconds: - description: 'Number of seconds after the - container has started before liveness probes - are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + description: |- + Number of seconds after the container has started before liveness probes are initiated. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes format: int32 type: integer periodSeconds: - description: How often (in seconds) to perform - the probe. Default to 10 seconds. Minimum - value is 1. + description: |- + How often (in seconds) to perform the probe. + Default to 10 seconds. Minimum value is 1. format: int32 type: integer successThreshold: - description: Minimum consecutive successes - for the probe to be considered successful - after having failed. Defaults to 1. Must - be 1 for liveness and startup. Minimum value - is 1. + description: |- + Minimum consecutive successes for the probe to be considered successful after having failed. + Defaults to 1. Must be 1 for liveness and startup. Minimum value is 1. format: int32 type: integer tcpSocket: @@ -6463,56 +6565,49 @@ spec: anyOf: - type: integer - type: string - description: Number or name of the port - to access on the container. Number must - be in the range 1 to 65535. Name must - be an IANA_SVC_NAME. + description: |- + Number or name of the port to access on the container. + Number must be in the range 1 to 65535. + Name must be an IANA_SVC_NAME. x-kubernetes-int-or-string: true required: - port type: object terminationGracePeriodSeconds: - description: Optional duration in seconds - the pod needs to terminate gracefully upon - probe failure. 
The grace period is the duration - in seconds after the processes running in - the pod are sent a termination signal and - the time when the processes are forcibly - halted with a kill signal. Set this value - longer than the expected cleanup time for - your process. If this value is nil, the - pod's terminationGracePeriodSeconds will - be used. Otherwise, this value overrides - the value provided by the pod spec. Value - must be non-negative integer. The value - zero indicates stop immediately via the - kill signal (no opportunity to shut down). - This is a beta field and requires enabling - ProbeTerminationGracePeriod feature gate. - Minimum value is 1. spec.terminationGracePeriodSeconds - is used if unset. + description: |- + Optional duration in seconds the pod needs to terminate gracefully upon probe failure. + The grace period is the duration in seconds after the processes running in the pod are sent + a termination signal and the time when the processes are forcibly halted with a kill signal. + Set this value longer than the expected cleanup time for your process. + If this value is nil, the pod's terminationGracePeriodSeconds will be used. Otherwise, this + value overrides the value provided by the pod spec. + Value must be non-negative integer. The value zero indicates stop immediately via + the kill signal (no opportunity to shut down). + This is a beta field and requires enabling ProbeTerminationGracePeriod feature gate. + Minimum value is 1. spec.terminationGracePeriodSeconds is used if unset. format: int64 type: integer timeoutSeconds: - description: 'Number of seconds after which - the probe times out. Defaults to 1 second. - Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + description: |- + Number of seconds after which the probe times out. + Defaults to 1 second. Minimum value is 1. 
+ More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes format: int32 type: integer type: object name: - description: Name of the container specified as - a DNS_LABEL. Each container in a pod must have - a unique name (DNS_LABEL). Cannot be updated. + description: |- + Name of the container specified as a DNS_LABEL. + Each container in a pod must have a unique name (DNS_LABEL). + Cannot be updated. type: string ports: - description: List of ports to expose from the - container. Not specifying a port here DOES NOT - prevent that port from being exposed. Any port - which is listening on the default "0.0.0.0" - address inside a container will be accessible - from the network. Modifying this array with - strategic merge patch may corrupt the data. + description: |- + List of ports to expose from the container. Not specifying a port here + DOES NOT prevent that port from being exposed. Any port which is + listening on the default "0.0.0.0" address inside a container will be + accessible from the network. + Modifying this array with strategic merge patch may corrupt the data. For more information See https://github.com/kubernetes/kubernetes/issues/108255. Cannot be updated. items: @@ -6520,9 +6615,9 @@ spec: port in a single container. properties: containerPort: - description: Number of port to expose on - the pod's IP address. This must be a valid - port number, 0 < x < 65536. + description: |- + Number of port to expose on the pod's IP address. + This must be a valid port number, 0 < x < 65536. format: int32 type: integer hostIP: @@ -6530,24 +6625,24 @@ spec: port to. type: string hostPort: - description: Number of port to expose on - the host. If specified, this must be a - valid port number, 0 < x < 65536. If HostNetwork - is specified, this must match ContainerPort. + description: |- + Number of port to expose on the host. + If specified, this must be a valid port number, 0 < x < 65536. 
+ If HostNetwork is specified, this must match ContainerPort. Most containers do not need this. format: int32 type: integer name: - description: If specified, this must be - an IANA_SVC_NAME and unique within the - pod. Each named port in a pod must have - a unique name. Name for the port that - can be referred to by services. + description: |- + If specified, this must be an IANA_SVC_NAME and unique within the pod. Each + named port in a pod must have a unique name. Name for the port that can be + referred to by services. type: string protocol: default: TCP - description: Protocol for port. Must be - UDP, TCP, or SCTP. Defaults to "TCP". + description: |- + Protocol for port. Must be UDP, TCP, or SCTP. + Defaults to "TCP". type: string required: - containerPort @@ -6558,36 +6653,32 @@ spec: - protocol x-kubernetes-list-type: map readinessProbe: - description: 'Periodic probe of container service - readiness. Container will be removed from service - endpoints if the probe fails. Cannot be updated. - More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + description: |- + Periodic probe of container service readiness. + Container will be removed from service endpoints if the probe fails. + Cannot be updated. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes properties: exec: description: Exec specifies the action to take. properties: command: - description: Command is the command line - to execute inside the container, the - working directory for the command is - root ('/') in the container's filesystem. - The command is simply exec'd, it is - not run inside a shell, so traditional - shell instructions ('|', etc) won't - work. To use a shell, you need to explicitly - call out to that shell. Exit status - of 0 is treated as live/healthy and - non-zero is unhealthy. 
+ description: |- + Command is the command line to execute inside the container, the working directory for the + command is root ('/') in the container's filesystem. The command is simply exec'd, it is + not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use + a shell, you need to explicitly call out to that shell. + Exit status of 0 is treated as live/healthy and non-zero is unhealthy. items: type: string type: array + x-kubernetes-list-type: atomic type: object failureThreshold: - description: Minimum consecutive failures - for the probe to be considered failed after - having succeeded. Defaults to 3. Minimum - value is 1. + description: |- + Minimum consecutive failures for the probe to be considered failed after having succeeded. + Defaults to 3. Minimum value is 1. format: int32 type: integer grpc: @@ -6600,11 +6691,12 @@ spec: format: int32 type: integer service: - description: "Service is the name of the - service to place in the gRPC HealthCheckRequest + description: |- + Service is the name of the service to place in the gRPC HealthCheckRequest (see https://github.com/grpc/grpc/blob/master/doc/health-checking.md). - \n If this is not specified, the default - behavior is defined by gRPC." + + + If this is not specified, the default behavior is defined by gRPC. type: string required: - port @@ -6614,9 +6706,9 @@ spec: to perform. properties: host: - description: Host name to connect to, - defaults to the pod IP. You probably - want to set "Host" in httpHeaders instead. + description: |- + Host name to connect to, defaults to the pod IP. You probably want to set + "Host" in httpHeaders instead. type: string httpHeaders: description: Custom headers to set in @@ -6626,11 +6718,9 @@ spec: custom header to be used in HTTP probes properties: name: - description: The header field name. - This will be canonicalized upon - output, so case-variant names - will be understood as the same - header. 
+ description: |- + The header field name. + This will be canonicalized upon output, so case-variant names will be understood as the same header. type: string value: description: The header field value @@ -6640,6 +6730,7 @@ spec: - value type: object type: array + x-kubernetes-list-type: atomic path: description: Path to access on the HTTP server. @@ -6648,36 +6739,35 @@ spec: anyOf: - type: integer - type: string - description: Name or number of the port - to access on the container. Number must - be in the range 1 to 65535. Name must - be an IANA_SVC_NAME. + description: |- + Name or number of the port to access on the container. + Number must be in the range 1 to 65535. + Name must be an IANA_SVC_NAME. x-kubernetes-int-or-string: true scheme: - description: Scheme to use for connecting - to the host. Defaults to HTTP. + description: |- + Scheme to use for connecting to the host. + Defaults to HTTP. type: string required: - port type: object initialDelaySeconds: - description: 'Number of seconds after the - container has started before liveness probes - are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + description: |- + Number of seconds after the container has started before liveness probes are initiated. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes format: int32 type: integer periodSeconds: - description: How often (in seconds) to perform - the probe. Default to 10 seconds. Minimum - value is 1. + description: |- + How often (in seconds) to perform the probe. + Default to 10 seconds. Minimum value is 1. format: int32 type: integer successThreshold: - description: Minimum consecutive successes - for the probe to be considered successful - after having failed. Defaults to 1. Must - be 1 for liveness and startup. Minimum value - is 1. + description: |- + Minimum consecutive successes for the probe to be considered successful after having failed. 
+ Defaults to 1. Must be 1 for liveness and startup. Minimum value is 1. format: int32 type: integer tcpSocket: @@ -6692,40 +6782,33 @@ spec: anyOf: - type: integer - type: string - description: Number or name of the port - to access on the container. Number must - be in the range 1 to 65535. Name must - be an IANA_SVC_NAME. + description: |- + Number or name of the port to access on the container. + Number must be in the range 1 to 65535. + Name must be an IANA_SVC_NAME. x-kubernetes-int-or-string: true required: - port type: object terminationGracePeriodSeconds: - description: Optional duration in seconds - the pod needs to terminate gracefully upon - probe failure. The grace period is the duration - in seconds after the processes running in - the pod are sent a termination signal and - the time when the processes are forcibly - halted with a kill signal. Set this value - longer than the expected cleanup time for - your process. If this value is nil, the - pod's terminationGracePeriodSeconds will - be used. Otherwise, this value overrides - the value provided by the pod spec. Value - must be non-negative integer. The value - zero indicates stop immediately via the - kill signal (no opportunity to shut down). - This is a beta field and requires enabling - ProbeTerminationGracePeriod feature gate. - Minimum value is 1. spec.terminationGracePeriodSeconds - is used if unset. + description: |- + Optional duration in seconds the pod needs to terminate gracefully upon probe failure. + The grace period is the duration in seconds after the processes running in the pod are sent + a termination signal and the time when the processes are forcibly halted with a kill signal. + Set this value longer than the expected cleanup time for your process. + If this value is nil, the pod's terminationGracePeriodSeconds will be used. Otherwise, this + value overrides the value provided by the pod spec. + Value must be non-negative integer. 
The value zero indicates stop immediately via + the kill signal (no opportunity to shut down). + This is a beta field and requires enabling ProbeTerminationGracePeriod feature gate. + Minimum value is 1. spec.terminationGracePeriodSeconds is used if unset. format: int64 type: integer timeoutSeconds: - description: 'Number of seconds after which - the probe times out. Defaults to 1 second. - Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + description: |- + Number of seconds after which the probe times out. + Defaults to 1 second. Minimum value is 1. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes format: int32 type: integer type: object @@ -6736,14 +6819,14 @@ spec: resource resize policy for the container. properties: resourceName: - description: 'Name of the resource to which - this resource resize policy applies. Supported - values: cpu, memory.' + description: |- + Name of the resource to which this resource resize policy applies. + Supported values: cpu, memory. type: string restartPolicy: - description: Restart policy to apply when - specified resource is resized. If not - specified, it defaults to NotRequired. + description: |- + Restart policy to apply when specified resource is resized. + If not specified, it defaults to NotRequired. type: string required: - resourceName @@ -6752,26 +6835,31 @@ spec: type: array x-kubernetes-list-type: atomic resources: - description: 'Compute Resources required by this - container. Cannot be updated. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/' + description: |- + Compute Resources required by this container. + Cannot be updated. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ properties: claims: - description: "Claims lists the names of resources, - defined in spec.resourceClaims, that are - used by this container. 
\n This is an alpha - field and requires enabling the DynamicResourceAllocation - feature gate. \n This field is immutable. - It can only be set for containers." + description: |- + Claims lists the names of resources, defined in spec.resourceClaims, + that are used by this container. + + + This is an alpha field and requires enabling the + DynamicResourceAllocation feature gate. + + + This field is immutable. It can only be set for containers. items: description: ResourceClaim references one entry in PodSpec.ResourceClaims. properties: name: - description: Name must match the name - of one entry in pod.spec.resourceClaims - of the Pod where this field is used. - It makes that resource available inside - a container. + description: |- + Name must match the name of one entry in pod.spec.resourceClaims of + the Pod where this field is used. It makes that resource available + inside a container. type: string required: - name @@ -6787,9 +6875,9 @@ spec: - type: string pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true - description: 'Limits describes the maximum - amount of compute resources allowed. More - info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/' + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ type: object requests: additionalProperties: @@ -6798,64 +6886,76 @@ spec: - type: string pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true - description: 'Requests describes the minimum - amount of compute resources required. If - Requests is omitted for a container, it - defaults to Limits if that is explicitly - specified, otherwise to an implementation-defined - value. 
Requests cannot exceed Limits. More - info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/' + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ type: object type: object restartPolicy: - description: 'RestartPolicy defines the restart - behavior of individual containers in a pod. - This field may only be set for init containers, - and the only allowed value is "Always". For - non-init containers or when this field is not - specified, the restart behavior is defined by - the Pod''s restart policy and the container - type. Setting the RestartPolicy as "Always" - for the init container will have the following - effect: this init container will be continually - restarted on exit until all regular containers - have terminated. Once all regular containers - have completed, all init containers with restartPolicy - "Always" will be shut down. This lifecycle differs - from normal init containers and is often referred - to as a "sidecar" container. Although this init - container still starts in the init container - sequence, it does not wait for the container - to complete before proceeding to the next init - container. Instead, the next init container - starts immediately after this init container - is started, or after any startupProbe has successfully - completed.' + description: |- + RestartPolicy defines the restart behavior of individual containers in a pod. + This field may only be set for init containers, and the only allowed value is "Always". + For non-init containers or when this field is not specified, + the restart behavior is defined by the Pod's restart policy and the container type. 
+ Setting the RestartPolicy as "Always" for the init container will have the following effect: + this init container will be continually restarted on + exit until all regular containers have terminated. Once all regular + containers have completed, all init containers with restartPolicy "Always" + will be shut down. This lifecycle differs from normal init containers and + is often referred to as a "sidecar" container. Although this init + container still starts in the init container sequence, it does not wait + for the container to complete before proceeding to the next init + container. Instead, the next init container starts immediately after this + init container is started, or after any startupProbe has successfully + completed. type: string securityContext: - description: 'SecurityContext defines the security - options the container should be run with. If - set, the fields of SecurityContext override - the equivalent fields of PodSecurityContext. - More info: https://kubernetes.io/docs/tasks/configure-pod-container/security-context/' + description: |- + SecurityContext defines the security options the container should be run with. + If set, the fields of SecurityContext override the equivalent fields of PodSecurityContext. + More info: https://kubernetes.io/docs/tasks/configure-pod-container/security-context/ properties: allowPrivilegeEscalation: - description: 'AllowPrivilegeEscalation controls - whether a process can gain more privileges - than its parent process. This bool directly - controls if the no_new_privs flag will be - set on the container process. AllowPrivilegeEscalation - is true always when the container is: 1) - run as Privileged 2) has CAP_SYS_ADMIN Note - that this field cannot be set when spec.os.name - is windows.' + description: |- + AllowPrivilegeEscalation controls whether a process can gain more + privileges than its parent process. This bool directly controls if + the no_new_privs flag will be set on the container process. 
+ AllowPrivilegeEscalation is true always when the container is: + 1) run as Privileged + 2) has CAP_SYS_ADMIN + Note that this field cannot be set when spec.os.name is windows. type: boolean + appArmorProfile: + description: |- + appArmorProfile is the AppArmor options to use by this container. If set, this profile + overrides the pod's appArmorProfile. + Note that this field cannot be set when spec.os.name is windows. + properties: + localhostProfile: + description: |- + localhostProfile indicates a profile loaded on the node that should be used. + The profile must be preconfigured on the node to work. + Must match the loaded name of the profile. + Must be set if and only if type is "Localhost". + type: string + type: + description: |- + type indicates which kind of AppArmor profile will be applied. + Valid options are: + Localhost - a profile pre-loaded on the node. + RuntimeDefault - the container runtime's default profile. + Unconfined - no AppArmor enforcement. + type: string + required: + - type + type: object capabilities: - description: The capabilities to add/drop - when running containers. Defaults to the - default set of capabilities granted by the - container runtime. Note that this field - cannot be set when spec.os.name is windows. + description: |- + The capabilities to add/drop when running containers. + Defaults to the default set of capabilities granted by the container runtime. + Note that this field cannot be set when spec.os.name is windows. properties: add: description: Added capabilities @@ -6864,6 +6964,7 @@ spec: capabilities type type: string type: array + x-kubernetes-list-type: atomic drop: description: Removed capabilities items: @@ -6871,73 +6972,63 @@ spec: capabilities type type: string type: array + x-kubernetes-list-type: atomic type: object privileged: - description: Run container in privileged mode. - Processes in privileged containers are essentially - equivalent to root on the host. Defaults - to false. 
Note that this field cannot be - set when spec.os.name is windows. + description: |- + Run container in privileged mode. + Processes in privileged containers are essentially equivalent to root on the host. + Defaults to false. + Note that this field cannot be set when spec.os.name is windows. type: boolean procMount: - description: procMount denotes the type of - proc mount to use for the containers. The - default is DefaultProcMount which uses the - container runtime defaults for readonly - paths and masked paths. This requires the - ProcMountType feature flag to be enabled. - Note that this field cannot be set when - spec.os.name is windows. + description: |- + procMount denotes the type of proc mount to use for the containers. + The default is DefaultProcMount which uses the container runtime defaults for + readonly paths and masked paths. + This requires the ProcMountType feature flag to be enabled. + Note that this field cannot be set when spec.os.name is windows. type: string readOnlyRootFilesystem: - description: Whether this container has a - read-only root filesystem. Default is false. - Note that this field cannot be set when - spec.os.name is windows. + description: |- + Whether this container has a read-only root filesystem. + Default is false. + Note that this field cannot be set when spec.os.name is windows. type: boolean runAsGroup: - description: The GID to run the entrypoint - of the container process. Uses runtime default - if unset. May also be set in PodSecurityContext. If - set in both SecurityContext and PodSecurityContext, - the value specified in SecurityContext takes - precedence. Note that this field cannot - be set when spec.os.name is windows. + description: |- + The GID to run the entrypoint of the container process. + Uses runtime default if unset. + May also be set in PodSecurityContext. If set in both SecurityContext and + PodSecurityContext, the value specified in SecurityContext takes precedence. 
+ Note that this field cannot be set when spec.os.name is windows. format: int64 type: integer runAsNonRoot: - description: Indicates that the container - must run as a non-root user. If true, the - Kubelet will validate the image at runtime - to ensure that it does not run as UID 0 - (root) and fail to start the container if - it does. If unset or false, no such validation - will be performed. May also be set in PodSecurityContext. If - set in both SecurityContext and PodSecurityContext, - the value specified in SecurityContext takes - precedence. + description: |- + Indicates that the container must run as a non-root user. + If true, the Kubelet will validate the image at runtime to ensure that it + does not run as UID 0 (root) and fail to start the container if it does. + If unset or false, no such validation will be performed. + May also be set in PodSecurityContext. If set in both SecurityContext and + PodSecurityContext, the value specified in SecurityContext takes precedence. type: boolean runAsUser: - description: The UID to run the entrypoint - of the container process. Defaults to user - specified in image metadata if unspecified. - May also be set in PodSecurityContext. If - set in both SecurityContext and PodSecurityContext, - the value specified in SecurityContext takes - precedence. Note that this field cannot - be set when spec.os.name is windows. + description: |- + The UID to run the entrypoint of the container process. + Defaults to user specified in image metadata if unspecified. + May also be set in PodSecurityContext. If set in both SecurityContext and + PodSecurityContext, the value specified in SecurityContext takes precedence. + Note that this field cannot be set when spec.os.name is windows. format: int64 type: integer seLinuxOptions: - description: The SELinux context to be applied - to the container. If unspecified, the container - runtime will allocate a random SELinux context - for each container. 
May also be set in - PodSecurityContext. If set in both SecurityContext - and PodSecurityContext, the value specified - in SecurityContext takes precedence. Note - that this field cannot be set when spec.os.name - is windows. + description: |- + The SELinux context to be applied to the container. + If unspecified, the container runtime will allocate a random SELinux context for each + container. May also be set in PodSecurityContext. If set in both SecurityContext and + PodSecurityContext, the value specified in SecurityContext takes precedence. + Note that this field cannot be set when spec.os.name is windows. properties: level: description: Level is SELinux level label @@ -6957,52 +7048,44 @@ spec: type: string type: object seccompProfile: - description: The seccomp options to use by - this container. If seccomp options are provided - at both the pod & container level, the container - options override the pod options. Note that - this field cannot be set when spec.os.name - is windows. + description: |- + The seccomp options to use by this container. If seccomp options are + provided at both the pod & container level, the container options + override the pod options. + Note that this field cannot be set when spec.os.name is windows. properties: localhostProfile: - description: localhostProfile indicates - a profile defined in a file on the node - should be used. The profile must be - preconfigured on the node to work. Must - be a descending path, relative to the - kubelet's configured seccomp profile - location. Must be set if type is "Localhost". - Must NOT be set for any other type. + description: |- + localhostProfile indicates a profile defined in a file on the node should be used. + The profile must be preconfigured on the node to work. + Must be a descending path, relative to the kubelet's configured seccomp profile location. + Must be set if type is "Localhost". Must NOT be set for any other type. 
type: string type: - description: "type indicates which kind - of seccomp profile will be applied. - Valid options are: \n Localhost - a - profile defined in a file on the node - should be used. RuntimeDefault - the - container runtime default profile should - be used. Unconfined - no profile should - be applied." + description: |- + type indicates which kind of seccomp profile will be applied. + Valid options are: + + + Localhost - a profile defined in a file on the node should be used. + RuntimeDefault - the container runtime default profile should be used. + Unconfined - no profile should be applied. type: string required: - type type: object windowsOptions: - description: The Windows specific settings - applied to all containers. If unspecified, - the options from the PodSecurityContext - will be used. If set in both SecurityContext - and PodSecurityContext, the value specified - in SecurityContext takes precedence. Note - that this field cannot be set when spec.os.name - is linux. + description: |- + The Windows specific settings applied to all containers. + If unspecified, the options from the PodSecurityContext will be used. + If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence. + Note that this field cannot be set when spec.os.name is linux. properties: gmsaCredentialSpec: - description: GMSACredentialSpec is where - the GMSA admission webhook (https://github.com/kubernetes-sigs/windows-gmsa) - inlines the contents of the GMSA credential - spec named by the GMSACredentialSpecName - field. + description: |- + GMSACredentialSpec is where the GMSA admission webhook + (https://github.com/kubernetes-sigs/windows-gmsa) inlines the contents of the + GMSA credential spec named by the GMSACredentialSpecName field. type: string gmsaCredentialSpecName: description: GMSACredentialSpecName is @@ -7010,65 +7093,51 @@ spec: to use. 
type: string hostProcess: - description: HostProcess determines if - a container should be run as a 'Host - Process' container. All of a Pod's containers - must have the same effective HostProcess - value (it is not allowed to have a mix - of HostProcess containers and non-HostProcess - containers). In addition, if HostProcess - is true then HostNetwork must also be - set to true. + description: |- + HostProcess determines if a container should be run as a 'Host Process' container. + All of a Pod's containers must have the same effective HostProcess value + (it is not allowed to have a mix of HostProcess containers and non-HostProcess containers). + In addition, if HostProcess is true then HostNetwork must also be set to true. type: boolean runAsUserName: - description: The UserName in Windows to - run the entrypoint of the container - process. Defaults to the user specified - in image metadata if unspecified. May - also be set in PodSecurityContext. If - set in both SecurityContext and PodSecurityContext, - the value specified in SecurityContext - takes precedence. + description: |- + The UserName in Windows to run the entrypoint of the container process. + Defaults to the user specified in image metadata if unspecified. + May also be set in PodSecurityContext. If set in both SecurityContext and + PodSecurityContext, the value specified in SecurityContext takes precedence. type: string type: object type: object startupProbe: - description: 'StartupProbe indicates that the - Pod has successfully initialized. If specified, - no other probes are executed until this completes - successfully. If this probe fails, the Pod will - be restarted, just as if the livenessProbe failed. - This can be used to provide different probe - parameters at the beginning of a Pod''s lifecycle, - when it might take a long time to load data - or warm a cache, than during steady-state operation. - This cannot be updated. 
More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + description: |- + StartupProbe indicates that the Pod has successfully initialized. + If specified, no other probes are executed until this completes successfully. + If this probe fails, the Pod will be restarted, just as if the livenessProbe failed. + This can be used to provide different probe parameters at the beginning of a Pod's lifecycle, + when it might take a long time to load data or warm a cache, than during steady-state operation. + This cannot be updated. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes properties: exec: description: Exec specifies the action to take. properties: command: - description: Command is the command line - to execute inside the container, the - working directory for the command is - root ('/') in the container's filesystem. - The command is simply exec'd, it is - not run inside a shell, so traditional - shell instructions ('|', etc) won't - work. To use a shell, you need to explicitly - call out to that shell. Exit status - of 0 is treated as live/healthy and - non-zero is unhealthy. + description: |- + Command is the command line to execute inside the container, the working directory for the + command is root ('/') in the container's filesystem. The command is simply exec'd, it is + not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use + a shell, you need to explicitly call out to that shell. + Exit status of 0 is treated as live/healthy and non-zero is unhealthy. items: type: string type: array + x-kubernetes-list-type: atomic type: object failureThreshold: - description: Minimum consecutive failures - for the probe to be considered failed after - having succeeded. Defaults to 3. Minimum - value is 1. + description: |- + Minimum consecutive failures for the probe to be considered failed after having succeeded. + Defaults to 3. Minimum value is 1. 
format: int32 type: integer grpc: @@ -7081,11 +7150,12 @@ spec: format: int32 type: integer service: - description: "Service is the name of the - service to place in the gRPC HealthCheckRequest + description: |- + Service is the name of the service to place in the gRPC HealthCheckRequest (see https://github.com/grpc/grpc/blob/master/doc/health-checking.md). - \n If this is not specified, the default - behavior is defined by gRPC." + + + If this is not specified, the default behavior is defined by gRPC. type: string required: - port @@ -7095,9 +7165,9 @@ spec: to perform. properties: host: - description: Host name to connect to, - defaults to the pod IP. You probably - want to set "Host" in httpHeaders instead. + description: |- + Host name to connect to, defaults to the pod IP. You probably want to set + "Host" in httpHeaders instead. type: string httpHeaders: description: Custom headers to set in @@ -7107,11 +7177,9 @@ spec: custom header to be used in HTTP probes properties: name: - description: The header field name. - This will be canonicalized upon - output, so case-variant names - will be understood as the same - header. + description: |- + The header field name. + This will be canonicalized upon output, so case-variant names will be understood as the same header. type: string value: description: The header field value @@ -7121,6 +7189,7 @@ spec: - value type: object type: array + x-kubernetes-list-type: atomic path: description: Path to access on the HTTP server. @@ -7129,36 +7198,35 @@ spec: anyOf: - type: integer - type: string - description: Name or number of the port - to access on the container. Number must - be in the range 1 to 65535. Name must - be an IANA_SVC_NAME. + description: |- + Name or number of the port to access on the container. + Number must be in the range 1 to 65535. + Name must be an IANA_SVC_NAME. x-kubernetes-int-or-string: true scheme: - description: Scheme to use for connecting - to the host. Defaults to HTTP. 
+ description: |- + Scheme to use for connecting to the host. + Defaults to HTTP. type: string required: - port type: object initialDelaySeconds: - description: 'Number of seconds after the - container has started before liveness probes - are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + description: |- + Number of seconds after the container has started before liveness probes are initiated. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes format: int32 type: integer periodSeconds: - description: How often (in seconds) to perform - the probe. Default to 10 seconds. Minimum - value is 1. + description: |- + How often (in seconds) to perform the probe. + Default to 10 seconds. Minimum value is 1. format: int32 type: integer successThreshold: - description: Minimum consecutive successes - for the probe to be considered successful - after having failed. Defaults to 1. Must - be 1 for liveness and startup. Minimum value - is 1. + description: |- + Minimum consecutive successes for the probe to be considered successful after having failed. + Defaults to 1. Must be 1 for liveness and startup. Minimum value is 1. format: int32 type: integer tcpSocket: @@ -7173,92 +7241,76 @@ spec: anyOf: - type: integer - type: string - description: Number or name of the port - to access on the container. Number must - be in the range 1 to 65535. Name must - be an IANA_SVC_NAME. + description: |- + Number or name of the port to access on the container. + Number must be in the range 1 to 65535. + Name must be an IANA_SVC_NAME. x-kubernetes-int-or-string: true required: - port type: object terminationGracePeriodSeconds: - description: Optional duration in seconds - the pod needs to terminate gracefully upon - probe failure. 
The grace period is the duration - in seconds after the processes running in - the pod are sent a termination signal and - the time when the processes are forcibly - halted with a kill signal. Set this value - longer than the expected cleanup time for - your process. If this value is nil, the - pod's terminationGracePeriodSeconds will - be used. Otherwise, this value overrides - the value provided by the pod spec. Value - must be non-negative integer. The value - zero indicates stop immediately via the - kill signal (no opportunity to shut down). - This is a beta field and requires enabling - ProbeTerminationGracePeriod feature gate. - Minimum value is 1. spec.terminationGracePeriodSeconds - is used if unset. + description: |- + Optional duration in seconds the pod needs to terminate gracefully upon probe failure. + The grace period is the duration in seconds after the processes running in the pod are sent + a termination signal and the time when the processes are forcibly halted with a kill signal. + Set this value longer than the expected cleanup time for your process. + If this value is nil, the pod's terminationGracePeriodSeconds will be used. Otherwise, this + value overrides the value provided by the pod spec. + Value must be non-negative integer. The value zero indicates stop immediately via + the kill signal (no opportunity to shut down). + This is a beta field and requires enabling ProbeTerminationGracePeriod feature gate. + Minimum value is 1. spec.terminationGracePeriodSeconds is used if unset. format: int64 type: integer timeoutSeconds: - description: 'Number of seconds after which - the probe times out. Defaults to 1 second. - Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + description: |- + Number of seconds after which the probe times out. + Defaults to 1 second. Minimum value is 1. 
+ More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes format: int32 type: integer type: object stdin: - description: Whether this container should allocate - a buffer for stdin in the container runtime. - If this is not set, reads from stdin in the - container will always result in EOF. Default - is false. + description: |- + Whether this container should allocate a buffer for stdin in the container runtime. If this + is not set, reads from stdin in the container will always result in EOF. + Default is false. type: boolean stdinOnce: - description: Whether the container runtime should - close the stdin channel after it has been opened - by a single attach. When stdin is true the stdin - stream will remain open across multiple attach - sessions. If stdinOnce is set to true, stdin - is opened on container start, is empty until - the first client attaches to stdin, and then - remains open and accepts data until the client - disconnects, at which time stdin is closed and - remains closed until the container is restarted. - If this flag is false, a container processes - that reads from stdin will never receive an - EOF. Default is false + description: |- + Whether the container runtime should close the stdin channel after it has been opened by + a single attach. When stdin is true the stdin stream will remain open across multiple attach + sessions. If stdinOnce is set to true, stdin is opened on container start, is empty until the + first client attaches to stdin, and then remains open and accepts data until the client disconnects, + at which time stdin is closed and remains closed until the container is restarted. If this + flag is false, a container processes that reads from stdin will never receive an EOF. + Default is false type: boolean terminationMessagePath: - description: 'Optional: Path at which the file - to which the container''s termination message - will be written is mounted into the container''s - filesystem. 
Message written is intended to be - brief final status, such as an assertion failure - message. Will be truncated by the node if greater - than 4096 bytes. The total message length across - all containers will be limited to 12kb. Defaults - to /dev/termination-log. Cannot be updated.' + description: |- + Optional: Path at which the file to which the container's termination message + will be written is mounted into the container's filesystem. + Message written is intended to be brief final status, such as an assertion failure message. + Will be truncated by the node if greater than 4096 bytes. The total message length across + all containers will be limited to 12kb. + Defaults to /dev/termination-log. + Cannot be updated. type: string terminationMessagePolicy: - description: Indicate how the termination message - should be populated. File will use the contents - of terminationMessagePath to populate the container - status message on both success and failure. - FallbackToLogsOnError will use the last chunk - of container log output if the termination message - file is empty and the container exited with - an error. The log output is limited to 2048 - bytes or 80 lines, whichever is smaller. Defaults - to File. Cannot be updated. + description: |- + Indicate how the termination message should be populated. File will use the contents of + terminationMessagePath to populate the container status message on both success and failure. + FallbackToLogsOnError will use the last chunk of container log output if the termination + message file is empty and the container exited with an error. + The log output is limited to 2048 bytes or 80 lines, whichever is smaller. + Defaults to File. + Cannot be updated. type: string tty: - description: Whether this container should allocate - a TTY for itself, also requires 'stdin' to be - true. Default is false. + description: |- + Whether this container should allocate a TTY for itself, also requires 'stdin' to be true. + Default is false. 
type: boolean volumeDevices: description: volumeDevices is the list of block @@ -7281,106 +7333,153 @@ spec: - name type: object type: array + x-kubernetes-list-map-keys: + - devicePath + x-kubernetes-list-type: map volumeMounts: - description: Pod volumes to mount into the container's - filesystem. Cannot be updated. + description: |- + Pod volumes to mount into the container's filesystem. + Cannot be updated. items: description: VolumeMount describes a mounting of a Volume within a container. properties: mountPath: - description: Path within the container at - which the volume should be mounted. Must + description: |- + Path within the container at which the volume should be mounted. Must not contain ':'. type: string mountPropagation: - description: mountPropagation determines - how mounts are propagated from the host + description: |- + mountPropagation determines how mounts are propagated from the host to container and the other way around. - When not set, MountPropagationNone is - used. This field is beta in 1.10. + When not set, MountPropagationNone is used. + This field is beta in 1.10. + When RecursiveReadOnly is set to IfPossible or to Enabled, MountPropagation must be None or unspecified + (which defaults to None). type: string name: description: This must match the Name of a Volume. type: string readOnly: - description: Mounted read-only if true, - read-write otherwise (false or unspecified). + description: |- + Mounted read-only if true, read-write otherwise (false or unspecified). Defaults to false. type: boolean + recursiveReadOnly: + description: |- + RecursiveReadOnly specifies whether read-only mounts should be handled + recursively. + + + If ReadOnly is false, this field has no meaning and must be unspecified. + + + If ReadOnly is true, and this field is set to Disabled, the mount is not made + recursively read-only. If this field is set to IfPossible, the mount is made + recursively read-only, if it is supported by the container runtime. 
If this + field is set to Enabled, the mount is made recursively read-only if it is + supported by the container runtime, otherwise the pod will not be started and + an error will be generated to indicate the reason. + + + If this field is set to IfPossible or Enabled, MountPropagation must be set to + None (or be unspecified, which defaults to None). + + + If this field is not specified, it is treated as an equivalent of Disabled. + type: string subPath: - description: Path within the volume from - which the container's volume should be - mounted. Defaults to "" (volume's root). + description: |- + Path within the volume from which the container's volume should be mounted. + Defaults to "" (volume's root). type: string subPathExpr: - description: Expanded path within the volume - from which the container's volume should - be mounted. Behaves similarly to SubPath - but environment variable references $(VAR_NAME) - are expanded using the container's environment. - Defaults to "" (volume's root). SubPathExpr - and SubPath are mutually exclusive. + description: |- + Expanded path within the volume from which the container's volume should be mounted. + Behaves similarly to SubPath but environment variable references $(VAR_NAME) are expanded using the container's environment. + Defaults to "" (volume's root). + SubPathExpr and SubPath are mutually exclusive. type: string required: - mountPath - name type: object type: array + x-kubernetes-list-map-keys: + - mountPath + x-kubernetes-list-type: map workingDir: - description: Container's working directory. If - not specified, the container runtime's default - will be used, which might be configured in the - container image. Cannot be updated. + description: |- + Container's working directory. + If not specified, the container runtime's default will be used, which + might be configured in the container image. + Cannot be updated. 
type: string required: - name type: object type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map nodeName: - description: NodeName is a request to schedule this - pod onto a specific node. If it is non-empty, the - scheduler simply schedules this pod onto that node, - assuming that it fits resource requirements. + description: |- + NodeName is a request to schedule this pod onto a specific node. If it is non-empty, + the scheduler simply schedules this pod onto that node, assuming that it fits resource + requirements. type: string nodeSelector: additionalProperties: type: string - description: 'NodeSelector is a selector which must - be true for the pod to fit on a node. Selector which - must match a node''s labels for the pod to be scheduled - on that node. More info: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/' + description: |- + NodeSelector is a selector which must be true for the pod to fit on a node. + Selector which must match a node's labels for the pod to be scheduled on that node. + More info: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/ type: object x-kubernetes-map-type: atomic os: - description: "Specifies the OS of the containers in - the pod. Some pod and container fields are restricted - if this is set. \n If the OS field is set to linux, - the following fields must be unset: -securityContext.windowsOptions - \n If the OS field is set to windows, following fields - must be unset: - spec.hostPID - spec.hostIPC - spec.hostUsers - - spec.securityContext.seLinuxOptions - spec.securityContext.seccompProfile - - spec.securityContext.fsGroup - spec.securityContext.fsGroupChangePolicy - - spec.securityContext.sysctls - spec.shareProcessNamespace - - spec.securityContext.runAsUser - spec.securityContext.runAsGroup - - spec.securityContext.supplementalGroups - spec.containers[*].securityContext.seLinuxOptions + description: |- + Specifies the OS of the containers in the pod. 
+ Some pod and container fields are restricted if this is set. + + + If the OS field is set to linux, the following fields must be unset: + -securityContext.windowsOptions + + + If the OS field is set to windows, following fields must be unset: + - spec.hostPID + - spec.hostIPC + - spec.hostUsers + - spec.securityContext.appArmorProfile + - spec.securityContext.seLinuxOptions + - spec.securityContext.seccompProfile + - spec.securityContext.fsGroup + - spec.securityContext.fsGroupChangePolicy + - spec.securityContext.sysctls + - spec.shareProcessNamespace + - spec.securityContext.runAsUser + - spec.securityContext.runAsGroup + - spec.securityContext.supplementalGroups + - spec.containers[*].securityContext.appArmorProfile + - spec.containers[*].securityContext.seLinuxOptions - spec.containers[*].securityContext.seccompProfile - spec.containers[*].securityContext.capabilities - spec.containers[*].securityContext.readOnlyRootFilesystem - - spec.containers[*].securityContext.privileged - - spec.containers[*].securityContext.allowPrivilegeEscalation - - spec.containers[*].securityContext.procMount - spec.containers[*].securityContext.runAsUser - - spec.containers[*].securityContext.runAsGroup" + - spec.containers[*].securityContext.privileged + - spec.containers[*].securityContext.allowPrivilegeEscalation + - spec.containers[*].securityContext.procMount + - spec.containers[*].securityContext.runAsUser + - spec.containers[*].securityContext.runAsGroup properties: name: - description: 'Name is the name of the operating - system. The currently supported values are linux - and windows. Additional value may be defined in - future and can be one of: https://github.com/opencontainers/runtime-spec/blob/master/config.md#platform-specific-configuration - Clients should expect to handle additional values - and treat unrecognized values in this field as - os: null' + description: |- + Name is the name of the operating system. The currently supported values are linux and windows. 
+ Additional value may be defined in future and can be one of: + https://github.com/opencontainers/runtime-spec/blob/master/config.md#platform-specific-configuration + Clients should expect to handle additional values and treat unrecognized values in this field as os: null type: string required: - name @@ -7392,48 +7491,45 @@ spec: - type: string pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true - description: 'Overhead represents the resource overhead - associated with running a pod for a given RuntimeClass. - This field will be autopopulated at admission time - by the RuntimeClass admission controller. If the RuntimeClass - admission controller is enabled, overhead must not - be set in Pod create requests. The RuntimeClass admission - controller will reject Pod create requests which have - the overhead already set. If RuntimeClass is configured - and selected in the PodSpec, Overhead will be set - to the value defined in the corresponding RuntimeClass, - otherwise it will remain unset and treated as zero. - More info: https://git.k8s.io/enhancements/keps/sig-node/688-pod-overhead/README.md' + description: |- + Overhead represents the resource overhead associated with running a pod for a given RuntimeClass. + This field will be autopopulated at admission time by the RuntimeClass admission controller. If + the RuntimeClass admission controller is enabled, overhead must not be set in Pod create requests. + The RuntimeClass admission controller will reject Pod create requests which have the overhead already + set. If RuntimeClass is configured and selected in the PodSpec, Overhead will be set to the value + defined in the corresponding RuntimeClass, otherwise it will remain unset and treated as zero. 
+ More info: https://git.k8s.io/enhancements/keps/sig-node/688-pod-overhead/README.md type: object preemptionPolicy: - description: PreemptionPolicy is the Policy for preempting - pods with lower priority. One of Never, PreemptLowerPriority. + description: |- + PreemptionPolicy is the Policy for preempting pods with lower priority. + One of Never, PreemptLowerPriority. Defaults to PreemptLowerPriority if unset. type: string priority: - description: The priority value. Various system components - use this field to find the priority of the pod. When - Priority Admission Controller is enabled, it prevents - users from setting this field. The admission controller - populates this field from PriorityClassName. The higher - the value, the higher the priority. + description: |- + The priority value. Various system components use this field to find the + priority of the pod. When Priority Admission Controller is enabled, it + prevents users from setting this field. The admission controller populates + this field from PriorityClassName. + The higher the value, the higher the priority. format: int32 type: integer priorityClassName: - description: If specified, indicates the pod's priority. - "system-node-critical" and "system-cluster-critical" - are two special keywords which indicate the highest - priorities with the former being the highest priority. - Any other name must be defined by creating a PriorityClass - object with that name. If not specified, the pod priority - will be default or zero if there is no default. + description: |- + If specified, indicates the pod's priority. "system-node-critical" and + "system-cluster-critical" are two special keywords which indicate the + highest priorities with the former being the highest priority. Any other + name must be defined by creating a PriorityClass object with that name. + If not specified, the pod priority will be default or zero if there is no + default. 
type: string readinessGates: - description: 'If specified, all readiness gates will - be evaluated for pod readiness. A pod is ready when - all its containers are ready AND all conditions specified - in the readiness gates have status equal to "True" - More info: https://git.k8s.io/enhancements/keps/sig-network/580-pod-readiness-gates' + description: |- + If specified, all readiness gates will be evaluated for pod readiness. + A pod is ready when all its containers are ready AND + all conditions specified in the readiness gates have status equal to "True" + More info: https://git.k8s.io/enhancements/keps/sig-network/580-pod-readiness-gates items: description: PodReadinessGate contains the reference to a pod condition @@ -7446,48 +7542,56 @@ spec: - conditionType type: object type: array + x-kubernetes-list-type: atomic resourceClaims: - description: "ResourceClaims defines which ResourceClaims - must be allocated and reserved before the Pod is allowed - to start. The resources will be made available to - those containers which consume them by name. \n This - is an alpha field and requires enabling the DynamicResourceAllocation - feature gate. \n This field is immutable." + description: |- + ResourceClaims defines which ResourceClaims must be allocated + and reserved before the Pod is allowed to start. The resources + will be made available to those containers which consume them + by name. + + + This is an alpha field and requires enabling the + DynamicResourceAllocation feature gate. + + + This field is immutable. items: - description: PodResourceClaim references exactly one - ResourceClaim through a ClaimSource. It adds a name - to it that uniquely identifies the ResourceClaim - inside the Pod. Containers that need access to the - ResourceClaim reference it with this name. + description: |- + PodResourceClaim references exactly one ResourceClaim through a ClaimSource. + It adds a name to it that uniquely identifies the ResourceClaim inside the Pod. 
+ Containers that need access to the ResourceClaim reference it with this name. properties: name: - description: Name uniquely identifies this resource - claim inside the pod. This must be a DNS_LABEL. + description: |- + Name uniquely identifies this resource claim inside the pod. + This must be a DNS_LABEL. type: string source: description: Source describes where to find the ResourceClaim. properties: resourceClaimName: - description: ResourceClaimName is the name - of a ResourceClaim object in the same namespace - as this pod. + description: |- + ResourceClaimName is the name of a ResourceClaim object in the same + namespace as this pod. type: string resourceClaimTemplateName: - description: "ResourceClaimTemplateName is - the name of a ResourceClaimTemplate object - in the same namespace as this pod. \n The - template will be used to create a new ResourceClaim, - which will be bound to this pod. When this - pod is deleted, the ResourceClaim will also - be deleted. The pod name and resource name, - along with a generated component, will be - used to form a unique name for the ResourceClaim, - which will be recorded in pod.status.resourceClaimStatuses. - \n This field is immutable and no changes - will be made to the corresponding ResourceClaim - by the control plane after creating the - ResourceClaim." + description: |- + ResourceClaimTemplateName is the name of a ResourceClaimTemplate + object in the same namespace as this pod. + + + The template will be used to create a new ResourceClaim, which will + be bound to this pod. When this pod is deleted, the ResourceClaim + will also be deleted. The pod name and resource name, along with a + generated component, will be used to form a unique name for the + ResourceClaim, which will be recorded in pod.status.resourceClaimStatuses. + + + This field is immutable and no changes will be made to the + corresponding ResourceClaim by the control plane after creating the + ResourceClaim. 
type: string type: object required: @@ -7498,42 +7602,41 @@ spec: - name x-kubernetes-list-type: map restartPolicy: - description: 'Restart policy for all containers within - the pod. One of Always, OnFailure, Never. In some - contexts, only a subset of those values may be permitted. - Default to Always. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#restart-policy' + description: |- + Restart policy for all containers within the pod. + One of Always, OnFailure, Never. In some contexts, only a subset of those values may be permitted. + Default to Always. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#restart-policy type: string runtimeClassName: - description: 'RuntimeClassName refers to a RuntimeClass - object in the node.k8s.io group, which should be used - to run this pod. If no RuntimeClass resource matches - the named class, the pod will not be run. If unset - or empty, the "legacy" RuntimeClass will be used, - which is an implicit class with an empty definition - that uses the default runtime handler. More info: - https://git.k8s.io/enhancements/keps/sig-node/585-runtime-class' + description: |- + RuntimeClassName refers to a RuntimeClass object in the node.k8s.io group, which should be used + to run this pod. If no RuntimeClass resource matches the named class, the pod will not be run. + If unset or empty, the "legacy" RuntimeClass will be used, which is an implicit class with an + empty definition that uses the default runtime handler. + More info: https://git.k8s.io/enhancements/keps/sig-node/585-runtime-class type: string schedulerName: - description: If specified, the pod will be dispatched - by specified scheduler. If not specified, the pod - will be dispatched by default scheduler. + description: |- + If specified, the pod will be dispatched by specified scheduler. + If not specified, the pod will be dispatched by default scheduler. 
type: string schedulingGates: - description: "SchedulingGates is an opaque list of values - that if specified will block scheduling the pod. If - schedulingGates is not empty, the pod will stay in - the SchedulingGated state and the scheduler will not - attempt to schedule the pod. \n SchedulingGates can - only be set at pod creation time, and be removed only - afterwards. \n This is a beta feature enabled by the - PodSchedulingReadiness feature gate." + description: |- + SchedulingGates is an opaque list of values that if specified will block scheduling the pod. + If schedulingGates is not empty, the pod will stay in the SchedulingGated state and the + scheduler will not attempt to schedule the pod. + + + SchedulingGates can only be set at pod creation time, and be removed only afterwards. items: description: PodSchedulingGate is associated to a Pod to guard its scheduling. properties: name: - description: Name of the scheduling gate. Each - scheduling gate must have a unique name field. + description: |- + Name of the scheduling gate. + Each scheduling gate must have a unique name field. type: string required: - name @@ -7543,78 +7646,96 @@ spec: - name x-kubernetes-list-type: map securityContext: - description: 'SecurityContext holds pod-level security - attributes and common container settings. Optional: - Defaults to empty. See type description for default - values of each field.' + description: |- + SecurityContext holds pod-level security attributes and common container settings. + Optional: Defaults to empty. See type description for default values of each field. properties: + appArmorProfile: + description: |- + appArmorProfile is the AppArmor options to use by the containers in this pod. + Note that this field cannot be set when spec.os.name is windows. + properties: + localhostProfile: + description: |- + localhostProfile indicates a profile loaded on the node that should be used. + The profile must be preconfigured on the node to work. 
+ Must match the loaded name of the profile. + Must be set if and only if type is "Localhost". + type: string + type: + description: |- + type indicates which kind of AppArmor profile will be applied. + Valid options are: + Localhost - a profile pre-loaded on the node. + RuntimeDefault - the container runtime's default profile. + Unconfined - no AppArmor enforcement. + type: string + required: + - type + type: object fsGroup: - description: "A special supplemental group that - applies to all containers in a pod. Some volume - types allow the Kubelet to change the ownership - of that volume to be owned by the pod: \n 1. The - owning GID will be the FSGroup 2. The setgid bit - is set (new files created in the volume will be - owned by FSGroup) 3. The permission bits are OR'd - with rw-rw---- \n If unset, the Kubelet will not - modify the ownership and permissions of any volume. - Note that this field cannot be set when spec.os.name - is windows." + description: |- + A special supplemental group that applies to all containers in a pod. + Some volume types allow the Kubelet to change the ownership of that volume + to be owned by the pod: + + + 1. The owning GID will be the FSGroup + 2. The setgid bit is set (new files created in the volume will be owned by FSGroup) + 3. The permission bits are OR'd with rw-rw---- + + + If unset, the Kubelet will not modify the ownership and permissions of any volume. + Note that this field cannot be set when spec.os.name is windows. format: int64 type: integer fsGroupChangePolicy: - description: 'fsGroupChangePolicy defines behavior - of changing ownership and permission of the volume - before being exposed inside Pod. This field will - only apply to volume types which support fsGroup - based ownership(and permissions). It will have - no effect on ephemeral volume types such as: secret, - configmaps and emptydir. Valid values are "OnRootMismatch" - and "Always". If not specified, "Always" is used. 
- Note that this field cannot be set when spec.os.name - is windows.' + description: |- + fsGroupChangePolicy defines behavior of changing ownership and permission of the volume + before being exposed inside Pod. This field will only apply to + volume types which support fsGroup based ownership(and permissions). + It will have no effect on ephemeral volume types such as: secret, configmaps + and emptydir. + Valid values are "OnRootMismatch" and "Always". If not specified, "Always" is used. + Note that this field cannot be set when spec.os.name is windows. type: string runAsGroup: - description: The GID to run the entrypoint of the - container process. Uses runtime default if unset. - May also be set in SecurityContext. If set in - both SecurityContext and PodSecurityContext, the - value specified in SecurityContext takes precedence - for that container. Note that this field cannot - be set when spec.os.name is windows. + description: |- + The GID to run the entrypoint of the container process. + Uses runtime default if unset. + May also be set in SecurityContext. If set in both SecurityContext and + PodSecurityContext, the value specified in SecurityContext takes precedence + for that container. + Note that this field cannot be set when spec.os.name is windows. format: int64 type: integer runAsNonRoot: - description: Indicates that the container must run - as a non-root user. If true, the Kubelet will - validate the image at runtime to ensure that it - does not run as UID 0 (root) and fail to start - the container if it does. If unset or false, no - such validation will be performed. May also be - set in SecurityContext. If set in both SecurityContext - and PodSecurityContext, the value specified in - SecurityContext takes precedence. + description: |- + Indicates that the container must run as a non-root user. + If true, the Kubelet will validate the image at runtime to ensure that it + does not run as UID 0 (root) and fail to start the container if it does. 
+ If unset or false, no such validation will be performed. + May also be set in SecurityContext. If set in both SecurityContext and + PodSecurityContext, the value specified in SecurityContext takes precedence. type: boolean runAsUser: - description: The UID to run the entrypoint of the - container process. Defaults to user specified - in image metadata if unspecified. May also be - set in SecurityContext. If set in both SecurityContext - and PodSecurityContext, the value specified in - SecurityContext takes precedence for that container. - Note that this field cannot be set when spec.os.name - is windows. + description: |- + The UID to run the entrypoint of the container process. + Defaults to user specified in image metadata if unspecified. + May also be set in SecurityContext. If set in both SecurityContext and + PodSecurityContext, the value specified in SecurityContext takes precedence + for that container. + Note that this field cannot be set when spec.os.name is windows. format: int64 type: integer seLinuxOptions: - description: The SELinux context to be applied to - all containers. If unspecified, the container - runtime will allocate a random SELinux context - for each container. May also be set in SecurityContext. If - set in both SecurityContext and PodSecurityContext, - the value specified in SecurityContext takes precedence - for that container. Note that this field cannot - be set when spec.os.name is windows. + description: |- + The SELinux context to be applied to all containers. + If unspecified, the container runtime will allocate a random SELinux context for each + container. May also be set in SecurityContext. If set in + both SecurityContext and PodSecurityContext, the value specified in SecurityContext + takes precedence for that container. + Note that this field cannot be set when spec.os.name is windows. 
properties: level: description: Level is SELinux level label that @@ -7634,53 +7755,49 @@ spec: type: string type: object seccompProfile: - description: The seccomp options to use by the containers - in this pod. Note that this field cannot be set - when spec.os.name is windows. + description: |- + The seccomp options to use by the containers in this pod. + Note that this field cannot be set when spec.os.name is windows. properties: localhostProfile: - description: localhostProfile indicates a profile - defined in a file on the node should be used. - The profile must be preconfigured on the node - to work. Must be a descending path, relative - to the kubelet's configured seccomp profile - location. Must be set if type is "Localhost". - Must NOT be set for any other type. + description: |- + localhostProfile indicates a profile defined in a file on the node should be used. + The profile must be preconfigured on the node to work. + Must be a descending path, relative to the kubelet's configured seccomp profile location. + Must be set if type is "Localhost". Must NOT be set for any other type. type: string type: - description: "type indicates which kind of seccomp - profile will be applied. Valid options are: - \n Localhost - a profile defined in a file - on the node should be used. RuntimeDefault - - the container runtime default profile should - be used. Unconfined - no profile should be - applied." + description: |- + type indicates which kind of seccomp profile will be applied. + Valid options are: + + + Localhost - a profile defined in a file on the node should be used. + RuntimeDefault - the container runtime default profile should be used. + Unconfined - no profile should be applied. 
type: string required: - type type: object supplementalGroups: - description: A list of groups applied to the first - process run in each container, in addition to - the container's primary GID, the fsGroup (if specified), - and group memberships defined in the container - image for the uid of the container process. If - unspecified, no additional groups are added to - any container. Note that group memberships defined - in the container image for the uid of the container - process are still effective, even if they are - not included in this list. Note that this field - cannot be set when spec.os.name is windows. + description: |- + A list of groups applied to the first process run in each container, in addition + to the container's primary GID, the fsGroup (if specified), and group memberships + defined in the container image for the uid of the container process. If unspecified, + no additional groups are added to any container. Note that group memberships + defined in the container image for the uid of the container process are still effective, + even if they are not included in this list. + Note that this field cannot be set when spec.os.name is windows. items: format: int64 type: integer type: array + x-kubernetes-list-type: atomic sysctls: - description: Sysctls hold a list of namespaced sysctls - used for the pod. Pods with unsupported sysctls - (by the container runtime) might fail to launch. - Note that this field cannot be set when spec.os.name - is windows. + description: |- + Sysctls hold a list of namespaced sysctls used for the pod. Pods with unsupported + sysctls (by the container runtime) might fail to launch. + Note that this field cannot be set when spec.os.name is windows. items: description: Sysctl defines a kernel parameter to be set @@ -7696,343 +7813,300 @@ spec: - value type: object type: array + x-kubernetes-list-type: atomic windowsOptions: - description: The Windows specific settings applied - to all containers. 
If unspecified, the options - within a container's SecurityContext will be used. - If set in both SecurityContext and PodSecurityContext, - the value specified in SecurityContext takes precedence. - Note that this field cannot be set when spec.os.name - is linux. + description: |- + The Windows specific settings applied to all containers. + If unspecified, the options within a container's SecurityContext will be used. + If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence. + Note that this field cannot be set when spec.os.name is linux. properties: gmsaCredentialSpec: - description: GMSACredentialSpec is where the - GMSA admission webhook (https://github.com/kubernetes-sigs/windows-gmsa) - inlines the contents of the GMSA credential - spec named by the GMSACredentialSpecName field. + description: |- + GMSACredentialSpec is where the GMSA admission webhook + (https://github.com/kubernetes-sigs/windows-gmsa) inlines the contents of the + GMSA credential spec named by the GMSACredentialSpecName field. type: string gmsaCredentialSpecName: description: GMSACredentialSpecName is the name of the GMSA credential spec to use. type: string hostProcess: - description: HostProcess determines if a container - should be run as a 'Host Process' container. - All of a Pod's containers must have the same - effective HostProcess value (it is not allowed - to have a mix of HostProcess containers and - non-HostProcess containers). In addition, - if HostProcess is true then HostNetwork must - also be set to true. + description: |- + HostProcess determines if a container should be run as a 'Host Process' container. + All of a Pod's containers must have the same effective HostProcess value + (it is not allowed to have a mix of HostProcess containers and non-HostProcess containers). + In addition, if HostProcess is true then HostNetwork must also be set to true. 
type: boolean runAsUserName: - description: The UserName in Windows to run - the entrypoint of the container process. Defaults - to the user specified in image metadata if - unspecified. May also be set in PodSecurityContext. - If set in both SecurityContext and PodSecurityContext, - the value specified in SecurityContext takes - precedence. + description: |- + The UserName in Windows to run the entrypoint of the container process. + Defaults to the user specified in image metadata if unspecified. + May also be set in PodSecurityContext. If set in both SecurityContext and + PodSecurityContext, the value specified in SecurityContext takes precedence. type: string type: object type: object serviceAccount: - description: 'DeprecatedServiceAccount is a depreciated - alias for ServiceAccountName. Deprecated: Use serviceAccountName - instead.' + description: |- + DeprecatedServiceAccount is a deprecated alias for ServiceAccountName. + Deprecated: Use serviceAccountName instead. type: string serviceAccountName: - description: 'ServiceAccountName is the name of the - ServiceAccount to use to run this pod. More info: - https://kubernetes.io/docs/tasks/configure-pod-container/configure-service-account/' + description: |- + ServiceAccountName is the name of the ServiceAccount to use to run this pod. + More info: https://kubernetes.io/docs/tasks/configure-pod-container/configure-service-account/ type: string setHostnameAsFQDN: - description: If true the pod's hostname will be configured - as the pod's FQDN, rather than the leaf name (the - default). In Linux containers, this means setting - the FQDN in the hostname field of the kernel (the - nodename field of struct utsname). In Windows containers, - this means setting the registry value of hostname - for the registry key HKEY_LOCAL_MACHINE\\SYSTEM\\CurrentControlSet\\Services\\Tcpip\\Parameters - to FQDN. If a pod does not have FQDN, this has no - effect. Default to false. 
+ description: |- + If true the pod's hostname will be configured as the pod's FQDN, rather than the leaf name (the default). + In Linux containers, this means setting the FQDN in the hostname field of the kernel (the nodename field of struct utsname). + In Windows containers, this means setting the registry value of hostname for the registry key HKEY_LOCAL_MACHINE\\SYSTEM\\CurrentControlSet\\Services\\Tcpip\\Parameters to FQDN. + If a pod does not have FQDN, this has no effect. + Default to false. type: boolean shareProcessNamespace: - description: 'Share a single process namespace between - all of the containers in a pod. When this is set containers - will be able to view and signal processes from other - containers in the same pod, and the first process - in each container will not be assigned PID 1. HostPID - and ShareProcessNamespace cannot both be set. Optional: - Default to false.' + description: |- + Share a single process namespace between all of the containers in a pod. + When this is set containers will be able to view and signal processes from other containers + in the same pod, and the first process in each container will not be assigned PID 1. + HostPID and ShareProcessNamespace cannot both be set. + Optional: Default to false. type: boolean subdomain: - description: If specified, the fully qualified Pod hostname - will be "...svc.". If not specified, the pod will not have - a domainname at all. + description: |- + If specified, the fully qualified Pod hostname will be "...svc.". + If not specified, the pod will not have a domainname at all. type: string terminationGracePeriodSeconds: - description: Optional duration in seconds the pod needs - to terminate gracefully. May be decreased in delete - request. Value must be non-negative integer. The value - zero indicates stop immediately via the kill signal - (no opportunity to shut down). If this value is nil, - the default grace period will be used instead. 
The - grace period is the duration in seconds after the - processes running in the pod are sent a termination - signal and the time when the processes are forcibly - halted with a kill signal. Set this value longer than - the expected cleanup time for your process. Defaults - to 30 seconds. + description: |- + Optional duration in seconds the pod needs to terminate gracefully. May be decreased in delete request. + Value must be non-negative integer. The value zero indicates stop immediately via + the kill signal (no opportunity to shut down). + If this value is nil, the default grace period will be used instead. + The grace period is the duration in seconds after the processes running in the pod are sent + a termination signal and the time when the processes are forcibly halted with a kill signal. + Set this value longer than the expected cleanup time for your process. + Defaults to 30 seconds. format: int64 type: integer tolerations: description: If specified, the pod's tolerations. items: - description: The pod this Toleration is attached to - tolerates any taint that matches the triple - using the matching operator . + description: |- + The pod this Toleration is attached to tolerates any taint that matches + the triple using the matching operator . properties: effect: - description: Effect indicates the taint effect - to match. Empty means match all taint effects. - When specified, allowed values are NoSchedule, - PreferNoSchedule and NoExecute. + description: |- + Effect indicates the taint effect to match. Empty means match all taint effects. + When specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute. type: string key: - description: Key is the taint key that the toleration - applies to. Empty means match all taint keys. - If the key is empty, operator must be Exists; - this combination means to match all values and - all keys. + description: |- + Key is the taint key that the toleration applies to. Empty means match all taint keys. 
+ If the key is empty, operator must be Exists; this combination means to match all values and all keys. type: string operator: - description: Operator represents a key's relationship - to the value. Valid operators are Exists and - Equal. Defaults to Equal. Exists is equivalent - to wildcard for value, so that a pod can tolerate - all taints of a particular category. + description: |- + Operator represents a key's relationship to the value. + Valid operators are Exists and Equal. Defaults to Equal. + Exists is equivalent to wildcard for value, so that a pod can + tolerate all taints of a particular category. type: string tolerationSeconds: - description: TolerationSeconds represents the - period of time the toleration (which must be - of effect NoExecute, otherwise this field is - ignored) tolerates the taint. By default, it - is not set, which means tolerate the taint forever - (do not evict). Zero and negative values will - be treated as 0 (evict immediately) by the system. + description: |- + TolerationSeconds represents the period of time the toleration (which must be + of effect NoExecute, otherwise this field is ignored) tolerates the taint. By default, + it is not set, which means tolerate the taint forever (do not evict). Zero and + negative values will be treated as 0 (evict immediately) by the system. format: int64 type: integer value: - description: Value is the taint value the toleration - matches to. If the operator is Exists, the value - should be empty, otherwise just a regular string. + description: |- + Value is the taint value the toleration matches to. + If the operator is Exists, the value should be empty, otherwise just a regular string. type: string type: object type: array + x-kubernetes-list-type: atomic topologySpreadConstraints: - description: TopologySpreadConstraints describes how - a group of pods ought to spread across topology domains. - Scheduler will schedule pods in a way which abides - by the constraints. 
All topologySpreadConstraints - are ANDed. + description: |- + TopologySpreadConstraints describes how a group of pods ought to spread across topology + domains. Scheduler will schedule pods in a way which abides by the constraints. + All topologySpreadConstraints are ANDed. items: description: TopologySpreadConstraint specifies how to spread matching pods among the given topology. properties: labelSelector: - description: LabelSelector is used to find matching - pods. Pods that match this label selector are - counted to determine the number of pods in their - corresponding topology domain. + description: |- + LabelSelector is used to find matching pods. + Pods that match this label selector are counted to determine the number of pods + in their corresponding topology domain. properties: matchExpressions: description: matchExpressions is a list of label selector requirements. The requirements are ANDed. items: - description: A label selector requirement - is a selector that contains values, a - key, and an operator that relates the - key and values. + description: |- + A label selector requirement is a selector that contains values, a key, and an operator that + relates the key and values. properties: key: description: key is the label key that the selector applies to. type: string operator: - description: operator represents a key's - relationship to a set of values. Valid - operators are In, NotIn, Exists and - DoesNotExist. + description: |- + operator represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists and DoesNotExist. type: string values: - description: values is an array of string - values. If the operator is In or NotIn, - the values array must be non-empty. - If the operator is Exists or DoesNotExist, - the values array must be empty. This - array is replaced during a strategic + description: |- + values is an array of string values. If the operator is In or NotIn, + the values array must be non-empty. 
If the operator is Exists or DoesNotExist, + the values array must be empty. This array is replaced during a strategic merge patch. items: type: string type: array + x-kubernetes-list-type: atomic required: - key - operator type: object type: array + x-kubernetes-list-type: atomic matchLabels: additionalProperties: type: string - description: matchLabels is a map of {key,value} - pairs. A single {key,value} in the matchLabels - map is equivalent to an element of matchExpressions, - whose key field is "key", the operator is - "In", and the values array contains only - "value". The requirements are ANDed. + description: |- + matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels + map is equivalent to an element of matchExpressions, whose key field is "key", the + operator is "In", and the values array contains only "value". The requirements are ANDed. type: object type: object x-kubernetes-map-type: atomic matchLabelKeys: - description: "MatchLabelKeys is a set of pod label - keys to select the pods over which spreading - will be calculated. The keys are used to lookup - values from the incoming pod labels, those key-value - labels are ANDed with labelSelector to select - the group of existing pods over which spreading - will be calculated for the incoming pod. The - same key is forbidden to exist in both MatchLabelKeys - and LabelSelector. MatchLabelKeys cannot be - set when LabelSelector isn't set. Keys that - don't exist in the incoming pod labels will - be ignored. A null or empty list means only - match against labelSelector. \n This is a beta - field and requires the MatchLabelKeysInPodTopologySpread - feature gate to be enabled (enabled by default)." + description: |- + MatchLabelKeys is a set of pod label keys to select the pods over which + spreading will be calculated. 
The keys are used to lookup values from the + incoming pod labels, those key-value labels are ANDed with labelSelector + to select the group of existing pods over which spreading will be calculated + for the incoming pod. The same key is forbidden to exist in both MatchLabelKeys and LabelSelector. + MatchLabelKeys cannot be set when LabelSelector isn't set. + Keys that don't exist in the incoming pod labels will + be ignored. A null or empty list means only match against labelSelector. + + + This is a beta field and requires the MatchLabelKeysInPodTopologySpread feature gate to be enabled (enabled by default). items: type: string type: array x-kubernetes-list-type: atomic maxSkew: - description: 'MaxSkew describes the degree to - which pods may be unevenly distributed. When - `whenUnsatisfiable=DoNotSchedule`, it is the - maximum permitted difference between the number - of matching pods in the target topology and - the global minimum. The global minimum is the - minimum number of matching pods in an eligible - domain or zero if the number of eligible domains - is less than MinDomains. For example, in a 3-zone - cluster, MaxSkew is set to 1, and pods with - the same labelSelector spread as 2/2/1: In this - case, the global minimum is 1. | zone1 | zone2 - | zone3 | | P P | P P | P | - if MaxSkew - is 1, incoming pod can only be scheduled to - zone3 to become 2/2/2; scheduling it onto zone1(zone2) - would make the ActualSkew(3-1) on zone1(zone2) - violate MaxSkew(1). - if MaxSkew is 2, incoming - pod can be scheduled onto any zone. When `whenUnsatisfiable=ScheduleAnyway`, - it is used to give higher precedence to topologies - that satisfy it. It''s a required field. Default - value is 1 and 0 is not allowed.' + description: |- + MaxSkew describes the degree to which pods may be unevenly distributed. + When `whenUnsatisfiable=DoNotSchedule`, it is the maximum permitted difference + between the number of matching pods in the target topology and the global minimum. 
+ The global minimum is the minimum number of matching pods in an eligible domain + or zero if the number of eligible domains is less than MinDomains. + For example, in a 3-zone cluster, MaxSkew is set to 1, and pods with the same + labelSelector spread as 2/2/1: + In this case, the global minimum is 1. + | zone1 | zone2 | zone3 | + | P P | P P | P | + - if MaxSkew is 1, incoming pod can only be scheduled to zone3 to become 2/2/2; + scheduling it onto zone1(zone2) would make the ActualSkew(3-1) on zone1(zone2) + violate MaxSkew(1). + - if MaxSkew is 2, incoming pod can be scheduled onto any zone. + When `whenUnsatisfiable=ScheduleAnyway`, it is used to give higher precedence + to topologies that satisfy it. + It's a required field. Default value is 1 and 0 is not allowed. format: int32 type: integer minDomains: - description: "MinDomains indicates a minimum number - of eligible domains. When the number of eligible - domains with matching topology keys is less - than minDomains, Pod Topology Spread treats - \"global minimum\" as 0, and then the calculation - of Skew is performed. And when the number of - eligible domains with matching topology keys - equals or greater than minDomains, this value - has no effect on scheduling. As a result, when - the number of eligible domains is less than - minDomains, scheduler won't schedule more than - maxSkew Pods to those domains. If value is nil, - the constraint behaves as if MinDomains is equal - to 1. Valid values are integers greater than - 0. When value is not nil, WhenUnsatisfiable - must be DoNotSchedule. \n For example, in a - 3-zone cluster, MaxSkew is set to 2, MinDomains - is set to 5 and pods with the same labelSelector - spread as 2/2/2: | zone1 | zone2 | zone3 | | - \ P P | P P | P P | The number of domains - is less than 5(MinDomains), so \"global minimum\" - is treated as 0. 
In this situation, new pod - with the same labelSelector cannot be scheduled, - because computed skew will be 3(3 - 0) if new - Pod is scheduled to any of the three zones, - it will violate MaxSkew. \n This is a beta field - and requires the MinDomainsInPodTopologySpread - feature gate to be enabled (enabled by default)." + description: |- + MinDomains indicates a minimum number of eligible domains. + When the number of eligible domains with matching topology keys is less than minDomains, + Pod Topology Spread treats "global minimum" as 0, and then the calculation of Skew is performed. + And when the number of eligible domains with matching topology keys equals or greater than minDomains, + this value has no effect on scheduling. + As a result, when the number of eligible domains is less than minDomains, + scheduler won't schedule more than maxSkew Pods to those domains. + If value is nil, the constraint behaves as if MinDomains is equal to 1. + Valid values are integers greater than 0. + When value is not nil, WhenUnsatisfiable must be DoNotSchedule. + + + For example, in a 3-zone cluster, MaxSkew is set to 2, MinDomains is set to 5 and pods with the same + labelSelector spread as 2/2/2: + | zone1 | zone2 | zone3 | + | P P | P P | P P | + The number of domains is less than 5(MinDomains), so "global minimum" is treated as 0. + In this situation, new pod with the same labelSelector cannot be scheduled, + because computed skew will be 3(3 - 0) if new Pod is scheduled to any of the three zones, + it will violate MaxSkew. format: int32 type: integer nodeAffinityPolicy: - description: "NodeAffinityPolicy indicates how - we will treat Pod's nodeAffinity/nodeSelector - when calculating pod topology spread skew. Options - are: - Honor: only nodes matching nodeAffinity/nodeSelector - are included in the calculations. - Ignore: - nodeAffinity/nodeSelector are ignored. All nodes - are included in the calculations. 
\n If this - value is nil, the behavior is equivalent to - the Honor policy. This is a beta-level feature - default enabled by the NodeInclusionPolicyInPodTopologySpread - feature flag." + description: |- + NodeAffinityPolicy indicates how we will treat Pod's nodeAffinity/nodeSelector + when calculating pod topology spread skew. Options are: + - Honor: only nodes matching nodeAffinity/nodeSelector are included in the calculations. + - Ignore: nodeAffinity/nodeSelector are ignored. All nodes are included in the calculations. + + + If this value is nil, the behavior is equivalent to the Honor policy. + This is a beta-level feature default enabled by the NodeInclusionPolicyInPodTopologySpread feature flag. type: string nodeTaintsPolicy: - description: "NodeTaintsPolicy indicates how we - will treat node taints when calculating pod - topology spread skew. Options are: - Honor: - nodes without taints, along with tainted nodes - for which the incoming pod has a toleration, - are included. - Ignore: node taints are ignored. - All nodes are included. \n If this value is - nil, the behavior is equivalent to the Ignore - policy. This is a beta-level feature default - enabled by the NodeInclusionPolicyInPodTopologySpread - feature flag." + description: |- + NodeTaintsPolicy indicates how we will treat node taints when calculating + pod topology spread skew. Options are: + - Honor: nodes without taints, along with tainted nodes for which the incoming pod + has a toleration, are included. + - Ignore: node taints are ignored. All nodes are included. + + + If this value is nil, the behavior is equivalent to the Ignore policy. + This is a beta-level feature default enabled by the NodeInclusionPolicyInPodTopologySpread feature flag. type: string topologyKey: - description: TopologyKey is the key of node labels. - Nodes that have a label with this key and identical - values are considered to be in the same topology. 
- We consider each as a "bucket", - and try to put balanced number of pods into - each bucket. We define a domain as a particular - instance of a topology. Also, we define an eligible - domain as a domain whose nodes meet the requirements - of nodeAffinityPolicy and nodeTaintsPolicy. - e.g. If TopologyKey is "kubernetes.io/hostname", - each Node is a domain of that topology. And, - if TopologyKey is "topology.kubernetes.io/zone", - each zone is a domain of that topology. It's - a required field. + description: |- + TopologyKey is the key of node labels. Nodes that have a label with this key + and identical values are considered to be in the same topology. + We consider each as a "bucket", and try to put balanced number + of pods into each bucket. + We define a domain as a particular instance of a topology. + Also, we define an eligible domain as a domain whose nodes meet the requirements of + nodeAffinityPolicy and nodeTaintsPolicy. + e.g. If TopologyKey is "kubernetes.io/hostname", each Node is a domain of that topology. + And, if TopologyKey is "topology.kubernetes.io/zone", each zone is a domain of that topology. + It's a required field. type: string whenUnsatisfiable: - description: 'WhenUnsatisfiable indicates how - to deal with a pod if it doesn''t satisfy the - spread constraint. - DoNotSchedule (default) - tells the scheduler not to schedule it. - ScheduleAnyway - tells the scheduler to schedule the pod in any - location, but giving higher precedence to topologies - that would help reduce the skew. A constraint - is considered "Unsatisfiable" for an incoming - pod if and only if every possible node assignment - for that pod would violate "MaxSkew" on some - topology. 
For example, in a 3-zone cluster, - MaxSkew is set to 1, and pods with the same - labelSelector spread as 3/1/1: | zone1 | zone2 - | zone3 | | P P P | P | P | If WhenUnsatisfiable - is set to DoNotSchedule, incoming pod can only - be scheduled to zone2(zone3) to become 3/2/1(3/1/2) - as ActualSkew(2-1) on zone2(zone3) satisfies - MaxSkew(1). In other words, the cluster can - still be imbalanced, but scheduler won''t make - it *more* imbalanced. It''s a required field.' + description: |- + WhenUnsatisfiable indicates how to deal with a pod if it doesn't satisfy + the spread constraint. + - DoNotSchedule (default) tells the scheduler not to schedule it. + - ScheduleAnyway tells the scheduler to schedule the pod in any location, + but giving higher precedence to topologies that would help reduce the + skew. + A constraint is considered "Unsatisfiable" for an incoming pod + if and only if every possible node assignment for that pod would violate + "MaxSkew" on some topology. + For example, in a 3-zone cluster, MaxSkew is set to 1, and pods with the same + labelSelector spread as 3/1/1: + | zone1 | zone2 | zone3 | + | P P P | P | P | + If WhenUnsatisfiable is set to DoNotSchedule, incoming pod can only be scheduled + to zone2(zone3) to become 3/2/1(3/1/2) as ActualSkew(2-1) on zone2(zone3) satisfies + MaxSkew(1). In other words, the cluster can still be imbalanced, but scheduler + won't make it *more* imbalanced. + It's a required field. type: string required: - maxSkew @@ -8045,49 +8119,45 @@ spec: - whenUnsatisfiable x-kubernetes-list-type: map volumes: - description: 'List of volumes that can be mounted by - containers belonging to the pod. More info: https://kubernetes.io/docs/concepts/storage/volumes' + description: |- + List of volumes that can be mounted by containers belonging to the pod. + More info: https://kubernetes.io/docs/concepts/storage/volumes items: description: Volume represents a named volume in a pod that may be accessed by any container in the pod. 
properties: awsElasticBlockStore: - description: 'awsElasticBlockStore represents - an AWS Disk resource that is attached to a kubelet''s - host machine and then exposed to the pod. More - info: https://kubernetes.io/docs/concepts/storage/volumes#awselasticblockstore' + description: |- + awsElasticBlockStore represents an AWS Disk resource that is attached to a + kubelet's host machine and then exposed to the pod. + More info: https://kubernetes.io/docs/concepts/storage/volumes#awselasticblockstore properties: fsType: - description: 'fsType is the filesystem type - of the volume that you want to mount. Tip: - Ensure that the filesystem type is supported - by the host operating system. Examples: - "ext4", "xfs", "ntfs". Implicitly inferred - to be "ext4" if unspecified. More info: - https://kubernetes.io/docs/concepts/storage/volumes#awselasticblockstore - TODO: how do we prevent errors in the filesystem - from compromising the machine' + description: |- + fsType is the filesystem type of the volume that you want to mount. + Tip: Ensure that the filesystem type is supported by the host operating system. + Examples: "ext4", "xfs", "ntfs". Implicitly inferred to be "ext4" if unspecified. + More info: https://kubernetes.io/docs/concepts/storage/volumes#awselasticblockstore + TODO: how do we prevent errors in the filesystem from compromising the machine type: string partition: - description: 'partition is the partition in - the volume that you want to mount. If omitted, - the default is to mount by volume name. - Examples: For volume /dev/sda1, you specify - the partition as "1". Similarly, the volume - partition for /dev/sda is "0" (or you can - leave the property empty).' + description: |- + partition is the partition in the volume that you want to mount. + If omitted, the default is to mount by volume name. + Examples: For volume /dev/sda1, you specify the partition as "1". + Similarly, the volume partition for /dev/sda is "0" (or you can leave the property empty). 
format: int32 type: integer readOnly: - description: 'readOnly value true will force - the readOnly setting in VolumeMounts. More - info: https://kubernetes.io/docs/concepts/storage/volumes#awselasticblockstore' + description: |- + readOnly value true will force the readOnly setting in VolumeMounts. + More info: https://kubernetes.io/docs/concepts/storage/volumes#awselasticblockstore type: boolean volumeID: - description: 'volumeID is unique ID of the - persistent disk resource in AWS (Amazon - EBS volume). More info: https://kubernetes.io/docs/concepts/storage/volumes#awselasticblockstore' + description: |- + volumeID is unique ID of the persistent disk resource in AWS (Amazon EBS volume). + More info: https://kubernetes.io/docs/concepts/storage/volumes#awselasticblockstore type: string required: - volumeID @@ -8110,11 +8180,10 @@ spec: in the blob storage type: string fsType: - description: fsType is Filesystem type to - mount. Must be a filesystem type supported - by the host operating system. Ex. "ext4", - "xfs", "ntfs". Implicitly inferred to be - "ext4" if unspecified. + description: |- + fsType is Filesystem type to mount. + Must be a filesystem type supported by the host operating system. + Ex. "ext4", "xfs", "ntfs". Implicitly inferred to be "ext4" if unspecified. type: string kind: description: 'kind expected values are Shared: @@ -8124,9 +8193,9 @@ spec: availability set). defaults to shared' type: string readOnly: - description: readOnly Defaults to false (read/write). - ReadOnly here will force the ReadOnly setting - in VolumeMounts. + description: |- + readOnly Defaults to false (read/write). ReadOnly here will force + the ReadOnly setting in VolumeMounts. type: boolean required: - diskName @@ -8138,9 +8207,9 @@ spec: the pod. properties: readOnly: - description: readOnly defaults to false (read/write). - ReadOnly here will force the ReadOnly setting - in VolumeMounts. + description: |- + readOnly defaults to false (read/write). 
ReadOnly here will force + the ReadOnly setting in VolumeMounts. type: boolean secretName: description: secretName is the name of secret @@ -8160,83 +8229,95 @@ spec: on the host that shares a pod's lifetime properties: monitors: - description: 'monitors is Required: Monitors - is a collection of Ceph monitors More info: - https://examples.k8s.io/volumes/cephfs/README.md#how-to-use-it' + description: |- + monitors is Required: Monitors is a collection of Ceph monitors + More info: https://examples.k8s.io/volumes/cephfs/README.md#how-to-use-it items: type: string type: array + x-kubernetes-list-type: atomic path: description: 'path is Optional: Used as the mounted root, rather than the full Ceph tree, default is /' type: string readOnly: - description: 'readOnly is Optional: Defaults - to false (read/write). ReadOnly here will - force the ReadOnly setting in VolumeMounts. - More info: https://examples.k8s.io/volumes/cephfs/README.md#how-to-use-it' + description: |- + readOnly is Optional: Defaults to false (read/write). ReadOnly here will force + the ReadOnly setting in VolumeMounts. + More info: https://examples.k8s.io/volumes/cephfs/README.md#how-to-use-it type: boolean secretFile: - description: 'secretFile is Optional: SecretFile - is the path to key ring for User, default - is /etc/ceph/user.secret More info: https://examples.k8s.io/volumes/cephfs/README.md#how-to-use-it' + description: |- + secretFile is Optional: SecretFile is the path to key ring for User, default is /etc/ceph/user.secret + More info: https://examples.k8s.io/volumes/cephfs/README.md#how-to-use-it type: string secretRef: - description: 'secretRef is Optional: SecretRef - is reference to the authentication secret - for User, default is empty. More info: https://examples.k8s.io/volumes/cephfs/README.md#how-to-use-it' + description: |- + secretRef is Optional: SecretRef is reference to the authentication secret for User, default is empty. 
+ More info: https://examples.k8s.io/volumes/cephfs/README.md#how-to-use-it properties: name: - description: 'Name of the referent. More - info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Add other useful fields. apiVersion, - kind, uid?' + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + TODO: Add other useful fields. apiVersion, kind, uid? + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Drop `kubebuilder:default` when controller-gen doesn't need it https://github.com/kubernetes-sigs/kubebuilder/issues/3896. type: string type: object x-kubernetes-map-type: atomic user: - description: 'user is optional: User is the - rados user name, default is admin More info: - https://examples.k8s.io/volumes/cephfs/README.md#how-to-use-it' + description: |- + user is optional: User is the rados user name, default is admin + More info: https://examples.k8s.io/volumes/cephfs/README.md#how-to-use-it type: string required: - monitors type: object cinder: - description: 'cinder represents a cinder volume - attached and mounted on kubelets host machine. - More info: https://examples.k8s.io/mysql-cinder-pd/README.md' + description: |- + cinder represents a cinder volume attached and mounted on kubelets host machine. + More info: https://examples.k8s.io/mysql-cinder-pd/README.md properties: fsType: - description: 'fsType is the filesystem type - to mount. Must be a filesystem type supported - by the host operating system. Examples: - "ext4", "xfs", "ntfs". Implicitly inferred - to be "ext4" if unspecified. More info: - https://examples.k8s.io/mysql-cinder-pd/README.md' + description: |- + fsType is the filesystem type to mount. + Must be a filesystem type supported by the host operating system. 
+ Examples: "ext4", "xfs", "ntfs". Implicitly inferred to be "ext4" if unspecified. + More info: https://examples.k8s.io/mysql-cinder-pd/README.md type: string readOnly: - description: 'readOnly defaults to false (read/write). - ReadOnly here will force the ReadOnly setting - in VolumeMounts. More info: https://examples.k8s.io/mysql-cinder-pd/README.md' + description: |- + readOnly defaults to false (read/write). ReadOnly here will force + the ReadOnly setting in VolumeMounts. + More info: https://examples.k8s.io/mysql-cinder-pd/README.md type: boolean secretRef: - description: 'secretRef is optional: points - to a secret object containing parameters - used to connect to OpenStack.' + description: |- + secretRef is optional: points to a secret object containing parameters used to connect + to OpenStack. properties: name: - description: 'Name of the referent. More - info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Add other useful fields. apiVersion, - kind, uid?' + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + TODO: Add other useful fields. apiVersion, kind, uid? + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Drop `kubebuilder:default` when controller-gen doesn't need it https://github.com/kubernetes-sigs/kubebuilder/issues/3896. type: string type: object x-kubernetes-map-type: atomic volumeID: - description: 'volumeID used to identify the - volume in cinder. More info: https://examples.k8s.io/mysql-cinder-pd/README.md' + description: |- + volumeID used to identify the volume in cinder. 
+ More info: https://examples.k8s.io/mysql-cinder-pd/README.md type: string required: - volumeID @@ -8246,33 +8327,25 @@ spec: that should populate this volume properties: defaultMode: - description: 'defaultMode is optional: mode - bits used to set permissions on created - files by default. Must be an octal value - between 0000 and 0777 or a decimal value - between 0 and 511. YAML accepts both octal - and decimal values, JSON requires decimal - values for mode bits. Defaults to 0644. - Directories within the path are not affected - by this setting. This might be in conflict - with other options that affect the file - mode, like fsGroup, and the result can be - other mode bits set.' + description: |- + defaultMode is optional: mode bits used to set permissions on created files by default. + Must be an octal value between 0000 and 0777 or a decimal value between 0 and 511. + YAML accepts both octal and decimal values, JSON requires decimal values for mode bits. + Defaults to 0644. + Directories within the path are not affected by this setting. + This might be in conflict with other options that affect the file + mode, like fsGroup, and the result can be other mode bits set. format: int32 type: integer items: - description: items if unspecified, each key-value - pair in the Data field of the referenced - ConfigMap will be projected into the volume - as a file whose name is the key and content - is the value. If specified, the listed keys - will be projected into the specified paths, - and unlisted keys will not be present. If - a key is specified which is not present - in the ConfigMap, the volume setup will - error unless it is marked optional. Paths - must be relative and may not contain the - '..' path or start with '..'. + description: |- + items if unspecified, each key-value pair in the Data field of the referenced + ConfigMap will be projected into the volume as a file whose name is the + key and content is the value. 
If specified, the listed keys will be + projected into the specified paths, and unlisted keys will not be + present. If a key is specified which is not present in the ConfigMap, + the volume setup will error unless it is marked optional. Paths must be + relative and may not contain the '..' path or start with '..'. items: description: Maps a string key to a path within a volume. @@ -8281,37 +8354,38 @@ spec: description: key is the key to project. type: string mode: - description: 'mode is Optional: mode - bits used to set permissions on this - file. Must be an octal value between - 0000 and 0777 or a decimal value between - 0 and 511. YAML accepts both octal - and decimal values, JSON requires - decimal values for mode bits. If not - specified, the volume defaultMode - will be used. This might be in conflict - with other options that affect the - file mode, like fsGroup, and the result - can be other mode bits set.' + description: |- + mode is Optional: mode bits used to set permissions on this file. + Must be an octal value between 0000 and 0777 or a decimal value between 0 and 511. + YAML accepts both octal and decimal values, JSON requires decimal values for mode bits. + If not specified, the volume defaultMode will be used. + This might be in conflict with other options that affect the file + mode, like fsGroup, and the result can be other mode bits set. format: int32 type: integer path: - description: path is the relative path - of the file to map the key to. May - not be an absolute path. May not contain - the path element '..'. May not start - with the string '..'. + description: |- + path is the relative path of the file to map the key to. + May not be an absolute path. + May not contain the path element '..'. + May not start with the string '..'. type: string required: - key - path type: object type: array + x-kubernetes-list-type: atomic name: - description: 'Name of the referent. 
More info: - https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Add other useful fields. apiVersion, - kind, uid?' + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + TODO: Add other useful fields. apiVersion, kind, uid? + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Drop `kubebuilder:default` when controller-gen doesn't need it https://github.com/kubernetes-sigs/kubebuilder/issues/3896. type: string optional: description: optional specify whether the @@ -8325,48 +8399,48 @@ spec: by certain external CSI drivers (Beta feature). properties: driver: - description: driver is the name of the CSI - driver that handles this volume. Consult - with your admin for the correct name as - registered in the cluster. + description: |- + driver is the name of the CSI driver that handles this volume. + Consult with your admin for the correct name as registered in the cluster. type: string fsType: - description: fsType to mount. Ex. "ext4", - "xfs", "ntfs". If not provided, the empty - value is passed to the associated CSI driver - which will determine the default filesystem - to apply. + description: |- + fsType to mount. Ex. "ext4", "xfs", "ntfs". + If not provided, the empty value is passed to the associated CSI driver + which will determine the default filesystem to apply. type: string nodePublishSecretRef: - description: nodePublishSecretRef is a reference - to the secret object containing sensitive - information to pass to the CSI driver to - complete the CSI NodePublishVolume and NodeUnpublishVolume - calls. This field is optional, and may - be empty if no secret is required. If the - secret object contains more than one secret, - all secret references are passed. 
+ description: |- + nodePublishSecretRef is a reference to the secret object containing + sensitive information to pass to the CSI driver to complete the CSI + NodePublishVolume and NodeUnpublishVolume calls. + This field is optional, and may be empty if no secret is required. If the + secret object contains more than one secret, all secret references are passed. properties: name: - description: 'Name of the referent. More - info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Add other useful fields. apiVersion, - kind, uid?' + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + TODO: Add other useful fields. apiVersion, kind, uid? + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Drop `kubebuilder:default` when controller-gen doesn't need it https://github.com/kubernetes-sigs/kubebuilder/issues/3896. type: string type: object x-kubernetes-map-type: atomic readOnly: - description: readOnly specifies a read-only - configuration for the volume. Defaults to - false (read/write). + description: |- + readOnly specifies a read-only configuration for the volume. + Defaults to false (read/write). type: boolean volumeAttributes: additionalProperties: type: string - description: volumeAttributes stores driver-specific - properties that are passed to the CSI driver. - Consult your driver's documentation for - supported values. + description: |- + volumeAttributes stores driver-specific properties that are passed to the CSI + driver. Consult your driver's documentation for supported values. type: object required: - driver @@ -8376,19 +8450,15 @@ spec: about the pod that should populate this volume properties: defaultMode: - description: 'Optional: mode bits to use on - created files by default. 
Must be a Optional: - mode bits used to set permissions on created - files by default. Must be an octal value - between 0000 and 0777 or a decimal value - between 0 and 511. YAML accepts both octal - and decimal values, JSON requires decimal - values for mode bits. Defaults to 0644. - Directories within the path are not affected - by this setting. This might be in conflict - with other options that affect the file - mode, like fsGroup, and the result can be - other mode bits set.' + description: |- + Optional: mode bits to use on created files by default. Must be a + Optional: mode bits used to set permissions on created files by default. + Must be an octal value between 0000 and 0777 or a decimal value between 0 and 511. + YAML accepts both octal and decimal values, JSON requires decimal values for mode bits. + Defaults to 0644. + Directories within the path are not affected by this setting. + This might be in conflict with other options that affect the file + mode, like fsGroup, and the result can be other mode bits set. format: int32 type: integer items: @@ -8402,7 +8472,7 @@ spec: fieldRef: description: 'Required: Selects a field of the pod: only annotations, labels, - name and namespace are supported.' + name, namespace and uid are supported.' properties: apiVersion: description: Version of the schema @@ -8418,18 +8488,13 @@ spec: type: object x-kubernetes-map-type: atomic mode: - description: 'Optional: mode bits used - to set permissions on this file, must - be an octal value between 0000 and - 0777 or a decimal value between 0 - and 511. YAML accepts both octal and - decimal values, JSON requires decimal - values for mode bits. If not specified, - the volume defaultMode will be used. - This might be in conflict with other - options that affect the file mode, - like fsGroup, and the result can be - other mode bits set.' 
+ description: |- + Optional: mode bits used to set permissions on this file, must be an octal value + between 0000 and 0777 or a decimal value between 0 and 511. + YAML accepts both octal and decimal values, JSON requires decimal values for mode bits. + If not specified, the volume defaultMode will be used. + This might be in conflict with other options that affect the file + mode, like fsGroup, and the result can be other mode bits set. format: int32 type: integer path: @@ -8441,11 +8506,9 @@ spec: path must not start with ''..''' type: string resourceFieldRef: - description: 'Selects a resource of - the container: only resources limits - and requests (limits.cpu, limits.memory, - requests.cpu and requests.memory) - are currently supported.' + description: |- + Selects a resource of the container: only resources limits and requests + (limits.cpu, limits.memory, requests.cpu and requests.memory) are currently supported. properties: containerName: description: 'Container name: required @@ -8473,93 +8536,97 @@ spec: - path type: object type: array + x-kubernetes-list-type: atomic type: object emptyDir: - description: 'emptyDir represents a temporary - directory that shares a pod''s lifetime. More - info: https://kubernetes.io/docs/concepts/storage/volumes#emptydir' + description: |- + emptyDir represents a temporary directory that shares a pod's lifetime. + More info: https://kubernetes.io/docs/concepts/storage/volumes#emptydir properties: medium: - description: 'medium represents what type - of storage medium should back this directory. - The default is "" which means to use the - node''s default medium. Must be an empty - string (default) or Memory. More info: https://kubernetes.io/docs/concepts/storage/volumes#emptydir' + description: |- + medium represents what type of storage medium should back this directory. + The default is "" which means to use the node's default medium. + Must be an empty string (default) or Memory. 
+ More info: https://kubernetes.io/docs/concepts/storage/volumes#emptydir type: string sizeLimit: anyOf: - type: integer - type: string - description: 'sizeLimit is the total amount - of local storage required for this EmptyDir - volume. The size limit is also applicable - for memory medium. The maximum usage on - memory medium EmptyDir would be the minimum - value between the SizeLimit specified here - and the sum of memory limits of all containers - in a pod. The default is nil which means - that the limit is undefined. More info: - https://kubernetes.io/docs/concepts/storage/volumes#emptydir' + description: |- + sizeLimit is the total amount of local storage required for this EmptyDir volume. + The size limit is also applicable for memory medium. + The maximum usage on memory medium EmptyDir would be the minimum value between + the SizeLimit specified here and the sum of memory limits of all containers in a pod. + The default is nil which means that the limit is undefined. + More info: https://kubernetes.io/docs/concepts/storage/volumes#emptydir pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true type: object ephemeral: - description: "ephemeral represents a volume that - is handled by a cluster storage driver. The - volume's lifecycle is tied to the pod that defines - it - it will be created before the pod starts, - and deleted when the pod is removed. \n Use - this if: a) the volume is only needed while - the pod runs, b) features of normal volumes - like restoring from snapshot or capacity tracking - are needed, c) the storage driver is specified - through a storage class, and d) the storage - driver supports dynamic volume provisioning - through a PersistentVolumeClaim (see EphemeralVolumeSource - for more information on the connection between - this volume type and PersistentVolumeClaim). 
- \n Use PersistentVolumeClaim or one of the vendor-specific - APIs for volumes that persist for longer than - the lifecycle of an individual pod. \n Use CSI - for light-weight local ephemeral volumes if - the CSI driver is meant to be used that way - - see the documentation of the driver for more - information. \n A pod can use both types of - ephemeral volumes and persistent volumes at - the same time." + description: |- + ephemeral represents a volume that is handled by a cluster storage driver. + The volume's lifecycle is tied to the pod that defines it - it will be created before the pod starts, + and deleted when the pod is removed. + + + Use this if: + a) the volume is only needed while the pod runs, + b) features of normal volumes like restoring from snapshot or capacity + tracking are needed, + c) the storage driver is specified through a storage class, and + d) the storage driver supports dynamic volume provisioning through + a PersistentVolumeClaim (see EphemeralVolumeSource for more + information on the connection between this volume type + and PersistentVolumeClaim). + + + Use PersistentVolumeClaim or one of the vendor-specific + APIs for volumes that persist for longer than the lifecycle + of an individual pod. + + + Use CSI for light-weight local ephemeral volumes if the CSI driver is meant to + be used that way - see the documentation of the driver for + more information. + + + A pod can use both types of ephemeral volumes and + persistent volumes at the same time. properties: volumeClaimTemplate: - description: "Will be used to create a stand-alone - PVC to provision the volume. The pod in - which this EphemeralVolumeSource is embedded - will be the owner of the PVC, i.e. the PVC - will be deleted together with the pod. The - name of the PVC will be `-` where `` is the name - from the `PodSpec.Volumes` array entry. - Pod validation will reject the pod if the - concatenated name is not valid for a PVC - (for example, too long). 
\n An existing - PVC with that name that is not owned by - the pod will *not* be used for the pod to - avoid using an unrelated volume by mistake. - Starting the pod is then blocked until the - unrelated PVC is removed. If such a pre-created - PVC is meant to be used by the pod, the - PVC has to updated with an owner reference - to the pod once the pod exists. Normally - this should not be necessary, but it may - be useful when manually reconstructing a - broken cluster. \n This field is read-only - and no changes will be made by Kubernetes - to the PVC after it has been created. \n - Required, must not be nil." + description: |- + Will be used to create a stand-alone PVC to provision the volume. + The pod in which this EphemeralVolumeSource is embedded will be the + owner of the PVC, i.e. the PVC will be deleted together with the + pod. The name of the PVC will be `-` where + `` is the name from the `PodSpec.Volumes` array + entry. Pod validation will reject the pod if the concatenated name + is not valid for a PVC (for example, too long). + + + An existing PVC with that name that is not owned by the pod + will *not* be used for the pod to avoid using an unrelated + volume by mistake. Starting the pod is then blocked until + the unrelated PVC is removed. If such a pre-created PVC is + meant to be used by the pod, the PVC has to updated with an + owner reference to the pod once the pod exists. Normally + this should not be necessary, but it may be useful when + manually reconstructing a broken cluster. + + + This field is read-only and no changes will be made by Kubernetes + to the PVC after it has been created. + + + Required, must not be nil. properties: metadata: - description: May contain labels and annotations - that will be copied into the PVC when - creating it. No other fields are allowed - and will be rejected during validation. + description: |- + May contain labels and annotations that will be copied into the PVC + when creating it. 
No other fields are allowed and will be rejected during + validation. properties: annotations: additionalProperties: @@ -8579,47 +8646,36 @@ spec: type: string type: object spec: - description: The specification for the - PersistentVolumeClaim. The entire content - is copied unchanged into the PVC that - gets created from this template. The - same fields as in a PersistentVolumeClaim + description: |- + The specification for the PersistentVolumeClaim. The entire content is + copied unchanged into the PVC that gets created from this + template. The same fields as in a PersistentVolumeClaim are also valid here. properties: accessModes: - description: 'accessModes contains - the desired access modes the volume - should have. More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#access-modes-1' + description: |- + accessModes contains the desired access modes the volume should have. + More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#access-modes-1 items: type: string type: array + x-kubernetes-list-type: atomic dataSource: - description: 'dataSource field can - be used to specify either: * An - existing VolumeSnapshot object (snapshot.storage.k8s.io/VolumeSnapshot) + description: |- + dataSource field can be used to specify either: + * An existing VolumeSnapshot object (snapshot.storage.k8s.io/VolumeSnapshot) * An existing PVC (PersistentVolumeClaim) - If the provisioner or an external - controller can support the specified - data source, it will create a new - volume based on the contents of - the specified data source. When - the AnyVolumeDataSource feature - gate is enabled, dataSource contents - will be copied to dataSourceRef, - and dataSourceRef contents will - be copied to dataSource when dataSourceRef.namespace - is not specified. If the namespace - is specified, then dataSourceRef - will not be copied to dataSource.' 
+ If the provisioner or an external controller can support the specified data source, + it will create a new volume based on the contents of the specified data source. + When the AnyVolumeDataSource feature gate is enabled, dataSource contents will be copied to dataSourceRef, + and dataSourceRef contents will be copied to dataSource when dataSourceRef.namespace is not specified. + If the namespace is specified, then dataSourceRef will not be copied to dataSource. properties: apiGroup: - description: APIGroup is the group - for the resource being referenced. - If APIGroup is not specified, - the specified Kind must be in - the core API group. For any - other third-party types, APIGroup - is required. + description: |- + APIGroup is the group for the resource being referenced. + If APIGroup is not specified, the specified Kind must be in the core API group. + For any other third-party types, APIGroup is required. type: string kind: description: Kind is the type @@ -8635,57 +8691,36 @@ spec: type: object x-kubernetes-map-type: atomic dataSourceRef: - description: 'dataSourceRef specifies - the object from which to populate - the volume with data, if a non-empty - volume is desired. This may be any - object from a non-empty API group - (non core object) or a PersistentVolumeClaim - object. When this field is specified, - volume binding will only succeed - if the type of the specified object - matches some installed volume populator - or dynamic provisioner. This field - will replace the functionality of - the dataSource field and as such - if both fields are non-empty, they - must have the same value. For backwards - compatibility, when namespace isn''t - specified in dataSourceRef, both - fields (dataSource and dataSourceRef) - will be set to the same value automatically - if one of them is empty and the - other is non-empty. When namespace - is specified in dataSourceRef, dataSource - isn''t set to the same value and - must be empty. 
There are three important - differences between dataSource and - dataSourceRef: * While dataSource - only allows two specific types of - objects, dataSourceRef allows any - non-core object, as well as PersistentVolumeClaim - objects. * While dataSource ignores - disallowed values (dropping them), - dataSourceRef preserves all values, - and generates an error if a disallowed - value is specified. * While dataSource - only allows local objects, dataSourceRef - allows objects in any namespaces. - (Beta) Using this field requires - the AnyVolumeDataSource feature - gate to be enabled. (Alpha) Using - the namespace field of dataSourceRef - requires the CrossNamespaceVolumeDataSource - feature gate to be enabled.' + description: |- + dataSourceRef specifies the object from which to populate the volume with data, if a non-empty + volume is desired. This may be any object from a non-empty API group (non + core object) or a PersistentVolumeClaim object. + When this field is specified, volume binding will only succeed if the type of + the specified object matches some installed volume populator or dynamic + provisioner. + This field will replace the functionality of the dataSource field and as such + if both fields are non-empty, they must have the same value. For backwards + compatibility, when namespace isn't specified in dataSourceRef, + both fields (dataSource and dataSourceRef) will be set to the same + value automatically if one of them is empty and the other is non-empty. + When namespace is specified in dataSourceRef, + dataSource isn't set to the same value and must be empty. + There are three important differences between dataSource and dataSourceRef: + * While dataSource only allows two specific types of objects, dataSourceRef + allows any non-core object, as well as PersistentVolumeClaim objects. + * While dataSource ignores disallowed values (dropping them), dataSourceRef + preserves all values, and generates an error if a disallowed value is + specified. 
+ * While dataSource only allows local objects, dataSourceRef allows objects + in any namespaces. + (Beta) Using this field requires the AnyVolumeDataSource feature gate to be enabled. + (Alpha) Using the namespace field of dataSourceRef requires the CrossNamespaceVolumeDataSource feature gate to be enabled. properties: apiGroup: - description: APIGroup is the group - for the resource being referenced. - If APIGroup is not specified, - the specified Kind must be in - the core API group. For any - other third-party types, APIGroup - is required. + description: |- + APIGroup is the group for the resource being referenced. + If APIGroup is not specified, the specified Kind must be in the core API group. + For any other third-party types, APIGroup is required. type: string kind: description: Kind is the type @@ -8696,32 +8731,22 @@ spec: of resource being referenced type: string namespace: - description: Namespace is the - namespace of resource being - referenced Note that when a - namespace is specified, a gateway.networking.k8s.io/ReferenceGrant - object is required in the referent - namespace to allow that namespace's - owner to accept the reference. - See the ReferenceGrant documentation - for details. (Alpha) This field - requires the CrossNamespaceVolumeDataSource - feature gate to be enabled. + description: |- + Namespace is the namespace of resource being referenced + Note that when a namespace is specified, a gateway.networking.k8s.io/ReferenceGrant object is required in the referent namespace to allow that namespace's owner to accept the reference. See the ReferenceGrant documentation for details. + (Alpha) This field requires the CrossNamespaceVolumeDataSource feature gate to be enabled. type: string required: - kind - name type: object resources: - description: 'resources represents - the minimum resources the volume - should have. 
If RecoverVolumeExpansionFailure - feature is enabled users are allowed - to specify resource requirements - that are lower than previous value - but must still be higher than capacity - recorded in the status field of - the claim. More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#resources' + description: |- + resources represents the minimum resources the volume should have. + If RecoverVolumeExpansionFailure feature is enabled users are allowed to specify resource requirements + that are lower than previous value but must still be higher than capacity recorded in the + status field of the claim. + More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#resources properties: limits: additionalProperties: @@ -8730,10 +8755,9 @@ spec: - type: string pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true - description: 'Limits describes - the maximum amount of compute - resources allowed. More info: - https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/' + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ type: object requests: additionalProperties: @@ -8742,15 +8766,11 @@ spec: - type: string pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true - description: 'Requests describes - the minimum amount of compute - resources required. If Requests - is omitted for a container, - it defaults to Limits if that - is explicitly specified, otherwise - to an implementation-defined - value. Requests cannot exceed - Limits. 
More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/' + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ type: object type: object selector: @@ -8763,11 +8783,9 @@ spec: requirements. The requirements are ANDed. items: - description: A label selector - requirement is a selector - that contains values, a key, - and an operator that relates - the key and values. + description: |- + A label selector requirement is a selector that contains values, a key, and an operator that + relates the key and values. properties: key: description: key is the @@ -8775,56 +8793,60 @@ spec: applies to. type: string operator: - description: operator represents - a key's relationship to - a set of values. Valid - operators are In, NotIn, - Exists and DoesNotExist. + description: |- + operator represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists and DoesNotExist. type: string values: - description: values is an - array of string values. - If the operator is In - or NotIn, the values array - must be non-empty. If - the operator is Exists - or DoesNotExist, the values - array must be empty. This - array is replaced during - a strategic merge patch. + description: |- + values is an array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. This array is replaced during a strategic + merge patch. 
items: type: string type: array + x-kubernetes-list-type: atomic required: - key - operator type: object type: array + x-kubernetes-list-type: atomic matchLabels: additionalProperties: type: string - description: matchLabels is a - map of {key,value} pairs. A - single {key,value} in the matchLabels - map is equivalent to an element - of matchExpressions, whose key - field is "key", the operator - is "In", and the values array - contains only "value". The requirements - are ANDed. + description: |- + matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels + map is equivalent to an element of matchExpressions, whose key field is "key", the + operator is "In", and the values array contains only "value". The requirements are ANDed. type: object type: object x-kubernetes-map-type: atomic storageClassName: - description: 'storageClassName is - the name of the StorageClass required - by the claim. More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#class-1' + description: |- + storageClassName is the name of the StorageClass required by the claim. + More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#class-1 + type: string + volumeAttributesClassName: + description: |- + volumeAttributesClassName may be used to set the VolumeAttributesClass used by this claim. + If specified, the CSI driver will create or update the volume with the attributes defined + in the corresponding VolumeAttributesClass. This has a different purpose than storageClassName, + it can be changed after the claim is created. An empty string value means that no VolumeAttributesClass + will be applied to the claim but it's not allowed to reset this field to empty string once it is set. + If unspecified and the PersistentVolumeClaim is unbound, the default VolumeAttributesClass + will be set by the persistentvolume controller if it exists. 
+ If the resource referred to by volumeAttributesClass does not exist, this PersistentVolumeClaim will be + set to a Pending state, as reflected by the modifyVolumeStatus field, until such as a resource + exists. + More info: https://kubernetes.io/docs/concepts/storage/volume-attributes-classes/ + (Alpha) Using this field requires the VolumeAttributesClass feature gate to be enabled. type: string volumeMode: - description: volumeMode defines what - type of volume is required by the - claim. Value of Filesystem is implied - when not included in claim spec. + description: |- + volumeMode defines what type of volume is required by the claim. + Value of Filesystem is implied when not included in claim spec. type: string volumeName: description: volumeName is the binding @@ -8842,13 +8864,11 @@ spec: and then exposed to the pod. properties: fsType: - description: 'fsType is the filesystem type - to mount. Must be a filesystem type supported - by the host operating system. Ex. "ext4", - "xfs", "ntfs". Implicitly inferred to be - "ext4" if unspecified. TODO: how do we prevent - errors in the filesystem from compromising - the machine' + description: |- + fsType is the filesystem type to mount. + Must be a filesystem type supported by the host operating system. + Ex. "ext4", "xfs", "ntfs". Implicitly inferred to be "ext4" if unspecified. + TODO: how do we prevent errors in the filesystem from compromising the machine type: string lun: description: 'lun is Optional: FC target lun @@ -8856,9 +8876,9 @@ spec: format: int32 type: integer readOnly: - description: 'readOnly is Optional: Defaults - to false (read/write). ReadOnly here will - force the ReadOnly setting in VolumeMounts.' + description: |- + readOnly is Optional: Defaults to false (read/write). ReadOnly here will force + the ReadOnly setting in VolumeMounts. 
type: boolean targetWWNs: description: 'targetWWNs is Optional: FC target @@ -8866,30 +8886,30 @@ spec: items: type: string type: array + x-kubernetes-list-type: atomic wwids: - description: 'wwids Optional: FC volume world - wide identifiers (wwids) Either wwids or - combination of targetWWNs and lun must be - set, but not both simultaneously.' + description: |- + wwids Optional: FC volume world wide identifiers (wwids) + Either wwids or combination of targetWWNs and lun must be set, but not both simultaneously. items: type: string type: array + x-kubernetes-list-type: atomic type: object flexVolume: - description: flexVolume represents a generic volume - resource that is provisioned/attached using - an exec based plugin. + description: |- + flexVolume represents a generic volume resource that is + provisioned/attached using an exec based plugin. properties: driver: description: driver is the name of the driver to use for this volume. type: string fsType: - description: fsType is the filesystem type - to mount. Must be a filesystem type supported - by the host operating system. Ex. "ext4", - "xfs", "ntfs". The default filesystem depends - on FlexVolume script. + description: |- + fsType is the filesystem type to mount. + Must be a filesystem type supported by the host operating system. + Ex. "ext4", "xfs", "ntfs". The default filesystem depends on FlexVolume script. type: string options: additionalProperties: @@ -8898,24 +8918,28 @@ spec: holds extra command options if any.' type: object readOnly: - description: 'readOnly is Optional: defaults - to false (read/write). ReadOnly here will - force the ReadOnly setting in VolumeMounts.' + description: |- + readOnly is Optional: defaults to false (read/write). ReadOnly here will force + the ReadOnly setting in VolumeMounts. type: boolean secretRef: - description: 'secretRef is Optional: secretRef - is reference to the secret object containing - sensitive information to pass to the plugin - scripts. 
This may be empty if no secret - object is specified. If the secret object - contains more than one secret, all secrets - are passed to the plugin scripts.' + description: |- + secretRef is Optional: secretRef is reference to the secret object containing + sensitive information to pass to the plugin scripts. This may be + empty if no secret object is specified. If the secret object + contains more than one secret, all secrets are passed to the plugin + scripts. properties: name: - description: 'Name of the referent. More - info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Add other useful fields. apiVersion, - kind, uid?' + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + TODO: Add other useful fields. apiVersion, kind, uid? + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Drop `kubebuilder:default` when controller-gen doesn't need it https://github.com/kubernetes-sigs/kubebuilder/issues/3896. type: string type: object x-kubernetes-map-type: atomic @@ -8928,9 +8952,9 @@ spec: on the Flocker control service being running properties: datasetName: - description: datasetName is Name of the dataset - stored as metadata -> name on the dataset - for Flocker should be considered as deprecated + description: |- + datasetName is Name of the dataset stored as metadata -> name on the dataset for Flocker + should be considered as deprecated type: string datasetUUID: description: datasetUUID is the UUID of the @@ -8939,60 +8963,55 @@ spec: type: string type: object gcePersistentDisk: - description: 'gcePersistentDisk represents a GCE - Disk resource that is attached to a kubelet''s - host machine and then exposed to the pod. 
More - info: https://kubernetes.io/docs/concepts/storage/volumes#gcepersistentdisk' + description: |- + gcePersistentDisk represents a GCE Disk resource that is attached to a + kubelet's host machine and then exposed to the pod. + More info: https://kubernetes.io/docs/concepts/storage/volumes#gcepersistentdisk properties: fsType: - description: 'fsType is filesystem type of - the volume that you want to mount. Tip: - Ensure that the filesystem type is supported - by the host operating system. Examples: - "ext4", "xfs", "ntfs". Implicitly inferred - to be "ext4" if unspecified. More info: - https://kubernetes.io/docs/concepts/storage/volumes#gcepersistentdisk - TODO: how do we prevent errors in the filesystem - from compromising the machine' + description: |- + fsType is filesystem type of the volume that you want to mount. + Tip: Ensure that the filesystem type is supported by the host operating system. + Examples: "ext4", "xfs", "ntfs". Implicitly inferred to be "ext4" if unspecified. + More info: https://kubernetes.io/docs/concepts/storage/volumes#gcepersistentdisk + TODO: how do we prevent errors in the filesystem from compromising the machine type: string partition: - description: 'partition is the partition in - the volume that you want to mount. If omitted, - the default is to mount by volume name. - Examples: For volume /dev/sda1, you specify - the partition as "1". Similarly, the volume - partition for /dev/sda is "0" (or you can - leave the property empty). More info: https://kubernetes.io/docs/concepts/storage/volumes#gcepersistentdisk' + description: |- + partition is the partition in the volume that you want to mount. + If omitted, the default is to mount by volume name. + Examples: For volume /dev/sda1, you specify the partition as "1". + Similarly, the volume partition for /dev/sda is "0" (or you can leave the property empty). 
+ More info: https://kubernetes.io/docs/concepts/storage/volumes#gcepersistentdisk format: int32 type: integer pdName: - description: 'pdName is unique name of the - PD resource in GCE. Used to identify the - disk in GCE. More info: https://kubernetes.io/docs/concepts/storage/volumes#gcepersistentdisk' + description: |- + pdName is unique name of the PD resource in GCE. Used to identify the disk in GCE. + More info: https://kubernetes.io/docs/concepts/storage/volumes#gcepersistentdisk type: string readOnly: - description: 'readOnly here will force the - ReadOnly setting in VolumeMounts. Defaults - to false. More info: https://kubernetes.io/docs/concepts/storage/volumes#gcepersistentdisk' + description: |- + readOnly here will force the ReadOnly setting in VolumeMounts. + Defaults to false. + More info: https://kubernetes.io/docs/concepts/storage/volumes#gcepersistentdisk type: boolean required: - pdName type: object gitRepo: - description: 'gitRepo represents a git repository - at a particular revision. DEPRECATED: GitRepo - is deprecated. To provision a container with - a git repo, mount an EmptyDir into an InitContainer - that clones the repo using git, then mount the - EmptyDir into the Pod''s container.' + description: |- + gitRepo represents a git repository at a particular revision. + DEPRECATED: GitRepo is deprecated. To provision a container with a git repo, mount an + EmptyDir into an InitContainer that clones the repo using git, then mount the EmptyDir + into the Pod's container. properties: directory: - description: directory is the target directory - name. Must not contain or start with '..'. If - '.' is supplied, the volume directory will - be the git repository. Otherwise, if specified, - the volume will contain the git repository - in the subdirectory with the given name. + description: |- + directory is the target directory name. + Must not contain or start with '..'. If '.' is supplied, the volume directory will be the + git repository. 
Otherwise, if specified, the volume will contain the git repository in + the subdirectory with the given name. type: string repository: description: repository is the URL @@ -9005,57 +9024,61 @@ spec: - repository type: object glusterfs: - description: 'glusterfs represents a Glusterfs - mount on the host that shares a pod''s lifetime. - More info: https://examples.k8s.io/volumes/glusterfs/README.md' + description: |- + glusterfs represents a Glusterfs mount on the host that shares a pod's lifetime. + More info: https://examples.k8s.io/volumes/glusterfs/README.md properties: endpoints: - description: 'endpoints is the endpoint name - that details Glusterfs topology. More info: - https://examples.k8s.io/volumes/glusterfs/README.md#create-a-pod' + description: |- + endpoints is the endpoint name that details Glusterfs topology. + More info: https://examples.k8s.io/volumes/glusterfs/README.md#create-a-pod type: string path: - description: 'path is the Glusterfs volume - path. More info: https://examples.k8s.io/volumes/glusterfs/README.md#create-a-pod' + description: |- + path is the Glusterfs volume path. + More info: https://examples.k8s.io/volumes/glusterfs/README.md#create-a-pod type: string readOnly: - description: 'readOnly here will force the - Glusterfs volume to be mounted with read-only - permissions. Defaults to false. More info: - https://examples.k8s.io/volumes/glusterfs/README.md#create-a-pod' + description: |- + readOnly here will force the Glusterfs volume to be mounted with read-only permissions. + Defaults to false. + More info: https://examples.k8s.io/volumes/glusterfs/README.md#create-a-pod type: boolean required: - endpoints - path type: object hostPath: - description: 'hostPath represents a pre-existing - file or directory on the host machine that is - directly exposed to the container. This is generally - used for system agents or other privileged things - that are allowed to see the host machine. Most - containers will NOT need this. 
More info: https://kubernetes.io/docs/concepts/storage/volumes#hostpath - --- TODO(jonesdl) We need to restrict who can - use host directory mounts and who can/can not - mount host directories as read/write.' + description: |- + hostPath represents a pre-existing file or directory on the host + machine that is directly exposed to the container. This is generally + used for system agents or other privileged things that are allowed + to see the host machine. Most containers will NOT need this. + More info: https://kubernetes.io/docs/concepts/storage/volumes#hostpath + --- + TODO(jonesdl) We need to restrict who can use host directory mounts and who can/can not + mount host directories as read/write. properties: path: - description: 'path of the directory on the - host. If the path is a symlink, it will - follow the link to the real path. More info: - https://kubernetes.io/docs/concepts/storage/volumes#hostpath' + description: |- + path of the directory on the host. + If the path is a symlink, it will follow the link to the real path. + More info: https://kubernetes.io/docs/concepts/storage/volumes#hostpath type: string type: - description: 'type for HostPath Volume Defaults - to "" More info: https://kubernetes.io/docs/concepts/storage/volumes#hostpath' + description: |- + type for HostPath Volume + Defaults to "" + More info: https://kubernetes.io/docs/concepts/storage/volumes#hostpath type: string required: - path type: object iscsi: - description: 'iscsi represents an ISCSI Disk resource - that is attached to a kubelet''s host machine - and then exposed to the pod. More info: https://examples.k8s.io/volumes/iscsi/README.md' + description: |- + iscsi represents an ISCSI Disk resource that is attached to a + kubelet's host machine and then exposed to the pod. 
+ More info: https://examples.k8s.io/volumes/iscsi/README.md properties: chapAuthDiscovery: description: chapAuthDiscovery defines whether @@ -9066,31 +9089,27 @@ spec: support iSCSI Session CHAP authentication type: boolean fsType: - description: 'fsType is the filesystem type - of the volume that you want to mount. Tip: - Ensure that the filesystem type is supported - by the host operating system. Examples: - "ext4", "xfs", "ntfs". Implicitly inferred - to be "ext4" if unspecified. More info: - https://kubernetes.io/docs/concepts/storage/volumes#iscsi - TODO: how do we prevent errors in the filesystem - from compromising the machine' + description: |- + fsType is the filesystem type of the volume that you want to mount. + Tip: Ensure that the filesystem type is supported by the host operating system. + Examples: "ext4", "xfs", "ntfs". Implicitly inferred to be "ext4" if unspecified. + More info: https://kubernetes.io/docs/concepts/storage/volumes#iscsi + TODO: how do we prevent errors in the filesystem from compromising the machine type: string initiatorName: - description: initiatorName is the custom iSCSI - Initiator Name. If initiatorName is specified - with iscsiInterface simultaneously, new - iSCSI interface : will be created for the connection. + description: |- + initiatorName is the custom iSCSI Initiator Name. + If initiatorName is specified with iscsiInterface simultaneously, new iSCSI interface + : will be created for the connection. type: string iqn: description: iqn is the target iSCSI Qualified Name. type: string iscsiInterface: - description: iscsiInterface is the interface - Name that uses an iSCSI transport. Defaults - to 'default' (tcp). + description: |- + iscsiInterface is the interface Name that uses an iSCSI transport. + Defaults to 'default' (tcp). type: string lun: description: lun represents iSCSI Target Lun @@ -9098,35 +9117,39 @@ spec: format: int32 type: integer portals: - description: portals is the iSCSI Target Portal - List. 
The portal is either an IP or ip_addr:port - if the port is other than default (typically - TCP ports 860 and 3260). + description: |- + portals is the iSCSI Target Portal List. The portal is either an IP or ip_addr:port if the port + is other than default (typically TCP ports 860 and 3260). items: type: string type: array + x-kubernetes-list-type: atomic readOnly: - description: readOnly here will force the - ReadOnly setting in VolumeMounts. Defaults - to false. + description: |- + readOnly here will force the ReadOnly setting in VolumeMounts. + Defaults to false. type: boolean secretRef: description: secretRef is the CHAP Secret for iSCSI target and initiator authentication properties: name: - description: 'Name of the referent. More - info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Add other useful fields. apiVersion, - kind, uid?' + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + TODO: Add other useful fields. apiVersion, kind, uid? + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Drop `kubebuilder:default` when controller-gen doesn't need it https://github.com/kubernetes-sigs/kubebuilder/issues/3896. type: string type: object x-kubernetes-map-type: atomic targetPortal: - description: targetPortal is iSCSI Target - Portal. The Portal is either an IP or ip_addr:port - if the port is other than default (typically - TCP ports 860 and 3260). + description: |- + targetPortal is iSCSI Target Portal. The Portal is either an IP or ip_addr:port if the port + is other than default (typically TCP ports 860 and 3260). type: string required: - iqn @@ -9134,45 +9157,51 @@ spec: - targetPortal type: object name: - description: 'name of the volume. Must be a DNS_LABEL - and unique within the pod. 
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' + description: |- + name of the volume. + Must be a DNS_LABEL and unique within the pod. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names type: string nfs: - description: 'nfs represents an NFS mount on the - host that shares a pod''s lifetime More info: - https://kubernetes.io/docs/concepts/storage/volumes#nfs' + description: |- + nfs represents an NFS mount on the host that shares a pod's lifetime + More info: https://kubernetes.io/docs/concepts/storage/volumes#nfs properties: path: - description: 'path that is exported by the - NFS server. More info: https://kubernetes.io/docs/concepts/storage/volumes#nfs' + description: |- + path that is exported by the NFS server. + More info: https://kubernetes.io/docs/concepts/storage/volumes#nfs type: string readOnly: - description: 'readOnly here will force the - NFS export to be mounted with read-only - permissions. Defaults to false. More info: - https://kubernetes.io/docs/concepts/storage/volumes#nfs' + description: |- + readOnly here will force the NFS export to be mounted with read-only permissions. + Defaults to false. + More info: https://kubernetes.io/docs/concepts/storage/volumes#nfs type: boolean server: - description: 'server is the hostname or IP - address of the NFS server. More info: https://kubernetes.io/docs/concepts/storage/volumes#nfs' + description: |- + server is the hostname or IP address of the NFS server. + More info: https://kubernetes.io/docs/concepts/storage/volumes#nfs type: string required: - path - server type: object persistentVolumeClaim: - description: 'persistentVolumeClaimVolumeSource - represents a reference to a PersistentVolumeClaim - in the same namespace. 
More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#persistentvolumeclaims' + description: |- + persistentVolumeClaimVolumeSource represents a reference to a + PersistentVolumeClaim in the same namespace. + More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#persistentvolumeclaims properties: claimName: - description: 'claimName is the name of a PersistentVolumeClaim - in the same namespace as the pod using this - volume. More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#persistentvolumeclaims' + description: |- + claimName is the name of a PersistentVolumeClaim in the same namespace as the pod using this volume. + More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#persistentvolumeclaims type: string readOnly: - description: readOnly Will force the ReadOnly - setting in VolumeMounts. Default false. + description: |- + readOnly Will force the ReadOnly setting in VolumeMounts. + Default false. type: boolean required: - claimName @@ -9183,11 +9212,10 @@ spec: mounted on kubelets host machine properties: fsType: - description: fsType is the filesystem type - to mount. Must be a filesystem type supported - by the host operating system. Ex. "ext4", - "xfs", "ntfs". Implicitly inferred to be - "ext4" if unspecified. + description: |- + fsType is the filesystem type to mount. + Must be a filesystem type supported by the host operating system. + Ex. "ext4", "xfs", "ntfs". Implicitly inferred to be "ext4" if unspecified. type: string pdID: description: pdID is the ID that identifies @@ -9202,16 +9230,15 @@ spec: machine properties: fsType: - description: fSType represents the filesystem - type to mount Must be a filesystem type - supported by the host operating system. - Ex. "ext4", "xfs". Implicitly inferred to - be "ext4" if unspecified. + description: |- + fSType represents the filesystem type to mount + Must be a filesystem type supported by the host operating system. + Ex. 
"ext4", "xfs". Implicitly inferred to be "ext4" if unspecified. type: string readOnly: - description: readOnly defaults to false (read/write). - ReadOnly here will force the ReadOnly setting - in VolumeMounts. + description: |- + readOnly defaults to false (read/write). ReadOnly here will force + the ReadOnly setting in VolumeMounts. type: boolean volumeID: description: volumeID uniquely identifies @@ -9225,17 +9252,13 @@ spec: secrets, configmaps, and downward API properties: defaultMode: - description: defaultMode are the mode bits - used to set permissions on created files - by default. Must be an octal value between - 0000 and 0777 or a decimal value between - 0 and 511. YAML accepts both octal and decimal - values, JSON requires decimal values for - mode bits. Directories within the path are - not affected by this setting. This might - be in conflict with other options that affect - the file mode, like fsGroup, and the result - can be other mode bits set. + description: |- + defaultMode are the mode bits used to set permissions on created files by default. + Must be an octal value between 0000 and 0777 or a decimal value between 0 and 511. + YAML accepts both octal and decimal values, JSON requires decimal values for mode bits. + Directories within the path are not affected by this setting. + This might be in conflict with other options that affect the file + mode, like fsGroup, and the result can be other mode bits set. format: int32 type: integer sources: @@ -9300,11 +9323,13 @@ spec: items: type: string type: array + x-kubernetes-list-type: atomic required: - key - operator type: object type: array + x-kubernetes-list-type: atomic matchLabels: additionalProperties: type: string @@ -9346,21 +9371,14 @@ spec: the configMap data to project properties: items: - description: items if unspecified, - each key-value pair in the Data - field of the referenced ConfigMap - will be projected into the volume - as a file whose name is the key - and content is the value. 
If specified, - the listed keys will be projected - into the specified paths, and - unlisted keys will not be present. - If a key is specified which is - not present in the ConfigMap, - the volume setup will error unless - it is marked optional. Paths must - be relative and may not contain - the '..' path or start with '..'. + description: |- + items if unspecified, each key-value pair in the Data field of the referenced + ConfigMap will be projected into the volume as a file whose name is the + key and content is the value. If specified, the listed keys will be + projected into the specified paths, and unlisted keys will not be + present. If a key is specified which is not present in the ConfigMap, + the volume setup will error unless it is marked optional. Paths must be + relative and may not contain the '..' path or start with '..'. items: description: Maps a string key to a path within a volume. @@ -9370,43 +9388,38 @@ spec: to project. type: string mode: - description: 'mode is Optional: - mode bits used to set permissions - on this file. Must be an - octal value between 0000 - and 0777 or a decimal value - between 0 and 511. YAML - accepts both octal and decimal - values, JSON requires decimal - values for mode bits. If - not specified, the volume - defaultMode will be used. - This might be in conflict - with other options that - affect the file mode, like - fsGroup, and the result - can be other mode bits set.' + description: |- + mode is Optional: mode bits used to set permissions on this file. + Must be an octal value between 0000 and 0777 or a decimal value between 0 and 511. + YAML accepts both octal and decimal values, JSON requires decimal values for mode bits. + If not specified, the volume defaultMode will be used. + This might be in conflict with other options that affect the file + mode, like fsGroup, and the result can be other mode bits set. 
format: int32 type: integer path: - description: path is the relative - path of the file to map - the key to. May not be an - absolute path. May not contain - the path element '..'. May - not start with the string - '..'. + description: |- + path is the relative path of the file to map the key to. + May not be an absolute path. + May not contain the path element '..'. + May not start with the string '..'. type: string required: - key - path type: object type: array + x-kubernetes-list-type: atomic name: - description: 'Name of the referent. + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + TODO: Add other useful fields. apiVersion, kind, uid? More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Add other useful fields. - apiVersion, kind, uid?' + TODO: Drop `kubebuilder:default` when controller-gen doesn't need it https://github.com/kubernetes-sigs/kubebuilder/issues/3896. type: string optional: description: optional specify whether @@ -9431,8 +9444,8 @@ spec: fieldRef: description: 'Required: Selects a field of the pod: only - annotations, labels, name - and namespace are supported.' + annotations, labels, name, + namespace and uid are supported.' properties: apiVersion: description: Version of @@ -9450,22 +9463,13 @@ spec: type: object x-kubernetes-map-type: atomic mode: - description: 'Optional: mode - bits used to set permissions - on this file, must be an - octal value between 0000 - and 0777 or a decimal value - between 0 and 511. YAML - accepts both octal and decimal - values, JSON requires decimal - values for mode bits. If - not specified, the volume - defaultMode will be used. - This might be in conflict - with other options that - affect the file mode, like - fsGroup, and the result - can be other mode bits set.' 
+ description: |- + Optional: mode bits used to set permissions on this file, must be an octal value + between 0000 and 0777 or a decimal value between 0 and 511. + YAML accepts both octal and decimal values, JSON requires decimal values for mode bits. + If not specified, the volume defaultMode will be used. + This might be in conflict with other options that affect the file + mode, like fsGroup, and the result can be other mode bits set. format: int32 type: integer path: @@ -9480,12 +9484,9 @@ spec: ''..''' type: string resourceFieldRef: - description: 'Selects a resource - of the container: only resources - limits and requests (limits.cpu, - limits.memory, requests.cpu - and requests.memory) are - currently supported.' + description: |- + Selects a resource of the container: only resources limits and requests + (limits.cpu, limits.memory, requests.cpu and requests.memory) are currently supported. properties: containerName: description: 'Container @@ -9514,27 +9515,21 @@ spec: - path type: object type: array + x-kubernetes-list-type: atomic type: object secret: description: secret information about the secret data to project properties: items: - description: items if unspecified, - each key-value pair in the Data - field of the referenced Secret - will be projected into the volume - as a file whose name is the key - and content is the value. If specified, - the listed keys will be projected - into the specified paths, and - unlisted keys will not be present. - If a key is specified which is - not present in the Secret, the - volume setup will error unless - it is marked optional. Paths must - be relative and may not contain - the '..' path or start with '..'. + description: |- + items if unspecified, each key-value pair in the Data field of the referenced + Secret will be projected into the volume as a file whose name is the + key and content is the value. 
If specified, the listed keys will be + projected into the specified paths, and unlisted keys will not be + present. If a key is specified which is not present in the Secret, + the volume setup will error unless it is marked optional. Paths must be + relative and may not contain the '..' path or start with '..'. items: description: Maps a string key to a path within a volume. @@ -9544,43 +9539,38 @@ spec: to project. type: string mode: - description: 'mode is Optional: - mode bits used to set permissions - on this file. Must be an - octal value between 0000 - and 0777 or a decimal value - between 0 and 511. YAML - accepts both octal and decimal - values, JSON requires decimal - values for mode bits. If - not specified, the volume - defaultMode will be used. - This might be in conflict - with other options that - affect the file mode, like - fsGroup, and the result - can be other mode bits set.' + description: |- + mode is Optional: mode bits used to set permissions on this file. + Must be an octal value between 0000 and 0777 or a decimal value between 0 and 511. + YAML accepts both octal and decimal values, JSON requires decimal values for mode bits. + If not specified, the volume defaultMode will be used. + This might be in conflict with other options that affect the file + mode, like fsGroup, and the result can be other mode bits set. format: int32 type: integer path: - description: path is the relative - path of the file to map - the key to. May not be an - absolute path. May not contain - the path element '..'. May - not start with the string - '..'. + description: |- + path is the relative path of the file to map the key to. + May not be an absolute path. + May not contain the path element '..'. + May not start with the string '..'. type: string required: - key - path type: object type: array + x-kubernetes-list-type: atomic name: - description: 'Name of the referent. + default: "" + description: |- + Name of the referent. 
+ This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + TODO: Add other useful fields. apiVersion, kind, uid? More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Add other useful fields. - apiVersion, kind, uid?' + TODO: Drop `kubebuilder:default` when controller-gen doesn't need it https://github.com/kubernetes-sigs/kubebuilder/issues/3896. type: string optional: description: optional field specify @@ -9595,70 +9585,62 @@ spec: data to project properties: audience: - description: audience is the intended - audience of the token. A recipient - of a token must identify itself - with an identifier specified in - the audience of the token, and - otherwise should reject the token. - The audience defaults to the identifier - of the apiserver. + description: |- + audience is the intended audience of the token. A recipient of a token + must identify itself with an identifier specified in the audience of the + token, and otherwise should reject the token. The audience defaults to the + identifier of the apiserver. type: string expirationSeconds: - description: expirationSeconds is - the requested duration of validity - of the service account token. - As the token approaches expiration, - the kubelet volume plugin will - proactively rotate the service - account token. The kubelet will - start trying to rotate the token - if the token is older than 80 - percent of its time to live or - if the token is older than 24 - hours.Defaults to 1 hour and must - be at least 10 minutes. + description: |- + expirationSeconds is the requested duration of validity of the service + account token. As the token approaches expiration, the kubelet volume + plugin will proactively rotate the service account token. 
The kubelet will + start trying to rotate the token if the token is older than 80 percent of + its time to live or if the token is older than 24 hours.Defaults to 1 hour + and must be at least 10 minutes. format: int64 type: integer path: - description: path is the path relative - to the mount point of the file - to project the token into. + description: |- + path is the path relative to the mount point of the file to project the + token into. type: string required: - path type: object type: object type: array + x-kubernetes-list-type: atomic type: object quobyte: description: quobyte represents a Quobyte mount on the host that shares a pod's lifetime properties: group: - description: group to map volume access to + description: |- + group to map volume access to Default is no group type: string readOnly: - description: readOnly here will force the - Quobyte volume to be mounted with read-only - permissions. Defaults to false. + description: |- + readOnly here will force the Quobyte volume to be mounted with read-only permissions. + Defaults to false. 
type: boolean registry: - description: registry represents a single - or multiple Quobyte Registry services specified - as a string as host:port pair (multiple - entries are separated with commas) which - acts as the central registry for volumes + description: |- + registry represents a single or multiple Quobyte Registry services + specified as a string as host:port pair (multiple entries are separated with commas) + which acts as the central registry for volumes type: string tenant: - description: tenant owning the given Quobyte - volume in the Backend Used with dynamically - provisioned Quobyte volumes, value is set - by the plugin + description: |- + tenant owning the given Quobyte volume in the Backend + Used with dynamically provisioned Quobyte volumes, value is set by the plugin type: string user: - description: user to map volume access to + description: |- + user to map volume access to Defaults to serivceaccount user type: string volume: @@ -9670,61 +9652,74 @@ spec: - volume type: object rbd: - description: 'rbd represents a Rados Block Device - mount on the host that shares a pod''s lifetime. - More info: https://examples.k8s.io/volumes/rbd/README.md' + description: |- + rbd represents a Rados Block Device mount on the host that shares a pod's lifetime. + More info: https://examples.k8s.io/volumes/rbd/README.md properties: fsType: - description: 'fsType is the filesystem type - of the volume that you want to mount. Tip: - Ensure that the filesystem type is supported - by the host operating system. Examples: - "ext4", "xfs", "ntfs". Implicitly inferred - to be "ext4" if unspecified. More info: - https://kubernetes.io/docs/concepts/storage/volumes#rbd - TODO: how do we prevent errors in the filesystem - from compromising the machine' + description: |- + fsType is the filesystem type of the volume that you want to mount. + Tip: Ensure that the filesystem type is supported by the host operating system. + Examples: "ext4", "xfs", "ntfs". 
Implicitly inferred to be "ext4" if unspecified. + More info: https://kubernetes.io/docs/concepts/storage/volumes#rbd + TODO: how do we prevent errors in the filesystem from compromising the machine type: string image: - description: 'image is the rados image name. - More info: https://examples.k8s.io/volumes/rbd/README.md#how-to-use-it' + description: |- + image is the rados image name. + More info: https://examples.k8s.io/volumes/rbd/README.md#how-to-use-it type: string keyring: - description: 'keyring is the path to key ring - for RBDUser. Default is /etc/ceph/keyring. - More info: https://examples.k8s.io/volumes/rbd/README.md#how-to-use-it' + description: |- + keyring is the path to key ring for RBDUser. + Default is /etc/ceph/keyring. + More info: https://examples.k8s.io/volumes/rbd/README.md#how-to-use-it type: string monitors: - description: 'monitors is a collection of - Ceph monitors. More info: https://examples.k8s.io/volumes/rbd/README.md#how-to-use-it' + description: |- + monitors is a collection of Ceph monitors. + More info: https://examples.k8s.io/volumes/rbd/README.md#how-to-use-it items: type: string type: array + x-kubernetes-list-type: atomic pool: - description: 'pool is the rados pool name. - Default is rbd. More info: https://examples.k8s.io/volumes/rbd/README.md#how-to-use-it' + description: |- + pool is the rados pool name. + Default is rbd. + More info: https://examples.k8s.io/volumes/rbd/README.md#how-to-use-it type: string readOnly: - description: 'readOnly here will force the - ReadOnly setting in VolumeMounts. Defaults - to false. More info: https://examples.k8s.io/volumes/rbd/README.md#how-to-use-it' + description: |- + readOnly here will force the ReadOnly setting in VolumeMounts. + Defaults to false. + More info: https://examples.k8s.io/volumes/rbd/README.md#how-to-use-it type: boolean secretRef: - description: 'secretRef is name of the authentication - secret for RBDUser. If provided overrides - keyring. Default is nil. 
More info: https://examples.k8s.io/volumes/rbd/README.md#how-to-use-it' + description: |- + secretRef is name of the authentication secret for RBDUser. If provided + overrides keyring. + Default is nil. + More info: https://examples.k8s.io/volumes/rbd/README.md#how-to-use-it properties: name: - description: 'Name of the referent. More - info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Add other useful fields. apiVersion, - kind, uid?' + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + TODO: Add other useful fields. apiVersion, kind, uid? + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Drop `kubebuilder:default` when controller-gen doesn't need it https://github.com/kubernetes-sigs/kubebuilder/issues/3896. type: string type: object x-kubernetes-map-type: atomic user: - description: 'user is the rados user name. - Default is admin. More info: https://examples.k8s.io/volumes/rbd/README.md#how-to-use-it' + description: |- + user is the rados user name. + Default is admin. + More info: https://examples.k8s.io/volumes/rbd/README.md#how-to-use-it type: string required: - image @@ -9735,10 +9730,11 @@ spec: volume attached and mounted on Kubernetes nodes. properties: fsType: - description: fsType is the filesystem type - to mount. Must be a filesystem type supported - by the host operating system. Ex. "ext4", - "xfs", "ntfs". Default is "xfs". + description: |- + fsType is the filesystem type to mount. + Must be a filesystem type supported by the host operating system. + Ex. "ext4", "xfs", "ntfs". + Default is "xfs". type: string gateway: description: gateway is the host address of @@ -9750,21 +9746,25 @@ spec: configured storage. 
type: string readOnly: - description: readOnly Defaults to false (read/write). - ReadOnly here will force the ReadOnly setting - in VolumeMounts. + description: |- + readOnly Defaults to false (read/write). ReadOnly here will force + the ReadOnly setting in VolumeMounts. type: boolean secretRef: - description: secretRef references to the secret - for ScaleIO user and other sensitive information. - If this is not provided, Login operation - will fail. + description: |- + secretRef references to the secret for ScaleIO user and other + sensitive information. If this is not provided, Login operation will fail. properties: name: - description: 'Name of the referent. More - info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Add other useful fields. apiVersion, - kind, uid?' + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + TODO: Add other useful fields. apiVersion, kind, uid? + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Drop `kubebuilder:default` when controller-gen doesn't need it https://github.com/kubernetes-sigs/kubebuilder/issues/3896. type: string type: object x-kubernetes-map-type: atomic @@ -9774,9 +9774,9 @@ spec: false type: boolean storageMode: - description: storageMode indicates whether - the storage for a volume should be ThickProvisioned - or ThinProvisioned. Default is ThinProvisioned. + description: |- + storageMode indicates whether the storage for a volume should be ThickProvisioned or ThinProvisioned. + Default is ThinProvisioned. type: string storagePool: description: storagePool is the ScaleIO Storage @@ -9787,9 +9787,9 @@ spec: system as configured in ScaleIO. 
type: string volumeName: - description: volumeName is the name of a volume - already created in the ScaleIO system that - is associated with this volume source. + description: |- + volumeName is the name of a volume already created in the ScaleIO system + that is associated with this volume source. type: string required: - gateway @@ -9797,37 +9797,30 @@ spec: - system type: object secret: - description: 'secret represents a secret that - should populate this volume. More info: https://kubernetes.io/docs/concepts/storage/volumes#secret' + description: |- + secret represents a secret that should populate this volume. + More info: https://kubernetes.io/docs/concepts/storage/volumes#secret properties: defaultMode: - description: 'defaultMode is Optional: mode - bits used to set permissions on created - files by default. Must be an octal value - between 0000 and 0777 or a decimal value - between 0 and 511. YAML accepts both octal - and decimal values, JSON requires decimal - values for mode bits. Defaults to 0644. - Directories within the path are not affected - by this setting. This might be in conflict - with other options that affect the file - mode, like fsGroup, and the result can be - other mode bits set.' + description: |- + defaultMode is Optional: mode bits used to set permissions on created files by default. + Must be an octal value between 0000 and 0777 or a decimal value between 0 and 511. + YAML accepts both octal and decimal values, JSON requires decimal values + for mode bits. Defaults to 0644. + Directories within the path are not affected by this setting. + This might be in conflict with other options that affect the file + mode, like fsGroup, and the result can be other mode bits set. format: int32 type: integer items: - description: items If unspecified, each key-value - pair in the Data field of the referenced - Secret will be projected into the volume - as a file whose name is the key and content - is the value. 
If specified, the listed keys - will be projected into the specified paths, - and unlisted keys will not be present. If - a key is specified which is not present - in the Secret, the volume setup will error - unless it is marked optional. Paths must - be relative and may not contain the '..' - path or start with '..'. + description: |- + items If unspecified, each key-value pair in the Data field of the referenced + Secret will be projected into the volume as a file whose name is the + key and content is the value. If specified, the listed keys will be + projected into the specified paths, and unlisted keys will not be + present. If a key is specified which is not present in the Secret, + the volume setup will error unless it is marked optional. Paths must be + relative and may not contain the '..' path or start with '..'. items: description: Maps a string key to a path within a volume. @@ -9836,40 +9829,36 @@ spec: description: key is the key to project. type: string mode: - description: 'mode is Optional: mode - bits used to set permissions on this - file. Must be an octal value between - 0000 and 0777 or a decimal value between - 0 and 511. YAML accepts both octal - and decimal values, JSON requires - decimal values for mode bits. If not - specified, the volume defaultMode - will be used. This might be in conflict - with other options that affect the - file mode, like fsGroup, and the result - can be other mode bits set.' + description: |- + mode is Optional: mode bits used to set permissions on this file. + Must be an octal value between 0000 and 0777 or a decimal value between 0 and 511. + YAML accepts both octal and decimal values, JSON requires decimal values for mode bits. + If not specified, the volume defaultMode will be used. + This might be in conflict with other options that affect the file + mode, like fsGroup, and the result can be other mode bits set. 
format: int32 type: integer path: - description: path is the relative path - of the file to map the key to. May - not be an absolute path. May not contain - the path element '..'. May not start - with the string '..'. + description: |- + path is the relative path of the file to map the key to. + May not be an absolute path. + May not contain the path element '..'. + May not start with the string '..'. type: string required: - key - path type: object type: array + x-kubernetes-list-type: atomic optional: description: optional field specify whether the Secret or its keys must be defined type: boolean secretName: - description: 'secretName is the name of the - secret in the pod''s namespace to use. More - info: https://kubernetes.io/docs/concepts/storage/volumes#secret' + description: |- + secretName is the name of the secret in the pod's namespace to use. + More info: https://kubernetes.io/docs/concepts/storage/volumes#secret type: string type: object storageos: @@ -9877,46 +9866,47 @@ spec: volume attached and mounted on Kubernetes nodes. properties: fsType: - description: fsType is the filesystem type - to mount. Must be a filesystem type supported - by the host operating system. Ex. "ext4", - "xfs", "ntfs". Implicitly inferred to be - "ext4" if unspecified. + description: |- + fsType is the filesystem type to mount. + Must be a filesystem type supported by the host operating system. + Ex. "ext4", "xfs", "ntfs". Implicitly inferred to be "ext4" if unspecified. type: string readOnly: - description: readOnly defaults to false (read/write). - ReadOnly here will force the ReadOnly setting - in VolumeMounts. + description: |- + readOnly defaults to false (read/write). ReadOnly here will force + the ReadOnly setting in VolumeMounts. type: boolean secretRef: - description: secretRef specifies the secret - to use for obtaining the StorageOS API credentials. If - not specified, default values will be attempted. 
+ description: |- + secretRef specifies the secret to use for obtaining the StorageOS API + credentials. If not specified, default values will be attempted. properties: name: - description: 'Name of the referent. More - info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Add other useful fields. apiVersion, - kind, uid?' + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + TODO: Add other useful fields. apiVersion, kind, uid? + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Drop `kubebuilder:default` when controller-gen doesn't need it https://github.com/kubernetes-sigs/kubebuilder/issues/3896. type: string type: object x-kubernetes-map-type: atomic volumeName: - description: volumeName is the human-readable - name of the StorageOS volume. Volume names - are only unique within a namespace. + description: |- + volumeName is the human-readable name of the StorageOS volume. Volume + names are only unique within a namespace. type: string volumeNamespace: - description: volumeNamespace specifies the - scope of the volume within StorageOS. If - no namespace is specified then the Pod's - namespace will be used. This allows the - Kubernetes name scoping to be mirrored within - StorageOS for tighter integration. Set VolumeName - to any name to override the default behaviour. - Set to "default" if you are not using namespaces - within StorageOS. Namespaces that do not - pre-exist within StorageOS will be created. + description: |- + volumeNamespace specifies the scope of the volume within StorageOS. If no + namespace is specified then the Pod's namespace will be used. This allows the + Kubernetes name scoping to be mirrored within StorageOS for tighter integration. 
+ Set VolumeName to any name to override the default behaviour. + Set to "default" if you are not using namespaces within StorageOS. + Namespaces that do not pre-exist within StorageOS will be created. type: string type: object vsphereVolume: @@ -9925,11 +9915,10 @@ spec: machine properties: fsType: - description: fsType is filesystem type to - mount. Must be a filesystem type supported - by the host operating system. Ex. "ext4", - "xfs", "ntfs". Implicitly inferred to be - "ext4" if unspecified. + description: |- + fsType is filesystem type to mount. + Must be a filesystem type supported by the host operating system. + Ex. "ext4", "xfs", "ntfs". Implicitly inferred to be "ext4" if unspecified. type: string storagePolicyID: description: storagePolicyID is the storage @@ -9951,15 +9940,20 @@ spec: - name type: object type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map required: - containers type: object type: object required: - count - - name - template type: object + x-kubernetes-validations: + - message: minCount should be positive and less or equal to count + rule: 'has(self.minCount) ? self.minCount <= self.count : true' maxItems: 8 minItems: 1 type: array @@ -9967,52 +9961,63 @@ spec: - name x-kubernetes-list-type: map priority: - description: Priority determines the order of access to the resources - managed by the ClusterQueue where the workload is queued. The priority - value is populated from PriorityClassName. The higher the value, - the higher the priority. If priorityClassName is specified, priority - must not be null. + description: |- + Priority determines the order of access to the resources managed by the + ClusterQueue where the workload is queued. + The priority value is populated from PriorityClassName. + The higher the value, the higher the priority. + If priorityClassName is specified, priority must not be null. 
format: int32 type: integer priorityClassName: - description: If specified, indicates the workload's priority. "system-node-critical" - and "system-cluster-critical" are two special keywords which indicate - the highest priorities with the former being the highest priority. - Any other name must be defined by creating a PriorityClass object - with that name. If not specified, the workload priority will be - default or zero if there is no default. + description: |- + If specified, indicates the workload's priority. + "system-node-critical" and "system-cluster-critical" are two special + keywords which indicate the highest priorities with the former being + the highest priority. Any other name must be defined by creating a + PriorityClass object with that name. If not specified, the workload + priority will be default or zero if there is no default. + maxLength: 253 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ type: string priorityClassSource: default: "" - description: priorityClassSource determines whether the priorityClass - field refers to a pod PriorityClass or kueue.x-k8s.io/workloadpriorityclass. - Workload's PriorityClass can accept the name of a pod priorityClass - or a workloadPriorityClass. When using pod PriorityClass, a priorityClassSource - field has the scheduling.k8s.io/priorityclass value. + description: |- + priorityClassSource determines whether the priorityClass field refers to a pod PriorityClass or kueue.x-k8s.io/workloadpriorityclass. + Workload's PriorityClass can accept the name of a pod priorityClass or a workloadPriorityClass. + When using pod PriorityClass, a priorityClassSource field has the scheduling.k8s.io/priorityclass value. enum: - kueue.x-k8s.io/workloadpriorityclass - scheduling.k8s.io/priorityclass - "" type: string queueName: - description: queueName is the name of the LocalQueue the Workload - is associated with. queueName cannot be changed while .status.admission - is not null. 
+ description: |- + queueName is the name of the LocalQueue the Workload is associated with. + queueName cannot be changed while .status.admission is not null. + maxLength: 253 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ type: string required: - podSets type: object + x-kubernetes-validations: + - message: priority should not be nil when priorityClassName is set + rule: 'has(self.priorityClassName) ? has(self.priority) : true' status: description: WorkloadStatus defines the observed state of Workload properties: admission: - description: admission holds the parameters of the admission of the - workload by a ClusterQueue. admission can be set back to null, but - its fields cannot be changed once set. + description: |- + admission holds the parameters of the admission of the workload by a + ClusterQueue. admission can be set back to null, but its fields cannot be + changed once set. properties: clusterQueue: description: clusterQueue is the name of the ClusterQueue that admitted this workload. + maxLength: 253 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ type: string podSetAssignments: description: PodSetAssignments hold the admission results for @@ -10020,11 +10025,11 @@ spec: items: properties: count: - description: count is the number of pods taken into account - at admission time. This field will not change in case - of quota reclaim. Value could be missing for Workloads - created before this field was added, in that case spec.podSets[*].count - value will be used. + description: |- + count is the number of pods taken into account at admission time. + This field will not change in case of quota reclaim. + Value could be missing for Workloads created before this field was added, + in that case spec.podSets[*].count value will be used. format: int32 minimum: 0 type: integer @@ -10032,6 +10037,8 @@ spec: additionalProperties: description: ResourceFlavorReference is the name of the ResourceFlavor. 
+ maxLength: 253 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ type: string description: Flavors are the flavors assigned to the workload for each resource. @@ -10040,6 +10047,8 @@ spec: default: main description: Name is the name of the podSet. It should match one of the names in .spec.podSets. + maxLength: 63 + pattern: ^(?i)[a-z0-9]([-a-z0-9]*[a-z0-9])?$ type: string resourceUsage: additionalProperties: @@ -10048,16 +10057,18 @@ spec: - type: string pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true - description: "resourceUsage keeps track of the total resources - all the pods in the podset need to run. \n Beside what - is provided in podSet's specs, this calculation takes - into account the LimitRange defaults and RuntimeClass - overheads at the moment of admission. This field will - not change in case of quota reclaim." + description: |- + resourceUsage keeps track of the total resources all the pods in the podset need to run. + + + Beside what is provided in podSet's specs, this calculation takes into account + the LimitRange defaults and RuntimeClass overheads at the moment of admission. + This field will not change in case of quota reclaim. type: object required: - name type: object + maxItems: 8 type: array x-kubernetes-list-map-keys: - name @@ -10072,15 +10083,15 @@ spec: items: properties: lastTransitionTime: - description: lastTransitionTime is the last time the condition - transitioned from one status to another. This should be when - the underlying condition changed. If that is not known, then - using the time when the API field changed is acceptable. + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. 
format: date-time type: string message: - description: message is a human readable message indicating - details about the transition. This may be an empty string. + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. maxLength: 32768 type: string name: @@ -10089,12 +10100,11 @@ spec: type: string podSetUpdates: items: - description: PodSetUpdate contains a list of pod set modifications - suggested by AdmissionChecks. The modifications should be - additive only - modifications of already existing keys or - having the same key provided by multiple AdmissionChecks - is not allowed and will result in failure during workload - admission. + description: |- + PodSetUpdate contains a list of pod set modifications suggested by AdmissionChecks. + The modifications should be additive only - modifications of already existing keys + or having the same key provided by multiple AdmissionChecks is not allowed and will + result in failure during workload admission. properties: annotations: additionalProperties: @@ -10114,54 +10124,72 @@ spec: type: object tolerations: items: - description: The pod this Toleration is attached to - tolerates any taint that matches the triple - using the matching operator . + description: |- + The pod this Toleration is attached to tolerates any taint that matches + the triple using the matching operator . properties: effect: - description: Effect indicates the taint effect to - match. Empty means match all taint effects. When - specified, allowed values are NoSchedule, PreferNoSchedule - and NoExecute. + description: |- + Effect indicates the taint effect to match. Empty means match all taint effects. + When specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute. type: string key: - description: Key is the taint key that the toleration - applies to. Empty means match all taint keys. 
- If the key is empty, operator must be Exists; - this combination means to match all values and - all keys. + description: |- + Key is the taint key that the toleration applies to. Empty means match all taint keys. + If the key is empty, operator must be Exists; this combination means to match all values and all keys. type: string operator: - description: Operator represents a key's relationship - to the value. Valid operators are Exists and Equal. - Defaults to Equal. Exists is equivalent to wildcard - for value, so that a pod can tolerate all taints - of a particular category. + description: |- + Operator represents a key's relationship to the value. + Valid operators are Exists and Equal. Defaults to Equal. + Exists is equivalent to wildcard for value, so that a pod can + tolerate all taints of a particular category. type: string tolerationSeconds: - description: TolerationSeconds represents the period - of time the toleration (which must be of effect - NoExecute, otherwise this field is ignored) tolerates - the taint. By default, it is not set, which means - tolerate the taint forever (do not evict). Zero - and negative values will be treated as 0 (evict - immediately) by the system. + description: |- + TolerationSeconds represents the period of time the toleration (which must be + of effect NoExecute, otherwise this field is ignored) tolerates the taint. By default, + it is not set, which means tolerate the taint forever (do not evict). Zero and + negative values will be treated as 0 (evict immediately) by the system. format: int64 type: integer value: - description: Value is the taint value the toleration - matches to. If the operator is Exists, the value - should be empty, otherwise just a regular string. + description: |- + Value is the taint value the toleration matches to. + If the operator is Exists, the value should be empty, otherwise just a regular string. 
type: string type: object + maxItems: 8 type: array + x-kubernetes-validations: + - message: operator must be Exists when 'key' is empty, + which means 'match all values and all keys' + rule: 'self.all(x, !has(x.key) ? x.operator == ''Exists'' + : true)' + - message: effect must be 'NoExecute' when 'tolerationSeconds' + is set + rule: 'self.all(x, has(x.tolerationSeconds) ? x.effect + == ''NoExecute'' : true)' + - message: 'supported toleration values: ''Equal''(default), + ''Exists''' + rule: self.all(x, !has(x.operator) || x.operator in + ['Equal', 'Exists']) + - message: a value must be empty when 'operator' is 'Exists' + rule: 'self.all(x, has(x.operator) && x.operator == + ''Exists'' ? !has(x.value) : true)' + - message: 'supported taint effect values: ''NoSchedule'', + ''PreferNoSchedule'', ''NoExecute''' + rule: self.all(x, !has(x.effect) || x.effect in ['NoSchedule', + 'PreferNoSchedule', 'NoExecute']) required: - name type: object + maxItems: 8 type: array x-kubernetes-list-type: atomic state: - description: status of the condition, one of True, False, Unknown. + description: state of the admissionCheck, one of Pending, Ready, + Retry, Rejected enum: - Pending - Ready @@ -10174,55 +10202,62 @@ spec: - name - state type: object + maxItems: 8 type: array x-kubernetes-list-map-keys: - name x-kubernetes-list-type: map conditions: - description: "conditions hold the latest available observations of - the Workload current state. \n The type of the condition could be: - \n - Admitted: the Workload was admitted through a ClusterQueue. - - Finished: the associated workload finished running (failed or - succeeded). - PodsReady: at least `.spec.podSets[*].count` Pods - are ready or have succeeded." + description: |- + conditions hold the latest available observations of the Workload + current state. + + + The type of the condition could be: + + + - Admitted: the Workload was admitted through a ClusterQueue. 
+ - Finished: the associated workload finished running (failed or succeeded). + - PodsReady: at least `.spec.podSets[*].count` Pods are ready or have + succeeded. items: description: "Condition contains details for one aspect of the current - state of this API Resource. --- This struct is intended for direct - use as an array at the field path .status.conditions. For example, - \n type FooStatus struct{ // Represents the observations of a - foo's current state. // Known .status.conditions.type are: \"Available\", - \"Progressing\", and \"Degraded\" // +patchMergeKey=type // +patchStrategy=merge - // +listType=map // +listMapKey=type Conditions []metav1.Condition - `json:\"conditions,omitempty\" patchStrategy:\"merge\" patchMergeKey:\"type\" - protobuf:\"bytes,1,rep,name=conditions\"` \n // other fields }" + state of this API Resource.\n---\nThis struct is intended for + direct use as an array at the field path .status.conditions. For + example,\n\n\n\ttype FooStatus struct{\n\t // Represents the + observations of a foo's current state.\n\t // Known .status.conditions.type + are: \"Available\", \"Progressing\", and \"Degraded\"\n\t // + +patchMergeKey=type\n\t // +patchStrategy=merge\n\t // +listType=map\n\t + \ // +listMapKey=type\n\t Conditions []metav1.Condition `json:\"conditions,omitempty\" + patchStrategy:\"merge\" patchMergeKey:\"type\" protobuf:\"bytes,1,rep,name=conditions\"`\n\n\n\t + \ // other fields\n\t}" properties: lastTransitionTime: - description: lastTransitionTime is the last time the condition - transitioned from one status to another. This should be when - the underlying condition changed. If that is not known, then - using the time when the API field changed is acceptable. + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. 
format: date-time type: string message: - description: message is a human readable message indicating - details about the transition. This may be an empty string. + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. maxLength: 32768 type: string observedGeneration: - description: observedGeneration represents the .metadata.generation - that the condition was set based upon. For instance, if .metadata.generation - is currently 12, but the .status.conditions[x].observedGeneration - is 9, the condition is out of date with respect to the current - state of the instance. + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. format: int64 minimum: 0 type: integer reason: - description: reason contains a programmatic identifier indicating - the reason for the condition's last transition. Producers - of specific condition types may define expected values and - meanings for this field, and whether the values are considered - a guaranteed API. The value should be a CamelCase string. + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. This field may not be empty. maxLength: 1024 minLength: 1 @@ -10236,11 +10271,12 @@ spec: - Unknown type: string type: - description: type of condition in CamelCase or in foo.example.com/CamelCase. 
- --- Many .condition.type values are consistent across resources - like Available, but because arbitrary conditions can be useful - (see .node.status.conditions), the ability to deconflict is - important. The regex it matches is (dns1123SubdomainFmt/)?(qualifiedNameFmt) + description: |- + type of condition in CamelCase or in foo.example.com/CamelCase. + --- + Many .condition.type values are consistent across resources like Available, but because arbitrary conditions can be + useful (see .node.status.conditions), the ability to deconflict is important. + The regex it matches is (dns1123SubdomainFmt/)?(qualifiedNameFmt) maxLength: 316 pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ type: string @@ -10256,8 +10292,9 @@ spec: - type x-kubernetes-list-type: map reclaimablePods: - description: reclaimablePods keeps track of the number pods within - a podset for which the resource reservation is no longer needed. + description: |- + reclaimablePods keeps track of the number pods within a podset for which + the resource reservation is no longer needed. items: properties: count: @@ -10273,6 +10310,7 @@ spec: - count - name type: object + maxItems: 8 type: array x-kubernetes-list-map-keys: - name @@ -10300,6 +10338,28 @@ spec: type: object type: object type: object + x-kubernetes-validations: + - message: podSetAssignments must have the same number of podSets as the spec + rule: 'has(self.status) && has(self.status.conditions) && self.status.conditions.exists(c, + c.type == ''QuotaReserved'' && c.status == ''True'') && has(self.status.admission) + ? size(self.spec.podSets) == size(self.status.admission.podSetAssignments) + : true' + - message: field is immutable + rule: '(has(oldSelf.status) && has(oldSelf.status.conditions) && oldSelf.status.conditions.exists(c, + c.type == ''QuotaReserved'' && c.status == ''True'')) ? 
(oldSelf.spec.priorityClassSource + == self.spec.priorityClassSource) : true' + - message: field is immutable + rule: '(has(oldSelf.status) && has(oldSelf.status.conditions) && oldSelf.status.conditions.exists(c, + c.type == ''QuotaReserved'' && c.status == ''True'') && has(oldSelf.spec.priorityClassName) + && has(self.spec.priorityClassName)) ? (oldSelf.spec.priorityClassName + == self.spec.priorityClassName) : true' + - message: field is immutable + rule: '(has(oldSelf.status) && has(oldSelf.status.conditions) && oldSelf.status.conditions.exists(c, + c.type == ''QuotaReserved'' && c.status == ''True'')) && (has(self.status) + && has(self.status.conditions) && self.status.conditions.exists(c, c.type + == ''QuotaReserved'' && c.status == ''True'')) && has(oldSelf.spec.queueName) + && has(self.spec.queueName) ? oldSelf.spec.queueName == self.spec.queueName + : true' served: true storage: true subresources: @@ -10308,12 +10368,20 @@ spec: apiVersion: v1 kind: ServiceAccount metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager name: kueue-controller-manager namespace: kueue-system --- apiVersion: rbac.authorization.k8s.io/v1 kind: Role metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager name: kueue-leader-election-role namespace: kueue-system rules: @@ -10356,6 +10424,10 @@ aggregationRule: apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager name: kueue-batch-admin-role --- aggregationRule: @@ -10365,12 +10437,19 @@ aggregationRule: apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager name: kueue-batch-user-role --- apiVersion: 
rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager rbac.kueue.x-k8s.io/batch-admin: "true" name: kueue-clusterqueue-editor-role rules: @@ -10397,6 +10476,9 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager rbac.kueue.x-k8s.io/batch-admin: "true" name: kueue-clusterqueue-viewer-role rules: @@ -10419,6 +10501,9 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager rbac.kueue.x-k8s.io/batch-admin: "true" rbac.kueue.x-k8s.io/batch-user: "true" name: kueue-job-editor-role @@ -10446,6 +10531,9 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager rbac.kueue.x-k8s.io/batch-admin: "true" rbac.kueue.x-k8s.io/batch-user: "true" name: kueue-job-viewer-role @@ -10469,6 +10557,9 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager rbac.kueue.x-k8s.io/batch-admin: "true" rbac.kueue.x-k8s.io/batch-user: "true" name: kueue-jobset-editor-role @@ -10496,6 +10587,9 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager rbac.kueue.x-k8s.io/batch-admin: "true" rbac.kueue.x-k8s.io/batch-user: "true" name: kueue-jobset-viewer-role @@ -10519,6 +10613,9 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: labels: + app.kubernetes.io/component: controller + 
app.kubernetes.io/name: kueue + control-plane: controller-manager rbac.kueue.x-k8s.io/batch-admin: "true" name: kueue-localqueue-editor-role rules: @@ -10545,6 +10642,9 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager rbac.kueue.x-k8s.io/batch-admin: "true" rbac.kueue.x-k8s.io/batch-user: "true" name: kueue-localqueue-viewer-role @@ -10567,6 +10667,10 @@ rules: apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager name: kueue-manager-role rules: - apiGroups: @@ -11116,6 +11220,10 @@ rules: apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager name: kueue-metrics-reader rules: - nonResourceURLs: @@ -11127,6 +11235,9 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager rbac.kueue.x-k8s.io/batch-admin: "true" rbac.kueue.x-k8s.io/batch-user: "true" name: kueue-mpijob-editor-role @@ -11154,6 +11265,9 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager rbac.kueue.x-k8s.io/batch-admin: "true" rbac.kueue.x-k8s.io/batch-user: "true" name: kueue-mpijob-viewer-role @@ -11177,6 +11291,9 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager rbac.kueue.x-k8s.io/batch-admin: "true" rbac.kueue.x-k8s.io/batch-user: "true" name: kueue-mxjob-editor-role @@ -11204,6 +11321,9 @@ 
apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager rbac.kueue.x-k8s.io/batch-admin: "true" rbac.kueue.x-k8s.io/batch-user: "true" name: kueue-mxjob-viewer-role @@ -11227,6 +11347,9 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager rbac.kueue.x-k8s.io/batch-admin: "true" rbac.kueue.x-k8s.io/batch-user: "true" name: kueue-paddlejob-editor-role @@ -11254,6 +11377,9 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager rbac.kueue.x-k8s.io/batch-admin: "true" rbac.kueue.x-k8s.io/batch-user: "true" name: kueue-paddlejob-viewer-role @@ -11277,6 +11403,9 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager rbac.kueue.x-k8s.io/batch-admin: "true" name: kueue-pending-workloads-cq-viewer-role rules: @@ -11293,6 +11422,9 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager rbac.kueue.x-k8s.io/batch-admin: "true" rbac.kueue.x-k8s.io/batch-user: "true" name: kueue-pending-workloads-lq-viewer-role @@ -11309,6 +11441,10 @@ rules: apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager name: kueue-proxy-role rules: - apiGroups: @@ -11328,6 +11464,9 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: labels: + app.kubernetes.io/component: controller + 
app.kubernetes.io/name: kueue + control-plane: controller-manager rbac.kueue.x-k8s.io/batch-admin: "true" rbac.kueue.x-k8s.io/batch-user: "true" name: kueue-pytorchjob-editor-role @@ -11355,6 +11494,9 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager rbac.kueue.x-k8s.io/batch-admin: "true" rbac.kueue.x-k8s.io/batch-user: "true" name: kueue-pytorchjob-viewer-role @@ -11378,6 +11520,9 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager rbac.kueue.x-k8s.io/batch-admin: "true" rbac.kueue.x-k8s.io/batch-user: "true" name: kueue-rayjob-editor-role @@ -11405,6 +11550,9 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager rbac.kueue.x-k8s.io/batch-admin: "true" rbac.kueue.x-k8s.io/batch-user: "true" name: kueue-rayjob-viewer-role @@ -11428,6 +11576,9 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager rbac.kueue.x-k8s.io/batch-admin: "true" name: kueue-resourceflavor-editor-role rules: @@ -11448,6 +11599,9 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager rbac.kueue.x-k8s.io/batch-admin: "true" name: kueue-resourceflavor-viewer-role rules: @@ -11464,6 +11618,9 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager rbac.kueue.x-k8s.io/batch-admin: "true" 
rbac.kueue.x-k8s.io/batch-user: "true" name: kueue-tfjob-editor-role @@ -11491,6 +11648,9 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager rbac.kueue.x-k8s.io/batch-admin: "true" rbac.kueue.x-k8s.io/batch-user: "true" name: kueue-tfjob-viewer-role @@ -11514,6 +11674,9 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager rbac.kueue.x-k8s.io/batch-admin: "true" name: kueue-workload-editor-role rules: @@ -11540,6 +11703,9 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager rbac.kueue.x-k8s.io/batch-admin: "true" rbac.kueue.x-k8s.io/batch-user: "true" name: kueue-workload-viewer-role @@ -11563,6 +11729,9 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager rbac.kueue.x-k8s.io/batch-admin: "true" rbac.kueue.x-k8s.io/batch-user: "true" name: kueue-xgboostjob-editor-role @@ -11590,6 +11759,9 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager rbac.kueue.x-k8s.io/batch-admin: "true" rbac.kueue.x-k8s.io/batch-user: "true" name: kueue-xgboostjob-viewer-role @@ -11612,6 +11784,10 @@ rules: apiVersion: rbac.authorization.k8s.io/v1 kind: RoleBinding metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager name: kueue-leader-election-rolebinding namespace: kueue-system roleRef: @@ -11626,6 +11802,10 @@ subjects: apiVersion: 
rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager name: kueue-manager-rolebinding roleRef: apiGroup: rbac.authorization.k8s.io @@ -11639,6 +11819,10 @@ subjects: apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager name: kueue-proxy-rolebinding roleRef: apiGroup: rbac.authorization.k8s.io @@ -11675,9 +11859,16 @@ data: clientConnection: qps: 50 burst: 100 - #pprofBindAddress: :8082 + #pprofBindAddress: :8083 #waitForPodsReady: - # enable: true + # enable: false + # timeout: 5m + # blockAdmission: false + # requeuingStrategy: + # timestamp: Eviction + # backoffLimitCount: null # null indicates infinite requeuing + # backoffBaseSeconds: 60 + # backoffMaxSeconds: 3600 #manageJobsWithoutQueueName: true #internalCertManagement: # enable: false @@ -11695,21 +11886,36 @@ data: - "kubeflow.org/pytorchjob" - "kubeflow.org/tfjob" - "kubeflow.org/xgboostjob" - # - "pod" - # podOptions: - # namespaceSelector: - # matchExpressions: - # - key: kubernetes.io/metadata.name - # operator: NotIn - # values: [ kube-system, kueue-system ] + # - "pod" + # externalFrameworks: + # - "Foo.v1.example.com" + # podOptions: + # namespaceSelector: + # matchExpressions: + # - key: kubernetes.io/metadata.name + # operator: NotIn + # values: [ kube-system, kueue-system ] + #fairSharing: + # enable: true + # preemptionStrategies: [LessThanOrEqualToFinalShare, LessThanInitialShare] + #resources: + # excludeResourcePrefixes: [] kind: ConfigMap metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager name: kueue-manager-config namespace: kueue-system --- apiVersion: v1 kind: Secret metadata: + labels: + app.kubernetes.io/component: controller + 
app.kubernetes.io/name: kueue + control-plane: controller-manager name: kueue-webhook-server-cert namespace: kueue-system --- @@ -11717,6 +11923,8 @@ apiVersion: v1 kind: Service metadata: labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue control-plane: controller-manager name: kueue-controller-manager-metrics-service namespace: kueue-system @@ -11732,6 +11940,10 @@ spec: apiVersion: v1 kind: Service metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager name: kueue-webhook-service namespace: kueue-system spec: @@ -11746,6 +11958,8 @@ apiVersion: apps/v1 kind: Deployment metadata: labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue control-plane: controller-manager name: kueue-controller-manager namespace: kueue-system @@ -11768,7 +11982,7 @@ spec: - --feature-gates=ProvisioningACC=true command: - /manager - image: registry.k8s.io/kueue/kueue:v0.6.1 + image: registry.k8s.io/kueue/kueue:v0.7.0 imagePullPolicy: Always livenessProbe: httpGet: @@ -11833,6 +12047,10 @@ spec: apiVersion: admissionregistration.k8s.io/v1 kind: MutatingWebhookConfiguration metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager name: kueue-mutating-webhook-configuration webhooks: - admissionReviewVersions: @@ -12038,14 +12256,14 @@ webhooks: service: name: kueue-webhook-service namespace: kueue-system - path: /mutate-ray-io-v1alpha1-rayjob + path: /mutate-ray-io-v1-rayjob failurePolicy: Fail name: mrayjob.kb.io rules: - apiGroups: - ray.io apiVersions: - - v1alpha1 + - v1 operations: - CREATE resources: @@ -12105,7 +12323,6 @@ webhooks: - v1beta1 operations: - CREATE - - UPDATE resources: - workloads sideEffects: None @@ -12113,6 +12330,10 @@ webhooks: apiVersion: admissionregistration.k8s.io/v1 kind: ValidatingWebhookConfiguration metadata: + labels: + app.kubernetes.io/component: 
controller + app.kubernetes.io/name: kueue + control-plane: controller-manager name: kueue-validating-webhook-configuration webhooks: - admissionReviewVersions: @@ -12328,40 +12549,20 @@ webhooks: service: name: kueue-webhook-service namespace: kueue-system - path: /validate-ray-io-v1alpha1-rayjob + path: /validate-ray-io-v1-rayjob failurePolicy: Fail name: vrayjob.kb.io rules: - apiGroups: - ray.io apiVersions: - - v1alpha1 + - v1 operations: - CREATE - UPDATE resources: - rayjobs sideEffects: None -- admissionReviewVersions: - - v1 - clientConfig: - service: - name: kueue-webhook-service - namespace: kueue-system - path: /validate-kueue-x-k8s-io-v1beta1-admissioncheck - failurePolicy: Fail - name: vadmissioncheck.kb.io - rules: - - apiGroups: - - kueue.x-k8s.io - apiVersions: - - v1beta1 - operations: - - CREATE - - UPDATE - resources: - - admissionchecks - sideEffects: None - admissionReviewVersions: - v1 clientConfig: @@ -12382,26 +12583,6 @@ webhooks: resources: - clusterqueues sideEffects: None -- admissionReviewVersions: - - v1 - clientConfig: - service: - name: kueue-webhook-service - namespace: kueue-system - path: /validate-kueue-x-k8s-io-v1beta1-localqueue - failurePolicy: Fail - name: vlocalqueue.kb.io - rules: - - apiGroups: - - kueue.x-k8s.io - apiVersions: - - v1beta1 - operations: - - CREATE - - UPDATE - resources: - - localqueues - sideEffects: None - admissionReviewVersions: - v1 clientConfig: diff --git a/best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_cluster_template/kueue-system/kueue_values.yaml b/best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_cluster_template/kueue-system/kueue_values.yaml new file mode 100644 index 000000000..0b87ac041 --- /dev/null +++ b/best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_cluster_template/kueue-system/kueue_values.yaml @@ -0,0 +1,148 @@ +# Copyright 2024 Google LLC +# +# Licensed under the 
Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Default values for kueue. +# This is a YAML-formatted file. +# Declare variables to be passed into your templates. +nameOverride: "" +fullnameOverride: "" +# Enable each function, like kustomize https://github.com/kubernetes-sigs/kueue/blob/main/config/default/kustomization.yaml +enablePrometheus: false +# Enable x509 automated certificate management using cert-manager (cert-manager.io) +enableCertManager: false +# Customize controllerManager +controllerManager: + featureGates: + - name: ProvisioningACC + enabled: true + kubeRbacProxy: + image: + repository: gcr.io/kubebuilder/kube-rbac-proxy + # tag + tag: v0.8.0 + # This should be set to 'IfNotPresent' for released version + pullPolicy: IfNotPresent + manager: + image: + repository: gcr.io/k8s-staging-kueue/kueue + # This should be set to 'IfNotPresent' for released version + pullPolicy: Always + podAnnotations: {} + resources: + limits: + cpu: 500m + memory: 512Mi + requests: + cpu: 500m + memory: 512Mi + podSecurityContext: + runAsNonRoot: true + containerSecurityContext: + allowPrivilegeEscalation: false + replicas: 1 + imagePullSecrets: [] + readinessProbe: + initialDelaySeconds: 5 + periodSeconds: 10 + timeoutSeconds: 1 + failureThreshold: 3 + successThreshold: 1 + livenessProbe: + initialDelaySeconds: 15 + periodSeconds: 20 + timeoutSeconds: 1 + failureThreshold: 3 + successThreshold: 1 +kubernetesClusterDomain: cluster.local +# controller_manager_config.yaml. 
controllerManager utilizes this yaml via manager-config Configmap. +managerConfig: + controllerManagerConfigYaml: |- + apiVersion: config.kueue.x-k8s.io/v1beta1 + kind: Configuration + health: + healthProbeBindAddress: :8081 + metrics: + bindAddress: :8080 + # enableClusterQueueResources: true + webhook: + port: 9443 + leaderElection: + leaderElect: true + resourceName: c1f6bfd2.kueue.x-k8s.io + controller: + groupKindConcurrency: + Job.batch: 5 + Pod: 5 + Workload.kueue.x-k8s.io: 5 + LocalQueue.kueue.x-k8s.io: 1 + ClusterQueue.kueue.x-k8s.io: 1 + ResourceFlavor.kueue.x-k8s.io: 1 + clientConnection: + qps: 50 + burst: 100 + #pprofBindAddress: :8083 + #waitForPodsReady: + # enable: false + # timeout: 5m + # blockAdmission: false + # requeuingStrategy: + # timestamp: Eviction + # backoffLimitCount: null # null indicates infinite requeuing + # backoffBaseSeconds: 60 + #manageJobsWithoutQueueName: true + #internalCertManagement: + # enable: false + # webhookServiceName: "" + # webhookSecretName: "" + integrations: + frameworks: + - "batch/job" + - "kubeflow.org/mpijob" + - "ray.io/rayjob" + - "ray.io/raycluster" + - "jobset.x-k8s.io/jobset" + - "kubeflow.org/mxjob" + - "kubeflow.org/paddlejob" + - "kubeflow.org/pytorchjob" + - "kubeflow.org/tfjob" + - "kubeflow.org/xgboostjob" + # - "pod" + # podOptions: + # namespaceSelector: + # matchExpressions: + # - key: kubernetes.io/metadata.name + # operator: NotIn + # values: [ kube-system, kueue-system ] + # fairSharing: + # enable: true + # preemptionStrategies: [LessThanOrEqualToFinalShare, LessThanInitialShare] +# ports definition for metricsService and webhookService. 
+metricsService: + ports: + - name: https + port: 8443 + protocol: TCP + targetPort: https + type: ClusterIP + annotations: {} +webhookService: + ipDualStack: + enabled: false + ipFamilies: ["IPv6", "IPv4"] + ipFamilyPolicy: "PreferDualStack" + ports: + - port: 443 + protocol: TCP + targetPort: 9443 + type: ClusterIP \ No newline at end of file diff --git a/best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_cluster_template/kueue-system/kustomization.yaml b/best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_cluster_template/kueue-system/kustomization.yaml new file mode 100644 index 000000000..b9890430d --- /dev/null +++ b/best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_cluster_template/kueue-system/kustomization.yaml @@ -0,0 +1,31 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +namespace: kueue-system +resources: +- kueue.yaml + +# There is no publicly available Kueue Helm chart repository and kustomize does not support private or OCI-based (Artifact Registry) Helm repository +# https://github.com/kubernetes-sigs/kueue/issues/2311 +# https://cloud.google.com/kubernetes-engine/enterprise/config-sync/docs/concepts/kustomize +# helmCharts: +# - includeCRDs: true +# name: kueue +# namespace: kueue-system +# releaseName: kueue +# repo: +# valuesFile: kueue_values.yaml +# version: 0.1.0 diff --git a/best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_cluster_template/kustomization.yaml b/best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_cluster_template/kustomization.yaml new file mode 100644 index 000000000..361030e28 --- /dev/null +++ b/best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_cluster_template/kustomization.yaml @@ -0,0 +1,17 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: diff --git a/best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_cluster_template/namespace-kueue-system.yaml b/best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_cluster_template/namespace-kueue-system.yaml new file mode 100644 index 000000000..cb10fb525 --- /dev/null +++ b/best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_cluster_template/namespace-kueue-system.yaml @@ -0,0 +1,22 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: v1 +kind: Namespace +metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kueue + control-plane: controller-manager + name: kueue-system diff --git a/best-practices/ml-platform/examples/platform/playground/templates/acm-template/templates/_cluster_template/network-logging.yaml b/best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_cluster_template/network-logging.yaml similarity index 100% rename from best-practices/ml-platform/examples/platform/playground/templates/acm-template/templates/_cluster_template/network-logging.yaml rename to best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_cluster_template/network-logging.yaml diff --git a/best-practices/ml-platform/examples/platform/playground/templates/acm-template/templates/_cluster_template/selector.yaml b/best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_cluster_template/selector.yaml similarity index 100% rename from best-practices/ml-platform/examples/platform/playground/templates/acm-template/templates/_cluster_template/selector.yaml rename to best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_cluster_template/selector.yaml diff --git a/best-practices/ml-platform/examples/platform/playground/templates/acm-template/templates/_cluster_template/team/kustomization.yaml b/best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_cluster_template/team/kustomization.yaml similarity index 100% rename from best-practices/ml-platform/examples/platform/playground/templates/acm-template/templates/_cluster_template/team/kustomization.yaml rename to best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_cluster_template/team/kustomization.yaml diff --git 
a/best-practices/ml-platform/examples/platform/playground/templates/acm-template/templates/_cluster_template/team/namespace.yaml b/best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_cluster_template/team/namespace.yaml similarity index 100% rename from best-practices/ml-platform/examples/platform/playground/templates/acm-template/templates/_cluster_template/team/namespace.yaml rename to best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_cluster_template/team/namespace.yaml diff --git a/best-practices/ml-platform/examples/platform/playground/templates/acm-template/templates/_cluster_template/team/network-policy.yaml b/best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_cluster_template/team/network-policy.yaml similarity index 100% rename from best-practices/ml-platform/examples/platform/playground/templates/acm-template/templates/_cluster_template/team/network-policy.yaml rename to best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_cluster_template/team/network-policy.yaml diff --git a/best-practices/ml-platform/examples/platform/playground/templates/acm-template/templates/_cluster_template/team/rbac.yaml b/best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_cluster_template/team/rbac.yaml similarity index 100% rename from best-practices/ml-platform/examples/platform/playground/templates/acm-template/templates/_cluster_template/team/rbac.yaml rename to best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_cluster_template/team/rbac.yaml diff --git a/best-practices/ml-platform/examples/platform/playground/templates/acm-template/templates/_cluster_template/team/reposync.yaml b/best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_cluster_template/team/reposync.yaml similarity index 99% rename from 
best-practices/ml-platform/examples/platform/playground/templates/acm-template/templates/_cluster_template/team/reposync.yaml rename to best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_cluster_template/team/reposync.yaml index 82eb3c07a..afac66d1f 100644 --- a/best-practices/ml-platform/examples/platform/playground/templates/acm-template/templates/_cluster_template/team/reposync.yaml +++ b/best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_cluster_template/team/reposync.yaml @@ -26,7 +26,7 @@ spec: sourceFormat: unstructured git: repo: "GIT_REPO" - revision: "ENV" + revision: "main" #branch: NAMESPACE_BRANCH dir: "manifests/apps/NAMESPACE" auth: token diff --git a/best-practices/ml-platform/examples/platform/playground/templates/acm-template/templates/_namespace_template/app/fluentd_config.yaml b/best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_namespace_template/app/fluentd_config.yaml similarity index 100% rename from best-practices/ml-platform/examples/platform/playground/templates/acm-template/templates/_namespace_template/app/fluentd_config.yaml rename to best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_namespace_template/app/fluentd_config.yaml diff --git a/best-practices/ml-platform/examples/platform/playground/templates/acm-template/templates/_namespace_template/app/kustomization.yaml b/best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_namespace_template/app/kustomization.yaml similarity index 100% rename from best-practices/ml-platform/examples/platform/playground/templates/acm-template/templates/_namespace_template/app/kustomization.yaml rename to best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_namespace_template/app/kustomization.yaml diff --git 
a/best-practices/ml-platform/examples/platform/playground/templates/acm-template/templates/_namespace_template/app/podmonitoring_ray.yaml b/best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_namespace_template/app/podmonitoring_ray.yaml similarity index 100% rename from best-practices/ml-platform/examples/platform/playground/templates/acm-template/templates/_namespace_template/app/podmonitoring_ray.yaml rename to best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_namespace_template/app/podmonitoring_ray.yaml diff --git a/best-practices/ml-platform/examples/platform/playground/templates/acm-template/templates/_namespace_template/app/serviceaccount_ray_head.yaml b/best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_namespace_template/app/serviceaccount_ray_head.yaml similarity index 89% rename from best-practices/ml-platform/examples/platform/playground/templates/acm-template/templates/_namespace_template/app/serviceaccount_ray_head.yaml rename to best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_namespace_template/app/serviceaccount_ray_head.yaml index b88329a3c..4ed22b5ff 100644 --- a/best-practices/ml-platform/examples/platform/playground/templates/acm-template/templates/_namespace_template/app/serviceaccount_ray_head.yaml +++ b/best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_namespace_template/app/serviceaccount_ray_head.yaml @@ -17,5 +17,3 @@ kind: ServiceAccount metadata: name: KUBERNETES_SERVICE_ACCOUNT_RAY_HEAD namespace: NAMESPACE - annotations: - iam.gke.io/gcp-service-account: GOOGLE_SERVICE_ACCOUNT_RAY_HEAD diff --git a/best-practices/ml-platform/examples/platform/playground/templates/acm-template/templates/_namespace_template/app/serviceaccount_ray_worker.yaml 
b/best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_namespace_template/app/serviceaccount_ray_worker.yaml similarity index 89% rename from best-practices/ml-platform/examples/platform/playground/templates/acm-template/templates/_namespace_template/app/serviceaccount_ray_worker.yaml rename to best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_namespace_template/app/serviceaccount_ray_worker.yaml index eefd56a56..6c1932ce2 100644 --- a/best-practices/ml-platform/examples/platform/playground/templates/acm-template/templates/_namespace_template/app/serviceaccount_ray_worker.yaml +++ b/best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_namespace_template/app/serviceaccount_ray_worker.yaml @@ -17,5 +17,3 @@ kind: ServiceAccount metadata: name: KUBERNETES_SERVICE_ACCOUNT_RAY_WORKER namespace: NAMESPACE - annotations: - iam.gke.io/gcp-service-account: GOOGLE_SERVICE_ACCOUNT_RAY_WORKER diff --git a/best-practices/ml-platform/examples/platform/playground/templates/acm-template/templates/_namespace_template/app/values.yaml b/best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_namespace_template/app/values.yaml similarity index 93% rename from best-practices/ml-platform/examples/platform/playground/templates/acm-template/templates/_namespace_template/app/values.yaml rename to best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_namespace_template/app/values.yaml index e4068e4ef..922834f9d 100644 --- a/best-practices/ml-platform/examples/platform/playground/templates/acm-template/templates/_namespace_template/app/values.yaml +++ b/best-practices/ml-platform/examples/platform/playground/templates/configsync/templates/_namespace_template/app/values.yaml @@ -66,9 +66,10 @@ head: ephemeral-storage: 10Gi annotations: {} nodeSelector: + resource-type: "cpu" iam.gke.io/gke-metadata-server-enabled: "true" 
tolerations: - - key: "reserved" + - key: "on-demand" operator: "Exists" effect: "NoSchedule" affinity: {} @@ -118,7 +119,7 @@ head: name: ray-logs worker: - groupName: cpu-n2x2 + groupName: cpu-n4x2 replicas: 0 minReplicas: 0 maxReplicas: 16 @@ -130,7 +131,7 @@ worker: serviceAccountName: "KUBERNETES_SERVICE_ACCOUNT_RAY_WORKER" rayStartParams: block: 'true' - resources: '"{\"n2_cpu\": 2}"' + resources: '"{\"cpu\": 2}"' initContainerImage: 'busybox:1.28' initContainerSecurityContext: {} containerEnv: @@ -145,11 +146,16 @@ worker: requests: cpu: 2 memory: "4G" - annotations: - key: value + annotations: {} nodeSelector: - cloud.google.com/machine-family: n2 - tolerations: [] + resource-type: cpu + tolerations: + - key: "on-demand" + operator: "Exists" + effect: "NoSchedule" + - key: "spot" + operator: "Exists" + effect: "NoSchedule" affinity: {} securityContext: allowPrivilegeEscalation: false @@ -228,8 +234,7 @@ additionalWorkerGroups: nvidia.com/gpu: "2" memory: "90G" ephemeral-storage: 10Gi - annotations: - key: value + annotations: {} nodeSelector: iam.gke.io/gke-metadata-server-enabled: "true" cloud.google.com/gke-accelerator: "nvidia-l4" @@ -237,6 +242,12 @@ additionalWorkerGroups: - key: "nvidia.com/gpu" operator: "Exists" effect: "NoSchedule" + - key: "on-demand" + operator: "Exists" + effect: "NoSchedule" + - key: "spot" + operator: "Exists" + effect: "NoSchedule" affinity: {} securityContext: allowPrivilegeEscalation: false diff --git a/best-practices/ml-platform/examples/platform/playground/variables.tf b/best-practices/ml-platform/examples/platform/playground/variables.tf index 1e22df66c..c206a7f4e 100644 --- a/best-practices/ml-platform/examples/platform/playground/variables.tf +++ b/best-practices/ml-platform/examples/platform/playground/variables.tf @@ -13,7 +13,7 @@ # limitations under the License. 
variable "cluster_name" { - default = "gke-ml" + default = "mlp" description = "Name of the GKE cluster" type = string } @@ -25,7 +25,7 @@ variable "config_management_version" { } variable "configsync_repo_name" { - default = "config-sync-repo" + default = "mlp-configsync" description = "Name of the GitHub repo that will be synced to the cluster with Config sync." type = string } @@ -52,37 +52,37 @@ variable "env" { type = set(string) } -variable "github_email" { - description = "GitHub user email." +variable "git_namespace" { + description = "The namespace of the git repository" type = string validation { - condition = var.github_email != "YOUR_GITHUB_EMAIL" - error_message = "'github_email' was not set, please set the value in the mlp.auto.tfvars file" + condition = var.git_namespace != "YOUR_GIT_NAMESPACE" + error_message = "'git_namespace' was not set, please set the value in the mlp.auto.tfvars file" } } -variable "github_org" { - description = "GitHub org." +variable "git_token" { + description = "The token with permissions to create the project/repository in the namespace." type = string - - validation { - condition = var.github_org != "YOUR_GITHUB_ORG" - error_message = "'github_org' was not set, please set the value in the mlp.auto.tfvars file" - } } -variable "github_token" { - description = "GitHub token. It is a token with write permissions as it will create a repo in the GitHub org." +variable "git_user_email" { + description = "The user email to configure for git" type = string + + validation { + condition = var.git_user_email != "YOUR_GIT_USER_EMAIL" + error_message = "'git_user_email' was not set, please set the value in the mlp.auto.tfvars file" + } } -variable "github_user" { - description = "GitHub user name." 
+variable "git_user_name" { + description = "The user name to configure for git" type = string validation { - condition = var.github_user != "YOUR_GITHUB_USER" + condition = var.git_user_name != "YOUR_GIT_USER" error_message = "'github_user' was not set, please set the value in the mlp.auto.tfvars file" } } diff --git a/best-practices/ml-platform/examples/platform/playground/versions.tf b/best-practices/ml-platform/examples/platform/playground/versions.tf index 103558893..350587648 100644 --- a/best-practices/ml-platform/examples/platform/playground/versions.tf +++ b/best-practices/ml-platform/examples/platform/playground/versions.tf @@ -18,15 +18,19 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = "5.19.0" + version = "5.35.0" } google-beta = { source = "hashicorp/google-beta" - version = "5.19.0" + version = "5.35.0" } kubernetes = { source = "hashicorp/kubernetes" - version = "2.29.0" + version = "2.31.0" + } + local = { + source = "hashicorp/local" + version = "2.5.1" } null = { source = "hashicorp/null" diff --git a/best-practices/ml-platform/examples/use-case/ray/dataprocessing/.gitignore b/best-practices/ml-platform/examples/use-case/ray/dataprocessing/.gitignore new file mode 100644 index 000000000..3aa5b810c --- /dev/null +++ b/best-practices/ml-platform/examples/use-case/ray/dataprocessing/.gitignore @@ -0,0 +1,2 @@ +src/venv +src/.python-version diff --git a/best-practices/ml-platform/examples/use-case/ray/dataprocessing/DEVELOPER.md b/best-practices/ml-platform/examples/use-case/ray/dataprocessing/DEVELOPER.md new file mode 100644 index 000000000..75f281546 --- /dev/null +++ b/best-practices/ml-platform/examples/use-case/ray/dataprocessing/DEVELOPER.md @@ -0,0 +1,70 @@ +# Distributed Data Processing Developer Guide + +- Install [`pyenv`](https://github.com/pyenv/pyenv?tab=readme-ov-file#installation) + +- Install the `python` version + + ``` + pyenv install 3.10.14 + ``` + +- Clone the repository + + ``` + git clone 
https://github.com/GoogleCloudPlatform/ai-on-gke.git && \ + cd ai-on-gke + ``` + +- Change directory to the `src` directory + + ``` + cd best-practices/ml-platform/examples/use-case/ray/dataprocessing/src + ``` + +- Set the local `python` version + + ``` + pyenv local 3.10.14 + ``` + +- Create a virtual environment + + ``` + python -m venv venv + ``` + +- Activate the virtual environment + + ``` + source venv/bin/activate + ``` + +- Install the requirements + + ``` + pip install --no-cache-dir -r requirements.txt + ``` + +- Set the Ray Cluster to run locally + + ``` + export RAY_CLUSTER_HOST=local + ``` + +- Set the project for the GCS storage bucket + + ``` + gcloud config set project ${MLP_PROJECT_ID} + ``` + +- Set the GCS storage bucket name + + ``` + export PROCESSING_BUCKET= + ``` + +- Run the `preprocessing.py` script + + ``` + python ./preprocessing.py + ``` diff --git a/best-practices/ml-platform/examples/use-case/ray/dataprocessing/README.md b/best-practices/ml-platform/examples/use-case/ray/dataprocessing/README.md index c182f414c..cd72d2e92 100644 --- a/best-practices/ml-platform/examples/use-case/ray/dataprocessing/README.md +++ b/best-practices/ml-platform/examples/use-case/ray/dataprocessing/README.md @@ -40,7 +40,7 @@ The preprocessing.py file does the following: 1. Create a Cloud Storage bucket to store raw data ``` - gcloud storage buckets create gs://${PROCESSING_BUCKET} --project ${PROJECT_ID} + gcloud storage buckets create gs://${PROCESSING_BUCKET} --project ${PROJECT_ID} --uniform-bucket-level-access ``` 1. Download the raw data csv file from above and store into the bucket created in the previous step. 
@@ -61,11 +61,11 @@ The preprocessing.py file does the following: ``` gcloud projects add-iam-policy-binding ${PROJECT_ID} \ - --member "serviceAccount:wi-ml-team-ray-head@${PROJECT_ID}.iam.gserviceaccount.com" \ + --member "serviceAccount:${PROJECT_ID}.svc.id.goog[ml-team/ray-head]" \ --role roles/storage.objectViewer gcloud projects add-iam-policy-binding ${PROJECT_ID} \ - --member "serviceAccount:wi-ml-team-ray-worker@${PROJECT_ID}.iam.gserviceaccount.com" \ + --member "serviceAccount:${PROJECT_ID}.svc.id.goog[ml-team/ray-worker]" \ \ --role roles/storage.objectAdmin ``` @@ -89,7 +89,10 @@ The preprocessing.py file does the following: ``` cd src && \ - gcloud builds submit --tag ${DOCKER_IMAGE_URL} . && \ + gcloud builds submit \ + --project ${PROJECT_ID} \ + --tag ${DOCKER_IMAGE_URL} \ + . && \ cd .. ``` @@ -107,7 +110,7 @@ The preprocessing.py file does the following: 1. Get credentials for the GKE cluster ``` - gcloud container fleet memberships get-credentials ${CLUSTER_NAME} + gcloud container fleet memberships get-credentials ${CLUSTER_NAME} --project ${PROJECT_ID} ``` 1. 
Create the Job in the “ml-team” namespace using kubectl command @@ -131,4 +134,6 @@ The preprocessing.py file does the following: gcloud storage ls gs://${PROCESSING_BUCKET}/flipkart_images ``` +> For additional information about developing using this codebase see the [Developer Guide](DEVELOPER.md) + > For additional information about converting you code from a notebook to run as a Job on GKE see the [Conversion Guide](CONVERSION.md) diff --git a/best-practices/ml-platform/examples/use-case/ray/dataprocessing/job.yaml b/best-practices/ml-platform/examples/use-case/ray/dataprocessing/job.yaml index 32cff2513..666a41f90 100644 --- a/best-practices/ml-platform/examples/use-case/ray/dataprocessing/job.yaml +++ b/best-practices/ml-platform/examples/use-case/ray/dataprocessing/job.yaml @@ -4,6 +4,7 @@ metadata: name: job namespace: ml-team spec: + backoffLimit: 0 template: metadata: labels: @@ -17,6 +18,12 @@ spec: - name: "PROCESSING_BUCKET" value: #PROCESSING_BUCKET - name: "RAY_CLUSTER_HOST" - value: "ray-cluster-kuberay-head-svc.ml-team:10001" + value: ray-cluster-kuberay-head-svc.ml-team:10001 + nodeSelector: + resource-type: cpu restartPolicy: Never serviceAccountName: ray-worker + tolerations: + - effect: NoSchedule + key: on-demand + operator: Exists diff --git a/best-practices/ml-platform/examples/use-case/ray/dataprocessing/src/.gcloudignore b/best-practices/ml-platform/examples/use-case/ray/dataprocessing/src/.gcloudignore new file mode 100644 index 000000000..e474fac8c --- /dev/null +++ b/best-practices/ml-platform/examples/use-case/ray/dataprocessing/src/.gcloudignore @@ -0,0 +1,2 @@ +venv/ +.python-version diff --git a/best-practices/ml-platform/examples/use-case/ray/dataprocessing/src/Dockerfile b/best-practices/ml-platform/examples/use-case/ray/dataprocessing/src/Dockerfile index 98aed63de..8cf00db90 100644 --- a/best-practices/ml-platform/examples/use-case/ray/dataprocessing/src/Dockerfile +++ 
b/best-practices/ml-platform/examples/use-case/ray/dataprocessing/src/Dockerfile @@ -1,12 +1,18 @@ -FROM python:3.10-slim-bullseye as build-stage +FROM python:3.10.14-slim-bullseye as build-stage ENV PATH=/venv/bin:${PATH} ENV PYTHONDONTWRITEBYTECODE=1 ENV PYTHONUNBUFFERED=1 +RUN apt-get -y update \ + && pip install --upgrade pip \ + && python -m venv /venv + COPY requirements.txt /venv/requirements.txt + RUN pip install --no-cache-dir -r /venv/requirements.txt +COPY logging.conf /app/logging.conf COPY preprocessing.py /app/preprocessing.py WORKDIR /app diff --git a/best-practices/ml-platform/examples/use-case/ray/dataprocessing/src/logging.conf b/best-practices/ml-platform/examples/use-case/ray/dataprocessing/src/logging.conf new file mode 100644 index 000000000..57b9eb189 --- /dev/null +++ b/best-practices/ml-platform/examples/use-case/ray/dataprocessing/src/logging.conf @@ -0,0 +1,27 @@ +[loggers] +keys=root,preprocessing + +[handlers] +keys=consoleHandler + +[formatters] +keys=simpleFormatter + +[logger_root] +level=DEBUG +handlers=consoleHandler + +[logger_preprocessing] +level=DEBUG +handlers=consoleHandler +qualname=preprocessing +propagate=0 + +[handler_consoleHandler] +class=StreamHandler +level=DEBUG +formatter=simpleFormatter +args=(sys.stdout,) + +[formatter_simpleFormatter] +format=%(asctime)s - %(name)s - %(levelname)s - %(message)s diff --git a/best-practices/ml-platform/examples/use-case/ray/dataprocessing/src/preprocessing.py b/best-practices/ml-platform/examples/use-case/ray/dataprocessing/src/preprocessing.py index 668958c97..d17a10ce6 100644 --- a/best-practices/ml-platform/examples/use-case/ray/dataprocessing/src/preprocessing.py +++ b/best-practices/ml-platform/examples/use-case/ray/dataprocessing/src/preprocessing.py @@ -1,29 +1,36 @@ +import jsonpickle +import logging import os -import ray import pandas as pd -from typing import List -import urllib.request, urllib.error +import ray +import re +import socket +import spacy +import sys import 
time +import urllib.error +import urllib.request + from google.cloud import storage -import spacy -import jsonpickle -import re +from google.cloud.storage.retry import DEFAULT_RETRY +from typing import List IMAGE_BUCKET = os.environ['PROCESSING_BUCKET'] RAY_CLUSTER_HOST = os.environ['RAY_CLUSTER_HOST'] GCS_IMAGE_FOLDER = 'flipkart_images' -@ray.remote(resources={"n2_cpu": 1}) -def get_clean_df(df): +logging.config.fileConfig('logging.conf') +logger = logging.getLogger('preprocessing') +logger.debug(logger) +@ray.remote(resources={"cpu": 1}) +def get_clean_df(df, logger, ray_worker_node_id): + # extract image urls def extract_url(image_list: str) -> List[str]: - image_list = image_list.replace('[', '') - image_list = image_list.replace(']', '') - image_list = image_list.replace('"', '') - image_urls = image_list.split(',') - return image_urls + return image_list.replace('[', '').replace(']', '').replace('"', '').split(',') - def download_image(image_url, image_file_name, destination_blob_name): + #download the image from public url to GCS + def download_image(image_url, image_file_name, destination_blob_name, logger): storage_client = storage.Client() download_dir = '/tmp/images' @@ -31,46 +38,53 @@ def download_image(image_url, image_file_name, destination_blob_name): if not os.path.exists(download_dir): os.makedirs(download_dir) except FileExistsError as err: - print(f"Directory '{download_dir}' already exists") + logger.warning(f"Directory '{download_dir}' already exists") - image_found_flag = False try: download_file = f"{download_dir}/{image_file_name}" + socket.setdefaulttimeout(10) urllib.request.urlretrieve(image_url, download_file) + bucket = storage_client.bucket(IMAGE_BUCKET) blob = bucket.blob(destination_blob_name) - blob.upload_from_filename(download_file) - print( - f"File {image_file_name} uploaded to {destination_blob_name}." 
- ) + blob.upload_from_filename(download_file, retry=DEFAULT_RETRY) + logger.info(f"ray_worker_node_id:{ray_worker_node_id} File {image_file_name} uploaded to {destination_blob_name}") os.remove(download_file) - - image_found_flag = True - except urllib.error.HTTPError: - print("HTTPError exception") - except urllib.error.URLError: - print("URLError exception") - except: - print("Unhandled exception") + return True + except TimeoutError as err: + logger.warning(f"ray_worker_node_id:{ray_worker_node_id} Image '{image_url}' request timeout") + except urllib.error.HTTPError as err: + if err.code == 404: + logger.warning(f"ray_worker_node_id:{ray_worker_node_id} Image '{image_url}' not found") + elif err.code == 504: + logger.warning(f"ray_worker_node_id:{ray_worker_node_id} Image '{image_url}' gateway timeout") + else: + logger.error(f"ray_worker_node_id:{ray_worker_node_id} Unhandled HTTPError exception: {err}") + except urllib.error.URLError as err: + logger.error(f"ray_worker_node_id:{ray_worker_node_id} URLError exception: {err}") + except Exception as err: + logger.error(f"ray_worker_node_id:{ray_worker_node_id} Unhandled exception: {err}") raise - return image_found_flag + return False - def prep_product_desc(df): - # Cleaning the description text + # Cleaning the description text + def prep_product_desc(df, logger): spacy.cli.download("en_core_web_sm") model = spacy.load("en_core_web_sm") - def parse_nlp_description(description) -> str: if not pd.isna(description): - doc = model(description.lower()) - lemmas = [] - for token in doc: - if token.lemma_ not in lemmas and not token.is_stop and token.is_alpha: - lemmas.append(token.lemma_) - return ' '.join(lemmas) + try: + doc = model(description.lower()) + lemmas = [] + for token in doc: + if token.lemma_ not in lemmas and not token.is_stop and token.is_alpha: + lemmas.append(token.lemma_) + return ' '.join(lemmas) + except: + logger.error("Unable to load spacy model") df['description'] = 
df['description'].apply(parse_nlp_description) return df @@ -96,30 +110,33 @@ def parse_attributes(specification: str): json_string = jsonpickle.encode(out) return json_string - def get_product_image(df): + def get_product_image(df, logger): products_with_no_image_count = 0 products_with_no_image = [] gcs_image_url = [] + image_found_flag = False for id, image_list in zip(df['uniq_id'], df['image']): if pd.isnull(image_list): # No image url - # print("WARNING: No image url: product ", id) + logger.warning(f"No image url for product {id}") products_with_no_image_count += 1 products_with_no_image.append(id) gcs_image_url.append(None) continue image_urls = extract_url(image_list) for index in range(len(image_urls)): - image_url = image_urls[index] - image_file_name = '{}_{}.jpg'.format(id, index) - destination_blob_name = GCS_IMAGE_FOLDER + '/' + image_file_name - image_found_flag = download_image(image_url, image_file_name, destination_blob_name) + image_url = image_urls[index].strip() + image_file_name = f"{id}_{index}.jpg" + destination_blob_name = f"{GCS_IMAGE_FOLDER}/{id}_{index}.jpg" + image_found_flag = download_image( + image_url, image_file_name, destination_blob_name, logger) if image_found_flag: - gcs_image_url.append('gs://' + IMAGE_BUCKET + '/' + destination_blob_name) + gcs_image_url.append( + 'gs://' + IMAGE_BUCKET + '/' + destination_blob_name) break if not image_found_flag: - # print("WARNING: No image: product ", id) + logger.warning(f"No image found for product {id}") products_with_no_image_count += 1 products_with_no_image.append(id) gcs_image_url.append(None) @@ -128,11 +145,12 @@ def get_product_image(df): gcs_image_loc = pd.DataFrame(gcs_image_url, index=df.index) gcs_image_loc.columns = ["image_uri"] df_with_gcs_image_uri = pd.concat([df, gcs_image_loc], axis=1) - return df_with_gcs_image_uri + return df_with_gcs_image_uri - df_with_gcs_image_uri = get_product_image(df) - df_with_desc = prep_product_desc(df_with_gcs_image_uri) - 
df_with_desc['attributes'] = df_with_desc['product_specifications'].apply(parse_attributes) + df_with_gcs_image_uri = get_product_image(df, logger) + df_with_desc = prep_product_desc(df_with_gcs_image_uri, logger) + df_with_desc['attributes'] = df_with_desc['product_specifications'].apply( + parse_attributes) return df_with_desc @@ -147,26 +165,68 @@ def split_dataframe(df, chunk_size=199): # This function invokes ray task def run_remote(): - df = pd.read_csv('gs://'+IMAGE_BUCKET+'/flipkart_raw_dataset/flipkart_com-ecommerce_sample.csv') - df = df[['uniq_id','product_name','description','brand','image','product_specifications']] - runtime_env = {"pip": ["google-cloud-storage==2.16.0", "spacy==3.7.4", "jsonpickle==3.0.3"]} - ray.init("ray://"+RAY_CLUSTER_HOST, runtime_env=runtime_env) - print("STARTED") - start_time = time.time() + + #Read raw dataset from GCS + df = pd.read_csv( + f"gs://{IMAGE_BUCKET}/flipkart_raw_dataset/flipkart_com-ecommerce_sample.csv") + df = df[['uniq_id', + 'product_name', + 'description', + 'brand', + 'image', + 'product_specifications']] + + #Ray runtime env + runtime_env = {"pip": ["google-cloud-storage==2.16.0", + "spacy==3.7.4", + "jsonpickle==3.0.3"], + "env_vars": {"PIP_NO_CACHE_DIR": "1", + "PIP_DISABLE_PIP_VERSION_CHECK": "1"}} + + # Initiate a driver: start and connect with Ray cluster + if RAY_CLUSTER_HOST != "local": + ClientContext = ray.init(f"ray://{RAY_CLUSTER_HOST}", runtime_env=runtime_env) + logger.debug(ClientContext) + + # Get the ID of the node where the driver process is running + driver_process_node_id = ray.get_runtime_context().get_node_id() #HEX + logger.debug(f"ray_driver_node_id={driver_process_node_id}") + + logger.debug(ray.cluster_resources()) + else: + RayContext = ray.init() + logger.debug(RayContext) + + #Chunk the dataset res = split_dataframe(df) - results = ray.get([get_clean_df.remote(res[i]) for i in range(len(res))]) - print("FINISHED IN ") + + logger.debug('Data Preparation started') + start_time = 
time.time() + results = ray.get([get_clean_df.remote(res[i], logger, i) for i in range(len(res))]) duration = time.time() - start_time - print(duration) + logger.debug(f"Data Preparation finished in {duration} seconds") + + #Disconnect the worker, and terminate processes started by ray.init() ray.shutdown() + + #Store the preprocessed data into GCS result_df = pd.concat(results, axis=0, ignore_index=True) - result_df.to_csv('gs://'+IMAGE_BUCKET+'/flipkart_preprocessed_dataset/flipkart.csv', index=False) + result_df.to_csv('gs://'+IMAGE_BUCKET + + '/flipkart_preprocessed_dataset/flipkart.csv', index=False) return result_df def main(): + logger.info('Started') + + logger.debug(f"RAY_CLUSTER_HOST={RAY_CLUSTER_HOST}") + logger.debug(f"IMAGE_BUCKET={IMAGE_BUCKET}") + logger.debug(f"GCS_IMAGE_FOLDER={GCS_IMAGE_FOLDER}") + clean_df = run_remote() + logger.info('Finished') + if __name__ == "__main__": """ This is executed when run from the command line """ diff --git a/best-practices/ml-platform/terraform/features/initialize/initialize.auto.tfvars b/best-practices/ml-platform/terraform/features/initialize/initialize.auto.tfvars index 8ef26e4b0..fe439db50 100644 --- a/best-practices/ml-platform/terraform/features/initialize/initialize.auto.tfvars +++ b/best-practices/ml-platform/terraform/features/initialize/initialize.auto.tfvars @@ -1,4 +1,5 @@ -environment_name = "dev" +environment_name = "dev" +iap_support_email = "" project = { billing_account_id = "" folder_id = "" diff --git a/best-practices/ml-platform/terraform/features/initialize/main.tf b/best-practices/ml-platform/terraform/features/initialize/main.tf index bd9f5e55c..3ce24e261 100644 --- a/best-practices/ml-platform/terraform/features/initialize/main.tf +++ b/best-practices/ml-platform/terraform/features/initialize/main.tf @@ -35,7 +35,6 @@ resource "google_project" "environment" { project_id = "${local.project_id_prefix}-${random_string.project_id_suffix.result}" } - resource "google_storage_bucket" "mlp" { 
force_destroy = false location = var.storage_bucket_location @@ -48,6 +47,19 @@ resource "google_storage_bucket" "mlp" { } } +resource "google_project_service" "iap_googleapis_com" { + disable_dependent_services = true + disable_on_destroy = true + project = google_project.environment.project_id + service = "iap.googleapis.com" +} + +resource "google_iap_brand" "project_brand" { + support_email = var.iap_support_email + application_title = "IAP Secured Application" + project = google_project_service.iap_googleapis_com.project +} + resource "null_resource" "write_environment_name" { triggers = { md5 = var.environment_name diff --git a/best-practices/ml-platform/terraform/features/initialize/variables.tf b/best-practices/ml-platform/terraform/features/initialize/variables.tf index eff686657..12dbec234 100644 --- a/best-practices/ml-platform/terraform/features/initialize/variables.tf +++ b/best-practices/ml-platform/terraform/features/initialize/variables.tf @@ -18,6 +18,11 @@ variable "environment_name" { type = string } +variable "iap_support_email" { + description = "Name of the platform type" + type = string +} + variable "platform_type" { default = "playground" description = "Name of the platform type" diff --git a/best-practices/ml-platform/terraform/features/initialize/versions.tf b/best-practices/ml-platform/terraform/features/initialize/versions.tf index 9a0bdab15..af552ee09 100644 --- a/best-practices/ml-platform/terraform/features/initialize/versions.tf +++ b/best-practices/ml-platform/terraform/features/initialize/versions.tf @@ -16,7 +16,7 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = "5.19.0" + version = "5.35.0" } null = { source = "hashicorp/null" diff --git a/best-practices/ml-platform/terraform/modules/cloud-nat/versions.tf b/best-practices/ml-platform/terraform/modules/cloud-nat/versions.tf index a2baa7bfd..fa4e230de 100644 --- a/best-practices/ml-platform/terraform/modules/cloud-nat/versions.tf +++ 
b/best-practices/ml-platform/terraform/modules/cloud-nat/versions.tf @@ -16,15 +16,15 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = "5.19.0" + version = "5.35.0" } google-beta = { source = "hashicorp/google-beta" - version = "5.19.0" + version = "5.35.0" } random = { source = "hashicorp/random" - version = "3.6.0" + version = "3.6.2" } } } diff --git a/best-practices/ml-platform/terraform/modules/cluster/gke.tf b/best-practices/ml-platform/terraform/modules/cluster/gke.tf index c87df1445..e046e6971 100644 --- a/best-practices/ml-platform/terraform/modules/cluster/gke.tf +++ b/best-practices/ml-platform/terraform/modules/cluster/gke.tf @@ -134,6 +134,12 @@ resource "google_container_cluster" "mlp" { ip_allocation_policy { } + lifecycle { + ignore_changes = [ + node_config[0] + ] + } + logging_config { enable_components = [ "APISERVER", diff --git a/best-practices/ml-platform/terraform/modules/cluster/versions.tf b/best-practices/ml-platform/terraform/modules/cluster/versions.tf index b19f861ad..f0dfc493e 100644 --- a/best-practices/ml-platform/terraform/modules/cluster/versions.tf +++ b/best-practices/ml-platform/terraform/modules/cluster/versions.tf @@ -16,11 +16,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = "5.19.0" + version = "5.35.0" } google-beta = { source = "hashicorp/google-beta" - version = "5.19.0" + version = "5.35.0" } } } diff --git a/best-practices/ml-platform/terraform/modules/config_controller/main.tf b/best-practices/ml-platform/terraform/modules/config_controller/main.tf new file mode 100644 index 000000000..48d00d81d --- /dev/null +++ b/best-practices/ml-platform/terraform/modules/config_controller/main.tf @@ -0,0 +1,133 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +locals { + cluster_name = "krmapihost-${var.name}" + kubeconfig_file_name = "${var.project_id}_${local.cluster_name}" + kubeconfig_file_path = "${var.kubeconfig_directory}/${local.kubeconfig_file_name}" +} + +data "google_project" "project" { + project_id = var.project_id +} + +resource "google_project_service" "anthos_googleapis_com" { + disable_dependent_services = false + disable_on_destroy = false + project = data.google_project.project.project_id + service = "anthos.googleapis.com" +} + +resource "google_project_service" "cloudresourcemanager_googleapis_com" { + disable_dependent_services = false + disable_on_destroy = false + project = data.google_project.project.project_id + service = "cloudresourcemanager.googleapis.com" +} + +resource "google_project_service" "container_googleapis_com" { + disable_dependent_services = false + disable_on_destroy = false + project = data.google_project.project.project_id + service = "container.googleapis.com" +} + +resource "google_project_service" "krmapihosting_googleapis_com" { + disable_dependent_services = false + disable_on_destroy = false + project = data.google_project.project.project_id + service = "krmapihosting.googleapis.com" +} + +resource "google_project_service" "serviceusage_googleapis_com" { + disable_dependent_services = false + disable_on_destroy = false + project = data.google_project.project.project_id + service = "serviceusage.googleapis.com" +} + +resource "null_resource" "config_controller" { + provisioner "local-exec" { + command = "scripts/gcloud_create.sh" + environment = { + 
FULL_MANAGEMENT = self.triggers.FULL_MANAGEMENT + LOCATION = self.triggers.LOCATION + NAME = self.triggers.NAME + NETWORK = self.triggers.NETWORK + PROJECT_ID = self.triggers.PROJECT_ID + SUBNET = self.triggers.SUBNET + } + interpreter = ["bash", "-c"] + working_dir = path.module + } + + provisioner "local-exec" { + command = "scripts/gcloud_delete.sh" + environment = { + LOCATION = self.triggers.LOCATION + NAME = self.triggers.NAME + PROJECT_ID = self.triggers.PROJECT_ID + } + interpreter = ["bash", "-c"] + when = destroy + working_dir = path.module + } + + triggers = { + FULL_MANAGEMENT = var.full_management + LOCATION = var.location + NAME = var.name + NETWORK = var.network + PROJECT_ID = var.project_id + SUBNET = var.subnet + } +} + +data "google_container_cluster" "config_controller" { + depends_on = [ + null_resource.config_controller + ] + + location = var.location + name = local.cluster_name + project = data.google_project.project.project_id +} + +resource "null_resource" "kubeconfig" { + provisioner "local-exec" { + command = </dev/null 2>&1 + pwd -P +)" + +command=(gcloud anthos config controller create "${NAME}") + +if [ ! -z "${FULL_MANAGEMENT}" ]; then + command+=(--full-management) +fi + +if [ ! -z "${LOCATION}" ]; then + command+=(--location=${LOCATION}) +fi + +if [ ! -z "${NETWORK}" ]; then + command+=(--network=${NETWORK}) +fi + +if [ ! -z "${PROJECT_ID}" ]; then + command+=(--project=${PROJECT_ID}) +fi + +if [ ! 
-z "${SUBNET}" ]; then + command+=(--subnet=${SUBNET}) +fi + +echo "${command[@]}" +"${command[@]}" diff --git a/best-practices/ml-platform/terraform/modules/config_controller/scripts/gcloud_delete.sh b/best-practices/ml-platform/terraform/modules/config_controller/scripts/gcloud_delete.sh new file mode 100755 index 000000000..dd61ac07a --- /dev/null +++ b/best-practices/ml-platform/terraform/modules/config_controller/scripts/gcloud_delete.sh @@ -0,0 +1,36 @@ +#!/bin/bash +# +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +set -u + +SCRIPT_PATH="$( + cd "$(dirname "$0")" >/dev/null 2>&1 + pwd -P +)" + +command=(gcloud anthos config controller delete "${NAME}") + +if [ ! -z "${LOCATION}" ]; then + command+=(--location=${LOCATION}) +fi + +if [ ! -z "${PROJECT_ID}" ]; then + command+=(--project=${PROJECT_ID}) +fi + +command+=(--quiet) + +echo "${command[@]}" +"${command[@]}" diff --git a/best-practices/ml-platform/terraform/modules/config_controller/variables.tf b/best-practices/ml-platform/terraform/modules/config_controller/variables.tf new file mode 100644 index 000000000..bf293f27c --- /dev/null +++ b/best-practices/ml-platform/terraform/modules/config_controller/variables.tf @@ -0,0 +1,48 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +variable "full_management" { + description = "Use a fully managed (Autopilot) cluster" + type = bool +} + +variable "kubeconfig_directory" { + description = "Path to store the kubeconfig" + type = string +} + +variable "location" { + description = "Location of the config controller cluster" + type = string +} + +variable "project_id" { + description = "Project ID for the config controller cluster" + type = string +} + +variable "name" { + description = "Name of the config controller cluster" + type = string +} + +variable "network" { + description = "Existing VPC Network to use for the config controller cluster and nodes" + type = string +} + +variable "subnet" { + description = "Specifies the subnet that the VM instances are a part of" + type = string +} diff --git a/best-practices/ml-platform/terraform/modules/config_controller/versions.tf b/best-practices/ml-platform/terraform/modules/config_controller/versions.tf new file mode 100644 index 000000000..49012fb46 --- /dev/null +++ b/best-practices/ml-platform/terraform/modules/config_controller/versions.tf @@ -0,0 +1,24 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +terraform { + required_version = ">= 1.5.7" + + required_providers { + null = { + source = "hashicorp/null" + version = "3.2.2" + } + } +} diff --git a/best-practices/ml-platform/terraform/modules/github_repository/versions.tf b/best-practices/ml-platform/terraform/modules/github_repository/versions.tf index 6026fcd8b..9f0bd67dd 100644 --- a/best-practices/ml-platform/terraform/modules/github_repository/versions.tf +++ b/best-practices/ml-platform/terraform/modules/github_repository/versions.tf @@ -18,7 +18,7 @@ terraform { required_providers { github = { source = "integrations/github" - version = "6.2.1" + version = "6.2.2" } } } diff --git a/best-practices/ml-platform/terraform/modules/gitlab_project/main.tf b/best-practices/ml-platform/terraform/modules/gitlab_project/main.tf new file mode 100644 index 000000000..7d98b5ea9 --- /dev/null +++ b/best-practices/ml-platform/terraform/modules/gitlab_project/main.tf @@ -0,0 +1,33 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +data "gitlab_group" "group" { + full_path = var.group_full_path +} + +resource "gitlab_project" "project" { + name = var.project_name + default_branch = var.branches.default + description = var.description + namespace_id = data.gitlab_group.group.id + visibility_level = var.visibility_level +} + +resource "gitlab_branch" "branch" { + for_each = toset(setsubtract(var.branches.names, ["main"])) + + name = each.value + project = gitlab_project.project.id + ref = "main" +} diff --git a/best-practices/ml-platform/terraform/modules/gitlab_project/outputs.tf b/best-practices/ml-platform/terraform/modules/gitlab_project/outputs.tf new file mode 100644 index 000000000..18d8ca6e5 --- /dev/null +++ b/best-practices/ml-platform/terraform/modules/gitlab_project/outputs.tf @@ -0,0 +1,42 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +output "branch_names" { + value = var.branches.names +} + +output "branches" { + value = var.branches.names +} + +output "default_branch" { + value = var.branches.default +} + +output "full_name" { + value = gitlab_project.project.path_with_namespace +} + +output "html_url" { + value = gitlab_project.project.web_url +} + +output "http_clone_url" { + value = gitlab_project.project.http_url_to_repo +} + +output "repo" { + sensitive = true + value = gitlab_project.project +} diff --git a/best-practices/ml-platform/terraform/modules/gitlab_project/variables.tf b/best-practices/ml-platform/terraform/modules/gitlab_project/variables.tf new file mode 100644 index 000000000..e722720c9 --- /dev/null +++ b/best-practices/ml-platform/terraform/modules/gitlab_project/variables.tf @@ -0,0 +1,63 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +variable "branches" { + default = { + default = "main" + names = ["main"] + } + description = "List of branches to create in the repository." + type = object({ + default = string + names = list(string), + }) + + validation { + condition = contains(var.branches.names, var.branches.default) + error_message = "'branches.default' must be in 'branches.names'" + } +} + +variable "description" { + default = null + description = "A description of the project." + type = string +} + +variable "group_full_path" { + description = "The full path of the group." 
+ type = string +} + +variable "project_name" { + description = "The name of the project." + type = string +} + +variable "token" { + description = "The OAuth2 Token, Project, Group, Personal Access Token or CI Job Token used to connect to GitLab." + sensitive = true + type = string +} + +variable "visibility_level" { + default = "private" + description = "The visibility level of the project." + type = string + + validation { + condition = contains(["internal", "private", "public"], var.visibility_level) + error_message = "'visibility_level' must be 'internal', 'private', or 'public'" + } +} diff --git a/best-practices/ml-platform/terraform/modules/gitlab_project/versions.tf b/best-practices/ml-platform/terraform/modules/gitlab_project/versions.tf new file mode 100644 index 000000000..837448b62 --- /dev/null +++ b/best-practices/ml-platform/terraform/modules/gitlab_project/versions.tf @@ -0,0 +1,28 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +terraform { + required_version = ">= 1.5.7" + + required_providers { + gitlab = { + source = "gitlabhq/gitlab" + version = "17.0.1" + } + } +} + +provider "gitlab" { + token = var.token +} diff --git a/best-practices/ml-platform/terraform/modules/network/versions.tf b/best-practices/ml-platform/terraform/modules/network/versions.tf index 466fd04d7..b75f44891 100644 --- a/best-practices/ml-platform/terraform/modules/network/versions.tf +++ b/best-practices/ml-platform/terraform/modules/network/versions.tf @@ -16,7 +16,7 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = "5.19.0" + version = "5.35.0" } } } diff --git a/best-practices/ml-platform/terraform/modules/node-pools/versions.tf b/best-practices/ml-platform/terraform/modules/node-pools/versions.tf index b19f861ad..f0dfc493e 100644 --- a/best-practices/ml-platform/terraform/modules/node-pools/versions.tf +++ b/best-practices/ml-platform/terraform/modules/node-pools/versions.tf @@ -16,11 +16,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = "5.19.0" + version = "5.35.0" } google-beta = { source = "hashicorp/google-beta" - version = "5.19.0" + version = "5.35.0" } } } diff --git a/best-practices/ml-platform/terraform/modules/vm-reservations/versions.tf b/best-practices/ml-platform/terraform/modules/vm-reservations/versions.tf index b19f861ad..f0dfc493e 100644 --- a/best-practices/ml-platform/terraform/modules/vm-reservations/versions.tf +++ b/best-practices/ml-platform/terraform/modules/vm-reservations/versions.tf @@ -16,11 +16,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = "5.19.0" + version = "5.35.0" } google-beta = { source = "hashicorp/google-beta" - version = "5.19.0" + version = "5.35.0" } } } diff --git a/best-practices/ml-platform/test/README.md b/best-practices/ml-platform/test/README.md new file mode 100644 index 000000000..ddcb6fc8b --- /dev/null +++ 
b/best-practices/ml-platform/test/README.md @@ -0,0 +1,166 @@ +# Testing + +## Setup + +- Clone the repository and change directory to the ml-platform directory + + ``` + git clone https://github.com/GoogleCloudPlatform/ai-on-gke && \ + cd ai-on-gke/best-practices/ml-platform + ``` + +- Set environment variables + + ``` + export MLP_BASE_DIR=$(pwd) && \ + echo "export MLP_BASE_DIR=${MLP_BASE_DIR}" >> ${HOME}/.bashrc + ``` + +- Configure GitHub credentials + + ``` + # Create a secure directory + mkdir -p ${HOME}/secrets/ + chmod 700 ${HOME}/secrets + + # Create a secure file + touch ${HOME}/secrets/mlp-github-token + chmod 600 ${HOME}/secrets/mlp-github-token + + # Put your access token in the secure file using your preferred editor + nano ${HOME}/secrets/mlp-github-token + ``` + +- Configure GitLab credentials + + ``` + # Create a secure directory + mkdir -p ${HOME}/secrets/ + chmod 700 ${HOME}/secrets + + # Create a secure file + touch ${HOME}/secrets/mlp-gitlab-token + chmod 600 ${HOME}/secrets/mlp-gitlab-token + + # Put your access token in the secure file using your preferred editor + nano ${HOME}/secrets/mlp-gitlab-token + ``` + +- Configure `kaggle` CLI credentials + + ``` + # Create a secure directory + mkdir -p ${HOME}/.kaggle + chmod 700 ${HOME}/.kaggle + + # Create a secure file + touch ${HOME}/.kaggle/kaggle.json + chmod 600 ${HOME}/.kaggle/kaggle.json + + # Put your API token in the secure file using your preferred editor + nano ${HOME}/.kaggle/kaggle.json + ``` + +## End to end tests + +### Playground BYOP GitHub Dataprocessing + +This test script will stand up the `playground` platform using GitHub for the Config Sync repository in an existing project, run the `dataprocessing` job, teardown the platform, and cleanup the environment. 
+ +- Set the GitHub organization or user namespace + + ``` + export MLP_GIT_NAMESPACE= + ``` + +- Set GitHub user name + + ``` + export MLP_GIT_USER_NAME= + ``` + +- Set GitHub email address + + ``` + export MLP_GIT_USER_EMAIL= + ``` + +- Set Project ID + + ``` + export MLP_PROJECT_ID= + ``` + +- Override IAP domain, if required. Defaults to the domain of the active `gcloud` user account(`gcloud auth list --filter=status:ACTIVE --format="value(account)" | awk -F@ '{print $2}'`) + + ``` + export MLP_IAP_DOMAIN= + ``` + +- Ensure the OAuth consent screen for IAP is configured. + + ``` + gcloud iap oauth-brands list --project=${MLP_PROJECT_ID} + ``` + +- Execute the script + + ``` + ${MLP_BASE_DIR}/test/scripts/e2e/playground_byop_gh_dataprocessing.sh + ``` + +### Playground New Project GitHub Dataprocessing + +This test script will initialize a new project, stand up the `playground` platform using GitHub for the Config Sync repository in the new project, run the `dataprocessing` job, and delete the project. + +- Set the GitHub organization or user namespace + + ``` + export MLP_GIT_NAMESPACE= + ``` + +- Set GitHub user name + + ``` + export MLP_GIT_USER_NAME= + ``` + +- Set GitHub email address + + ``` + export MLP_GIT_USER_EMAIL= + ``` + +- Set the billing account ID to assign to the new project + + ``` + export MLP_BILLING_ACCOUNT_ID= + ``` + +- Set the folder ID **OR** organization ID to use for the new project + + ``` + export MLP_FOLDER_ID= + ``` + + **-OR-** + + ``` + export MLP_ORG_ID= + ``` + +- Override IAP domain, if required. 
Defaults to the domain of the active `gcloud` user account(`gcloud auth list --filter=status:ACTIVE --format="value(account)" | awk -F@ '{print $2}'`) + + ``` + export MLP_IAP_DOMAIN= + ``` + +- Execute the script + + ``` + ${MLP_BASE_DIR}/test/scripts/e2e/playground_new_gh_dataprocessing.sh + ``` + +## Unit tests + +**WIP** diff --git a/best-practices/ml-platform/test/log/.gitkeep b/best-practices/ml-platform/test/log/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/best-practices/ml-platform/test/scripts/e2e/playground_byop_gh_dataprocessing.sh b/best-practices/ml-platform/test/scripts/e2e/playground_byop_gh_dataprocessing.sh new file mode 100755 index 000000000..d46490d5a --- /dev/null +++ b/best-practices/ml-platform/test/scripts/e2e/playground_byop_gh_dataprocessing.sh @@ -0,0 +1,76 @@ +#!/usr/bin/env bash + +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +SCRIPT_PATH="$( + cd "$(dirname "$0")" >/dev/null 2>&1 + pwd -P +)" +SCRIPTS_DIR=$(realpath ${SCRIPT_PATH}/..) 
+ +export MLP_TYPE="playground" +source ${SCRIPTS_DIR}/helpers/include.sh + +echo_title "Preparing the environment" +source ${SCRIPTS_DIR}/helpers/byop_env.sh +source ${SCRIPTS_DIR}/helpers/gh_env.sh +source ${SCRIPTS_DIR}/helpers/dataprocessing_env.sh + +# terraform apply +############################################################################### +if lock_is_set "terraform_apply"; then + echo_bold "Terraform apply previously completed successfully" +else + source ${SCRIPTS_DIR}/helpers/${MLP_TYPE}_env.sh + export TF_VAR_git_token=$(tr --delete '\n' <${HOME}/secrets/mlp-github-token) + source ${SCRIPTS_DIR}/helpers/terraform_apply.sh + lock_set "terraform_apply" +fi + +# dataprocessing +############################################################################### +if lock_is_set "dataprocessing"; then + echo_bold "Dataprocessing previously completed successfully" +else + source ${SCRIPTS_DIR}/helpers/dataprocessing.sh + lock_set "dataprocessing" +fi + +# terraform destroy +############################################################################### +if lock_is_set "terraform_destroy"; then + echo_bold "Terraform destroy previously completed successfully" +else + export TF_VAR_git_token=$(tr --delete '\n' <${HOME}/secrets/mlp-github-token) + source ${SCRIPTS_DIR}/helpers/terraform_destroy.sh + lock_set "terraform_destroy" +fi + +# cleanup +############################################################################### +echo_title "Cleaning up the environment" + +source ${SCRIPTS_DIR}/helpers/dataprocessing_cleanup.sh +source ${SCRIPTS_DIR}/helpers/byop_playground_cleanup.sh + +total_runtime "script" + +check_local_error_exit_on_error + +lock_unset "dataprocessing" +lock_unset "terraform_apply" +lock_unset "terraform_destroy" + +check_local_error_and_exit diff --git a/best-practices/ml-platform/test/scripts/e2e/playground_new_gh_dataprocessing.sh b/best-practices/ml-platform/test/scripts/e2e/playground_new_gh_dataprocessing.sh new file mode 100755 index 
000000000..73b73a4cd --- /dev/null +++ b/best-practices/ml-platform/test/scripts/e2e/playground_new_gh_dataprocessing.sh @@ -0,0 +1,102 @@ +#!/usr/bin/env bash + +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +SCRIPT_PATH="$( + cd "$(dirname "$0")" >/dev/null 2>&1 + pwd -P +)" +SCRIPTS_DIR=$(realpath ${SCRIPT_PATH}/..) + +export MLP_TYPE="playground" +source ${SCRIPTS_DIR}/helpers/include.sh + +echo_title "Preparing the environment" + +source ${SCRIPTS_DIR}/helpers/new_gh_env.sh + +# feature initialize apply +############################################################################### +if lock_is_set "features_initialize_apply"; then + echo_bold "Feature initialize apply previously completed successfully" +else + source ${SCRIPTS_DIR}/helpers/feature_initialize_env.sh + source ${SCRIPTS_DIR}/helpers/feature_initialize_apply.sh + lock_set "features_initialize_apply" +fi + +export MLP_PROJECT_ID=$(grep environment_project_id ${MLP_TYPE_BASE_DIR}/mlp.auto.tfvars | awk -F"=" '{print $2}' | xargs) + +source ${SCRIPTS_DIR}/helpers/dataprocessing_env.sh + +# terraform apply +############################################################################### +if lock_is_set "terraform_apply"; then + echo_bold "Terraform apply previously completed successfully" +else + source ${SCRIPTS_DIR}/helpers/${MLP_TYPE}_env.sh + + export TF_VAR_git_token=$(tr --delete '\n' <${HOME}/secrets/mlp-github-token) + source ${SCRIPTS_DIR}/helpers/terraform_apply.sh 
+ lock_set "terraform_apply" +fi + +# dataprocessing +############################################################################### +if lock_is_set "dataprocessing"; then + echo_bold "Dataprocessing previously completed successfully" +else + source ${SCRIPTS_DIR}/helpers/dataprocessing.sh + lock_set "dataprocessing" +fi + +# terraform destroy +############################################################################### + +if lock_is_set "terraform_destroy"; then + echo_bold "Terraform destroy previously completed successfully" +else + export TF_VAR_git_token=$(tr --delete '\n' <${HOME}/secrets/mlp-github-token) + source ${SCRIPTS_DIR}/helpers/terraform_destroy.sh + lock_set "terraform_destroy" +fi + +# feature initialize destroy +############################################################################### + +if lock_is_set "features_initialize_destroy"; then + echo_bold "Feature initialize destroy previously completed successfully" +else + source ${SCRIPTS_DIR}/helpers/feature_initialize_destroy.sh + lock_set "features_initialize_destroy" +fi + +# cleanup +############################################################################### +echo_title "Cleaning up the environment" + +source ${SCRIPTS_DIR}/helpers/new_gh_playground_cleanup.sh + +total_runtime "script" + +check_local_error_exit_on_error + +lock_unset "dataprocessing" +lock_unset "features_initialize_apply" +lock_unset "features_initialize_destroy" +lock_unset "terraform_apply" +lock_unset "terraform_destroy" + +check_local_error_and_exit diff --git a/best-practices/ml-platform/test/scripts/helpers/byop_env.sh b/best-practices/ml-platform/test/scripts/helpers/byop_env.sh new file mode 100755 index 000000000..d8ed0dbd8 --- /dev/null +++ b/best-practices/ml-platform/test/scripts/helpers/byop_env.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash + +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the 
License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +echo_title "Checking BYOP required configuration" + +if [ -z "${MLP_PROJECT_ID}" ]; then + echo "MLP_PROJECT_ID is not set!" + exit 7 +fi + +export MLP_ENVIRONMENT_NAME=${MLP_ENVIRONMENT_NAME:-$(grep environment_name ${MLP_TYPE_BASE_DIR}/mlp.auto.tfvars | awk -F"=" '{print $2}' | xargs)} + +export MLP_STATE_BUCKET="${MLP_PROJECT_ID}-${MLP_ENVIRONMENT_NAME}-terraform" + +export TF_DATA_DIR=".terraform-${MLP_PROJECT_ID}-${MLP_ENVIRONMENT_NAME}" + +echo_title "Applying terraform configuration" + +sed -i "s/^\([[:blank:]]*bucket[[:blank:]]*=\).*$/\1 \"${MLP_STATE_BUCKET}\"/" ${MLP_TYPE_BASE_DIR}/backend.tf +sed -i "s/^\([[:blank:]]*environment_name[[:blank:]]*=\).*$/\1 \"${MLP_ENVIRONMENT_NAME}\"/" ${MLP_TYPE_BASE_DIR}/mlp.auto.tfvars +sed -i "s/^\([[:blank:]]*environment_project_id[[:blank:]]*=\).*$/\1 \"${MLP_PROJECT_ID}\"/" ${MLP_TYPE_BASE_DIR}/mlp.auto.tfvars diff --git a/best-practices/ml-platform/test/scripts/helpers/byop_playground_cleanup.sh b/best-practices/ml-platform/test/scripts/helpers/byop_playground_cleanup.sh new file mode 100755 index 000000000..e3e9e5fca --- /dev/null +++ b/best-practices/ml-platform/test/scripts/helpers/byop_playground_cleanup.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash + +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +echo_title "Deleting Terraform GCS bucket" +gsutil -m rm -rf gs://${MLP_STATE_BUCKET}/* +gcloud storage buckets delete gs://${MLP_STATE_BUCKET} --project ${MLP_PROJECT_ID} + +echo_title "Cleaning up local repository changes" + +cd ${MLP_BASE_DIR} && + git restore \ + examples/platform/playground/backend.tf \ + examples/platform/playground/mlp.auto.tfvars + +cd ${MLP_BASE_DIR} && + rm -rf \ + examples/platform/playground/${TF_DATA_DIR} \ + examples/platform/playground/.terraform.lock.hcl diff --git a/best-practices/ml-platform/test/scripts/helpers/dataprocessing.sh b/best-practices/ml-platform/test/scripts/helpers/dataprocessing.sh new file mode 100755 index 000000000..f04696707 --- /dev/null +++ b/best-practices/ml-platform/test/scripts/helpers/dataprocessing.sh @@ -0,0 +1,112 @@ +#!/usr/bin/env bash + +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +start_runtime "dataprocessing" + +echo_title "Preparing dataprocessing job" + +echo_title "Enabling Artifact Registry APIs" +print_and_execute_no_check "gcloud services enable artifactregistry.googleapis.com containerscanning.googleapis.com --project ${PROJECT_ID}" + +echo_title "Enabling Cloud Build APIs" +print_and_execute_no_check "gcloud services enable cloudbuild.googleapis.com --project ${PROJECT_ID}" + +echo_title "Adding IAM permissions" +print_and_execute_no_check "gcloud projects add-iam-policy-binding ${PROJECT_ID} \ +--member 'serviceAccount:${MLP_PROJECT_ID}.svc.id.goog[ml-team/ray-head]' \ +--role roles/storage.objectViewer" + +print_and_execute_no_check "gcloud projects add-iam-policy-binding ${PROJECT_ID} \ +--member 'serviceAccount:${PROJECT_ID}.svc.id.goog[ml-team/ray-worker]' \ +--role roles/storage.objectAdmin" + +echo_title "Creating GCS bucket" +print_and_execute_no_check "gcloud storage buckets create gs://${PROCESSING_BUCKET} --project ${PROJECT_ID} --uniform-bucket-level-access" + +echo_title "Downloading the dataset and uploading to GCS" + +print_and_execute "kaggle datasets download --unzip atharvjairath/flipkart-ecommerce-dataset && \ +gcloud storage cp flipkart_com-ecommerce_sample.csv \ +gs://${PROCESSING_BUCKET}/flipkart_raw_dataset/flipkart_com-ecommerce_sample.csv && \ +rm flipkart_com-ecommerce_sample.csv" + +echo_title "Creating Artifact Registry repository" + +print_and_execute_no_check "gcloud artifacts repositories create ${MLP_ENVIRONMENT_NAME}-dataprocessing \ +--repository-format=docker \ +--location=us \ +--project=${PROJECT_ID}" + +echo_title "Building container image" +print_and_execute_no_check "gcloud config set builds/use_kaniko True" +while ! 
gcloud services list --project ${PROJECT_ID} | grep cloudbuild.googleapis.com >/dev/null 2>&1; do + sleep 10 +done + +export MLP_USE_CASE_BASE_DIR="${MLP_BASE_DIR}/examples/use-case/ray/dataprocessing" +print_and_execute "cd ${MLP_USE_CASE_BASE_DIR}/src && \ +gcloud builds submit \ +--project ${PROJECT_ID} \ +--tag ${DOCKER_IMAGE_URL} \ +." +check_local_error_exit_on_error + +echo_title "Configuring job" + +sed -i "s|#IMAGE|${DOCKER_IMAGE_URL}|" ${MLP_USE_CASE_BASE_DIR}/job.yaml && + sed -i "s|#PROCESSING_BUCKET|${PROCESSING_BUCKET}|" ${MLP_USE_CASE_BASE_DIR}/job.yaml + +echo_title "Getting cluster credentials" + +print_and_execute "gcloud container fleet memberships get-credentials ${CLUSTER_NAME} --project ${PROJECT_ID}" +check_local_error_exit_on_error + +echo_title "Deleting existing job" +print_and_execute_no_check "kubectl delete -f ${MLP_USE_CASE_BASE_DIR}/job.yaml" + +echo_title "Creating job" +print_and_execute "kubectl apply -f ${MLP_USE_CASE_BASE_DIR}/job.yaml" +check_local_error_exit_on_error + +echo_title "Waiting for job to complete" +print_and_execute "kubectl wait --namespace=ml-team --for=condition=complete --timeout=3600s job/job & +kubectl wait --namespace=ml-team --for=condition=failed --timeout=3600s job/job && exit 1 & +wait -n && \ +pkill -f 'kubectl wait --namespace=ml-team'" +check_local_error_exit_on_error + +echo_title "Checking processed images" +IMAGES_PROCESS=$(gsutil du gs://${PROCESSING_BUCKET}/flipkart_images | wc -l) +echo_bold "Processed ${IMAGES_PROCESS} images." 
+ +print_and_execute "((IMAGES_PROCESS > 0))" +check_local_error_exit_on_error + +echo_title "Removing IAM permissions" + +gcloud projects remove-iam-policy-binding ${MLP_PROJECT_ID} \ + --member "serviceAccount:${MLP_PROJECT_ID}.svc.id.goog[ml-team/ray-head]" \ + --role roles/storage.objectViewer + +gcloud projects remove-iam-policy-binding ${MLP_PROJECT_ID} \ + --member "serviceAccount:${PROJECT_ID}.svc.id.goog[ml-team/ray-worker]" \ + --role roles/storage.objectAdmin + +echo_title "Cleaning up local repository changes" +cd ${MLP_BASE_DIR} && + git restore examples/use-case/ray/dataprocessing/job.yaml + +total_runtime "dataprocessing" diff --git a/best-practices/ml-platform/test/scripts/helpers/dataprocessing_cleanup.sh b/best-practices/ml-platform/test/scripts/helpers/dataprocessing_cleanup.sh new file mode 100755 index 000000000..f59d674dc --- /dev/null +++ b/best-practices/ml-platform/test/scripts/helpers/dataprocessing_cleanup.sh @@ -0,0 +1,30 @@ +#!/usr/bin/env bash + +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +echo_title "Deleting data processing Artifact Registry repository" + +gcloud artifacts repositories delete ${MLP_ENVIRONMENT_NAME}-dataprocessing \ + --location=us \ + --project=${MLP_PROJECT_ID} \ + --quiet + +echo_title "Deleting dataprocessing GCS buckets" + +gsutil -m -q rm -rf gs://${PROCESSING_BUCKET}/* +gcloud storage buckets delete gs://${PROCESSING_BUCKET} --project ${MLP_PROJECT_ID} + +gsutil -m -q rm -rf gs://${MLP_PROJECT_ID}_cloudbuild/* +gcloud storage buckets delete gs://${MLP_PROJECT_ID}_cloudbuild --project ${MLP_PROJECT_ID} diff --git a/best-practices/ml-platform/test/scripts/helpers/dataprocessing_env.sh b/best-practices/ml-platform/test/scripts/helpers/dataprocessing_env.sh new file mode 100755 index 000000000..b48632b29 --- /dev/null +++ b/best-practices/ml-platform/test/scripts/helpers/dataprocessing_env.sh @@ -0,0 +1,30 @@ +#!/usr/bin/env bash + +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +echo_title "Checking dataprocessing required configuration" +source ${SCRIPTS_DIR}/helpers/kaggle.sh "datasets files atharvjairath/flipkart-ecommerce-dataset" +check_local_error_exit_on_error + +if [ -z "${MLP_PROJECT_ID}" ]; then + echo "MLP_PROJECT_ID is not set!" 
+ exit 7 +fi + +echo_title "Applying dataprocessing configuration" +export CLUSTER_NAME="mlp-${MLP_ENVIRONMENT_NAME}" +export PROJECT_ID="${MLP_PROJECT_ID}" +export PROCESSING_BUCKET="${PROJECT_ID}-${MLP_ENVIRONMENT_NAME}-processing" +export DOCKER_IMAGE_URL=us-docker.pkg.dev/${PROJECT_ID}/${MLP_ENVIRONMENT_NAME}-dataprocessing/dp:v0.0.1 diff --git a/best-practices/ml-platform/test/scripts/helpers/display.sh b/best-practices/ml-platform/test/scripts/helpers/display.sh new file mode 100755 index 000000000..83f2397bf --- /dev/null +++ b/best-practices/ml-platform/test/scripts/helpers/display.sh @@ -0,0 +1,82 @@ +#!/usr/bin/env bash + +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Styles +BOLD=$(tput bold) +NORMAL=$(tput sgr0) + +# Colors +CYAN='\033[1;36m' +GREEN='\e[1;32m' +RED='\e[1;91m' +YELLOW="\e[38;5;226m" +RESET='\e[0m' + +function echo_bold() { + echo "${BOLD}${@}${NORMAL}" +} + +function echo_error() { + echo -e "${RED}${@}${RESET}" +} + +function echo_success() { + echo -e "${GREEN}${@}${RESET}" +} + +function echo_title() { + echo + echo "${BOLD}# ${@}${NORMAL}" +} + +function echo_warning() { + echo -e "${YELLOW}${@}${RESET}" +} + +function print_and_execute() { + clean_command=$(echo ${@} | tr -s ' ') + printf "${GREEN}\$ ${clean_command}${RESET}" + printf "\n" + eval "${clean_command}" + return_code=$? 
+ + if [ ${return_code} -eq "0" ]; then + echo_success "[OK]" + else + echo_error "[Return Code: ${return_code}]" + local_error=$(($local_error + 1)) + fi + echo + + return ${return_code} +} + +function print_and_execute_no_check() { + clean_command=$(echo ${@} | tr -s ' ') + printf "${GREEN}\$ ${clean_command}${RESET}" + printf "\n" + eval "${clean_command}" + return_code=$? + + if [ ${return_code} -eq "0" ]; then + echo_success "[OK]" + else + echo_warning "[Return Code: ${return_code}]" + fi + echo + + return ${return_code} +} diff --git a/best-practices/ml-platform/test/scripts/helpers/feature_initialize_apply.sh b/best-practices/ml-platform/test/scripts/helpers/feature_initialize_apply.sh new file mode 100755 index 000000000..7ad3e5352 --- /dev/null +++ b/best-practices/ml-platform/test/scripts/helpers/feature_initialize_apply.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash + +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +start_runtime "features_initialize_apply" + +echo_title "Initializing a new project" + +print_and_execute "cd ${MLP_BASE_DIR}/terraform/features/initialize && \ +terraform init && \ +terraform plan -input=false -out=tfplan && \ +terraform apply -input=false tfplan && \ +rm tfplan && \ +terraform init -force-copy -migrate-state && \ +rm -rf state" + +total_runtime "features_initialize_apply" + +check_local_error_exit_on_error diff --git a/best-practices/ml-platform/test/scripts/helpers/feature_initialize_destroy.sh b/best-practices/ml-platform/test/scripts/helpers/feature_initialize_destroy.sh new file mode 100755 index 000000000..f8ea697a9 --- /dev/null +++ b/best-practices/ml-platform/test/scripts/helpers/feature_initialize_destroy.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash + +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +start_runtime "features_initialize_destroy" + +echo_title "Destroying the project" + +export TERRAFORM_BUCKET_NAME=$(grep bucket ${MLP_BASE_DIR}/terraform/features/initialize/backend.tf | awk -F"=" '{print $2}' | xargs) +print_and_execute "cd ${MLP_BASE_DIR}/terraform/features/initialize && \ +cp backend.tf.local backend.tf && \ +terraform init -force-copy -lock=false -migrate-state && \ +gsutil -m rm -rf gs://${TERRAFORM_BUCKET_NAME}/* && \ +terraform init && \ +terraform destroy -auto-approve && \ +rm -rf .terraform .terraform.lock.hcl" + +total_runtime "features_initialize_destroy" + +check_local_error_exit_on_error diff --git a/best-practices/ml-platform/test/scripts/helpers/feature_initialize_env.sh b/best-practices/ml-platform/test/scripts/helpers/feature_initialize_env.sh new file mode 100755 index 000000000..ce126f095 --- /dev/null +++ b/best-practices/ml-platform/test/scripts/helpers/feature_initialize_env.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash + +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +echo_title "Applying feature initialize terraform configuration" + +MLP_IAP_SUPPORT_EMAIL=${MLP_IAP_SUPPORT_EMAIL:-$(gcloud auth list --filter=status:ACTIVE --format="value(account)")} +sed -i '/^iap_support_email[[:blank:]]*=/{h;s/=.*/= "'"${MLP_IAP_SUPPORT_EMAIL}"'"/};${x;/^$/{s//iap_support_email = "'"${MLP_IAP_SUPPORT_EMAIL}"'"/;H};x}' ${MLP_BASE_DIR}/terraform/features/initialize/initialize.auto.tfvars + +sed -i '/^ billing_account_id[[:blank:]]*=/{h;s/=.*/= "'"${MLP_BILLING_ACCOUNT_ID}"'"/};${x;/^$/{s// billing_account_id = "'"${MLP_BILLING_ACCOUNT_ID}"'"/;H};x}' ${MLP_BASE_DIR}/terraform/features/initialize/initialize.auto.tfvars +sed -i '/^ folder_id[[:blank:]]*=/{h;s/=.*/= "'"${MLP_FOLDER_ID:-""}"'"/};${x;/^$/{s// folder_id = "'"${MLP_FOLDER_ID:-""}"'"/;H};x}' ${MLP_BASE_DIR}/terraform/features/initialize/initialize.auto.tfvars +sed -i '/^ org_id[[:blank:]]*=/{h;s/=.*/= "'"${MLP_ORG_ID:-""}"'"/};${x;/^$/{s// org_id = "'"${MLP_ORG_ID:-""}"'"/;H};x}' ${MLP_BASE_DIR}/terraform/features/initialize/initialize.auto.tfvars diff --git a/best-practices/ml-platform/test/scripts/helpers/functions.sh b/best-practices/ml-platform/test/scripts/helpers/functions.sh new file mode 100755 index 000000000..6a64681f9 --- /dev/null +++ b/best-practices/ml-platform/test/scripts/helpers/functions.sh @@ -0,0 +1,113 @@ +#!/usr/bin/env bash + +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +function check_local_error() { + printf "\n" + if [ ${local_error} -ne 0 ]; then + echo_error "There was an error while executing the script, review the output." + fi + printf "\n" +} + +function check_local_error_and_exit() { + check_local_error + + if [ ! -z ${MLP_LOG_FILE} ]; then + printf "\n" + echo_bold "A log file is available at '${MLP_LOG_FILE}'" + printf "\n" + fi + + exit ${local_error} +} + +function check_local_error_exit_on_error() { + if [ ${local_error} -ne 0 ]; then + check_local_error_and_exit + fi +} + +function get_lock_file() { + lock_name="${1}" + + echo "${MLP_LOCK_DIR}/${MLP_SCRIPT_NAME}-${lock_name}.lock" +} + +function lock_is_set() { + lock_name="${1}" + + lock_file=$(get_lock_file "${lock_name}") + if [ -f ${lock_file} ]; then + # lock is set + return 0 + fi + + # lock is NOT set + return 1 +} + +function lock_set() { + lock_name="${1}" + + lock_file=$(get_lock_file "${lock_name}") + if [ ! -f ${lock_file} ]; then + touch ${lock_file} + else + echo_warning "Lock ${lock_name} was already set" + fi +} + +function lock_unset() { + lock_name="${1}" + + lock_file=$(get_lock_file "${lock_name}") + if [ -f ${lock_file} ]; then + rm -f ${lock_file} + else + echo_warning "Lock ${lock_name} was not set" + fi + +} + +declare -A runtime=() + +function start_runtime() { + component="${1}" + + start_timestamp=$(date +%s) + + if [[ ${!runtime[@]} =~ ${component} ]]; then + echo_warning "Component ${component} runtime counter already exists, using existing value" + else + runtime[${component}]=${start_timestamp} + fi +} + +function total_runtime() { + component="${1}" + + end_timestamp=$(date +%s) + + total_runtime_value=0 + if [[ ${!runtime[@]} =~ ${component} ]]; then + start_timestamp=${runtime[${component}]} + total_runtime_value=$((end_timestamp - start_timestamp)) + else + echo_warning "Component ${component} does not exist, cannot calculate runtime" + fi + + echo_bold "Total runtime for ${component}: $(date -d@${total_runtime_value} -u 
+%H:%M:%S)" +} diff --git a/best-practices/ml-platform/test/scripts/helpers/gh_env.sh b/best-practices/ml-platform/test/scripts/helpers/gh_env.sh new file mode 100755 index 000000000..1b915696d --- /dev/null +++ b/best-practices/ml-platform/test/scripts/helpers/gh_env.sh @@ -0,0 +1,44 @@ +#!/usr/bin/env bash + +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +echo_title "Checking GitHub required configuration" + +export GIT_TOKEN_FILE=${GIT_TOKEN_FILE:-${HOME}/secrets/mlp-github-token} + +if [ ! -f ${GIT_TOKEN_FILE} ]; then + echo "Git token missing at '${GIT_TOKEN_FILE}'!" + exit 3 +fi + +if [ -z "${MLP_GIT_NAMESPACE}" ]; then + echo "MLP_GIT_NAMESPACE is not set!" + exit 4 +fi + +if [ -z "${MLP_GIT_USER_NAME}" ]; then + echo "MLP_GIT_USER_NAME is not set!" + exit 5 +fi + +if [ -z "${MLP_GIT_USER_EMAIL}" ]; then + echo "MLP_GIT_USER_EMAIL is not set!" 
+ exit 6 +fi + +echo_title "Applying Git configuration" +sed -i "s/YOUR_GIT_NAMESPACE/${MLP_GIT_NAMESPACE}/g" ${MLP_TYPE_BASE_DIR}/mlp.auto.tfvars +sed -i "s/YOUR_GIT_USER_EMAIL/${MLP_GIT_USER_EMAIL}/g" ${MLP_TYPE_BASE_DIR}/mlp.auto.tfvars +sed -i "s/YOUR_GIT_USER_NAME/${MLP_GIT_USER_NAME}/g" ${MLP_TYPE_BASE_DIR}/mlp.auto.tfvars diff --git a/best-practices/ml-platform/test/scripts/helpers/gl_env.sh b/best-practices/ml-platform/test/scripts/helpers/gl_env.sh new file mode 100755 index 000000000..9e8ddc87a --- /dev/null +++ b/best-practices/ml-platform/test/scripts/helpers/gl_env.sh @@ -0,0 +1,44 @@ +#!/usr/bin/env bash + +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +echo_title "Checking GitLab required configuration" + +export GIT_TOKEN_FILE=${GIT_TOKEN_FILE:-${HOME}/secrets/mlp-gitlab-token} + +if [ ! -f ${GIT_TOKEN_FILE} ]; then + echo "Git token missing at '${GIT_TOKEN_FILE}'!" + exit 3 +fi + +if [ -z "${MLP_GIT_NAMESPACE}" ]; then + echo "MLP_GIT_NAMESPACE is not set!" + exit 4 +fi + +if [ -z "${MLP_GIT_USER_NAME}" ]; then + echo "MLP_GIT_USER_NAME is not set!" + exit 5 +fi + +if [ -z "${MLP_GIT_USER_EMAIL}" ]; then + echo "MLP_GIT_USER_EMAIL is not set!" 
+ exit 6 +fi + +echo_title "Applying Git configuration" +sed -i "s/YOUR_GIT_NAMESPACE/${MLP_GIT_NAMESPACE}/g" ${MLP_TYPE_BASE_DIR}/mlp.auto.tfvars +sed -i "s/YOUR_GIT_USER_EMAIL/${MLP_GIT_USER_EMAIL}/g" ${MLP_TYPE_BASE_DIR}/mlp.auto.tfvars +sed -i "s/YOUR_GIT_USER_NAME/${MLP_GIT_USER_NAME}/g" ${MLP_TYPE_BASE_DIR}/mlp.auto.tfvars diff --git a/best-practices/ml-platform/test/scripts/helpers/include.sh b/best-practices/ml-platform/test/scripts/helpers/include.sh new file mode 100755 index 000000000..cbf98620a --- /dev/null +++ b/best-practices/ml-platform/test/scripts/helpers/include.sh @@ -0,0 +1,42 @@ +#!/usr/bin/env bash + +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +source ${SCRIPTS_DIR}/helpers/display.sh +source ${SCRIPTS_DIR}/helpers/functions.sh + +start_runtime "script" + +# Create a logs file and send stdout and stderr to console and log file +log_directory=$(realpath ${SCRIPTS_DIR}/../log) +export MLP_SCRIPT_NAME=$(basename $0) +export MLP_LOG_TIMESTAMP=$(date +%s) + +export MLP_LOG_FILE=${log_directory}/${MLP_LOG_TIMESTAMP}-${MLP_LOG_FILE_PREFIX}${MLP_SCRIPT_NAME}.log +touch ${MLP_LOG_FILE} + +exec 3>&1 4>&2 +trap 'exec 2>&4 1>&3' 0 1 2 3 +exec 1> >(tee -i ${MLP_LOG_FILE}) 2>&1 + +echo_bold "A log file is available at '${MLP_LOG_FILE}'" + +# Set additional environment variable +export MLP_BASE_DIR=$(realpath "${SCRIPTS_DIR}/../..") +export MLP_LOCK_DIR="${MLP_BASE_DIR}/test/scripts/locks" +export MLP_TYPE_BASE_DIR="${MLP_BASE_DIR}/examples/platform/${MLP_TYPE}" + +# Set local_error to 0 +local_error=0 diff --git a/best-practices/ml-platform/test/scripts/helpers/kaggle.sh b/best-practices/ml-platform/test/scripts/helpers/kaggle.sh new file mode 100755 index 000000000..ab21f007a --- /dev/null +++ b/best-practices/ml-platform/test/scripts/helpers/kaggle.sh @@ -0,0 +1,44 @@ +#!/usr/bin/env bash + +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +if [ -z "${1}" ]; then + echo_error "Missing kaggle arguments" + exit 1 +fi +kaggle_args=${1} + +echo_title "Checking kaggle cli" + +print_and_execute "kaggle ${kaggle_args}" +exit_code=${?} + +case ${exit_code} in +0) + echo_success "kaggle cli found and configured" + ;; +1) + echo_error "Missing kaggle credentials" + ;; +2) + echo_error "Malformed kaggle command" + ;; +127) + echo_error "kaggle cli not found" + ;; +*) + echo_error "Unhandled exit code ${exit_code}" + ;; +esac diff --git a/best-practices/ml-platform/test/scripts/helpers/new_gh_env.sh b/best-practices/ml-platform/test/scripts/helpers/new_gh_env.sh new file mode 100755 index 000000000..ab362480c --- /dev/null +++ b/best-practices/ml-platform/test/scripts/helpers/new_gh_env.sh @@ -0,0 +1,44 @@ +#!/usr/bin/env bash + +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +echo_title "Checking new_gh required configuration" + +if [ ! -f ${HOME}/secrets/mlp-github-token ]; then + echo_error "Git token missing at '${HOME}/secrets/mlp-github-token'!" + exit 3 +fi + +if [ -z "${MLP_GIT_NAMESPACE}" ]; then + echo_error "MLP_GIT_NAMESPACE is not set!" + exit 4 +fi + +if [ -z "${MLP_GIT_USER_NAME}" ]; then + echo_error "MLP_GIT_USER_NAME is not set!" + exit 5 +fi + +if [ -z "${MLP_GIT_USER_EMAIL}" ]; then + echo_error "MLP_GIT_USER_EMAIL is not set!" 
+ exit 6 +fi + +if [ -z "${MLP_FOLDER_ID}" ] && [ -z "${MLP_ORG_ID}" ]; then + echo_error "MLP_FOLDER_ID or MLP_ORG_ID is not set, at least one needs to be set!" + exit 6 +fi + +source ${SCRIPTS_DIR}/helpers/gh_env.sh diff --git a/best-practices/ml-platform/test/scripts/helpers/new_gh_playground_cleanup.sh b/best-practices/ml-platform/test/scripts/helpers/new_gh_playground_cleanup.sh new file mode 100755 index 000000000..bf57e5cfe --- /dev/null +++ b/best-practices/ml-platform/test/scripts/helpers/new_gh_playground_cleanup.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash + +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +echo_title "Cleaning up local repository changes" + +print_and_execute_no_check "cd ${MLP_BASE_DIR} && +git restore \ +examples/platform/playground/backend.tf \ +examples/platform/playground/mlp.auto.tfvars \ +terraform/features/initialize/backend.tf \ +terraform/features/initialize/backend.tf.bucket \ +terraform/features/initialize/initialize.auto.tfvars" + +print_and_execute_no_check "cd ${MLP_BASE_DIR} && +rm -rf \ +examples/platform/playground/.terraform \ +examples/platform/playground/.terraform.lock.hcl \ +terraform/features/initialize/.terraform \ +terraform/features/initialize/.terraform.lock.hcl \ +terraform/features/initialize/backend.tf.local \ +terraform/features/initialize/stateck.hcl" diff --git a/best-practices/ml-platform/test/scripts/helpers/playground_env.sh b/best-practices/ml-platform/test/scripts/helpers/playground_env.sh new file mode 100755 index 000000000..ba5746ee7 --- /dev/null +++ b/best-practices/ml-platform/test/scripts/helpers/playground_env.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash + +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +echo_title "Applying terraform configuration" + +sed -i "s/^\([[:blank:]]*bucket[[:blank:]]*=\).*$/\1 \"${MLP_STATE_BUCKET}\"/" ${MLP_TYPE_BASE_DIR}/backend.tf +sed -i "s/^\([[:blank:]]*environment_name[[:blank:]]*=\).*$/\1 \"${MLP_ENVIRONMENT_NAME}\"/" ${MLP_TYPE_BASE_DIR}/mlp.auto.tfvars +sed -i "s/^\([[:blank:]]*environment_project_id[[:blank:]]*=\).*$/\1 \"${MLP_PROJECT_ID}\"/" ${MLP_TYPE_BASE_DIR}/mlp.auto.tfvars + +echo_title "Creating GCS bucket" +gcloud storage buckets create gs://${MLP_STATE_BUCKET} --project ${MLP_PROJECT_ID} + +echo_title "Checking MLP_IAP_DOMAIN" +MLP_IAP_DOMAIN=${MLP_IAP_DOMAIN:-$(gcloud auth list --filter=status:ACTIVE --format="value(account)" | awk -F@ '{print $2}')} +echo "MLP_IAP_DOMAIN=${MLP_IAP_DOMAIN}" +sed -i '/^iap_domain[[:blank:]]*=/{h;s/=.*/= "'"${MLP_IAP_DOMAIN}"'"/};${x;/^$/{s//iap_domain = "'"${MLP_IAP_DOMAIN}"'"/;H};x}' ${MLP_TYPE_BASE_DIR}/mlp.auto.tfvars + +echo_title "Checking ray-dashboard endpoint" +gcloud endpoints services undelete ray-dashboard.ml-team.mlp-${MLP_ENVIRONMENT_NAME}.endpoints.${MLP_PROJECT_ID}.cloud.goog --quiet 2>/dev/null diff --git a/best-practices/ml-platform/test/scripts/helpers/terraform_apply.sh b/best-practices/ml-platform/test/scripts/helpers/terraform_apply.sh new file mode 100755 index 000000000..ac890c629 --- /dev/null +++ b/best-practices/ml-platform/test/scripts/helpers/terraform_apply.sh @@ -0,0 +1,30 @@ +#!/usr/bin/env bash + +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +start_runtime "terraform_apply" + +echo_title "Running terraform apply" + +print_and_execute "cd ${MLP_TYPE_BASE_DIR} && \ +terraform init && \ +terraform plan -input=false -out=tfplan && \ +terraform apply -input=false tfplan" + +rm tfplan + +total_runtime "terraform_apply" + +check_local_error_exit_on_error diff --git a/best-practices/ml-platform/test/scripts/helpers/terraform_destroy.sh b/best-practices/ml-platform/test/scripts/helpers/terraform_destroy.sh new file mode 100755 index 000000000..72ce6e6a8 --- /dev/null +++ b/best-practices/ml-platform/test/scripts/helpers/terraform_destroy.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash + +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +start_runtime "terraform_destroy" + +echo_title "Running terraform destroy" + +print_and_execute "cd ${MLP_TYPE_BASE_DIR} && \ +terraform init && \ +terraform destroy -auto-approve" + +total_runtime "terraform_destroy" + +check_local_error_exit_on_error diff --git a/best-practices/ml-platform/test/scripts/locks/.gitkeep b/best-practices/ml-platform/test/scripts/locks/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/best-practices/ml-platform/test/scripts/unit/playground_byop_gh_apply.sh b/best-practices/ml-platform/test/scripts/unit/playground_byop_gh_apply.sh new file mode 100755 index 000000000..f0d340312 --- /dev/null +++ b/best-practices/ml-platform/test/scripts/unit/playground_byop_gh_apply.sh @@ -0,0 +1,36 @@ +#!/usr/bin/env bash + +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +SCRIPT_PATH="$( + cd "$(dirname "$0")" >/dev/null 2>&1 + pwd -P +)" +SCRIPTS_DIR=$(realpath ${SCRIPT_PATH}/..) 
+ +export MLP_TYPE="playground" +source ${SCRIPTS_DIR}/helpers/include.sh + +echo_title "Preparing the environment" +source ${SCRIPTS_DIR}/helpers/byop_env.sh +source ${SCRIPTS_DIR}/helpers/gh_env.sh +source ${SCRIPTS_DIR}/helpers/${MLP_TYPE}_env.sh + +# terraform apply +############################################################################### +export TF_VAR_git_token=$(tr --delete '\n' <${GIT_TOKEN_FILE}) +source ${SCRIPTS_DIR}/helpers/terraform_apply.sh + +check_local_error_and_exit diff --git a/best-practices/ml-platform/test/scripts/unit/playground_byop_gh_destroy.sh b/best-practices/ml-platform/test/scripts/unit/playground_byop_gh_destroy.sh new file mode 100755 index 000000000..c7121c9dd --- /dev/null +++ b/best-practices/ml-platform/test/scripts/unit/playground_byop_gh_destroy.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash + +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +SCRIPT_PATH="$( + cd "$(dirname "$0")" >/dev/null 2>&1 + pwd -P +)" +SCRIPTS_DIR=$(realpath ${SCRIPT_PATH}/..) 
+ +export MLP_TYPE="playground" +source ${SCRIPTS_DIR}/helpers/include.sh + +echo_title "Preparing the environment" +source ${SCRIPTS_DIR}/helpers/byop_env.sh +source ${SCRIPTS_DIR}/helpers/gh_env.sh + +# terraform destroy +############################################################################### +export TF_VAR_git_token=$(tr --delete '\n' <${GIT_TOKEN_FILE}) +source ${SCRIPTS_DIR}/helpers/terraform_destroy.sh + +check_local_error_exit_on_error + +source ${SCRIPTS_DIR}/helpers/byop_playground_cleanup.sh + +check_local_error_and_exit diff --git a/best-practices/ml-platform/test/scripts/unit/playground_byop_gl_apply.sh b/best-practices/ml-platform/test/scripts/unit/playground_byop_gl_apply.sh new file mode 100755 index 000000000..1ec1f6355 --- /dev/null +++ b/best-practices/ml-platform/test/scripts/unit/playground_byop_gl_apply.sh @@ -0,0 +1,36 @@ +#!/usr/bin/env bash + +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +SCRIPT_PATH="$( + cd "$(dirname "$0")" >/dev/null 2>&1 + pwd -P +)" +SCRIPTS_DIR=$(realpath ${SCRIPT_PATH}/..) 
+ +export MLP_TYPE="playground" +source ${SCRIPTS_DIR}/helpers/include.sh + +echo_title "Preparing the environment" +source ${SCRIPTS_DIR}/helpers/byop_env.sh +source ${SCRIPTS_DIR}/helpers/gl_env.sh +source ${SCRIPTS_DIR}/helpers/${MLP_TYPE}_env.sh + +# terraform apply +############################################################################### +export TF_VAR_git_token=$(tr --delete '\n' <${GIT_TOKEN_FILE}) +source ${SCRIPTS_DIR}/helpers/terraform_apply.sh + +check_local_error_and_exit diff --git a/best-practices/ml-platform/test/scripts/unit/playground_byop_gl_destroy.sh b/best-practices/ml-platform/test/scripts/unit/playground_byop_gl_destroy.sh new file mode 100755 index 000000000..fa1d516a5 --- /dev/null +++ b/best-practices/ml-platform/test/scripts/unit/playground_byop_gl_destroy.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash + +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +SCRIPT_PATH="$( + cd "$(dirname "$0")" >/dev/null 2>&1 + pwd -P +)" +SCRIPTS_DIR=$(realpath ${SCRIPT_PATH}/..) 
+ +export MLP_TYPE="playground" +source ${SCRIPTS_DIR}/helpers/include.sh + +echo_title "Preparing the environment" +source ${SCRIPTS_DIR}/helpers/byop_env.sh +source ${SCRIPTS_DIR}/helpers/gl_env.sh + +# terraform destroy +############################################################################### +export TF_VAR_git_token=$(tr --delete '\n' <${GIT_TOKEN_FILE}) +source ${SCRIPTS_DIR}/helpers/terraform_destroy.sh + +check_local_error_exit_on_error + +source ${SCRIPTS_DIR}/helpers/byop_playground_cleanup.sh + +check_local_error_and_exit diff --git a/best-practices/ml-platform/test/scripts/unit/playground_dataprocessing.sh b/best-practices/ml-platform/test/scripts/unit/playground_dataprocessing.sh new file mode 100755 index 000000000..325ddcabb --- /dev/null +++ b/best-practices/ml-platform/test/scripts/unit/playground_dataprocessing.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash + +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +SCRIPT_PATH="$( + cd "$(dirname "$0")" >/dev/null 2>&1 + pwd -P +)" +SCRIPTS_DIR=$(realpath ${SCRIPT_PATH}/..) 
+ +export MLP_TYPE="playground" +source ${SCRIPTS_DIR}/helpers/include.sh + +echo_title "Preparing the environment" + +source ${SCRIPTS_DIR}/helpers/dataprocessing_env.sh + +source ${SCRIPTS_DIR}/helpers/dataprocessing.sh +check_local_error_and_exit diff --git a/best-practices/ml-platform/test/scripts/unit/playground_dataprocessing_cleanup.sh b/best-practices/ml-platform/test/scripts/unit/playground_dataprocessing_cleanup.sh new file mode 100755 index 000000000..358aeb098 --- /dev/null +++ b/best-practices/ml-platform/test/scripts/unit/playground_dataprocessing_cleanup.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash + +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +SCRIPT_PATH="$( + cd "$(dirname "$0")" >/dev/null 2>&1 + pwd -P +)" +SCRIPTS_DIR=$(realpath ${SCRIPT_PATH}/..) 
+ +export MLP_TYPE="playground" +source ${SCRIPTS_DIR}/helpers/include.sh + +echo_title "Preparing the environment" + +source ${SCRIPTS_DIR}/helpers/dataprocessing_env.sh +source ${SCRIPTS_DIR}/helpers/dataprocessing_cleanup.sh + +check_local_error_and_exit diff --git a/best-practices/ml-platform/test/scripts/unit/playground_new_gh_apply.sh b/best-practices/ml-platform/test/scripts/unit/playground_new_gh_apply.sh new file mode 100755 index 000000000..d5ac418b5 --- /dev/null +++ b/best-practices/ml-platform/test/scripts/unit/playground_new_gh_apply.sh @@ -0,0 +1,58 @@ +#!/usr/bin/env bash + +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +SCRIPT_PATH="$( + cd "$(dirname "$0")" >/dev/null 2>&1 + pwd -P +)" +SCRIPTS_DIR=$(realpath ${SCRIPT_PATH}/..) 
+ +export MLP_TYPE="playground" +source ${SCRIPTS_DIR}/helpers/include.sh + +echo_title "Preparing the environment" +source ${SCRIPTS_DIR}/helpers/new_gh_env.sh + +# feature initialize apply +############################################################################### +if lock_is_set "features_initialize_apply"; then + echo_bold "Feature initialize apply previously completed successfully" +else + source ${SCRIPTS_DIR}/helpers/feature_initialize_env.sh + source ${SCRIPTS_DIR}/helpers/feature_initialize_apply.sh + lock_set "features_initialize_apply" +fi + +export MLP_PROJECT_ID=$(grep environment_project_id ${MLP_TYPE_BASE_DIR}/mlp.auto.tfvars | awk -F"=" '{print $2}' | xargs) + +# terraform apply +############################################################################### +if lock_is_set "terraform_apply"; then + echo_bold "Terraform apply previously completed successfully" +else + source ${SCRIPTS_DIR}/helpers/${MLP_TYPE}_env.sh + + export TF_VAR_git_token=$(tr --delete '\n' <${HOME}/secrets/mlp-github-token) + source ${SCRIPTS_DIR}/helpers/terraform_apply.sh + lock_set "terraform_apply" +fi + +check_local_error_exit_on_error + +lock_unset "features_initialize_apply" +lock_unset "terraform_apply" + +check_local_error_and_exit diff --git a/best-practices/ml-platform/test/scripts/unit/playground_new_gh_destroy.sh b/best-practices/ml-platform/test/scripts/unit/playground_new_gh_destroy.sh new file mode 100755 index 000000000..c2efcb1ee --- /dev/null +++ b/best-practices/ml-platform/test/scripts/unit/playground_new_gh_destroy.sh @@ -0,0 +1,62 @@ +#!/usr/bin/env bash + +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +SCRIPT_PATH="$( + cd "$(dirname "$0")" >/dev/null 2>&1 + pwd -P +)" +SCRIPTS_DIR=$(realpath ${SCRIPT_PATH}/..) + +export MLP_TYPE="playground" +source ${SCRIPTS_DIR}/helpers/include.sh + +echo_title "Preparing the environment" +source ${SCRIPTS_DIR}/helpers/new_gh_env.sh + +# terraform destroy +############################################################################### + +if lock_is_set "terraform_destroy"; then + echo_bold "Terraform destory previously completed successfully" +else + export TF_VAR_git_token=$(tr --delete '\n' <${HOME}/secrets/mlp-github-token) + source ${SCRIPTS_DIR}/helpers/terraform_destroy.sh + lock_set "terraform_destroy" +fi + +# feature initialize destroy +############################################################################### + +if lock_is_set "features_initialize_destroy"; then + echo_bold "Feature initialize destroy previously completed successfully" +else + source ${SCRIPTS_DIR}/helpers/feature_initialize_destroy.sh + + lock_set "features_initialize_destroy" +fi + +# cleanup +############################################################################### +echo_title "Cleaning up the environment" + +source ${SCRIPTS_DIR}/helpers/new_gh_playground_cleanup.sh + +check_local_error_exit_on_error + +lock_unset "features_initialize_destroy" +lock_unset "terraform_destroy" + +check_local_error_and_exit diff --git a/modules/custom-metrics-stackdriver-adapter/README.md b/modules/custom-metrics-stackdriver-adapter/README.md new file mode 100644 index 000000000..e87f4aafd --- /dev/null +++ 
b/modules/custom-metrics-stackdriver-adapter/README.md @@ -0,0 +1,65 @@ +# Custom Metrics Stackdriver Adapter + +Adapted from https://raw.githubusercontent.com/GoogleCloudPlatform/k8s-stackdriver/master/custom-metrics-stackdriver-adapter/deploy/production/adapter_new_resource_model.yaml + +## Installation via bash, gcloud, and kubectl + +Assure the following environment variables are set: + - PROJECT_ID: Your GKE project ID + - WORKLOAD_IDENTITY: Is workload identity federation enabled in the target cluster? + +``` +if [ -z "$WORKLOAD_IDENTITY" ]; then + WORKLOAD_IDENTITY=false +fi + +kubectl create namespace custom-metrics +kubectl create serviceaccount custom-metrics-stackdriver-adapter -n custom-metrics + +# If workload identity is enabled, extra steps are required. We need to: +# - create a service account +# - grant it the monitoring.viewer IAM role +# - bind it to the workload identity user for the CMSA +# - annotate the CMSA service account (done above) +if [ "$WORKLOAD_IDENTITY" == "true" ]; then + gcloud iam service-accounts create cmsa-sa + gcloud projects add-iam-policy-binding $PROJECT_ID --member="serviceAccount:cmsa-sa@$PROJECT_ID.iam.gserviceaccount.com" --role=roles/monitoring.viewer + gcloud projects add-iam-policy-binding $PROJECT_ID --member="serviceAccount:cmsa-sa@$PROJECT_ID.iam.gserviceaccount.com" --role=roles/iam.serviceAccountTokenCreator + gcloud iam service-accounts add-iam-policy-binding --role roles/iam.workloadIdentityUser --member "serviceAccount:$PROJECT_ID.svc.id.goog[custom-metrics/custom-metrics-stackdriver-adapter]" "cmsa-sa@$PROJECT_ID.iam.gserviceaccount.com" + kubectl annotate serviceaccount custom-metrics-stackdriver-adapter -n custom-metrics "iam.gke.io/gcp-service-account"="cmsa-sa@tpu-vm-gke-testing.iam.gserviceaccount.com" +fi + +kubectl apply -f clusterrolebinding_custom-metrics:system:auth-delegator.yaml.tftpl +kubectl apply -f rolebinding_custom-metrics-auth-reader.yaml.tftpl +kubectl apply -f 
clusterrole_custom-metrics-resource-reader.yaml.tftpl +kubectl apply -f clusterrolebinding_custom-metrics-resource-reader.yaml.tftpl +kubectl apply -f deployment_custom-metrics-stackdriver-adapter.yaml.tftpl +kubectl apply -f service_custom-metrics-stackdriver-adapter.yaml.tftpl +kubectl apply -f apiservice_v1beta1.custom.metrics.k8s.io.yaml.tftpl +kubectl apply -f apiservice_v1beta2.custom.metrics.k8s.io.yaml.tftpl +kubectl apply -f apiservice_v1beta1.external.metrics.k8s.io.yaml.tftpl +kubectl apply -f clusterrolebinding_external-metrics-reader.yaml.tftpl +``` + +## Installation via Terraform + +To use this as a module, include it from your terraform main: + +``` +module "custom_metrics_stackdriver_adapter" { + source = "./path/to/custom-metrics-stackdriver-adapter" +} +``` + +For a workload identity enabled cluster, some additional configuration is +needed: + +``` +module "custom_metrics_stackdriver_adapter" { + source = "./path/to/custom-metrics-stackdriver-adapter" + workload_identity = { + enabled = true + project_id = "" + } +} +``` \ No newline at end of file diff --git a/modules/custom-metrics-stackdriver-adapter/main.tf b/modules/custom-metrics-stackdriver-adapter/main.tf new file mode 100644 index 000000000..cf775e6e3 --- /dev/null +++ b/modules/custom-metrics-stackdriver-adapter/main.tf @@ -0,0 +1,130 @@ +locals { + v1beta1-custom-metrics-k8s-io = "${path.module}/templates/apiservice_v1beta1.custom.metrics.k8s.io.yaml.tftpl" + v1beta1-external-metrics-k8s-io = "${path.module}/templates/apiservice_v1beta1.external.metrics.k8s.io.yaml.tftpl" + v1beta2-custom-metrics-k8s-io = "${path.module}/templates/apiservice_v1beta2.custom.metrics.k8s.io.yaml.tftpl" + cluster-role-custom-metrics-resource-reader = "${path.module}/templates/clusterrole_custom-metrics-resource-reader.yaml.tftpl" + cluster-role-binding-custom-metrics-resource-reader = "${path.module}/templates/clusterrolebinding_custom-metrics-resource-reader.yaml.tftpl" + 
custom-metrics-system-auth-delegator = "${path.module}/templates/clusterrolebinding_custom-metrics:system:auth-delegator.yaml.tftpl" + external-metrics-reader = "${path.module}/templates/clusterrolebinding_external-metrics-reader.yaml.tftpl" + deployment-custom-metrics-stackdriver-adapter = "${path.module}/templates/deployment_custom-metrics-stackdriver-adapter.yaml.tftpl" + service-custom-metrics-stackdriver-adapter = "${path.module}/templates/service_custom-metrics-stackdriver-adapter.yaml.tftpl" + service-account-custom-metrics-stackdriver-adapter = "${path.module}/templates/serviceaccount_custom-metrics-stackdriver-adapter.yaml.tftpl" + custom-metrics-auth-reader = "${path.module}/templates/rolebinding_custom-metrics-auth-reader.yaml.tftpl" +} + +resource "kubernetes_namespace_v1" "custom-metrics" { + metadata { + name = "custom-metrics" + } +} + +resource "kubernetes_service_account_v1" "custom-metrics-stackdriver-adapter" { + count = 1 + metadata { + name = "custom-metrics-stackdriver-adapter" + namespace = kubernetes_namespace_v1.custom-metrics.metadata[0].name + annotations = var.workload_identity.enabled ? 
{ + "iam.gke.io/gcp-service-account" = google_service_account.cmsa-sa[0].email + } : {} + } +} + +resource "kubernetes_manifest" "custom-metrics-system-auth-delegator" { + count = 1 + manifest = yamldecode(file(local.custom-metrics-system-auth-delegator)) +} + +resource "kubernetes_manifest" "custom-metrics-auth-reader" { + count = 1 + manifest = yamldecode(file(local.custom-metrics-auth-reader)) +} + +resource "kubernetes_manifest" "cluster-role-custom-metrics-resource-reader" { + count = 1 + manifest = yamldecode(file(local.cluster-role-custom-metrics-resource-reader)) +} + +resource "kubernetes_manifest" "cluster-role-binding-custom-metrics-resource-reader" { + count = 1 + manifest = yamldecode(file(local.cluster-role-binding-custom-metrics-resource-reader)) +} + +resource "kubernetes_manifest" "deployment-custom-metrics-stackdriver-adapter" { + count = 1 + manifest = yamldecode(file(local.deployment-custom-metrics-stackdriver-adapter)) +} + +resource "kubernetes_manifest" "service-custom-metrics-stackdriver-adapter" { + count = 1 + manifest = yamldecode(file(local.service-custom-metrics-stackdriver-adapter)) +} + +resource "kubernetes_manifest" "v1beta1-custom-metrics-k8s-io" { + count = 1 + manifest = yamldecode(file(local.v1beta1-custom-metrics-k8s-io)) +} + +resource "kubernetes_manifest" "v1beta2-custom-metrics-k8s-io" { + count = 1 + manifest = yamldecode(file(local.v1beta2-custom-metrics-k8s-io)) +} + +resource "kubernetes_manifest" "v1beta1-external-metrics-k8s-io" { + count = 1 + manifest = yamldecode(file(local.v1beta1-external-metrics-k8s-io)) +} + +resource "kubernetes_manifest" "external-metrics-reader" { + count = 1 + manifest = yamldecode(file(local.external-metrics-reader)) +} + +# If workload identity is enabled, extra steps are required. 
We need to: +# - create a service account +# - grant it the monitoring.viewer IAM role +# - bind it to the workload identity user for the cmsa +# - annotate the cmsa service account (done above) + +resource "google_service_account" "cmsa-sa" { + count = var.workload_identity.enabled ? 1 : 0 + account_id = "cmsa-sa" + project = var.workload_identity.project_id +} + +# Equivalent to: +# gcloud projects add-iam-policy-binding PROJECT_ID \ +# --member=serviceAccount:cmsa-sa@PROJECT_ID.iam.gserviceaccount.com \ +# --role=roles/monitoring.viewer +resource "google_project_iam_binding" "cmsa-project-binding-sa-monitoring-viewer" { + count = var.workload_identity.enabled ? 1 : 0 + project = var.workload_identity.project_id + role = "roles/monitoring.viewer" + members = [ + "serviceAccount:${google_service_account.cmsa-sa[0].account_id}@${var.workload_identity.project_id}.iam.gserviceaccount.com" + ] +} + +# Equivalent to: +# gcloud projects add-iam-policy-binding PROJECT_ID \ +# --member=serviceAccount:cmsa-sa@PROJECT_ID.iam.gserviceaccount.com \ +# --role=roles/iam.serviceAccountTokenCreator +resource "google_project_iam_binding" "cmsa-project-binding-sa-token-creator" { + count = var.workload_identity.enabled ? 1 : 0 + project = var.workload_identity.project_id + role = "roles/iam.serviceAccountTokenCreator" + members = [ + "serviceAccount:${google_service_account.cmsa-sa[0].account_id}@${var.workload_identity.project_id}.iam.gserviceaccount.com" + ] +} + +# Equivalent to: +# gcloud iam service-accounts add-iam-policy-binding \ +# --role roles/iam.workloadIdentityUser \ +# --member "serviceAccount:PROJECT_ID.svc.id.goog[custom-metrics/custom-metrics-stackdriver-adapter]" \ +# cmsa-sa@PROJECT_ID.iam.gserviceaccount.com +resource "google_service_account_iam_member" "cmsa-bind-to-gsa" { + count = var.workload_identity.enabled ? 
1 : 0 + service_account_id = google_service_account.cmsa-sa[0].name + role = "roles/iam.workloadIdentityUser" + member = "serviceAccount:${var.workload_identity.project_id}.svc.id.goog[custom-metrics/custom-metrics-stackdriver-adapter]" +} diff --git a/modules/custom-metrics-stackdriver-adapter/templates/apiservice_v1beta1.custom.metrics.k8s.io.yaml.tftpl b/modules/custom-metrics-stackdriver-adapter/templates/apiservice_v1beta1.custom.metrics.k8s.io.yaml.tftpl new file mode 100644 index 000000000..2a0141ad3 --- /dev/null +++ b/modules/custom-metrics-stackdriver-adapter/templates/apiservice_v1beta1.custom.metrics.k8s.io.yaml.tftpl @@ -0,0 +1,13 @@ +apiVersion: apiregistration.k8s.io/v1 +kind: APIService +metadata: + name: v1beta1.custom.metrics.k8s.io +spec: + insecureSkipTLSVerify: true + group: custom.metrics.k8s.io + groupPriorityMinimum: 100 + versionPriority: 100 + service: + name: custom-metrics-stackdriver-adapter + namespace: custom-metrics + version: v1beta1 diff --git a/modules/custom-metrics-stackdriver-adapter/templates/apiservice_v1beta1.external.metrics.k8s.io.yaml.tftpl b/modules/custom-metrics-stackdriver-adapter/templates/apiservice_v1beta1.external.metrics.k8s.io.yaml.tftpl new file mode 100644 index 000000000..5db70b050 --- /dev/null +++ b/modules/custom-metrics-stackdriver-adapter/templates/apiservice_v1beta1.external.metrics.k8s.io.yaml.tftpl @@ -0,0 +1,13 @@ +apiVersion: apiregistration.k8s.io/v1 +kind: APIService +metadata: + name: v1beta1.external.metrics.k8s.io +spec: + insecureSkipTLSVerify: true + group: external.metrics.k8s.io + groupPriorityMinimum: 100 + versionPriority: 100 + service: + name: custom-metrics-stackdriver-adapter + namespace: custom-metrics + version: v1beta1 diff --git a/modules/custom-metrics-stackdriver-adapter/templates/apiservice_v1beta2.custom.metrics.k8s.io.yaml.tftpl b/modules/custom-metrics-stackdriver-adapter/templates/apiservice_v1beta2.custom.metrics.k8s.io.yaml.tftpl new file mode 100644 index 
000000000..c83b9c625 --- /dev/null +++ b/modules/custom-metrics-stackdriver-adapter/templates/apiservice_v1beta2.custom.metrics.k8s.io.yaml.tftpl @@ -0,0 +1,13 @@ +apiVersion: apiregistration.k8s.io/v1 +kind: APIService +metadata: + name: v1beta2.custom.metrics.k8s.io +spec: + insecureSkipTLSVerify: true + group: custom.metrics.k8s.io + groupPriorityMinimum: 100 + versionPriority: 200 + service: + name: custom-metrics-stackdriver-adapter + namespace: custom-metrics + version: v1beta2 diff --git a/modules/custom-metrics-stackdriver-adapter/templates/clusterrole_custom-metrics-resource-reader.yaml.tftpl b/modules/custom-metrics-stackdriver-adapter/templates/clusterrole_custom-metrics-resource-reader.yaml.tftpl new file mode 100644 index 000000000..710677ada --- /dev/null +++ b/modules/custom-metrics-stackdriver-adapter/templates/clusterrole_custom-metrics-resource-reader.yaml.tftpl @@ -0,0 +1,15 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: custom-metrics-resource-reader +rules: +- apiGroups: + - "" + resources: + - "pods" + - "nodes" + - "nodes/stats" + verbs: + - list + - get + - watch diff --git a/modules/custom-metrics-stackdriver-adapter/templates/clusterrolebinding_custom-metrics-resource-reader.yaml.tftpl b/modules/custom-metrics-stackdriver-adapter/templates/clusterrolebinding_custom-metrics-resource-reader.yaml.tftpl new file mode 100644 index 000000000..8468a16cd --- /dev/null +++ b/modules/custom-metrics-stackdriver-adapter/templates/clusterrolebinding_custom-metrics-resource-reader.yaml.tftpl @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: custom-metrics-resource-reader +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: view +subjects: +- kind: ServiceAccount + name: custom-metrics-stackdriver-adapter + namespace: custom-metrics diff --git 
a/modules/custom-metrics-stackdriver-adapter/templates/clusterrolebinding_custom-metrics:system:auth-delegator.yaml.tftpl b/modules/custom-metrics-stackdriver-adapter/templates/clusterrolebinding_custom-metrics:system:auth-delegator.yaml.tftpl new file mode 100644 index 000000000..940bbe821 --- /dev/null +++ b/modules/custom-metrics-stackdriver-adapter/templates/clusterrolebinding_custom-metrics:system:auth-delegator.yaml.tftpl @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: custom-metrics:system:auth-delegator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: system:auth-delegator +subjects: +- kind: ServiceAccount + name: custom-metrics-stackdriver-adapter + namespace: custom-metrics diff --git a/modules/custom-metrics-stackdriver-adapter/templates/clusterrolebinding_external-metrics-reader.yaml.tftpl b/modules/custom-metrics-stackdriver-adapter/templates/clusterrolebinding_external-metrics-reader.yaml.tftpl new file mode 100644 index 000000000..4f6624836 --- /dev/null +++ b/modules/custom-metrics-stackdriver-adapter/templates/clusterrolebinding_external-metrics-reader.yaml.tftpl @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: external-metrics-reader +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: external-metrics-reader +subjects: +- kind: ServiceAccount + name: horizontal-pod-autoscaler + namespace: kube-system diff --git a/modules/custom-metrics-stackdriver-adapter/templates/deployment_custom-metrics-stackdriver-adapter.yaml.tftpl b/modules/custom-metrics-stackdriver-adapter/templates/deployment_custom-metrics-stackdriver-adapter.yaml.tftpl new file mode 100644 index 000000000..b86aee5e1 --- /dev/null +++ b/modules/custom-metrics-stackdriver-adapter/templates/deployment_custom-metrics-stackdriver-adapter.yaml.tftpl @@ -0,0 +1,37 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: 
custom-metrics-stackdriver-adapter + namespace: custom-metrics + labels: + run: custom-metrics-stackdriver-adapter + k8s-app: custom-metrics-stackdriver-adapter +spec: + replicas: 1 + selector: + matchLabels: + run: custom-metrics-stackdriver-adapter + k8s-app: custom-metrics-stackdriver-adapter + template: + metadata: + labels: + run: custom-metrics-stackdriver-adapter + k8s-app: custom-metrics-stackdriver-adapter + kubernetes.io/cluster-service: "true" + spec: + serviceAccountName: custom-metrics-stackdriver-adapter + containers: + - image: gcr.io/gke-release/custom-metrics-stackdriver-adapter:v0.14.2-gke.0 + imagePullPolicy: Always + name: pod-custom-metrics-stackdriver-adapter + command: + - /adapter + - --use-new-resource-model=true + - --fallback-for-container-metrics=true + resources: + limits: + cpu: 250m + memory: 200Mi + requests: + cpu: 250m + memory: 200Mi diff --git a/modules/custom-metrics-stackdriver-adapter/templates/rolebinding_custom-metrics-auth-reader.yaml.tftpl b/modules/custom-metrics-stackdriver-adapter/templates/rolebinding_custom-metrics-auth-reader.yaml.tftpl new file mode 100644 index 000000000..c56782e28 --- /dev/null +++ b/modules/custom-metrics-stackdriver-adapter/templates/rolebinding_custom-metrics-auth-reader.yaml.tftpl @@ -0,0 +1,13 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: custom-metrics-auth-reader + namespace: kube-system +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: extension-apiserver-authentication-reader +subjects: +- kind: ServiceAccount + name: custom-metrics-stackdriver-adapter + namespace: custom-metrics diff --git a/modules/custom-metrics-stackdriver-adapter/templates/service_custom-metrics-stackdriver-adapter.yaml.tftpl b/modules/custom-metrics-stackdriver-adapter/templates/service_custom-metrics-stackdriver-adapter.yaml.tftpl new file mode 100644 index 000000000..71fb5b347 --- /dev/null +++ 
b/modules/custom-metrics-stackdriver-adapter/templates/service_custom-metrics-stackdriver-adapter.yaml.tftpl @@ -0,0 +1,19 @@ +apiVersion: v1 +kind: Service +metadata: + labels: + run: custom-metrics-stackdriver-adapter + k8s-app: custom-metrics-stackdriver-adapter + kubernetes.io/cluster-service: 'true' + kubernetes.io/name: Adapter + name: custom-metrics-stackdriver-adapter + namespace: custom-metrics +spec: + ports: + - port: 443 + protocol: TCP + targetPort: 443 + selector: + run: custom-metrics-stackdriver-adapter + k8s-app: custom-metrics-stackdriver-adapter + type: ClusterIP \ No newline at end of file diff --git a/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/custom-metrics-stackdriver-adapter/variables.tf b/modules/custom-metrics-stackdriver-adapter/variables.tf similarity index 100% rename from tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/custom-metrics-stackdriver-adapter/variables.tf rename to modules/custom-metrics-stackdriver-adapter/variables.tf diff --git a/modules/jetstream-maxtext-deployment/README.md b/modules/jetstream-maxtext-deployment/README.md new file mode 100644 index 000000000..a43ad6fe2 --- /dev/null +++ b/modules/jetstream-maxtext-deployment/README.md @@ -0,0 +1,171 @@ +This module deploys Jetstream Maxtext to a cluster. If `prometheus_port` is set then a [PodMontoring CR](https://cloud.google.com/stackdriver/docs/managed-prometheus/setup-managed#gmp-pod-monitoring) will be deployed for scraping metrics and exporting them to Google Cloud Monitoring. See the [deployment template](./templates/deployment.yaml.tftpl) to see which command line args are passed by default. For additional configuration please reference the [MaxText base config file](https://github.com/google/maxtext/blob/main/MaxText/configs/base.yml) for a list of configurable command line args and their explainations. 
+ +## Installation via bash and kubectl + +Assure the following environment variables are set: + - MODEL_NAME: The name of your LLM (as of the writing of this README valid options are "gemma-7b", "llama2-7b", "llama2-13b") + - PARAMETERS_PATH: Where to find the parameters for your LLM (if using the checkpoint-converter it will be "gs:\/\/$BUCKET_NAME\/final\/unscanned\/gemma_7b-it\/0\/checkpoints\/0\/items" where $BUCKET_NAME is the same one used in the checkpoint-converter) + - (optional) METRICS_PORT: Port to emit custom metrics on + - (optional) TPU_TOPOLOGY: Topology of TPU chips used by jetstream (default: "2x4") + - (optional) TPU_TYPE: Type of TPUs used (default: "tpu-v5-lite-podslice") + - (optional) TPU_CHIP_COUNT: Number of TPU chips requested, can be obtained by algebraically evaluating TPU_TOPOLOGY + - (optional) MAXENGINE_SERVER_IMAGE: Maxengine server container image + - (optional) JETSTREAM_HTTP_SERVER_IMAGE: Jetstream HTTP server container image + +``` +if [ -z "$MAXENGINE_SERVER_IMAGE" ]; then + MAXENGINE_SERVER_IMAGE="us-docker.pkg.dev\/cloud-tpu-images\/inference\/maxengine-server:v0.2.2" +fi + +if [ -z "$JETSTREAM_HTTP_SERVER_IMAGE" ]; then + JETSTREAM_HTTP_SERVER_IMAGE="us-docker.pkg.dev\/cloud-tpu-images\/inferenc\/jetstream-http:v0.2.2" +fi + +if [ -z "$TPU_TOPOLOGY" ]; then + TPU_TOPOLOGY="2x4" +fi + +if [ -z "$TPU_TYPE" ]; then + TPU_TYPE="tpu-v5-lite-podslice" +fi + +if [ -z "$TPU_CHIP_COUNT" ]; then + TPU_CHIP_COUNT="8" +fi + +if [ -z "$MODEL_NAME" ]; then + echo "Must provide MODEL_NAME in environment" 1>&2 + exit 2; +fi + +if [ -z "$PARAMETERS_PATH" ]; then + echo "Must provide PARAMETERS_PATH in environment" 1>&2 + exit 2; +fi + +JETSTREAM_MANIFEST=$(mktemp) +cat ./templates/deployment.yaml.tftpl >> "$JETSTREAM_MANIFEST" + +PODMONITORING_MANIFEST=$(mktemp) +cat ./templates/podmonitoring.yaml.tftpl >> "$PODMONITORING_MANIFEST" + +if [ "$METRICS_PORT" != "" ]; then + cat $PODMONITORING_MANIFEST | sed "s/\${metrics_port}/$METRICS_PORT/g" 
>> "$PODMONITORING_MANIFEST" + cat $JETSTREAM_MANIFEST | sed "s/\${metrics_port_arg}/prometheus_port=$METRICS_PORT/g" >> "$JETSTREAM_MANIFEST" + + cat $PODMONITORING_MANIFEST | kubectl apply -f - +else + cat $JETSTREAM_MANIFEST | sed "s/\${metrics_port_arg}//g" >> "$JETSTREAM_MANIFEST" +fi + +cat $JETSTREAM_MANIFEST \ +| sed "s/\${tpu-type}/$TPU_TYPE/g" \ +| sed "s/\${tpu-topology}/$TPU_TOPOLOGY/g" \ +| sed "s/\${tpu-chip-count}/$TPU_CHIP_COUNT/g" \ +| sed "s/\${maxengine_server_image}/$MAXENGINE_SERVER_IMAGE/g" \ +| sed "s/\${jetstream_http_server_image}/$JETSTREAM_HTTP_SERVER_IMAGE/g" \ +| sed "s/\${model_name}/$MODEL_NAME/g" \ +| sed "s/\${load_parameters_path_arg}/$PARAMETERS_PATH/g" >> "$JETSTREAM_MANIFEST" + +cat $JETSTREAM_MANIFEST | kubectl apply -f - +``` +## (Optional) Autoscaling Components + +Applying the following resources to your cluster will enable you to scale the number of Jetstream server pods with custom or system metrics: + - Metrics Adapter (either [Prometheus-adapter](https://github.com/kubernetes-sigs/prometheus-adapter)(recommended) or [CMSA](https://github.com/GoogleCloudPlatform/k8s-stackdriver/tree/master/custom-metrics-stackdriver-adapter)): For making metrics from the Google Cloud Monitoring API visible to resources within the cluster. + - [Horizontal Pod Autoscaler (HPA)](https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/): For reading metrics and setting the maxengine-servers deployments replica count accordingly. + +### Metrics Adapter + +#### Custom Metrics Stackdriver Adapter + +Follow the [Custom-metrics-stackdriver-adapter README](https://github.com/GoogleCloudPlatform/ai-on-gke/tree/main/modules/custom-metrics-stackdriver-adapter/README.md) to install without terraform. + +Once installed the values of the following metrics can be used as averageValues in a HorizontalPodAutoscaler (HPA): + - Jetstream metrics (i.e. 
any metric prefixed with "jetstream_") + - "memory_used" (the current sum of memory usage across all accelerators used by a node in bytes, note this value can be extremely large since the unit of measurement is bytes) + +#### Prometheus Adapter + +Follow the [Prometheus-adapter README](https://github.com/GoogleCloudPlatform/ai-on-gke/tree/main/modules/prometheus-adapter/README.md) to install without terraform. A few notes: + +This module uses the the prometheus-community/prometheus-adapter Helm chart as part of the install process, it has a values file that requires "CLUSTER_NAME" to be replaced with your cluster name in order to properly filter metrics. This is a consequence of differing cluster name schemes between GKE and standard k8s clusters. Instructions for each are as follows for if the cluster name isnt already known. For GKE clusters, Remove any characters prior to and including the last underscore with `kubectl config current-context | awk -F'_' ' { print $NF }'` to get the cluster name. For other clusters, The cluster name is simply: `kubectl config current-context`. + +Instructions to set the PROMETHEUS_HELM_VALUES_FILE env var as follows: + +``` +PROMETHEUS_HELM_VALUES_FILE=$(mktemp) +sed "s/\${cluster_name}/$CLUSTER_NAME/g" ../templates/values.yaml.tftpl >> "$PROMETHEUS_HELM_VALUES_FILE" +``` + +Once installed the values of the following metrics can be used as averageValues in a HorizontalPodAutoscaler (HPA): + - Jetstream metrics (i.e. 
any metric prefixed with "jetstream_") + - "memory_used_percentage" (the percentage of total accelerator memory used across all accelerators used by a node) + +### Horizontal Pod Autoscalers + +The following should be run for each HPA, assure the following are set before running: + - ADAPTER: The adapter currently in cluster, can be either 'custom-metrics-stackdriver-adapter' or 'prometheus-adapter' + - MIN_REPLICAS: Lower bound for number of jetstream replicas + - MAX_REPLICAS: Upper bound for number of jetstream replicas + - METRIC: The metrics whose value will be compared against the average value, can be any metric listed above + - AVERAGE_VALUE: Average value to be used for calculating replica cound, see [docs](https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details) for more details + + ``` +if [ -z "$ADAPTER" ]; then + echo "Must provide ADAPTER in environment" 1>&2 + exit 2; +fi + +if [ -z "$MIN_REPLICAS" ]; then + echo "Must provide MIN_REPLICAS in environment" 1>&2 + exit 2; +fi + +if [ -z "$MAX_REPLICAS" ]; then + echo "Must provide MAX_REPLICAS in environment" 1>&2 + exit 2; +fi + +if [[ $METRIC =~ ^jetstream_.* ]]; then + METRICS_SOURCE_TYPE="Pods" + METRICS_SOURCE="pods" +elif [ $METRIC == memory_used ] && [ "$ADAPTER" == custom-metrics-stackdriver-adapter ]; then + METRICS_SOURCE_TYPE="External" + METRICS_SOURCE="external" + METRIC="kubernetes.io|node|accelerator|${METRIC}" +elif [ $METRIC == memory_used_percentage ] && [ "$ADAPTER" == prometheus-adapter ]; then + METRICS_SOURCE_TYPE="External" + METRICS_SOURCE="external" +else + echo "Must provide valid METRIC for ${ADAPTER} in environment" 1>&2 + exit 2; +fi + +if [ -z "$AVERAGE_VALUE" ]; then + echo "Must provide AVERAGE_VALUE in environment" 1>&2 + exit 2; +fi + +echo "apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: jetstream-hpa-$(uuidgen) + namespace: default +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + 
name: maxengine-server + minReplicas: ${MIN_REPLICAS} + maxReplicas: ${MAX_REPLICAS} + metrics: + - type: ${METRICS_SOURCE_TYPE} + ${METRICS_SOURCE}: + metric: + name: ${METRIC} + target: + type: AverageValue + averageValue: ${AVERAGE_VALUE} +" | kubectl apply -f - + ``` diff --git a/modules/jetstream-maxtext-deployment/main.tf b/modules/jetstream-maxtext-deployment/main.tf new file mode 100644 index 000000000..8777cb84f --- /dev/null +++ b/modules/jetstream-maxtext-deployment/main.tf @@ -0,0 +1,113 @@ +/** + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +locals { + deployment_template = "${path.module}/templates/deployment.yaml.tftpl" + service_template = "${path.module}/templates/service.yaml.tftpl" + podmonitoring_template = "${path.module}/templates/podmonitoring.yaml.tftpl" + cmsa_jetstream_hpa_template = "${path.module}/templates/custom-metrics-stackdriver-adapter/hpa.jetstream.yaml.tftpl" + prometheus_jetstream_hpa_template = "${path.module}/templates/prometheus-adapter/hpa.jetstream.yaml.tftpl" +} + +resource "kubernetes_manifest" "jetstream-deployment" { + count = 1 + manifest = yamldecode(templatefile(local.deployment_template, { + maxengine_server_image = var.maxengine_deployment_settings.maxengine_server_image + jetstream_http_server_image = var.maxengine_deployment_settings.jetstream_http_server_image + model_name = var.maxengine_deployment_settings.model_name + load_parameters_path_arg = var.maxengine_deployment_settings.parameters_path + metrics_port_arg = var.maxengine_deployment_settings.metrics_port != null ? format("prometheus_port=%d", var.maxengine_deployment_settings.metrics_port) : "", + tpu-topology = var.maxengine_deployment_settings.accelerator_selectors.topology + tpu-type = var.maxengine_deployment_settings.accelerator_selectors.accelerator + tpu-chip-count = var.maxengine_deployment_settings.accelerator_selectors.chip_count + })) +} + +resource "kubernetes_manifest" "jetstream-service" { + count = 1 + manifest = yamldecode(file(local.service_template)) +} + +resource "kubernetes_manifest" "jetstream-podmonitoring" { + count = var.maxengine_deployment_settings.metrics_port != null ? 1 : 0 + manifest = yamldecode(templatefile(local.podmonitoring_template, { + metrics_port = var.maxengine_deployment_settings.metrics_port != null ? 
var.maxengine_deployment_settings.metrics_port : "", + metrics_scrape_interval = var.maxengine_deployment_settings.metrics_scrape_interval + })) +} + +module "custom_metrics_stackdriver_adapter" { + count = var.hpa_config.metrics_adapter == "custom-metrics-stackdriver-adapter" ? 1 : 0 + source = "../custom-metrics-stackdriver-adapter" + workload_identity = { + enabled = true + project_id = var.project_id + } +} + +module "prometheus_adapter" { + count = var.hpa_config.metrics_adapter == "prometheus-adapter" ? 1 : 0 + source = "../prometheus-adapter" + credentials_config = { + kubeconfig = { + path : "~/.kube/config" + } + } + project_id = var.project_id + config_file = templatefile("${path.module}/templates/prometheus-adapter/values.yaml.tftpl", { + cluster_name = var.cluster_name + }) +} + +resource "kubernetes_manifest" "prometheus_adapter_hpa_custom_metric" { + for_each = { + for index, rule in var.hpa_config.rules : + index => { + index = index + target_query = rule.target_query + average_value_target = rule.average_value_target + } + if var.maxengine_deployment_settings.custom_metrics_enabled && var.hpa_config.metrics_adapter == "prometheus-adapter" + } + + manifest = yamldecode(templatefile(local.prometheus_jetstream_hpa_template, { + index = each.value.index + hpa_type = try(each.value.target_query, "") + hpa_averagevalue_target = try(each.value.average_value_target, 1) + hpa_min_replicas = var.hpa_config.min_replicas + hpa_max_replicas = var.hpa_config.max_replicas + })) +} + +resource "kubernetes_manifest" "cmsa_hpa_custom_metric" { + for_each = { + for index, rule in var.hpa_config.rules : + index => { + index = index + target_query = rule.target_query + average_value_target = rule.average_value_target + } + if var.maxengine_deployment_settings.custom_metrics_enabled && var.hpa_config.metrics_adapter == "custom-metrics-stackdriver-adapter" + } + + manifest = yamldecode(templatefile(local.cmsa_jetstream_hpa_template, { + index = each.value.index + hpa_type 
= try(each.value.target_query, "") + hpa_averagevalue_target = try(each.value.average_value_target, 1) + hpa_min_replicas = var.hpa_config.min_replicas + hpa_max_replicas = var.hpa_config.max_replicas + })) +} diff --git a/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/hpa-templates/hpa.jetstream.yaml.tftpl b/modules/jetstream-maxtext-deployment/templates/custom-metrics-stackdriver-adapter/hpa.jetstream.yaml.tftpl similarity index 79% rename from tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/hpa-templates/hpa.jetstream.yaml.tftpl rename to modules/jetstream-maxtext-deployment/templates/custom-metrics-stackdriver-adapter/hpa.jetstream.yaml.tftpl index 7bf9bd0fd..f279e52fd 100644 --- a/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/hpa-templates/hpa.jetstream.yaml.tftpl +++ b/modules/jetstream-maxtext-deployment/templates/custom-metrics-stackdriver-adapter/hpa.jetstream.yaml.tftpl @@ -1,8 +1,8 @@ apiVersion: autoscaling/v2 kind: HorizontalPodAutoscaler metadata: - name: jetstream-hpa - namespace: ${namespace} + name: jetstream-hpa-${index} + namespace: default spec: scaleTargetRef: apiVersion: apps/v1 @@ -20,12 +20,11 @@ spec: type: AverageValue averageValue: ${hpa_averagevalue_target} %{ else } - - type: Pods - pods: + - type: External + external: metric: - name: kubernetes.io|node|accelerator|memory_used + name: kubernetes.io|node|accelerator|${hpa_type} target: type: AverageValue averageValue: ${hpa_averagevalue_target} -%{ endif } - +%{ endif } \ No newline at end of file diff --git a/modules/jetstream-maxtext-deployment/templates/deployment.yaml.tftpl b/modules/jetstream-maxtext-deployment/templates/deployment.yaml.tftpl new file mode 100644 index 000000000..94f3a22ca --- /dev/null +++ b/modules/jetstream-maxtext-deployment/templates/deployment.yaml.tftpl @@ -0,0 +1,51 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: 
maxengine-server + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: maxengine-server + template: + metadata: + labels: + app: maxengine-server + spec: + nodeSelector: + cloud.google.com/gke-tpu-topology: ${tpu-topology} + cloud.google.com/gke-tpu-accelerator: ${tpu-type} + containers: + - name: maxengine-server + image: ${maxengine_server_image} + imagePullPolicy: Always + securityContext: + privileged: true + args: + - model_name=${model_name} + - tokenizer_path=assets/tokenizer.gemma + - per_device_batch_size=4 + - max_prefill_predict_length=1024 + - max_target_length=2048 + - async_checkpointing=false + - ici_fsdp_parallelism=1 + - ici_autoregressive_parallelism=-1 + - ici_tensor_parallelism=1 + - scan_layers=false + - weight_dtype=bfloat16 + - attention=dot_product + - load_parameters_path=${load_parameters_path_arg} + - ${metrics_port_arg} + ports: + - containerPort: 9000 + resources: + requests: + google.com/tpu: ${tpu-chip-count} + limits: + google.com/tpu: ${tpu-chip-count} + - name: jetstream-http + image: ${jetstream_http_server_image} + imagePullPolicy: Always + ports: + - containerPort: 8000 diff --git a/modules/jetstream-maxtext-deployment/templates/podmonitoring.yaml.tftpl b/modules/jetstream-maxtext-deployment/templates/podmonitoring.yaml.tftpl new file mode 100644 index 000000000..89b705e2d --- /dev/null +++ b/modules/jetstream-maxtext-deployment/templates/podmonitoring.yaml.tftpl @@ -0,0 +1,15 @@ +apiVersion: monitoring.googleapis.com/v1 +kind: PodMonitoring +metadata: + name: jetstream-podmonitoring + namespace: default +spec: + endpoints: + - interval: ${metrics_scrape_interval}s + path: "/" + port: ${metrics_port} + targetLabels: + metadata: + - pod + - container + - node \ No newline at end of file diff --git a/modules/jetstream-maxtext-deployment/templates/prometheus-adapter/hpa.jetstream.yaml.tftpl b/modules/jetstream-maxtext-deployment/templates/prometheus-adapter/hpa.jetstream.yaml.tftpl new file mode 100644 index 
000000000..93ec1e278 --- /dev/null +++ b/modules/jetstream-maxtext-deployment/templates/prometheus-adapter/hpa.jetstream.yaml.tftpl @@ -0,0 +1,30 @@ +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: jetstream-hpa-${index} + namespace: default +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: maxengine-server + minReplicas: ${hpa_min_replicas} + maxReplicas: ${hpa_max_replicas} + metrics: +%{ if length(regexall("jetstream_.*", hpa_type)) > 0 } + - type: Pods + pods: + metric: + name: ${hpa_type} + target: + type: AverageValue + averageValue: ${hpa_averagevalue_target} +%{ else } + - type: External + external: + metric: + name: ${hpa_type} + target: + type: AverageValue + averageValue: ${hpa_averagevalue_target} +%{ endif } \ No newline at end of file diff --git a/modules/jetstream-maxtext-deployment/templates/prometheus-adapter/values.yaml.tftpl b/modules/jetstream-maxtext-deployment/templates/prometheus-adapter/values.yaml.tftpl new file mode 100644 index 000000000..a795825a0 --- /dev/null +++ b/modules/jetstream-maxtext-deployment/templates/prometheus-adapter/values.yaml.tftpl @@ -0,0 +1,38 @@ +rules: + default: false + external: + - seriesQuery: 'jetstream_prefill_backlog_size' + resources: + template: <<.Resource>> + name: + matches: "" + as: "jetstream_prefill_backlog_size" + metricsQuery: sum(<<.Series>>{<<.LabelMatchers>>,cluster="${cluster_name}"}) + - seriesQuery: 'jetstream_transfer_backlog_size' + resources: + template: <<.Resource>> + name: + matches: "" + as: "jetstream_transfer_backlog_size" + metricsQuery: sum(<<.Series>>{<<.LabelMatchers>>,cluster="${cluster_name}"}) + - seriesQuery: 'jetstream_generate_backlog_size' + resources: + template: <<.Resource>> + name: + matches: "" + as: "jetstream_generate_backlog_size" + metricsQuery: sum(<<.Series>>{<<.LabelMatchers>>,cluster="${cluster_name}"}) + - seriesQuery: 'jetstream_slots_used_percentage' + resources: + template: <<.Resource>> + name: + 
matches: "" + as: "jetstream_slots_used_percentage" + metricsQuery: sum(<<.Series>>{<<.LabelMatchers>>,cluster="${cluster_name}"}) + - seriesQuery: 'kubernetes_io:node_accelerator_memory_used' + resources: + template: <<.Resource>> + name: + matches: "" + as: "memory_used_percentage" + metricsQuery: avg(kubernetes_io:node_accelerator_memory_used{cluster_name="${cluster_name}"}) / avg(kubernetes_io:node_accelerator_memory_total{cluster_name="${cluster_name}"}) \ No newline at end of file diff --git a/modules/jetstream-maxtext-deployment/templates/service.yaml.tftpl b/modules/jetstream-maxtext-deployment/templates/service.yaml.tftpl new file mode 100644 index 000000000..0db1bc6c4 --- /dev/null +++ b/modules/jetstream-maxtext-deployment/templates/service.yaml.tftpl @@ -0,0 +1,17 @@ +apiVersion: v1 +kind: Service +metadata: + name: jetstream-svc + namespace: default +spec: + selector: + app: maxengine-server + ports: + - protocol: TCP + name: jetstream-http + port: 8000 + targetPort: 8000 + - protocol: TCP + name: jetstream-grpc + port: 9000 + targetPort: 9000 \ No newline at end of file diff --git a/modules/jetstream-maxtext-deployment/variables.tf b/modules/jetstream-maxtext-deployment/variables.tf new file mode 100644 index 000000000..55878b7a3 --- /dev/null +++ b/modules/jetstream-maxtext-deployment/variables.tf @@ -0,0 +1,88 @@ +/** + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +variable "cluster_name" { + type = string + nullable = false +} + +variable "project_id" { + type = string + nullable = false +} + +variable "maxengine_deployment_settings" { + type = object({ + maxengine_server_image = string + jetstream_http_server_image = string + + model_name = string // Name of your LLM (for example: "gemma-7b") + parameters_path = string // Path to the parameters for your model + metrics_port = optional(number) // Emit Jetstream metrics on this port of each container + custom_metrics_enabled = bool // Whether or not custom metrics are also emitted + metrics_scrape_interval = optional(number) // Interval for scraping metrics (default: 10s) + + accelerator_selectors = object({ + topology = string + accelerator = string + chip_count = number + }) + }) + + validation { + condition = contains(["gemma-7b", "llama2-7b", "llama2-13b"], var.maxengine_deployment_settings.model_name) + error_message = "model_name must be one of \"gemma-7b\", \"llama2-7b\", or \"llama2-13b\"" + } +} + +variable "hpa_config" { + type = object({ + metrics_adapter = string + min_replicas = number + max_replicas = number + rules = list(object({ + target_query = string + average_value_target = number + })) + }) + default = null + + validation { + condition = alltrue([ + for hpa_config in var.hpa_config.rules : + hpa_config.target_query != null && hpa_config.average_value_target != null && length(regexall("jetstream_.*", hpa_config.target_query)) > 0 || length(regexall("memory_used", hpa_config.target_query)) > 0 || length(regexall("memory_used_percentage", hpa_config.target_query)) > 0 + ]) + error_message = "Allowed values for hpa_type are {null, memory_used, predefined promql queries (i.e.
memory_used_percentage, or jetstream metrics (e.g., \"jetstream_prefill_backlog_size\", \"jetstream_slots_used_percentage\")}" + } + validation { + condition = var.hpa_config.metrics_adapter == "custom-metrics-stackdriver-adapter" && alltrue([ + for hpa_config in var.hpa_config.rules : + hpa_config.target_query != null && hpa_config.average_value_target != null && length(regexall("jetstream_.*", hpa_config.target_query)) > 0 || length(regexall("memory_used", hpa_config.target_query)) > 0 + ]) || var.hpa_config.metrics_adapter != "custom-metrics-stackdriver-adapter" + error_message = "Allowed values for target_query when using the custom-metrics-stackdriver are \"memory_used\", or jetstream metrics (i.e. \"jetstream_prefill_backlog_size\", \"jetstream_slots_used_percentage\", etc)" + } + validation { + condition = var.hpa_config.metrics_adapter == "prometheus-adapter" && alltrue([ + for hpa_config in var.hpa_config.rules : + hpa_config.target_query != null && hpa_config.average_value_target != null && length(regexall("jetstream_.*", hpa_config.target_query)) > 0 || length(regexall("memory_used_percentage", hpa_config.target_query)) > 0 + ]) || var.hpa_config.metrics_adapter != "prometheus-adapter" + error_message = "Allowed values for target_query when using the prometheus adapter include predefined promql queries (i.e. \"memory_used_percentage\") and jetstream metrics (i.e. \"jetstream_prefill_backlog_size\", \"jetstream_slots_used_percentage\", etc)" + } + validation { + condition = contains(["", "custom-metrics-stackdriver-adapter", "prometheus-adapter"], var.hpa_config.metrics_adapter) + error_message = "Allowed values for metrics_adapter are \"custom-metrics-stackdriver-adapter\", or \"prometheus-adapter\"." 
+ } +} \ No newline at end of file diff --git a/modules/kuberay-cluster/values.yaml b/modules/kuberay-cluster/values.yaml index cec35e35f..a1028fd0a 100644 --- a/modules/kuberay-cluster/values.yaml +++ b/modules/kuberay-cluster/values.yaml @@ -36,7 +36,7 @@ head: # If enableInTreeAutoscaling is true, the autoscaler sidecar will be added to the Ray head pod. # Ray autoscaler integration is supported only for Ray versions >= 1.11.0 # Ray autoscaler integration is Beta with KubeRay >= 0.3.0 and Ray >= 2.0.0. - # enableInTreeAutoscaling: true + enableInTreeAutoscaling: true # autoscalerOptions is an OPTIONAL field specifying configuration overrides for the Ray autoscaler. # The example configuration shown below below represents the DEFAULT values. # autoscalerOptions: @@ -95,17 +95,17 @@ head: # Ray recommends at least 8G memory for production workloads. memory: "8G" # Sum of ephemeral storage requests must be max 10Gi on Autopilot default class. - # This includes, ray-head, gcsfuse-sidecar, and fluent-bit. - ephemeral-storage: 4Gi + # This includes, ray-head, gcsfuse-sidecar, fluent-bit, and ray Autoscaler sidecar which requests 1Gi by default. 
+ ephemeral-storage: 3Gi requests: cpu: "4" memory: "8G" - ephemeral-storage: 4Gi + ephemeral-storage: 3Gi annotations: gke-gcsfuse/volumes: "true" gke-gcsfuse/cpu-limit: "1" gke-gcsfuse/memory-limit: 2Gi - gke-gcsfuse/ephemeral-storage-limit: 4Gi + gke-gcsfuse/ephemeral-storage-limit: 3Gi nodeSelector: iam.gke.io/gke-metadata-server-enabled: "true" tolerations: [] @@ -165,7 +165,9 @@ worker: # uncomment the line below # disabled: true groupName: workerGroup - replicas: 1 + replicas: 0 + minReplicas: 0 + maxReplicas: 5 type: worker labels: cloud.google.com/gke-ray-node-type: worker diff --git a/modules/prometheus-adapter/README.md b/modules/prometheus-adapter/README.md new file mode 100644 index 000000000..d2190e24e --- /dev/null +++ b/modules/prometheus-adapter/README.md @@ -0,0 +1,20 @@ +This module deploys a [prometheus-adapter](https://github.com/kubernetes-sigs/prometheus-adapter) and a [Prometheus frontend](https://github.com/GoogleCloudPlatform/prometheus-engine/blob/main/examples/frontend.yaml) to a cluster. See [prometheus-adapter](https://github.com/kubernetes-sigs/prometheus-adapter) repo for more details. 
+ +## Installation via bash and helm + +Ensure the following environment variables are set: + - PROJECT_ID: GKE Project ID + - (optional) PROMETHEUS_HELM_VALUES_FILE: Values file to pass when deploying `prometheus-community/prometheus-adapter` chart + +``` +curl https://raw.githubusercontent.com/GoogleCloudPlatform/prometheus-engine/v0.10.0/examples/frontend.yaml | envsubst | kubectl apply -f - + +helm repo add prometheus-community https://prometheus-community.github.io/helm-charts +helm repo update + +if [ -z "$PROMETHEUS_HELM_VALUES_FILE" ]; then + helm install example-release prometheus-community/prometheus-adapter +else + helm install example-release prometheus-community/prometheus-adapter -f "$PROMETHEUS_HELM_VALUES_FILE" +fi +``` diff --git a/modules/prometheus-adapter/main.tf b/modules/prometheus-adapter/main.tf new file mode 100644 index 000000000..df78d5d30 --- /dev/null +++ b/modules/prometheus-adapter/main.tf @@ -0,0 +1,121 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +resource "helm_release" "prometheus_adapter" { + name = "my-release" + chart = "prometheus-adapter" + repository = "https://prometheus-community.github.io/helm-charts" + values = var.config_file != "" ?
[var.config_file] : [] +} + +resource "kubernetes_deployment_v1" "frontend" { + metadata { + name = "frontend" + labels = { + "app" : "frontend" + } + } + spec { + replicas = 2 + selector { + match_labels = { + "app" : "frontend" + } + } + template { + metadata { + labels = { + "app" : "frontend" + } + } + spec { + automount_service_account_token = true + affinity { + node_affinity { + required_during_scheduling_ignored_during_execution { + node_selector_term { + match_expressions { + key = "kubernetes.io/arch" + operator = "In" + values = [ + "arm64", + "amd64" + ] + } + match_expressions { + key = "kubernetes.io/os" + operator = "In" + values = [ + "linux" + ] + } + } + } + } + } + container { + name = "frontend" + image = "gke.gcr.io/prometheus-engine/frontend:v0.8.0-gke.4" + args = [ + "--web.listen-address=:9090", + format("--query.project-id=%s", var.project_id) + ] + port { + name = "web" + container_port = 9090 + } + readiness_probe { + http_get { + path = "/-/ready" + port = "web" + } + } + security_context { + allow_privilege_escalation = false + capabilities { + drop = ["all"] + } + privileged = false + run_as_group = 1000 + run_as_non_root = true + run_as_user = 1000 + } + liveness_probe { + http_get { + path = "/-/healthy" + port = "web" + } + } + } + } + } + } +} + +resource "kubernetes_service_v1" "frontend-service" { + metadata { + name = "prometheus" + } + spec { + cluster_ip = "None" + selector = { + "app" : "frontend" + } + port { + name = "web" + port = 9090 + } + + } +} \ No newline at end of file diff --git a/modules/prometheus-adapter/variables.tf b/modules/prometheus-adapter/variables.tf new file mode 100644 index 000000000..f9f16d2c5 --- /dev/null +++ b/modules/prometheus-adapter/variables.tf @@ -0,0 +1,45 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +variable "credentials_config" { + description = "Configure how Terraform authenticates to the cluster." + type = object({ + fleet_host = optional(string) + kubeconfig = optional(object({ + context = optional(string) + path = optional(string, "~/.kube/config") + })) + }) + nullable = false + validation { + condition = ( + (var.credentials_config.fleet_host != null) != + (var.credentials_config.kubeconfig != null) + ) + error_message = "Exactly one of fleet host or kubeconfig must be set." + } +} + +variable "project_id" { + type = string + description = "GCP project id" + nullable = false +} + +variable "config_file" { + type = string + description = "Values file for prometheus-config Helm chart" + nullable = false + default = "" +} \ No newline at end of file diff --git a/ray-on-gke/guides/tpu/README.md b/ray-on-gke/guides/tpu/README.md index 356094a7b..2a77605ef 100644 --- a/ray-on-gke/guides/tpu/README.md +++ b/ray-on-gke/guides/tpu/README.md @@ -56,7 +56,7 @@ After installing cert-manager, it may take up to two minutes for the certificate Installing the webhook: 1. `git clone https://github.com/GoogleCloudPlatform/ai-on-gke` -2. `cd applications/ray/kuberay-tpu-webhook` +2. `cd ai-on-gke/ray-on-gke/tpu/kuberay-tpu-webhook` 3. 
`make deploy` - this will create the webhook deployment, configs, and service in the "ray-system" namespace - to change the namespace, edit the "namespace" value in each .yaml in deployments/ and certs/ diff --git a/applications/ray/kuberay-tpu-webhook/Dockerfile b/ray-on-gke/tpu/kuberay-tpu-webhook/Dockerfile similarity index 100% rename from applications/ray/kuberay-tpu-webhook/Dockerfile rename to ray-on-gke/tpu/kuberay-tpu-webhook/Dockerfile diff --git a/applications/ray/kuberay-tpu-webhook/Makefile b/ray-on-gke/tpu/kuberay-tpu-webhook/Makefile similarity index 100% rename from applications/ray/kuberay-tpu-webhook/Makefile rename to ray-on-gke/tpu/kuberay-tpu-webhook/Makefile diff --git a/applications/ray/kuberay-tpu-webhook/README.md b/ray-on-gke/tpu/kuberay-tpu-webhook/README.md similarity index 100% rename from applications/ray/kuberay-tpu-webhook/README.md rename to ray-on-gke/tpu/kuberay-tpu-webhook/README.md diff --git a/applications/ray/kuberay-tpu-webhook/Troubleshooting.md b/ray-on-gke/tpu/kuberay-tpu-webhook/Troubleshooting.md similarity index 100% rename from applications/ray/kuberay-tpu-webhook/Troubleshooting.md rename to ray-on-gke/tpu/kuberay-tpu-webhook/Troubleshooting.md diff --git a/applications/ray/kuberay-tpu-webhook/certs/cert.yaml b/ray-on-gke/tpu/kuberay-tpu-webhook/certs/cert.yaml similarity index 100% rename from applications/ray/kuberay-tpu-webhook/certs/cert.yaml rename to ray-on-gke/tpu/kuberay-tpu-webhook/certs/cert.yaml diff --git a/applications/ray/kuberay-tpu-webhook/deployments/deployment.yaml b/ray-on-gke/tpu/kuberay-tpu-webhook/deployments/deployment.yaml similarity index 100% rename from applications/ray/kuberay-tpu-webhook/deployments/deployment.yaml rename to ray-on-gke/tpu/kuberay-tpu-webhook/deployments/deployment.yaml diff --git a/applications/ray/kuberay-tpu-webhook/deployments/mutating-webhook-cfg.yaml b/ray-on-gke/tpu/kuberay-tpu-webhook/deployments/mutating-webhook-cfg.yaml similarity index 100% rename from 
applications/ray/kuberay-tpu-webhook/deployments/mutating-webhook-cfg.yaml rename to ray-on-gke/tpu/kuberay-tpu-webhook/deployments/mutating-webhook-cfg.yaml diff --git a/applications/ray/kuberay-tpu-webhook/deployments/validating-webhook-cfg.yaml b/ray-on-gke/tpu/kuberay-tpu-webhook/deployments/validating-webhook-cfg.yaml similarity index 100% rename from applications/ray/kuberay-tpu-webhook/deployments/validating-webhook-cfg.yaml rename to ray-on-gke/tpu/kuberay-tpu-webhook/deployments/validating-webhook-cfg.yaml diff --git a/applications/ray/kuberay-tpu-webhook/deployments/webhook-svc.yaml b/ray-on-gke/tpu/kuberay-tpu-webhook/deployments/webhook-svc.yaml similarity index 100% rename from applications/ray/kuberay-tpu-webhook/deployments/webhook-svc.yaml rename to ray-on-gke/tpu/kuberay-tpu-webhook/deployments/webhook-svc.yaml diff --git a/applications/ray/kuberay-tpu-webhook/go.mod b/ray-on-gke/tpu/kuberay-tpu-webhook/go.mod similarity index 100% rename from applications/ray/kuberay-tpu-webhook/go.mod rename to ray-on-gke/tpu/kuberay-tpu-webhook/go.mod diff --git a/applications/ray/kuberay-tpu-webhook/go.sum b/ray-on-gke/tpu/kuberay-tpu-webhook/go.sum similarity index 100% rename from applications/ray/kuberay-tpu-webhook/go.sum rename to ray-on-gke/tpu/kuberay-tpu-webhook/go.sum diff --git a/applications/ray/kuberay-tpu-webhook/main.go b/ray-on-gke/tpu/kuberay-tpu-webhook/main.go similarity index 100% rename from applications/ray/kuberay-tpu-webhook/main.go rename to ray-on-gke/tpu/kuberay-tpu-webhook/main.go diff --git a/applications/ray/kuberay-tpu-webhook/samples/tpu-test.py b/ray-on-gke/tpu/kuberay-tpu-webhook/samples/tpu-test.py similarity index 100% rename from applications/ray/kuberay-tpu-webhook/samples/tpu-test.py rename to ray-on-gke/tpu/kuberay-tpu-webhook/samples/tpu-test.py diff --git a/applications/ray/kuberay-tpu-webhook/webhook_main_test.go b/ray-on-gke/tpu/kuberay-tpu-webhook/webhook_main_test.go similarity index 100% rename from 
applications/ray/kuberay-tpu-webhook/webhook_main_test.go rename to ray-on-gke/tpu/kuberay-tpu-webhook/webhook_main_test.go diff --git a/tpu-provisioner/admission_controller/README.md b/tpu-provisioner/admission_controller/README.md index 269efffe0..8a5e9fce2 100644 --- a/tpu-provisioner/admission_controller/README.md +++ b/tpu-provisioner/admission_controller/README.md @@ -46,9 +46,17 @@ kubectl apply --server-side -f https://github.com/kubernetes-sigs/jobset/release kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.14.5/cert-manager.yaml ``` +Wait for dependencies to become ready. NOTE: You might need to edit/remove Deployment resource requests based on your machine size. + +```bash +kubectl rollout status --timeout=30s deployment -n jobset-system jobset-controller-manager +kubectl rollout status --timeout=30s deployment -n cert-manager cert-manager cert-manager-cainjector cert-manager-webhook +``` + Deploy the controller locally. ```bash +kubectl create namespace tpu-provisioner-system skaffold dev ``` @@ -60,9 +68,10 @@ To run unit tests, run the command `pytest` from the `admission_controller/` dir ### Run E2E tests -E2E testing is currently done manually via the following steps: +Run the steps above in the Local Development section. Make sure that the `skaffold dev` step is running. -1. [Install JobSet](https://jobset.sigs.k8s.io/docs/installation/) -2. **Deploy admission controller**: Run `kubectl apply -f manifests/` from the `admission_controller/` directory. -3. **Create a test JobSet**: Run `kubectl apply -f test/test-jobset.yaml` -4. **Check Jobs were mutated correctly**: Run `kubectl describe jobs` and view the nodeSelectors in the pod template. +Run the e2e test script. 
+ +```bash +./test/e2e/test.sh +``` diff --git a/tpu-provisioner/admission_controller/manifests/manifest.yaml b/tpu-provisioner/admission_controller/manifests/manifest.yaml index ad3092582..9b6d85e73 100644 --- a/tpu-provisioner/admission_controller/manifests/manifest.yaml +++ b/tpu-provisioner/admission_controller/manifests/manifest.yaml @@ -26,6 +26,14 @@ webhooks: apiVersions: ["v1"] resources: ["jobs"] scope: "Namespaced" + objectSelector: + matchExpressions: + # Only mutate Jobs managed by a JobSet + - key: jobset.sigs.k8s.io/job-key + operator: Exists + - key: tpu-provisioner.cloud.google.com/disable-autoprovisioning + operator: NotIn + values: ["true"] failurePolicy: Fail timeoutSeconds: 20 clientConfig: diff --git a/tpu-provisioner/admission_controller/test/e2e/manifests/test-disabled-provisioning.yaml b/tpu-provisioner/admission_controller/test/e2e/manifests/test-disabled-provisioning.yaml new file mode 100644 index 000000000..9076c7282 --- /dev/null +++ b/tpu-provisioner/admission_controller/test/e2e/manifests/test-disabled-provisioning.yaml @@ -0,0 +1,43 @@ +apiVersion: jobset.x-k8s.io/v1alpha2 +kind: JobSet +metadata: + # Running this JobSet should result in Jobs having the nodeSelector `cloud.google.com/gke-location-hint=cell` injected + # using the example admission controller webhook. + name: test-disabled-provisioning + annotations: + alpha.jobset.sigs.k8s.io/exclusive-topology: cloud.google.com/gke-nodepool # 1:1 job replica to node pool assignment +spec: + failurePolicy: + maxRestarts: 3 + replicatedJobs: + - name: workers + replicas: 3 # set to number of node pools + template: + metadata: + labels: + tpu-provisioner.cloud.google.com/disable-autoprovisioning: "true" + spec: + backoffLimit: 0 + # completions and parallelism should be the number of cores divided by 8 + # (e.g. 
4 for a v4-32) + completions: 2 + parallelism: 2 + template: + spec: + nodeSelector: + cloud.google.com/reservation-name: "my-reservation" + restartPolicy: Never + containers: + - name: tpu-job + image: python:3.8 + ports: + - containerPort: 8471 # Default port using which TPU VMs communicate + securityContext: + privileged: true + command: + - bash + - -c + - | + echo "Sleeping..." + sleep 100 + echo "Done!" diff --git a/tpu-provisioner/admission_controller/test/manual_e2e/test-location-hint-no-reservation.yaml b/tpu-provisioner/admission_controller/test/e2e/manifests/test-location-hint-no-reservation.yaml similarity index 96% rename from tpu-provisioner/admission_controller/test/manual_e2e/test-location-hint-no-reservation.yaml rename to tpu-provisioner/admission_controller/test/e2e/manifests/test-location-hint-no-reservation.yaml index c4f7cf305..d957f94e6 100644 --- a/tpu-provisioner/admission_controller/test/manual_e2e/test-location-hint-no-reservation.yaml +++ b/tpu-provisioner/admission_controller/test/e2e/manifests/test-location-hint-no-reservation.yaml @@ -3,7 +3,7 @@ kind: JobSet metadata: # Running this JobSet should result in Jobs having the nodeSelector `cloud.google.com/gke-location-hint=cell` injected # using the example admission controller webhook. 
- name: test-jobset-location-hint + name: test-jobset-location-hint-no-reservation annotations: alpha.jobset.sigs.k8s.io/exclusive-topology: cloud.google.com/gke-nodepool # 1:1 job replica to node pool assignment spec: diff --git a/tpu-provisioner/admission_controller/test/manual_e2e/test-location-hint-with-reservation.yaml b/tpu-provisioner/admission_controller/test/e2e/manifests/test-location-hint-with-reservation.yaml similarity index 96% rename from tpu-provisioner/admission_controller/test/manual_e2e/test-location-hint-with-reservation.yaml rename to tpu-provisioner/admission_controller/test/e2e/manifests/test-location-hint-with-reservation.yaml index c2dad2b85..87a91fa5a 100644 --- a/tpu-provisioner/admission_controller/test/manual_e2e/test-location-hint-with-reservation.yaml +++ b/tpu-provisioner/admission_controller/test/e2e/manifests/test-location-hint-with-reservation.yaml @@ -3,7 +3,7 @@ kind: JobSet metadata: # Running this JobSet should result in Jobs having the nodeSelector `cloud.google.com/gke-location-hint=cell` injected # using the example admission controller webhook. 
- name: test-jobset-location-hint + name: test-jobset-location-hint-with-reservation annotations: alpha.jobset.sigs.k8s.io/exclusive-topology: cloud.google.com/gke-nodepool # 1:1 job replica to node pool assignment spec: diff --git a/tpu-provisioner/admission_controller/test/e2e/manifests/test-nonjobset-job.yaml b/tpu-provisioner/admission_controller/test/e2e/manifests/test-nonjobset-job.yaml new file mode 100644 index 000000000..4627cb451 --- /dev/null +++ b/tpu-provisioner/admission_controller/test/e2e/manifests/test-nonjobset-job.yaml @@ -0,0 +1,13 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: test-nonjobset-job +spec: + template: + spec: + containers: + - name: sleeper + image: ubuntu + command: ["sleep", "10000"] + restartPolicy: Never + backoffLimit: 0 \ No newline at end of file diff --git a/tpu-provisioner/admission_controller/test/e2e/test.sh b/tpu-provisioner/admission_controller/test/e2e/test.sh new file mode 100755 index 000000000..0563b33c7 --- /dev/null +++ b/tpu-provisioner/admission_controller/test/e2e/test.sh @@ -0,0 +1,41 @@ +#!/bin/bash + +set -e +set -x +set -u +set -o pipefail + +this_dir=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) + +function clean_up { + echo "Cleaning up" + kubectl delete --ignore-not-found -f $this_dir/manifests/ +} +trap clean_up EXIT + +kubectl create -f $this_dir/manifests/ + +echo "Waiting for Jobs to be created..." +sleep 3 + +function job_has_selector_len { + key_val=$(kubectl get job $3 -ojsonpath="{.spec.template.spec.nodeSelector.$2}") + if [[ ${#key_val} == $1 ]]; then + echo "PASS: Job has node selector "$2" with correct length." + else + echo "FAIL: Job node selector "$2" has the wrong length!" 
+ exit 1 + fi +} + +job_has_selector_len 40 job-key test-jobset-location-hint-no-reservation-workers-0 +job_has_selector_len 40 job-key test-jobset-location-hint-with-reservation-workers-0 +job_has_selector_len 0 job-key test-disabled-provisioning-workers-0 +job_has_selector_len 0 job-key test-nonjobset-job + +job_has_selector_len 4 'cloud\.google\.com/gke-location-hint' test-jobset-location-hint-no-reservation-workers-0 +job_has_selector_len 4 'cloud\.google\.com/gke-location-hint' test-jobset-location-hint-with-reservation-workers-0 +job_has_selector_len 0 'cloud\.google\.com/gke-location-hint' test-disabled-provisioning-workers-0 +job_has_selector_len 0 'cloud\.google\.com/gke-location-hint' test-nonjobset-job + +echo "SUCCESS" \ No newline at end of file diff --git a/tutorials-and-examples/genAI-LLM/finetuning-gemma-2b-on-l4/finetune.yaml b/tutorials-and-examples/genAI-LLM/finetuning-gemma-2b-on-l4/finetune.yaml index 908fe4b09..98f71b92d 100644 --- a/tutorials-and-examples/genAI-LLM/finetuning-gemma-2b-on-l4/finetune.yaml +++ b/tutorials-and-examples/genAI-LLM/finetuning-gemma-2b-on-l4/finetune.yaml @@ -15,7 +15,7 @@ spec: terminationGracePeriodSeconds: 600 containers: - name: finetuner - image: + image: $IMAGE_URL resources: limits: nvidia.com/gpu: "8" @@ -23,7 +23,7 @@ spec: - name: MODEL_NAME value: "google/gemma-2b" - name: NEW_MODEL - value: "" + value: "gemma-2b-sql-finetuned" - name: LORA_R value: "8" - name: LORA_ALPHA diff --git a/tutorials-and-examples/gpu-examples/training-single-gpu/src/tensorflow-mnist-example/tensorflow_mnist_train_distributed.py b/tutorials-and-examples/gpu-examples/training-single-gpu/src/tensorflow-mnist-example/tensorflow_mnist_train_distributed.py index 65c758838..e9b77a656 100644 --- a/tutorials-and-examples/gpu-examples/training-single-gpu/src/tensorflow-mnist-example/tensorflow_mnist_train_distributed.py +++ 
b/tutorials-and-examples/gpu-examples/training-single-gpu/src/tensorflow-mnist-example/tensorflow_mnist_train_distributed.py @@ -60,7 +60,7 @@ def scale(image, label): # Define the checkpoint directory to store the checkpoints. checkpoint_dir = './training_checkpoints' # Define the name of the checkpoint files. -checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}") +checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}.weights.h5") def decay(epoch): if epoch < 3: @@ -74,7 +74,7 @@ def decay(epoch): class PrintLR(tf.keras.callbacks.Callback): def on_epoch_end(self, epoch, logs=None): print('\nLearning rate for epoch {} is {}'.format(epoch + 1, - model.optimizer.lr.numpy())) + model.optimizer.learning_rate.numpy())) callbacks = [ tf.keras.callbacks.TensorBoard(log_dir='./logs'), diff --git a/tutorials-and-examples/inference-servers/checkpoints/Dockerfile b/tutorials-and-examples/inference-servers/checkpoints/Dockerfile index 918f7a586..f1c6cd871 100644 --- a/tutorials-and-examples/inference-servers/checkpoints/Dockerfile +++ b/tutorials-and-examples/inference-servers/checkpoints/Dockerfile @@ -20,7 +20,9 @@ RUN curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key --keyri RUN echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list RUN apt -y update && apt install -y google-cloud-cli -RUN pip install kaggle +RUN pip install kaggle && \ +pip install huggingface_hub[cli] && \ +pip install google-jetstream COPY checkpoint_converter.sh /usr/bin/ RUN chmod +x /usr/bin/checkpoint_converter.sh diff --git a/tutorials-and-examples/inference-servers/checkpoints/README.md b/tutorials-and-examples/inference-servers/checkpoints/README.md index d5c79d3ce..8bbc0645a 100644 --- a/tutorials-and-examples/inference-servers/checkpoints/README.md +++ b/tutorials-and-examples/inference-servers/checkpoints/README.md @@ -11,20 +11,37 @@ docker 
push gcr.io/${PROJECT_ID}/inference-checkpoint:latest Now you can use it in a [Kubernetes job](../jetstream/maxtext/single-host-inference/checkpoint-job.yaml) and pass the following arguments -Jetstream + MaxText +## Jetstream + MaxText ``` -- -i=INFERENCE_SERVER +- -s=INFERENCE_SERVER - -b=BUCKET_NAME - -m=MODEL_PATH - -v=VERSION (Optional) ``` -Jetstream + Pytorch/XLA +## Jetstream + Pytorch/XLA ``` -- -i=INFERENCE_SERVER +- -s=INFERENCE_SERVER - -m=MODEL_PATH -- -q=QUANTIZE (Optional) -- -v=VERSION -- -1=EXTRA_PARAM_1 -- -2=EXTRA_PARAM_2 +- -n=MODEL_NAME +- -q=QUANTIZE_WEIGHTS (Optional) (default=False) +- -t=QUANTIZE_TYPE (Optional) (default=int8_per_channel) +- -v=VERSION (Optional) (default=jetstream-v0.2.3) +- -i=INPUT_DIRECTORY (Optional) +- -o=OUTPUT_DIRECTORY +- -h=HUGGINGFACE (Optional) (default=False) +``` + +## Argument descriptions: +``` +b) BUCKET_NAME: (str) GSBucket, without gs:// +s) INFERENCE_SERVER: (str) Inference server, ex. jetstream-maxtext, jetstream-pytorch +m) MODEL_PATH: (str) Model path, varies depending on inference server and location of base checkpoint +n) MODEL_NAME: (str) Model name, ex. llama-2, llama-3, gemma +h) HUGGINGFACE: (bool) Checkpoint is from HuggingFace. +q) QUANTIZE_WEIGHTS: (str) Whether to quantize weights +t) QUANTIZE_TYPE: (str) Quantization type, QUANTIZE_WEIGHTS must be set to true. Available quantize types: {"int8", "int4"} x {"per_channel", "blockwise"}, +v) VERSION: (str) Version of inference server to override, ex.
jetstream-v0.2.2, jetstream-v0.2.3 +i) INPUT_DIRECTORY: (str) Input checkpoint directory, likely a GSBucket path +o) OUTPUT_DIRECTORY: (str) Output checkpoint directory, likely a GSBucket path ``` \ No newline at end of file diff --git a/tutorials-and-examples/inference-servers/checkpoints/checkpoint_converter.sh b/tutorials-and-examples/inference-servers/checkpoints/checkpoint_converter.sh index c2c9a5f69..d52ae35ec 100644 --- a/tutorials-and-examples/inference-servers/checkpoints/checkpoint_converter.sh +++ b/tutorials-and-examples/inference-servers/checkpoints/checkpoint_converter.sh @@ -1,16 +1,17 @@ #!/bin/bash export KAGGLE_CONFIG_DIR="/kaggle" +export HUGGINGFACE_TOKEN_DIR="/huggingface" INFERENCE_SERVER="jetstream-maxtext" BUCKET_NAME="" MODEL_PATH="" print_usage() { - printf "Usage: $0 [ -b BUCKET_NAME ] [ -i INFERENCE_SERVER ] [ -m MODEL_PATH ] [ -q QUANTIZE ] [ -v VERSION ] [ -1 EXTRA_PARAM_1 ] [ -2 EXTRA_PARAM_2 ]" + printf "Usage: $0 [ -b BUCKET_NAME ] [ -s INFERENCE_SERVER ] [ -m MODEL_PATH ] [ -n MODEL_NAME ] [ -h HUGGINGFACE ] [ -q QUANTIZE_WEIGHTS ] [ -t QUANTIZE_TYPE ] [ -v VERSION ] [ -i INPUT_DIRECTORY ] [ -o OUTPUT_DIRECTORY ]" } print_inference_server_unknown() { - printf "Enter a valid inference server [ -i INFERENCE_SERVER ]" + printf "Enter a valid inference server [ -s INFERENCE_SERVER ]" printf "Valid options: jetstream-maxtext, jetstream-pytorch" } @@ -43,6 +44,31 @@ download_kaggle_checkpoint() { echo -e "\nCompleted copy of data to gs://${BUCKET_NAME}/base/${MODEL_NAME}_${VARIATION_NAME}" } +download_huggingface_checkpoint() { + MODEL_PATH=$1 + MODEL_NAME=$2 + + INPUT_CKPT_DIR_LOCAL=/base/ + mkdir /base/ + huggingface-cli login --token $(cat ${HUGGINGFACE_TOKEN_DIR}/HUGGINGFACE_TOKEN) + huggingface-cli download ${MODEL_PATH} --local-dir ${INPUT_CKPT_DIR_LOCAL} + + if [[ $MODEL_NAME == *"llama"* ]]; then + if [[ $MODEL_NAME == "llama-2" ]]; then + TOKENIZER_PATH=/base/tokenizer.model + if [[ $MODEL_PATH != *"hf"* ]]; then + 
HUGGINGFACE="False" + fi + else + TOKENIZER_PATH=/base/original/tokenizer.model + fi + elif [[ $MODEL_NAME == *"gemma"* ]]; then + TOKENIZER_PATH=/base/tokenizer.model + else + echo -e "Unclear of tokenizer.model for ${MODEL_NAME}. May have to manually upload." + fi +} + convert_maxtext_checkpoint() { BUCKET_NAME=$1 MODEL_NAME=$2 @@ -60,7 +86,7 @@ convert_maxtext_checkpoint() { cd maxtext git checkout ${MAXTEXT_VERSION} python3 -m pip install -r requirements.txt - echo -e "\Cloned MaxText repository and completed installing requirements" + echo -e "\nCloned MaxText repository and completed installing requirements" python3 MaxText/convert_gemma_chkpt.py --base_model_path gs://${BUCKET_NAME}/base/${MODEL_NAME}_${VARIATION_NAME}/${VARIATION_NAME} --maxtext_model_path gs://${BUCKET_NAME}/final/scanned/${MODEL_NAME}_${VARIATION_NAME} --model_size ${MODEL_SIZE} echo -e "\nCompleted conversion of checkpoint to gs://${BUCKET_NAME}/final/scanned/${MODEL_NAME}_${VARIATION_NAME}" @@ -73,59 +99,92 @@ convert_maxtext_checkpoint() { convert_pytorch_checkpoint() { MODEL_PATH=$1 - INPUT_CKPT_DIR=$2 - OUTPUT_CKPT_DIR=$3 - QUANTIZE=$4 - PYTORCH_VERSION=$5 - JETSTREAM_VERSION=v0.2.2 + MODEL_NAME=$2 + HUGGINGFACE=$3 + INPUT_CKPT_DIR=$4 + OUTPUT_CKPT_DIR=$5 + QUANTIZE_TYPE=$6 + QUANTIZE_WEIGHTS=$7 + PYTORCH_VERSION=$8 if [ -z $PYTORCH_VERSION ]; then - PYTORCH_VERSION=jetstream-v0.2.2 + PYTORCH_VERSION=jetstream-v0.2.3 fi CKPT_PATH="$(echo ${INPUT_CKPT_DIR} | awk -F'gs://' '{print $2}')" BUCKET_NAME="$(echo ${CKPT_PATH} | awk -F'/' '{print $1}')" TO_REPLACE=gs://${BUCKET_NAME} - INPUT_CKPT_DIR_LOCAL=${INPUT_CKPT_DIR/${TO_REPLACE}/${MODEL_PATH}} - OUTPUT_CKPT_DIR_LOCAL=/pt-ckpt/ - if [ -z $QUANTIZE ]; then - QUANTIZE="False" - fi + OUTPUT_CKPT_DIR_LOCAL=/pt-ckpt/ - git clone https://github.com/google/JetStream.git git clone https://github.com/google/jetstream-pytorch.git - cd JetStream - git checkout ${JETSTREAM_VERSION} - pip install -e # checkout stable Pytorch commit - cd 
../jetstream-pytorch + cd /jetstream-pytorch git checkout ${PYTORCH_VERSION} bash install_everything.sh - export PYTHONPATH=$PYTHONPATH:$(pwd)/deps/xla/experimental/torch_xla2:$(pwd)/JetStream:$(pwd) + echo -e "\nCloned JetStream PyTorch repository and completed installing requirements" echo -e "\nRunning conversion script to convert model weights. This can take a couple minutes..." - python3 -m convert_checkpoints --input_checkpoint_dir=${INPUT_CKPT_DIR_LOCAL} --output_checkpoint_dir=${OUTPUT_CKPT_DIR_LOCAL} --quantize=${QUANTIZE} + + if [ $HUGGINGFACE == "True" ]; then + echo "Checkpoint weights are from HuggingFace" + download_huggingface_checkpoint "$MODEL_PATH" "$MODEL_NAME" + else + HUGGINGFACE="False" + + # Example: + # the input checkpoint directory is gs://jetstream-checkpoints/llama-2-7b/base-checkpoint/ + # the local checkpoint directory will be /models/llama-2-7b/base-checkpoint/ + # INPUT_CKPT_DIR_LOCAL=${INPUT_CKPT_DIR/${TO_REPLACE}/${MODEL_PATH}} + INPUT_CKPT_DIR_LOCAL=${INPUT_CKPT_DIR/${TO_REPLACE}/${MODEL_PATH}} + TOKENIZER_PATH=${INPUT_CKPT_DIR_LOCAL}/tokenizer.model + fi + + if [ -z $QUANTIZE_WEIGHTS ]; then + QUANTIZE_WEIGHTS="False" + fi + + # Possible quantizations: + # 1. quantize_weights = False, we run without specifying quantize_type + # 2. quantize_weights = True, we run without specifying quantize_type to use the default int8_per_channel + # 3. 
quantize_weights = True, we run and specify quantize_type + # We can use the same command for case #1 and #2, since both have quantize_weights set without needing to specify quantize_type + + echo -e "\n quantize weights: ${QUANTIZE_WEIGHTS}" + if [ $QUANTIZE_WEIGHTS == "True" ]; then + # quantize_type is required, it will be set to the default value if not turned on + if [ -n $QUANTIZE_TYPE ]; then + python3 -m convert_checkpoints --model_name=${MODEL_NAME} --input_checkpoint_dir=${INPUT_CKPT_DIR_LOCAL} --output_checkpoint_dir=${OUTPUT_CKPT_DIR_LOCAL} --quantize_type=${QUANTIZE_TYPE} --quantize_weights=${QUANTIZE_WEIGHTS} --from_hf=${HUGGINGFACE} + fi + else + # quantize_weights should be false, but if not the convert_checkpoints script will catch it + python3 -m convert_checkpoints --model_name=${MODEL_NAME} --input_checkpoint_dir=${INPUT_CKPT_DIR_LOCAL} --output_checkpoint_dir=${OUTPUT_CKPT_DIR_LOCAL} --quantize_weights=${QUANTIZE_WEIGHTS} --from_hf=${HUGGINGFACE} + fi echo -e "\nCompleted conversion of checkpoint to ${OUTPUT_CKPT_DIR_LOCAL}" echo -e "\nUploading converted checkpoint from local path ${OUTPUT_CKPT_DIR_LOCAL} to GSBucket ${OUTPUT_CKPT_DIR}" + gcloud storage cp -r ${OUTPUT_CKPT_DIR_LOCAL}/* ${OUTPUT_CKPT_DIR} + gcloud storage cp ${TOKENIZER_PATH} ${OUTPUT_CKPT_DIR} echo -e "\nCompleted uploading converted checkpoint from local path ${OUTPUT_CKPT_DIR_LOCAL} to GSBucket ${OUTPUT_CKPT_DIR}" } -while getopts 'b:i:m:q:v:1:2:' flag; do +while getopts 'b:s:m:n:h:t:q:v:i:o:' flag; do case "${flag}" in b) BUCKET_NAME="$(echo ${OPTARG} | awk -F'=' '{print $2}')" ;; - i) INFERENCE_SERVER="$(echo ${OPTARG} | awk -F'=' '{print $2}')" ;; + s) INFERENCE_SERVER="$(echo ${OPTARG} | awk -F'=' '{print $2}')" ;; m) MODEL_PATH="$(echo ${OPTARG} | awk -F'=' '{print $2}')" ;; - q) QUANTIZE="$(echo ${OPTARG} | awk -F'=' '{print $2}')" ;; + n) MODEL_NAME="$(echo ${OPTARG} | awk -F'=' '{print $2}')" ;; + h) HUGGINGFACE="$(echo ${OPTARG} | awk -F'=' '{print $2}')" ;; + t) 
QUANTIZE_TYPE="$(echo ${OPTARG} | awk -F'=' '{print $2}')" ;; + q) QUANTIZE_WEIGHTS="$(echo ${OPTARG} | awk -F'=' '{print $2}')" ;; v) VERSION="$(echo ${OPTARG} | awk -F'=' '{print $2}')" ;; - 1) EXTRA_PARAM_1="$(echo ${OPTARG} | awk -F'=' '{print $2}')" ;; - 2) EXTRA_PARAM_2="$(echo ${OPTARG} | awk -F'=' '{print $2}')" ;; + i) INPUT_DIRECTORY="$(echo ${OPTARG} | awk -F'=' '{print $2}')" ;; + o) OUTPUT_DIRECTORY="$(echo ${OPTARG} | awk -F'=' '{print $2}')" ;; *) print_usage exit 1 ;; esac @@ -142,8 +201,8 @@ case ${INFERENCE_SERVER} in convert_maxtext_checkpoint "$BUCKET_NAME" "$MODEL_NAME" "$VARIATION_NAME" "$MODEL_SIZE" "$VERSION" ;; jetstream-pytorch) - check_model_path "$MODEL_PATH" - convert_pytorch_checkpoint "$MODEL_PATH" "$EXTRA_PARAM_1" "$EXTRA_PARAM_2" "$QUANTIZE" "$VERSION" + check_model_path "$MODEL_PATH" + convert_pytorch_checkpoint "$MODEL_PATH" "$MODEL_NAME" "$HUGGINGFACE" "$INPUT_DIRECTORY" "$OUTPUT_DIRECTORY" "$QUANTIZE_TYPE" "$QUANTIZE_WEIGHTS" "$VERSION" ;; *) print_inference_server_unknown exit 1 ;; diff --git a/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/README.md b/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/README.md index 5a2d6418b..c0d971282 100644 --- a/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/README.md +++ b/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/README.md @@ -125,76 +125,32 @@ Next, deploy a Maxengine server hosting the Gemma-7b model. You can use the prov ### Deploy via Kubectl -First navigate to the `./kubectl` directory. Add desired overrides to your yaml file by editing the `args` in `deployment.yaml`. You can reference the [MaxText base config file](https://github.com/google/maxtext/blob/main/MaxText/configs/base.yml) on what values can be overridden. 
+See the [Jetstream component README](../../../../../modules/jetstream-maxtext-deployment/README.md#installation-via-bash-and-kubectl) for start to finish instructions on how to deploy jetstream to your cluster, ensure the value of the PARAMETERS_PATH is the path where the checkpoint-converter job uploaded the converted checkpoints to, in this case it should be `gs://$BUCKET_NAME/final/unscanned/gemma_7b-it/0/checkpoints/0/items` where $BUCKET_NAME is the same as above. -In the manifest, ensure the value of the BUCKET_NAME is the name of the Cloud Storage bucket that was used when converting your checkpoint. - -Argument descriptions: -``` -tokenizer_path: The file path to your model’s tokenizer -load_parameters_path: Your checkpoint path (GSBucket) -per_device_batch_size: Decoding batch size per device (1 TPU chip = 1 device) -max_prefill_predict_length: Maximum length for the prefill when doing autoregression -max_target_length: Maximum sequence length -model_name: Model name -ici_fsdp_parallelism: The number of shards for FSDP parallelism -ici_autoregressive_parallelism: The number of shards for autoregressive parallelism -ici_tensor_parallelism: The number of shards for tensor parallelism -weight_dtype: Weight data type (e.g. bfloat16) -scan_layers: Scan layers boolean flag -``` - -Deploy the manifest file for the Maxengine server and HTTP server: -``` -kubectl apply -f deployment.yaml -``` + This README also includes [instructions for setting up autoscaling](../../../../../modules/jetstream-maxtext-deployment/README.md#optional-autoscaling-components). Follow those instructions to install the required components for autoscaling and configure your HPAs appropriately. ### Deploy via Terraform -Navigate to the `./terraform` directory and do the standard [`terraform init`](https://developer.hashicorp.com/terraform/cli/commands/init).
The deployment requires some inputs, an example `sample-terraform.tfvars` is provided as a starting point, run `cp sample-terraform.tfvars terraform.tfvars` and modify the resulting `terraform.tfvars` as needed. Finally run `terraform apply` to apply these resources to your cluster. - -#### (optional) Enable Horizontal Pod Autoscaling via Terraform - -Applying the following resources to your cluster will enable autoscaling with customer metrics: - - PodMonitoring: For scraping metrics and exporting them to Google Cloud Monitoring - - Custom Metrics Stackdriver Adapter (CMSA): For enabling your HPA objects to read metrics from the Google Cloud Monitoring API. - - [Horizontal Pod Autoscaler (HPA)](https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/): For reading metrics and setting the maxengine-servers deployments replica count accordingly. +Navigate to the `./terraform` directory and run [`terraform init`](https://developer.hashicorp.com/terraform/cli/commands/init). The deployment requires some inputs, an example `sample-terraform.tfvars` is provided as a starting point, run `cp sample-terraform.tfvars terraform.tfvars` and modify the resulting `terraform.tfvars` as needed. Since we're using gemma-7b the `maxengine_deployment_settings.parameters_path` terraform variable should be set to the following: `gs://BUCKET_NAME/final/unscanned/gemma_7b-it/0/checkpoints/0/items`. Finally run `terraform apply` to apply these resources to your cluster. -These components require a few more inputs and rerunning the [prior step](#deploy-via-terraform) with these set will deploy the components. The following input conditions should be satisfied: `custom_metrics_enabled` should be `true` and `metrics_port`, `hpa_type`, `hpa_averagevalue_target`, `hpa_min_replicas`, `hpa_max_replicas` should all be set. 
+For deploying autoscaling components via terraform, a few more variables need to be set; doing so and rerunning the [prior step](#deploy-via-terraform) with these set will deploy the components. The following variables should be set: - Note that only one HPA resource will be created. For those who want to scale based on multiple metrics, we recommend using the following template to apply more HPA resources: - -``` -apiVersion: autoscaling/v2 -kind: HorizontalPodAutoscaler -metadata: - name: jetstream-hpa -spec: - scaleTargetRef: - apiVersion: apps/v1 - kind: Deployment - name: maxengine-server - minReplicas: - maxReplicas: - metrics: - - type: Pods - pods: - metric: - name: prometheus.googleapis.com||gauge - target: - type: AverageValue - averageValue: ``` +maxengine_deployment_settings = { + custom_metrics_enabled = true + metrics_port = + metrics_scrape_interval +} -If you would like to probe the metrics manually, `cURL` your maxengine-server container on whatever metrics port you set and you should see something similar to the following: - -``` -# HELP jetstream_prefill_backlog_size Size of prefill queue -# TYPE jetstream_prefill_backlog_size gauge -jetstream_prefill_backlog_size{id="SOME-HOSTNAME-HERE>"} 0.0 -# HELP jetstream_slots_used_percentage The percentage of decode slots currently being used -# TYPE jetstream_slots_used_percentage gauge -jetstream_slots_used_percentage{id="",idx="0"} 0.04166666666666663 +hpa_config = { + metrics_adapter = + max_replicas + min_replicas + rules = [{ + target_query = + average_value_target + }] +} ``` ### Verify the deployment @@ -278,4 +234,17 @@ kubectl apply -f kubectl/deployment.yaml kubectl port-forward svc/jetstream-svc 9000:9000 ``` -To run benchmarking, pass in the flag `--server 127.0.0.1` when running the benchmarking script. \ No newline at end of file +To run benchmarking, pass in the flag `--server 127.0.0.1` when running the benchmarking script.
+ +### Observe custom metrics + +This step assumes you specified a metrics port to your jetstream deployment via `prometheus_port`. If you would like to probe the metrics manually, `cURL` your maxengine-server container on the metrics port you set and you should see something similar to the following: + +``` +# HELP jetstream_prefill_backlog_size Size of prefill queue +# TYPE jetstream_prefill_backlog_size gauge +jetstream_prefill_backlog_size{id="SOME-HOSTNAME-HERE>"} 0.0 +# HELP jetstream_slots_used_percentage The percentage of decode slots currently being used +# TYPE jetstream_slots_used_percentage gauge +jetstream_slots_used_percentage{id="",idx="0"} 0.04166666666666663 +``` \ No newline at end of file diff --git a/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/custom-metrics-stackdriver-adapter/README.md b/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/custom-metrics-stackdriver-adapter/README.md deleted file mode 100644 index 855e55be4..000000000 --- a/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/custom-metrics-stackdriver-adapter/README.md +++ /dev/null @@ -1,26 +0,0 @@ -# Custom Metrics Stackdriver Adapter - -Adapted from https://raw.githubusercontent.com/GoogleCloudPlatform/k8s-stackdriver/master/custom-metrics-stackdriver-adapter/deploy/production/adapter_new_resource_model.yaml - -## Usage - -To use this module, include it from your main terraform config, i.e.: - -``` -module "custom_metrics_stackdriver_adapter" { - source = "./path/to/custom-metrics-stackdriver-adapter" -} -``` - -For a workload identity enabled cluster, some additional configuration is -needed: - -``` -module "custom_metrics_stackdriver_adapter" { - source = "./path/to/custom-metrics-stackdriver-adapter" - workload_identity = { - enabled = true - project_id = "" - } -} -``` \ No newline at end of file diff --git 
a/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/custom-metrics-stackdriver-adapter/main.tf b/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/custom-metrics-stackdriver-adapter/main.tf deleted file mode 100644 index 3ecb5f674..000000000 --- a/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/custom-metrics-stackdriver-adapter/main.tf +++ /dev/null @@ -1,291 +0,0 @@ -resource "kubernetes_namespace_v1" "custom-metrics" { - metadata { - name = "custom-metrics" - } -} - -resource "kubernetes_service_account_v1" "custom-metrics-stackdriver-adapter-no-wi" { - count = var.workload_identity.enabled ? 0 : 1 - metadata { - name = "custom-metrics-stackdriver-adapter" - namespace = kubernetes_namespace_v1.custom-metrics.metadata[0].name - } -} - -resource "kubernetes_service_account_v1" "custom-metrics-stackdriver-adapter-wi" { - count = var.workload_identity.enabled ? 1 : 0 - metadata { - name = "custom-metrics-stackdriver-adapter" - namespace = kubernetes_namespace_v1.custom-metrics.metadata[0].name - annotations = { - "iam.gke.io/gcp-service-account" = google_service_account.cmsa-sa[0].email - } - } -} - -resource "kubernetes_cluster_role_binding_v1" "custom-metrics-system-auth-delegator" { - metadata { - name = "custom-metrics:system:auth-delegator" - } - role_ref { - api_group = "rbac.authorization.k8s.io" - kind = "ClusterRole" - name = "system:auth-delegator" - } - subject { - kind = "ServiceAccount" - name = (var.workload_identity.enabled - ? 
kubernetes_service_account_v1.custom-metrics-stackdriver-adapter-wi[0].metadata[0].name - : kubernetes_service_account_v1.custom-metrics-stackdriver-adapter-no-wi[0].metadata[0].name - ) - namespace = kubernetes_namespace_v1.custom-metrics.metadata[0].name - } -} - -resource "kubernetes_role_binding_v1" "custom-metrics-auth-reader" { - metadata { - name = "custom-metrics-auth-reader" - namespace = "kube-system" - } - role_ref { - api_group = "rbac.authorization.k8s.io" - kind = "Role" - name = "extension-apiserver-authentication-reader" - } - subject { - kind = "ServiceAccount" - name = (var.workload_identity.enabled - ? kubernetes_service_account_v1.custom-metrics-stackdriver-adapter-wi[0].metadata[0].name - : kubernetes_service_account_v1.custom-metrics-stackdriver-adapter-no-wi[0].metadata[0].name - ) - namespace = kubernetes_namespace_v1.custom-metrics.metadata[0].name - } -} - -resource "kubernetes_cluster_role_v1" "custom-metrics-resource-reader" { - metadata { - name = "custom-metrics-resource-reader" - } - rule { - api_groups = [""] - resources = ["pods", "nodes", "nodes/stats"] - verbs = ["get", "list", "watch"] - } -} - -resource "kubernetes_cluster_role_binding_v1" "custom-metrics-resource-reader" { - metadata { - name = "custom-metrics-resource-reader" - } - role_ref { - api_group = "rbac.authorization.k8s.io" - kind = "ClusterRole" - name = kubernetes_cluster_role_v1.custom-metrics-resource-reader.metadata[0].name - } - subject { - kind = "ServiceAccount" - name = (var.workload_identity.enabled - ? 
kubernetes_service_account_v1.custom-metrics-stackdriver-adapter-wi[0].metadata[0].name - : kubernetes_service_account_v1.custom-metrics-stackdriver-adapter-no-wi[0].metadata[0].name - ) - namespace = kubernetes_namespace_v1.custom-metrics.metadata[0].name - } -} - -resource "kubernetes_deployment_v1" "custom-metrics-stackdriver-adapter" { - metadata { - name = "custom-metrics-stackdriver-adapter" - namespace = kubernetes_namespace_v1.custom-metrics.metadata[0].name - labels = { - run = "custom-metrics-stackdriver-adapter" - k8s-app = "custom-metrics-stackdriver-adapter" - } - } - spec { - replicas = 1 - - selector { - match_labels = { - run = "custom-metrics-stackdriver-adapter" - k8s-app = "custom-metrics-stackdriver-adapter" - } - } - - template { - metadata { - labels = { - run = "custom-metrics-stackdriver-adapter" - k8s-app = "custom-metrics-stackdriver-adapter" - "kubernetes.io/cluster-service" = "true" - } - } - - spec { - service_account_name = (var.workload_identity.enabled - ? 
kubernetes_service_account_v1.custom-metrics-stackdriver-adapter-wi[0].metadata[0].name - : kubernetes_service_account_v1.custom-metrics-stackdriver-adapter-no-wi[0].metadata[0].name - ) - - container { - image = "gcr.io/gke-release/custom-metrics-stackdriver-adapter:v0.14.2-gke.0" - image_pull_policy = "Always" - name = "pod-custom-metrics-stackdriver-adapter" - command = ["/adapter", "--use-new-resource-model=true", "--fallback-for-container-metrics=true"] - resources { - limits = { - cpu = "250m" - memory = "200Mi" - } - requests = { - cpu = "250m" - memory = "200Mi" - } - } - } - } - } - } -} - -resource "kubernetes_service_v1" "custom-metrics-stackdriver-adapter" { - metadata { - name = "custom-metrics-stackdriver-adapter" - namespace = kubernetes_namespace_v1.custom-metrics.metadata[0].name - labels = { - run = "custom-metrics-stackdriver-adapter" - k8s-app = "custom-metrics-stackdriver-adapter" - "kubernetes.io/cluster-service" = "true" - "kubernetes.io/name" = "Adapter" - } - } - spec { - selector = { - run = "custom-metrics-stackdriver-adapter" - k8s-app = "custom-metrics-stackdriver-adapter" - } - port { - port = 443 - protocol = "TCP" - target_port = 443 - } - type = "ClusterIP" - } -} - -resource "kubernetes_api_service_v1" "v1beta1-custom-metrics-k8s-io" { - metadata { - name = "v1beta1.custom.metrics.k8s.io" - } - spec { - insecure_skip_tls_verify = true - group = "custom.metrics.k8s.io" - group_priority_minimum = 100 - version_priority = 100 - service { - name = kubernetes_service_v1.custom-metrics-stackdriver-adapter.metadata[0].name - namespace = kubernetes_namespace_v1.custom-metrics.metadata[0].name - } - version = "v1beta1" - } -} - -resource "kubernetes_api_service_v1" "v1beta2-custom-metrics-k8s-io" { - metadata { - name = "v1beta2.custom.metrics.k8s.io" - } - spec { - insecure_skip_tls_verify = true - group = "custom.metrics.k8s.io" - group_priority_minimum = 100 - version_priority = 200 - service { - name = 
kubernetes_service_v1.custom-metrics-stackdriver-adapter.metadata[0].name - namespace = kubernetes_namespace_v1.custom-metrics.metadata[0].name - } - version = "v1beta2" - } -} - -resource "kubernetes_api_service_v1" "v1beta1-external-metrics-k8s-io" { - metadata { - name = "v1beta1.external.metrics.k8s.io" - } - spec { - insecure_skip_tls_verify = true - group = "external.metrics.k8s.io" - group_priority_minimum = 100 - version_priority = 100 - service { - name = kubernetes_service_v1.custom-metrics-stackdriver-adapter.metadata[0].name - namespace = kubernetes_namespace_v1.custom-metrics.metadata[0].name - } - version = "v1beta1" - } -} - -resource "kubernetes_cluster_role_binding_v1" "external-metrics-reader" { - metadata { - name = "external-metrics-reader" - } - role_ref { - api_group = "rbac.authorization.k8s.io" - kind = "ClusterRole" - name = "external-metrics-reader" - } - subject { - kind = "ServiceAccount" - name = "horizontal-pod-autoscaler" - namespace = "kube-system" - } -} - - -# If workload identity is enabled, extra steps are required. We need to: -# - create a service account -# - grant it the monitoring.viewer IAM role -# - bind it to the workload identity user for the cmsa -# - annotate the cmsa service account (done above) - -resource "google_service_account" "cmsa-sa" { - count = var.workload_identity.enabled ? 1 : 0 - account_id = "cmsa-sa" - project = var.workload_identity.project_id -} - -# Equivalent to: -# gcloud projects add-iam-policy-binding PROJECT_ID \ -# --member=serviceAccount:cmsa-sa@PROJECT_ID.iam.gserviceaccount.com \ -# --role=roles/monitoring.viewer -resource "google_project_iam_binding" "cmsa-project-binding-monitoring-viewer" { - count = var.workload_identity.enabled ? 
1 : 0 - project = var.workload_identity.project_id - role = "roles/monitoring.viewer" - members = [ - "serviceAccount:${google_service_account.cmsa-sa[0].account_id}@${var.workload_identity.project_id}.iam.gserviceaccount.com" - ] -} - -# Equivalent to: -# gcloud projects add-iam-policy-binding PROJECT_ID \ -# --member=serviceAccount:cmsa-sa@PROJECT_ID.iam.gserviceaccount.com \ -# --role=roles/iam.serviceAccountTokenCreator -resource "google_project_iam_binding" "cmsa-project-binding-sa-token-creator" { - count = var.workload_identity.enabled ? 1 : 0 - project = var.workload_identity.project_id - role = "roles/iam.serviceAccountTokenCreator" - members = [ - "serviceAccount:${google_service_account.cmsa-sa[0].account_id}@${var.workload_identity.project_id}.iam.gserviceaccount.com" - ] -} - -# Equivalent to: -# gcloud iam service-accounts add-iam-policy-binding \ -# --role roles/iam.workloadIdentityUser \ -# --member "serviceAccount:PROJECT_ID.svc.id.goog[custom-metrics/custom-metrics-stackdriver-adapter]" \ -# cmsa-sa@PROJECT_ID.iam.gserviceaccount.com -resource "google_service_account_iam_member" "cmsa-bind-to-gsa" { - count = var.workload_identity.enabled ? 1 : 0 - service_account_id = google_service_account.cmsa-sa[0].name - role = "roles/iam.workloadIdentityUser" - member = "serviceAccount:${var.workload_identity.project_id}.svc.id.goog[custom-metrics/custom-metrics-stackdriver-adapter]" -} diff --git a/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/main.tf b/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/main.tf index d7c267423..0c0d8ac56 100644 --- a/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/main.tf +++ b/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/main.tf @@ -14,45 +14,11 @@ * limitations under the License. 
*/ -locals { - hpa_cpu_template = "${path.module}/hpa-templates/hpa.cpu.yaml.tftpl" - hpa_jetstream_template = "${path.module}/hpa-templates/hpa.jetstream.yaml.tftpl" - jetstream_podmonitoring = "${path.module}/monitoring-templates/jetstream-podmonitoring.yaml.tftpl" -} - -module "custom_metrics_stackdriver_adapter" { - count = var.custom_metrics_enabled ? 1 : 0 - source = "./custom-metrics-stackdriver-adapter" - workload_identity = { - enabled = true - project_id = var.project_id - } -} - module "maxengine" { - count = 1 - source = "./maxengine" - bucket_name = var.bucket_name - metrics_port = var.metrics_port - maxengine_server_image = var.maxengine_server_image - jetstream_http_server_image = var.jetstream_http_server_image -} - -resource "kubernetes_manifest" "tgi-pod-monitoring" { - count = var.custom_metrics_enabled && var.metrics_port != null ? 1 : 0 - manifest = yamldecode(templatefile(local.jetstream_podmonitoring, { - namespace = var.namespace - metrics_port = try(var.metrics_port, -1) - })) -} - -resource "kubernetes_manifest" "hpa_custom_metric" { - count = (var.custom_metrics_enabled && var.hpa_type != null || var.hpa_type != "memory_used") && var.hpa_averagevalue_target != null ? 
1 : 0 - manifest = yamldecode(templatefile(local.hpa_jetstream_template, { - namespace = var.namespace - hpa_type = try(var.hpa_type, "") - hpa_averagevalue_target = try(var.hpa_averagevalue_target, 1) - hpa_min_replicas = var.hpa_min_replicas - hpa_max_replicas = var.hpa_max_replicas - })) -} + count = 1 + source = "../../../../../../modules/jetstream-maxtext-deployment" + cluster_name = var.cluster_name + project_id = var.project_id + maxengine_deployment_settings = var.maxengine_deployment_settings + hpa_config = var.hpa_config +} \ No newline at end of file diff --git a/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/maxengine/main.tf b/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/maxengine/main.tf deleted file mode 100644 index 3b92ab790..000000000 --- a/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/maxengine/main.tf +++ /dev/null @@ -1,108 +0,0 @@ -/** - * Copyright 2024 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -resource "kubernetes_deployment" "deployment_maxengine_server" { - metadata { - name = "maxengine-server" - } - spec { - replicas = 2 - selector { - match_labels = { - app = "maxengine-server" - } - } - template { - metadata { - labels = { - app = "maxengine-server" - } - } - spec { - container { - args = [ - "model_name=gemma-7b", - "tokenizer_path=assets/tokenizer.gemma", - "per_device_batch_size=4", - "max_prefill_predict_length=1024", - "max_target_length=2048", - "async_checkpointing=false", - "ici_fsdp_parallelism=1", - "ici_autoregressive_parallelism=-1", - "ici_tensor_parallelism=1", - "scan_layers=false", - "weight_dtype=bfloat16", - format("load_parameters_path=gs://%s/final/unscanned/gemma_7b-it/0/checkpoints/0/items", var.bucket_name), - "attention=dot_product", - var.metrics_port != null ? format("prometheus_port=%d", var.metrics_port) : "", - ] - image = var.maxengine_server_image - image_pull_policy = "Always" - name = "maxengine-server" - port { - container_port = 9000 - } - resources { - limits = { - "google.com/tpu" = 8 - } - requests = { - "google.com/tpu" = 8 - } - } - security_context { - privileged = true - } - } - container { - image = var.jetstream_http_server_image - image_pull_policy = "Always" - name = "jetstream-http" - port { - container_port = 8000 - } - } - node_selector = { - "cloud.google.com/gke-tpu-accelerator" = "tpu-v5-lite-podslice" - "cloud.google.com/gke-tpu-topology" = "2x4" - } - } - } - } -} - -resource "kubernetes_service" "service_jetstream_svc" { - metadata { - name = "jetstream-svc" - } - spec { - port { - name = "jetstream-http" - port = 8000 - protocol = "TCP" - target_port = 8000 - } - port { - name = "jetstream-grpc" - port = 9000 - protocol = "TCP" - target_port = 9000 - } - selector = { - app = "maxengine-server" - } - } -} \ No newline at end of file diff --git a/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/maxengine/variables.tf 
b/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/maxengine/variables.tf deleted file mode 100644 index 00096c088..000000000 --- a/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/maxengine/variables.tf +++ /dev/null @@ -1,43 +0,0 @@ -/** - * Copyright 2024 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -variable "bucket_name" { - description = "Name of Google Cloud Storage bucket hosting unscanned checkpoints" - type = string - nullable = false -} - -variable "metrics_port" { - description = "Port to emit metrics from" - type = number - default = 9100 - nullable = true -} - -variable "maxengine_server_image" { - description = "maxengine-server container image" - type = string - default = "us-docker.pkg.dev/cloud-tpu-images/inference/maxengine-server:v0.2.2" - nullable = false -} - -variable "jetstream_http_server_image" { - description = "jetstream-http container image" - type = string - default = "us-docker.pkg.dev/cloud-tpu-images/inference/jetstream-http:v0.2.2" - nullable = false -} - diff --git a/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/monitoring-templates/jetstream-podmonitoring.yaml.tftpl b/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/monitoring-templates/jetstream-podmonitoring.yaml.tftpl deleted file mode 100644 index 581d7d3b6..000000000 --- 
a/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/monitoring-templates/jetstream-podmonitoring.yaml.tftpl +++ /dev/null @@ -1,12 +0,0 @@ -apiVersion: monitoring.googleapis.com/v1 -kind: PodMonitoring -metadata: - name: "jetstream-podmonitoring" - namespace: ${namespace} -spec: - endpoints: - - port: ${metrics_port} - interval: 1s - path: / - targetLabels: - metadata: ['pod', 'container', 'node'] diff --git a/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/providers.tf b/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/providers.tf index 70c82e817..027152d73 100644 --- a/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/providers.tf +++ b/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/providers.tf @@ -34,3 +34,36 @@ provider "kubernetes" { ) token = try(data.google_client_config.identity.0.access_token, null) } + +provider "kubectl" { + host = ( + var.credentials_config.fleet_host == null + ? null + : var.credentials_config.fleet_host + ) + config_path = ( + var.credentials_config.kubeconfig == null + ? null + : pathexpand(var.credentials_config.kubeconfig.path) + ) + token = try(data.google_client_config.identity.0.access_token, null) +} + +provider "helm" { + kubernetes { + config_path = ( + var.credentials_config.kubeconfig == null + ? null + : pathexpand(var.credentials_config.kubeconfig.path) + ) + config_context = try( + var.credentials_config.kubeconfig.context, null + ) + host = ( + var.credentials_config.fleet_host == null + ? 
null + : var.credentials_config.fleet_host + ) + token = try(data.google_client_config.identity.0.access_token, null) + } +} \ No newline at end of file diff --git a/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/sample-terraform.tfvars b/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/sample-terraform.tfvars index 4114b6f5e..79f30c378 100644 --- a/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/sample-terraform.tfvars +++ b/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/sample-terraform.tfvars @@ -1,21 +1,26 @@ -# How to (horizontally) scale the workload. Allowed values are: -# - null (no scaling), -# - Workload resources: -# - "cpu" (scale on cpu utilization). -# - Workload metrics (i.e. custom metrics): -# - "jetstream_prefill_backlog_size" -# - "jetstream_slots_used_percentage" -# - Other possibilities coming soon... -# -# See `autoscaling.md` for more details and recommendations. -custom_metrics_enabled = true -metrics_port = 9100 +maxengine_deployment_settings = { + maxengine_server_image = "us-docker.pkg.dev/cloud-tpu-images/inference/maxengine-server:v0.2.2" + jetstream_http_server_image = "us-docker.pkg.dev/cloud-tpu-images/inference/jetstream-http:v0.2.2" + + custom_metrics_enabled = true + metrics_port = 9100 + metrics_scrape_interval = 10 + accelerator_selectors = { + topology = "2x4" + accelerator = "tpu-v5-lite-podslice" + chip_count : 8 + } +} # Demonstrating autoscaling with jetstream_prefill_backlog_size, change as desired. # For jetstream_prefill_backlog_size. (experiment with this to determine optimal values). 
-hpa_type = "jetstream_prefill_backlog_size" -hpa_averagevalue_target = 10 -# Adjust these if you want different min/max values -hpa_min_replicas = 1 -hpa_max_replicas = 2 +# hpa_config = { +# metrics_adapter = "prometheus-adapter" +# max_replicas = 5 +# min_replicas = 1 +# rules = [{ +# target_query = "jetstream_prefill_backlog_size" +# average_value_target = 5 +# }] +# } \ No newline at end of file diff --git a/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/variables.tf b/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/variables.tf index fff306d01..696482180 100644 --- a/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/variables.tf +++ b/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/variables.tf @@ -39,89 +39,70 @@ variable "credentials_config" { } } -variable "namespace" { - description = "Namespace used for Jetstream resources." +variable "project_id" { + description = "Project id of existing or created project." 
type = string nullable = false - default = "default" } -variable "maxengine_server_image" { - description = "maxengine-server container image" - type = string - default = "us-docker.pkg.dev/cloud-tpu-images/inference/maxengine-server:v0.2.2" - nullable = false +variable "cluster_name" { + type = string } -variable "jetstream_http_server_image" { - description = "jetstream-http container image" - type = string - default = "us-docker.pkg.dev/cloud-tpu-images/inference/jetstream-http:v0.2.2" - nullable = false -} +variable "maxengine_deployment_settings" { + type = object({ + maxengine_server_image = string + jetstream_http_server_image = string + model_name = string // Name of your LLM (for example: "gemma-7b") + parameters_path = string // Path to the parameters for your model + metrics_port = optional(number) // Emit Jetstream metrics on this port of each contaienr + custom_metrics_enabled = bool // Whether or not custom metrics are also emitted + metrics_scrape_interval = optional(number) // Interval for scraping metrics (default: 10s) -variable "bucket_name" { - description = "Name of Google Cloud Storage bucket hosting unscanned checkpoints" - type = string - nullable = false + accelerator_selectors = object({ + topology = string + accelerator = string + chip_count = number + }) + }) } -variable "templates_path" { - description = "Path where manifest templates will be read from. Set to null to use the default manifests" - type = string - default = null -} +variable "hpa_config" { + type = object({ + metrics_adapter = string + min_replicas = number + max_replicas = number + rules = list(object({ + target_query = string + average_value_target = number + })) + }) + default = null -variable "hpa_type" { - description = "How the Jetstream workload should be scaled." - type = string - default = null - nullable = true validation { - condition = var.hpa_type == null ? 
true : length(regexall("jetstream_.*", var.hpa_type)) > 0 || length(regexall("memory_used", var.hpa_type)) > 0 - error_message = "Allows values for hpa_type are {null, memory_used, jetstream metrics (e.g., \"jetstream_prefill_backlog_size\", \"jetstream_slots_used_percentage\")}" + condition = alltrue([ + for hpa_config in var.hpa_config.rules : + hpa_config.target_query != null && hpa_config.average_value_target != null && length(regexall("jetstream_.*", hpa_config.target_query)) > 0 || length(regexall("memory_used", hpa_config.target_query)) > 0 || length(regexall("memory_used_percentage", hpa_config.target_query)) > 0 + ]) + error_message = "Allows values for hpa_type are {null, memory_used, predefined promql queries (i.e. memory_used_percentage, or jetstream metrics (e.g., \"jetstream_prefill_backlog_size\", \"jetstream_slots_used_percentage\")}" + } + validation { + condition = var.hpa_config.metrics_adapter == "custom-metrics-stackdriver-adapter" && alltrue([ + for hpa_config in var.hpa_config.rules : + hpa_config.target_query != null && hpa_config.average_value_target != null && length(regexall("jetstream_.*", hpa_config.target_query)) > 0 || length(regexall("memory_used", hpa_config.target_query)) > 0 + ]) || var.hpa_config.metrics_adapter != "custom-metrics-stackdriver-adapter" + error_message = "Allowed values for target_query when using the custom-metrics-stackdriver are \"memory_used\", or jetstream metrics (i.e. 
\"jetstream_prefill_backlog_size\", \"jetstream_slots_used_percentage\", etc)" + } + validation { + condition = var.hpa_config.metrics_adapter == "prometheus-adapter" && alltrue([ + for hpa_config in var.hpa_config.rules : + hpa_config.target_query != null && hpa_config.average_value_target != null && length(regexall("jetstream_.*", hpa_config.target_query)) > 0 || length(regexall("memory_used_percentage", hpa_config.target_query)) > 0 + ]) || var.hpa_config.metrics_adapter != "prometheus-adapter" + error_message = "Allowed values for target_query when using the prometheus adapter include predefined promql queries (i.e. \"memory_used_percentage\") and jetstream metrics (i.e. \"jetstream_prefill_backlog_size\", \"jetstream_slots_used_percentage\", etc)" + } + validation { + condition = contains(["", "custom-metrics-stackdriver-adapter", "prometheus-adapter"], var.hpa_config.metrics_adapter) + error_message = "Allowed values for metrics_adapter are \"custom-metrics-stackdriver-adapter\", or \"prometheus-adapter\"." } -} - -variable "hpa_min_replicas" { - description = "Minimum number of HPA replicas." - type = number - default = 1 - nullable = false -} - -variable "hpa_max_replicas" { - description = "Maximum number of HPA replicas." - type = number - default = 5 - nullable = false -} - -# TODO: combine hpa variables into a single object (so that they can be -# validated together) -variable "hpa_averagevalue_target" { - description = "AverageValue target for the `hpa_type` metric. Must be set if `hpa_type` is not null." - type = number - default = null - nullable = true -} - -variable "project_id" { - description = "Project id of existing or created project." 
- type = string - nullable = false -} - -variable "custom_metrics_enabled" { - description = "Enable custom metrics collection" - type = bool - default = false - nullable = false -} - -variable "metrics_port" { - description = "Port to scrape metrics from" - type = number - nullable = true } \ No newline at end of file diff --git a/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/versions.tf b/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/versions.tf new file mode 100644 index 000000000..66d74c47a --- /dev/null +++ b/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/versions.tf @@ -0,0 +1,30 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +terraform { + required_providers { + google = { + source = "hashicorp/google" + } + kubernetes = { + source = "hashicorp/kubernetes" + } + kubectl = { + source = "hashicorp/kubectl" + } + helm = { + source = "hashicorp/helm" + } + } +} \ No newline at end of file diff --git a/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/versions_override.tf b/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/versions_override.tf new file mode 100644 index 000000000..a623e3a91 --- /dev/null +++ b/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/versions_override.tf @@ -0,0 +1,21 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +terraform { + required_providers { + kubectl = { + source = "gavinbunney/kubectl" + } + } +} diff --git a/tutorials-and-examples/inference-servers/jetstream/pytorch/jetstream-pytorch-server/Dockerfile b/tutorials-and-examples/inference-servers/jetstream/pytorch/jetstream-pytorch-server/Dockerfile index 81fcdffc9..a4bc13a58 100644 --- a/tutorials-and-examples/inference-servers/jetstream/pytorch/jetstream-pytorch-server/Dockerfile +++ b/tutorials-and-examples/inference-servers/jetstream/pytorch/jetstream-pytorch-server/Dockerfile @@ -4,7 +4,7 @@ FROM ubuntu:22.04 ENV DEBIAN_FRONTEND=noninteractive -ENV PYTORCH_JETSTREAM_VERSION=jetstream-v0.2.2 +ENV PYTORCH_JETSTREAM_VERSION=jetstream-v0.2.3 RUN apt -y update && apt install -y --no-install-recommends \ ca-certificates \ @@ -20,8 +20,6 @@ cd /jetstream-pytorch && \ git checkout ${PYTORCH_JETSTREAM_VERSION} && \ bash install_everything.sh -ENV PYTHONPATH=$PYTHONPATH:$(pwd)/deps/xla/experimental/torch_xla2:$(pwd)/JetStream:$(pwd) - COPY jetstream_pytorch_server_entrypoint.sh /usr/bin/ RUN chmod +x /usr/bin/jetstream_pytorch_server_entrypoint.sh diff --git a/tutorials-and-examples/inference-servers/jetstream/pytorch/single-host-inference/README.md b/tutorials-and-examples/inference-servers/jetstream/pytorch/single-host-inference/README.md index 04b2fe5c5..910b6c944 100644 --- a/tutorials-and-examples/inference-servers/jetstream/pytorch/single-host-inference/README.md +++ b/tutorials-and-examples/inference-servers/jetstream/pytorch/single-host-inference/README.md @@ -66,7 +66,7 @@ $ kubectl annotate serviceaccount default \ iam.gke.io/gcp-service-account=jetstream-iam-sa@${PROJECT_ID}.iam.gserviceaccount.com ``` -### Create a Cloud Storage bucket to store the Llama2-7b model checkpoint +### Create a Cloud Storage bucket to store your model checkpoint ``` BUCKET_NAME= @@ -74,12 +74,71 @@ gcloud storage buckets create $BUCKET_NAME ``` ## Checkpoint conversion + +### [Option #1] Download weights from GitHub Follow the 
instructions here to download the llama-2-7b weights: https://github.com/meta-llama/llama#download -Upload your weights to your GSBucket +``` +ls llama + +llama-2-7b tokenizer.model .. +``` + +Upload your weights and tokenizer to your GSBucket + +``` +gcloud storage cp -r llama-2-7b/* gs://BUCKET_NAME/llama-2-7b/base/ +gcloud storage cp tokenizer.model gs://BUCKET_NAME/llama-2-7b/base/ +``` + +### [Option #2] Download weights from HuggingFace +Accept the terms and conditions from https://huggingface.co/meta-llama/Llama-2-7b-hf. + +For llama-3-8b: https://huggingface.co/meta-llama/Meta-Llama-3-8B. + +For gemma-2b: https://huggingface.co/google/gemma-2b-pytorch. + +Obtain a HuggingFace CLI token by going to your HuggingFace settings and under the `Access Tokens`, generate a `New token`. Edit permissions to your access token to have read access to your respective checkpoint repository. + +Copy your access token and create a Secret to store the HuggingFace token ``` -gcloud storage cp -r /* gs://BUCKET_NAME/llama-2-7b/base/ +kubectl create secret generic huggingface-secret \ + --from-literal=HUGGINGFACE_TOKEN= +``` + +### Apply the checkpoint conversion job + +For the following models, replace the following arguments in `checkpoint-job.yaml` + +#### Llama-2-7b-hf +``` +- -s=jetstream-pytorch +- -m=meta-llama/Llama-2-7b-hf +- -o=gs://BUCKET_NAME/pytorch/llama-2-7b/final/bf16/ +- -n=llama-2 +- -q=False +- -h=True +``` + +#### Llama-3-8b +``` +- -s=jetstream-pytorch +- -m=meta-llama/Meta-Llama-3-8B +- -o=gs://BUCKET_NAME/pytorch/llama-3-8b/final/bf16/ +- -n=llama-3 +- -q=False +- -h=True +``` + +#### Gemma-2b +``` +- -s=jetstream-pytorch +- -m=google/gemma-2b-pytorch +- -o=gs://BUCKET_NAME/pytorch/gemma-2b/final/bf16/ +- -n=gemma +- -q=False +- -h=True ``` Run the checkpoint conversion job. 
This will use the [checkpoint conversion script](https://github.com/google/jetstream-pytorch/blob/main/convert_checkpoints.py) from Jetstream-pytorch to create a compatible Pytorch checkpoint @@ -95,19 +154,19 @@ Observe your checkpoint kubectl logs -f jobs/checkpoint-converter # This can take several minutes ... -Completed uploading converted checkpoint from local path /pt-ckpt/ to GSBucket gs://BUCKET_NAME/pytorch/llama2-7b/final/bf16/" +Completed uploading converted checkpoint from local path /pt-ckpt/ to GSBucket gs://BUCKET_NAME/pytorch/llama-2-7b/final/bf16/" ``` -Now your converted checkpoint will be located in `gs://BUCKET_NAME/pytorch/llama2-7b/final/bf16/` +Now your converted checkpoint will be located in `gs://BUCKET_NAME/pytorch/llama-2-7b/final/bf16/` ## Deploy the Jetstream Pytorch server The following flags are set in the manifest file ``` ---param_size: Size of model +--size: Size of model +--model_name: Name of model (llama-2, llama-3, gemma) --batch_size: Batch size --max_cache_length: Maximum length of kv cache ---platform=tpu: TPU machine type (8 for v5e-8, 4 for v4-8) --tokenizer_path: Path to model tokenizer file --checkpoint_path: Path to checkpoint Optional flags to add @@ -115,6 +174,18 @@ Optional flags to add --quantize_kv_cache (Default False): Quantized kv cache ``` +For llama3-8b, you can use the following arguments: +``` +- --size=8b +- --model_name=llama-3 +- --batch_size=80 +- --max_cache_length=2048 +- --quantize_weights=False +- --quantize_kv_cache=False +- --tokenizer_path=/models/pytorch/llama3-8b/final/bf16/tokenizer.model +- --checkpoint_path=/models/pytorch/llama3-8b/final/bf16/model.safetensors +``` + ``` kubectl apply -f deployment.yaml ``` @@ -122,8 +193,8 @@ kubectl apply -f deployment.yaml ### Verify the deployment ``` kubectl get deployment -NAME READY UP-TO-DATE AVAILABLE AGE -jetstream-pytorch-server 2/2 2 2 ##s +NAME READY UP-TO-DATE AVAILABLE AGE +jetstream-pytorch-server 2/2 2 2 ##s ``` View the HTTP server logs to 
check that the model has been loaded and compiled. It may take the server a few minutes to complete this operation. diff --git a/tutorials-and-examples/inference-servers/jetstream/pytorch/single-host-inference/checkpoint-job.yaml b/tutorials-and-examples/inference-servers/jetstream/pytorch/single-host-inference/checkpoint-job.yaml index 99079648c..f48c1ac79 100644 --- a/tutorials-and-examples/inference-servers/jetstream/pytorch/single-host-inference/checkpoint-job.yaml +++ b/tutorials-and-examples/inference-servers/jetstream/pytorch/single-host-inference/checkpoint-job.yaml @@ -12,16 +12,20 @@ spec: restartPolicy: Never containers: - name: inference-checkpoint - image: us-docker.pkg.dev/cloud-tpu-images/inference/inference-checkpoint:v0.2.0 + image: us-docker.pkg.dev/cloud-tpu-images/inference/inference-checkpoint:v0.2.3 args: - - -i=jetstream-pytorch + - -s=jetstream-pytorch - -m=/models - - -1=gs://BUCKET_NAME/pytorch/llama2-7b/base/ - - -2=gs://BUCKET_NAME/pytorch/llama2-7b/final/bf16/ + - -i=gs://BUCKET_NAME/pytorch/llama2-7b/base/ + - -o=gs://BUCKET_NAME/pytorch/llama2-7b/final/bf16/ + - -q=False volumeMounts: - mountPath: "/kaggle/" name: kaggle-credentials readOnly: true + - mountPath: "/huggingface/" + name: huggingface-credentials + readOnly: true - name: gcs-fuse-checkpoint mountPath: /models readOnly: true @@ -38,6 +42,10 @@ spec: secret: defaultMode: 0400 secretName: kaggle-secret + - name: huggingface-credentials + secret: + defaultMode: 0400 + secretName: huggingface-secret - name: gcs-fuse-checkpoint csi: driver: gcsfuse.csi.storage.gke.io diff --git a/tutorials-and-examples/inference-servers/jetstream/pytorch/single-host-inference/deployment.yaml b/tutorials-and-examples/inference-servers/jetstream/pytorch/single-host-inference/deployment.yaml index 51dffb062..126d8cfbd 100644 --- a/tutorials-and-examples/inference-servers/jetstream/pytorch/single-host-inference/deployment.yaml +++ 
b/tutorials-and-examples/inference-servers/jetstream/pytorch/single-host-inference/deployment.yaml @@ -19,12 +19,14 @@ spec: cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice containers: - name: jetstream-pytorch-server - image: us-docker.pkg.dev/cloud-tpu-images/inference/jetstream-pytorch-server:v0.2.0 + image: us-docker.pkg.dev/cloud-tpu-images/inference/jetstream-pytorch-server:v0.2.3 args: - - --param_size=7b + - --size=7b + - --model_name=llama-2 - --batch_size=80 - --max_cache_length=2048 - - --platform=tpu=8 + - --quantize_weights=False + - --quantize_kv_cache=False - --tokenizer_path=/jetstream-pytorch/jetstream_pt/third_party/llama2/tokenizer.model - --checkpoint_path=/models/pytorch/llama-2-7b/final/bf16/model.safetensors ports: @@ -39,7 +41,7 @@ spec: limits: google.com/tpu: 8 - name: jetstream-http - image: us-docker.pkg.dev/cloud-tpu-images/inference/jetstream-http:v0.2.0 + image: us-docker.pkg.dev/cloud-tpu-images/inference/jetstream-http:v0.2.2 ports: - containerPort: 8000 volumes: diff --git a/tutorials-and-examples/inference-servers/jetstream/pytorch/single-host-inference/pd-deployment.yaml b/tutorials-and-examples/inference-servers/jetstream/pytorch/single-host-inference/pd-deployment.yaml index e6491b5e7..6297ce41b 100644 --- a/tutorials-and-examples/inference-servers/jetstream/pytorch/single-host-inference/pd-deployment.yaml +++ b/tutorials-and-examples/inference-servers/jetstream/pytorch/single-host-inference/pd-deployment.yaml @@ -17,16 +17,16 @@ spec: cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice containers: - name: jetstream-pytorch-server - image: us-docker.pkg.dev/cloud-tpu-images/inference/jetstream-pytorch-server:v0.2.0 + image: us-docker.pkg.dev/cloud-tpu-images/inference/jetstream-pytorch-server:v0.2.3 args: - - --param_size=7b + - --size=7b + - --model_name=llama-2 - --batch_size=80 - --max_cache_length=2048 - - --platform=tpu=8 - --quantize_weights=False - --quantize_kv_cache=False - 
--tokenizer_path=/jetstream-pytorch/jetstream_pt/third_party/llama2/tokenizer.model - - --checkpoint_path=/models/llama2-7b/bf16/model.safetensors + - --checkpoint_path=/models/pytorch/llama-2-7b/final/bf16/model.safetensors ports: - containerPort: 9000 volumeMounts: @@ -38,7 +38,7 @@ spec: limits: google.com/tpu: 8 - name: jetstream-http - image: us-docker.pkg.dev/cloud-tpu-images/inference/jetstream-http:v0.2.0 + image: us-docker.pkg.dev/cloud-tpu-images/inference/jetstream-http:v0.2.2 ports: - containerPort: 8000 volumes: diff --git a/tutorials-and-examples/tpu-examples/training/gpt/Dockerfile b/tutorials-and-examples/tpu-examples/training/gpt/Dockerfile index 31444db39..29524be62 100644 --- a/tutorials-and-examples/tpu-examples/training/gpt/Dockerfile +++ b/tutorials-and-examples/tpu-examples/training/gpt/Dockerfile @@ -6,7 +6,7 @@ RUN apt-get install libomp5 -y RUN pip3 install mkl mkl-include RUN pip3 install tf-nightly tb-nightly tbp-nightly RUN pip3 install numpy -RUN apt-get install numactl libopenblas-dev +RUN apt-get install numactl libopenblas-dev -y RUN ln -s /usr/local/lib/libmkl_intel_ilp64.so.2 /usr/local/lib/libmkl_intel_ilp64.so.1 diff --git a/tutorials-and-examples/workflow-orchestration/dws-examples/README.md b/tutorials-and-examples/workflow-orchestration/dws-examples/README.md index 99f4fafb4..16ef78717 100644 --- a/tutorials-and-examples/workflow-orchestration/dws-examples/README.md +++ b/tutorials-and-examples/workflow-orchestration/dws-examples/README.md @@ -1,11 +1,22 @@ # Dynamic Workload Scheduler examples -The repository contains examples on how to use DWS in GKE. More information about DWS is +The repository contains examples on how to use DWS in GKE. More information about DWS is available [here](https://cloud.google.com/kubernetes-engine/docs/how-to/provisioningrequest). 
-Files included: +## Prerequisites + +### [Kueue](https://kueue.sigs.k8s.io/) +To install a released version of Kueue in your cluster, run the following command: + +```sh +VERSION=v0.7.0 +kubectl apply --server-side -f https://github.com/kubernetes-sigs/kueue/releases/download/$VERSION/manifests.yaml +``` + +For more configuration options visit [Kueue's installation guide.](https://kueue.sigs.k8s.io/docs/installation/) + +## Files included -* `kueue-manifests.yaml` - [Kueue](https://kueue.sigs.k8s.io/) configuration files with ProvisioningRequest and DWS support enabled. * `dws-queue.yaml` - Kueue's Cluster and Local queues with ProvisioningRequest and DWS support enabled. * `job.yaml` - Sample job that requires GPU and uses DWS-enabled queue. Contains optional annotation ` provreq.kueue.x-k8s.io/maxRunDurationSeconds` which sets `maxRunDurationSeconds` for the created ProvisioningRequest