diff --git a/.codecov.yml b/.codecov.yml index eecf4f034093b..c133c701f7214 100644 --- a/.codecov.yml +++ b/.codecov.yml @@ -326,6 +326,10 @@ coverage: target: 75 flags: - kube_metrics_server + Kubeflow: + target: 75 + flags: + - kubeflow Kubelet: target: 75 flags: @@ -1138,6 +1142,11 @@ flags: paths: - kube_scheduler/datadog_checks/kube_scheduler - kube_scheduler/tests + kubeflow: + carryforward: true + paths: + - kubeflow/datadog_checks/kubeflow + - kubeflow/tests kubelet: carryforward: true paths: diff --git a/.github/workflows/config/labeler.yml b/.github/workflows/config/labeler.yml index 1bff642eb6a62..e558230a0bf92 100644 --- a/.github/workflows/config/labeler.yml +++ b/.github/workflows/config/labeler.yml @@ -279,6 +279,8 @@ integration/kube_proxy: - kube_proxy/**/* integration/kube_scheduler: - kube_scheduler/**/* +integration/kubeflow: + - kubeflow/**/* integration/kubelet: - kubelet/**/* integration/kubernetes: diff --git a/.github/workflows/test-all.yml b/.github/workflows/test-all.yml index f0a0e12278942..915067d205c41 100644 --- a/.github/workflows/test-all.yml +++ b/.github/workflows/test-all.yml @@ -2094,6 +2094,26 @@ jobs: minimum-base-package: ${{ inputs.minimum-base-package }} pytest-args: ${{ inputs.pytest-args }} secrets: inherit + j89c297c: + uses: ./.github/workflows/test-target.yml + with: + job-name: Kubeflow + target: kubeflow + platform: linux + runner: '["ubuntu-22.04"]' + repo: "${{ inputs.repo }}" + python-version: "${{ inputs.python-version }}" + standard: ${{ inputs.standard }} + latest: ${{ inputs.latest }} + agent-image: "${{ inputs.agent-image }}" + agent-image-py2: "${{ inputs.agent-image-py2 }}" + agent-image-windows: "${{ inputs.agent-image-windows }}" + agent-image-windows-py2: "${{ inputs.agent-image-windows-py2 }}" + test-py2: ${{ inputs.test-py2 }} + test-py3: ${{ inputs.test-py3 }} + minimum-base-package: ${{ inputs.minimum-base-package }} + pytest-args: ${{ inputs.pytest-args }} + secrets: inherit j24a5cff: uses: ./.github/workflows/test-target.yml with: diff --git a/kubeflow/CHANGELOG.md b/kubeflow/CHANGELOG.md new file mode 100644 index 0000000000000..8658345ebf7a3 --- /dev/null +++ b/kubeflow/CHANGELOG.md @@ -0,0 +1,4 @@ +# CHANGELOG - Kubeflow + + + diff --git a/kubeflow/README.md b/kubeflow/README.md new file mode 100644 index 0000000000000..0d9df6cb33b3b --- /dev/null +++ b/kubeflow/README.md @@ -0,0 +1,92 @@ +# Agent Check: Kubeflow + +## Overview + +This check monitors [Kubeflow][1] through the Datadog Agent. + + +## Setup + +Follow the instructions below to install and configure this check for an Agent running on a host. For containerized environments, see the [Autodiscovery Integration Templates][3] for guidance on applying these instructions. + +### Installation + +The Kubeflow check is included in the [Datadog Agent][2] package. +No additional installation is needed on your server. + +### Configuration + +1. Edit the `kubeflow.d/conf.yaml` file, in the `conf.d/` folder at the root of your Agent's configuration directory to start collecting your kubeflow performance data. See the [sample kubeflow.d/conf.yaml][4] for all available configuration options. + +2. [Restart the Agent][5]. + +#### Metric collection + +Make sure that the Prometheus-formatted metrics are exposed for your `kubeflow` componenet. +For the Agent to start collecting metrics, the `kubeflow` pods need to be annotated. + +Kubeflow has metrics endpoints that can be accessed on port `9090`. 
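For an Agent running directly on a host, a minimal `kubeflow.d/conf.yaml` could look like the sketch below. This is illustrative only: `<KUBEFLOW_HOST>` is a placeholder for the address where the Kubeflow component exposes its Prometheus-formatted metrics, and `9090` is the default port mentioned above.

```yaml
init_config:

instances:
    ## Point the check at the component's Prometheus-formatted metrics endpoint.
    ## Replace <KUBEFLOW_HOST> with the reachable address of the Kubeflow component.
  - openmetrics_endpoint: http://<KUBEFLOW_HOST>:9090/metrics
```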
+ +**Note**: The listed metrics can only be collected if they are available(depending on the version). Some metrics are generated only when certain actions are performed. + +The only parameter required for configuring the `kubeflow` check is `openmetrics_endpoint`. This parameter should be set to the location where the Prometheus-formatted metrics are exposed. The default port is `9090`. In containerized environments, `%%host%%` should be used for [host autodetection][3]. + +```yaml +apiVersion: v1 +kind: Pod +# (...) +metadata: + name: '' + annotations: + ad.datadoghq.com/controller.checks: | + { + "kubeflow": { + "init_config": {}, + "instances": [ + { + "openmetrics_endpoint": "http://%%host%%:9090/metrics" + } + ] + } + } + # (...) +spec: + containers: + - name: 'controller' +# (...) +``` + +### Validation + +[Run the Agent's status subcommand][6] and look for `kubeflow` under the Checks section. + +## Data Collected + +### Metrics + +See [metadata.csv][7] for a list of metrics provided by this integration. + +### Events + +The Kubeflow integration does not include any events. + +### Service Checks + +The Kubeflow integration does not include any service checks. + +See [service_checks.json][8] for a list of service checks provided by this integration. + +## Troubleshooting + +Need help? Contact [Datadog support][9]. + + +[1]: **LINK_TO_INTEGRATION_SITE** +[2]: https://app.datadoghq.com/account/settings/agent/latest +[3]: https://docs.datadoghq.com/agent/kubernetes/integrations/ +[4]: https://github.com/DataDog/integrations-core/blob/master/kubeflow/datadog_checks/kubeflow/data/conf.yaml.example +[5]: https://docs.datadoghq.com/agent/guide/agent-commands/#start-stop-and-restart-the-agent +[6]: https://docs.datadoghq.com/agent/guide/agent-commands/#agent-status-and-information +[7]: https://github.com/DataDog/integrations-core/blob/master/kubeflow/metadata.csv +[8]: https://github.com/DataDog/integrations-core/blob/master/kubeflow/assets/service_checks.json +[9]: https://docs.datadoghq.com/help/ diff --git a/kubeflow/assets/configuration/spec.yaml b/kubeflow/assets/configuration/spec.yaml new file mode 100644 index 0000000000000..f3afed2f8dd0c --- /dev/null +++ b/kubeflow/assets/configuration/spec.yaml @@ -0,0 +1,16 @@ +name: Kubeflow +files: +- name: kubeflow.yaml + options: + - template: init_config + options: + - template: init_config/openmetrics + - template: instances + options: + - template: instances/openmetrics + overrides: + openmetrics_endpoint.required: true + openmetrics_endpoint.value.example: http://:9090/metrics + openmetrics_endpoint.description: | + Endpoint exposing the Kubeflow's Prometheus metrics. + diff --git a/kubeflow/assets/dashboards/overview.json b/kubeflow/assets/dashboards/overview.json new file mode 100644 index 0000000000000..8b77b0122d965 --- /dev/null +++ b/kubeflow/assets/dashboards/overview.json @@ -0,0 +1,1431 @@ +{ + "author_name": "Datadog", + "description": "This dashboard provides an overview of your Kubeflow metrics describing its own operations as well as some metrics related to general state of the different componenets. 
\n\n**Useful Links\n\n[Kubeflow documentation](https://www.kubeflow.org/docs/)\n\n[Katib Github](https://github.com/kubeflow/katib)\n\n[Pipelines Github](https://github.com/kubeflow/pipelines)\n\n\n", + "layout_type": "ordered", + "template_variables": [], + "title": "Kubeflow Overview", + "widgets": [ + { + "definition": { + "background_color": "vivid_blue", + "layout_type": "ordered", + "show_title": true, + "title": "Kubeflow", + "type": "group", + "widgets": [ + { + "definition": { + "has_background": true, + "has_border": true, + "horizontal_align": "center", + "sizing": "cover", + "type": "image", + "url": "/static/images/logos/kubeflow_small.svg", + "vertical_align": "center" + }, + "id": 5929889920320930, + "layout": { + "height": 3, + "width": 5, + "x": 0, + "y": 0 + } + }, + { + "definition": { + "background_color": "white", + "content": "# Kubeflow\n\nThis dashboard provides an overview of your Kubeflow metrics describing its own operations as well as some metrics related to general state of the different componenets.\n", + "font_size": "14", + "has_padding": true, + "show_tick": false, + "text_align": "left", + "tick_edge": "left", + "tick_pos": "50%", + "type": "note", + "vertical_align": "top" + }, + "id": 4610707819074916, + "layout": { + "height": 3, + "width": 3, + "x": 0, + "y": 3 + } + }, + { + "definition": { + "background_color": "white", + "content": "# Useful Links\n\n[Kubeflow documentation](https://www.kubeflow.org/docs/)\n\n[Katib Github](https://github.com/kubeflow/katib)\n\n[Pipelines Github](https://github.com/kubeflow/pipelines)", + "font_size": "14", + "has_padding": true, + "show_tick": false, + "text_align": "left", + "tick_edge": "left", + "tick_pos": "50%", + "type": "note", + "vertical_align": "top" + }, + "id": 8366490141273904, + "layout": { + "height": 3, + "width": 2, + "x": 3, + "y": 3 + } + } + ] + }, + "id": 3375620455700908, + "layout": { + "height": 7, + "width": 5, + "x": 0, + "y": 0 + } + }, + { + "definition": { + "background_color": "vivid_blue", + "layout_type": "ordered", + "show_title": true, + "title": "Overview", + "type": "group", + "widgets": [ + { + "definition": { + "background_color": "blue", + "content": "The service checks show the Kubeflow OpenMetrics/Prometheus endpoint status.\n\nThe monitor summary shows you any active alerts for the most crucial Metrics. ", + "font_size": "14", + "has_padding": true, + "show_tick": true, + "text_align": "left", + "tick_edge": "left", + "tick_pos": "50%", + "type": "note", + "vertical_align": "center" + }, + "id": 3701171212511382, + "layout": { + "height": 2, + "width": 7, + "x": 0, + "y": 0 + } + }, + { + "definition": { + "color_preference": "text", + "count": 50, + "display_format": "countsAndList", + "hide_zero_counts": true, + "last_triggered_format": "relative", + "query": "tag:integration:kubeflow", + "show_last_triggered": false, + "show_priority": false, + "show_status": true, + "sort": "status,asc", + "start": 0, + "summary_type": "monitors", + "title": "Kubeflow monitors", + "type": "manage_status" + }, + "id": 8113486152180306, + "layout": { + "height": 4, + "width": 7, + "x": 0, + "y": 2 + } + } + ] + }, + "id": 3510698085005998, + "layout": { + "height": 7, + "width": 7, + "x": 5, + "y": 0 + } + }, + { + "definition": { + "background_color": "vivid_blue", + "layout_type": "ordered", + "show_title": true, + "title": "Kubeflow Pipelines", + "type": "group", + "widgets": [ + { + "definition": { + "background_color": "purple", + "content": "Kubeflow Pipelines duration and status. 
\n", + "font_size": "14", + "has_padding": true, + "show_tick": true, + "text_align": "center", + "tick_edge": "left", + "tick_pos": "50%", + "type": "note", + "vertical_align": "center" + }, + "id": 3082143674816138, + "layout": { + "height": 1, + "width": 12, + "x": 0, + "y": 0 + } + }, + { + "definition": { + "autoscale": true, + "precision": 2, + "requests": [ + { + "conditional_formats": [ + { + "comparator": ">", + "custom_fg_color": "#3a38b2", + "palette": "custom_text", + "value": 0 + } + ], + "formulas": [ + { + "formula": "query1" + } + ], + "queries": [ + { + "aggregator": "avg", + "data_source": "metrics", + "name": "query1", + "query": "avg:kubeflow.pipeline.run.duration.seconds.sum{*}" + } + ], + "response_format": "scalar" + } + ], + "timeseries_background": { + "type": "area", + "yaxis": { + "include_zero": true + } + }, + "title": "Sum of the pipeline run duration ", + "title_align": "left", + "title_size": "16", + "type": "query_value" + }, + "id": 1259367181324766, + "layout": { + "height": 3, + "width": 5, + "x": 0, + "y": 1 + } + }, + { + "definition": { + "requests": [ + { + "display_type": "line", + "formulas": [ + { + "formula": "query1" + } + ], + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "avg:kubeflow.pipeline.run.status{*}" + } + ], + "response_format": "timeseries", + "style": { + "color_order": "shuffled", + "line_type": "solid", + "line_width": "normal", + "order_by": "values", + "palette": "dog_classic" + } + } + ], + "show_legend": false, + "title": "Pipeline run status", + "title_align": "left", + "title_size": "16", + "type": "timeseries" + }, + "id": 6694311141159310, + "layout": { + "height": 3, + "width": 7, + "x": 5, + "y": 1 + } + } + ] + }, + "id": 1140994815971286, + "layout": { + "height": 5, + "width": 12, + "x": 0, + "y": 7 + } + }, + { + "definition": { + "background_color": "vivid_blue", + "layout_type": "ordered", + "show_title": true, + "title": "Katib Stats", + "type": "group", + "widgets": [ + { + "definition": { + "background_color": "purple", + "content": "Katib experiments stats\n", + "font_size": "14", + "has_padding": true, + "show_tick": true, + "text_align": "center", + "tick_edge": "left", + "tick_pos": "50%", + "type": "note", + "vertical_align": "center" + }, + "id": 6401772224224004, + "layout": { + "height": 1, + "width": 12, + "x": 0, + "y": 0 + } + }, + { + "definition": { + "precision": 0, + "requests": [ + { + "formulas": [ + { + "formula": "query1" + } + ], + "queries": [ + { + "aggregator": "last", + "data_source": "metrics", + "name": "query1", + "query": "sum:kubeflow.katib.experiment.created.count{*}" + } + ], + "response_format": "scalar" + } + ], + "timeseries_background": { + "type": "bars", + "yaxis": {} + }, + "title": "Katib number of experiments created", + "type": "query_value" + }, + "id": 5786169943432986, + "layout": { + "height": 2, + "width": 4, + "x": 0, + "y": 1 + } + }, + { + "definition": { + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "legend_layout": "auto", + "markers": [ + { + "display_type": "info bold", + "value": "y = 0" + } + ], + "requests": [ + { + "display_type": "bars", + "formulas": [ + { + "formula": "query3" + } + ], + "queries": [ + { + "data_source": "metrics", + "name": "query3", + "query": "sum:kubeflow.katib.experiment.succeeded.count{*}" + } + ], + "response_format": "timeseries", + "style": { + "line_type": "solid", + "line_width": "normal", + "palette": "green" + } + }, + { + "display_type": "bars", + "formulas": [ + { 
+ "formula": "-query3" + } + ], + "queries": [ + { + "data_source": "metrics", + "name": "query3", + "query": "sum:kubeflow.katib.experiment.failed.count{*}" + } + ], + "response_format": "timeseries", + "style": { + "line_type": "solid", + "line_width": "normal", + "palette": "red" + } + } + ], + "show_legend": false, + "title": "successful vs (+) / failed (-) experiments", + "type": "timeseries", + "yaxis": { + "include_zero": true, + "label": "", + "max": "auto", + "min": "auto", + "scale": "linear" + } + }, + "id": 6205396372986786, + "layout": { + "height": 2, + "width": 8, + "x": 4, + "y": 1 + } + }, + { + "definition": { + "autoscale": true, + "precision": 2, + "requests": [ + { + "conditional_formats": [ + { + "comparator": ">", + "custom_fg_color": "#7a42e0", + "palette": "custom_text", + "value": 0 + } + ], + "formulas": [ + { + "formula": "query1" + } + ], + "queries": [ + { + "aggregator": "avg", + "data_source": "metrics", + "name": "query1", + "query": "sum:kubeflow.katib.experiment.running.total{*}" + } + ], + "response_format": "scalar" + } + ], + "timeseries_background": { + "type": "bars", + "yaxis": {} + }, + "title": "Total number of running katib experiments", + "type": "query_value" + }, + "id": 4463893929861118, + "layout": { + "height": 2, + "width": 4, + "x": 0, + "y": 3 + } + }, + { + "definition": { + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "legend_layout": "horizontal", + "markers": [ + { + "display_type": "error dashed", + "value": "y > 0" + } + ], + "requests": [ + { + "display_type": "bars", + "formulas": [ + { + "formula": "query1" + } + ], + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:kubeflow.katib.experiment.duration.seconds.sum{*} by {type,kubernetes_cluster}" + } + ], + "response_format": "timeseries", + "style": { + "line_type": "solid", + "line_width": "normal", + "palette": "purple" + } + } + ], + "show_legend": true, + "title": "Katib experiments durations ", + "type": "timeseries" + }, + "id": 4154733383926124, + "layout": { + "height": 2, + "width": 8, + "x": 4, + "y": 3 + } + }, + { + "definition": { + "background_color": "purple", + "content": "Katib trial stats\n", + "font_size": "14", + "has_padding": true, + "show_tick": true, + "text_align": "center", + "tick_edge": "left", + "tick_pos": "50%", + "type": "note", + "vertical_align": "center" + }, + "id": 6441304245279666, + "layout": { + "height": 1, + "width": 12, + "x": 0, + "y": 5 + } + }, + { + "definition": { + "precision": 0, + "requests": [ + { + "formulas": [ + { + "formula": "query1" + } + ], + "queries": [ + { + "aggregator": "last", + "data_source": "metrics", + "name": "query1", + "query": "sum:kubeflow.katib.trial.created.count{*}" + } + ], + "response_format": "scalar" + } + ], + "timeseries_background": { + "type": "bars", + "yaxis": {} + }, + "title": "Katib number of trials created", + "type": "query_value" + }, + "id": 2010901513361406, + "layout": { + "height": 2, + "width": 4, + "x": 0, + "y": 6 + } + }, + { + "definition": { + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "legend_layout": "auto", + "markers": [ + { + "display_type": "info bold", + "value": "y = 0" + } + ], + "requests": [ + { + "display_type": "bars", + "formulas": [ + { + "formula": "query3" + } + ], + "queries": [ + { + "data_source": "metrics", + "name": "query3", + "query": "sum:kubeflow.katib.trial.succeeded.count{*}" + } + ], + "response_format": "timeseries", + "style": { + "line_type": "solid", + 
"line_width": "normal", + "palette": "green" + } + }, + { + "display_type": "bars", + "formulas": [ + { + "formula": "-query3" + } + ], + "queries": [ + { + "data_source": "metrics", + "name": "query3", + "query": "sum:kubeflow.katib.trial.failed.count{*}" + } + ], + "response_format": "timeseries", + "style": { + "line_type": "solid", + "line_width": "normal", + "palette": "red" + } + } + ], + "show_legend": false, + "title": "successful vs (+) / failed (-) trials", + "type": "timeseries", + "yaxis": { + "include_zero": true, + "label": "", + "max": "auto", + "min": "auto", + "scale": "linear" + } + }, + "id": 7719076397917610, + "layout": { + "height": 2, + "width": 8, + "x": 4, + "y": 6 + } + }, + { + "definition": { + "autoscale": true, + "precision": 2, + "requests": [ + { + "conditional_formats": [ + { + "comparator": ">", + "custom_fg_color": "#7a42e0", + "palette": "custom_text", + "value": 0 + } + ], + "formulas": [ + { + "formula": "query1" + } + ], + "queries": [ + { + "aggregator": "avg", + "data_source": "metrics", + "name": "query1", + "query": "sum:kubeflow.katib.trial.running.total{*}" + } + ], + "response_format": "scalar" + } + ], + "timeseries_background": { + "type": "bars", + "yaxis": {} + }, + "title": "Total number of running katib trials", + "type": "query_value" + }, + "id": 5848018093621782, + "layout": { + "height": 2, + "width": 4, + "x": 0, + "y": 8 + } + }, + { + "definition": { + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "legend_layout": "horizontal", + "markers": [ + { + "display_type": "error dashed", + "value": "y > 0" + } + ], + "requests": [ + { + "display_type": "bars", + "formulas": [ + { + "formula": "query1" + } + ], + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:kubeflow.katib.trial.duration.seconds.sum{*} by {type,kubernetes_cluster}" + } + ], + "response_format": "timeseries", + "style": { + "line_type": "solid", + "line_width": "normal", + "palette": "purple" + } + } + ], + "show_legend": true, + "title": "Katib trials durations ", + "type": "timeseries" + }, + "id": 416208597846836, + "layout": { + "height": 2, + "width": 8, + "x": 4, + "y": 8 + } + }, + { + "definition": { + "background_color": "purple", + "content": "Katib suggestion stats\n", + "font_size": "14", + "has_padding": true, + "show_tick": true, + "text_align": "center", + "tick_edge": "left", + "tick_pos": "50%", + "type": "note", + "vertical_align": "center" + }, + "id": 5079020499380548, + "layout": { + "height": 1, + "width": 12, + "x": 0, + "y": 10 + } + }, + { + "definition": { + "precision": 0, + "requests": [ + { + "formulas": [ + { + "formula": "query1" + } + ], + "queries": [ + { + "aggregator": "last", + "data_source": "metrics", + "name": "query1", + "query": "sum:kubeflow.katib.suggestion.created.count{*}" + } + ], + "response_format": "scalar" + } + ], + "timeseries_background": { + "type": "bars", + "yaxis": {} + }, + "title": "Katib number of suggestions created", + "type": "query_value" + }, + "id": 8992393927863730, + "layout": { + "height": 2, + "width": 4, + "x": 0, + "y": 11 + } + }, + { + "definition": { + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "legend_layout": "auto", + "markers": [ + { + "display_type": "info bold", + "value": "y = 0" + } + ], + "requests": [ + { + "display_type": "bars", + "formulas": [ + { + "formula": "query3" + } + ], + "queries": [ + { + "data_source": "metrics", + "name": "query3", + "query": 
"sum:kubeflow.katib.suggestion.succeeded.count{*}" + } + ], + "response_format": "timeseries", + "style": { + "line_type": "solid", + "line_width": "normal", + "palette": "green" + } + }, + { + "display_type": "bars", + "formulas": [ + { + "formula": "-query3" + } + ], + "queries": [ + { + "data_source": "metrics", + "name": "query3", + "query": "sum:kubeflow.katib.suggestion.failed.count{*}" + } + ], + "response_format": "timeseries", + "style": { + "line_type": "solid", + "line_width": "normal", + "palette": "red" + } + } + ], + "show_legend": false, + "title": "successful vs (+) / failed (-) suggestions", + "type": "timeseries", + "yaxis": { + "include_zero": true, + "label": "", + "max": "auto", + "min": "auto", + "scale": "linear" + } + }, + "id": 8825412552688930, + "layout": { + "height": 2, + "width": 8, + "x": 4, + "y": 11 + } + }, + { + "definition": { + "autoscale": true, + "precision": 2, + "requests": [ + { + "conditional_formats": [ + { + "comparator": ">", + "custom_fg_color": "#7a42e0", + "palette": "custom_text", + "value": 0 + } + ], + "formulas": [ + { + "formula": "query1" + } + ], + "queries": [ + { + "aggregator": "avg", + "data_source": "metrics", + "name": "query1", + "query": "sum:kubeflow.katib.suggestion.running.total{*}" + } + ], + "response_format": "scalar" + } + ], + "timeseries_background": { + "type": "bars", + "yaxis": {} + }, + "title": "Total number of running katib suggestions", + "type": "query_value" + }, + "id": 766816974757162, + "layout": { + "height": 2, + "width": 4, + "x": 0, + "y": 13 + } + }, + { + "definition": { + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "legend_layout": "horizontal", + "markers": [ + { + "display_type": "error dashed", + "value": "y > 0" + } + ], + "requests": [ + { + "display_type": "bars", + "formulas": [ + { + "formula": "query1" + } + ], + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:kubeflow.katib.suggestion.duration.seconds.sum{*} by {type,kubernetes_cluster}" + } + ], + "response_format": "timeseries", + "style": { + "line_type": "solid", + "line_width": "normal", + "palette": "purple" + } + } + ], + "show_legend": true, + "title": "Katib suggestions durations ", + "type": "timeseries" + }, + "id": 1119587137713258, + "layout": { + "height": 2, + "width": 8, + "x": 4, + "y": 13 + } + } + ] + }, + "id": 4692872262677082, + "layout": { + "height": 16, + "is_column_break": true, + "width": 12, + "x": 0, + "y": 12 + } + }, + { + "definition": { + "background_color": "vivid_blue", + "layout_type": "ordered", + "show_title": true, + "title": "Kserve Inference", + "type": "group", + "widgets": [ + { + "definition": { + "background_color": "purple", + "content": "The duration and size of Kserve inference requests / responses payloads along with the total number of inferences.", + "font_size": "14", + "has_padding": true, + "show_tick": true, + "text_align": "center", + "tick_edge": "left", + "tick_pos": "50%", + "type": "note", + "vertical_align": "center" + }, + "id": 538462932573560, + "layout": { + "height": 1, + "width": 12, + "x": 0, + "y": 0 + } + }, + { + "definition": { + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "legend_layout": "horizontal", + "markers": [], + "requests": [ + { + "display_type": "bars", + "formulas": [ + { + "formula": "query1" + } + ], + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:kubeflow.kserve.inference.response.bytes.count{*}" + } + ], + "response_format": 
"timeseries", + "style": { + "line_type": "solid", + "line_width": "normal", + "palette": "green" + } + } + ], + "show_legend": true, + "title": " Size of inference response payloads", + "type": "timeseries" + }, + "id": 6445736444220952, + "layout": { + "height": 3, + "width": 6, + "x": 0, + "y": 1 + } + }, + { + "definition": { + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "legend_layout": "horizontal", + "requests": [ + { + "display_type": "bars", + "formulas": [ + { + "formula": "query1" + } + ], + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:kubeflow.kserve.inferences.count{*}" + } + ], + "response_format": "timeseries", + "style": { + "line_type": "solid", + "line_width": "normal", + "palette": "dog_classic" + } + } + ], + "show_legend": true, + "title": "Total number of inferences made", + "type": "timeseries", + "yaxis": { + "include_zero": true, + "label": "", + "max": "auto", + "min": "auto", + "scale": "linear" + } + }, + "id": 5356199048292836, + "layout": { + "height": 3, + "width": 6, + "x": 6, + "y": 1 + } + }, + { + "definition": { + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "legend_layout": "auto", + "markers": [], + "requests": [ + { + "display_type": "area", + "formulas": [ + { + "formula": "query1" + } + ], + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "max:kubeflow.kserve.inference.response.bytes.count{*}" + } + ], + "response_format": "timeseries", + "style": { + "line_type": "solid", + "line_width": "normal", + "palette": "dog_classic" + } + } + ], + "show_legend": true, + "title": "Size of inference response payloads", + "type": "timeseries", + "yaxis": { + "include_zero": true, + "label": "", + "max": "auto", + "min": "auto", + "scale": "linear" + } + }, + "id": 4337262209467118, + "layout": { + "height": 2, + "width": 6, + "x": 0, + "y": 4 + } + }, + { + "definition": { + "autoscale": true, + "precision": 2, + "requests": [ + { + "conditional_formats": [ + { + "comparator": ">", + "custom_fg_color": "#8350fb", + "palette": "custom_text", + "value": 0 + } + ], + "formulas": [ + { + "formula": "query1" + } + ], + "queries": [ + { + "aggregator": "avg", + "data_source": "metrics", + "name": "query1", + "query": "sum:kubeflow.kserve.inferences.count{*}" + } + ], + "response_format": "scalar" + } + ], + "timeseries_background": { + "type": "area" + }, + "title": "Total number of inferences made", + "title_align": "left", + "title_size": "16", + "type": "query_value" + }, + "id": 5148042452139874, + "layout": { + "height": 3, + "width": 6, + "x": 6, + "y": 4 + } + } + ] + }, + "id": 5580063079033594, + "layout": { + "height": 8, + "width": 12, + "x": 0, + "y": 28 + } + }, + { + "definition": { + "background_color": "vivid_blue", + "layout_type": "ordered", + "show_title": true, + "title": "Notebook stats", + "type": "group", + "widgets": [ + { + "definition": { + "background_color": "purple", + "content": "Number of Notebooks created and running, failed and successful. 
", + "font_size": "14", + "has_padding": true, + "show_tick": true, + "text_align": "center", + "tick_edge": "left", + "tick_pos": "50%", + "type": "note", + "vertical_align": "center" + }, + "id": 8099671804704468, + "layout": { + "height": 1, + "width": 12, + "x": 0, + "y": 0 + } + }, + { + "definition": { + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "legend_layout": "auto", + "markers": [ + { + "display_type": "info bold", + "value": "y = 0" + } + ], + "requests": [ + { + "display_type": "bars", + "formulas": [ + { + "formula": "query3" + } + ], + "queries": [ + { + "data_source": "metrics", + "name": "query3", + "query": "sum:kubeflow.notebook.server.succeeded.count{*}" + } + ], + "response_format": "timeseries", + "style": { + "line_type": "solid", + "line_width": "normal", + "palette": "blue" + } + }, + { + "display_type": "bars", + "formulas": [ + { + "formula": "-query3" + } + ], + "queries": [ + { + "data_source": "metrics", + "name": "query3", + "query": "sum:kubeflow.notebook.server.failed.count{*}" + } + ], + "response_format": "timeseries", + "style": { + "line_type": "solid", + "line_width": "normal", + "palette": "orange" + } + } + ], + "show_legend": false, + "title": "successful vs (+) / failed (-) notebooks", + "type": "timeseries", + "yaxis": { + "include_zero": true, + "label": "", + "max": "auto", + "min": "auto", + "scale": "linear" + } + }, + "id": 4260635829209500, + "layout": { + "height": 2, + "width": 12, + "x": 0, + "y": 1 + } + }, + { + "definition": { + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "legend_layout": "horizontal", + "markers": [], + "requests": [ + { + "display_type": "bars", + "formulas": [ + { + "formula": "query1" + } + ], + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:kubeflow.notebook.server.created.count{*}" + } + ], + "response_format": "timeseries", + "style": { + "line_type": "solid", + "line_width": "normal", + "palette": "purple" + } + } + ], + "show_legend": true, + "title": "Number of notebooks created", + "type": "timeseries" + }, + "id": 6320529556042010, + "layout": { + "height": 3, + "width": 6, + "x": 0, + "y": 3 + } + }, + { + "definition": { + "autoscale": true, + "precision": 2, + "requests": [ + { + "formulas": [ + { + "formula": "query1" + } + ], + "queries": [ + { + "aggregator": "avg", + "data_source": "metrics", + "name": "query1", + "query": "sum:kubeflow.notebook.server.running.total{*}" + } + ], + "response_format": "scalar" + } + ], + "timeseries_background": { + "type": "area" + }, + "title": "Total number of notebooks running", + "type": "query_value" + }, + "id": 4881143221156770, + "layout": { + "height": 3, + "width": 6, + "x": 6, + "y": 3 + } + } + ] + }, + "id": 5013331516652710, + "layout": { + "height": 7, + "width": 12, + "x": 0, + "y": 36 + } + } + ] +} \ No newline at end of file diff --git a/kubeflow/assets/monitors/kubeflow.json b/kubeflow/assets/monitors/kubeflow.json new file mode 100644 index 0000000000000..0151c006fd495 --- /dev/null +++ b/kubeflow/assets/monitors/kubeflow.json @@ -0,0 +1,32 @@ +{ + "version": 2, + "created_at": "2024-08-27", + "last_updated_at": "2024-08-27", + "title": "Kubeflow has failed pipelines", + "description": "The number of failed Kubeflow pipelines.", + "definition": +{ + "name": "Kubeflow failed pipelines", + "type": "query alert", + "query": "min(last_5m):avg:kubeflow.pipeline.run.status{statust:failed} > 0", + "message": "You have failed pipelines.", + "tags": [], + 
"options": { + "thresholds": { + "critical": 0 + }, + "notify_audit": false, + "include_tags": false, + "new_host_delay": 300, + "silenced": {}, + "avalanche_window": 10 + }, + "priority": null, + "restriction_policy": { + "bindings": [] + } +}, +"tags": [ + "integration:kubeflow" +] +} \ No newline at end of file diff --git a/kubeflow/assets/service_checks.json b/kubeflow/assets/service_checks.json new file mode 100644 index 0000000000000..9f2199cefeb06 --- /dev/null +++ b/kubeflow/assets/service_checks.json @@ -0,0 +1,17 @@ +[ + { + "agent_version": "7.59.0", + "integration": "Kubeflow", + "check": "kubeflow.openmetrics.health", + "statuses": [ + "ok", + "critical" + ], + "groups": [ + "host", + "endpoint" + ], + "name": "Kubeflow OpenMetrics endpoint health", + "description": "Returns `CRITICAL` if the Agent is unable to connect to the Kubeflow OpenMetrics endpoint, otherwise returns `OK`." + } +] diff --git a/kubeflow/changelog.d/18391.added b/kubeflow/changelog.d/18391.added new file mode 100644 index 0000000000000..aa949b47b7b41 --- /dev/null +++ b/kubeflow/changelog.d/18391.added @@ -0,0 +1 @@ +Initial Release \ No newline at end of file diff --git a/kubeflow/datadog_checks/__init__.py b/kubeflow/datadog_checks/__init__.py new file mode 100644 index 0000000000000..1517d901c0aae --- /dev/null +++ b/kubeflow/datadog_checks/__init__.py @@ -0,0 +1,4 @@ +# (C) Datadog, Inc. 2024-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) +__path__ = __import__('pkgutil').extend_path(__path__, __name__) # type: ignore diff --git a/kubeflow/datadog_checks/kubeflow/__about__.py b/kubeflow/datadog_checks/kubeflow/__about__.py new file mode 100644 index 0000000000000..e9541ce83e9e5 --- /dev/null +++ b/kubeflow/datadog_checks/kubeflow/__about__.py @@ -0,0 +1,4 @@ +# (C) Datadog, Inc. 2024-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) +__version__ = '0.0.1' diff --git a/kubeflow/datadog_checks/kubeflow/__init__.py b/kubeflow/datadog_checks/kubeflow/__init__.py new file mode 100644 index 0000000000000..1179dfe512ba6 --- /dev/null +++ b/kubeflow/datadog_checks/kubeflow/__init__.py @@ -0,0 +1,7 @@ +# (C) Datadog, Inc. 2024-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) +from .__about__ import __version__ +from .check import KubeflowCheck + +__all__ = ['__version__', 'KubeflowCheck'] diff --git a/kubeflow/datadog_checks/kubeflow/check.py b/kubeflow/datadog_checks/kubeflow/check.py new file mode 100644 index 0000000000000..7375cc1d9367f --- /dev/null +++ b/kubeflow/datadog_checks/kubeflow/check.py @@ -0,0 +1,22 @@ +# (C) Datadog, Inc. 
2024-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) + +from datadog_checks.base import OpenMetricsBaseCheckV2 + +from .metrics import METRIC_MAP, RENAME_LABELS_MAP + + +class KubeflowCheck(OpenMetricsBaseCheckV2): + + DEFAULT_METRIC_LIMIT = 0 + __NAMESPACE__ = 'kubeflow' + + def __init__(self, name, init_config, instances=None): + super(KubeflowCheck, self).__init__(name, init_config, instances) + + def get_default_config(self): + return { + 'metrics': [METRIC_MAP], + "rename_labels": RENAME_LABELS_MAP, + } diff --git a/kubeflow/datadog_checks/kubeflow/config_models/__init__.py b/kubeflow/datadog_checks/kubeflow/config_models/__init__.py new file mode 100644 index 0000000000000..106fff2032f68 --- /dev/null +++ b/kubeflow/datadog_checks/kubeflow/config_models/__init__.py @@ -0,0 +1,24 @@ +# (C) Datadog, Inc. 2024-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) + +# This file is autogenerated. +# To change this file you should edit assets/configuration/spec.yaml and then run the following commands: +# ddev -x validate config -s +# ddev -x validate models -s + +from .instance import InstanceConfig +from .shared import SharedConfig + + +class ConfigMixin: + _config_model_instance: InstanceConfig + _config_model_shared: SharedConfig + + @property + def config(self) -> InstanceConfig: + return self._config_model_instance + + @property + def shared_config(self) -> SharedConfig: + return self._config_model_shared diff --git a/kubeflow/datadog_checks/kubeflow/config_models/defaults.py b/kubeflow/datadog_checks/kubeflow/config_models/defaults.py new file mode 100644 index 0000000000000..bf7519af75f42 --- /dev/null +++ b/kubeflow/datadog_checks/kubeflow/config_models/defaults.py @@ -0,0 +1,132 @@ +# (C) Datadog, Inc. 2024-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) + +# This file is autogenerated. 
+# To change this file you should edit assets/configuration/spec.yaml and then run the following commands: +# ddev -x validate config -s +# ddev -x validate models -s + + +def shared_skip_proxy(): + return False + + +def shared_timeout(): + return 10 + + +def instance_allow_redirects(): + return True + + +def instance_auth_type(): + return 'basic' + + +def instance_cache_metric_wildcards(): + return True + + +def instance_cache_shared_labels(): + return True + + +def instance_collect_counters_with_distributions(): + return False + + +def instance_collect_histogram_buckets(): + return True + + +def instance_disable_generic_tags(): + return False + + +def instance_empty_default_hostname(): + return False + + +def instance_enable_health_service_check(): + return True + + +def instance_histogram_buckets_as_distributions(): + return False + + +def instance_ignore_connection_errors(): + return False + + +def instance_kerberos_auth(): + return 'disabled' + + +def instance_kerberos_delegate(): + return False + + +def instance_kerberos_force_initiate(): + return False + + +def instance_log_requests(): + return False + + +def instance_min_collection_interval(): + return 15 + + +def instance_non_cumulative_histogram_buckets(): + return False + + +def instance_persist_connections(): + return False + + +def instance_request_size(): + return 16 + + +def instance_skip_proxy(): + return False + + +def instance_tag_by_endpoint(): + return True + + +def instance_telemetry(): + return False + + +def instance_timeout(): + return 10 + + +def instance_tls_ignore_warning(): + return False + + +def instance_tls_use_host_header(): + return False + + +def instance_tls_verify(): + return True + + +def instance_use_latest_spec(): + return False + + +def instance_use_legacy_auth_encoding(): + return True + + +def instance_use_process_start_time(): + return False diff --git a/kubeflow/datadog_checks/kubeflow/config_models/instance.py b/kubeflow/datadog_checks/kubeflow/config_models/instance.py new file mode 100644 index 0000000000000..8e39a0e921719 --- /dev/null +++ b/kubeflow/datadog_checks/kubeflow/config_models/instance.py @@ -0,0 +1,171 @@ +# (C) Datadog, Inc. 2024-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) + +# This file is autogenerated. +# To change this file you should edit assets/configuration/spec.yaml and then run the following commands: +# ddev -x validate config -s +# ddev -x validate models -s + +from __future__ import annotations + +from types import MappingProxyType +from typing import Any, Optional, Union + +from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator + +from datadog_checks.base.utils.functions import identity +from datadog_checks.base.utils.models import validation + +from . 
import defaults, validators + + +class AuthToken(BaseModel): + model_config = ConfigDict( + arbitrary_types_allowed=True, + frozen=True, + ) + reader: Optional[MappingProxyType[str, Any]] = None + writer: Optional[MappingProxyType[str, Any]] = None + + +class ExtraMetrics(BaseModel): + model_config = ConfigDict( + arbitrary_types_allowed=True, + extra='allow', + frozen=True, + ) + name: Optional[str] = None + type: Optional[str] = None + + +class MetricPatterns(BaseModel): + model_config = ConfigDict( + arbitrary_types_allowed=True, + frozen=True, + ) + exclude: Optional[tuple[str, ...]] = None + include: Optional[tuple[str, ...]] = None + + +class Metrics(BaseModel): + model_config = ConfigDict( + arbitrary_types_allowed=True, + extra='allow', + frozen=True, + ) + name: Optional[str] = None + type: Optional[str] = None + + +class Proxy(BaseModel): + model_config = ConfigDict( + arbitrary_types_allowed=True, + frozen=True, + ) + http: Optional[str] = None + https: Optional[str] = None + no_proxy: Optional[tuple[str, ...]] = None + + +class ShareLabels(BaseModel): + model_config = ConfigDict( + arbitrary_types_allowed=True, + frozen=True, + ) + labels: Optional[tuple[str, ...]] = None + match: Optional[tuple[str, ...]] = None + + +class InstanceConfig(BaseModel): + model_config = ConfigDict( + validate_default=True, + arbitrary_types_allowed=True, + frozen=True, + ) + allow_redirects: Optional[bool] = None + auth_token: Optional[AuthToken] = None + auth_type: Optional[str] = None + aws_host: Optional[str] = None + aws_region: Optional[str] = None + aws_service: Optional[str] = None + cache_metric_wildcards: Optional[bool] = None + cache_shared_labels: Optional[bool] = None + collect_counters_with_distributions: Optional[bool] = None + collect_histogram_buckets: Optional[bool] = None + connect_timeout: Optional[float] = None + disable_generic_tags: Optional[bool] = None + empty_default_hostname: Optional[bool] = None + enable_health_service_check: Optional[bool] = None + exclude_labels: Optional[tuple[str, ...]] = None + exclude_metrics: Optional[tuple[str, ...]] = None + exclude_metrics_by_labels: Optional[MappingProxyType[str, Union[bool, tuple[str, ...]]]] = None + extra_headers: Optional[MappingProxyType[str, Any]] = None + extra_metrics: Optional[tuple[Union[str, MappingProxyType[str, Union[str, ExtraMetrics]]], ...]] = None + headers: Optional[MappingProxyType[str, Any]] = None + histogram_buckets_as_distributions: Optional[bool] = None + hostname_format: Optional[str] = None + hostname_label: Optional[str] = None + ignore_connection_errors: Optional[bool] = None + ignore_tags: Optional[tuple[str, ...]] = None + include_labels: Optional[tuple[str, ...]] = None + kerberos_auth: Optional[str] = None + kerberos_cache: Optional[str] = None + kerberos_delegate: Optional[bool] = None + kerberos_force_initiate: Optional[bool] = None + kerberos_hostname: Optional[str] = None + kerberos_keytab: Optional[str] = None + kerberos_principal: Optional[str] = None + log_requests: Optional[bool] = None + metric_patterns: Optional[MetricPatterns] = None + metrics: Optional[tuple[Union[str, MappingProxyType[str, Union[str, Metrics]]], ...]] = None + min_collection_interval: Optional[float] = None + namespace: Optional[str] = Field(None, pattern='\\w*') + non_cumulative_histogram_buckets: Optional[bool] = None + ntlm_domain: Optional[str] = None + openmetrics_endpoint: str + password: Optional[str] = None + persist_connections: Optional[bool] = None + proxy: Optional[Proxy] = None + raw_line_filters: 
Optional[tuple[str, ...]] = None + raw_metric_prefix: Optional[str] = None + read_timeout: Optional[float] = None + rename_labels: Optional[MappingProxyType[str, Any]] = None + request_size: Optional[float] = None + service: Optional[str] = None + share_labels: Optional[MappingProxyType[str, Union[bool, ShareLabels]]] = None + skip_proxy: Optional[bool] = None + tag_by_endpoint: Optional[bool] = None + tags: Optional[tuple[str, ...]] = None + telemetry: Optional[bool] = None + timeout: Optional[float] = None + tls_ca_cert: Optional[str] = None + tls_cert: Optional[str] = None + tls_ignore_warning: Optional[bool] = None + tls_private_key: Optional[str] = None + tls_protocols_allowed: Optional[tuple[str, ...]] = None + tls_use_host_header: Optional[bool] = None + tls_verify: Optional[bool] = None + use_latest_spec: Optional[bool] = None + use_legacy_auth_encoding: Optional[bool] = None + use_process_start_time: Optional[bool] = None + username: Optional[str] = None + + @model_validator(mode='before') + def _initial_validation(cls, values): + return validation.core.initialize_config(getattr(validators, 'initialize_instance', identity)(values)) + + @field_validator('*', mode='before') + def _validate(cls, value, info): + field = cls.model_fields[info.field_name] + field_name = field.alias or info.field_name + if field_name in info.context['configured_fields']: + value = getattr(validators, f'instance_{info.field_name}', identity)(value, field=field) + else: + value = getattr(defaults, f'instance_{info.field_name}', lambda: value)() + + return validation.utils.make_immutable(value) + + @model_validator(mode='after') + def _final_validation(cls, model): + return validation.core.check_model(getattr(validators, 'check_instance', identity)(model)) diff --git a/kubeflow/datadog_checks/kubeflow/config_models/shared.py b/kubeflow/datadog_checks/kubeflow/config_models/shared.py new file mode 100644 index 0000000000000..0e8a9ecab10a2 --- /dev/null +++ b/kubeflow/datadog_checks/kubeflow/config_models/shared.py @@ -0,0 +1,60 @@ +# (C) Datadog, Inc. 2024-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) + +# This file is autogenerated. +# To change this file you should edit assets/configuration/spec.yaml and then run the following commands: +# ddev -x validate config -s +# ddev -x validate models -s + +from __future__ import annotations + +from typing import Optional + +from pydantic import BaseModel, ConfigDict, field_validator, model_validator + +from datadog_checks.base.utils.functions import identity +from datadog_checks.base.utils.models import validation + +from . 
import defaults, validators + + +class Proxy(BaseModel): + model_config = ConfigDict( + arbitrary_types_allowed=True, + frozen=True, + ) + http: Optional[str] = None + https: Optional[str] = None + no_proxy: Optional[tuple[str, ...]] = None + + +class SharedConfig(BaseModel): + model_config = ConfigDict( + validate_default=True, + arbitrary_types_allowed=True, + frozen=True, + ) + proxy: Optional[Proxy] = None + service: Optional[str] = None + skip_proxy: Optional[bool] = None + timeout: Optional[float] = None + + @model_validator(mode='before') + def _initial_validation(cls, values): + return validation.core.initialize_config(getattr(validators, 'initialize_shared', identity)(values)) + + @field_validator('*', mode='before') + def _validate(cls, value, info): + field = cls.model_fields[info.field_name] + field_name = field.alias or info.field_name + if field_name in info.context['configured_fields']: + value = getattr(validators, f'shared_{info.field_name}', identity)(value, field=field) + else: + value = getattr(defaults, f'shared_{info.field_name}', lambda: value)() + + return validation.utils.make_immutable(value) + + @model_validator(mode='after') + def _final_validation(cls, model): + return validation.core.check_model(getattr(validators, 'check_shared', identity)(model)) diff --git a/kubeflow/datadog_checks/kubeflow/config_models/validators.py b/kubeflow/datadog_checks/kubeflow/config_models/validators.py new file mode 100644 index 0000000000000..70150e85e6124 --- /dev/null +++ b/kubeflow/datadog_checks/kubeflow/config_models/validators.py @@ -0,0 +1,13 @@ +# (C) Datadog, Inc. 2024-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) + +# Here you can include additional config validators or transformers +# +# def initialize_instance(values, **kwargs): +# if 'my_option' not in values and 'my_legacy_option' in values: +# values['my_option'] = values['my_legacy_option'] +# if values.get('my_number') > 10: +# raise ValueError('my_number max value is 10, got %s' % str(values.get('my_number'))) +# +# return values diff --git a/kubeflow/datadog_checks/kubeflow/data/conf.yaml.example b/kubeflow/datadog_checks/kubeflow/data/conf.yaml.example new file mode 100644 index 0000000000000..f148774d33235 --- /dev/null +++ b/kubeflow/datadog_checks/kubeflow/data/conf.yaml.example @@ -0,0 +1,625 @@ +## All options defined here are available to all instances. +# +init_config: + + ## @param proxy - mapping - optional + ## Set HTTP or HTTPS proxies for all instances. Use the `no_proxy` list + ## to specify hosts that must bypass proxies. + ## + ## The SOCKS protocol is also supported like so: + ## + ## socks5://user:pass@host:port + ## + ## Using the scheme `socks5` causes the DNS resolution to happen on the + ## client, rather than on the proxy server. This is in line with `curl`, + ## which uses the scheme to decide whether to do the DNS resolution on + ## the client or proxy. If you want to resolve the domains on the proxy + ## server, use `socks5h` as the scheme. + # + # proxy: + # http: http://: + # https: https://: + # no_proxy: + # - + # - + + ## @param skip_proxy - boolean - optional - default: false + ## If set to `true`, this makes the check bypass any proxy + ## settings enabled and attempt to reach services directly. + # + # skip_proxy: false + + ## @param timeout - number - optional - default: 10 + ## The timeout for connecting to services. 
+ # + # timeout: 10 + + ## @param service - string - optional + ## Attach the tag `service:` to every metric, event, and service check emitted by this integration. + ## + ## Additionally, this sets the default `service` for every log source. + # + # service: + +## Every instance is scheduled independently of the others. +# +instances: + + ## @param openmetrics_endpoint - string - required + ## Endpoint exposing the Kubeflow's Prometheus metrics. + # + - openmetrics_endpoint: http://:9090/metrics + + ## @param raw_metric_prefix - string - optional + ## A prefix that is removed from all exposed metric names, if present. + ## All configuration options will use the prefix-less name. + # + # raw_metric_prefix: _ + + ## @param extra_metrics - (list of string or mapping) - optional + ## This list defines metrics to collect from the `openmetrics_endpoint`, in addition to + ## what the check collects by default. If the check already collects a metric, then + ## metric definitions here take precedence. Metrics may be defined in 3 ways: + ## + ## 1. If the item is a string, then it represents the exposed metric name, and + ## the sent metric name will be identical. For example: + ## + ## extra_metrics: + ## - + ## - + ## 2. If the item is a mapping, then the keys represent the exposed metric names. + ## + ## a. If a value is a string, then it represents the sent metric name. For example: + ## + ## extra_metrics: + ## - : + ## - : + ## b. If a value is a mapping, then it must have a `name` and/or `type` key. + ## The `name` represents the sent metric name, and the `type` represents how + ## the metric should be handled, overriding any type information the endpoint + ## may provide. For example: + ## + ## extra_metrics: + ## - : + ## name: + ## type: + ## - : + ## name: + ## type: + ## + ## The supported native types are `gauge`, `counter`, `histogram`, and `summary`. + ## + ## Note: To collect counter metrics with names ending in `_total`, specify the metric name without the `_total` + ## suffix. For example, to collect the counter metric `promhttp_metric_handler_requests_total`, specify + ## `promhttp_metric_handler_requests`. This submits to Datadog the metric name appended with `.count`. + ## For more information, see: + ## https://github.com/OpenObservability/OpenMetrics/blob/main/specification/OpenMetrics.md#suffixes + ## + ## Regular expressions may be used to match the exposed metric names, for example: + ## + ## extra_metrics: + ## - ^network_(ingress|egress)_.+ + ## - .+: + ## type: gauge + # + # extra_metrics: [] + + ## @param exclude_metrics - list of strings - optional + ## A list of metrics to exclude, with each entry being either + ## the exact metric name or a regular expression. + ## In order to exclude all metrics but the ones matching a specific filter, + ## you can use a negative lookahead regex like: + ## - ^(?!foo).*$ + # + # exclude_metrics: [] + + ## @param exclude_metrics_by_labels - mapping - optional + ## A mapping of labels to exclude metrics with matching label name and their corresponding metric values. To match + ## all values of a label, set it to `true`. + ## + ## Note: Label filtering happens before `rename_labels`. + ## + ## For example, the following configuration instructs the check to exclude all metrics with + ## a label `worker` or a label `pid` with the value of either `23` or `42`. 
+ ## + ## exclude_metrics_by_labels: + ## worker: true + ## pid: + ## - '23' + ## - '42' + # + # exclude_metrics_by_labels: {} + + ## @param exclude_labels - list of strings - optional + ## A list of labels to exclude, useful for high cardinality values like timestamps or UUIDs. + ## May be used in conjunction with `include_labels`. + ## Labels defined in `exclude_labels` will take precedence in case of overlap. + ## + ## Note: Label filtering happens before `rename_labels`. + # + # exclude_labels: [] + + ## @param include_labels - list of strings - optional + ## A list of labels to include. May be used in conjunction with `exclude_labels`. + ## Labels defined in `exclude_labels` will take precedence in case of overlap. + ## + ## Note: Label filtering happens before `rename_labels`. + # + # include_labels: [] + + ## @param rename_labels - mapping - optional + ## A mapping of label names to their new names. + # + # rename_labels: + # : + # : + + ## @param enable_health_service_check - boolean - optional - default: true + ## Whether or not to send a service check named `.openmetrics.health` which reports + ## the health of the `openmetrics_endpoint`. + # + # enable_health_service_check: true + + ## @param ignore_connection_errors - boolean - optional - default: false + ## Whether or not to ignore connection errors when scraping `openmetrics_endpoint`. + # + # ignore_connection_errors: false + + ## @param hostname_label - string - optional + ## Override the hostname for every metric submission with the value of one of its labels. + # + # hostname_label: + + ## @param hostname_format - string - optional + ## When `hostname_label` is set, this instructs the check how to format the values. The string + ## `` is replaced by the value of the label defined by `hostname_label`. + # + # hostname_format: + + ## @param collect_histogram_buckets - boolean - optional - default: true + ## Whether or not to send histogram buckets. + # + # collect_histogram_buckets: true + + ## @param non_cumulative_histogram_buckets - boolean - optional - default: false + ## Whether or not histogram buckets are non-cumulative and to come with a `lower_bound` tag. + # + # non_cumulative_histogram_buckets: false + + ## @param histogram_buckets_as_distributions - boolean - optional - default: false + ## Whether or not to send histogram buckets as Datadog distribution metrics. This implicitly + ## enables the `collect_histogram_buckets` and `non_cumulative_histogram_buckets` options. + ## + ## Learn more about distribution metrics: + ## https://docs.datadoghq.com/developers/metrics/types/?tab=distribution#metric-types + # + # histogram_buckets_as_distributions: false + + ## @param collect_counters_with_distributions - boolean - optional - default: false + ## Whether or not to also collect the observation counter metrics ending in `.sum` and `.count` + ## when sending histogram buckets as Datadog distribution metrics. This implicitly enables the + ## `histogram_buckets_as_distributions` option. + # + # collect_counters_with_distributions: false + + ## @param use_process_start_time - boolean - optional - default: false + ## Whether to enable a heuristic for reporting counter values on the first scrape. When true, + ## the first time an endpoint is scraped, check `process_start_time_seconds` to decide whether zero + ## initial value can be assumed for counters. This requires keeping metrics in memory until the entire + ## response is received. 
+ # + # use_process_start_time: false + + ## @param share_labels - mapping - optional + ## This mapping allows for the sharing of labels across multiple metrics. The keys represent the + ## exposed metrics from which to share labels, and the values are mappings that configure the + ## sharing behavior. Each mapping must have at least one of the following keys: + ## + ## labels - This is a list of labels to share. All labels are shared if this is not set. + ## match - This is a list of labels to match on other metrics as a condition for sharing. + ## values - This is a list of allowed values as a condition for sharing. + ## + ## To unconditionally share all labels of a metric, set it to `true`. + ## + ## For example, the following configuration instructs the check to apply all labels from `metric_a` + ## to all other metrics, the `node` label from `metric_b` to only those metrics that have a `pod` + ## label value that matches the `pod` label value of `metric_b`, and all labels from `metric_c` + ## to all other metrics if their value is equal to `23` or `42`. + ## + ## share_labels: + ## metric_a: true + ## metric_b: + ## labels: + ## - node + ## match: + ## - pod + ## metric_c: + ## values: + ## - 23 + ## - 42 + # + # share_labels: {} + + ## @param cache_shared_labels - boolean - optional - default: true + ## When `share_labels` is set, it instructs the check to cache labels collected from the first payload + ## for improved performance. + ## + ## Set this to `false` to compute label sharing for every payload at the risk of potentially increased memory usage. + # + # cache_shared_labels: true + + ## @param raw_line_filters - list of strings - optional + ## A list of regular expressions used to exclude lines read from the `openmetrics_endpoint` + ## from being parsed. + # + # raw_line_filters: [] + + ## @param cache_metric_wildcards - boolean - optional - default: true + ## Whether or not to cache data from metrics that are defined by regular expressions rather + ## than the full metric name. + # + # cache_metric_wildcards: true + + ## @param telemetry - boolean - optional - default: false + ## Whether or not to submit metrics prefixed by `.telemetry.` for debugging purposes. + # + # telemetry: false + + ## @param ignore_tags - list of strings - optional + ## A list of regular expressions used to ignore tags added by Autodiscovery and entries in the `tags` option. + # + # ignore_tags: + # - + # - + # - + + ## @param proxy - mapping - optional + ## This overrides the `proxy` setting in `init_config`. + ## + ## Set HTTP or HTTPS proxies for this instance. Use the `no_proxy` list + ## to specify hosts that must bypass proxies. + ## + ## The SOCKS protocol is also supported, for example: + ## + ## socks5://user:pass@host:port + ## + ## Using the scheme `socks5` causes the DNS resolution to happen on the + ## client, rather than on the proxy server. This is in line with `curl`, + ## which uses the scheme to decide whether to do the DNS resolution on + ## the client or proxy. If you want to resolve the domains on the proxy + ## server, use `socks5h` as the scheme. + # + # proxy: + # http: http://: + # https: https://: + # no_proxy: + # - + # - + + ## @param skip_proxy - boolean - optional - default: false + ## This overrides the `skip_proxy` setting in `init_config`. + ## + ## If set to `true`, this makes the check bypass any proxy + ## settings enabled and attempt to reach services directly. 
+ # + # skip_proxy: false + + ## @param auth_type - string - optional - default: basic + ## The type of authentication to use. The available types (and related options) are: + ## + ## - basic + ## |__ username + ## |__ password + ## |__ use_legacy_auth_encoding + ## - digest + ## |__ username + ## |__ password + ## - ntlm + ## |__ ntlm_domain + ## |__ password + ## - kerberos + ## |__ kerberos_auth + ## |__ kerberos_cache + ## |__ kerberos_delegate + ## |__ kerberos_force_initiate + ## |__ kerberos_hostname + ## |__ kerberos_keytab + ## |__ kerberos_principal + ## - aws + ## |__ aws_region + ## |__ aws_host + ## |__ aws_service + ## + ## The `aws` auth type relies on boto3 to automatically gather AWS credentials, for example: from `.aws/credentials`. + ## Details: https://boto3.amazonaws.com/v1/documentation/api/latest/guide/configuration.html#configuring-credentials + # + # auth_type: basic + + ## @param use_legacy_auth_encoding - boolean - optional - default: true + ## When `auth_type` is set to `basic`, this determines whether to encode as `latin1` rather than `utf-8`. + # + # use_legacy_auth_encoding: true + + ## @param username - string - optional + ## The username to use if services are behind basic or digest auth. + # + # username: + + ## @param password - string - optional + ## The password to use if services are behind basic or NTLM auth. + # + # password: + + ## @param ntlm_domain - string - optional + ## If your services use NTLM authentication, specify + ## the domain used in the check. For NTLM Auth, append + ## the username to domain, not as the `username` parameter. + # + # ntlm_domain: \ + + ## @param kerberos_auth - string - optional - default: disabled + ## If your services use Kerberos authentication, you can specify the Kerberos + ## strategy to use between: + ## + ## - required + ## - optional + ## - disabled + ## + ## See https://github.com/requests/requests-kerberos#mutual-authentication + # + # kerberos_auth: disabled + + ## @param kerberos_cache - string - optional + ## Sets the KRB5CCNAME environment variable. + ## It should point to a credential cache with a valid TGT. + # + # kerberos_cache: + + ## @param kerberos_delegate - boolean - optional - default: false + ## Set to `true` to enable Kerberos delegation of credentials to a server that requests delegation. + ## + ## See https://github.com/requests/requests-kerberos#delegation + # + # kerberos_delegate: false + + ## @param kerberos_force_initiate - boolean - optional - default: false + ## Set to `true` to preemptively initiate the Kerberos GSS exchange and + ## present a Kerberos ticket on the initial request (and all subsequent). + ## + ## See https://github.com/requests/requests-kerberos#preemptive-authentication + # + # kerberos_force_initiate: false + + ## @param kerberos_hostname - string - optional + ## Override the hostname used for the Kerberos GSS exchange if its DNS name doesn't + ## match its Kerberos hostname, for example: behind a content switch or load balancer. + ## + ## See https://github.com/requests/requests-kerberos#hostname-override + # + # kerberos_hostname: + + ## @param kerberos_principal - string - optional + ## Set an explicit principal, to force Kerberos to look for a + ## matching credential cache for the named user. + ## + ## See https://github.com/requests/requests-kerberos#explicit-principal + # + # kerberos_principal: + + ## @param kerberos_keytab - string - optional + ## Set the path to your Kerberos key tab file. 
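+    ##
+    ## For example, a hypothetical keytab location:
+    ##
+    ##   kerberos_keytab: /etc/krb5.keytab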
+ # + # kerberos_keytab: + + ## @param auth_token - mapping - optional + ## This allows for the use of authentication information from dynamic sources. + ## Both a reader and writer must be configured. + ## + ## The available readers are: + ## + ## - type: file + ## path (required): The absolute path for the file to read from. + ## pattern: A regular expression pattern with a single capture group used to find the + ## token rather than using the entire file, for example: Your secret is (.+) + ## - type: oauth + ## url (required): The token endpoint. + ## client_id (required): The client identifier. + ## client_secret (required): The client secret. + ## basic_auth: Whether the provider expects credentials to be transmitted in + ## an HTTP Basic Auth header. The default is: false + ## options: Mapping of additional options to pass to the provider, such as the audience + ## or the scope. For example: + ## options: + ## audience: https://example.com + ## scope: read:example + ## + ## The available writers are: + ## + ## - type: header + ## name (required): The name of the field, for example: Authorization + ## value: The template value, for example `Bearer `. The default is: + ## placeholder: The substring in `value` to replace with the token, defaults to: + # + # auth_token: + # reader: + # type: + # : + # : + # writer: + # type: + # : + # : + + ## @param aws_region - string - optional + ## If your services require AWS Signature Version 4 signing, set the region. + ## + ## See https://docs.aws.amazon.com/general/latest/gr/signature-version-4.html + # + # aws_region: + + ## @param aws_host - string - optional + ## If your services require AWS Signature Version 4 signing, set the host. + ## This only needs the hostname and does not require the protocol (HTTP, HTTPS, and more). + ## For example, if connecting to https://us-east-1.amazonaws.com/, set `aws_host` to `us-east-1.amazonaws.com`. + ## + ## Note: This setting is not necessary for official integrations. + ## + ## See https://docs.aws.amazon.com/general/latest/gr/signature-version-4.html + # + # aws_host: + + ## @param aws_service - string - optional + ## If your services require AWS Signature Version 4 signing, set the service code. For a list + ## of available service codes, see https://docs.aws.amazon.com/general/latest/gr/rande.html + ## + ## Note: This setting is not necessary for official integrations. + ## + ## See https://docs.aws.amazon.com/general/latest/gr/signature-version-4.html + # + # aws_service: + + ## @param tls_verify - boolean - optional - default: true + ## Instructs the check to validate the TLS certificate of services. + # + # tls_verify: true + + ## @param tls_use_host_header - boolean - optional - default: false + ## If a `Host` header is set, this enables its use for SNI (matching against the TLS certificate CN or SAN). + # + # tls_use_host_header: false + + ## @param tls_ignore_warning - boolean - optional - default: false + ## If `tls_verify` is disabled, security warnings are logged by the check. + ## Disable those by setting `tls_ignore_warning` to true. + # + # tls_ignore_warning: false + + ## @param tls_cert - string - optional + ## The path to a single file in PEM format containing a certificate as well as any + ## number of CA certificates needed to establish the certificate's authenticity for + ## use when connecting to services. It may also contain an unencrypted private key to use. 
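+    ##
+    ## For example, a hypothetical bundle produced by concatenating the client
+    ## certificate, any intermediate CA certificates, and (optionally) the key:
+    ##
+    ##   tls_cert: /etc/ssl/private/kubeflow-client.pem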
+ # + # tls_cert: + + ## @param tls_private_key - string - optional + ## The unencrypted private key to use for `tls_cert` when connecting to services. This is + ## required if `tls_cert` is set and it does not already contain a private key. + # + # tls_private_key: + + ## @param tls_ca_cert - string - optional + ## The path to a file of concatenated CA certificates in PEM format or a directory + ## containing several CA certificates in PEM format. If a directory, the directory + ## must have been processed using the `openssl rehash` command. See: + ## https://www.openssl.org/docs/man3.2/man1/c_rehash.html + # + # tls_ca_cert: + + ## @param tls_protocols_allowed - list of strings - optional + ## The expected versions of TLS/SSL when fetching intermediate certificates. + ## Only `SSLv3`, `TLSv1.2`, `TLSv1.3` are allowed by default. The possible values are: + ## SSLv3 + ## TLSv1 + ## TLSv1.1 + ## TLSv1.2 + ## TLSv1.3 + # + # tls_protocols_allowed: + # - SSLv3 + # - TLSv1.2 + # - TLSv1.3 + + ## @param headers - mapping - optional + ## The headers parameter allows you to send specific headers with every request. + ## You can use it for explicitly specifying the host header or adding headers for + ## authorization purposes. + ## + ## This overrides any default headers. + # + # headers: + # Host: + # X-Auth-Token: + + ## @param extra_headers - mapping - optional + ## Additional headers to send with every request. + # + # extra_headers: + # Host: + # X-Auth-Token: + + ## @param timeout - number - optional - default: 10 + ## The timeout for accessing services. + ## + ## This overrides the `timeout` setting in `init_config`. + # + # timeout: 10 + + ## @param connect_timeout - number - optional + ## The connect timeout for accessing services. Defaults to `timeout`. + # + # connect_timeout: + + ## @param read_timeout - number - optional + ## The read timeout for accessing services. Defaults to `timeout`. + # + # read_timeout: + + ## @param request_size - number - optional - default: 16 + ## The number of kibibytes (KiB) to read from streaming HTTP responses at a time. + # + # request_size: 16 + + ## @param log_requests - boolean - optional - default: false + ## Whether or not to debug log the HTTP(S) requests made, including the method and URL. + # + # log_requests: false + + ## @param persist_connections - boolean - optional - default: false + ## Whether or not to persist cookies and use connection pooling for improved performance. + # + # persist_connections: false + + ## @param allow_redirects - boolean - optional - default: true + ## Whether or not to allow URL redirection. + # + # allow_redirects: true + + ## @param tags - list of strings - optional + ## A list of tags to attach to every metric and service check emitted by this instance. + ## + ## Learn more about tagging at https://docs.datadoghq.com/tagging + # + # tags: + # - : + # - : + + ## @param service - string - optional + ## Attach the tag `service:` to every metric, event, and service check emitted by this integration. + ## + ## Overrides any `service` defined in the `init_config` section. + # + # service: + + ## @param min_collection_interval - number - optional - default: 15 + ## This changes the collection interval of the check. For more information, see: + ## https://docs.datadoghq.com/developers/write_agent_check/#collection-interval + # + # min_collection_interval: 15 + + ## @param empty_default_hostname - boolean - optional - default: false + ## This forces the check to send metrics with no hostname. 
+ ## + ## This is useful for cluster-level checks. + # + # empty_default_hostname: false + + ## @param metric_patterns - mapping - optional + ## A mapping of metrics to include or exclude, with each entry being a regular expression. + ## + ## Metrics defined in `exclude` will take precedence in case of overlap. + # + # metric_patterns: + # include: + # - + # exclude: + # - diff --git a/kubeflow/datadog_checks/kubeflow/metrics.py b/kubeflow/datadog_checks/kubeflow/metrics.py new file mode 100644 index 0000000000000..c6af5402837ef --- /dev/null +++ b/kubeflow/datadog_checks/kubeflow/metrics.py @@ -0,0 +1,44 @@ +# (C) Datadog, Inc. 2024-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) + +# Some metrics mapping are too long. This turns off the 120 line limit for this file: +# ruff: noqa: E501 + + +METRIC_MAP = { + 'katib_controller_reconcile_count': 'katib.controller.reconcile', + 'katib_controller_reconcile_duration_seconds': 'katib.controller.reconcile.duration.seconds', + 'katib_experiment_created': 'katib.experiment.created', + 'katib_experiment_duration_seconds': 'katib.experiment.duration.seconds', + 'katib_experiment_failed': 'katib.experiment.failed', + 'katib_experiment_running_total': 'katib.experiment.running.total', + 'katib_experiment_succeeded': 'katib.experiment.succeeded', + 'katib_suggestion_created': 'katib.suggestion.created', + 'katib_suggestion_duration_seconds': 'katib.suggestion.duration.seconds', + 'katib_suggestion_failed': 'katib.suggestion.failed', + 'katib_suggestion_running_total': 'katib.suggestion.running.total', + 'katib_suggestion_succeeded': 'katib.suggestion.succeeded', + 'katib_trial_created': 'katib.trial.created', + 'katib_trial_duration_seconds': 'katib.trial.duration.seconds', + 'katib_trial_failed': 'katib.trial.failed', + 'katib_trial_running_total': 'katib.trial.running.total', + 'katib_trial_succeeded': 'katib.trial.succeeded', + 'kserve_inference_duration_seconds': 'kserve.inference.duration.seconds', + 'kserve_inference_errors': 'kserve.inference.errors', + 'kserve_inference_request_bytes': 'kserve.inference.request.bytes', + 'kserve_inference_response_bytes': 'kserve.inference.response.bytes', + 'kserve_inferences': 'kserve.inferences', + 'notebook_server_created': 'notebook.server.created', + 'notebook_server_failed': 'notebook.server.failed', + 'notebook_server_reconcile_count': 'notebook.server.reconcile', + 'notebook_server_reconcile_duration_seconds': 'notebook.server.reconcile.duration.seconds', + 'notebook_server_running_total': 'notebook.server.running.total', + 'notebook_server_succeeded': 'notebook.server.succeeded', + 'pipeline_run_duration_seconds': 'pipeline.run.duration.seconds', + 'pipeline_run_status': 'pipeline.run.status', +} + +RENAME_LABELS_MAP = { + 'version': 'go_version', +} diff --git a/kubeflow/hatch.toml b/kubeflow/hatch.toml new file mode 100644 index 0000000000000..001e43ce25414 --- /dev/null +++ b/kubeflow/hatch.toml @@ -0,0 +1,4 @@ +[env.collectors.datadog-checks] + +[[envs.default.matrix]] +python = ["3.11"] diff --git a/kubeflow/manifest.json b/kubeflow/manifest.json new file mode 100644 index 0000000000000..4bd141401f483 --- /dev/null +++ b/kubeflow/manifest.json @@ -0,0 +1,58 @@ +{ + "manifest_version": "2.0.0", + "app_uuid": "0db7b333-38a1-4e09-af1b-317da2f9f413", + "app_id": "kubeflow", + "display_on_public_website": false, + "tile": { + "overview": "README.md#Overview", + "configuration": "README.md#Setup", + "support": "README.md#Support", + "changelog": 
"CHANGELOG.md", + "description": "Integration for Kubeflow", + "title": "Kubeflow", + "media": [], + "classifier_tags": [ + "Supported OS::Linux", + "Supported OS::Windows", + "Supported OS::macOS", + "Category::Metrics", + "Category::Kubernetes", + "Submitted Data Type::Metrics", + "Category::AI/ML", + "Offering::Integration" + ] + }, + "assets": { + "integration": { + "auto_install": true, + "source_type_id": 22259533, + "source_type_name": "Kubeflow", + "configuration": { + "spec": "assets/configuration/spec.yaml" + }, + "events": { + "creates_events": false + }, + "metrics": { + "prefix": "kubeflow.", + "check": "kubeflow.pipeline.run.status", + "metadata_path": "metadata.csv" + }, + "service_checks": { + "metadata_path": "assets/service_checks.json" + } + }, + "monitors": { + "Kubeflow Monitor": "assets/monitors/kubeflow.json" + }, + "dashboards": { + "Kubeflow Overview": "assets/dashboards/overview.json" + } + }, + "author": { + "support_email": "help@datadoghq.com", + "name": "Datadog", + "homepage": "https://www.datadoghq.com", + "sales_email": "info@datadoghq.com" + } +} diff --git a/kubeflow/metadata.csv b/kubeflow/metadata.csv new file mode 100644 index 0000000000000..46f5e2f614455 --- /dev/null +++ b/kubeflow/metadata.csv @@ -0,0 +1,49 @@ +metric_name,metric_type,interval,unit_name,per_unit_name,description,orientation,integration,short_name,curated_metric +kubeflow.katib.controller.reconcile.count,count,,,,Number of reconcile loops executed by the Katib controller,0,kubeflow,, +kubeflow.katib.controller.reconcile.duration.seconds.bucket,count,,,,Duration of reconcile loops executed by the Katib controller(bucket),0,kubeflow,, +kubeflow.katib.controller.reconcile.duration.seconds.count,count,,,,Duration of reconcile loops executed by the Katib controller(count),0,kubeflow,, +kubeflow.katib.controller.reconcile.duration.seconds.sum,count,,second,,Duration of reconcile loops executed by the Katib controller(sum),0,kubeflow,, +kubeflow.katib.experiment.created.count,count,,,,Total number of experiments created,0,kubeflow,, +kubeflow.katib.experiment.duration.seconds.bucket,count,,,,Duration of experiments from start to completion(bucket),0,kubeflow,, +kubeflow.katib.experiment.duration.seconds.count,count,,,,Duration of experiments from start to completion(count),0,kubeflow,, +kubeflow.katib.experiment.duration.seconds.sum,count,,second,,Duration of experiments from start to completion(sum),0,kubeflow,, +kubeflow.katib.experiment.failed.count,count,,,,Number of experiments that have failed,0,kubeflow,, +kubeflow.katib.experiment.running.total,gauge,,,,Number of experiments currently running,0,kubeflow,, +kubeflow.katib.experiment.succeeded.count,count,,,,Number of experiments that have successfully completed,0,kubeflow,, +kubeflow.katib.suggestion.created.count,count,,,,Total number of suggestions made,0,kubeflow,, +kubeflow.katib.suggestion.duration.seconds.bucket,count,,,,Duration of suggestion processes from start to completion(bucket),0,kubeflow,, +kubeflow.katib.suggestion.duration.seconds.count,count,,,,Duration of suggestion processes from start to completion(count),0,kubeflow,, +kubeflow.katib.suggestion.duration.seconds.sum,count,,second,,Duration of suggestion processes from start to completion(sum),0,kubeflow,, +kubeflow.katib.suggestion.failed.count,count,,,,Number of suggestions that have failed,0,kubeflow,, +kubeflow.katib.suggestion.running.total,gauge,,,,Number of suggestions currently being processed,0,kubeflow,, 
+kubeflow.katib.suggestion.succeeded.count,count,,,,Number of suggestions that have successfully completed,0,kubeflow,, +kubeflow.katib.trial.created.count,count,,,,Total number of trials created,0,kubeflow,, +kubeflow.katib.trial.duration.seconds.bucket,count,,,,Duration of trials from start to completion(bucket),0,kubeflow,, +kubeflow.katib.trial.duration.seconds.count,count,,,,Duration of trials from start to completion(count),0,kubeflow,, +kubeflow.katib.trial.duration.seconds.sum,count,,second,,Duration of trials from start to completion(sum),0,kubeflow,, +kubeflow.katib.trial.failed.count,count,,,,Number of trials that have failed,0,kubeflow,, +kubeflow.katib.trial.running.total,gauge,,,,Number of trials currently running,0,kubeflow,, +kubeflow.katib.trial.succeeded.count,count,,,,Number of trials that have successfully completed,0,kubeflow,, +kubeflow.kserve.inference.duration.seconds.bucket,count,,,,Duration of inference requests(bucket),0,kubeflow,, +kubeflow.kserve.inference.duration.seconds.count,count,,,,Duration of inference requests(count),0,kubeflow,, +kubeflow.kserve.inference.duration.seconds.sum,count,,second,,Duration of inference requests(sum),0,kubeflow,, +kubeflow.kserve.inference.errors.count,count,,,,Number of errors encountered during inference,0,kubeflow,, +kubeflow.kserve.inference.request.bytes.bucket,count,,,,Size of inference request payloads(bucket),0,kubeflow,, +kubeflow.kserve.inference.request.bytes.count,count,,,,Size of inference request payloads(count),0,kubeflow,, +kubeflow.kserve.inference.request.bytes.sum,count,,byte,,Size of inference request payloads(sum),0,kubeflow,, +kubeflow.kserve.inference.response.bytes.bucket,count,,,,Size of inference response payloads(bucket),0,kubeflow,, +kubeflow.kserve.inference.response.bytes.count,count,,,,Size of inference response payloads(count),0,kubeflow,, +kubeflow.kserve.inference.response.bytes.sum,count,,byte,,Size of inference response payloads(sum),0,kubeflow,, +kubeflow.kserve.inferences.count,count,,,,Total number of inferences made,0,kubeflow,, +kubeflow.notebook.server.created.count,count,,,,Total number of notebook servers created,0,kubeflow,, +kubeflow.notebook.server.failed.count,count,,,,Number of notebook servers that have failed,0,kubeflow,, +kubeflow.notebook.server.reconcile.count,count,,,,Number of reconcile loops executed by the notebook controller,0,kubeflow,, +kubeflow.notebook.server.reconcile.duration.seconds.bucket,count,,,,Duration of reconcile loops executed by the notebook controller(bucket),0,kubeflow,, +kubeflow.notebook.server.reconcile.duration.seconds.count,count,,,,Duration of reconcile loops executed by the notebook controller(count),0,kubeflow,, +kubeflow.notebook.server.reconcile.duration.seconds.sum,count,,second,,Duration of reconcile loops executed by the notebook controller(sum),0,kubeflow,, +kubeflow.notebook.server.running.total,gauge,,,,Number of notebook servers currently running,0,kubeflow,, +kubeflow.notebook.server.succeeded.count,count,,,,Number of notebook servers that have successfully completed,0,kubeflow,, +kubeflow.pipeline.run.duration.seconds.bucket,count,,,,Duration of pipeline runs(bucket),0,kubeflow,, +kubeflow.pipeline.run.duration.seconds.count,count,,,,Duration of pipeline runs(count),0,kubeflow,, +kubeflow.pipeline.run.duration.seconds.sum,count,,second,,Duration of pipeline runs(sum),0,kubeflow,, +kubeflow.pipeline.run.status,gauge,,,,Status of pipeline runs,0,kubeflow,, diff --git a/kubeflow/pyproject.toml b/kubeflow/pyproject.toml new file mode 
100644 index 0000000000000..a7a3f0c00afda --- /dev/null +++ b/kubeflow/pyproject.toml @@ -0,0 +1,60 @@ +[build-system] +requires = [ + "hatchling>=0.13.0", +] +build-backend = "hatchling.build" + +[project] +name = "datadog-kubeflow" +description = "The Kubeflow check" +readme = "README.md" +license = "BSD-3-Clause" +requires-python = ">=3.11" +keywords = [ + "datadog", + "datadog agent", + "datadog check", + "kubeflow", +] +authors = [ + { name = "Datadog", email = "packages@datadoghq.com" }, +] +classifiers = [ + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Developers", + "Intended Audience :: System Administrators", + "License :: OSI Approved :: BSD License", + "Private :: Do Not Upload", + "Programming Language :: Python :: 3.11", + "Topic :: System :: Monitoring", +] +dependencies = [ + "datadog-checks-base>=32.6.0", +] +dynamic = [ + "version", +] + +[project.optional-dependencies] +deps = [] + +[project.urls] +Source = "https://github.com/DataDog/integrations-core" + +[tool.hatch.version] +path = "datadog_checks/kubeflow/__about__.py" + +[tool.hatch.build.targets.sdist] +include = [ + "/datadog_checks", + "/tests", + "/manifest.json", +] + +[tool.hatch.build.targets.wheel] +include = [ + "/datadog_checks/kubeflow", +] +dev-mode-dirs = [ + ".", +] diff --git a/kubeflow/tests/__init__.py b/kubeflow/tests/__init__.py new file mode 100644 index 0000000000000..9103122bf028d --- /dev/null +++ b/kubeflow/tests/__init__.py @@ -0,0 +1,3 @@ +# (C) Datadog, Inc. 2024-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) diff --git a/kubeflow/tests/common.py b/kubeflow/tests/common.py new file mode 100644 index 0000000000000..2bdda389f8a49 --- /dev/null +++ b/kubeflow/tests/common.py @@ -0,0 +1,75 @@ +# (C) Datadog, Inc. 
2024-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) +import os + +from datadog_checks.dev import get_docker_hostname, get_here + +HERE = get_here() +HOST = get_docker_hostname() +PORT = 9090 + + +def get_fixture_path(filename): + return os.path.join(HERE, 'fixtures', filename) + + +MOCKED_INSTANCE = { + "openmetrics_endpoint": f"http://{HOST}:{PORT}/metrics", + 'tags': ['test:test'], +} + +COMPOSE_FILE = os.path.join(HERE, 'docker', 'docker-compose.yaml') + +METRICS_MOCK = [ + 'katib.controller.reconcile.count', + 'katib.controller.reconcile.duration.seconds.bucket', + 'katib.controller.reconcile.duration.seconds.count', + 'katib.controller.reconcile.duration.seconds.sum', + 'katib.experiment.created.count', + 'katib.experiment.duration.seconds.bucket', + 'katib.experiment.duration.seconds.count', + 'katib.experiment.duration.seconds.sum', + 'katib.experiment.failed.count', + 'katib.experiment.running.total', + 'katib.experiment.succeeded.count', + 'katib.suggestion.created.count', + 'katib.suggestion.duration.seconds.bucket', + 'katib.suggestion.duration.seconds.count', + 'katib.suggestion.duration.seconds.sum', + 'katib.suggestion.failed.count', + 'katib.suggestion.running.total', + 'katib.suggestion.succeeded.count', + 'katib.trial.created.count', + 'katib.trial.duration.seconds.bucket', + 'katib.trial.duration.seconds.count', + 'katib.trial.duration.seconds.sum', + 'katib.trial.failed.count', + 'katib.trial.running.total', + 'katib.trial.succeeded.count', + 'kserve.inference.duration.seconds.bucket', + 'kserve.inference.duration.seconds.count', + 'kserve.inference.duration.seconds.sum', + 'kserve.inference.errors.count', + 'kserve.inference.request.bytes.bucket', + 'kserve.inference.request.bytes.count', + 'kserve.inference.request.bytes.sum', + 'kserve.inference.response.bytes.bucket', + 'kserve.inference.response.bytes.count', + 'kserve.inference.response.bytes.sum', + 'kserve.inferences.count', + 'notebook.server.created.count', + 'notebook.server.failed.count', + 'notebook.server.reconcile.count', + 'notebook.server.reconcile.duration.seconds.bucket', + 'notebook.server.reconcile.duration.seconds.count', + 'notebook.server.reconcile.duration.seconds.sum', + 'notebook.server.running.total', + 'notebook.server.succeeded.count', + 'pipeline.run.duration.seconds.bucket', + 'pipeline.run.duration.seconds.count', + 'pipeline.run.duration.seconds.sum', + 'pipeline.run.status', +] + +METRICS_MOCK = [f'kubeflow.{m}' for m in METRICS_MOCK] diff --git a/kubeflow/tests/conftest.py b/kubeflow/tests/conftest.py new file mode 100644 index 0000000000000..5cd3e422a1613 --- /dev/null +++ b/kubeflow/tests/conftest.py @@ -0,0 +1,29 @@ +# (C) Datadog, Inc. 2024-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) +import copy + +import pytest + +from datadog_checks.dev import docker_run +from datadog_checks.dev.conditions import CheckDockerLogs, CheckEndpoints + +from . 
import common + + +@pytest.fixture(scope='session') +def dd_environment(): + compose_file = common.COMPOSE_FILE + conditions = [ + CheckDockerLogs(identifier='caddy', patterns=['server running']), + CheckEndpoints(common.MOCKED_INSTANCE["openmetrics_endpoint"]), + ] + with docker_run(compose_file, conditions=conditions): + yield { + 'instances': [common.MOCKED_INSTANCE], + } + + +@pytest.fixture +def instance(): + return copy.deepcopy(common.MOCKED_INSTANCE) diff --git a/kubeflow/tests/docker/Caddyfile b/kubeflow/tests/docker/Caddyfile new file mode 100644 index 0000000000000..8fa6bd86ccdc2 --- /dev/null +++ b/kubeflow/tests/docker/Caddyfile @@ -0,0 +1,4 @@ +:9090 { + root * /metrics + file_server +} \ No newline at end of file diff --git a/kubeflow/tests/docker/docker-compose.yaml b/kubeflow/tests/docker/docker-compose.yaml new file mode 100644 index 0000000000000..4ddbf9d9ae6fe --- /dev/null +++ b/kubeflow/tests/docker/docker-compose.yaml @@ -0,0 +1,10 @@ +services: + + caddy: + image: caddy:2.7 + container_name: caddy + ports: + - "9090:9090" + volumes: + - ./Caddyfile:/etc/caddy/Caddyfile + - ../fixtures/kubeflow_metrics.txt:/metrics/metrics \ No newline at end of file diff --git a/kubeflow/tests/fixtures/kubeflow_metrics.txt b/kubeflow/tests/fixtures/kubeflow_metrics.txt new file mode 100644 index 0000000000000..3c119392c9181 --- /dev/null +++ b/kubeflow/tests/fixtures/kubeflow_metrics.txt @@ -0,0 +1,116 @@ +# HELP pipeline_run_duration_seconds Duration of pipeline runs +# TYPE pipeline_run_duration_seconds histogram +pipeline_run_duration_seconds_bucket{pipeline="example-pipeline", status="success",le="0.1"} 0 +pipeline_run_duration_seconds_count{pipeline="example-pipeline", status="success"} 1 +pipeline_run_duration_seconds_sum{pipeline="example-pipeline", status="success"} 123.45 +# HELP pipeline_run_status Status of pipeline runs +# TYPE pipeline_run_status gauge +pipeline_run_status{pipeline="example-pipeline", status="running"} 1 +pipeline_run_status{pipeline="example-pipeline", status="failed"} 0 +pipeline_run_status{pipeline="example-pipeline", status="succeeded"} 0 +# HELP katib_experiment_created_total Total number of experiments created +# TYPE katib_experiment_created_total counter +katib_experiment_created_total 10 +# HELP katib_experiment_running_total Number of experiments currently running +# TYPE katib_experiment_running_total gauge +katib_experiment_running_total 2 +# HELP katib_experiment_succeeded_total Number of experiments that have successfully completed +# TYPE katib_experiment_succeeded_total counter +katib_experiment_succeeded_total 5 +# HELP katib_experiment_failed_total Number of experiments that have failed +# TYPE katib_experiment_failed_total counter +katib_experiment_failed_total 3 +# HELP katib_experiment_duration_seconds Duration of experiments from start to completion +# TYPE katib_experiment_duration_seconds histogram +katib_experiment_duration_seconds_bucket{le="60"} 1 +katib_experiment_duration_seconds_bucket{le="300"} 3 +katib_experiment_duration_seconds_count 5 +katib_experiment_duration_seconds_sum 1500 +# HELP katib_trial_created_total Total number of trials created +# TYPE katib_trial_created_total counter +katib_trial_created_total 50 +# HELP katib_trial_running_total Number of trials currently running +# TYPE katib_trial_running_total gauge +katib_trial_running_total 10 +# HELP katib_trial_succeeded_total Number of trials that have successfully completed +# TYPE katib_trial_succeeded_total counter +katib_trial_succeeded_total 30 +# HELP 
katib_trial_failed_total Number of trials that have failed +# TYPE katib_trial_failed_total counter +katib_trial_failed_total 10 +# HELP katib_trial_duration_seconds Duration of trials from start to completion +# TYPE katib_trial_duration_seconds histogram +katib_trial_duration_seconds_bucket{le="60"} 5 +katib_trial_duration_seconds_bucket{le="300"} 20 +katib_trial_duration_seconds_count 30 +katib_trial_duration_seconds_sum 6000 +# HELP katib_suggestion_created_total Total number of suggestions made +# TYPE katib_suggestion_created_total counter +katib_suggestion_created_total 20 +# HELP katib_suggestion_running_total Number of suggestions currently being processed +# TYPE katib_suggestion_running_total gauge +katib_suggestion_running_total 5 +# HELP katib_suggestion_succeeded_total Number of suggestions that have successfully completed +# TYPE katib_suggestion_succeeded_total counter +katib_suggestion_succeeded_total 15 +# HELP katib_suggestion_failed_total Number of suggestions that have failed +# TYPE katib_suggestion_failed_total counter +katib_suggestion_failed_total 5 +# HELP katib_suggestion_duration_seconds Duration of suggestion processes from start to completion +# TYPE katib_suggestion_duration_seconds histogram +katib_suggestion_duration_seconds_bucket{le="60"} 3 +katib_suggestion_duration_seconds_bucket{le="300"} 17 +katib_suggestion_duration_seconds_count 20 +katib_suggestion_duration_seconds_sum 5000 +# HELP katib_controller_reconcile_count Number of reconcile loops executed by the Katib controller +# TYPE katib_controller_reconcile_count counter +katib_controller_reconcile_count 200 +# HELP katib_controller_reconcile_duration_seconds Duration of reconcile loops executed by the Katib controller +# TYPE katib_controller_reconcile_duration_seconds histogram +katib_controller_reconcile_duration_seconds_bucket{le="0.1"} 150 +katib_controller_reconcile_duration_seconds_bucket{le="1"} 200 +katib_controller_reconcile_duration_seconds_count 200 +katib_controller_reconcile_duration_seconds_sum 100 +# HELP kserve_inferences_total Total number of inferences made +# TYPE kserve_inferences_total counter +kserve_inferences_total{model="example-model"} 1000 +# HELP kserve_inference_duration_seconds Duration of inference requests +# TYPE kserve_inference_duration_seconds histogram +kserve_inference_duration_seconds_bucket{model="example-model",le="0.1"} 800 +kserve_inference_duration_seconds_bucket{model="example-model",le="1"} 1000 +kserve_inference_duration_seconds_count{model="example-model"} 1000 +kserve_inference_duration_seconds_sum{model="example-model"} 500 +# HELP kserve_inference_request_bytes Size of inference request payloads +# TYPE kserve_inference_request_bytes histogram +kserve_inference_request_bytes_bucket{model="example-model",le="1000"} 950 +kserve_inference_request_bytes_count{model="example-model"} 1000 +kserve_inference_request_bytes_sum{model="example-model"} 95000 +# HELP kserve_inference_response_bytes Size of inference response payloads +# TYPE kserve_inference_response_bytes histogram +kserve_inference_response_bytes_bucket{model="example-model",le="1000"} 980 +kserve_inference_response_bytes_count{model="example-model"} 1000 +kserve_inference_response_bytes_sum{model="example-model"} 98000 +# HELP notebook_server_created_total Total number of notebook servers created +# TYPE notebook_server_created_total counter +notebook_server_created_total 5 +# HELP notebook_server_running_total Number of notebook servers currently running +# TYPE 
notebook_server_running_total gauge +notebook_server_running_total 2 +# HELP notebook_server_succeeded_total Number of notebook servers that have successfully completed +# TYPE notebook_server_succeeded_total counter +notebook_server_succeeded_total 3 +# HELP notebook_server_failed_total Number of notebook servers that have failed +# TYPE notebook_server_failed_total counter +notebook_server_failed_total 1 +# HELP notebook_server_reconcile_count Number of reconcile loops executed by the notebook controller +# TYPE notebook_server_reconcile_count counter +notebook_server_reconcile_count 40 +# HELP notebook_server_reconcile_duration_seconds Duration of reconcile loops executed by the notebook controller +# TYPE notebook_server_reconcile_duration_seconds histogram +notebook_server_reconcile_duration_seconds_bucket{le="0.1"} 30 +notebook_server_reconcile_duration_seconds_bucket{le="1"} 40 +notebook_server_reconcile_duration_seconds_count 40 +notebook_server_reconcile_duration_seconds_sum 20 +# HELP kserve_inference_errors Number of errors encountered during inference +# TYPE kserve_inference_errors counter +kserve_inference_errors{model="example-model"} 5 diff --git a/kubeflow/tests/test_e2e.py b/kubeflow/tests/test_e2e.py new file mode 100644 index 0000000000000..c72168c0dbcc5 --- /dev/null +++ b/kubeflow/tests/test_e2e.py @@ -0,0 +1,14 @@ +# (C) Datadog, Inc. 2024-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) +import pytest + +from datadog_checks.base.constants import ServiceCheck +from datadog_checks.dev.utils import assert_service_checks + + +@pytest.mark.e2e +def test_check_kubeflow_e2e(dd_agent_check, instance): + aggregator = dd_agent_check(instance, rate=True) + aggregator.assert_service_check('kubeflow.openmetrics.health', ServiceCheck.OK, count=2) + assert_service_checks(aggregator) diff --git a/kubeflow/tests/test_unit.py b/kubeflow/tests/test_unit.py new file mode 100644 index 0000000000000..ff7bebb706531 --- /dev/null +++ b/kubeflow/tests/test_unit.py @@ -0,0 +1,34 @@ +# (C) Datadog, Inc. 2024-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) + +import pytest + +from datadog_checks.base.constants import ServiceCheck +from datadog_checks.dev.utils import get_metadata_metrics +from datadog_checks.kubeflow import KubeflowCheck + +from .common import METRICS_MOCK, get_fixture_path + + +def test_check_kubeflow(dd_run_check, aggregator, instance, mock_http_response): + mock_http_response(file_path=get_fixture_path('kubeflow_metrics.txt')) + check = KubeflowCheck('kubeflow', {}, [instance]) + dd_run_check(check) + + for metric in METRICS_MOCK: + aggregator.assert_metric(metric) + aggregator.assert_metric_has_tag(metric, 'test:test') + + aggregator.assert_all_metrics_covered() + aggregator.assert_metrics_using_metadata(get_metadata_metrics()) + aggregator.assert_service_check('kubeflow.openmetrics.health', ServiceCheck.OK) + + +def test_empty_instance(dd_run_check): + with pytest.raises( + Exception, + match='InstanceConfig`:\nopenmetrics_endpoint\n Field required', + ): + check = KubeflowCheck('KubeflowCheck', {}, [{}]) + dd_run_check(check)
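Note: the check class itself (`datadog_checks/kubeflow/check.py`) is not included in the hunks above, even though `test_unit.py` and `test_e2e.py` import `KubeflowCheck`. The sketch below shows the kind of `OpenMetricsBaseCheckV2` subclass that `metrics.py`, `metadata.csv`, and the tests appear to assume; the class body illustrates the usual integrations-core pattern and is not the merged implementation.

```python
# (C) Datadog, Inc. 2024-present
# All rights reserved
# Licensed under a 3-clause BSD style license (see LICENSE)
# NOTE: illustrative sketch only; the real check.py is not part of the diff shown above.

from datadog_checks.base import OpenMetricsBaseCheckV2

from .metrics import METRIC_MAP, RENAME_LABELS_MAP


class KubeflowCheck(OpenMetricsBaseCheckV2):
    # Prefix every metric and service check with `kubeflow.`, matching
    # metadata.csv and the assertions in tests/common.py.
    __NAMESPACE__ = 'kubeflow'

    # Do not cap the number of metrics collected from the endpoint.
    DEFAULT_METRIC_LIMIT = 0

    def get_default_config(self):
        # Map raw Prometheus metric names to Datadog names and rename the
        # `version` label to `go_version`, as declared in metrics.py.
        return {
            'metrics': [METRIC_MAP],
            'rename_labels': RENAME_LABELS_MAP,
        }
```

With a class along these lines, the required `openmetrics_endpoint` option drives the scrape, and the `.bucket`, `.count`, and `.sum` series listed in `metadata.csv` come from the base class's default histogram handling.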