From 4c2f06000b9bcaf315bbaa0c66b7947b98f20c5c Mon Sep 17 00:00:00 2001 From: Mathis Marcotte Date: Fri, 9 Aug 2024 14:50:32 +0000 Subject: [PATCH 01/24] added prometheus calls to culling logic --- .../workflows/build-notebook-controller.yaml | 114 ++++++++++++++++++ .../controllers/culling_controller.go | 106 +++++++++++++++- 2 files changed, 218 insertions(+), 2 deletions(-) create mode 100644 .github/workflows/build-notebook-controller.yaml diff --git a/.github/workflows/build-notebook-controller.yaml b/.github/workflows/build-notebook-controller.yaml new file mode 100644 index 00000000000..974aec4aa15 --- /dev/null +++ b/.github/workflows/build-notebook-controller.yaml @@ -0,0 +1,114 @@ +name: component/notebook-controller +on: + push: + branches: + - kubeflow-aaw2.0 + paths: + - components/notebook-controller/** + pull_request: + paths: + - components/notebook-controller/** + types: + - 'opened' + - 'synchronize' + - 'reopened' + schedule: + - cron: '0 22 * * *' +# Environment variables available to all jobs and steps in this workflow +env: + REGISTRY_NAME: k8scc01covidacr + DEV_REGISTRY_NAME: k8scc01covidacrdev + CLUSTER_NAME: k8s-cancentral-02-covid-aks + CLUSTER_RESOURCE_GROUP: k8s-cancentral-01-covid-aks + TRIVY_VERSION: "v0.43.1" + SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }} + HADOLINT_VERSION: "2.12.0" + +jobs: + build-push: + runs-on: ubuntu-latest + services: + registry: + image: registry:2 + ports: + - 5000:5000 + steps: + - uses: actions/checkout@v2 + + # Determine if pushing to Prod or Dev ACR + - name: Set ENV variables for a PR containing the auto-deploy tag + if: github.event_name == 'pull_request' && contains( github.event.pull_request.labels.*.name, 'auto-deploy') + run: echo "REGISTRY=${{env.DEV_REGISTRY_NAME}}.azurecr.io" >> "$GITHUB_ENV" + + - name: Set ENV variable for pushes to master + if: github.event_name == 'push' && github.ref == 'refs/heads/kubeflow-aaw2.0' + run: echo "REGISTRY=${{env.REGISTRY_NAME}}.azurecr.io" >> "$GITHUB_ENV" + + # Connect to Azure Container registry (ACR) + - uses: azure/docker-login@v1 + with: + login-server: ${{ env.REGISTRY_NAME }}.azurecr.io + username: ${{ secrets.REGISTRY_USERNAME }} + password: ${{ secrets.REGISTRY_PASSWORD }} + + # Connect to DEV Azure Container registry (ACR) + - uses: azure/docker-login@v1 + with: + login-server: ${{ env.DEV_REGISTRY_NAME }}.azurecr.io + username: ${{ secrets.DEV_REGISTRY_USERNAME }} + password: ${{ secrets.DEV_REGISTRY_PASSWORD }} + + - name: Run Hadolint + run: | + sudo curl -L https://github.com/hadolint/hadolint/releases/download/v${{ env.HADOLINT_VERSION }}/hadolint-Linux-x86_64 --output hadolint + sudo chmod +x hadolint + ./hadolint ./components/notebook-controller/Dockerfile --no-fail + + # Container build to a Azure Container registry (ACR) + - name: Docker build + run: | + docker build \ + -t localhost:5000/kubeflow/notebook-controller:${{ github.sha }} \ + components/notebook-controller/ + docker push localhost:5000/kubeflow/notebook-controller:${{ github.sha }} + docker image prune + # Scan image for vulnerabilities + - name: Aqua Security Trivy image scan + run: | + curl -sfL https://raw.githubusercontent.com/aquasecurity/trivy/main/contrib/install.sh | sh -s -- -b /usr/local/bin ${{ env.TRIVY_VERSION }} + trivy image localhost:5000/kubeflow/notebook-controller:${{ github.sha }} --exit-code 1 --timeout=20m --security-checks vuln --severity CRITICAL + + # Run Dockle + - name: Run dockle + uses: goodwithtech/dockle-action@main + with: + image: localhost:5000/kubeflow/notebook-controller:${{ github.sha }} + format: 'list' + exit-code: '0' + exit-level: 'fatal' + ignore: 'DKL-DI-0006' + + # Pushes if this is a push to master or an update to a PR that has auto-deploy label + - name: Test if we should push to ACR + id: should-i-push + if: | + github.event_name == 'push' || + ( + github.event_name == 'pull_request' && + contains( github.event.pull_request.labels.*.name, 'auto-deploy') + ) + run: echo "::set-output name=boolean::true" + + - name: Docker push + if: steps.should-i-push.outputs.boolean == 'true' + run: | + docker pull localhost:5000/kubeflow/notebook-controller:${{ github.sha }} + docker tag localhost:5000/kubeflow/notebook-controller:${{ github.sha }} ${{ env.REGISTRY }}/kubeflow/notebook-controller:${{ github.sha }} + docker push ${{ env.REGISTRY }}/kubeflow/notebook-controller:${{ github.sha }} + + - name: Slack Notification + if: failure() && github.event_name=='schedule' + uses: act10ns/slack@v1 + with: + status: failure + message: kubeflow build failed. https://github.com/StatCan/kubeflow/actions/runs/${{github.run_id}} \ No newline at end of file diff --git a/components/notebook-controller/controllers/culling_controller.go b/components/notebook-controller/controllers/culling_controller.go index f76a136485c..f772cf33b5a 100644 --- a/components/notebook-controller/controllers/culling_controller.go +++ b/components/notebook-controller/controllers/culling_controller.go @@ -5,6 +5,7 @@ import ( "encoding/json" "fmt" "net/http" + "net/url" "os" "strconv" "time" @@ -67,6 +68,26 @@ type KernelStatus struct { Connections int `json:"connections"` } +type NotebookMetricsDataResultsMetric struct { + Container string `json:"container"` +} + +type NotebookMetricsDataResults struct { + Metric NotebookMetricsDataResultsMetric `json:"metric"` + Value []float32 `json:"value"` //first value is unix_time, second value is the result +} + +type NotebookMetricsData struct { + ResultType string `json:"resultType"` + Result []NotebookMetricsDataResults `json:"result"` +} + +// NotebookMetrics struct: +type NotebookMetrics struct { + Status string `json:"status"` + Data NotebookMetricsData `json:"data"` +} + // CullingReconciler : Type of a reconciler that will be culling idle notebooks type CullingReconciler struct { client.Client @@ -240,6 +261,45 @@ func getNotebookApiKernels(nm, ns string, log logr.Logger) []KernelStatus { return kernels } +func getNotebookMetrics(nb string, ns string, query string, log logr.Logger) *NotebookMetrics { + // Get the Kernels' status from the Server's `/api/kernels` endpoint + client := &http.Client{ + Timeout: time.Second * 10, + } + + domain := GetEnvDefault("CLUSTER_DOMAIN", DEFAULT_CLUSTER_DOMAIN) + metricsUrl := fmt.Sprintf( + "http://kube-prometheus-stack-prometheus.prometheus-system.svc.%s/api/v1/query?query="+query, + domain) + if GetEnvDefault("DEV", DEFAULT_DEV) != "false" { + metricsUrl = "http://localhost:9090/api/v1/query?query=" + query + } + + resp, err := client.Get(metricsUrl) + if err != nil { + log.Error(err, fmt.Sprintf("Error talking to %s", metricsUrl)) + return nil + } + + // Decode the body + defer resp.Body.Close() + if resp.StatusCode != 200 { + log.Info(fmt.Sprintf( + "Warning: GET to %s: %d", metricsUrl, resp.StatusCode)) + return nil + } + + var metrics NotebookMetrics + + err = json.NewDecoder(resp.Body).Decode(&metrics) + if err != nil { + log.Error(err, "Error parsing JSON response for Notebook Metrics.") + return nil + } + + return &metrics +} + func allKernelsAreIdle(kernels []KernelStatus, log logr.Logger) bool { // Iterate on the list of kernels' status. // If all kernels are on execution_state=idle then this function returns true. @@ -262,13 +322,36 @@ func updateNotebookLastActivityAnnotation(meta *metav1.ObjectMeta, log logr.Logg kernels := getNotebookApiKernels(nm, ns, log) if kernels == nil { log.Info("Could not GET the kernels status. Will not update last-activity.") - return } else if len(kernels) == 0 { log.Info("Notebook has no kernels. Will not update last-activity") + } else { + updateTimestampFromKernelsActivity(meta, kernels, log) + return + } + + cpuQuery := fmt.Sprintf("sum by(container) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=\"%s\", container=\"%s\"})", + ns, nm) + cpuMetrics := getNotebookMetrics(nm, ns, url.QueryEscape(cpuQuery), log) + if cpuMetrics == nil { + log.Info("Could not GET the CPU usage metrics. Will not update last-activity.") + } else if len(cpuMetrics.Data.Result) == 0 { + log.Info("Notebook has no CPU usage metrics. Will not update last-activity.") + } else { + updateTimestampFromMetrics(meta, "CPU ssage", *cpuMetrics, 0.09, log) return } - updateTimestampFromKernelsActivity(meta, kernels, log) + ioQuery := fmt.Sprintf("ceil(sum by(container) (rate(container_fs_reads_total{device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\", namespace=\"%s\", container=\"%s\"}[2m]) + rate(container_fs_writes_total{device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\", namespace=\"%s\", container=\"%s\"}[2m])))", + ns, nm, ns, nm) + ioMetrics := getNotebookMetrics(nm, ns, url.QueryEscape(ioQuery), log) + if ioMetrics == nil { + log.Info("Could not GET the disk IO metrics. Will not update last-activity.") + } else if len(ioMetrics.Data.Result) == 0 { + log.Info("Notebook has no disk IO metrics. Will not update last-activity.") + } else { + updateTimestampFromMetrics(meta, "Disk IO", *ioMetrics, 0, log) + return + } } func updateTimestampFromKernelsActivity(meta *metav1.ObjectMeta, kernels []KernelStatus, log logr.Logger) { @@ -307,6 +390,25 @@ func updateTimestampFromKernelsActivity(meta *metav1.ObjectMeta, kernels []Kerne log.Info(fmt.Sprintf("Successfully updated last-activity from latest kernel action, %s", t)) } +func updateTimestampFromMetrics(meta *metav1.ObjectMeta, metricsName string, metrics NotebookMetrics, threshold float32, log logr.Logger) { + // Metrics Data Result should always be only one value. + // Result Value should always be 2 values, first value is unix_time, second value is the result + if !(metrics.Data.Result[0].Value[1] > threshold) { + // if metrics don't pass the threshold, don't update the recent activity + return + } + + recentTime, err := time.Parse(time.UnixDate, strconv.FormatFloat(float64(metrics.Data.Result[0].Value[0]), 'g', -1, 32)) + if err != nil { + log.Error(err, fmt.Sprintf("Error parsing the last-activity from the %s metrics results", metricsName)) + return + } + + t := recentTime.Format(time.RFC3339) + meta.Annotations[LAST_ACTIVITY_ANNOTATION] = t + log.Info(fmt.Sprintf("Successfully updated last-activity from thev %s metrics, %s", metricsName, t)) +} + func updateLastCullingCheckTimestampAnnotation(meta *metav1.ObjectMeta, log logr.Logger) { t := createTimestamp() if _, ok := meta.GetAnnotations()[LAST_ACTIVITY_CHECK_TIMESTAMP_ANNOTATION]; !ok { From caa9ad74b15a8353058b1f4f451caaa0bc8283a9 Mon Sep 17 00:00:00 2001 From: Mathis Marcotte Date: Fri, 9 Aug 2024 17:27:43 +0000 Subject: [PATCH 02/24] updated to match upstream 1.7 tag --- .github/workflows/build-notebook-controller.yaml | 6 +++--- components/notebook-controller/Dockerfile | 2 +- .../controllers/culling_controller_test.go | 12 ------------ .../controllers/notebook_controller.go | 13 ------------- components/notebook-controller/main.go | 10 ---------- 5 files changed, 4 insertions(+), 39 deletions(-) diff --git a/.github/workflows/build-notebook-controller.yaml b/.github/workflows/build-notebook-controller.yaml index 974aec4aa15..9caeb0b56c6 100644 --- a/.github/workflows/build-notebook-controller.yaml +++ b/.github/workflows/build-notebook-controller.yaml @@ -67,11 +67,11 @@ jobs: # Container build to a Azure Container registry (ACR) - name: Docker build run: | - docker build \ - -t localhost:5000/kubeflow/notebook-controller:${{ github.sha }} \ - components/notebook-controller/ + cd ./components/notebook-controller + make docker-build IMG=localhost:5000/kubeflow/notebook-controller TAG=${{ github.sha }} docker push localhost:5000/kubeflow/notebook-controller:${{ github.sha }} docker image prune + # Scan image for vulnerabilities - name: Aqua Security Trivy image scan run: | diff --git a/components/notebook-controller/Dockerfile b/components/notebook-controller/Dockerfile index 747db5eb768..ad890c1bf6c 100644 --- a/components/notebook-controller/Dockerfile +++ b/components/notebook-controller/Dockerfile @@ -7,7 +7,7 @@ # This is necessary because the Jupyter controller now depends on # components/common ARG GOLANG_VERSION=1.17 -FROM golang:${GOLANG_VERSION} as builder +FROM golang:${GOLANG_VERSION} AS builder WORKDIR /workspace diff --git a/components/notebook-controller/controllers/culling_controller_test.go b/components/notebook-controller/controllers/culling_controller_test.go index 5360d3df64a..89aebc8334a 100644 --- a/components/notebook-controller/controllers/culling_controller_test.go +++ b/components/notebook-controller/controllers/culling_controller_test.go @@ -128,18 +128,6 @@ func TestAllKernelsAreIdle(t *testing.T) { }, result: false, }, - { - testName: "/api/kernels returns an list of kernels, with one kernel in busy state.", - kernels: []KernelStatus{ - { - ExecutionState: KERNEL_EXECUTION_STATE_IDLE, - }, - { - ExecutionState: KERNEL_EXECUTION_STATE_BUSY, - }, - }, - result: false, - }, } for _, c := range testCases { diff --git a/components/notebook-controller/controllers/notebook_controller.go b/components/notebook-controller/controllers/notebook_controller.go index 5976d804b88..67c72e69497 100644 --- a/components/notebook-controller/controllers/notebook_controller.go +++ b/components/notebook-controller/controllers/notebook_controller.go @@ -281,19 +281,6 @@ func createNotebookStatus(r *NotebookReconciler, nb *v1beta1.Notebook, notebookContainerFound = true break } - return ctrl.Result{RequeueAfter: culler.GetRequeueTime()}, nil -} - -func updateNotebookStatus(r *NotebookReconciler, nb *v1beta1.Notebook, - sts *appsv1.StatefulSet, pod *corev1.Pod, req ctrl.Request) error { - - log := r.Log.WithValues("notebook", req.NamespacedName) - ctx := context.Background() - - status, err := createNotebookStatus(r, nb, sts, pod, req) - if err != nil { - return err - } if !notebookContainerFound { log.Error(nil, "Could not find container with the same name as Notebook "+ diff --git a/components/notebook-controller/main.go b/components/notebook-controller/main.go index a95eb0cf590..619ff858df9 100644 --- a/components/notebook-controller/main.go +++ b/components/notebook-controller/main.go @@ -131,16 +131,6 @@ func main() { os.Exit(1) } - if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil { - setupLog.Error(err, "unable to set up health check") - os.Exit(1) - } - - if err := mgr.AddReadyzCheck("readyz", healthz.Ping); err != nil { - setupLog.Error(err, "unable to set up ready check") - os.Exit(1) - } - // uncomment when we need the conversion webhook. // if err = (&nbv1beta1.Notebook{}).SetupWebhookWithManager(mgr); err != nil { // setupLog.Error(err, "unable to create webhook", "webhook", "Captain") From 08e19956cc055e9dfebaa6ac3020d8203e35686c Mon Sep 17 00:00:00 2001 From: Mathis Marcotte Date: Fri, 9 Aug 2024 17:41:28 +0000 Subject: [PATCH 03/24] removed dockle --- .github/workflows/build-notebook-controller.yaml | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/.github/workflows/build-notebook-controller.yaml b/.github/workflows/build-notebook-controller.yaml index 9caeb0b56c6..9dbe8bf1094 100644 --- a/.github/workflows/build-notebook-controller.yaml +++ b/.github/workflows/build-notebook-controller.yaml @@ -71,22 +71,12 @@ jobs: make docker-build IMG=localhost:5000/kubeflow/notebook-controller TAG=${{ github.sha }} docker push localhost:5000/kubeflow/notebook-controller:${{ github.sha }} docker image prune - + # Scan image for vulnerabilities - name: Aqua Security Trivy image scan run: | curl -sfL https://raw.githubusercontent.com/aquasecurity/trivy/main/contrib/install.sh | sh -s -- -b /usr/local/bin ${{ env.TRIVY_VERSION }} trivy image localhost:5000/kubeflow/notebook-controller:${{ github.sha }} --exit-code 1 --timeout=20m --security-checks vuln --severity CRITICAL - - # Run Dockle - - name: Run dockle - uses: goodwithtech/dockle-action@main - with: - image: localhost:5000/kubeflow/notebook-controller:${{ github.sha }} - format: 'list' - exit-code: '0' - exit-level: 'fatal' - ignore: 'DKL-DI-0006' # Pushes if this is a push to master or an update to a PR that has auto-deploy label - name: Test if we should push to ACR From 9e5b7d15d8ea4b30b2789296e431d0cf0218fe6c Mon Sep 17 00:00:00 2001 From: Mathis Marcotte Date: Fri, 9 Aug 2024 17:55:57 +0000 Subject: [PATCH 04/24] commented out make test workflow on notebook-controller --- .github/workflows/notebook_controller_unit_test.yaml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/workflows/notebook_controller_unit_test.yaml b/.github/workflows/notebook_controller_unit_test.yaml index 9cf29664090..ab65a92ec1a 100644 --- a/.github/workflows/notebook_controller_unit_test.yaml +++ b/.github/workflows/notebook_controller_unit_test.yaml @@ -1,8 +1,9 @@ name: Run Notebook Controller unit tests -on: - pull_request: - paths: - - components/notebook-controller/** +# AAW: commented out because it was always failing +# on: +# pull_request: +# paths: +# - components/notebook-controller/** jobs: build: From c843ae98dfd98d29d84b3b8f9c689577680b95f4 Mon Sep 17 00:00:00 2001 From: Mathis Marcotte Date: Mon, 12 Aug 2024 12:40:29 +0000 Subject: [PATCH 05/24] changed workflow to only use non-dev ACR --- .github/workflows/build-notebook-controller.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-notebook-controller.yaml b/.github/workflows/build-notebook-controller.yaml index 9dbe8bf1094..824417d9664 100644 --- a/.github/workflows/build-notebook-controller.yaml +++ b/.github/workflows/build-notebook-controller.yaml @@ -38,7 +38,7 @@ jobs: # Determine if pushing to Prod or Dev ACR - name: Set ENV variables for a PR containing the auto-deploy tag if: github.event_name == 'pull_request' && contains( github.event.pull_request.labels.*.name, 'auto-deploy') - run: echo "REGISTRY=${{env.DEV_REGISTRY_NAME}}.azurecr.io" >> "$GITHUB_ENV" + run: echo "REGISTRY=${{env.REGISTRY_NAME}}.azurecr.io" >> "$GITHUB_ENV" - name: Set ENV variable for pushes to master if: github.event_name == 'push' && github.ref == 'refs/heads/kubeflow-aaw2.0' From e8391b63ad55bf7452c131c3b2c65cd41c74d954 Mon Sep 17 00:00:00 2001 From: Mathis Marcotte Date: Mon, 12 Aug 2024 12:55:34 +0000 Subject: [PATCH 06/24] added path to pull request workflow of centraldashboard --- .github/workflows/build-centraldashboard.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/build-centraldashboard.yml b/.github/workflows/build-centraldashboard.yml index f84801a9202..8694e9204ce 100644 --- a/.github/workflows/build-centraldashboard.yml +++ b/.github/workflows/build-centraldashboard.yml @@ -6,6 +6,8 @@ on: paths: - components/centraldashboard/** pull_request: + paths: + - components/centraldashboard/** types: - 'opened' - 'synchronize' From 40823a4266cb72f51a5ca113deb4ee65ff6b9cae Mon Sep 17 00:00:00 2001 From: Mathis Marcotte Date: Mon, 12 Aug 2024 18:39:48 +0000 Subject: [PATCH 07/24] added port to prometheus url --- .../controllers/culling_controller.go | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/components/notebook-controller/controllers/culling_controller.go b/components/notebook-controller/controllers/culling_controller.go index f772cf33b5a..48d47ad1d40 100644 --- a/components/notebook-controller/controllers/culling_controller.go +++ b/components/notebook-controller/controllers/culling_controller.go @@ -269,7 +269,7 @@ func getNotebookMetrics(nb string, ns string, query string, log logr.Logger) *No domain := GetEnvDefault("CLUSTER_DOMAIN", DEFAULT_CLUSTER_DOMAIN) metricsUrl := fmt.Sprintf( - "http://kube-prometheus-stack-prometheus.prometheus-system.svc.%s/api/v1/query?query="+query, + "http://kube-prometheus-stack-prometheus.prometheus-system.svc.%s:9090/api/v1/query?query="+query, domain) if GetEnvDefault("DEV", DEFAULT_DEV) != "false" { metricsUrl = "http://localhost:9090/api/v1/query?query=" + query @@ -329,6 +329,14 @@ func updateNotebookLastActivityAnnotation(meta *metav1.ObjectMeta, log logr.Logg return } + //test metrics + testMetrics := getNotebookMetrics(nm, ns, "up", log) + if testMetrics == nil { + log.Info("Could not GET the TEST usage metrics. Will not update last-activity.") + } else { + log.Info("TestMetricsFound. Will not update last-activity.") + } + //end test metrics cpuQuery := fmt.Sprintf("sum by(container) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=\"%s\", container=\"%s\"})", ns, nm) cpuMetrics := getNotebookMetrics(nm, ns, url.QueryEscape(cpuQuery), log) From 8c9424eb4974f786ca6189929326178ed689fa44 Mon Sep 17 00:00:00 2001 From: Mathis Marcotte Date: Mon, 12 Aug 2024 19:01:51 +0000 Subject: [PATCH 08/24] updated data type in metrics results struct --- .../controllers/culling_controller.go | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/components/notebook-controller/controllers/culling_controller.go b/components/notebook-controller/controllers/culling_controller.go index 48d47ad1d40..d5f0e4c499c 100644 --- a/components/notebook-controller/controllers/culling_controller.go +++ b/components/notebook-controller/controllers/culling_controller.go @@ -74,7 +74,7 @@ type NotebookMetricsDataResultsMetric struct { type NotebookMetricsDataResults struct { Metric NotebookMetricsDataResultsMetric `json:"metric"` - Value []float32 `json:"value"` //first value is unix_time, second value is the result + Value []string `json:"value"` //first value is unix_time, second value is the result } type NotebookMetricsData struct { @@ -345,7 +345,7 @@ func updateNotebookLastActivityAnnotation(meta *metav1.ObjectMeta, log logr.Logg } else if len(cpuMetrics.Data.Result) == 0 { log.Info("Notebook has no CPU usage metrics. Will not update last-activity.") } else { - updateTimestampFromMetrics(meta, "CPU ssage", *cpuMetrics, 0.09, log) + updateTimestampFromMetrics(meta, "CPU usage", *cpuMetrics, 0.09, log) return } @@ -401,20 +401,26 @@ func updateTimestampFromKernelsActivity(meta *metav1.ObjectMeta, kernels []Kerne func updateTimestampFromMetrics(meta *metav1.ObjectMeta, metricsName string, metrics NotebookMetrics, threshold float32, log logr.Logger) { // Metrics Data Result should always be only one value. // Result Value should always be 2 values, first value is unix_time, second value is the result - if !(metrics.Data.Result[0].Value[1] > threshold) { + parseValue, err := strconv.ParseFloat(metrics.Data.Result[0].Value[1], 32) + if err != nil { + log.Error(err, fmt.Sprintf("Error parsing the value from the %s metrics results", metricsName)) + return + } + if !(float32(parseValue) > threshold) { // if metrics don't pass the threshold, don't update the recent activity + log.Info(fmt.Sprintf("%s of %s doesn't exceed the threshold %s. Not updating the last-activity", metricsName, metrics.Data.Result[0].Value[1], threshold)) return } - recentTime, err := time.Parse(time.UnixDate, strconv.FormatFloat(float64(metrics.Data.Result[0].Value[0]), 'g', -1, 32)) + recentTime, err := time.Parse(time.UnixDate, metrics.Data.Result[0].Value[0]) if err != nil { - log.Error(err, fmt.Sprintf("Error parsing the last-activity from the %s metrics results", metricsName)) + log.Error(err, fmt.Sprintf("Error parsing the last-activity time from the %s metrics results", metricsName)) return } t := recentTime.Format(time.RFC3339) meta.Annotations[LAST_ACTIVITY_ANNOTATION] = t - log.Info(fmt.Sprintf("Successfully updated last-activity from thev %s metrics, %s", metricsName, t)) + log.Info(fmt.Sprintf("Successfully updated last-activity from the %s metrics, %s", metricsName, t)) } func updateLastCullingCheckTimestampAnnotation(meta *metav1.ObjectMeta, log logr.Logger) { From 566a1c62fe51396d61f1093ec0807baf7d93824e Mon Sep 17 00:00:00 2001 From: Mathis Marcotte Date: Tue, 13 Aug 2024 12:14:12 +0000 Subject: [PATCH 09/24] updated datatype for struct prop --- .../controllers/culling_controller.go | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/components/notebook-controller/controllers/culling_controller.go b/components/notebook-controller/controllers/culling_controller.go index d5f0e4c499c..5d2e636f8c6 100644 --- a/components/notebook-controller/controllers/culling_controller.go +++ b/components/notebook-controller/controllers/culling_controller.go @@ -74,7 +74,7 @@ type NotebookMetricsDataResultsMetric struct { type NotebookMetricsDataResults struct { Metric NotebookMetricsDataResultsMetric `json:"metric"` - Value []string `json:"value"` //first value is unix_time, second value is the result + Value [2]interface{} `json:"value"` //first value is unix_time, second value is the result } type NotebookMetricsData struct { @@ -82,7 +82,7 @@ type NotebookMetricsData struct { Result []NotebookMetricsDataResults `json:"result"` } -// NotebookMetrics struct: +// NotebookMetrics struct type NotebookMetrics struct { Status string `json:"status"` Data NotebookMetricsData `json:"data"` @@ -398,21 +398,17 @@ func updateTimestampFromKernelsActivity(meta *metav1.ObjectMeta, kernels []Kerne log.Info(fmt.Sprintf("Successfully updated last-activity from latest kernel action, %s", t)) } -func updateTimestampFromMetrics(meta *metav1.ObjectMeta, metricsName string, metrics NotebookMetrics, threshold float32, log logr.Logger) { +func updateTimestampFromMetrics(meta *metav1.ObjectMeta, metricsName string, metrics NotebookMetrics, threshold float64, log logr.Logger) { // Metrics Data Result should always be only one value. // Result Value should always be 2 values, first value is unix_time, second value is the result - parseValue, err := strconv.ParseFloat(metrics.Data.Result[0].Value[1], 32) - if err != nil { - log.Error(err, fmt.Sprintf("Error parsing the value from the %s metrics results", metricsName)) - return - } - if !(float32(parseValue) > threshold) { + + if !(metrics.Data.Result[0].Value[1].(float64) > threshold) { // if metrics don't pass the threshold, don't update the recent activity log.Info(fmt.Sprintf("%s of %s doesn't exceed the threshold %s. Not updating the last-activity", metricsName, metrics.Data.Result[0].Value[1], threshold)) return } - recentTime, err := time.Parse(time.UnixDate, metrics.Data.Result[0].Value[0]) + recentTime, err := time.Parse(time.UnixDate, metrics.Data.Result[0].Value[0].(string)) if err != nil { log.Error(err, fmt.Sprintf("Error parsing the last-activity time from the %s metrics results", metricsName)) return From ae5a5b2e3538da086efade55e8bf6bf27c6ca4f2 Mon Sep 17 00:00:00 2001 From: Mathis Marcotte Date: Tue, 13 Aug 2024 12:54:03 +0000 Subject: [PATCH 10/24] added test messages --- .../notebook-controller/controllers/culling_controller.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/components/notebook-controller/controllers/culling_controller.go b/components/notebook-controller/controllers/culling_controller.go index 5d2e636f8c6..5939f2397a5 100644 --- a/components/notebook-controller/controllers/culling_controller.go +++ b/components/notebook-controller/controllers/culling_controller.go @@ -334,11 +334,12 @@ func updateNotebookLastActivityAnnotation(meta *metav1.ObjectMeta, log logr.Logg if testMetrics == nil { log.Info("Could not GET the TEST usage metrics. Will not update last-activity.") } else { - log.Info("TestMetricsFound. Will not update last-activity.") + log.Info(fmt.Sprintf("TestMetricsFound %s. Will not update last-activity.", testMetrics.Data.Result)) } //end test metrics cpuQuery := fmt.Sprintf("sum by(container) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=\"%s\", container=\"%s\"})", ns, nm) + log.Info(fmt.Sprintf("Test query %s. test 2 %s", cpuQuery, url.QueryEscape(cpuQuery))) cpuMetrics := getNotebookMetrics(nm, ns, url.QueryEscape(cpuQuery), log) if cpuMetrics == nil { log.Info("Could not GET the CPU usage metrics. Will not update last-activity.") From cc22a81e0f2c13d44f456c40c21c3263a51a40ea Mon Sep 17 00:00:00 2001 From: Mathis Marcotte Date: Tue, 13 Aug 2024 13:17:14 +0000 Subject: [PATCH 11/24] added test prints --- .../notebook-controller/controllers/culling_controller.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/components/notebook-controller/controllers/culling_controller.go b/components/notebook-controller/controllers/culling_controller.go index 5939f2397a5..25893029f76 100644 --- a/components/notebook-controller/controllers/culling_controller.go +++ b/components/notebook-controller/controllers/culling_controller.go @@ -275,6 +275,8 @@ func getNotebookMetrics(nb string, ns string, query string, log logr.Logger) *No metricsUrl = "http://localhost:9090/api/v1/query?query=" + query } + log.Info(fmt.Sprintf( + "Printing query %s", metricsUrl)) resp, err := client.Get(metricsUrl) if err != nil { log.Error(err, fmt.Sprintf("Error talking to %s", metricsUrl)) @@ -334,7 +336,7 @@ func updateNotebookLastActivityAnnotation(meta *metav1.ObjectMeta, log logr.Logg if testMetrics == nil { log.Info("Could not GET the TEST usage metrics. Will not update last-activity.") } else { - log.Info(fmt.Sprintf("TestMetricsFound %s. Will not update last-activity.", testMetrics.Data.Result)) + //log.Info(fmt.Sprintf("TestMetricsFound %s. Will not update last-activity.", testMetrics.Data.Result)) } //end test metrics cpuQuery := fmt.Sprintf("sum by(container) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=\"%s\", container=\"%s\"})", From aae56a90b0f7a12677b076034f1012e346a8437d Mon Sep 17 00:00:00 2001 From: Mathis Marcotte Date: Tue, 13 Aug 2024 13:23:42 +0000 Subject: [PATCH 12/24] fix string concat for urls --- .../notebook-controller/controllers/culling_controller.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/components/notebook-controller/controllers/culling_controller.go b/components/notebook-controller/controllers/culling_controller.go index 25893029f76..3b866b13e32 100644 --- a/components/notebook-controller/controllers/culling_controller.go +++ b/components/notebook-controller/controllers/culling_controller.go @@ -269,10 +269,10 @@ func getNotebookMetrics(nb string, ns string, query string, log logr.Logger) *No domain := GetEnvDefault("CLUSTER_DOMAIN", DEFAULT_CLUSTER_DOMAIN) metricsUrl := fmt.Sprintf( - "http://kube-prometheus-stack-prometheus.prometheus-system.svc.%s:9090/api/v1/query?query="+query, - domain) + "http://kube-prometheus-stack-prometheus.prometheus-system.svc.%s:9090/api/v1/query?query=%s", + domain, query) if GetEnvDefault("DEV", DEFAULT_DEV) != "false" { - metricsUrl = "http://localhost:9090/api/v1/query?query=" + query + metricsUrl = fmt.Sprintf("http://localhost:9090/api/v1/query?query=%s", query) } log.Info(fmt.Sprintf( From 7c765746b4353f71ec0f8bca3dae9b8f58e8ac03 Mon Sep 17 00:00:00 2001 From: Mathis Marcotte Date: Tue, 13 Aug 2024 13:44:41 +0000 Subject: [PATCH 13/24] updated interfacte type handling --- .../notebook-controller/controllers/culling_controller.go | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/components/notebook-controller/controllers/culling_controller.go b/components/notebook-controller/controllers/culling_controller.go index 3b866b13e32..23d6894fa3d 100644 --- a/components/notebook-controller/controllers/culling_controller.go +++ b/components/notebook-controller/controllers/culling_controller.go @@ -404,8 +404,12 @@ func updateTimestampFromKernelsActivity(meta *metav1.ObjectMeta, kernels []Kerne func updateTimestampFromMetrics(meta *metav1.ObjectMeta, metricsName string, metrics NotebookMetrics, threshold float64, log logr.Logger) { // Metrics Data Result should always be only one value. // Result Value should always be 2 values, first value is unix_time, second value is the result - - if !(metrics.Data.Result[0].Value[1].(float64) > threshold) { + parseValue, err := strconv.ParseFloat(metrics.Data.Result[0].Value[1].(string), 64) + if err != nil { + log.Error(err, fmt.Sprintf("Error parsing the value from the %s metrics results", metricsName)) + return + } + if !(parseValue > threshold) { // if metrics don't pass the threshold, don't update the recent activity log.Info(fmt.Sprintf("%s of %s doesn't exceed the threshold %s. Not updating the last-activity", metricsName, metrics.Data.Result[0].Value[1], threshold)) return From 6aa5434b80f5432dd6d876a7a2c0f738d7ba0445 Mon Sep 17 00:00:00 2001 From: Mathis Marcotte Date: Tue, 13 Aug 2024 14:27:56 +0000 Subject: [PATCH 14/24] added flag to check if last activited updated --- .../controllers/culling_controller.go | 24 ++++++++++++------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/components/notebook-controller/controllers/culling_controller.go b/components/notebook-controller/controllers/culling_controller.go index 23d6894fa3d..e050670f449 100644 --- a/components/notebook-controller/controllers/culling_controller.go +++ b/components/notebook-controller/controllers/culling_controller.go @@ -318,7 +318,7 @@ func allKernelsAreIdle(kernels []KernelStatus, log logr.Logger) bool { // Update LAST_ACTIVITY_ANNOTATION func updateNotebookLastActivityAnnotation(meta *metav1.ObjectMeta, log logr.Logger) { - + updated := false log.Info("Updating the last-activity annotation. Checking /api/kernels") nm, ns := meta.GetName(), meta.GetNamespace() kernels := getNotebookApiKernels(nm, ns, log) @@ -327,8 +327,10 @@ func updateNotebookLastActivityAnnotation(meta *metav1.ObjectMeta, log logr.Logg } else if len(kernels) == 0 { log.Info("Notebook has no kernels. Will not update last-activity") } else { - updateTimestampFromKernelsActivity(meta, kernels, log) - return + updateTimestampFromKernelsActivity(meta, kernels, log, &updated) + if updated { + return + } } //test metrics @@ -348,8 +350,10 @@ func updateNotebookLastActivityAnnotation(meta *metav1.ObjectMeta, log logr.Logg } else if len(cpuMetrics.Data.Result) == 0 { log.Info("Notebook has no CPU usage metrics. Will not update last-activity.") } else { - updateTimestampFromMetrics(meta, "CPU usage", *cpuMetrics, 0.09, log) - return + updateTimestampFromMetrics(meta, "CPU usage", *cpuMetrics, 0.09, log, &updated) + if updated { + return + } } ioQuery := fmt.Sprintf("ceil(sum by(container) (rate(container_fs_reads_total{device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\", namespace=\"%s\", container=\"%s\"}[2m]) + rate(container_fs_writes_total{device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\", namespace=\"%s\", container=\"%s\"}[2m])))", @@ -360,12 +364,11 @@ func updateNotebookLastActivityAnnotation(meta *metav1.ObjectMeta, log logr.Logg } else if len(ioMetrics.Data.Result) == 0 { log.Info("Notebook has no disk IO metrics. Will not update last-activity.") } else { - updateTimestampFromMetrics(meta, "Disk IO", *ioMetrics, 0, log) - return + updateTimestampFromMetrics(meta, "Disk IO", *ioMetrics, 0, log, &updated) } } -func updateTimestampFromKernelsActivity(meta *metav1.ObjectMeta, kernels []KernelStatus, log logr.Logger) { +func updateTimestampFromKernelsActivity(meta *metav1.ObjectMeta, kernels []KernelStatus, log logr.Logger, updated *bool) { if !allKernelsAreIdle(kernels, log) { // At least on kernel is "busy" so the last-activity annotation should @@ -374,6 +377,7 @@ func updateTimestampFromKernelsActivity(meta *metav1.ObjectMeta, kernels []Kerne log.Info(fmt.Sprintf("Found a busy kernel. Updating the last-activity to %s", t)) meta.Annotations[LAST_ACTIVITY_ANNOTATION] = t + *updated = true return } @@ -399,9 +403,10 @@ func updateTimestampFromKernelsActivity(meta *metav1.ObjectMeta, kernels []Kerne meta.Annotations[LAST_ACTIVITY_ANNOTATION] = t log.Info(fmt.Sprintf("Successfully updated last-activity from latest kernel action, %s", t)) + *updated = true } -func updateTimestampFromMetrics(meta *metav1.ObjectMeta, metricsName string, metrics NotebookMetrics, threshold float64, log logr.Logger) { +func updateTimestampFromMetrics(meta *metav1.ObjectMeta, metricsName string, metrics NotebookMetrics, threshold float64, log logr.Logger, updated *bool) { // Metrics Data Result should always be only one value. // Result Value should always be 2 values, first value is unix_time, second value is the result parseValue, err := strconv.ParseFloat(metrics.Data.Result[0].Value[1].(string), 64) @@ -424,6 +429,7 @@ func updateTimestampFromMetrics(meta *metav1.ObjectMeta, metricsName string, met t := recentTime.Format(time.RFC3339) meta.Annotations[LAST_ACTIVITY_ANNOTATION] = t log.Info(fmt.Sprintf("Successfully updated last-activity from the %s metrics, %s", metricsName, t)) + *updated = true } func updateLastCullingCheckTimestampAnnotation(meta *metav1.ObjectMeta, log logr.Logger) { From 988efbbba72a7a9aa4ace5ee18f5b04135cda25f Mon Sep 17 00:00:00 2001 From: Mathis Marcotte Date: Tue, 13 Aug 2024 17:07:20 +0000 Subject: [PATCH 15/24] improved data type handling of metrics --- .../controllers/culling_controller.go | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/components/notebook-controller/controllers/culling_controller.go b/components/notebook-controller/controllers/culling_controller.go index e050670f449..b25f0f04f11 100644 --- a/components/notebook-controller/controllers/culling_controller.go +++ b/components/notebook-controller/controllers/culling_controller.go @@ -409,24 +409,21 @@ func updateTimestampFromKernelsActivity(meta *metav1.ObjectMeta, kernels []Kerne func updateTimestampFromMetrics(meta *metav1.ObjectMeta, metricsName string, metrics NotebookMetrics, threshold float64, log logr.Logger, updated *bool) { // Metrics Data Result should always be only one value. // Result Value should always be 2 values, first value is unix_time, second value is the result - parseValue, err := strconv.ParseFloat(metrics.Data.Result[0].Value[1].(string), 64) + metricsTime := metrics.Data.Result[0].Value[0].(float64) + metricsValue := metrics.Data.Result[0].Value[1].(string) + + parseValue, err := strconv.ParseFloat(metricsValue, 64) if err != nil { log.Error(err, fmt.Sprintf("Error parsing the value from the %s metrics results", metricsName)) return } if !(parseValue > threshold) { // if metrics don't pass the threshold, don't update the recent activity - log.Info(fmt.Sprintf("%s of %s doesn't exceed the threshold %s. Not updating the last-activity", metricsName, metrics.Data.Result[0].Value[1], threshold)) - return - } - - recentTime, err := time.Parse(time.UnixDate, metrics.Data.Result[0].Value[0].(string)) - if err != nil { - log.Error(err, fmt.Sprintf("Error parsing the last-activity time from the %s metrics results", metricsName)) + log.Info(fmt.Sprintf("%s of %s doesn't exceed the threshold %s. Not updating the last-activity", metricsName, metricsValue, threshold)) return } - t := recentTime.Format(time.RFC3339) + t := time.Unix(int64(metricsTime), 0).Format(time.RFC3339) meta.Annotations[LAST_ACTIVITY_ANNOTATION] = t log.Info(fmt.Sprintf("Successfully updated last-activity from the %s metrics, %s", metricsName, t)) *updated = true From 445e5fdbad15db7b8375b87fb6a1084513c07777 Mon Sep 17 00:00:00 2001 From: Mathis Marcotte Date: Tue, 13 Aug 2024 17:43:55 +0000 Subject: [PATCH 16/24] cleanup --- .../controllers/culling_controller.go | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/components/notebook-controller/controllers/culling_controller.go b/components/notebook-controller/controllers/culling_controller.go index b25f0f04f11..acdcab81aa2 100644 --- a/components/notebook-controller/controllers/culling_controller.go +++ b/components/notebook-controller/controllers/culling_controller.go @@ -333,17 +333,8 @@ func updateNotebookLastActivityAnnotation(meta *metav1.ObjectMeta, log logr.Logg } } - //test metrics - testMetrics := getNotebookMetrics(nm, ns, "up", log) - if testMetrics == nil { - log.Info("Could not GET the TEST usage metrics. Will not update last-activity.") - } else { - //log.Info(fmt.Sprintf("TestMetricsFound %s. Will not update last-activity.", testMetrics.Data.Result)) - } - //end test metrics cpuQuery := fmt.Sprintf("sum by(container) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=\"%s\", container=\"%s\"})", ns, nm) - log.Info(fmt.Sprintf("Test query %s. test 2 %s", cpuQuery, url.QueryEscape(cpuQuery))) cpuMetrics := getNotebookMetrics(nm, ns, url.QueryEscape(cpuQuery), log) if cpuMetrics == nil { log.Info("Could not GET the CPU usage metrics. Will not update last-activity.") @@ -419,7 +410,8 @@ func updateTimestampFromMetrics(meta *metav1.ObjectMeta, metricsName string, met } if !(parseValue > threshold) { // if metrics don't pass the threshold, don't update the recent activity - log.Info(fmt.Sprintf("%s of %s doesn't exceed the threshold %s. Not updating the last-activity", metricsName, metricsValue, threshold)) + log.Info(fmt.Sprintf("%s of %s doesn't exceed the threshold %s. Not updating the last-activity", + metricsName, metricsValue, strconv.FormatFloat(threshold, 'g', -1, 64))) return } @@ -436,7 +428,6 @@ func updateLastCullingCheckTimestampAnnotation(meta *metav1.ObjectMeta, log logr } meta.Annotations[LAST_ACTIVITY_CHECK_TIMESTAMP_ANNOTATION] = t log.Info("Successfully updated last-activity-check-timestamp annotation") - } func annotationsExist(instance *v1beta1.Notebook) bool { From cb93ff512119cce360fae354d137518365c3db05 Mon Sep 17 00:00:00 2001 From: Mathis Marcotte Date: Tue, 13 Aug 2024 17:52:26 +0000 Subject: [PATCH 17/24] cleanup --- .../notebook-controller/controllers/culling_controller.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/components/notebook-controller/controllers/culling_controller.go b/components/notebook-controller/controllers/culling_controller.go index acdcab81aa2..a51ef1392f0 100644 --- a/components/notebook-controller/controllers/culling_controller.go +++ b/components/notebook-controller/controllers/culling_controller.go @@ -275,8 +275,6 @@ func getNotebookMetrics(nb string, ns string, query string, log logr.Logger) *No metricsUrl = fmt.Sprintf("http://localhost:9090/api/v1/query?query=%s", query) } - log.Info(fmt.Sprintf( - "Printing query %s", metricsUrl)) resp, err := client.Get(metricsUrl) if err != nil { log.Error(err, fmt.Sprintf("Error talking to %s", metricsUrl)) From e56064d0d4e665f86ea4bbfbb6aeb36c124270fc Mon Sep 17 00:00:00 2001 From: Mathis Marcotte Date: Tue, 13 Aug 2024 18:49:29 +0000 Subject: [PATCH 18/24] added comparaison to kernel activity check --- .../controllers/culling_controller.go | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/components/notebook-controller/controllers/culling_controller.go b/components/notebook-controller/controllers/culling_controller.go index a51ef1392f0..69ec58cad42 100644 --- a/components/notebook-controller/controllers/culling_controller.go +++ b/components/notebook-controller/controllers/culling_controller.go @@ -388,6 +388,17 @@ func updateTimestampFromKernelsActivity(meta *metav1.ObjectMeta, kernels []Kerne recentTime = kernelLastActivity } } + // Comparing against the current annotation to see if there was any new acivity + oldTime, err := time.Parse(time.RFC3339, meta.Annotations[LAST_ACTIVITY_ANNOTATION]) + if err != nil { + log.Error(err, "Error parsing the last-activity from the annotation") + return + } + if !recentTime.After(oldTime) { + log.Info("No new activity detected on the kernels. Not updating last-activity") + return + } + t := recentTime.Format(time.RFC3339) meta.Annotations[LAST_ACTIVITY_ANNOTATION] = t From adfb6863b288a4369acba6cc216b74070e9477e0 Mon Sep 17 00:00:00 2001 From: Mathis Marcotte Date: Tue, 13 Aug 2024 18:58:18 +0000 Subject: [PATCH 19/24] added test logging --- components/notebook-controller/controllers/culling_controller.go | 1 + 1 file changed, 1 insertion(+) diff --git a/components/notebook-controller/controllers/culling_controller.go b/components/notebook-controller/controllers/culling_controller.go index 69ec58cad42..83b7b4ece98 100644 --- a/components/notebook-controller/controllers/culling_controller.go +++ b/components/notebook-controller/controllers/culling_controller.go @@ -394,6 +394,7 @@ func updateTimestampFromKernelsActivity(meta *metav1.ObjectMeta, kernels []Kerne log.Error(err, "Error parsing the last-activity from the annotation") return } + log.Info(fmt.Sprintf("test times, %s, %s", oldTime.Format(time.RFC3339), recentTime.Format(time.RFC3339))) if !recentTime.After(oldTime) { log.Info("No new activity detected on the kernels. Not updating last-activity") return From 65f9459b26290ebcc90427c5b5e63e5f11d94459 Mon Sep 17 00:00:00 2001 From: Mathis Marcotte Date: Tue, 13 Aug 2024 19:12:42 +0000 Subject: [PATCH 20/24] added debugging log message --- .../notebook-controller/controllers/culling_controller.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/notebook-controller/controllers/culling_controller.go b/components/notebook-controller/controllers/culling_controller.go index 83b7b4ece98..02b5095fe0e 100644 --- a/components/notebook-controller/controllers/culling_controller.go +++ b/components/notebook-controller/controllers/culling_controller.go @@ -394,7 +394,7 @@ func updateTimestampFromKernelsActivity(meta *metav1.ObjectMeta, kernels []Kerne log.Error(err, "Error parsing the last-activity from the annotation") return } - log.Info(fmt.Sprintf("test times, %s, %s", oldTime.Format(time.RFC3339), recentTime.Format(time.RFC3339))) + log.Info(fmt.Sprintf("test times, %s, %s, %s", oldTime.Format(time.RFC3339), recentTime.Format(time.RFC3339), recentTime.After(oldTime))) if !recentTime.After(oldTime) { log.Info("No new activity detected on the kernels. Not updating last-activity") return From f782f4f8d9c4e874ffdf007b1d9dda810654baa4 Mon Sep 17 00:00:00 2001 From: Mathis Marcotte Date: Tue, 13 Aug 2024 19:31:53 +0000 Subject: [PATCH 21/24] debugging logging --- .../notebook-controller/controllers/culling_controller.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/notebook-controller/controllers/culling_controller.go b/components/notebook-controller/controllers/culling_controller.go index 02b5095fe0e..093072122ec 100644 --- a/components/notebook-controller/controllers/culling_controller.go +++ b/components/notebook-controller/controllers/culling_controller.go @@ -394,7 +394,7 @@ func updateTimestampFromKernelsActivity(meta *metav1.ObjectMeta, kernels []Kerne log.Error(err, "Error parsing the last-activity from the annotation") return } - log.Info(fmt.Sprintf("test times, %s, %s, %s", oldTime.Format(time.RFC3339), recentTime.Format(time.RFC3339), recentTime.After(oldTime))) + log.Info(fmt.Sprintf("test times, %s, %s, %s", oldTime, recentTime, recentTime.After(oldTime))) if !recentTime.After(oldTime) { log.Info("No new activity detected on the kernels. Not updating last-activity") return From 34f781c3db5ffdeda5a0a2d8bcf9035474a4df36 Mon Sep 17 00:00:00 2001 From: Mathis Marcotte Date: Wed, 14 Aug 2024 12:20:08 +0000 Subject: [PATCH 22/24] fixed kernel activity time comparaison --- .../controllers/culling_controller.go | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/components/notebook-controller/controllers/culling_controller.go b/components/notebook-controller/controllers/culling_controller.go index 093072122ec..7c280b3a7b4 100644 --- a/components/notebook-controller/controllers/culling_controller.go +++ b/components/notebook-controller/controllers/culling_controller.go @@ -388,20 +388,15 @@ func updateTimestampFromKernelsActivity(meta *metav1.ObjectMeta, kernels []Kerne recentTime = kernelLastActivity } } + + t := recentTime.Format(time.RFC3339) + // Comparing against the current annotation to see if there was any new acivity - oldTime, err := time.Parse(time.RFC3339, meta.Annotations[LAST_ACTIVITY_ANNOTATION]) - if err != nil { - log.Error(err, "Error parsing the last-activity from the annotation") - return - } - log.Info(fmt.Sprintf("test times, %s, %s, %s", oldTime, recentTime, recentTime.After(oldTime))) - if !recentTime.After(oldTime) { + if t == meta.Annotations[LAST_ACTIVITY_ANNOTATION] { log.Info("No new activity detected on the kernels. Not updating last-activity") return } - t := recentTime.Format(time.RFC3339) - meta.Annotations[LAST_ACTIVITY_ANNOTATION] = t log.Info(fmt.Sprintf("Successfully updated last-activity from latest kernel action, %s", t)) *updated = true From f90d6ef8f9cd1acb9aeb9eef2d186e24b5edd20e Mon Sep 17 00:00:00 2001 From: Mathis Marcotte Date: Wed, 14 Aug 2024 13:25:29 +0000 Subject: [PATCH 23/24] fixed time comparaison --- .../controllers/culling_controller.go | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/components/notebook-controller/controllers/culling_controller.go b/components/notebook-controller/controllers/culling_controller.go index 7c280b3a7b4..e9f2344af3c 100644 --- a/components/notebook-controller/controllers/culling_controller.go +++ b/components/notebook-controller/controllers/culling_controller.go @@ -392,7 +392,19 @@ func updateTimestampFromKernelsActivity(meta *metav1.ObjectMeta, kernels []Kerne t := recentTime.Format(time.RFC3339) // Comparing against the current annotation to see if there was any new acivity - if t == meta.Annotations[LAST_ACTIVITY_ANNOTATION] { + oldTime, err := time.Parse(time.RFC3339, meta.Annotations[LAST_ACTIVITY_ANNOTATION]) + if err != nil { + log.Error(err, "Error parsing the last-activity from the annotation") + return + } + // re-parsing the recentTime to just remove nanoseconds + newTime, err := time.Parse(time.RFC3339, t) + if err != nil { + log.Error(err, "Error parsing the last-activity from the recentTime") + return + } + + if !newTime.After(oldTime) { log.Info("No new activity detected on the kernels. Not updating last-activity") return } From 7da781084c496beea643a82d5374b6357ce53490 Mon Sep 17 00:00:00 2001 From: Mathis Marcotte Date: Tue, 20 Aug 2024 12:49:07 +0000 Subject: [PATCH 24/24] reduced logging and removed nightly build --- .../workflows/build-notebook-controller.yaml | 2 -- .../controllers/culling_controller.go | 30 +++++++------------ 2 files changed, 11 insertions(+), 21 deletions(-) diff --git a/.github/workflows/build-notebook-controller.yaml b/.github/workflows/build-notebook-controller.yaml index 824417d9664..ad0c04f0bce 100644 --- a/.github/workflows/build-notebook-controller.yaml +++ b/.github/workflows/build-notebook-controller.yaml @@ -12,8 +12,6 @@ on: - 'opened' - 'synchronize' - 'reopened' - schedule: - - cron: '0 22 * * *' # Environment variables available to all jobs and steps in this workflow env: REGISTRY_NAME: k8scc01covidacr diff --git a/components/notebook-controller/controllers/culling_controller.go b/components/notebook-controller/controllers/culling_controller.go index e9f2344af3c..db3d10ba0aa 100644 --- a/components/notebook-controller/controllers/culling_controller.go +++ b/components/notebook-controller/controllers/culling_controller.go @@ -317,14 +317,10 @@ func allKernelsAreIdle(kernels []KernelStatus, log logr.Logger) bool { // Update LAST_ACTIVITY_ANNOTATION func updateNotebookLastActivityAnnotation(meta *metav1.ObjectMeta, log logr.Logger) { updated := false - log.Info("Updating the last-activity annotation. Checking /api/kernels") + nm, ns := meta.GetName(), meta.GetNamespace() kernels := getNotebookApiKernels(nm, ns, log) - if kernels == nil { - log.Info("Could not GET the kernels status. Will not update last-activity.") - } else if len(kernels) == 0 { - log.Info("Notebook has no kernels. Will not update last-activity") - } else { + if kernels != nil && len(kernels) > 0 { updateTimestampFromKernelsActivity(meta, kernels, log, &updated) if updated { return @@ -334,11 +330,7 @@ func updateNotebookLastActivityAnnotation(meta *metav1.ObjectMeta, log logr.Logg cpuQuery := fmt.Sprintf("sum by(container) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=\"%s\", container=\"%s\"})", ns, nm) cpuMetrics := getNotebookMetrics(nm, ns, url.QueryEscape(cpuQuery), log) - if cpuMetrics == nil { - log.Info("Could not GET the CPU usage metrics. Will not update last-activity.") - } else if len(cpuMetrics.Data.Result) == 0 { - log.Info("Notebook has no CPU usage metrics. Will not update last-activity.") - } else { + if cpuMetrics != nil && len(cpuMetrics.Data.Result) > 0 { updateTimestampFromMetrics(meta, "CPU usage", *cpuMetrics, 0.09, log, &updated) if updated { return @@ -348,13 +340,15 @@ func updateNotebookLastActivityAnnotation(meta *metav1.ObjectMeta, log logr.Logg ioQuery := fmt.Sprintf("ceil(sum by(container) (rate(container_fs_reads_total{device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\", namespace=\"%s\", container=\"%s\"}[2m]) + rate(container_fs_writes_total{device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\", namespace=\"%s\", container=\"%s\"}[2m])))", ns, nm, ns, nm) ioMetrics := getNotebookMetrics(nm, ns, url.QueryEscape(ioQuery), log) - if ioMetrics == nil { - log.Info("Could not GET the disk IO metrics. Will not update last-activity.") - } else if len(ioMetrics.Data.Result) == 0 { - log.Info("Notebook has no disk IO metrics. Will not update last-activity.") - } else { + if ioMetrics != nil && len(ioMetrics.Data.Result) > 0 { updateTimestampFromMetrics(meta, "Disk IO", *ioMetrics, 0, log, &updated) + if updated { + return + } } + + //else + log.Info("Will not update last-activity") } func updateTimestampFromKernelsActivity(meta *metav1.ObjectMeta, kernels []KernelStatus, log logr.Logger, updated *bool) { @@ -405,7 +399,7 @@ func updateTimestampFromKernelsActivity(meta *metav1.ObjectMeta, kernels []Kerne } if !newTime.After(oldTime) { - log.Info("No new activity detected on the kernels. Not updating last-activity") + // No new activity detected on the kernels. Not updating last-activity return } @@ -427,8 +421,6 @@ func updateTimestampFromMetrics(meta *metav1.ObjectMeta, metricsName string, met } if !(parseValue > threshold) { // if metrics don't pass the threshold, don't update the recent activity - log.Info(fmt.Sprintf("%s of %s doesn't exceed the threshold %s. Not updating the last-activity", - metricsName, metricsValue, strconv.FormatFloat(threshold, 'g', -1, 64))) return }