From 56b8dabbd1df94ba43534fe5c862f54e09c78676 Mon Sep 17 00:00:00 2001 From: Dale Hamel Date: Mon, 16 Sep 2019 11:50:30 -0400 Subject: [PATCH 1/7] Require all tracejobs to have a deadline, allow overriding this --- pkg/cmd/run.go | 25 +++++++++++++++---------- pkg/tracejob/job.go | 2 ++ 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/pkg/cmd/run.go b/pkg/cmd/run.go index 7d8aa5ae..091cd7fe 100644 --- a/pkg/cmd/run.go +++ b/pkg/cmd/run.go @@ -25,6 +25,8 @@ var ( ImageNameTag = "quay.io/iovisor/kubectl-trace-bpftrace:latest" // InitImageNameTag represents the default init container image InitImageNameTag = "quay.io/iovisor/kubectl-trace-init:latest" + // By default do not allow traces to run for longer than one hour + DefaultDeadline = 3600 ) var ( @@ -73,6 +75,7 @@ type RunOptions struct { imageName string initImageName string fetchHeaders bool + deadline int64 resourceArg string attach bool @@ -91,6 +94,7 @@ func NewRunOptions(streams genericclioptions.IOStreams) *RunOptions { serviceAccount: "default", imageName: ImageNameTag, initImageName: InitImageNameTag, + deadline: int64(DefaultDeadline), } } @@ -127,6 +131,7 @@ func NewRunCommand(factory factory.Factory, streams genericclioptions.IOStreams) cmd.Flags().StringVar(&o.imageName, "imagename", o.imageName, "Custom image for the tracerunner") cmd.Flags().StringVar(&o.initImageName, "init-imagename", o.initImageName, "Custom image for the init container responsible to fetch and prepare linux headers") cmd.Flags().BoolVar(&o.fetchHeaders, "fetch-headers", o.fetchHeaders, "Whether to fetch linux headers or not") + cmd.Flags().Int64Var(&o.deadline, "deadline", o.deadline, "Maximum time to allow trace to run in seconds") return cmd } @@ -289,19 +294,19 @@ func (o *RunOptions) Run() error { } tj := tracejob.TraceJob{ - Name: fmt.Sprintf("%s%s", meta.ObjectNamePrefix, string(juid)), - Namespace: o.namespace, - ServiceAccount: o.serviceAccount, - ID: juid, - Hostname: o.nodeName, - Program: o.program, - PodUID: o.podUID, - ContainerName: o.container, - IsPod: o.isPod, - // todo(dalehamel) > following fields to be used for #48 + Name: fmt.Sprintf("%s%s", meta.ObjectNamePrefix, string(juid)), + Namespace: o.namespace, + ServiceAccount: o.serviceAccount, + ID: juid, + Hostname: o.nodeName, + Program: o.program, + PodUID: o.podUID, + ContainerName: o.container, + IsPod: o.isPod, ImageNameTag: o.imageName, InitImageNameTag: o.initImageName, FetchHeaders: o.fetchHeaders, + Deadline: o.deadline, } job, err := tc.CreateJob(tj) diff --git a/pkg/tracejob/job.go b/pkg/tracejob/job.go index 31ed24aa..a3ded20d 100644 --- a/pkg/tracejob/job.go +++ b/pkg/tracejob/job.go @@ -35,6 +35,7 @@ type TraceJob struct { ImageNameTag string InitImageNameTag string FetchHeaders bool + Deadline int64 StartTime metav1.Time Status TraceJobStatus } @@ -217,6 +218,7 @@ func (t *TraceJobClient) CreateJob(nj TraceJob) (*batchv1.Job, error) { job := &batchv1.Job{ ObjectMeta: commonMeta, Spec: batchv1.JobSpec{ + ActiveDeadlineSeconds: int64Ptr(nj.Deadline), TTLSecondsAfterFinished: int32Ptr(5), Parallelism: int32Ptr(1), Completions: int32Ptr(1), From 7b8ccc0e3bd942466f306bf8c07d068a00b05852 Mon Sep 17 00:00:00 2001 From: Dale Hamel Date: Mon, 16 Sep 2019 13:12:17 -0400 Subject: [PATCH 2/7] Add pre-stop hook to print maps before exit --- pkg/tracejob/job.go | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/pkg/tracejob/job.go b/pkg/tracejob/job.go index a3ded20d..01a1950e 100644 --- a/pkg/tracejob/job.go +++ b/pkg/tracejob/job.go @@ -296,6 +296,21 @@ func (t *TraceJobClient) CreateJob(nj TraceJob) (*batchv1.Job, error) { SecurityContext: &apiv1.SecurityContext{ Privileged: boolPtr(true), }, + // We want to send SIGINT prior to the pod being killed, so we can print the map + // we will also wait for an arbitrary amount of time (10s) to give bpftrace time to + // process and summarize the data + // FIXME should this sleep be configurable? 10s is probably ample for most cases. + Lifecycle: &apiv1.Lifecycle{ + PreStop: &apiv1.Handler{ + Exec: &apiv1.ExecAction{ + Command: []string{ + "/bin/bash", + "-c", + "kill -SIGINT $(pidof bpftrace) && sleep 10", + }, + }, + }, + }, }, }, RestartPolicy: "Never", From 2c030465239a449d78b4b53c9e548d3d60b8c1ce Mon Sep 17 00:00:00 2001 From: Dale Hamel Date: Mon, 16 Sep 2019 19:59:16 -0400 Subject: [PATCH 3/7] Update pkg/cmd/run.go Co-Authored-By: fixmie[bot] <44270338+fixmie[bot]@users.noreply.github.com> --- pkg/cmd/run.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/cmd/run.go b/pkg/cmd/run.go index 091cd7fe..00e42da2 100644 --- a/pkg/cmd/run.go +++ b/pkg/cmd/run.go @@ -25,7 +25,7 @@ var ( ImageNameTag = "quay.io/iovisor/kubectl-trace-bpftrace:latest" // InitImageNameTag represents the default init container image InitImageNameTag = "quay.io/iovisor/kubectl-trace-init:latest" - // By default do not allow traces to run for longer than one hour + // DefaultDeadline default do not allow traces to run for longer than one hour DefaultDeadline = 3600 ) From 6fef7b6fc8b3dc2c0be4d42a62110adcdb5c598e Mon Sep 17 00:00:00 2001 From: Dale Hamel Date: Mon, 16 Sep 2019 20:13:23 -0400 Subject: [PATCH 4/7] Add configurable deadline grace period This allows for bpftrace to take some extra time to process and print bpf map data. For very large maps or in some edge cases the user may want to override this value --- pkg/cmd/run.go | 58 +++++++++++++++++++++++++-------------------- pkg/tracejob/job.go | 33 +++++++++++++------------- 2 files changed, 49 insertions(+), 42 deletions(-) diff --git a/pkg/cmd/run.go b/pkg/cmd/run.go index 00e42da2..44ecb8fb 100644 --- a/pkg/cmd/run.go +++ b/pkg/cmd/run.go @@ -25,8 +25,10 @@ var ( ImageNameTag = "quay.io/iovisor/kubectl-trace-bpftrace:latest" // InitImageNameTag represents the default init container image InitImageNameTag = "quay.io/iovisor/kubectl-trace-init:latest" - // DefaultDeadline default do not allow traces to run for longer than one hour + // DefaultDeadline is the maximum time a tracejob is allowed to run, in seconds DefaultDeadline = 3600 + // DefaultDeadlineGracePeriod is the maximum time to wait to print a map or histogram, in seconds + DefaultDeadlineGracePeriod = 10 ) var ( @@ -68,14 +70,15 @@ type RunOptions struct { explicitNamespace bool // Flags local to this command - container string - eval string - program string - serviceAccount string - imageName string - initImageName string - fetchHeaders bool - deadline int64 + container string + eval string + program string + serviceAccount string + imageName string + initImageName string + fetchHeaders bool + deadline int64 + deadlineGracePeriod int64 resourceArg string attach bool @@ -91,10 +94,11 @@ func NewRunOptions(streams genericclioptions.IOStreams) *RunOptions { return &RunOptions{ IOStreams: streams, - serviceAccount: "default", - imageName: ImageNameTag, - initImageName: InitImageNameTag, - deadline: int64(DefaultDeadline), + serviceAccount: "default", + imageName: ImageNameTag, + initImageName: InitImageNameTag, + deadline: int64(DefaultDeadline), + deadlineGracePeriod: int64(DefaultDeadlineGracePeriod), } } @@ -132,6 +136,7 @@ func NewRunCommand(factory factory.Factory, streams genericclioptions.IOStreams) cmd.Flags().StringVar(&o.initImageName, "init-imagename", o.initImageName, "Custom image for the init container responsible to fetch and prepare linux headers") cmd.Flags().BoolVar(&o.fetchHeaders, "fetch-headers", o.fetchHeaders, "Whether to fetch linux headers or not") cmd.Flags().Int64Var(&o.deadline, "deadline", o.deadline, "Maximum time to allow trace to run in seconds") + cmd.Flags().Int64Var(&o.deadline, "deadline-grace-period", o.deadlineGracePeriod, "Maximum wait time to print maps or histograms after deadline, in seconds") return cmd } @@ -294,19 +299,20 @@ func (o *RunOptions) Run() error { } tj := tracejob.TraceJob{ - Name: fmt.Sprintf("%s%s", meta.ObjectNamePrefix, string(juid)), - Namespace: o.namespace, - ServiceAccount: o.serviceAccount, - ID: juid, - Hostname: o.nodeName, - Program: o.program, - PodUID: o.podUID, - ContainerName: o.container, - IsPod: o.isPod, - ImageNameTag: o.imageName, - InitImageNameTag: o.initImageName, - FetchHeaders: o.fetchHeaders, - Deadline: o.deadline, + Name: fmt.Sprintf("%s%s", meta.ObjectNamePrefix, string(juid)), + Namespace: o.namespace, + ServiceAccount: o.serviceAccount, + ID: juid, + Hostname: o.nodeName, + Program: o.program, + PodUID: o.podUID, + ContainerName: o.container, + IsPod: o.isPod, + ImageNameTag: o.imageName, + InitImageNameTag: o.initImageName, + FetchHeaders: o.fetchHeaders, + Deadline: o.deadline, + DeadlineGracePeriod: o.deadlineGracePeriod, } job, err := tc.CreateJob(tj) diff --git a/pkg/tracejob/job.go b/pkg/tracejob/job.go index 01a1950e..4ab349a3 100644 --- a/pkg/tracejob/job.go +++ b/pkg/tracejob/job.go @@ -23,21 +23,22 @@ type TraceJobClient struct { // TraceJob is a container of info needed to create the job responsible for tracing. type TraceJob struct { - Name string - ID types.UID - Namespace string - ServiceAccount string - Hostname string - Program string - PodUID string - ContainerName string - IsPod bool - ImageNameTag string - InitImageNameTag string - FetchHeaders bool - Deadline int64 - StartTime metav1.Time - Status TraceJobStatus + Name string + ID types.UID + Namespace string + ServiceAccount string + Hostname string + Program string + PodUID string + ContainerName string + IsPod bool + ImageNameTag string + InitImageNameTag string + FetchHeaders bool + Deadline int64 + DeadlineGracePeriod int64 + StartTime metav1.Time + Status TraceJobStatus } // WithOutStream setup a file stream to output trace job operation information @@ -306,7 +307,7 @@ func (t *TraceJobClient) CreateJob(nj TraceJob) (*batchv1.Job, error) { Command: []string{ "/bin/bash", "-c", - "kill -SIGINT $(pidof bpftrace) && sleep 10", + fmt.Sprintf("kill -SIGINT $(pidof bpftrace) && sleep %i", nj.DeadlineGracePeriod), }, }, }, From 80cd6b82b5461ce217a6e731d0bfd0a4fd3830d6 Mon Sep 17 00:00:00 2001 From: Leo Di Donato Date: Tue, 17 Sep 2019 02:38:59 +0200 Subject: [PATCH 5/7] chore: remove comment This is configurable now :) --- pkg/tracejob/job.go | 1 - 1 file changed, 1 deletion(-) diff --git a/pkg/tracejob/job.go b/pkg/tracejob/job.go index 4ab349a3..87b7e71e 100644 --- a/pkg/tracejob/job.go +++ b/pkg/tracejob/job.go @@ -300,7 +300,6 @@ func (t *TraceJobClient) CreateJob(nj TraceJob) (*batchv1.Job, error) { // We want to send SIGINT prior to the pod being killed, so we can print the map // we will also wait for an arbitrary amount of time (10s) to give bpftrace time to // process and summarize the data - // FIXME should this sleep be configurable? 10s is probably ample for most cases. Lifecycle: &apiv1.Lifecycle{ PreStop: &apiv1.Handler{ Exec: &apiv1.ExecAction{ From c0f61a91aca6d2adb951014400437578431c7550 Mon Sep 17 00:00:00 2001 From: Dale Hamel Date: Tue, 17 Sep 2019 23:57:46 -0400 Subject: [PATCH 6/7] Allow adequate time for bpftrace to shut down, so jobs will show as completed --- pkg/cmd/run.go | 5 +++-- pkg/tracejob/job.go | 10 ++++++++-- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/pkg/cmd/run.go b/pkg/cmd/run.go index 44ecb8fb..c66e7c33 100644 --- a/pkg/cmd/run.go +++ b/pkg/cmd/run.go @@ -28,7 +28,8 @@ var ( // DefaultDeadline is the maximum time a tracejob is allowed to run, in seconds DefaultDeadline = 3600 // DefaultDeadlineGracePeriod is the maximum time to wait to print a map or histogram, in seconds - DefaultDeadlineGracePeriod = 10 + // note that it must account for startup time, as the deadline as based on start time + DefaultDeadlineGracePeriod = 30 ) var ( @@ -136,7 +137,7 @@ func NewRunCommand(factory factory.Factory, streams genericclioptions.IOStreams) cmd.Flags().StringVar(&o.initImageName, "init-imagename", o.initImageName, "Custom image for the init container responsible to fetch and prepare linux headers") cmd.Flags().BoolVar(&o.fetchHeaders, "fetch-headers", o.fetchHeaders, "Whether to fetch linux headers or not") cmd.Flags().Int64Var(&o.deadline, "deadline", o.deadline, "Maximum time to allow trace to run in seconds") - cmd.Flags().Int64Var(&o.deadline, "deadline-grace-period", o.deadlineGracePeriod, "Maximum wait time to print maps or histograms after deadline, in seconds") + cmd.Flags().Int64Var(&o.deadlineGracePeriod, "deadline-grace-period", o.deadlineGracePeriod, "Maximum wait time to print maps or histograms after deadline, in seconds") return cmd } diff --git a/pkg/tracejob/job.go b/pkg/tracejob/job.go index 019798aa..215239ac 100644 --- a/pkg/tracejob/job.go +++ b/pkg/tracejob/job.go @@ -4,6 +4,7 @@ import ( "fmt" "io" "io/ioutil" + "strconv" "github.com/iovisor/kubectl-trace/pkg/meta" batchv1 "k8s.io/api/batch/v1" @@ -186,6 +187,11 @@ func (t *TraceJobClient) DeleteJobs(nf TraceJobFilter) error { func (t *TraceJobClient) CreateJob(nj TraceJob) (*batchv1.Job, error) { bpfTraceCmd := []string{ + "/bin/timeout", + "--preserve-status", + "--signal", + "INT", + strconv.FormatInt(nj.Deadline, 10), "/bin/trace-runner", "--program=/programs/program.bt", } @@ -219,7 +225,7 @@ func (t *TraceJobClient) CreateJob(nj TraceJob) (*batchv1.Job, error) { job := &batchv1.Job{ ObjectMeta: commonMeta, Spec: batchv1.JobSpec{ - ActiveDeadlineSeconds: int64Ptr(nj.Deadline), + ActiveDeadlineSeconds: int64Ptr(nj.Deadline + nj.DeadlineGracePeriod), TTLSecondsAfterFinished: int32Ptr(5), Parallelism: int32Ptr(1), Completions: int32Ptr(1), @@ -306,7 +312,7 @@ func (t *TraceJobClient) CreateJob(nj TraceJob) (*batchv1.Job, error) { Command: []string{ "/bin/bash", "-c", - fmt.Sprintf("kill -SIGINT $(pidof bpftrace) && sleep %i", nj.DeadlineGracePeriod), + fmt.Sprintf("kill -SIGINT $(pidof bpftrace) && sleep %i", strconv.FormatInt(nj.DeadlineGracePeriod, 10)), }, }, }, From 7f9d490fffc499548f699458245b676f06ea4a72 Mon Sep 17 00:00:00 2001 From: Dale Hamel Date: Wed, 18 Sep 2019 01:03:21 -0400 Subject: [PATCH 7/7] Fix error in format string on deadline grace period --- pkg/tracejob/job.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/tracejob/job.go b/pkg/tracejob/job.go index 215239ac..6a9517ec 100644 --- a/pkg/tracejob/job.go +++ b/pkg/tracejob/job.go @@ -312,7 +312,7 @@ func (t *TraceJobClient) CreateJob(nj TraceJob) (*batchv1.Job, error) { Command: []string{ "/bin/bash", "-c", - fmt.Sprintf("kill -SIGINT $(pidof bpftrace) && sleep %i", strconv.FormatInt(nj.DeadlineGracePeriod, 10)), + fmt.Sprintf("kill -SIGINT $(pidof bpftrace) && sleep %s", strconv.FormatInt(nj.DeadlineGracePeriod, 10)), }, }, },