From 0bcf42ee7c1c7dd27c234b82d6b4daa6f5c72b5b Mon Sep 17 00:00:00 2001 From: Christian Hoffmann Date: Tue, 16 Apr 2024 23:54:22 +0000 Subject: [PATCH] Add -recheck-with-time-limit support (#223) process-exporter already supports the -recheck flag which makes it run the whole matching logic on each scrape. This is very useful when trying to monitor processes which change their names shortly after start. Sadly, -recheck carries a rather high performance penalty. At the same time, process name changes are very common directly after start, but are seldom expected later during usage. This commit introduces -recheck-with-time-limit which rechecks processes only during the first N seconds after their start and stops doing so afterwards. This combines the accuracy benefits of -recheck with the performance gains of not using -recheck. --- README.md | 3 ++- cmd/process-exporter/main.go | 21 ++++++++++------ collector/process_collector.go | 18 ++++++++------ proc/grouper.go | 4 +-- proc/grouper_test.go | 8 +++--- proc/tracker.go | 45 +++++++++++++++++++++------------- proc/tracker_test.go | 8 +++--- 7 files changed, 64 insertions(+), 43 deletions(-) diff --git a/README.md b/README.md index 82aa96c..d0353d7 100644 --- a/README.md +++ b/README.md @@ -46,7 +46,8 @@ as well as group name. re-evaluated. This is disabled by default as an optimization, but since processes can choose to change their names, this may result in a process falling into the wrong group if we happen to see it for the first time before -it's assumed its proper name. +it's assumed its proper name. You can use -recheck-with-time-limit to enable this +feature only for a specific duration after a process starts. -procnames is intended as a quick alternative to using a config file. Details in the following section.
diff --git a/cmd/process-exporter/main.go b/cmd/process-exporter/main.go index 1a83061..54f5d94 100644 --- a/cmd/process-exporter/main.go +++ b/cmd/process-exporter/main.go @@ -173,6 +173,8 @@ func main() { "path to YAML web config file") recheck = flag.Bool("recheck", false, "recheck process names on each scrape") + recheckTimeLimit = flag.Duration("recheck-with-time-limit", 0, + "recheck processes only this much time after their start, but no longer.") debug = flag.Bool("debug", false, "log debugging information to stdout") showVersion = flag.Bool("version", false, @@ -232,15 +234,20 @@ func main() { matchnamer = namemapper } + if *recheckTimeLimit != 0 { + *recheck = true + } + pc, err := collector.NewProcessCollector( collector.ProcessCollectorOption{ - ProcFSPath: *procfsPath, - Children: *children, - Threads: *threads, - GatherSMaps: *smaps, - Namer: matchnamer, - Recheck: *recheck, - Debug: *debug, + ProcFSPath: *procfsPath, + Children: *children, + Threads: *threads, + GatherSMaps: *smaps, + Namer: matchnamer, + Recheck: *recheck, + RecheckTimeLimit: *recheckTimeLimit, + Debug: *debug, }, ) if err != nil { diff --git a/collector/process_collector.go b/collector/process_collector.go index 063fb67..6ef0440 100644 --- a/collector/process_collector.go +++ b/collector/process_collector.go @@ -2,6 +2,7 @@ package collector import ( "log" + "time" common "github.com/ncabatoff/process-exporter" "github.com/ncabatoff/process-exporter/proc" @@ -155,13 +156,14 @@ type ( } ProcessCollectorOption struct { - ProcFSPath string - Children bool - Threads bool - GatherSMaps bool - Namer common.MatchNamer - Recheck bool - Debug bool + ProcFSPath string + Children bool + Threads bool + GatherSMaps bool + Namer common.MatchNamer + Recheck bool + RecheckTimeLimit time.Duration + Debug bool } NamedProcessCollector struct { @@ -186,7 +188,7 @@ func NewProcessCollector(options ProcessCollectorOption) (*NamedProcessCollector fs.GatherSMaps = options.GatherSMaps p := 
&NamedProcessCollector{ scrapeChan: make(chan scrapeRequest), - Grouper: proc.NewGrouper(options.Namer, options.Children, options.Threads, options.Recheck, options.Debug), + Grouper: proc.NewGrouper(options.Namer, options.Children, options.Threads, options.Recheck, options.RecheckTimeLimit, options.Debug), source: fs, threads: options.Threads, smaps: options.GatherSMaps, diff --git a/proc/grouper.go b/proc/grouper.go index dc612c2..1b50720 100644 --- a/proc/grouper.go +++ b/proc/grouper.go @@ -49,11 +49,11 @@ type ( func lessThreads(x, y Threads) bool { return seq.Compare(x, y) < 0 } // NewGrouper creates a grouper. -func NewGrouper(namer common.MatchNamer, trackChildren, trackThreads, alwaysRecheck, debug bool) *Grouper { +func NewGrouper(namer common.MatchNamer, trackChildren, trackThreads, recheck bool, recheckTimeLimit time.Duration, debug bool) *Grouper { g := Grouper{ groupAccum: make(map[string]Counts), threadAccum: make(map[string]map[string]Threads), - tracker: NewTracker(namer, trackChildren, alwaysRecheck, debug), + tracker: NewTracker(namer, trackChildren, recheck, recheckTimeLimit, debug), debug: debug, } return &g diff --git a/proc/grouper_test.go b/proc/grouper_test.go index 5b3f10f..ff291ae 100644 --- a/proc/grouper_test.go +++ b/proc/grouper_test.go @@ -73,7 +73,7 @@ func TestGrouperBasic(t *testing.T) { }, } - gr := NewGrouper(newNamer(n1, n2), false, false, false, false) + gr := NewGrouper(newNamer(n1, n2), false, false, false, 0, false) for i, tc := range tests { got := rungroup(t, gr, procInfoIter(tc.procs...)) if diff := cmp.Diff(got, tc.want); diff != "" { @@ -128,7 +128,7 @@ func TestGrouperProcJoin(t *testing.T) { }, } - gr := NewGrouper(newNamer(n1), false, false, false, false) + gr := NewGrouper(newNamer(n1), false, false, false, 0, false) for i, tc := range tests { got := rungroup(t, gr, procInfoIter(tc.procs...)) if diff := cmp.Diff(got, tc.want); diff != "" { @@ -171,7 +171,7 @@ func TestGrouperNonDecreasing(t *testing.T) { }, } - gr 
:= NewGrouper(newNamer(n1), false, false, false, false) + gr := NewGrouper(newNamer(n1), false, false, false, 0, false) for i, tc := range tests { got := rungroup(t, gr, procInfoIter(tc.procs...)) if diff := cmp.Diff(got, tc.want); diff != "" { @@ -224,7 +224,7 @@ func TestGrouperThreads(t *testing.T) { } opts := cmpopts.SortSlices(lessThreads) - gr := NewGrouper(newNamer(n), false, true, false, false) + gr := NewGrouper(newNamer(n), false, true, false, 0, false) for i, tc := range tests { got := rungroup(t, gr, procInfoIter(tc.proc)) if diff := cmp.Diff(got, tc.want, opts); diff != "" { diff --git a/proc/tracker.go b/proc/tracker.go index 7b41a87..e09e615 100644 --- a/proc/tracker.go +++ b/proc/tracker.go @@ -30,9 +30,11 @@ type ( // namer wanted tracked. trackChildren bool // never ignore processes, i.e. always re-check untracked processes in case comm has changed - alwaysRecheck bool - username map[int]string - debug bool + recheck bool + // limit rechecks to this much time + recheckTimeLimit time.Duration + username map[int]string + debug bool } // Delta is an alias of Counts used to signal that its contents are not @@ -139,15 +141,16 @@ func (tp *trackedProc) getUpdate() Update { } // NewTracker creates a Tracker. 
-func NewTracker(namer common.MatchNamer, trackChildren bool, alwaysRecheck bool, debug bool) *Tracker { +func NewTracker(namer common.MatchNamer, trackChildren bool, recheck bool, recheckTimeLimit time.Duration, debug bool) *Tracker { return &Tracker{ - namer: namer, - tracked: make(map[ID]*trackedProc), - procIds: make(map[int]ID), - trackChildren: trackChildren, - alwaysRecheck: alwaysRecheck, - username: make(map[int]string), - debug: debug, + namer: namer, + tracked: make(map[ID]*trackedProc), + procIds: make(map[int]ID), + trackChildren: trackChildren, + recheck: recheck, + recheckTimeLimit: recheckTimeLimit, + username: make(map[int]string), + debug: debug, } } @@ -174,11 +177,19 @@ func (t *Tracker) track(groupName string, idinfo IDInfo) { t.tracked[idinfo.ID] = &tproc } -func (t *Tracker) ignore(id ID) { +func (t *Tracker) ignore(id ID, startTime time.Time) { // only ignore ID if we didn't set recheck to true - if t.alwaysRecheck == false { - t.tracked[id] = nil + if t.recheck { + if t.recheckTimeLimit == 0 { + // plain -recheck with no time limit: + return + } + if startTime.Add(t.recheckTimeLimit).After(time.Now()) { + // -recheck-with-time-limit is used and the limit is not reached yet: + return + } } + t.tracked[id] = nil } func (tp *trackedProc) update(metrics Metrics, now time.Time, cerrs *CollectErrors, threads []Thread) { @@ -341,7 +352,7 @@ func (t *Tracker) checkAncestry(idinfo IDInfo, newprocs map[ID]IDInfo) string { log.Printf("ignoring unmatched proc with no matched parent: %+v", idinfo) } // Reached root of process tree without finding a tracked parent. - t.ignore(idinfo.ID) + t.ignore(idinfo.ID, idinfo.Static.StartTime) return "" } @@ -357,7 +368,7 @@ func (t *Tracker) checkAncestry(idinfo IDInfo, newprocs map[ID]IDInfo) string { return ptproc.groupName } // We've found an untracked parent.
- t.ignore(idinfo.ID) + t.ignore(idinfo.ID, idinfo.Static.StartTime) return "" } @@ -378,7 +389,7 @@ func (t *Tracker) checkAncestry(idinfo IDInfo, newprocs map[ID]IDInfo) string { if t.debug { log.Printf("ignoring unmatched proc with no matched parent: %+v", idinfo) } - t.ignore(idinfo.ID) + t.ignore(idinfo.ID, idinfo.Static.StartTime) return "" } diff --git a/proc/tracker_test.go b/proc/tracker_test.go index 50c632a..4a29516 100644 --- a/proc/tracker_test.go +++ b/proc/tracker_test.go @@ -36,7 +36,7 @@ func TestTrackerBasic(t *testing.T) { }, } // Note that n3 should not be tracked according to our namer. - tr := NewTracker(newNamer(n1, n2, n4), false, false, false) + tr := NewTracker(newNamer(n1, n2, n4), false, false, 0, false) opts := cmpopts.SortSlices(lessUpdateGroupName) for i, tc := range tests { @@ -78,7 +78,7 @@ func TestTrackerChildren(t *testing.T) { }, } // Only n2 and children of n2s should be tracked - tr := NewTracker(newNamer(n2), true, false, false) + tr := NewTracker(newNamer(n2), true, false, 0, false) for i, tc := range tests { _, got, err := tr.Update(procInfoIter(tc.procs...)) @@ -111,7 +111,7 @@ func TestTrackerMetrics(t *testing.T) { Filedesc{2, 20}, tm, 1, States{Running: 1}, msi{}, nil}, }, } - tr := NewTracker(newNamer(n), false, false, false) + tr := NewTracker(newNamer(n), false, false, 0, false) for i, tc := range tests { _, got, err := tr.Update(procInfoIter(tc.proc)) @@ -169,7 +169,7 @@ func TestTrackerThreads(t *testing.T) { }, }, } - tr := NewTracker(newNamer(n), false, false, false) + tr := NewTracker(newNamer(n), false, false, 0, false) opts := cmpopts.SortSlices(lessThreadUpdate) for i, tc := range tests {