Skip to content

Support for filtering on core list for metrics (#428) #441

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 16 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions cmd/metrics/event_defs.go
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,10 @@ func isCollectableEvent(event EventDefinition, metadata Metadata) bool {
return true
}
// uncore events
if flagGranularity == granularityCPU && strings.HasPrefix(event.Name, "UNC") {
slog.Debug("Uncore events not supported with specified granularity", slog.String("event", event.Name))
return false
}
if !metadata.SupportsUncore && strings.HasPrefix(event.Name, "UNC") {
slog.Debug("Uncore events not supported on target", slog.String("event", event.Name))
return false
Expand Down Expand Up @@ -209,6 +213,12 @@ func isCollectableEvent(event EventDefinition, metadata Metadata) bool {
slog.Debug("Cstate and power events not supported in process or cgroup scope", slog.String("event", event.Name))
return false
}
// no power events when collecting at CPU granularity
if (flagGranularity == granularityCPU) &&
(strings.Contains(event.Name, "power/energy") || strings.Contains(event.Name, "cstate_pkg")) {
slog.Debug("Power events not supported in CPU granularity", slog.String("event", event.Name))
return false
}
// finally, if it isn't in the perf list output, it isn't collectable
name := strings.Split(event.Name, ":")[0]
if !strings.Contains(metadata.PerfSupportedEvents, name) {
Expand Down
38 changes: 36 additions & 2 deletions cmd/metrics/event_frame.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ import (
"slices"
"strconv"
"strings"

"perfspect/internal/util"
)

// EventGroup represents a group of perf events and their values
Expand Down Expand Up @@ -211,7 +213,35 @@ func coalesceEvents(allEvents []Event, scope string, granularity string, metadat
return
case granularityCPU:
// create one list of Events per CPU
numCPUs := metadata.SocketCount * metadata.CoresPerSocket * metadata.ThreadsPerCore
var numCPUs int

// create a mapping of cpu numbers to event indices
var cpuMap map[int]int
var cpuList []int
// if cpu range is specified, use it to determine the number of cpus
// otherwise, use the number of sockets, cores per socket, and threads per core
// to determine the number of cpus
if len(flagCpuRange) > 0 {
cpuList, err = util.SelectiveIntRangeToIntList(flagCpuRange)
if err != nil {
return nil, fmt.Errorf("failed to parse cpu range: %w", err)
}
numCPUs = len(cpuList)
cpuMap = make(map[int]int, numCPUs)
for i, cpu := range cpuList {
cpuMap[cpu] = i
}
} else {
numCPUs = metadata.SocketCount * metadata.CoresPerSocket * metadata.ThreadsPerCore
cpuList, err = util.SelectiveIntRangeToIntList("0-" + strconv.Itoa(numCPUs-1))
if err != nil {
return nil, fmt.Errorf("failed to parse cpu range: %w", err)
}
cpuMap = make(map[int]int, numCPUs)
for i := 0; i < numCPUs; i++ {
cpuMap[i] = i
}
}
// note: if some cores have been off-lined, this may cause an issue because 'perf' seems
// to still report events for those cores
newEvents := make([][]Event, numCPUs)
Expand All @@ -223,14 +253,18 @@ func coalesceEvents(allEvents []Event, scope string, granularity string, metadat
if cpu, err = strconv.Atoi(event.CPU); err != nil {
return
}
// if cpu is not in cpuList, don't add it to any lists
if !slices.Contains(cpuList, cpu) {
continue
}
// handle case where perf returns events for off-lined cores
if cpu > len(newEvents)-1 {
cpusToAdd := len(newEvents) + 1 - cpu
for range cpusToAdd {
newEvents = append(newEvents, make([]Event, 0, len(allEvents)/numCPUs))
}
}
newEvents[cpu] = append(newEvents[cpu], event)
newEvents[cpuMap[cpu]] = append(newEvents[cpuMap[cpu]], event)
}
coalescedEvents = append(coalescedEvents, newEvents...)
default:
Expand Down
50 changes: 49 additions & 1 deletion cmd/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ var (
flagFilter string
flagCount int
flagRefresh int
flagCpuRange string
// output format options
flagGranularity string
flagOutputFormat []string
Expand Down Expand Up @@ -130,6 +131,7 @@ const (
flagFilterName = "filter"
flagCountName = "count"
flagRefreshName = "refresh"
flagCpuRangeName = "cpus"

flagGranularityName = "granularity"
flagOutputFormatName = "format"
Expand Down Expand Up @@ -183,6 +185,7 @@ func init() {
Cmd.Flags().StringVar(&flagFilter, flagFilterName, "", "")
Cmd.Flags().IntVar(&flagCount, flagCountName, 5, "")
Cmd.Flags().IntVar(&flagRefresh, flagRefreshName, 30, "")
Cmd.Flags().StringVar(&flagCpuRange, flagCpuRangeName, "", "")

Cmd.Flags().StringVar(&flagGranularity, flagGranularityName, granularitySystem, "")
Cmd.Flags().StringSliceVar(&flagOutputFormat, flagOutputFormatName, []string{formatCSV}, "")
Expand Down Expand Up @@ -266,6 +269,10 @@ func getFlagGroups() []common.FlagGroup {
Name: flagRefreshName,
Help: "number of seconds to run before refreshing the \"hot\" or \"filtered\" process or cgroup list. If 0, the list will not be refreshed.",
},
{
Name: flagCpuRangeName,
Help: "comma separated list of CPU cores to monitor. If not provided, all cores will be monitored.",
},
}
groups = append(groups, common.FlagGroup{
GroupName: "Collection Options",
Expand Down Expand Up @@ -375,6 +382,9 @@ func validateFlags(cmd *cobra.Command, args []string) error {
if cmd.Flags().Lookup(flagCountName).Changed {
return common.FlagValidationError(cmd, "count is not supported with an application argument")
}
if cmd.Flags().Lookup(flagCpuRangeName).Changed {
return common.FlagValidationError(cmd, "core range is not supported with an application argument")
}
}
// confirm valid duration
if cmd.Flags().Lookup(flagDurationName).Changed && flagDuration != 0 && flagDuration < flagPerfPrintInterval {
Expand Down Expand Up @@ -461,6 +471,44 @@ func validateFlags(cmd *cobra.Command, args []string) error {
return common.FlagValidationError(cmd, fmt.Sprintf("refresh must be greater than or equal to the event collection interval (%d)", flagPerfPrintInterval))
}
}
// cpu range changed
if len(flagCpuRange) > 0 {
if cmd.Flags().Lookup(flagGranularityName).Changed && flagGranularity != granularityCPU {
return common.FlagValidationError(cmd, fmt.Sprintf("cpu range can only be specified when granularity is %s. Current granularity is %s.", granularityCPU, flagGranularity))
}
flagGranularity = granularityCPU // set granularity to cpu if cpu range is specified
if flagCpuRange == "" {
return common.FlagValidationError(cmd, "cpu range must be specified")
} else {
if flagCpuRange == "all" {
flagCpuRange = "" // treat "all" as empty
} else {
// validate cpu range
cpuList, err := util.SelectiveIntRangeToIntList(flagCpuRange)
numCpus := len(cpuList)
if err != nil {
return common.FlagValidationError(cmd, fmt.Sprintf("invalid cpu range: %s", flagCpuRange))
}
if numCpus == 0 {
return common.FlagValidationError(cmd, fmt.Sprintf("cpu range must contain at least one CPU, got: %s", flagCpuRange))
}
// check if any entries in the cpu range are duplicates
seen := make(map[int]bool)
for _, cpu := range cpuList {
if seen[cpu] {
return common.FlagValidationError(cmd, fmt.Sprintf("duplicate CPU in cpu range: %s", flagCpuRange))
}
seen[cpu] = true
}
// check if any entries in the cpu range are out of bounds
//for _, cpu := range cpuList {
// if cpu < 0 || cpu >= util.GetNumCpus() {
// return common.FlagValidationError(cmd, fmt.Sprintf("cpu %d in cpu range is out of bounds, must be between 0 and %d", cpu, util.GetNumCpus()-1))
// }
//}
}
}
}
// output options
// confirm valid granularity
if cmd.Flags().Lookup(flagGranularityName).Changed && !slices.Contains(granularityOptions, flagGranularity) {
Expand Down Expand Up @@ -1298,7 +1346,7 @@ func collectOnTarget(targetContext *targetContext, localTempDir string, localOut
}
}
var perfCommand *exec.Cmd
perfCommand, err = getPerfCommand(targetContext.perfPath, targetContext.groupDefinitions, pids, cids)
perfCommand, err = getPerfCommand(targetContext.perfPath, targetContext.groupDefinitions, pids, cids, flagCpuRange)
if err != nil {
err = fmt.Errorf("failed to get perf command: %w", err)
_ = statusUpdate(myTarget.GetName(), fmt.Sprintf("Error: %s", err.Error()))
Expand Down
9 changes: 6 additions & 3 deletions cmd/metrics/perf.go
Original file line number Diff line number Diff line change
Expand Up @@ -94,10 +94,13 @@ func getPerfPath(myTarget target.Target, localPerfPath string) (string, error) {
// Returns:
// - args: The command arguments for the 'perf stat' command.
// - err: An error, if any.
func getPerfCommandArgs(pids []string, cgroups []string, timeout int, eventGroups []GroupDefinition) (args []string, err error) {
func getPerfCommandArgs(pids []string, cgroups []string, timeout int, eventGroups []GroupDefinition, cpuRange string) (args []string, err error) {
// -I: print interval in ms
// -j: json formatted event output
args = append(args, "stat", "-I", fmt.Sprintf("%d", flagPerfPrintInterval*1000), "-j")
if cpuRange != "" {
args = append(args, "-C", cpuRange) // collect only for these cpus
}
switch flagScope {
case scopeSystem:
args = append(args, "-a") // system-wide collection
Expand Down Expand Up @@ -133,7 +136,7 @@ func getPerfCommandArgs(pids []string, cgroups []string, timeout int, eventGroup

// getPerfCommand is responsible for assembling the command that will be
// executed to collect event data
func getPerfCommand(perfPath string, eventGroups []GroupDefinition, pids []string, cids []string) (*exec.Cmd, error) {
func getPerfCommand(perfPath string, eventGroups []GroupDefinition, pids []string, cids []string, cpuRange string) (*exec.Cmd, error) {
var duration int
switch flagScope {
case scopeSystem:
Expand All @@ -148,7 +151,7 @@ func getPerfCommand(perfPath string, eventGroups []GroupDefinition, pids []strin
duration = 0
}

args, err := getPerfCommandArgs(pids, cids, duration, eventGroups)
args, err := getPerfCommandArgs(pids, cids, duration, eventGroups, cpuRange)
if err != nil {
err = fmt.Errorf("failed to assemble perf args: %v", err)
return nil, err
Expand Down