From 48fa2416c0bb30f05712a669bd650fd7252d5cbe Mon Sep 17 00:00:00 2001 From: Markus Stange Date: Wed, 14 Feb 2024 18:16:11 -0500 Subject: [PATCH] Add a way to record only the main thread on macOS. I was seeing too much overhead with lots of threads in a case where I was only interested in the main thread anyway. --- samply/src/mac/sampler.rs | 14 ++++++++----- samply/src/mac/task_profiler.rs | 31 +++++++++++++++++++--------- samply/src/main.rs | 6 ++++++ samply/src/shared/recording_props.rs | 1 + 4 files changed, 37 insertions(+), 15 deletions(-) diff --git a/samply/src/mac/sampler.rs b/samply/src/mac/sampler.rs index f6a0443c..a7ce9e97 100644 --- a/samply/src/mac/sampler.rs +++ b/samply/src/mac/sampler.rs @@ -4,6 +4,7 @@ use mach::port::mach_port_t; use std::mem; use std::path::{Path, PathBuf}; +use std::sync::Arc; use std::thread; use std::time::Duration; use std::time::SystemTime; @@ -33,8 +34,8 @@ pub struct TaskInit { pub struct Sampler { command_name: String, task_receiver: Receiver, - recording_props: RecordingProps, - conversion_props: ConversionProps, + recording_props: Arc, + conversion_props: Arc, } impl Sampler { @@ -55,8 +56,8 @@ impl Sampler { Sampler { command_name, task_receiver, - recording_props, - conversion_props, + recording_props: Arc::new(recording_props), + conversion_props: Arc::new(conversion_props), } } @@ -100,6 +101,8 @@ impl Sampler { &self.command_name, &mut profile, process_recycler.as_mut(), + self.recording_props.clone(), + self.conversion_props.clone(), ) .expect("couldn't create root TaskProfiler"); @@ -129,6 +132,8 @@ impl Sampler { &self.command_name, &mut profile, process_recycler.as_mut(), + self.recording_props.clone(), + self.conversion_props.clone(), ) { live_tasks.push(new_task); } else { @@ -164,7 +169,6 @@ impl Sampler { &mut profile, &mut stack_scratch_buffer, &mut unresolved_stacks, - self.conversion_props.fold_recursive_prefix, )?; if still_alive { live_tasks.push(task); diff --git a/samply/src/mac/task_profiler.rs
b/samply/src/mac/task_profiler.rs index 5e829986..59166039 100644 --- a/samply/src/mac/task_profiler.rs +++ b/samply/src/mac/task_profiler.rs @@ -22,6 +22,7 @@ use std::collections::{HashMap, HashSet}; use std::mem; use std::ops::{Deref, Range}; use std::path::{Path, PathBuf}; +use std::sync::Arc; use crate::shared::jit_category_manager::JitCategoryManager; use crate::shared::jit_function_recycler::JitFunctionRecycler; @@ -32,6 +33,7 @@ use crate::shared::lib_mappings::{ use crate::shared::marker_file::get_markers; use crate::shared::perf_map::try_load_perf_map; use crate::shared::process_sample_data::{MarkerSpanOnThread, ProcessSampleData}; +use crate::shared::recording_props::{ConversionProps, RecordingProps}; use crate::shared::recycling::{ProcessRecycler, ProcessRecyclingData, ThreadRecycler}; use crate::shared::timestamp_converter::TimestampConverter; use crate::shared::unresolved_samples::{UnresolvedSamples, UnresolvedStacks}; @@ -109,6 +111,8 @@ pub struct TaskProfiler { thread_recycler: Option, jit_function_recycler: Option, timestamp_converter: TimestampConverter, + recording_props: Arc, + conversion_props: Arc, } impl TaskProfiler { @@ -118,6 +122,8 @@ impl TaskProfiler { command_name: &str, profile: &mut Profile, mut process_recycler: Option<&mut ProcessRecycler>, + recording_props: Arc, + conversion_props: Arc, ) -> Result { let TaskInit { start_time_mono, @@ -141,7 +147,7 @@ impl TaskProfiler { }) .unwrap_or_else(|| command_name.to_string()); - let thread_acts = get_thread_list(task)?; + let thread_acts = get_thread_list(task, recording_props.main_thread_only)?; if thread_acts.is_empty() { return Err(SamplingError::Ignorable( "No threads", @@ -255,6 +261,8 @@ impl TaskProfiler { thread_recycler, jit_function_recycler, timestamp_converter, + recording_props, + conversion_props, }; task_profiler.process_lib_modifications(start_time_mono, initial_lib_mods, profile); @@ -262,7 +270,6 @@ impl TaskProfiler { Ok(task_profiler) } - 
#[allow(clippy::too_many_arguments)] pub fn sample( &mut self, now: Timestamp, @@ -271,7 +278,6 @@ impl TaskProfiler { profile: &mut Profile, stack_scratch_buffer: &mut Vec, unresolved_stacks: &mut UnresolvedStacks, - fold_recursive_prefix: bool, ) -> Result { let result = self.sample_impl( now, @@ -280,7 +286,6 @@ impl TaskProfiler { profile, stack_scratch_buffer, unresolved_stacks, - fold_recursive_prefix, ); match result { Ok(()) => Ok(true), @@ -303,7 +308,6 @@ impl TaskProfiler { } } - #[allow(clippy::too_many_arguments)] fn sample_impl( &mut self, now: Timestamp, @@ -312,7 +316,6 @@ impl TaskProfiler { profile: &mut Profile, stack_scratch_buffer: &mut Vec, unresolved_stacks: &mut UnresolvedStacks, - fold_recursive_prefix: bool, ) -> Result<(), SamplingError> { // First, check for any newly-loaded libraries. if let Ok(changes) = self.lib_info_manager.check_for_changes() { @@ -320,7 +323,7 @@ impl TaskProfiler { } // Enumerate threads. - let thread_acts = get_thread_list(self.task)?; + let thread_acts = get_thread_list(self.task, self.recording_props.main_thread_only)?; let previously_live_threads: HashSet<_> = self.live_threads.keys().cloned().collect(); let mut now_live_threads = HashSet::new(); for thread_act in thread_acts { @@ -368,7 +371,7 @@ impl TaskProfiler { stack_scratch_buffer, unresolved_stacks, &mut self.unresolved_samples, - fold_recursive_prefix, + self.conversion_props.fold_recursive_prefix, )?; if still_alive { now_live_threads.insert(thread_act); @@ -677,7 +680,10 @@ fn get_debug_frame(file_path: &str) -> Option { } } -fn get_thread_list(task: mach_port_t) -> Result, SamplingError> { +fn get_thread_list( + task: mach_port_t, + main_thread_only: bool, +) -> Result, SamplingError> { let mut thread_list: thread_act_port_array_t = std::ptr::null_mut(); let mut thread_count: mach_msg_type_number_t = Default::default(); unsafe { task_threads(task, &mut thread_list, &mut thread_count) } @@ -691,7 +697,7 @@ fn get_thread_list(task: mach_port_t) -> 
Result, SamplingError err => SamplingError::Ignorable("task_threads in get_thread_list", err), })?; - let thread_acts = + let mut thread_acts = unsafe { std::slice::from_raw_parts(thread_list, thread_count as usize) }.to_owned(); unsafe { @@ -704,6 +710,11 @@ fn get_thread_list(task: mach_port_t) -> Result, SamplingError .into_result() .map_err(|err| SamplingError::Fatal("mach_vm_deallocate in get_thread_list", err))?; + if main_thread_only { + // Keep only the main thread. It's always the first thread in the list. + thread_acts.truncate(1); + } + Ok(thread_acts) } diff --git a/samply/src/main.rs b/samply/src/main.rs index f03c71b3..a403c5b5 100644 --- a/samply/src/main.rs +++ b/samply/src/main.rs @@ -104,6 +104,11 @@ struct RecordArgs { #[arg(long, default_value = "1")] iteration_count: u32, + /// Reduce profiling overhead by only recording the main thread. + /// This option is only respected on macOS. + #[arg(long)] + main_thread_only: bool, + #[command(flatten)] conversion_args: ConversionArgs, @@ -242,6 +247,7 @@ impl RecordArgs { output_file: self.output.clone(), time_limit, interval, + main_thread_only: self.main_thread_only, } } diff --git a/samply/src/shared/recording_props.rs b/samply/src/shared/recording_props.rs index 8b7ec0af..fd996b4f 100644 --- a/samply/src/shared/recording_props.rs +++ b/samply/src/shared/recording_props.rs @@ -4,6 +4,7 @@ pub struct RecordingProps { pub output_file: PathBuf, pub time_limit: Option, pub interval: Duration, + pub main_thread_only: bool, } pub struct ConversionProps {