diff --git a/scheds/rust/scx_rusty/src/bpf/main.bpf.c b/scheds/rust/scx_rusty/src/bpf/main.bpf.c
index 2e8cf070a3..b5bce47be1 100644
--- a/scheds/rust/scx_rusty/src/bpf/main.bpf.c
+++ b/scheds/rust/scx_rusty/src/bpf/main.bpf.c
@@ -198,6 +198,7 @@ struct {
         __uint(map_flags, 0);
 } node_data SEC(".maps");
 
+
 struct lock_wrapper {
         struct bpf_spin_lock lock;
 };
@@ -302,6 +303,7 @@ static void task_load_adj(struct task_ctx *taskc,
 {
         taskc->runnable = runnable;
         ravg_accumulate(&taskc->dcyc_rd, taskc->runnable, now, load_half_life);
+        ravg_accumulate(&taskc->l3_rd, taskc->l3_traffic, now, load_half_life);
 }
 
 static struct bucket_ctx *lookup_dom_bucket(dom_ptr dom_ctx,
@@ -343,7 +345,7 @@ static u64 scale_inverse_fair(u64 value, u64 weight)
         return value * 100 / weight;
 }
 
-static void dom_dcycle_adj(dom_ptr domc, u32 weight, u64 now, bool runnable)
+static void dom_dcycle_adj(dom_ptr domc, u32 weight, u64 now, bool runnable, s64 l3_adj)
 {
         struct bucket_ctx *bucket;
         struct lock_wrapper *lockw;
@@ -363,19 +365,22 @@ static void dom_dcycle_adj(dom_ptr domc, u32 weight, u64 now, bool runnable)
 
         bpf_spin_lock(&lockw->lock);
         bucket->dcycle += adj;
+        bucket->l3 += l3_adj;
         ravg_accumulate(&bucket->rd, bucket->dcycle, now, load_half_life);
+        ravg_accumulate(&bucket->l3_rd, bucket->l3, now, load_half_life);
         bpf_spin_unlock(&lockw->lock);
 
-        if (adj < 0 && (s64)bucket->dcycle < 0)
-                scx_bpf_error("cpu%d dom%u bucket%u load underflow (dcycle=%lld adj=%lld)",
+        if ((adj < 0 && (s64)bucket->dcycle < 0) || ((s64)bucket->l3 - l3_adj < 0))
+                scx_bpf_error("cpu%d dom%u bucket%u load underflow (dcycle=%lld l3=%lld adj=%lld l3_adj=%lld)",
                               bpf_get_smp_processor_id(), dom_id, bucket_idx,
-                              bucket->dcycle, adj);
+                              bucket->dcycle, bucket->l3, adj, l3_adj);
 
         if (debug >=2 && (!domc->dbg_dcycle_printed_at ||
                 now - domc->dbg_dcycle_printed_at >= 1000000000)) {
-                bpf_printk("DCYCLE ADJ dom=%u bucket=%u adj=%lld dcycle=%u avg_dcycle=%llu",
+                bpf_printk("ADJ dom=%u bucket=%u adj=%lld dcycle=%u avg_dcycle=%llu avg_l3=%llu",
                            dom_id, bucket_idx, adj, bucket->dcycle,
-                           ravg_read(&bucket->rd, now, load_half_life) >> RAVG_FRAC_BITS);
+                           ravg_read(&bucket->rd, now, load_half_life) >> RAVG_FRAC_BITS,
+                           ravg_read(&bucket->l3_rd, now, load_half_life) >> RAVG_FRAC_BITS);
                 domc->dbg_dcycle_printed_at = now;
         }
 }
@@ -387,8 +392,9 @@ static void dom_dcycle_xfer_task(struct task_struct *p, struct task_ctx *taskc,
         struct bucket_ctx *from_bucket, *to_bucket;
         u32 idx = 0, weight = taskc->weight;
         struct lock_wrapper *from_lockw, *to_lockw;
-        struct ravg_data task_dcyc_rd;
+        struct ravg_data task_dcyc_rd, task_l3_rd;
         u64 from_dcycle[2], to_dcycle[2], task_dcycle;
+        u64 from_l3[2], to_l3[2], task_l3;
 
         from_lockw = lookup_dom_bkt_lock(from_domc->id, weight);
         to_lockw = lookup_dom_bkt_lock(to_domc->id, weight);
@@ -409,42 +415,63 @@ static void dom_dcycle_xfer_task(struct task_struct *p, struct task_ctx *taskc,
          */
         ravg_accumulate(&taskc->dcyc_rd, taskc->runnable, now, load_half_life);
         task_dcyc_rd = taskc->dcyc_rd;
-        if (debug >= 2)
+        task_l3_rd = taskc->l3_rd;
+        if (debug >= 2) {
                 task_dcycle = ravg_read(&task_dcyc_rd, now, load_half_life);
+                task_l3 = ravg_read(&task_l3_rd, now, load_half_life);
+        }
 
         /* transfer out of @from_domc */
         bpf_spin_lock(&from_lockw->lock);
-        if (taskc->runnable)
+        if (taskc->runnable) {
                 from_bucket->dcycle--;
+                from_bucket->l3 -= taskc->l3_traffic;
+        }
 
-        if (debug >= 2)
+        if (debug >= 2) {
                 from_dcycle[0] = ravg_read(&from_bucket->rd, now, load_half_life);
+                from_l3[0] = ravg_read(&from_bucket->l3_rd, now, load_half_life);
+        }
 
         ravg_transfer(&from_bucket->rd, from_bucket->dcycle,
                       &task_dcyc_rd, taskc->runnable, load_half_life, false);
+        ravg_transfer(&from_bucket->l3_rd, from_bucket->l3,
+                      &task_l3_rd, taskc->l3_traffic, load_half_life, false);
 
-        if (debug >= 2)
+
+        if (debug >= 2) {
                 from_dcycle[1] = ravg_read(&from_bucket->rd, now, load_half_life);
+                from_l3[1] = ravg_read(&from_bucket->l3_rd, now, load_half_life);
+        }
 
         bpf_spin_unlock(&from_lockw->lock);
 
         /* transfer into @to_domc */
         bpf_spin_lock(&to_lockw->lock);
-        if (taskc->runnable)
+        if (taskc->runnable) {
                 to_bucket->dcycle++;
+                to_bucket->l3 += taskc->l3_traffic;
+        }
 
-        if (debug >= 2)
+        if (debug >= 2) {
                 to_dcycle[0] = ravg_read(&to_bucket->rd, now, load_half_life);
+                to_l3[0] = ravg_read(&to_bucket->l3_rd, now, load_half_life);
+        }
 
         ravg_transfer(&to_bucket->rd, to_bucket->dcycle,
                       &task_dcyc_rd, taskc->runnable, load_half_life, true);
+        ravg_transfer(&to_bucket->l3_rd, to_bucket->l3,
+                      &task_l3_rd, taskc->l3_traffic, load_half_life, true);
 
-        if (debug >= 2)
+
+        if (debug >= 2) {
                 to_dcycle[1] = ravg_read(&to_bucket->rd, now, load_half_life);
+                to_l3[1] = ravg_read(&to_bucket->l3_rd, now, load_half_life);
+        }
 
         bpf_spin_unlock(&to_lockw->lock);
 
-        if (debug >= 2)
+        if (debug >= 2) {
                 bpf_printk("XFER DCYCLE dom%u->%u task=%lu from=%lu->%lu to=%lu->%lu",
                            from_domc->id, to_domc->id,
                            task_dcycle >> RAVG_FRAC_BITS,
@@ -452,6 +479,14 @@ static void dom_dcycle_xfer_task(struct task_struct *p, struct task_ctx *taskc,
                            from_dcycle[1] >> RAVG_FRAC_BITS,
                            to_dcycle[0] >> RAVG_FRAC_BITS,
                            to_dcycle[1] >> RAVG_FRAC_BITS);
+                bpf_printk("XFER L3 dom%u->%u task=%lu from=%lu->%lu to=%lu->%lu",
+                           from_domc->id, to_domc->id,
+                           task_l3 >> RAVG_FRAC_BITS,
+                           from_l3[0] >> RAVG_FRAC_BITS,
+                           from_l3[1] >> RAVG_FRAC_BITS,
+                           to_l3[0] >> RAVG_FRAC_BITS,
+                           to_l3[1] >> RAVG_FRAC_BITS);
+        }
 }
 
 static u64 dom_min_vruntime(dom_ptr domc)
@@ -572,6 +607,70 @@ static void refresh_tune_params(void)
         }
 }
 
+void *bpf_cast_to_kern_ctx(void *) __ksym;
+
+/*
+ * Performance counter callback.
+ */
+SEC("perf_event")
+int read_sample(struct bpf_perf_event_data_kern __kptr *arg)
+{
+        struct bpf_perf_event_data_kern *ctx, a;
+        union perf_mem_data_src data_src;
+        struct perf_sample_data data;
+        struct task_ctx *taskc;
+        struct task_struct *p;
+        int ret;
+
+        ctx = bpf_cast_to_kern_ctx(arg);
+
+        if ((ret = bpf_probe_read_kernel(&a, sizeof(a), ctx))) {
+                scx_bpf_error("[0] %s: bpf_probe_read_kernel failed", __func__);
+                return -EACCES;
+        }
+
+        if ((ret = bpf_probe_read_kernel(&data, sizeof(data), a.data))) {
+                scx_bpf_error("%s: bpf_probe_read_kernel failed", __func__);
+                return -EACCES;
+        }
+
+        data_src = ctx->data->data_src;
+        if (!ctx->data->sample_flags || data_src.mem_op == 1)
+                return 0;
+
+        p = bpf_get_current_task_btf();
+        if (!p) {
+                bpf_printk("could not retrieve current task");
+                return 0;
+        }
+
+        /* Benign failure for tasks not in scx, e.g., idle. */
+        taskc = try_lookup_task_ctx(p);
+        if (!taskc)
+                return 0;
+
+        /* Require level 3 because it is really spammy. */
+        if (debug >= 3) {
+                bpf_printk("(1/2) %s\t(0x%lx,0x%lx,0x%lx) ",
+                           data_src.mem_op == 2 ? "STORE" : (data_src.mem_op == 4 ? "LOAD" : "UNKNOWN"),
+                           data_src.mem_lvl_num,
+                           data_src.mem_snoop,
+                           data_src.mem_remote);
+                bpf_printk("(2/2) [%llx, %llx] 0x%lx",
+                           ctx->data->phys_addr,
+                           ctx->data->addr,
+                           data_src.mem_dtlb);
+        }
+
+        taskc->l3_next += 1;
+
+        return 0;
+}
+
 static u64 min(u64 a, u64 b)
 {
         return a <= b ? a : b;
@@ -1431,7 +1530,7 @@ void BPF_STRUCT_OPS(rusty_runnable, struct task_struct *p, u64 enq_flags)
         wakee_ctx->is_kworker = p->flags & PF_WQ_WORKER;
 
         task_load_adj(wakee_ctx, now, true);
-        dom_dcycle_adj(wakee_ctx->domc, wakee_ctx->weight, now, true);
+        dom_dcycle_adj(wakee_ctx->domc, wakee_ctx->weight, now, true, wakee_ctx->l3_traffic);
 
         if (fifo_sched)
                 return;
@@ -1554,7 +1653,11 @@ void BPF_STRUCT_OPS(rusty_quiescent, struct task_struct *p, u64 deq_flags)
                 return;
 
         task_load_adj(taskc, now, false);
-        dom_dcycle_adj(domc, taskc->weight, now, false);
+        dom_dcycle_adj(domc, taskc->weight, now, false, -taskc->l3_traffic);
+
+        /* Update our current L3 traffic prediction. */
+        taskc->l3_traffic = taskc->l3_next;
+        taskc->l3_next = 0;
 
         if (fifo_sched)
                 return;
diff --git a/scheds/rust/scx_rusty/src/bpf/types.h b/scheds/rust/scx_rusty/src/bpf/types.h
index c5e0b8125d..ce9abc3389 100644
--- a/scheds/rust/scx_rusty/src/bpf/types.h
+++ b/scheds/rust/scx_rusty/src/bpf/types.h
@@ -63,6 +63,10 @@ struct task_ctx {
         u32 pid;
 
         struct ravg_data dcyc_rd;
+        struct ravg_data l3_rd;
+
+        u64 l3_traffic;
+        u64 l3_next;
 };
 
 /* XXXETSAL Same rationale as for dom_ptr. Remove once we dump Clang 18.*/
@@ -76,6 +80,9 @@ typedef struct task_ctx *task_ptr;
 struct bucket_ctx {
         u64 dcycle;
         struct ravg_data rd;
+
+        u64 l3;
+        struct ravg_data l3_rd;
 };
 
 struct dom_active_tasks {
diff --git a/scheds/rust/scx_rusty/src/load_balance.rs b/scheds/rust/scx_rusty/src/load_balance.rs
index cb6b0d5225..24352bedba 100644
--- a/scheds/rust/scx_rusty/src/load_balance.rs
+++ b/scheds/rust/scx_rusty/src/load_balance.rs
@@ -460,6 +460,7 @@ pub struct LoadBalancer<'a, 'b> {
 
     lb_apply_weight: bool,
     balance_load: bool,
+    l3_balancing: bool,
 }
 
 // Verify that the number of buckets is a factor of the maximum weight to
@@ -476,6 +477,7 @@ impl<'a, 'b> LoadBalancer<'a, 'b> {
         skip_kworkers: bool,
         lb_apply_weight: bool,
         balance_load: bool,
+        l3_balancing: bool,
     ) -> Self {
         Self {
             skel,
@@ -489,6 +491,7 @@ impl<'a, 'b> LoadBalancer<'a, 'b> {
             balance_load,
 
             dom_group,
+            l3_balancing,
         }
     }
 
@@ -573,7 +576,7 @@ impl<'a, 'b> LoadBalancer<'a, 'b> {
 
         for bucket in 0..NUM_BUCKETS {
             let bucket_ctx = &dom_ctx.buckets[bucket as usize];
-            let rd = &bucket_ctx.rd;
+            let rd = if self.l3_balancing { &bucket_ctx.l3_rd } else { &bucket_ctx.rd };
             let duty_cycle = ravg_read(
                 rd.val,
                 rd.val_at,
@@ -653,7 +656,7 @@ impl<'a, 'b> LoadBalancer<'a, 'b> {
                 continue;
             }
 
-            let rd = &taskc.dcyc_rd;
+            let rd = if self.l3_balancing { &taskc.l3_rd } else { &taskc.dcyc_rd };
             let mut load = ravg_read(
                 rd.val,
                 rd.val_at,
@@ -769,6 +772,10 @@ impl<'a, 'b> LoadBalancer<'a, 'b> {
         }
 
         let load = *(task.load);
+        if load == 0.0f64 {
+            return Ok(None);
+        }
+
         let taskc_p = task.taskc_p;
         task.migrated.set(true);
         std::mem::swap(&mut push_dom.tasks, &mut SortedVec::from_unsorted(tasks));
diff --git a/scheds/rust/scx_rusty/src/main.rs b/scheds/rust/scx_rusty/src/main.rs
index 39c658c83a..9e0d2f58ac 100644
--- a/scheds/rust/scx_rusty/src/main.rs
+++ b/scheds/rust/scx_rusty/src/main.rs
@@ -15,6 +15,9 @@ use tuner::Tuner;
 pub mod load_balance;
 use load_balance::LoadBalancer;
 
+pub mod perf;
+use perf::init_perf_counters;
+
 mod stats;
 use std::collections::BTreeMap;
 use std::mem::MaybeUninit;
@@ -227,6 +230,10 @@ struct Opts {
     /// prioritize energy efficiency. When in doubt, use 0 or 1024.
     #[clap(long, default_value = "0")]
     perf: u32,
+
+    /// Use L3 traffic sampling for load balancing instead of CPU load.
+    #[clap(long, default_value = "false")]
+    l3_balancing: bool,
 }
 
 fn read_cpu_busy_and_total(reader: &procfs::ProcReader) -> Result<(u64, u64)> {
@@ -348,6 +355,8 @@ struct Scheduler<'a> {
 
     tuner: Tuner,
     stats_server: StatsServer,
+    _pefds: Vec<(i32, libbpf_rs::Link)>,
+    l3_balancing: bool,
 }
 
 impl<'a> Scheduler<'a> {
@@ -444,7 +453,14 @@ impl<'a> Scheduler<'a> {
 
         // Attach.
         let mut skel = scx_ops_load!(skel, rusty, uei)?;
+
+        let mut pefds: Vec<(i32, libbpf_rs::Link)> = vec![];
+        for i in 0..32 {
+            pefds.push(init_perf_counters(&mut skel, &i)?);
+        }
+
         let struct_ops = Some(scx_ops_attach!(skel, rusty)?);
+
         let stats_server = StatsServer::new(stats::server_data()).launch()?;
 
         for (id, dom) in domains.doms().iter() {
@@ -466,6 +482,7 @@ impl<'a> Scheduler<'a> {
             tune_interval: Duration::from_secs_f64(opts.tune_interval),
             balance_load: !opts.no_load_balance,
             balanced_kworkers: opts.balanced_kworkers,
+            l3_balancing: opts.l3_balancing,
 
             dom_group: domains.clone(),
             proc_reader,
@@ -482,6 +499,7 @@ impl<'a> Scheduler<'a> {
                 opts.slice_us_overutil * 1000,
             )?,
             stats_server,
+            _pefds: pefds,
         })
     }
 
@@ -560,6 +578,7 @@ impl<'a> Scheduler<'a> {
             self.balanced_kworkers,
             self.tuner.fully_utilized,
             self.balance_load,
+            self.l3_balancing,
         );
 
         lb.load_balance()?;
diff --git a/scheds/rust/scx_rusty/src/perf.rs b/scheds/rust/scx_rusty/src/perf.rs
new file mode 100644
index 0000000000..44616de1a5
--- /dev/null
+++ b/scheds/rust/scx_rusty/src/perf.rs
@@ -0,0 +1,230 @@
+extern crate libc;
+
+use crate::bpf_skel::*;
+
+use libbpf_rs;
+use libbpf_rs::AsRawLibbpf;
+use libbpf_rs::libbpf_sys;
+use std::io;
+use std::mem;
+
+// Expanded from systing's code: www.github.com/josefbacik/systing
+
+const _PERF_TYPE_HARDWARE: u32 = 0x0;
+const _PERF_TYPE_SOFTWARE: u32 = 0x1;
+const _PERF_TYPE_RAW: u32 = 0x3;
+const _PERF_TYPE_AMD_IBS: u32 = 0xb;
+
+const _PERF_COUNT_HW_CPU_CYCLES: u64 = 0;
+const _PERF_COUNT_HW_CACHE_REFERENCES: u64 = 2;
+const _PERF_COUNT_HW_CACHE_MISSES: u64 = 3;
+const _PERF_COUNT_HW_STALLED_CYCLES_FRONTEND: u64 = 7;
+const _PERF_COUNT_HW_STALLED_CYCLES_BACKEND: u64 = 8;
+
+const _PERF_COUNT_SW_CPU_CLOCK: u64 = 0;
+
+// WARNING: These are not guaranteed to be correct because the layout of the bitfield
+// in the perf_event_attr C struct that contains them is not guaranteed by the C standard.
+const _PERF_SAMPLE_FLAG_DISABLED: u64 = 1 << 0;
+const _PERF_SAMPLE_FLAG_INHERIT: u64 = 1 << 1;
+const _PERF_SAMPLE_FLAG_PINNED: u64 = 1 << 2;
+const _PERF_SAMPLE_FLAG_EXCLUSIVE: u64 = 1 << 3;
+const _PERF_SAMPLE_FLAG_EXCLUDE_USER: u64 = 1 << 4;
+const _PERF_SAMPLE_FLAG_EXCLUDE_KERNEL: u64 = 1 << 5;
+const _PERF_SAMPLE_FLAG_EXCLUDE_HV: u64 = 1 << 6;
+const _PERF_SAMPLE_FLAG_EXCLUDE_IDLE: u64 = 1 << 7;
+const _PERF_SAMPLE_FLAG_MMAP: u64 = 1 << 8;
+const _PERF_SAMPLE_FLAG_COMM: u64 = 1 << 9;
+const _PERF_SAMPLE_FLAG_FREQ: u64 = 1 << 10;
+const _PERF_SAMPLE_FLAG_INHERIT_STAT: u64 = 1 << 11;
+const _PERF_SAMPLE_FLAG_ENABLE_ON_EXEC: u64 = 1 << 12;
+const _PERF_SAMPLE_FLAG_TASK: u64 = 1 << 13;
+const _PERF_SAMPLE_FLAG_WATERMARK: u64 = 1 << 14;
+const _PERF_SAMPLE_FLAG_PRECISE_IP: u64 = 1 << 15;
+const _PERF_SAMPLE_FLAG_MMAP_DATA: u64 = 1 << 17;
+const _PERF_SAMPLE_FLAG_ID_ALL: u64 = 1 << 18;
+const _PERF_SAMPLE_FLAG_EXCLUDE_HOST: u64 = 1 << 19;
+const _PERF_SAMPLE_FLAG_EXCLUDE_GUEST: u64 = 1 << 20;
+const _PERF_SAMPLE_FLAG_EXCLUDE_CALLCHAIN_KERNEL: u64 = 1 << 21;
+const _PERF_SAMPLE_FLAG_EXCLUDE_CALLCHAIN_USER: u64 = 1 << 22;
+const _PERF_SAMPLE_FLAG_MMAP2: u64 = 1 << 23;
+const _PERF_SAMPLE_FLAG_COMM_EXEC: u64 = 1 << 24;
+const _PERF_SAMPLE_FLAG_USE_CLOCKID: u64 = 1 << 25;
+const _PERF_SAMPLE_FLAG_WRITE_BACKWARD: u64 = 1 << 26;
+const _PERF_SAMPLE_FLAG_NAMESPACES: u64 = 1 << 27;
+const _PERF_SAMPLE_FLAG_KSYMBOL: u64 = 1 << 28;
+const _PERF_SAMPLE_FLAG_BPF_SYMBOL: u64 = 1 << 29;
+const _PERF_SAMPLE_FLAG_AUX_OUTPUT: u64 = 1 << 30;
+const _PERF_SAMPLE_FLAG_CGROUP: u64 = 1 << 31;
+const _PERF_SAMPLE_FLAG_TEXT_POKE: u64 = 1 << 32;
+const _PERF_SAMPLE_FLAG_BUILD_ID: u64 = 1 << 33;
+const _PERF_SAMPLE_FLAG_INHERIT_THREAD: u64 = 1 << 34;
+const _PERF_SAMPLE_FLAG_REMOVE_ON_EXEC: u64 = 1 << 35;
+const _PERF_SAMPLE_FLAG_SIGTRAP: u64 = 1 << 36;
+
+const _PERF_SAMPLE_IP: u64 = 1 << 0;
+const _PERF_SAMPLE_TID: u64 = 1 << 1;
+const _PERF_SAMPLE_TIME: u64 = 1 << 2;
+const _PERF_SAMPLE_ADDR: u64 = 1 << 3;
+const _PERF_SAMPLE_READ: u64 = 1 << 4;
+const _PERF_SAMPLE_CALLCHAIN: u64 = 1 << 5;
+const _PERF_SAMPLE_ID: u64 = 1 << 6;
+const _PERF_SAMPLE_CPU: u64 = 1 << 7;
+const _PERF_SAMPLE_PERIOD: u64 = 1 << 8;
+const _PERF_SAMPLE_STREAM_ID: u64 = 1 << 9;
+const _PERF_SAMPLE_RAW: u64 = 1 << 10;
+const _PERF_SAMPLE_BRANCH_STACK: u64 = 1 << 11;
+const _PERF_SAMPLE_REGS_USER: u64 = 1 << 12;
+const _PERF_SAMPLE_STACK_USER: u64 = 1 << 13;
+const _PERF_SAMPLE_WEIGHT: u64 = 1 << 14;
+const _PERF_SAMPLE_DATA_SRC: u64 = 1 << 15;
+const _PERF_SAMPLE_IDENTIFIER: u64 = 1 << 16;
+const _PERF_SAMPLE_TRANSACTION: u64 = 1 << 17;
+const _PERF_SAMPLE_REGS_INTR: u64 = 1 << 18;
+const _PERF_SAMPLE_PHYS_ADDR: u64 = 1 << 19;
+const _PERF_SAMPLE_PHYS_AUX: u64 = 1 << 20;
+const _PERF_SAMPLE_PHYS_CGROUP: u64 = 1 << 21;
+const _PERF_SAMPLE_DATA_PAGE_SIZE: u64 = 1 << 22;
+const _PERF_SAMPLE_CODE_PAGE_SIZE: u64 = 1 << 23;
+const _PERF_SAMPLE_WEIGHT_STRUCT: u64 = 1 << 24;
+
+#[repr(C)]
+union sample_un {
+    pub sample_period: u64,
+    pub sample_freq: u64,
+}
+
+#[repr(C)]
+union wakeup_un {
+    pub wakeup_events: u32,
+    pub wakeup_watermark: u32,
+}
+
+#[repr(C)]
+union bp_1_un {
+    pub bp_addr: u64,
+    pub kprobe_func: u64,
+    pub uprobe_path: u64,
+    pub config1: u64,
+}
+
+#[repr(C)]
+union bp_2_un {
+    pub bp_len: u64,
+    pub kprobe_addr: u64,
+    pub probe_offset: u64,
+    pub config2: u64,
+}
+
+#[repr(C)]
+#[allow(non_camel_case_types)]
+struct perf_event_attr {
+    pub _type: u32,
+    pub size: u32,
+    pub config: u64,
+    pub sample: sample_un,
+    pub sample_type: u64,
+    pub read_format: u64,
+    pub flags: u64,
+    pub wakeup: wakeup_un,
+    pub bp_type: u32,
+    pub bp_1: bp_1_un,
+    pub bp_2: bp_2_un,
+    pub branch_sample_type: u64,
+    pub sample_regs_user: u64,
+    pub sample_stack_user: u32,
+    pub clockid: i32,
+    pub sample_regs_intr: u64,
+    pub aux_watermark: u32,
+    pub sample_max_stack: u16,
+    pub __reserved_2: u16,
+    pub aux_sample_size: u32,
+    pub __reserved_3: u32,
+}
+
+extern "C" {
+    fn syscall(number: libc::c_long, ...) -> libc::c_long;
+}
+
+fn perf_event_open(
+    hw_event: &perf_event_attr,
+    pid: libc::pid_t,
+    cpu: libc::c_int,
+    group_fd: libc::c_int,
+    flags: libc::c_ulong,
+) -> libc::c_long {
+    unsafe {
+        syscall(
+            libc::SYS_perf_event_open,
+            hw_event as *const perf_event_attr,
+            pid,
+            cpu,
+            group_fd,
+            flags,
+        )
+    }
+}
+
+// XXXETSAL: Comment below is taken verbatim from the original systing code
+// We're just doing this until the libbpf-rs crate gets updated with my patch.
+trait LibbpfPerfOptions {
+    fn attach_perf_event_with_opts(
+        &self,
+        pefd: i32,
+    ) -> Result<libbpf_rs::Link, libbpf_rs::Error>;
+}
+
+impl LibbpfPerfOptions for libbpf_rs::ProgramMut<'_> {
+    fn attach_perf_event_with_opts(
+        &self,
+        pefd: i32,
+    ) -> Result<libbpf_rs::Link, libbpf_rs::Error> {
+        let mut opts = libbpf_sys::bpf_perf_event_opts::default();
+        opts.bpf_cookie = 0;
+        opts.sz = mem::size_of::<libbpf_sys::bpf_perf_event_opts>() as u64;
+        let ptr = unsafe {
+            libbpf_sys::bpf_program__attach_perf_event_opts(
+                self.as_libbpf_object().as_ptr(),
+                pefd,
+                &opts as *const _ as *const _,
+            )
+        };
+        let ret = unsafe { libbpf_sys::libbpf_get_error(ptr as *const _) };
+        if ret != 0 {
+            return Err(libbpf_rs::Error::from_raw_os_error(-ret as i32));
+        }
+        let ptr = unsafe { std::ptr::NonNull::new_unchecked(ptr) };
+        let link = unsafe { libbpf_rs::Link::from_ptr(ptr) };
+        Ok(link)
+    }
+}
+
+pub fn init_perf_counters(skel: &mut BpfSkel, cpu: &i32) -> Result<(i32, libbpf_rs::Link), libbpf_rs::Error> {
+    let buf: Vec<u8> = vec![0; mem::size_of::<perf_event_attr>()];
+    let mut attr = unsafe {
+        Box::<perf_event_attr>::from_raw(
+            buf.leak().as_mut_ptr() as *mut perf_event_attr
+        )
+    };
+
+    /*
+     * XXX Discover the counter instead of hardcoding it.
+     * Afterwards we can use any counter we care for.
+     */
+    attr._type = _PERF_TYPE_AMD_IBS;
+    attr.size = mem::size_of::<perf_event_attr>() as u32;
+    // Only mark L3 misses.
+    attr.config = 1 << 16;
+    attr.sample_type = _PERF_SAMPLE_CPU | _PERF_SAMPLE_IP | _PERF_SAMPLE_TID | _PERF_SAMPLE_DATA_SRC | _PERF_SAMPLE_PHYS_ADDR | _PERF_SAMPLE_ADDR;
+    attr.sample.sample_period = 1000;
+    attr.flags = 3 * _PERF_SAMPLE_FLAG_PRECISE_IP;
+
+    let pefd = perf_event_open(attr.as_ref(), -1, *cpu, -1, 0) as i32;
+    if pefd == -1 {
+        let os_error = io::Error::last_os_error();
+        return Err(libbpf_rs::Error::from(os_error));
+    }
+
+    let link = skel.progs.read_sample.attach_perf_event_with_opts(pefd);
+
+    Ok((pefd, link?))
+}