| // SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) |
| // Copyright (c) 2021 Facebook |
| // Copyright (c) 2021 Google |
| #include "vmlinux.h" |
| #include <bpf/bpf_helpers.h> |
| #include <bpf/bpf_tracing.h> |
| #include <bpf/bpf_core_read.h> |
| |
| #define MAX_LEVELS 10 // max cgroup hierarchy level: arbitrary |
| #define MAX_EVENTS 32 // max events per cgroup: arbitrary |
| |
| // NOTE: many of map and global data will be modified before loading |
| // from the userspace (perf tool) using the skeleton helpers. |
| |
| // single set of global perf events to measure |
| struct { |
| __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); |
| __uint(key_size, sizeof(__u32)); |
| __uint(value_size, sizeof(int)); |
| __uint(max_entries, 1); |
| } events SEC(".maps"); |
| |
| // from cgroup id to event index |
| struct { |
| __uint(type, BPF_MAP_TYPE_HASH); |
| __uint(key_size, sizeof(__u64)); |
| __uint(value_size, sizeof(__u32)); |
| __uint(max_entries, 1); |
| } cgrp_idx SEC(".maps"); |
| |
| // per-cpu event snapshots to calculate delta |
| struct { |
| __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); |
| __uint(key_size, sizeof(__u32)); |
| __uint(value_size, sizeof(struct bpf_perf_event_value)); |
| } prev_readings SEC(".maps"); |
| |
| // aggregated event values for each cgroup (per-cpu) |
| // will be read from the user-space |
| struct { |
| __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); |
| __uint(key_size, sizeof(__u32)); |
| __uint(value_size, sizeof(struct bpf_perf_event_value)); |
| } cgrp_readings SEC(".maps"); |
| |
| /* new kernel cgroup definition */ |
| struct cgroup___new { |
| int level; |
| struct cgroup *ancestors[]; |
| } __attribute__((preserve_access_index)); |
| |
| /* old kernel cgroup definition */ |
| struct cgroup___old { |
| int level; |
| u64 ancestor_ids[]; |
| } __attribute__((preserve_access_index)); |
| |
| const volatile __u32 num_events = 1; |
| const volatile __u32 num_cpus = 1; |
| const volatile int use_cgroup_v2 = 0; |
| |
| int enabled = 0; |
| int perf_subsys_id = -1; |
| |
| static inline __u64 get_cgroup_v1_ancestor_id(struct cgroup *cgrp, int level) |
| { |
| /* recast pointer to capture new type for compiler */ |
| struct cgroup___new *cgrp_new = (void *)cgrp; |
| |
| if (bpf_core_field_exists(cgrp_new->ancestors)) { |
| return BPF_CORE_READ(cgrp_new, ancestors[level], kn, id); |
| } else { |
| /* recast pointer to capture old type for compiler */ |
| struct cgroup___old *cgrp_old = (void *)cgrp; |
| |
| return BPF_CORE_READ(cgrp_old, ancestor_ids[level]); |
| } |
| } |
| |
| static inline int get_cgroup_v1_idx(__u32 *cgrps, int size) |
| { |
| struct task_struct *p = (void *)bpf_get_current_task(); |
| struct cgroup *cgrp; |
| register int i = 0; |
| __u32 *elem; |
| int level; |
| int cnt; |
| |
| if (perf_subsys_id == -1) { |
| #if __has_builtin(__builtin_preserve_enum_value) |
| perf_subsys_id = bpf_core_enum_value(enum cgroup_subsys_id, |
| perf_event_cgrp_id); |
| #else |
| perf_subsys_id = perf_event_cgrp_id; |
| #endif |
| } |
| cgrp = BPF_CORE_READ(p, cgroups, subsys[perf_subsys_id], cgroup); |
| level = BPF_CORE_READ(cgrp, level); |
| |
| for (cnt = 0; i < MAX_LEVELS; i++) { |
| __u64 cgrp_id; |
| |
| if (i > level) |
| break; |
| |
| // convert cgroup-id to a map index |
| cgrp_id = get_cgroup_v1_ancestor_id(cgrp, i); |
| elem = bpf_map_lookup_elem(&cgrp_idx, &cgrp_id); |
| if (!elem) |
| continue; |
| |
| cgrps[cnt++] = *elem; |
| if (cnt == size) |
| break; |
| } |
| |
| return cnt; |
| } |
| |
| static inline int get_cgroup_v2_idx(__u32 *cgrps, int size) |
| { |
| register int i = 0; |
| __u32 *elem; |
| int cnt; |
| |
| for (cnt = 0; i < MAX_LEVELS; i++) { |
| __u64 cgrp_id = bpf_get_current_ancestor_cgroup_id(i); |
| |
| if (cgrp_id == 0) |
| break; |
| |
| // convert cgroup-id to a map index |
| elem = bpf_map_lookup_elem(&cgrp_idx, &cgrp_id); |
| if (!elem) |
| continue; |
| |
| cgrps[cnt++] = *elem; |
| if (cnt == size) |
| break; |
| } |
| |
| return cnt; |
| } |
| |
| static int bperf_cgroup_count(void) |
| { |
| register __u32 idx = 0; // to have it in a register to pass BPF verifier |
| register int c = 0; |
| struct bpf_perf_event_value val, delta, *prev_val, *cgrp_val; |
| __u32 cpu = bpf_get_smp_processor_id(); |
| __u32 cgrp_idx[MAX_LEVELS]; |
| int cgrp_cnt; |
| __u32 key, cgrp; |
| long err; |
| |
| if (use_cgroup_v2) |
| cgrp_cnt = get_cgroup_v2_idx(cgrp_idx, MAX_LEVELS); |
| else |
| cgrp_cnt = get_cgroup_v1_idx(cgrp_idx, MAX_LEVELS); |
| |
| for ( ; idx < MAX_EVENTS; idx++) { |
| if (idx == num_events) |
| break; |
| |
| // XXX: do not pass idx directly (for verifier) |
| key = idx; |
| // this is per-cpu array for diff |
| prev_val = bpf_map_lookup_elem(&prev_readings, &key); |
| if (!prev_val) { |
| val.counter = val.enabled = val.running = 0; |
| bpf_map_update_elem(&prev_readings, &key, &val, BPF_ANY); |
| |
| prev_val = bpf_map_lookup_elem(&prev_readings, &key); |
| if (!prev_val) |
| continue; |
| } |
| |
| // read from global perf_event array |
| key = idx * num_cpus + cpu; |
| err = bpf_perf_event_read_value(&events, key, &val, sizeof(val)); |
| if (err) |
| continue; |
| |
| if (enabled) { |
| delta.counter = val.counter - prev_val->counter; |
| delta.enabled = val.enabled - prev_val->enabled; |
| delta.running = val.running - prev_val->running; |
| |
| for (c = 0; c < MAX_LEVELS; c++) { |
| if (c == cgrp_cnt) |
| break; |
| |
| cgrp = cgrp_idx[c]; |
| |
| // aggregate the result by cgroup |
| key = cgrp * num_events + idx; |
| cgrp_val = bpf_map_lookup_elem(&cgrp_readings, &key); |
| if (cgrp_val) { |
| cgrp_val->counter += delta.counter; |
| cgrp_val->enabled += delta.enabled; |
| cgrp_val->running += delta.running; |
| } else { |
| bpf_map_update_elem(&cgrp_readings, &key, |
| &delta, BPF_ANY); |
| } |
| } |
| } |
| |
| *prev_val = val; |
| } |
| return 0; |
| } |
| |
| // This will be attached to cgroup-switches event for each cpu |
| SEC("perf_event") |
| int BPF_PROG(on_cgrp_switch) |
| { |
| return bperf_cgroup_count(); |
| } |
| |
| SEC("raw_tp/sched_switch") |
| int BPF_PROG(trigger_read) |
| { |
| return bperf_cgroup_count(); |
| } |
| |
| char LICENSE[] SEC("license") = "Dual BSD/GPL"; |