| // SPDX-License-Identifier: GPL-2.0 |
| // Copyright (c) 2019 Facebook |
| |
| #include <stdint.h> |
| #include <stddef.h> |
| #include <stdbool.h> |
| #include <linux/bpf.h> |
| #include <linux/ptrace.h> |
| #include <linux/sched.h> |
| #include <linux/types.h> |
| #include <bpf/bpf_helpers.h> |
| |
| #include "bpf_compiler.h" |
| |
/*
 * 32-bit process id type for this program; it receives the upper 32 bits of
 * bpf_get_current_pid_tgid() and serves as the key type of strobemeta_cfgs.
 */
typedef uint32_t pid_t;

/*
 * Empty placeholder: the real task_struct layout is not available here, so
 * only pointers to it are passed around.
 */
struct task_struct {};
| |
/* size of the comm buffer handed to bpf_get_current_comm() */
#define TASK_COMM_LEN 16
/* number of 64-bit slots per entry in the stack trace maps */
#define PERF_MAX_STACK_DEPTH 127

/* metavariable type tags */
#define STROBE_TYPE_INVALID 0
#define STROBE_TYPE_INT 1
#define STROBE_TYPE_STR 2
#define STROBE_TYPE_MAP 3

/* timestamp bit (in ns) that picks between the stacks_0 and stacks_1 maps */
#define STACK_TABLE_EPOCH_SHIFT 20
/* size limit passed to every bpf_probe_read_user_str() call */
#define STROBE_MAX_STR_LEN 1
/* max_entries of the strobemeta_cfgs map */
#define STROBE_MAX_CFGS 32
/*
 * Worst-case payload bytes for one map variable: its tag plus a key and a
 * value for each entry. STROBE_MAX_MAP_ENTRIES (like STROBE_MAX_INTS/STRS/MAPS)
 * is not defined in this part of the file - presumably supplied by whoever
 * includes it.
 */
#define READ_MAP_VAR_PAYLOAD_CAP \
	((1 + STROBE_MAX_MAP_ENTRIES * 2) * STROBE_MAX_STR_LEN)
/* worst-case payload bytes for all string and map variables combined */
#define STROBE_MAX_PAYLOAD \
	(STROBE_MAX_STRS * STROBE_MAX_STR_LEN + \
	 STROBE_MAX_MAPS * READ_MAP_VAR_PAYLOAD_CAP)
| |
/* Common 8-byte prefix shared by every strobe value */
struct strobe_value_header {
	/*
	 * Interpretation depends on the value type:
	 *   int: 0 when the value is not set, 1 otherwise;
	 *   str: always 1; whether a value is set is determined by the pointer;
	 *   map: always 1; the pointer leads to an additional struct holding
	 *        the number of entries (up to STROBE_MAX_MAP_ENTRIES).
	 */
	uint16_t len;
	/*
	 * Space for possible future fields/flags. The header must stay exactly
	 * 8 bytes so that BPF can fetch header and value with one 16-byte read.
	 */
	uint8_t _reserved[6];
};
| |
| /* |
| * strobe_value_generic is used from BPF probe only, but needs to be a union |
| * of strobe_value_int/strobe_value_str/strobe_value_map |
| */ |
| struct strobe_value_generic { |
| struct strobe_value_header header; |
| union { |
| int64_t val; |
| void *ptr; |
| }; |
| }; |
| |
| struct strobe_value_int { |
| struct strobe_value_header header; |
| int64_t value; |
| }; |
| |
| struct strobe_value_str { |
| struct strobe_value_header header; |
| const char* value; |
| }; |
| |
| struct strobe_value_map { |
| struct strobe_value_header header; |
| const struct strobe_map_raw* value; |
| }; |
| |
/* One key/value pair of a strobe map; both members point to C strings */
struct strobe_map_entry {
	const char *key;
	const char *val;
};
| |
| /* |
| * Map of C-string key/value pairs with fixed maximum capacity. Each map has |
| * corresponding int64 ID, which application can use (or ignore) in whatever |
| * way appropriate. Map is "write-only", there is no way to get data out of |
| * map. Map is intended to be used to provide metadata for profilers and is |
| * not to be used for internal in-app communication. All methods are |
| * thread-safe. |
| */ |
| struct strobe_map_raw { |
| /* |
| * general purpose unique ID that's up to application to decide |
| * whether and how to use; for request metadata use case id is unique |
| * request ID that's used to match metadata with stack traces on |
| * Strobelight backend side |
| */ |
| int64_t id; |
| /* number of used entries in map */ |
| int64_t cnt; |
| /* |
| * having volatile doesn't change anything on BPF side, but clang |
| * emits warnings for passing `volatile const char *` into |
| * bpf_probe_read_user_str that expects just `const char *` |
| */ |
| const char* tag; |
| /* |
| * key/value entries, each consisting of 2 pointers to key and value |
| * C strings |
| */ |
| struct strobe_map_entry entries[STROBE_MAX_MAP_ENTRIES]; |
| }; |
| |
/*
 * Supported values of the TLS mode stored in strobe_value_loc.tls_mode.
 * The negative constant is parenthesised so that it expands safely inside
 * any surrounding expression.
 */
#define TLS_NOT_SET (-1)
#define TLS_LOCAL_EXEC 0
#define TLS_IMM_EXEC 1
#define TLS_GENERAL_DYN 2
| |
/*
 * Universal description of a TLS location, covering both static executables
 * and shared libraries.
 */
struct strobe_value_loc {
	/*
	 * TLS mode used for this particular metavariable:
	 *   -1 (TLS_NOT_SET)     - no metavariable;
	 *    0 (TLS_LOCAL_EXEC)  - Local Executable mode;
	 *    1 (TLS_IMM_EXEC)    - Immediate Executable mode;
	 *    2 (TLS_GENERAL_DYN) - General Dynamic mode.
	 * Local Dynamic mode is not supported yet, as it has never been seen
	 * in practice. The mode decides how the offset field is interpreted;
	 * see calc_location() for details.
	 */
	int64_t tls_mode;
	/*
	 * TLS_LOCAL_EXEC:  offset from the thread pointer (fs:0 on x86-64,
	 *                  tpidr_el0 on aarch64);
	 * TLS_IMM_EXEC:    absolute address of the GOT entry that contains
	 *                  the offset from the thread pointer;
	 * TLS_GENERAL_DYN: absolute address of the double GOT entry that
	 *                  contains a tls_index_t struct.
	 */
	int64_t offset;
};
| |
| struct strobemeta_cfg { |
| int64_t req_meta_idx; |
| struct strobe_value_loc int_locs[STROBE_MAX_INTS]; |
| struct strobe_value_loc str_locs[STROBE_MAX_STRS]; |
| struct strobe_value_loc map_locs[STROBE_MAX_MAPS]; |
| }; |
| |
| struct strobe_map_descr { |
| uint64_t id; |
| int16_t tag_len; |
| /* |
| * cnt <0 - map value isn't set; |
| * 0 - map has id set, but no key/value entries |
| */ |
| int16_t cnt; |
| /* |
| * both key_lens[i] and val_lens[i] should be >0 for present key/value |
| * entry |
| */ |
| uint16_t key_lens[STROBE_MAX_MAP_ENTRIES]; |
| uint16_t val_lens[STROBE_MAX_MAP_ENTRIES]; |
| }; |
| |
| struct strobemeta_payload { |
| /* req_id has valid request ID, if req_meta_valid == 1 */ |
| int64_t req_id; |
| uint8_t req_meta_valid; |
| /* |
| * mask has Nth bit set to 1, if Nth metavar was present and |
| * successfully read |
| */ |
| uint64_t int_vals_set_mask; |
| int64_t int_vals[STROBE_MAX_INTS]; |
| /* len is >0 for present values */ |
| uint16_t str_lens[STROBE_MAX_STRS]; |
| /* if map_descrs[i].cnt == -1, metavar is not present/set */ |
| struct strobe_map_descr map_descrs[STROBE_MAX_MAPS]; |
| /* |
| * payload has compactly packed values of str and map variables in the |
| * form: strval1\0strval2\0map1key1\0map1val1\0map2key1\0map2val1\0 |
| * (and so on); str_lens[i], key_lens[i] and val_lens[i] determines |
| * value length |
| */ |
| char payload[STROBE_MAX_PAYLOAD]; |
| }; |
| |
| struct strobelight_bpf_sample { |
| uint64_t ktime; |
| char comm[TASK_COMM_LEN]; |
| pid_t pid; |
| int user_stack_id; |
| int kernel_stack_id; |
| int has_meta; |
| struct strobemeta_payload metadata; |
| /* |
| * makes it possible to pass (<real payload size> + 1) as data size to |
| * perf_submit() to avoid perf_submit's paranoia about passing zero as |
| * size, as it deduces that <real payload size> might be |
| * **theoretically** zero |
| */ |
| char dummy_safeguard; |
| }; |
| |
| struct { |
| __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); |
| __uint(max_entries, 32); |
| __uint(key_size, sizeof(int)); |
| __uint(value_size, sizeof(int)); |
| } samples SEC(".maps"); |
| |
| struct { |
| __uint(type, BPF_MAP_TYPE_STACK_TRACE); |
| __uint(max_entries, 16); |
| __uint(key_size, sizeof(uint32_t)); |
| __uint(value_size, sizeof(uint64_t) * PERF_MAX_STACK_DEPTH); |
| } stacks_0 SEC(".maps"); |
| |
| struct { |
| __uint(type, BPF_MAP_TYPE_STACK_TRACE); |
| __uint(max_entries, 16); |
| __uint(key_size, sizeof(uint32_t)); |
| __uint(value_size, sizeof(uint64_t) * PERF_MAX_STACK_DEPTH); |
| } stacks_1 SEC(".maps"); |
| |
| struct { |
| __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); |
| __uint(max_entries, 1); |
| __type(key, uint32_t); |
| __type(value, struct strobelight_bpf_sample); |
| } sample_heap SEC(".maps"); |
| |
| struct { |
| __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); |
| __uint(max_entries, STROBE_MAX_CFGS); |
| __type(key, pid_t); |
| __type(value, struct strobemeta_cfg); |
| } strobemeta_cfgs SEC(".maps"); |
| |
/*
 * Element type of the dtv array.
 * https://github.com/lattera/glibc/blob/master/nptl/sysdeps/x86_64/tls.h#L34
 */
typedef union dtv {
	size_t counter;
	struct {
		void *val;
		bool is_static;
	} pointer;
} dtv_t;
| |
| /* Partial definition for tcbhead_t */ |
| /* https://github.com/bminor/glibc/blob/master/sysdeps/x86_64/nptl/tls.h#L42 */ |
| struct tcbhead { |
| void* tcb; |
| dtv_t* dtv; |
| }; |
| |
/*
 * TLS module/offset pair for the shared library case.
 * On x86-64 it is mapped onto two GOT entries.
 * On aarch64 it is pointed to by the second GOT entry.
 */
struct tls_index {
	uint64_t module;
	uint64_t offset;
};
| |
| #ifdef SUBPROGS |
| __noinline |
| #else |
| __always_inline |
| #endif |
| static void *calc_location(struct strobe_value_loc *loc, void *tls_base) |
| { |
| /* |
| * tls_mode value is: |
| * - -1 (TLS_NOT_SET), if no metavar is present; |
| * - 0 (TLS_LOCAL_EXEC), if metavar uses Local Executable mode of TLS |
| * (offset from fs:0 for x86-64 or tpidr_el0 for aarch64); |
| * - 1 (TLS_IMM_EXEC), if metavar uses Immediate Executable mode of TLS; |
| * - 2 (TLS_GENERAL_DYN), if metavar uses General Dynamic mode of TLS; |
| * This schema allows to use something like: |
| * (tls_mode + 1) * (tls_base + offset) |
| * to get NULL for "no metavar" location, or correct pointer for local |
| * executable mode without doing extra ifs. |
| */ |
| if (loc->tls_mode <= TLS_LOCAL_EXEC) { |
| /* static executable is simple, we just have offset from |
| * tls_base */ |
| void *addr = tls_base + loc->offset; |
| /* multiply by (tls_mode + 1) to get NULL, if we have no |
| * metavar in this slot */ |
| return (void *)((loc->tls_mode + 1) * (int64_t)addr); |
| } |
| /* |
| * Other modes are more complicated, we need to jump through few hoops. |
| * |
| * For immediate executable mode (currently supported only for aarch64): |
| * - loc->offset is pointing to a GOT entry containing fixed offset |
| * relative to tls_base; |
| * |
| * For general dynamic mode: |
| * - loc->offset is pointing to a beginning of double GOT entries; |
| * - (for aarch64 only) second entry points to tls_index_t struct; |
| * - (for x86-64 only) two GOT entries are already tls_index_t; |
| * - tls_index_t->module is used to find start of TLS section in |
| * which variable resides; |
| * - tls_index_t->offset provides offset within that TLS section, |
| * pointing to value of variable. |
| */ |
| struct tls_index tls_index; |
| dtv_t *dtv; |
| void *tls_ptr; |
| |
| bpf_probe_read_user(&tls_index, sizeof(struct tls_index), |
| (void *)loc->offset); |
| /* valid module index is always positive */ |
| if (tls_index.module > 0) { |
| /* dtv = ((struct tcbhead *)tls_base)->dtv[tls_index.module] */ |
| bpf_probe_read_user(&dtv, sizeof(dtv), |
| &((struct tcbhead *)tls_base)->dtv); |
| dtv += tls_index.module; |
| } else { |
| dtv = NULL; |
| } |
| bpf_probe_read_user(&tls_ptr, sizeof(void *), dtv); |
| /* if pointer has (void *)-1 value, then TLS wasn't initialized yet */ |
| return tls_ptr && tls_ptr != (void *)-1 |
| ? tls_ptr + tls_index.offset |
| : NULL; |
| } |
| |
| #ifdef SUBPROGS |
| __noinline |
| #else |
| __always_inline |
| #endif |
| static void read_int_var(struct strobemeta_cfg *cfg, |
| size_t idx, void *tls_base, |
| struct strobe_value_generic *value, |
| struct strobemeta_payload *data) |
| { |
| void *location = calc_location(&cfg->int_locs[idx], tls_base); |
| if (!location) |
| return; |
| |
| bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location); |
| data->int_vals[idx] = value->val; |
| if (value->header.len) |
| data->int_vals_set_mask |= (1 << idx); |
| } |
| |
| static __always_inline uint64_t read_str_var(struct strobemeta_cfg *cfg, |
| size_t idx, void *tls_base, |
| struct strobe_value_generic *value, |
| struct strobemeta_payload *data, |
| size_t off) |
| { |
| void *location; |
| uint64_t len; |
| |
| data->str_lens[idx] = 0; |
| location = calc_location(&cfg->str_locs[idx], tls_base); |
| if (!location) |
| return 0; |
| |
| bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location); |
| len = bpf_probe_read_user_str(&data->payload[off], STROBE_MAX_STR_LEN, value->ptr); |
| /* |
| * if bpf_probe_read_user_str returns error (<0), due to casting to |
| * unsinged int, it will become big number, so next check is |
| * sufficient to check for errors AND prove to BPF verifier, that |
| * bpf_probe_read_user_str won't return anything bigger than |
| * STROBE_MAX_STR_LEN |
| */ |
| if (len > STROBE_MAX_STR_LEN) |
| return 0; |
| |
| data->str_lens[idx] = len; |
| return off + len; |
| } |
| |
| static __always_inline uint64_t read_map_var(struct strobemeta_cfg *cfg, |
| size_t idx, void *tls_base, |
| struct strobe_value_generic *value, |
| struct strobemeta_payload *data, |
| size_t off) |
| { |
| struct strobe_map_descr* descr = &data->map_descrs[idx]; |
| struct strobe_map_raw map; |
| void *location; |
| uint64_t len; |
| |
| descr->tag_len = 0; /* presume no tag is set */ |
| descr->cnt = -1; /* presume no value is set */ |
| |
| location = calc_location(&cfg->map_locs[idx], tls_base); |
| if (!location) |
| return off; |
| |
| bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location); |
| if (bpf_probe_read_user(&map, sizeof(struct strobe_map_raw), value->ptr)) |
| return off; |
| |
| descr->id = map.id; |
| descr->cnt = map.cnt; |
| if (cfg->req_meta_idx == idx) { |
| data->req_id = map.id; |
| data->req_meta_valid = 1; |
| } |
| |
| len = bpf_probe_read_user_str(&data->payload[off], STROBE_MAX_STR_LEN, map.tag); |
| if (len <= STROBE_MAX_STR_LEN) { |
| descr->tag_len = len; |
| off += len; |
| } |
| |
| #ifdef NO_UNROLL |
| __pragma_loop_no_unroll |
| #else |
| __pragma_loop_unroll |
| #endif |
| for (int i = 0; i < STROBE_MAX_MAP_ENTRIES; ++i) { |
| if (i >= map.cnt) |
| break; |
| |
| descr->key_lens[i] = 0; |
| len = bpf_probe_read_user_str(&data->payload[off], STROBE_MAX_STR_LEN, |
| map.entries[i].key); |
| if (len <= STROBE_MAX_STR_LEN) { |
| descr->key_lens[i] = len; |
| off += len; |
| } |
| descr->val_lens[i] = 0; |
| len = bpf_probe_read_user_str(&data->payload[off], STROBE_MAX_STR_LEN, |
| map.entries[i].val); |
| if (len <= STROBE_MAX_STR_LEN) { |
| descr->val_lens[i] = len; |
| off += len; |
| } |
| } |
| |
| return off; |
| } |
| |
| #ifdef USE_BPF_LOOP |
/* Selects which kind of metavariable read_var_callback() reads */
enum read_type {
	READ_INT_VAR,
	READ_MAP_VAR,
	READ_STR_VAR,
};
| |
| struct read_var_ctx { |
| struct strobemeta_payload *data; |
| void *tls_base; |
| struct strobemeta_cfg *cfg; |
| size_t payload_off; |
| /* value gets mutated */ |
| struct strobe_value_generic *value; |
| enum read_type type; |
| }; |
| |
| static int read_var_callback(__u64 index, struct read_var_ctx *ctx) |
| { |
| /* lose precision info for ctx->payload_off, verifier won't track |
| * double xor, barrier_var() is needed to force clang keep both xors. |
| */ |
| ctx->payload_off ^= index; |
| barrier_var(ctx->payload_off); |
| ctx->payload_off ^= index; |
| switch (ctx->type) { |
| case READ_INT_VAR: |
| if (index >= STROBE_MAX_INTS) |
| return 1; |
| read_int_var(ctx->cfg, index, ctx->tls_base, ctx->value, ctx->data); |
| break; |
| case READ_MAP_VAR: |
| if (index >= STROBE_MAX_MAPS) |
| return 1; |
| if (ctx->payload_off > sizeof(ctx->data->payload) - READ_MAP_VAR_PAYLOAD_CAP) |
| return 1; |
| ctx->payload_off = read_map_var(ctx->cfg, index, ctx->tls_base, |
| ctx->value, ctx->data, ctx->payload_off); |
| break; |
| case READ_STR_VAR: |
| if (index >= STROBE_MAX_STRS) |
| return 1; |
| if (ctx->payload_off > sizeof(ctx->data->payload) - STROBE_MAX_STR_LEN) |
| return 1; |
| ctx->payload_off = read_str_var(ctx->cfg, index, ctx->tls_base, |
| ctx->value, ctx->data, ctx->payload_off); |
| break; |
| } |
| return 0; |
| } |
| #endif /* USE_BPF_LOOP */ |
| |
| /* |
| * read_strobe_meta returns NULL, if no metadata was read; otherwise returns |
| * pointer to *right after* payload ends |
| */ |
| #ifdef SUBPROGS |
| __noinline |
| #else |
| __always_inline |
| #endif |
| static void *read_strobe_meta(struct task_struct *task, |
| struct strobemeta_payload *data) |
| { |
| pid_t pid = bpf_get_current_pid_tgid() >> 32; |
| struct strobe_value_generic value = {0}; |
| struct strobemeta_cfg *cfg; |
| size_t payload_off; |
| void *tls_base; |
| |
| cfg = bpf_map_lookup_elem(&strobemeta_cfgs, &pid); |
| if (!cfg) |
| return NULL; |
| |
| data->int_vals_set_mask = 0; |
| data->req_meta_valid = 0; |
| payload_off = 0; |
| /* |
| * we don't have struct task_struct definition, it should be: |
| * tls_base = (void *)task->thread.fsbase; |
| */ |
| tls_base = (void *)task; |
| |
| #ifdef USE_BPF_LOOP |
| struct read_var_ctx ctx = { |
| .cfg = cfg, |
| .tls_base = tls_base, |
| .value = &value, |
| .data = data, |
| .payload_off = 0, |
| }; |
| int err; |
| |
| ctx.type = READ_INT_VAR; |
| err = bpf_loop(STROBE_MAX_INTS, read_var_callback, &ctx, 0); |
| if (err != STROBE_MAX_INTS) |
| return NULL; |
| |
| ctx.type = READ_STR_VAR; |
| err = bpf_loop(STROBE_MAX_STRS, read_var_callback, &ctx, 0); |
| if (err != STROBE_MAX_STRS) |
| return NULL; |
| |
| ctx.type = READ_MAP_VAR; |
| err = bpf_loop(STROBE_MAX_MAPS, read_var_callback, &ctx, 0); |
| if (err != STROBE_MAX_MAPS) |
| return NULL; |
| |
| payload_off = ctx.payload_off; |
| /* this should not really happen, here only to satisfy verifer */ |
| if (payload_off > sizeof(data->payload)) |
| payload_off = sizeof(data->payload); |
| #else |
| #ifdef NO_UNROLL |
| __pragma_loop_no_unroll |
| #else |
| __pragma_loop_unroll |
| #endif /* NO_UNROLL */ |
| for (int i = 0; i < STROBE_MAX_INTS; ++i) { |
| read_int_var(cfg, i, tls_base, &value, data); |
| } |
| #ifdef NO_UNROLL |
| __pragma_loop_no_unroll |
| #else |
| __pragma_loop_unroll |
| #endif /* NO_UNROLL */ |
| for (int i = 0; i < STROBE_MAX_STRS; ++i) { |
| payload_off = read_str_var(cfg, i, tls_base, &value, data, payload_off); |
| } |
| #ifdef NO_UNROLL |
| __pragma_loop_no_unroll |
| #else |
| __pragma_loop_unroll |
| #endif /* NO_UNROLL */ |
| for (int i = 0; i < STROBE_MAX_MAPS; ++i) { |
| payload_off = read_map_var(cfg, i, tls_base, &value, data, payload_off); |
| } |
| #endif /* USE_BPF_LOOP */ |
| |
| /* |
| * return pointer right after end of payload, so it's possible to |
| * calculate exact amount of useful data that needs to be sent |
| */ |
| return &data->payload[payload_off]; |
| } |
| |
| SEC("raw_tracepoint/kfree_skb") |
| int on_event(struct pt_regs *ctx) { |
| pid_t pid = bpf_get_current_pid_tgid() >> 32; |
| struct strobelight_bpf_sample* sample; |
| struct task_struct *task; |
| uint32_t zero = 0; |
| uint64_t ktime_ns; |
| void *sample_end; |
| |
| sample = bpf_map_lookup_elem(&sample_heap, &zero); |
| if (!sample) |
| return 0; /* this will never happen */ |
| |
| sample->pid = pid; |
| bpf_get_current_comm(&sample->comm, TASK_COMM_LEN); |
| ktime_ns = bpf_ktime_get_ns(); |
| sample->ktime = ktime_ns; |
| |
| task = (struct task_struct *)bpf_get_current_task(); |
| sample_end = read_strobe_meta(task, &sample->metadata); |
| sample->has_meta = sample_end != NULL; |
| sample_end = sample_end ? : &sample->metadata; |
| |
| if ((ktime_ns >> STACK_TABLE_EPOCH_SHIFT) & 1) { |
| sample->kernel_stack_id = bpf_get_stackid(ctx, &stacks_1, 0); |
| sample->user_stack_id = bpf_get_stackid(ctx, &stacks_1, BPF_F_USER_STACK); |
| } else { |
| sample->kernel_stack_id = bpf_get_stackid(ctx, &stacks_0, 0); |
| sample->user_stack_id = bpf_get_stackid(ctx, &stacks_0, BPF_F_USER_STACK); |
| } |
| |
| uint64_t sample_size = sample_end - (void *)sample; |
| /* should always be true */ |
| if (sample_size < sizeof(struct strobelight_bpf_sample)) |
| bpf_perf_event_output(ctx, &samples, 0, sample, 1 + sample_size); |
| return 0; |
| } |
| |
| char _license[] SEC("license") = "GPL"; |