// SPDX-License-Identifier: GPL-2.0-only
#include "cgroup-internal.h"

#include <linux/sched/cputime.h>

#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/btf_ids.h>

static DEFINE_SPINLOCK(cgroup_rstat_lock);
static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);

static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);

static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
{
	return per_cpu_ptr(cgrp->rstat_cpu, cpu);
}

/**
 * cgroup_rstat_updated - keep track of updated rstat_cpu
 * @cgrp: target cgroup
 * @cpu: cpu on which rstat_cpu was updated
 *
 * @cgrp's rstat_cpu on @cpu was updated.  Put it on the parent's matching
 * rstat_cpu->updated_children list.  See the comment on top of
 * cgroup_rstat_cpu definition for details.
 */
__bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
{
	raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
	unsigned long flags;

	/*
	 * Speculative already-on-list test. This may race leading to
	 * temporary inaccuracies, which is fine.
	 *
	 * Because @parent's updated_children is terminated with @parent
	 * instead of NULL, we can tell whether @cgrp is on the list by
	 * testing the next pointer for NULL.
	 */
	if (data_race(cgroup_rstat_cpu(cgrp, cpu)->updated_next))
		return;

	raw_spin_lock_irqsave(cpu_lock, flags);

	/* put @cgrp and all ancestors on the corresponding updated lists */
	while (true) {
		struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
		struct cgroup *parent = cgroup_parent(cgrp);
		struct cgroup_rstat_cpu *prstatc;

		/*
		 * Both additions and removals are bottom-up.  If a cgroup
		 * is already in the tree, all ancestors are.
		 */
		if (rstatc->updated_next)
			break;

		/* Root has no parent to link it to, but mark it busy */
		if (!parent) {
			rstatc->updated_next = cgrp;
			break;
		}

		prstatc = cgroup_rstat_cpu(parent, cpu);
		rstatc->updated_next = prstatc->updated_children;
		prstatc->updated_children = cgrp;

		cgrp = parent;
	}

	raw_spin_unlock_irqrestore(cpu_lock, flags);
}
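
/*
 * Illustrative usage sketch (not code from this file): a hypothetical
 * controller that keeps a per-cpu counter bumps it locally and only marks
 * the cgroup as updated, deferring all aggregation to a later flush:
 *
 *	struct my_cpu_stat *stat = this_cpu_ptr(cgrp->my_stat);  // hypothetical field
 *
 *	stat->nr_events++;
 *	cgroup_rstat_updated(cgrp, smp_processor_id());
 *
 * This is the same pattern cgroup_base_stat_cputime_account_end() below
 * uses for the base cputime statistics.
 */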

/**
 * cgroup_rstat_cpu_pop_updated - iterate and dismantle rstat_cpu updated tree
 * @pos: current position
 * @root: root of the tree to traverse
 * @cpu: target cpu
 *
 * Walks the updated rstat_cpu tree on @cpu from @root.  %NULL @pos starts
 * the traversal and %NULL return indicates the end.  During traversal,
 * each returned cgroup is unlinked from the tree.  Must be called with the
 * matching cgroup_rstat_cpu_lock held.
 *
 * The only ordering guarantee is that, for a parent and a child pair
 * covered by a given traversal, if a child is visited, its parent is
 * guaranteed to be visited afterwards.
 */
static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
						   struct cgroup *root, int cpu)
{
	struct cgroup_rstat_cpu *rstatc;
	struct cgroup *parent;

	if (pos == root)
		return NULL;

	/*
	 * We're going to walk down to the first leaf and visit/remove it.
	 * We can pick any unvisited node as the starting point.
	 */
	if (!pos) {
		pos = root;
		/* return NULL if this subtree is not on-list */
		if (!cgroup_rstat_cpu(pos, cpu)->updated_next)
			return NULL;
	} else {
		pos = cgroup_parent(pos);
	}

	/* walk down to the first leaf */
	while (true) {
		rstatc = cgroup_rstat_cpu(pos, cpu);
		if (rstatc->updated_children == pos)
			break;
		pos = rstatc->updated_children;
	}

	/*
	 * Unlink @pos from the tree.  As the updated_children list is
	 * singly linked, we have to walk it to find the removal point.
	 * However, due to the way we traverse, @pos will be the first
	 * child in most cases. The only exception is @root.
	 */
	parent = cgroup_parent(pos);
	if (parent) {
		struct cgroup_rstat_cpu *prstatc;
		struct cgroup **nextp;

		prstatc = cgroup_rstat_cpu(parent, cpu);
		nextp = &prstatc->updated_children;
		while (*nextp != pos) {
			struct cgroup_rstat_cpu *nrstatc;

			nrstatc = cgroup_rstat_cpu(*nextp, cpu);
			WARN_ON_ONCE(*nextp == parent);
			nextp = &nrstatc->updated_next;
		}
		*nextp = rstatc->updated_next;
	}

	rstatc->updated_next = NULL;
	return pos;
}
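
/*
 * Illustrative iteration sketch (mirrors cgroup_rstat_flush_locked()
 * below): with the matching cgroup_rstat_cpu_lock held, the whole updated
 * tree under @root is consumed by repeatedly popping until %NULL:
 *
 *	struct cgroup *pos = NULL;
 *
 *	while ((pos = cgroup_rstat_cpu_pop_updated(pos, root, cpu)))
 *		flush_one(pos, cpu);	// hypothetical per-cgroup flush
 */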

/*
 * A hook for bpf stat collectors to attach to and flush their stats.
 * Together with providing bpf kfuncs for cgroup_rstat_updated() and
 * cgroup_rstat_flush(), this enables a complete workflow where bpf progs that
 * collect cgroup stats can integrate with rstat for efficient flushing.
 *
 * A static noinline declaration here could cause the compiler to optimize away
 * the function. A global noinline declaration will keep the definition, but may
 * optimize away the callsite. Therefore, __weak is needed to ensure that the
 * call is still emitted, by telling the compiler that we don't know what the
 * function might eventually be.
 */

__bpf_hook_start();

__weak noinline void bpf_rstat_flush(struct cgroup *cgrp,
				     struct cgroup *parent, int cpu)
{
}

__bpf_hook_end();
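
/*
 * Sketch of the BPF side (an assumed userspace-built program, not part of
 * this file): a stat collector attaches to the hook above with an fentry
 * program and folds its own per-cpu data when rstat asks for a flush:
 *
 *	SEC("fentry/bpf_rstat_flush")
 *	int BPF_PROG(my_flush, struct cgroup *cgrp, struct cgroup *parent, int cpu)
 *	{
 *		// fold this prog's per-cpu stats for @cgrp into its totals
 *		// and propagate to @parent (hypothetical map lookups omitted)
 *		return 0;
 *	}
 */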

/* see cgroup_rstat_flush() */
static void cgroup_rstat_flush_locked(struct cgroup *cgrp)
	__releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock)
{
	int cpu;

	lockdep_assert_held(&cgroup_rstat_lock);

	for_each_possible_cpu(cpu) {
		raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock,
						       cpu);
		struct cgroup *pos = NULL;
		unsigned long flags;

		/*
		 * The _irqsave() is needed because cgroup_rstat_lock is
		 * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
		 * this lock with the _irq() suffix only disables interrupts on
		 * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
		 * interrupts on both configurations. The _irqsave() ensures
		 * that interrupts are always disabled and later restored.
		 */
		raw_spin_lock_irqsave(cpu_lock, flags);
		while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) {
			struct cgroup_subsys_state *css;

			cgroup_base_stat_flush(pos, cpu);
			bpf_rstat_flush(pos, cgroup_parent(pos), cpu);

			rcu_read_lock();
			list_for_each_entry_rcu(css, &pos->rstat_css_list,
						rstat_css_node)
				css->ss->css_rstat_flush(css, cpu);
			rcu_read_unlock();
		}
		raw_spin_unlock_irqrestore(cpu_lock, flags);

		/* play nice and yield if necessary */
		if (need_resched() || spin_needbreak(&cgroup_rstat_lock)) {
			spin_unlock_irq(&cgroup_rstat_lock);
			if (!cond_resched())
				cpu_relax();
			spin_lock_irq(&cgroup_rstat_lock);
		}
	}
}

/**
 * cgroup_rstat_flush - flush stats in @cgrp's subtree
 * @cgrp: target cgroup
 *
 * Collect all per-cpu stats in @cgrp's subtree into the global counters
 * and propagate them upwards.  After this function returns, all cgroups in
 * the subtree have up-to-date ->stat.
 *
 * This also gets all cgroups in the subtree including @cgrp off the
 * ->updated_children lists.
 *
 * This function may block.
 */
__bpf_kfunc void cgroup_rstat_flush(struct cgroup *cgrp)
{
	might_sleep();

	spin_lock_irq(&cgroup_rstat_lock);
	cgroup_rstat_flush_locked(cgrp);
	spin_unlock_irq(&cgroup_rstat_lock);
}

/**
 * cgroup_rstat_flush_hold - flush stats in @cgrp's subtree and hold
 * @cgrp: target cgroup
 *
 * Flush stats in @cgrp's subtree and prevent further flushes.  Must be
 * paired with cgroup_rstat_flush_release().
 *
 * This function may block.
 */
void cgroup_rstat_flush_hold(struct cgroup *cgrp)
	__acquires(&cgroup_rstat_lock)
{
	might_sleep();
	spin_lock_irq(&cgroup_rstat_lock);
	cgroup_rstat_flush_locked(cgrp);
}

/**
 * cgroup_rstat_flush_release - release cgroup_rstat_flush_hold()
 */
void cgroup_rstat_flush_release(void)
	__releases(&cgroup_rstat_lock)
{
	spin_unlock_irq(&cgroup_rstat_lock);
}
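
/*
 * Typical hold/release pairing (see cgroup_base_stat_cputime_show()
 * below): flush and keep the lock held, read the now-consistent counters,
 * then release:
 *
 *	cgroup_rstat_flush_hold(cgrp);
 *	usage = cgrp->bstat.cputime.sum_exec_runtime;
 *	cgroup_rstat_flush_release();
 */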

int cgroup_rstat_init(struct cgroup *cgrp)
{
	int cpu;

	/* the root cgrp has rstat_cpu preallocated */
	if (!cgrp->rstat_cpu) {
		cgrp->rstat_cpu = alloc_percpu(struct cgroup_rstat_cpu);
		if (!cgrp->rstat_cpu)
			return -ENOMEM;
	}

	/* ->updated_children list is self terminated */
	for_each_possible_cpu(cpu) {
		struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);

		rstatc->updated_children = cgrp;
		u64_stats_init(&rstatc->bsync);
	}

	return 0;
}

void cgroup_rstat_exit(struct cgroup *cgrp)
{
	int cpu;

	cgroup_rstat_flush(cgrp);

	/* sanity check */
	for_each_possible_cpu(cpu) {
		struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);

		if (WARN_ON_ONCE(rstatc->updated_children != cgrp) ||
		    WARN_ON_ONCE(rstatc->updated_next))
			return;
	}

	free_percpu(cgrp->rstat_cpu);
	cgrp->rstat_cpu = NULL;
}

void __init cgroup_rstat_boot(void)
{
	int cpu;

	for_each_possible_cpu(cpu)
		raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu));
}

/*
 * Functions for cgroup basic resource statistics implemented on top of
 * rstat.
 */
static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat,
				 struct cgroup_base_stat *src_bstat)
{
	dst_bstat->cputime.utime += src_bstat->cputime.utime;
	dst_bstat->cputime.stime += src_bstat->cputime.stime;
	dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime;
#ifdef CONFIG_SCHED_CORE
	dst_bstat->forceidle_sum += src_bstat->forceidle_sum;
#endif
}

static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
				 struct cgroup_base_stat *src_bstat)
{
	dst_bstat->cputime.utime -= src_bstat->cputime.utime;
	dst_bstat->cputime.stime -= src_bstat->cputime.stime;
	dst_bstat->cputime.sum_exec_runtime -= src_bstat->cputime.sum_exec_runtime;
#ifdef CONFIG_SCHED_CORE
	dst_bstat->forceidle_sum -= src_bstat->forceidle_sum;
#endif
}

static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
{
	struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
	struct cgroup *parent = cgroup_parent(cgrp);
	struct cgroup_rstat_cpu *prstatc;
	struct cgroup_base_stat delta;
	unsigned seq;

	/* Root-level stats are sourced from system-wide CPU stats */
	if (!parent)
		return;

	/* fetch the current per-cpu values */
	do {
		seq = __u64_stats_fetch_begin(&rstatc->bsync);
		delta = rstatc->bstat;
	} while (__u64_stats_fetch_retry(&rstatc->bsync, seq));

	/* propagate per-cpu delta to cgroup and per-cpu global statistics */
	cgroup_base_stat_sub(&delta, &rstatc->last_bstat);
	cgroup_base_stat_add(&cgrp->bstat, &delta);
	cgroup_base_stat_add(&rstatc->last_bstat, &delta);
	cgroup_base_stat_add(&rstatc->subtree_bstat, &delta);

	/* propagate cgroup and per-cpu global delta to parent (unless that's root) */
	if (cgroup_parent(parent)) {
		delta = cgrp->bstat;
		cgroup_base_stat_sub(&delta, &cgrp->last_bstat);
		cgroup_base_stat_add(&parent->bstat, &delta);
		cgroup_base_stat_add(&cgrp->last_bstat, &delta);

		delta = rstatc->subtree_bstat;
		prstatc = cgroup_rstat_cpu(parent, cpu);
		cgroup_base_stat_sub(&delta, &rstatc->last_subtree_bstat);
		cgroup_base_stat_add(&prstatc->subtree_bstat, &delta);
		cgroup_base_stat_add(&rstatc->last_subtree_bstat, &delta);
	}
}
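
/*
 * Worked example of the snapshot/delta scheme above (illustrative
 * numbers): if this CPU's rstatc->bstat.stime reads 700 while
 * last_bstat.stime is 500, the delta of 200 is added to cgrp->bstat and
 * last_bstat.stime advances to 700, so the next flush only picks up what
 * accumulated since.  The same scheme then repeats one level up, diffing
 * cgrp->bstat against cgrp->last_bstat to charge the parent.
 */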

static struct cgroup_rstat_cpu *
cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp, unsigned long *flags)
{
	struct cgroup_rstat_cpu *rstatc;

	rstatc = get_cpu_ptr(cgrp->rstat_cpu);
	*flags = u64_stats_update_begin_irqsave(&rstatc->bsync);
	return rstatc;
}

static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp,
						 struct cgroup_rstat_cpu *rstatc,
						 unsigned long flags)
{
	u64_stats_update_end_irqrestore(&rstatc->bsync, flags);
	cgroup_rstat_updated(cgrp, smp_processor_id());
	put_cpu_ptr(rstatc);
}

void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
{
	struct cgroup_rstat_cpu *rstatc;
	unsigned long flags;

	rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);
	rstatc->bstat.cputime.sum_exec_runtime += delta_exec;
	cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags);
}

void __cgroup_account_cputime_field(struct cgroup *cgrp,
				    enum cpu_usage_stat index, u64 delta_exec)
{
	struct cgroup_rstat_cpu *rstatc;
	unsigned long flags;

	rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);

	switch (index) {
	case CPUTIME_USER:
	case CPUTIME_NICE:
		rstatc->bstat.cputime.utime += delta_exec;
		break;
	case CPUTIME_SYSTEM:
	case CPUTIME_IRQ:
	case CPUTIME_SOFTIRQ:
		rstatc->bstat.cputime.stime += delta_exec;
		break;
#ifdef CONFIG_SCHED_CORE
	case CPUTIME_FORCEIDLE:
		rstatc->bstat.forceidle_sum += delta_exec;
		break;
#endif
	default:
		break;
	}

	cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags);
}

/*
 * Compute the cputime for the root cgroup by fetching the per-cpu data at
 * the global level, then categorizing the fields the same way
 * __cgroup_account_cputime_field() does for each bit of cpu time
 * attributed to a cgroup.
 */
static void root_cgroup_cputime(struct cgroup_base_stat *bstat)
{
	struct task_cputime *cputime = &bstat->cputime;
	int i;

	memset(bstat, 0, sizeof(*bstat));
	for_each_possible_cpu(i) {
		struct kernel_cpustat kcpustat;
		u64 *cpustat = kcpustat.cpustat;
		u64 user = 0;
		u64 sys = 0;

		kcpustat_cpu_fetch(&kcpustat, i);

		user += cpustat[CPUTIME_USER];
		user += cpustat[CPUTIME_NICE];
		cputime->utime += user;

		sys += cpustat[CPUTIME_SYSTEM];
		sys += cpustat[CPUTIME_IRQ];
		sys += cpustat[CPUTIME_SOFTIRQ];
		cputime->stime += sys;

		cputime->sum_exec_runtime += user;
		cputime->sum_exec_runtime += sys;
		cputime->sum_exec_runtime += cpustat[CPUTIME_STEAL];

#ifdef CONFIG_SCHED_CORE
		bstat->forceidle_sum += cpustat[CPUTIME_FORCEIDLE];
#endif
	}
}

void cgroup_base_stat_cputime_show(struct seq_file *seq)
{
	struct cgroup *cgrp = seq_css(seq)->cgroup;
	u64 usage, utime, stime;
	struct cgroup_base_stat bstat;
#ifdef CONFIG_SCHED_CORE
	u64 forceidle_time;
#endif

	if (cgroup_parent(cgrp)) {
		cgroup_rstat_flush_hold(cgrp);
		usage = cgrp->bstat.cputime.sum_exec_runtime;
		cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
			       &utime, &stime);
#ifdef CONFIG_SCHED_CORE
		forceidle_time = cgrp->bstat.forceidle_sum;
#endif
		cgroup_rstat_flush_release();
	} else {
		root_cgroup_cputime(&bstat);
		usage = bstat.cputime.sum_exec_runtime;
		utime = bstat.cputime.utime;
		stime = bstat.cputime.stime;
#ifdef CONFIG_SCHED_CORE
		forceidle_time = bstat.forceidle_sum;
#endif
	}

	do_div(usage, NSEC_PER_USEC);
	do_div(utime, NSEC_PER_USEC);
	do_div(stime, NSEC_PER_USEC);
#ifdef CONFIG_SCHED_CORE
	do_div(forceidle_time, NSEC_PER_USEC);
#endif

	seq_printf(seq, "usage_usec %llu\n"
		   "user_usec %llu\n"
		   "system_usec %llu\n",
		   usage, utime, stime);

#ifdef CONFIG_SCHED_CORE
	seq_printf(seq, "core_sched.force_idle_usec %llu\n", forceidle_time);
#endif
}

/* Add bpf kfuncs for cgroup_rstat_updated() and cgroup_rstat_flush() */
BTF_SET8_START(bpf_rstat_kfunc_ids)
BTF_ID_FLAGS(func, cgroup_rstat_updated)
BTF_ID_FLAGS(func, cgroup_rstat_flush, KF_SLEEPABLE)
BTF_SET8_END(bpf_rstat_kfunc_ids)

static const struct btf_kfunc_id_set bpf_rstat_kfunc_set = {
	.owner = THIS_MODULE,
	.set = &bpf_rstat_kfunc_ids,
};

static int __init bpf_rstat_kfunc_init(void)
{
	return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,
					 &bpf_rstat_kfunc_set);
}
late_initcall(bpf_rstat_kfunc_init);
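
/*
 * Sketch of how a tracing BPF program would consume these kfuncs (assumed
 * BPF-side declarations, not defined in this file):
 *
 *	extern void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) __ksym;
 *	extern void cgroup_rstat_flush(struct cgroup *cgrp) __ksym;
 *
 * cgroup_rstat_flush() is registered with KF_SLEEPABLE above, so it may
 * only be called from sleepable BPF programs.
 */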