// SPDX-License-Identifier: GPL-2.0-only

/*
 * rcuref - A scalable reference count implementation for RCU managed objects
 *
 * rcuref is provided to replace open coded reference count implementations
 * based on atomic_t. It protects explicitly RCU managed objects which can
 * be visible even after the last reference has been dropped and the object
 * is heading towards destruction.
 *
 * A common usage pattern is:
 *
 * get()
 *	rcu_read_lock();
 *	p = get_ptr();
 *	if (p && !atomic_inc_not_zero(&p->refcnt))
 *		p = NULL;
 *	rcu_read_unlock();
 *	return p;
 *
 * put()
 *	if (!atomic_dec_return(&p->refcnt)) {
 *		remove_ptr(p);
 *		kfree_rcu(p, rcu);
 *	}
 *
 * atomic_inc_not_zero() is implemented with a try_cmpxchg() loop which has
 * O(N^2) behaviour under contention with N concurrent operations.
 *
 * rcuref uses atomic_add_negative_relaxed() for the fast path, which scales
 * better under contention.
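 *
 * As a rough sketch, the rcuref based equivalent of the pattern above
 * would look like this (get_ptr()/remove_ptr() and the 'ref'/'rcu' members
 * are placeholders; rcuref_get()/rcuref_put() are declared in
 * linux/rcuref.h):
 *
 * get()
 *	rcu_read_lock();
 *	p = get_ptr();
 *	if (p && !rcuref_get(&p->ref))
 *		p = NULL;
 *	rcu_read_unlock();
 *	return p;
 *
 * put()
 *	if (rcuref_put(&p->ref)) {
 *		remove_ptr(p);
 *		kfree_rcu(p, rcu);
 *	}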
 *
 * Why not refcount?
 * =================
 *
 * In principle it should be possible to make refcount use the rcuref
 * scheme, but the destruction race described below cannot be prevented
 * unless the protected object is RCU managed.
 *
 * Theory of operation
 * ===================
 *
 * rcuref uses an unsigned integer reference counter. As long as the
 * counter value is greater than or equal to RCUREF_ONEREF and not larger
 * than RCUREF_MAXREF the reference is alive:
 *
 * ONEREF    MAXREF                SATURATED             RELEASED   DEAD       NOREF
 * 0         0x7FFFFFFF 0x80000000 0xA0000000 0xBFFFFFFF 0xC0000000 0xE0000000 0xFFFFFFFF
 * <------valid-------> <-------saturation zone--------> <-----------dead zone--------->
 *
 * The get() and put() operations do unconditional increments and
 * decrements. The result is checked after the operation. This optimizes
 * for the fast path.
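 *
 * A minimal sketch of what the fast paths in linux/rcuref.h roughly look
 * like (the slowpath handlers are the functions implemented in this file):
 *
 * rcuref_get()
 *	// Unconditionally increment. The saturation and dead zones
 *	// provide enough tolerance for this.
 *	if (likely(!atomic_add_negative_relaxed(1, &ref->refcnt)))
 *		return true;
 *	// Handle the cases inside the saturation and dead zones
 *	return rcuref_get_slowpath(ref);
 *
 * __rcuref_put()
 *	// Unconditionally decrement with release ordering.
 *	if (likely(!atomic_add_negative_release(-1, &ref->refcnt)))
 *		return false;
 *	// Handle the last reference drop and the saturation/dead zones
 *	return rcuref_put_slowpath(ref);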
 *
 * If the reference count is saturated or dead, then the increments and
 * decrements are not harmful as the reference count still stays in the
 * respective zones and is always set back to SATURATED resp. DEAD. The
 * zones have room for 2^28 racing operations in each direction, which
 * makes it practically impossible to escape the zones.
 *
 * Once the last reference is dropped the reference count becomes
 * RCUREF_NOREF which forces rcuref_put() into the slowpath operation. The
 * slowpath then tries to set the reference count from RCUREF_NOREF to
 * RCUREF_DEAD via a cmpxchg(). This opens a small window where a
 * concurrent rcuref_get() can acquire the reference count and bring it
 * back to RCUREF_ONEREF or even drop the reference again and mark it DEAD.
 *
 * If the cmpxchg() succeeds then a concurrent rcuref_get() will result in
 * DEAD + 1, which is inside the dead zone. If that happens the reference
 * count is put back to DEAD.
 *
 * The actual race is possible due to the unconditional increments and
 * decrements in rcuref_get() and rcuref_put():
 *
 * T1                                        T2
 * get()                                     put()
 *                                           if (atomic_add_negative(-1, &ref->refcnt))
 *                            succeeds ->      atomic_cmpxchg(&ref->refcnt, NOREF, DEAD);
 *
 * atomic_add_negative(1, &ref->refcnt);     <- Elevates refcount to DEAD + 1
 *
 * As the result of T1's add is negative, the get() goes into the slow path
 * and observes refcnt being in the dead zone which makes the operation fail.
 *
 * Possible critical states:
 *
 *	Context    Counter     References    Operation
 *	T1         0           1             init()
 *	T2         1           2             get()
 *	T1         0           1             put()
 *	T2        -1           0             put() tries to mark dead
 *	T1         0           1             get()
 *	T2         0           1             put() mark dead fails
 *	T1        -1           0             put() tries to mark dead
 *	T1         DEAD        0             put() mark dead succeeds
 *	T2         DEAD + 1    0             get() fails and puts it back to DEAD
 *
 * Of course there are more complex scenarios, but the above illustrates
 * the working principle. The rest is left to the imagination of the
 * reader.
 *
 * Deconstruction race
 * ===================
 *
 * The release operation must be protected by prohibiting a grace period in
 * order to prevent a possible use after free:
 *
 * T1                                        T2
 * put()                                     get()
 * // ref->refcnt = ONEREF
 * if (!atomic_add_negative(-1, &ref->refcnt))
 *	return false;                        <- Not taken
 *
 * // ref->refcnt == NOREF
 * --> preemption
 *                                           // Elevates ref->refcnt to ONEREF
 *                                           if (!atomic_add_negative(1, &ref->refcnt))
 *                                                   return true;           <- taken
 *
 *                                           if (put(&p->ref)) {             <-- Succeeds
 *                                                   remove_pointer(p);
 *                                                   kfree_rcu(p, rcu);
 *                                           }
 *
 *                  RCU grace period ends, object is freed
 *
 * atomic_cmpxchg(&ref->refcnt, NOREF, DEAD);   <- UAF
 *
 * This is prevented by disabling preemption around the put() operation, as
 * that is in most kernel configurations cheaper than an rcu_read_lock() /
 * rcu_read_unlock() pair and in many cases even a NOOP. In any case it
 * prevents the grace period which keeps the object alive until all put()
 * operations complete.
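 *
 * A rough sketch of how the rcuref_put() wrapper in linux/rcuref.h applies
 * this (callers which already run with preemption disabled or inside an
 * RCU read side critical section can use rcuref_put_rcusafe() instead):
 *
 * rcuref_put()
 *	preempt_disable();
 *	released = __rcuref_put(ref);
 *	preempt_enable();
 *	return released;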
 *
 * Saturation protection
 * =====================
 *
 * The reference count has a saturation limit RCUREF_MAXREF (INT_MAX).
 * Once this is exceeded the reference count becomes stale by setting it
 * to RCUREF_SATURATED, which will cause a memory leak, but it prevents
 * wraparounds which obviously cause worse problems than a memory
 * leak. When saturation is reached a warning is emitted.
 *
 * Race conditions
 * ===============
 *
 * All reference count increment/decrement operations are unconditional and
 * only verified after the fact. This optimizes for the good case and takes
 * the occasional race vs. a dead or already saturated refcount into
 * account. The saturation and dead zones are large enough to accommodate
 * that.
 *
 * Memory ordering
 * ===============
 *
 * Memory ordering rules are slightly relaxed compared to regular atomic_t
 * functions and provide only what is strictly required for refcounts.
 *
 * The increments are fully relaxed; these will not provide ordering. The
 * rationale is that whatever is used to obtain the object to increase the
 * reference count on will provide the ordering. For locked data
 * structures it's the lock acquire, for RCU/lockless data structures it's
 * the dependent load.
 *
 * rcuref_get() provides a control dependency which orders future stores and
 * ensures that the object is not modified when acquiring a reference
 * fails.
 *
 * rcuref_put() provides release ordering, i.e. all prior loads and stores
 * are issued before the decrement. It also provides a control dependency
 * ordering against the subsequent destruction of the object.
 *
 * If rcuref_put() successfully dropped the last reference and marked the
 * object DEAD it also provides acquire ordering.
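 *
 * A sketch of why this matters for destruction, assuming a hypothetical
 * struct obj with an rcuref_t member 'ref' and a struct rcu_head 'rcu':
 *
 *	// Some other CPU, holding a reference:
 *	obj->data = ...;
 *	if (rcuref_put(&obj->ref))	// release orders the store above
 *		...
 *
 *	// CPU dropping the last reference:
 *	if (rcuref_put(&obj->ref)) {
 *		// The acquire ordering provided on the successful NOREF ->
 *		// DEAD transition pairs with the release of the other CPUs'
 *		// put() operations, so all their prior stores to obj are
 *		// visible here before the object is destroyed.
 *		kfree_rcu(obj, rcu);
 *	}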
 */

#include <linux/export.h>
#include <linux/rcuref.h>

/**
 * rcuref_get_slowpath - Slowpath of rcuref_get()
 * @ref:	Pointer to the reference count
 *
 * Invoked when the reference count is outside of the valid zone.
 *
 * Return:
 *	False if the reference count was already marked dead
 *
 *	True if the reference count is saturated, which prevents the
 *	object from ever being deconstructed.
 */
bool rcuref_get_slowpath(rcuref_t *ref)
{
	unsigned int cnt = atomic_read(&ref->refcnt);

	/*
	 * If the reference count was already marked dead, undo the
	 * increment so it stays in the middle of the dead zone and return
	 * fail.
	 */
	if (cnt >= RCUREF_RELEASED) {
		atomic_set(&ref->refcnt, RCUREF_DEAD);
		return false;
	}

	/*
	 * If it was saturated, warn and mark it so. In case the increment
	 * was already on a saturated value restore the saturation
	 * marker. This keeps it in the middle of the saturation zone and
	 * prevents the reference count from overflowing. This leaks the
	 * object memory, but prevents the obvious reference count overflow
	 * damage.
	 */
	if (WARN_ONCE(cnt > RCUREF_MAXREF, "rcuref saturated - leaking memory"))
		atomic_set(&ref->refcnt, RCUREF_SATURATED);
	return true;
}
EXPORT_SYMBOL_GPL(rcuref_get_slowpath);

/**
 * rcuref_put_slowpath - Slowpath of __rcuref_put()
 * @ref:	Pointer to the reference count
 *
 * Invoked when the reference count is outside of the valid zone.
 *
 * Return:
 *	True if this was the last reference with no future references
 *	possible. This signals the caller that it can safely schedule the
 *	object, which is protected by the reference counter, for
 *	deconstruction.
 *
 *	False if there are still active references or the put() raced
 *	with a concurrent get()/put() pair. Caller is not allowed to
 *	deconstruct the protected object.
 */
bool rcuref_put_slowpath(rcuref_t *ref)
{
	unsigned int cnt = atomic_read(&ref->refcnt);

	/* Did this drop the last reference? */
	if (likely(cnt == RCUREF_NOREF)) {
		/*
		 * Carefully try to set the reference count to RCUREF_DEAD.
		 *
		 * This can fail if a concurrent get() operation has
		 * elevated it again or the corresponding put() even marked
		 * it dead already. Both are valid situations and do not
		 * require a retry. If this fails the caller is not
		 * allowed to deconstruct the object.
		 */
		if (!atomic_try_cmpxchg_release(&ref->refcnt, &cnt, RCUREF_DEAD))
			return false;

		/*
		 * The caller can safely schedule the object for
		 * deconstruction. Provide acquire ordering.
		 */
		smp_acquire__after_ctrl_dep();
		return true;
	}

	/*
	 * If the reference count was already in the dead zone, then this
	 * put() operation is imbalanced. Warn, put the reference count back to
	 * DEAD and tell the caller to not deconstruct the object.
	 */
	if (WARN_ONCE(cnt >= RCUREF_RELEASED, "rcuref - imbalanced put()")) {
		atomic_set(&ref->refcnt, RCUREF_DEAD);
		return false;
	}

	/*
	 * This is a put() operation on a saturated refcount. Restore the
	 * mean saturation value and tell the caller to not deconstruct the
	 * object.
	 */
	if (cnt > RCUREF_MAXREF)
		atomic_set(&ref->refcnt, RCUREF_SATURATED);
	return false;
}
EXPORT_SYMBOL_GPL(rcuref_put_slowpath);