| // SPDX-License-Identifier: GPL-2.0-only |
| /* |
| * kernel/workqueue.c - generic async execution with shared worker pool |
| * |
| * Copyright (C) 2002 Ingo Molnar |
| * |
| * Derived from the taskqueue/keventd code by: |
| * David Woodhouse <dwmw2@infradead.org> |
| * Andrew Morton |
| * Kai Petzke <wpp@marie.physik.tu-berlin.de> |
| * Theodore Ts'o <tytso@mit.edu> |
| * |
| * Made to use alloc_percpu by Christoph Lameter. |
| * |
| * Copyright (C) 2010 SUSE Linux Products GmbH |
| * Copyright (C) 2010 Tejun Heo <tj@kernel.org> |
| * |
| * This is the generic async execution mechanism. Work items are |
| * executed in process context. The worker pool is shared and |
| * automatically managed. There are two worker pools for each CPU (one for |
| * normal work items and the other for high priority ones) and some extra |
| * pools for workqueues which are not bound to any specific CPU - the |
| * number of these backing pools is dynamic. |
| * |
| * Please read Documentation/core-api/workqueue.rst for details. |
| */ |
| |
| #include <linux/export.h> |
| #include <linux/kernel.h> |
| #include <linux/sched.h> |
| #include <linux/init.h> |
| #include <linux/interrupt.h> |
| #include <linux/signal.h> |
| #include <linux/completion.h> |
| #include <linux/workqueue.h> |
| #include <linux/slab.h> |
| #include <linux/cpu.h> |
| #include <linux/notifier.h> |
| #include <linux/kthread.h> |
| #include <linux/hardirq.h> |
| #include <linux/mempolicy.h> |
| #include <linux/freezer.h> |
| #include <linux/debug_locks.h> |
| #include <linux/lockdep.h> |
| #include <linux/idr.h> |
| #include <linux/jhash.h> |
| #include <linux/hashtable.h> |
| #include <linux/rculist.h> |
| #include <linux/nodemask.h> |
| #include <linux/moduleparam.h> |
| #include <linux/uaccess.h> |
| #include <linux/sched/isolation.h> |
| #include <linux/sched/debug.h> |
| #include <linux/nmi.h> |
| #include <linux/kvm_para.h> |
| #include <linux/delay.h> |
| #include <linux/irq_work.h> |
| |
| #include "workqueue_internal.h" |
| |
/* Bit flags kept in worker_pool->flags. */
enum worker_pool_flags {
	/*
	 * worker_pool flags
	 *
	 * A bound pool is either associated or disassociated with its CPU.
	 * While associated (!DISASSOCIATED), all workers are bound to the
	 * CPU and none has %WORKER_UNBOUND set and concurrency management
	 * is in effect.
	 *
	 * While DISASSOCIATED, the cpu may be offline and all workers have
	 * %WORKER_UNBOUND set and concurrency management disabled, and may
	 * be executing on any CPU.  The pool behaves as an unbound one.
	 *
	 * Note that DISASSOCIATED should be flipped only while holding
	 * wq_pool_attach_mutex to avoid changing binding state while
	 * worker_attach_to_pool() is in progress.
	 *
	 * As there can only be one concurrent BH execution context per CPU, a
	 * BH pool is per-CPU and always DISASSOCIATED.
	 */
	POOL_BH			= 1 << 0,	/* is a BH pool */
	POOL_MANAGER_ACTIVE	= 1 << 1,	/* being managed */
	POOL_DISASSOCIATED	= 1 << 2,	/* cpu can't serve workers */
	POOL_BH_DRAINING	= 1 << 3,	/* draining after CPU offline */
};
| |
/*
 * Per-worker state bits.  Bits 0, 4 and 5 are not assigned by this enum.
 */
enum worker_flags {
	/* worker flags */
	WORKER_DIE		= 1 << 1,	/* die die die */
	WORKER_IDLE		= 1 << 2,	/* is idle */
	WORKER_PREP		= 1 << 3,	/* preparing to run works */
	WORKER_CPU_INTENSIVE	= 1 << 6,	/* cpu intensive */
	WORKER_UNBOUND		= 1 << 7,	/* worker is unbound */
	WORKER_REBOUND		= 1 << 8,	/* worker was rebound */

	/* composite mask: any of PREP, CPU_INTENSIVE, UNBOUND or REBOUND */
	WORKER_NOT_RUNNING	= WORKER_PREP | WORKER_CPU_INTENSIVE |
				  WORKER_UNBOUND | WORKER_REBOUND,
};
| |
/* Bit flags qualifying a work cancel request; they may be OR'ed together. */
enum work_cancel_flags {
	WORK_CANCEL_DELAYED	= 1 << 0,	/* canceling a delayed_work */
	WORK_CANCEL_DISABLE	= 1 << 1,	/* canceling to disable */
};
| |
/*
 * Compile-time tunables private to this file.  The HZ-derived values are
 * durations expressed in jiffies.
 */
enum wq_internal_consts {
	NR_STD_WORKER_POOLS	= 2,		/* # standard pools per cpu */

	UNBOUND_POOL_HASH_ORDER	= 6,		/* hashed by pool->attrs */
	BUSY_WORKER_HASH_ORDER	= 6,		/* 64 pointers */

	MAX_IDLE_WORKERS_RATIO	= 4,		/* 1/4 of busy can be idle */
	IDLE_WORKER_TIMEOUT	= 300 * HZ,	/* keep idle ones for 5 mins */

	MAYDAY_INITIAL_TIMEOUT  = HZ / 100 >= 2 ? HZ / 100 : 2,
						/* call for help after 10ms
						   (min two ticks) */
	MAYDAY_INTERVAL		= HZ / 10,	/* and then every 100ms */
	CREATE_COOLDOWN		= HZ,		/* time to breathe after fail */

	/*
	 * Rescue workers are used only on emergencies and shared by
	 * all cpus.  Give MIN_NICE.
	 */
	RESCUER_NICE_LEVEL	= MIN_NICE,
	HIGHPRI_NICE_LEVEL	= MIN_NICE,

	WQ_NAME_LEN		= 32,		/* size of workqueue_struct->name[] */
	WORKER_ID_LEN		= 10 + WQ_NAME_LEN, /* "kworker/R-" + WQ_NAME_LEN */
};
| |
/*
 * We don't want to trap softirq for too long. See MAX_SOFTIRQ_TIME and
 * MAX_SOFTIRQ_RESTART in kernel/softirq.c. These are macros because
 * msecs_to_jiffies() can't be an initializer.
 */
#define BH_WORKER_JIFFIES	msecs_to_jiffies(2)	/* 2ms, converted to jiffies */
#define BH_WORKER_RESTARTS	10			/* restart count limit */
| |
| /* |
| * Structure fields follow one of the following exclusion rules. |
| * |
| * I: Modifiable by initialization/destruction paths and read-only for |
| * everyone else. |
| * |
| * P: Preemption protected. Disabling preemption is enough and should |
| * only be modified and accessed from the local cpu. |
| * |
| * L: pool->lock protected. Access with pool->lock held. |
| * |
| * LN: pool->lock and wq_node_nr_active->lock protected for writes. Either for |
| * reads. |
| * |
| * K: Only modified by worker while holding pool->lock. Can be safely read by |
| * self, while holding pool->lock or from IRQ context if %current is the |
| * kworker. |
| * |
| * S: Only modified by worker self. |
| * |
| * A: wq_pool_attach_mutex protected. |
| * |
| * PL: wq_pool_mutex protected. |
| * |
| * PR: wq_pool_mutex protected for writes. RCU protected for reads. |
| * |
| * PW: wq_pool_mutex and wq->mutex protected for writes. Either for reads. |
| * |
| * PWR: wq_pool_mutex and wq->mutex protected for writes. Either or |
| * RCU for reads. |
| * |
| * WQ: wq->mutex protected. |
| * |
| * WR: wq->mutex protected for writes. RCU protected for reads. |
| * |
| * WO: wq->mutex protected for writes. Updated with WRITE_ONCE() and can be read |
| * with READ_ONCE() without locking. |
| * |
| * MD: wq_mayday_lock protected. |
| * |
| * WD: Used internally by the watchdog. |
| */ |
| |
| /* struct worker is defined in workqueue_internal.h */ |
| |
/*
 * A worker pool: the list of pending work items together with the workers
 * serving it and their bookkeeping.  The letter tags in the field comments
 * (I:, L:, A:, PL:, WD:) refer to the exclusion rules documented earlier in
 * this file.
 */
struct worker_pool {
	raw_spinlock_t		lock;		/* the pool lock */
	int			cpu;		/* I: the associated cpu */
	int			node;		/* I: the associated node ID */
	int			id;		/* I: pool ID */
	unsigned int		flags;		/* L: flags */

	unsigned long		watchdog_ts;	/* L: watchdog timestamp */
	bool			cpu_stall;	/* WD: stalled cpu bound pool */

	/*
	 * The counter is incremented in a process context on the associated CPU
	 * w/ preemption disabled, and decremented or reset in the same context
	 * but w/ pool->lock held. The readers grab pool->lock and are
	 * guaranteed to see if the counter reached zero.
	 */
	int			nr_running;

	struct list_head	worklist;	/* L: list of pending works */

	int			nr_workers;	/* L: total number of workers */
	int			nr_idle;	/* L: currently idle workers */

	struct list_head	idle_list;	/* L: list of idle workers */
	struct timer_list	idle_timer;	/* L: worker idle timeout */
	struct work_struct	idle_cull_work;	/* L: worker idle cleanup */

	struct timer_list	mayday_timer;	/* L: SOS timer for workers */

	/* a worker is either on busy_hash or idle_list, or the manager */
	DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER);
						/* L: hash of busy workers */

	struct worker		*manager;	/* L: purely informational */
	struct list_head	workers;	/* A: attached workers */

	struct ida		worker_ida;	/* worker IDs for task name */

	struct workqueue_attrs	*attrs;		/* I: worker attributes */
	struct hlist_node	hash_node;	/* PL: unbound_pool_hash node */
	int			refcnt;		/* PL: refcnt for unbound pools */

	/*
	 * Destruction of pool is RCU protected to allow dereferences
	 * from get_work_pool().
	 */
	struct rcu_head		rcu;
};
| |
/*
 * Per-pool_workqueue statistics. These can be monitored using
 * tools/workqueue/wq_monitor.py.
 *
 * Each enumerator is an index into pool_workqueue->stats[].
 */
enum pool_workqueue_stats {
	PWQ_STAT_STARTED,	/* work items started execution */
	PWQ_STAT_COMPLETED,	/* work items completed execution */
	PWQ_STAT_CPU_TIME,	/* total CPU time consumed */
	PWQ_STAT_CPU_INTENSIVE,	/* wq_cpu_intensive_thresh_us violations */
	PWQ_STAT_CM_WAKEUP,	/* concurrency-management worker wakeups */
	PWQ_STAT_REPATRIATED,	/* unbound workers brought back into scope */
	PWQ_STAT_MAYDAY,	/* maydays to rescuer */
	PWQ_STAT_RESCUED,	/* linked work items executed by rescuer */

	PWQ_NR_STATS,		/* number of counters; must stay last */
};
| |
/*
 * The per-pool workqueue.  While queued, bits below WORK_PWQ_SHIFT
 * of work_struct->data are used for flags and the remaining high bits
 * point to the pwq; thus, pwqs need to be aligned at two's power of the
 * number of flag bits.
 */
struct pool_workqueue {
	struct worker_pool	*pool;		/* I: the associated pool */
	struct workqueue_struct *wq;		/* I: the owning workqueue */
	int			work_color;	/* L: current color */
	int			flush_color;	/* L: flushing color */
	int			refcnt;		/* L: reference count */
	int			nr_in_flight[WORK_NR_COLORS];
						/* L: nr of in_flight works */
	bool			plugged;	/* L: execution suspended */

	/*
	 * nr_active management and WORK_STRUCT_INACTIVE:
	 *
	 * When pwq->nr_active >= max_active, new work item is queued to
	 * pwq->inactive_works instead of pool->worklist and marked with
	 * WORK_STRUCT_INACTIVE.
	 *
	 * All work items marked with WORK_STRUCT_INACTIVE do not participate in
	 * nr_active and all work items in pwq->inactive_works are marked with
	 * WORK_STRUCT_INACTIVE. But not all WORK_STRUCT_INACTIVE work items are
	 * in pwq->inactive_works. Some of them are ready to run in
	 * pool->worklist or worker->scheduled. Those work items are only struct
	 * wq_barrier which is used for flush_work() and should not participate
	 * in nr_active. For non-barrier work item, it is marked with
	 * WORK_STRUCT_INACTIVE iff it is in pwq->inactive_works.
	 */
	int			nr_active;	/* L: nr of active works */
	struct list_head	inactive_works;	/* L: inactive works */
	struct list_head	pending_node;	/* LN: node on wq_node_nr_active->pending_pwqs */
	struct list_head	pwqs_node;	/* WR: node on wq->pwqs */
	struct list_head	mayday_node;	/* MD: node on wq->maydays */

	u64			stats[PWQ_NR_STATS];	/* indexed by enum pool_workqueue_stats */

	/*
	 * Release of unbound pwq is punted to a kthread_worker. See put_pwq()
	 * and pwq_release_workfn() for details. pool_workqueue itself is also
	 * RCU protected so that the first pwq can be determined without
	 * grabbing wq->mutex.
	 */
	struct kthread_work	release_work;
	struct rcu_head		rcu;
	/* the alignment keeps the low WORK_STRUCT_PWQ_SHIFT bits of a pwq address zero */
} __aligned(1 << WORK_STRUCT_PWQ_SHIFT);
| |
/*
 * Structure used to wait for workqueue flush.
 */
struct wq_flusher {
	struct list_head	list;		/* WQ: list of flushers */
	int			flush_color;	/* WQ: flush color waiting for */
	struct completion	done;		/* flush completion */
};

/* forward declaration; workqueue_struct only holds a pointer to it */
struct wq_device;
| |
/*
 * Unlike in a per-cpu workqueue where max_active limits its concurrency level
 * on each CPU, in an unbound workqueue, max_active applies to the whole system.
 * As sharing a single nr_active across multiple sockets can be very expensive,
 * the counting and enforcement is per NUMA node.
 *
 * The following struct is used to enforce per-node max_active. When a pwq wants
 * to start executing a work item, it should increment ->nr using
 * tryinc_node_nr_active(). If acquisition fails due to ->nr already being over
 * ->max, the pwq is queued on ->pending_pwqs. As in-flight work items finish
 * and decrement ->nr, node_activate_pending_pwq() activates the pending pwqs in
 * round-robin order.
 *
 * pool_workqueue->pending_node is the list node linked onto ->pending_pwqs.
 */
struct wq_node_nr_active {
	int			max;		/* per-node max_active */
	atomic_t		nr;		/* per-node nr_active */
	raw_spinlock_t		lock;		/* nests inside pool locks */
	struct list_head	pending_pwqs;	/* LN: pwqs with inactive works */
};
| |
/*
 * The externally visible workqueue.  It relays the issued work items to
 * the appropriate worker_pool through its pool_workqueues.
 *
 * The letter tags in the field comments refer to the exclusion rules
 * documented earlier in this file.
 */
struct workqueue_struct {
	struct list_head	pwqs;		/* WR: all pwqs of this wq */
	struct list_head	list;		/* PR: list of all workqueues */

	struct mutex		mutex;		/* protects this wq */
	int			work_color;	/* WQ: current work color */
	int			flush_color;	/* WQ: current flush color */
	atomic_t		nr_pwqs_to_flush; /* flush in progress */
	struct wq_flusher	*first_flusher;	/* WQ: first flusher */
	struct list_head	flusher_queue;	/* WQ: flush waiters */
	struct list_head	flusher_overflow; /* WQ: flush overflow list */

	struct list_head	maydays;	/* MD: pwqs requesting rescue */
	struct worker		*rescuer;	/* MD: rescue worker */

	int			nr_drainers;	/* WQ: drain in progress */

	/* See alloc_workqueue() function comment for info on min/max_active */
	int			max_active;	/* WO: max active works */
	int			min_active;	/* WO: min active works */
	int			saved_max_active; /* WQ: saved max_active */
	int			saved_min_active; /* WQ: saved min_active */

	struct workqueue_attrs	*unbound_attrs;	/* PW: only for unbound wqs */
	struct pool_workqueue __rcu *dfl_pwq;   /* PW: only for unbound wqs */

#ifdef CONFIG_SYSFS
	struct wq_device	*wq_dev;	/* I: for sysfs interface */
#endif
#ifdef CONFIG_LOCKDEP
	char			*lock_name;
	struct lock_class_key	key;
	struct lockdep_map	__lockdep_map;
	struct lockdep_map	*lockdep_map;
#endif
	char			name[WQ_NAME_LEN]; /* I: workqueue name */

	/*
	 * Destruction of workqueue_struct is RCU protected to allow walking
	 * the workqueues list without grabbing wq_pool_mutex.
	 * This is used to dump all workqueues from sysrq.
	 */
	struct rcu_head		rcu;

	/* hot fields used during command issue, aligned to cacheline */
	unsigned int		flags ____cacheline_aligned; /* WQ: WQ_* flags */
	struct pool_workqueue __rcu * __percpu *cpu_pwq; /* I: per-cpu pwqs */
	/* flexible array member: must remain the last field */
	struct wq_node_nr_active *node_nr_active[]; /* I: per-node nr_active */
};
| |
/*
 * Each pod type describes how CPUs should be grouped for unbound workqueues.
 * See the comment above workqueue_attrs->affn_scope.
 *
 * The three pointer members are lookup tables, as their "x -> y" comments
 * indicate.
 */
struct wq_pod_type {
	int			nr_pods;	/* number of pods */
	cpumask_var_t		*pod_cpus;	/* pod -> cpus */
	int			*pod_node;	/* pod -> node */
	int			*cpu_pod;	/* cpu -> pod */
};
| |
/*
 * Unpacked view of the off-queue contents of work->data, as filled in by
 * work_offqd_unpack().
 */
struct work_offq_data {
	u32			pool_id;	/* WORK_OFFQ_POOL_BITS wide field */
	u32			disable;	/* WORK_OFFQ_DISABLE_BITS wide field */
	u32			flags;		/* data & WORK_OFFQ_FLAG_MASK */
};
| |
/* Human readable name of each affinity scope, indexed by WQ_AFFN_* value. */
static const char *wq_affn_names[WQ_AFFN_NR_TYPES] = {
	[WQ_AFFN_DFL]		= "default",
	[WQ_AFFN_CPU]		= "cpu",
	[WQ_AFFN_SMT]		= "smt",
	[WQ_AFFN_CACHE]		= "cache",
	[WQ_AFFN_NUMA]		= "numa",
	[WQ_AFFN_SYSTEM]	= "system",
};
| |
/*
 * Per-cpu work items which run for longer than the following threshold are
 * automatically considered CPU intensive and excluded from concurrency
 * management to prevent them from noticeably delaying other per-cpu work items.
 * ULONG_MAX indicates that the user hasn't overridden it with a boot parameter.
 * The actual value is initialized in wq_cpu_intensive_thresh_init().
 */
static unsigned long wq_cpu_intensive_thresh_us = ULONG_MAX;
module_param_named(cpu_intensive_thresh_us, wq_cpu_intensive_thresh_us, ulong, 0644);
#ifdef CONFIG_WQ_CPU_INTENSIVE_REPORT
/* only present with CONFIG_WQ_CPU_INTENSIVE_REPORT; defaults to 4 */
static unsigned int wq_cpu_intensive_warning_thresh = 4;
module_param_named(cpu_intensive_warning_thresh, wq_cpu_intensive_warning_thresh, uint, 0644);
#endif
| |
/*
 * see the comment above the definition of WQ_POWER_EFFICIENT
 *
 * The default follows CONFIG_WQ_POWER_EFFICIENT_DEFAULT; the parameter is
 * registered read-only (0444).
 */
static bool wq_power_efficient = IS_ENABLED(CONFIG_WQ_POWER_EFFICIENT_DEFAULT);
module_param_named(power_efficient, wq_power_efficient, bool, 0444);
| |
static bool wq_online;			/* can kworkers be created yet? */
/* NOTE(review): presumably set once topology setup has run - confirm at the setter */
static bool wq_topo_initialized __read_mostly = false;

/* NOTE(review): presumably the slab cache pool_workqueues come from - confirm at its users */
static struct kmem_cache *pwq_cache;

/* one pod description per affinity scope, indexed by WQ_AFFN_* */
static struct wq_pod_type wq_pod_types[WQ_AFFN_NR_TYPES];
/* affinity scope selected by default; starts out as WQ_AFFN_CACHE */
static enum wq_affn_scope wq_affn_dfl = WQ_AFFN_CACHE;

/* buf for wq_update_unbound_pod_attrs(), protected by CPU hotplug exclusion */
static struct workqueue_attrs *unbound_wq_update_pwq_attrs_buf;
| |
/*
 * File-wide locks.  They back the PL/PR/PW (wq_pool_mutex), A
 * (wq_pool_attach_mutex) and MD (wq_mayday_lock) exclusion rules documented
 * earlier in this file.
 */
static DEFINE_MUTEX(wq_pool_mutex);	/* protects pools and workqueues list */
static DEFINE_MUTEX(wq_pool_attach_mutex); /* protects worker attach/detach */
static DEFINE_RAW_SPINLOCK(wq_mayday_lock);	/* protects wq->maydays list */
/* wait for manager to go away */
static struct rcuwait manager_wait = __RCUWAIT_INITIALIZER(manager_wait);

static LIST_HEAD(workqueues);		/* PR: list of all workqueues */
static bool workqueue_freezing;		/* PL: have wqs started freezing? */
| |
/* PL: mirror the cpu_online_mask excluding the CPU in the midst of hotplugging */
static cpumask_var_t wq_online_cpumask;

/* PL&A: allowable cpus for unbound wqs and work items */
static cpumask_var_t wq_unbound_cpumask;

/* PL: user requested unbound cpumask via sysfs */
static cpumask_var_t wq_requested_unbound_cpumask;

/* PL: isolated cpumask to be excluded from unbound cpumask */
static cpumask_var_t wq_isolated_cpumask;

/* to further constrain wq_unbound_cpumask by cmdline parameter; __initdata */
static struct cpumask wq_cmdline_cpumask __initdata;
| |
/* CPU where unbound work was last round robin scheduled from this CPU */
static DEFINE_PER_CPU(int, wq_rr_cpu_last);

/*
 * Local execution of unbound work items is no longer guaranteed.  The
 * following always forces round-robin CPU selection on unbound work items
 * to uncover usages which depend on it.
 *
 * The default is chosen by CONFIG_DEBUG_WQ_FORCE_RR_CPU; the value is also
 * exposed as the debug_force_rr_cpu module parameter (mode 0644).
 */
#ifdef CONFIG_DEBUG_WQ_FORCE_RR_CPU
static bool wq_debug_force_rr_cpu = true;
#else
static bool wq_debug_force_rr_cpu = false;
#endif
module_param_named(debug_force_rr_cpu, wq_debug_force_rr_cpu, bool, 0644);
| |
/*
 * Each of the following is a per-CPU array with NR_STD_WORKER_POOLS entries.
 */

/* to raise softirq for the BH worker pools on other CPUs */
static DEFINE_PER_CPU_SHARED_ALIGNED(struct irq_work [NR_STD_WORKER_POOLS], bh_pool_irq_works);

/* the BH worker pools */
static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], bh_worker_pools);

/* the per-cpu worker pools */
static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], cpu_worker_pools);
| |
/* pool IDs are allocated from this IDR by worker_pool_assign_id() */
static DEFINE_IDR(worker_pool_idr);		/* PR: idr of all pools */

/* PL: hash of all unbound pools keyed by pool->attrs */
static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER);

/* I: attributes used when instantiating standard unbound pools on demand */
static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS];

/* I: attributes used when instantiating ordered pools on demand */
static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS];

/*
 * I: kthread_worker to release pwq's. pwq release needs to be bounced to a
 * process context while holding a pool lock. Bounce to a dedicated kthread
 * worker to avoid A-A deadlocks.
 */
static struct kthread_worker *pwq_release_worker __ro_after_init;
| |
/*
 * System-wide workqueues exported to the rest of the kernel.  system_wq is
 * exported with EXPORT_SYMBOL(), all the others with EXPORT_SYMBOL_GPL().
 * The two BH workqueues are the only ones not marked __ro_after_init.
 */
struct workqueue_struct *system_wq __ro_after_init;
EXPORT_SYMBOL(system_wq);
struct workqueue_struct *system_highpri_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_highpri_wq);
struct workqueue_struct *system_long_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_long_wq);
struct workqueue_struct *system_unbound_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_unbound_wq);
struct workqueue_struct *system_freezable_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_freezable_wq);
struct workqueue_struct *system_power_efficient_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_power_efficient_wq);
struct workqueue_struct *system_freezable_power_efficient_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);
struct workqueue_struct *system_bh_wq;
EXPORT_SYMBOL_GPL(system_bh_wq);
struct workqueue_struct *system_bh_highpri_wq;
EXPORT_SYMBOL_GPL(system_bh_highpri_wq);
| |
/* prototypes of file-local functions; their definitions are not in this part of the file */
static int worker_thread(void *__worker);
static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
static void show_pwq(struct pool_workqueue *pwq);
static void show_one_worker_pool(struct worker_pool *pool);
| |
| #define CREATE_TRACE_POINTS |
| #include <trace/events/workqueue.h> |
| |
/*
 * Lockdep assertion: warn unless the caller is inside some RCU read-side
 * critical section or holds wq_pool_mutex.
 */
#define assert_rcu_or_pool_mutex()					\
	RCU_LOCKDEP_WARN(!rcu_read_lock_any_held() &&			\
			 !lockdep_is_held(&wq_pool_mutex),		\
			 "RCU or wq_pool_mutex should be held")
| |
/*
 * Lockdep assertion: warn unless the caller is inside some RCU read-side
 * critical section, holds @wq->mutex, or holds wq_pool_mutex.
 *
 * @wq is parenthesized so that any pointer expression may be passed.
 */
#define assert_rcu_or_wq_mutex_or_pool_mutex(wq)			\
	RCU_LOCKDEP_WARN(!rcu_read_lock_any_held() &&			\
			 !lockdep_is_held(&(wq)->mutex) &&		\
			 !lockdep_is_held(&wq_pool_mutex),		\
			 "RCU, wq->mutex or wq_pool_mutex should be held")
| |
/*
 * Iterate @pool over the NR_STD_WORKER_POOLS entries of @cpu's
 * bh_worker_pools array.  @cpu is evaluated on every iteration.
 */
#define for_each_bh_worker_pool(pool, cpu)				\
	for ((pool) = &per_cpu(bh_worker_pools, cpu)[0];		\
	     (pool) < &per_cpu(bh_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \
	     (pool)++)
| |
/*
 * Iterate @pool over the NR_STD_WORKER_POOLS entries of @cpu's
 * cpu_worker_pools array.  @cpu is evaluated on every iteration.
 */
#define for_each_cpu_worker_pool(pool, cpu)				\
	for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0];		\
	     (pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \
	     (pool)++)
| |
/**
 * for_each_pool - iterate through all worker_pools in the system
 * @pool: iteration cursor
 * @pi: integer used for iteration
 *
 * This must be called either with wq_pool_mutex held or RCU read
 * locked.  If the pool needs to be used beyond the locking in effect, the
 * caller is responsible for guaranteeing that the pool stays online.
 *
 * The pools visited are the entries of worker_pool_idr, with @pi receiving
 * the IDR index of the current entry.
 *
 * The if/else clause exists only for the lockdep assertion and can be
 * ignored.
 */
#define for_each_pool(pool, pi)						\
	idr_for_each_entry(&worker_pool_idr, pool, pi)			\
		if (({ assert_rcu_or_pool_mutex(); false; })) { }	\
		else
| |
/**
 * for_each_pool_worker - iterate through all workers of a worker_pool
 * @worker: iteration cursor
 * @pool: worker_pool to iterate workers of
 *
 * This must be called with wq_pool_attach_mutex.
 *
 * The walk follows @pool->workers through each worker's ->node member.
 *
 * The if/else clause exists only for the lockdep assertion and can be
 * ignored.
 */
#define for_each_pool_worker(worker, pool)				\
	list_for_each_entry((worker), &(pool)->workers, node)		\
		if (({ lockdep_assert_held(&wq_pool_attach_mutex); false; })) { } \
		else
| |
/**
 * for_each_pwq - iterate through all pool_workqueues of the specified workqueue
 * @pwq: iteration cursor
 * @wq: the target workqueue
 *
 * This must be called either with wq->mutex held or RCU read locked.
 * If the pwq needs to be used beyond the locking in effect, the caller is
 * responsible for guaranteeing that the pwq stays online.
 *
 * The lockdep condition is handed to list_for_each_entry_rcu() as its
 * optional extra argument, so holding @wq->mutex is accepted in place of an
 * RCU read-side critical section.  @wq is parenthesized so that any pointer
 * expression may be passed.
 */
#define for_each_pwq(pwq, wq)						\
	list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node,		\
				 lockdep_is_held(&(wq)->mutex))
| |
| #ifdef CONFIG_DEBUG_OBJECTS_WORK |
| |
/*
 * Tentative declaration: the fixup callbacks below take the descriptor's
 * address before its initializer appears.
 */
static const struct debug_obj_descr work_debug_descr;
| |
| static void *work_debug_hint(void *addr) |
| { |
| return ((struct work_struct *) addr)->func; |
| } |
| |
| static bool work_is_static_object(void *addr) |
| { |
| struct work_struct *work = addr; |
| |
| return test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work)); |
| } |
| |
| /* |
| * fixup_init is called when: |
| * - an active object is initialized |
| */ |
| static bool work_fixup_init(void *addr, enum debug_obj_state state) |
| { |
| struct work_struct *work = addr; |
| |
| switch (state) { |
| case ODEBUG_STATE_ACTIVE: |
| cancel_work_sync(work); |
| debug_object_init(work, &work_debug_descr); |
| return true; |
| default: |
| return false; |
| } |
| } |
| |
| /* |
| * fixup_free is called when: |
| * - an active object is freed |
| */ |
| static bool work_fixup_free(void *addr, enum debug_obj_state state) |
| { |
| struct work_struct *work = addr; |
| |
| switch (state) { |
| case ODEBUG_STATE_ACTIVE: |
| cancel_work_sync(work); |
| debug_object_free(work, &work_debug_descr); |
| return true; |
| default: |
| return false; |
| } |
| } |
| |
/* debugobjects descriptor for work_struct, wiring up the callbacks above */
static const struct debug_obj_descr work_debug_descr = {
	.name		= "work_struct",
	.debug_hint	= work_debug_hint,
	.is_static_object = work_is_static_object,
	.fixup_init	= work_fixup_init,
	.fixup_free	= work_fixup_free,
};
| |
| static inline void debug_work_activate(struct work_struct *work) |
| { |
| debug_object_activate(work, &work_debug_descr); |
| } |
| |
| static inline void debug_work_deactivate(struct work_struct *work) |
| { |
| debug_object_deactivate(work, &work_debug_descr); |
| } |
| |
| void __init_work(struct work_struct *work, int onstack) |
| { |
| if (onstack) |
| debug_object_init_on_stack(work, &work_debug_descr); |
| else |
| debug_object_init(work, &work_debug_descr); |
| } |
| EXPORT_SYMBOL_GPL(__init_work); |
| |
| void destroy_work_on_stack(struct work_struct *work) |
| { |
| debug_object_free(work, &work_debug_descr); |
| } |
| EXPORT_SYMBOL_GPL(destroy_work_on_stack); |
| |
| void destroy_delayed_work_on_stack(struct delayed_work *work) |
| { |
| destroy_timer_on_stack(&work->timer); |
| debug_object_free(&work->work, &work_debug_descr); |
| } |
| EXPORT_SYMBOL_GPL(destroy_delayed_work_on_stack); |
| |
| #else |
/* !CONFIG_DEBUG_OBJECTS_WORK: the activation hooks compile to nothing */
static inline void debug_work_activate(struct work_struct *work)
{
}

static inline void debug_work_deactivate(struct work_struct *work)
{
}
| #endif |
| |
| /** |
| * worker_pool_assign_id - allocate ID and assign it to @pool |
| * @pool: the pool pointer of interest |
| * |
| * Returns 0 if ID in [0, WORK_OFFQ_POOL_NONE) is allocated and assigned |
| * successfully, -errno on failure. |
| */ |
| static int worker_pool_assign_id(struct worker_pool *pool) |
| { |
| int ret; |
| |
| lockdep_assert_held(&wq_pool_mutex); |
| |
| ret = idr_alloc(&worker_pool_idr, pool, 0, WORK_OFFQ_POOL_NONE, |
| GFP_KERNEL); |
| if (ret >= 0) { |
| pool->id = ret; |
| return 0; |
| } |
| return ret; |
| } |
| |
| static struct pool_workqueue __rcu ** |
| unbound_pwq_slot(struct workqueue_struct *wq, int cpu) |
| { |
| if (cpu >= 0) |
| return per_cpu_ptr(wq->cpu_pwq, cpu); |
| else |
| return &wq->dfl_pwq; |
| } |
| |
| /* @cpu < 0 for dfl_pwq */ |
| static struct pool_workqueue *unbound_pwq(struct workqueue_struct *wq, int cpu) |
| { |
| return rcu_dereference_check(*unbound_pwq_slot(wq, cpu), |
| lockdep_is_held(&wq_pool_mutex) || |
| lockdep_is_held(&wq->mutex)); |
| } |
| |
| /** |
| * unbound_effective_cpumask - effective cpumask of an unbound workqueue |
| * @wq: workqueue of interest |
| * |
| * @wq->unbound_attrs->cpumask contains the cpumask requested by the user which |
| * is masked with wq_unbound_cpumask to determine the effective cpumask. The |
| * default pwq is always mapped to the pool with the current effective cpumask. |
| */ |
| static struct cpumask *unbound_effective_cpumask(struct workqueue_struct *wq) |
| { |
| return unbound_pwq(wq, -1)->pool->attrs->__pod_cpumask; |
| } |
| |
| static unsigned int work_color_to_flags(int color) |
| { |
| return color << WORK_STRUCT_COLOR_SHIFT; |
| } |
| |
| static int get_work_color(unsigned long work_data) |
| { |
| return (work_data >> WORK_STRUCT_COLOR_SHIFT) & |
| ((1 << WORK_STRUCT_COLOR_BITS) - 1); |
| } |
| |
| static int work_next_color(int color) |
| { |
| return (color + 1) % WORK_NR_COLORS; |
| } |
| |
| static unsigned long pool_offq_flags(struct worker_pool *pool) |
| { |
| return (pool->flags & POOL_BH) ? WORK_OFFQ_BH : 0; |
| } |
| |
| /* |
| * While queued, %WORK_STRUCT_PWQ is set and non flag bits of a work's data |
| * contain the pointer to the queued pwq. Once execution starts, the flag |
| * is cleared and the high bits contain OFFQ flags and pool ID. |
| * |
| * set_work_pwq(), set_work_pool_and_clear_pending() and mark_work_canceling() |
| * can be used to set the pwq, pool or clear work->data. These functions should |
| * only be called while the work is owned - ie. while the PENDING bit is set. |
| * |
| * get_work_pool() and get_work_pwq() can be used to obtain the pool or pwq |
| * corresponding to a work. Pool is available once the work has been |
| * queued anywhere after initialization until it is sync canceled. pwq is |
| * available only while the work item is queued. |
| */ |
| static inline void set_work_data(struct work_struct *work, unsigned long data) |
| { |
| WARN_ON_ONCE(!work_pending(work)); |
| atomic_long_set(&work->data, data | work_static(work)); |
| } |
| |
| static void set_work_pwq(struct work_struct *work, struct pool_workqueue *pwq, |
| unsigned long flags) |
| { |
| set_work_data(work, (unsigned long)pwq | WORK_STRUCT_PENDING | |
| WORK_STRUCT_PWQ | flags); |
| } |
| |
| static void set_work_pool_and_keep_pending(struct work_struct *work, |
| int pool_id, unsigned long flags) |
| { |
| set_work_data(work, ((unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT) | |
| WORK_STRUCT_PENDING | flags); |
| } |
| |
| static void set_work_pool_and_clear_pending(struct work_struct *work, |
| int pool_id, unsigned long flags) |
| { |
| /* |
| * The following wmb is paired with the implied mb in |
| * test_and_set_bit(PENDING) and ensures all updates to @work made |
| * here are visible to and precede any updates by the next PENDING |
| * owner. |
| */ |
| smp_wmb(); |
| set_work_data(work, ((unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT) | |
| flags); |
| /* |
| * The following mb guarantees that previous clear of a PENDING bit |
| * will not be reordered with any speculative LOADS or STORES from |
| * work->current_func, which is executed afterwards. This possible |
| * reordering can lead to a missed execution on attempt to queue |
| * the same @work. E.g. consider this case: |
| * |
| * CPU#0 CPU#1 |
| * ---------------------------- -------------------------------- |
| * |
| * 1 STORE event_indicated |
| * 2 queue_work_on() { |
| * 3 test_and_set_bit(PENDING) |
| * 4 } set_..._and_clear_pending() { |
| * 5 set_work_data() # clear bit |
| * 6 smp_mb() |
| * 7 work->current_func() { |
| * 8 LOAD event_indicated |
| * } |
| * |
| * Without an explicit full barrier speculative LOAD on line 8 can |
| * be executed before CPU#0 does STORE on line 1. If that happens, |
| * CPU#0 observes the PENDING bit is still set and new execution of |
| * a @work is not queued in a hope, that CPU#1 will eventually |
| * finish the queued @work. Meanwhile CPU#1 does not see |
| * event_indicated is set, because speculative LOAD was executed |
| * before actual STORE. |
| */ |
| smp_mb(); |
| } |
| |
| static inline struct pool_workqueue *work_struct_pwq(unsigned long data) |
| { |
| return (struct pool_workqueue *)(data & WORK_STRUCT_PWQ_MASK); |
| } |
| |
| static struct pool_workqueue *get_work_pwq(struct work_struct *work) |
| { |
| unsigned long data = atomic_long_read(&work->data); |
| |
| if (data & WORK_STRUCT_PWQ) |
| return work_struct_pwq(data); |
| else |
| return NULL; |
| } |
| |
| /**
|  * get_work_pool - look up the worker_pool @work was last associated with
|  * @work: the work item of interest
|  *
|  * Worker pools are created and destroyed under wq_pool_mutex and may be
|  * read under the RCU read lock, so call this either with wq_pool_mutex
|  * held or from within an rcu_read_lock() section.
|  *
|  * Every field of the returned pool may be accessed while that protection
|  * is in effect. A caller that wants to keep using the pool after leaving
|  * the critical section must itself make sure that the pool is, and
|  * remains, online.
|  *
|  * Return: The worker_pool @work was last associated with, or %NULL if
|  * there is none.
|  */
| static struct worker_pool *get_work_pool(struct work_struct *work)
| {
| 	unsigned long data = atomic_long_read(&work->data);
| 	int pool_id;
|
| 	assert_rcu_or_pool_mutex();
|
| 	if (!(data & WORK_STRUCT_PWQ)) {
| 		/* no pwq pointer in @data: the bits above the shift hold a pool ID */
| 		pool_id = data >> WORK_OFFQ_POOL_SHIFT;
| 		if (pool_id != WORK_OFFQ_POOL_NONE)
| 			return idr_find(&worker_pool_idr, pool_id);
| 		return NULL;
| 	}
|
| 	return work_struct_pwq(data)->pool;
| }
| |
| /* Extract the @bits-wide field of @v that starts at bit position @shift. */
| static unsigned long shift_and_mask(unsigned long v, u32 shift, u32 bits)
| {
| 	unsigned int field_mask = (1U << bits) - 1;
|
| 	return (v >> shift) & field_mask;
| }
| |
| /*
|  * Decode the off-queue fields (flags, disable depth and pool ID) packed in
|  * @data into @offqd.
|  */
| static void work_offqd_unpack(struct work_offq_data *offqd, unsigned long data)
| {
| 	/* off-queue data is not expected to carry the WORK_STRUCT_PWQ bit */
| 	WARN_ON_ONCE(data & WORK_STRUCT_PWQ);
|
| 	offqd->flags = data & WORK_OFFQ_FLAG_MASK;
| 	offqd->disable = shift_and_mask(data, WORK_OFFQ_DISABLE_SHIFT,
| 					WORK_OFFQ_DISABLE_BITS);
| 	offqd->pool_id = shift_and_mask(data, WORK_OFFQ_POOL_SHIFT,
| 					WORK_OFFQ_POOL_BITS);
| }
| |
| /*
|  * Pack @offqd->flags and @offqd->disable back into data-word layout. The
|  * pool ID is not included in the result.
|  */
| static unsigned long work_offqd_pack_flags(struct work_offq_data *offqd)
| {
| 	unsigned long packed = (unsigned long)offqd->flags;
|
| 	packed |= (unsigned long)offqd->disable << WORK_OFFQ_DISABLE_SHIFT;
|
| 	return packed;
| }
| |
| /* |
| * Policy functions. These define the policies on how the global worker |
| * pools are managed. Unless noted otherwise, these functions assume that |
| * they're being called with pool->lock held. |
| */ |
| |
| /*
|  * Should a worker be woken up? Meant to be called from anywhere except
|  * currently running workers.
|  *
|  * Unbound workers never contribute to nr_running, so for unbound pools
|  * this is %true whenever the worklist is non-empty.
|  */
| static bool need_more_worker(struct worker_pool *pool)
| {
| 	if (list_empty(&pool->worklist))
| 		return false;
|
| 	return !pool->nr_running;
| }
| |
| /*
|  * May the caller start working? Called from busy but !running workers.
|  * True as long as the pool has at least one idle worker.
|  */
| static bool may_start_working(struct worker_pool *pool)
| {
| 	return pool->nr_idle != 0;
| }
| |
| /*
|  * Should the caller keep processing work items? Called from currently
|  * running workers. True while work is pending and at most one worker is
|  * counted as running.
|  */
| static bool keep_working(struct worker_pool *pool)
| {
| 	if (pool->nr_running > 1)
| 		return false;
|
| 	return !list_empty(&pool->worklist);
| }
| |
| /*
|  * Is a brand new worker required? Called from the manager. True when a
|  * worker is needed but no idle one is available to take over.
|  */
| static bool need_to_create_worker(struct worker_pool *pool)
| {
| 	if (!need_more_worker(pool))
| 		return false;
|
| 	return !may_start_working(pool);
| }
| |
| /*
|  * Does the pool hold more idle workers than it should, so that some may
|  * go away? Two idle workers are always tolerated; beyond that, the excess
|  * scaled by MAX_IDLE_WORKERS_RATIO is compared against the busy count.
|  */
| static bool too_many_workers(struct worker_pool *pool)
| {
| 	int idle = pool->nr_idle;
| 	int busy, excess;
|
| 	/* an active manager is considered idle */
| 	if (pool->flags & POOL_MANAGER_ACTIVE)
| 		idle++;
|
| 	busy = pool->nr_workers - idle;
| 	excess = idle - 2;
| 	if (excess <= 0)
| 		return false;
|
| 	return excess * MAX_IDLE_WORKERS_RATIO >= busy;
| }
| |
| /**
|  * worker_set_flags - set worker flags and keep nr_running in sync
|  * @worker: self
|  * @flags: flags to set
|  *
|  * OR @flags into @worker->flags. If this takes @worker from having no
|  * WORKER_NOT_RUNNING bit to having at least one, the pool's nr_running is
|  * decremented. Must be called with the pool lock held.
|  */
| static inline void worker_set_flags(struct worker *worker, unsigned int flags)
| {
| 	struct worker_pool *pool = worker->pool;
| 	bool counted;
|
| 	lockdep_assert_held(&pool->lock);
|
| 	/* is @worker currently contributing to nr_running? */
| 	counted = !(worker->flags & WORKER_NOT_RUNNING);
|
| 	/* entering NOT_RUNNING for the first time: stop counting it */
| 	if (counted && (flags & WORKER_NOT_RUNNING))
| 		pool->nr_running--;
|
| 	worker->flags |= flags;
| }
| |
| /**
|  * worker_clr_flags - clear worker flags and keep nr_running in sync
|  * @worker: self
|  * @flags: flags to clear
|  *
|  * Clear @flags from @worker->flags. nr_running is incremented only when
|  * this call removes the last remaining WORKER_NOT_RUNNING bit. Must be
|  * called with the pool lock held.
|  */
| static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
| {
| 	struct worker_pool *pool = worker->pool;
| 	unsigned int prev_flags = worker->flags;
|
| 	lockdep_assert_held(&pool->lock);
|
| 	worker->flags &= ~flags;
|
| 	/*
| 	 * WORKER_NOT_RUNNING is a mask of several flags rather than a single
| 	 * bit, so clearing one of them is only a transition to "running" if
| 	 * none of the others is left behind.
| 	 */
| 	if (!(flags & WORKER_NOT_RUNNING))
| 		return;		/* no NOT_RUNNING bit was requested */
| 	if (!(prev_flags & WORKER_NOT_RUNNING))
| 		return;		/* was already counted as running */
| 	if (worker->flags & WORKER_NOT_RUNNING)
| 		return;		/* other NOT_RUNNING bits still set */
|
| 	pool->nr_running++;
| }
| |
| /*
|  * Return the worker at the head of @pool->idle_list, or %NULL if the list
|  * is empty. Called with pool->lock held.
|  */
| static struct worker *first_idle_worker(struct worker_pool *pool)
| {
| 	struct worker *first = NULL;
|
| 	if (likely(!list_empty(&pool->idle_list)))
| 		first = list_first_entry(&pool->idle_list, struct worker, entry);
|
| 	return first;
| }
| |
| /**
| * worker_enter_idle - enter idle state
| * @worker: worker which is entering idle state
| *
| * @worker is entering idle state. Update stats and idle timer if
| * necessary.
| *
| * Sets WORKER_IDLE on @worker, increments pool->nr_idle, stamps
| * @worker->last_active with the current jiffies and inserts @worker at the
| * head of pool->idle_list. pool->idle_timer is armed when
| * too_many_workers() reports a surplus and the timer isn't already pending.
| *
| * LOCKING:
| * raw_spin_lock_irq(pool->lock).
| */
| static void worker_enter_idle(struct worker *worker)
| {
| struct worker_pool *pool = worker->pool;
|
| /*
| * Refuse (with a one-time warning) if @worker is already idle, or if
| * it is still linked through ->entry while ->hentry is also in use.
| */
| if (WARN_ON_ONCE(worker->flags & WORKER_IDLE) ||
| WARN_ON_ONCE(!list_empty(&worker->entry) &&
| (worker->hentry.next || worker->hentry.pprev)))
| return;
|
| /* can't use worker_set_flags(), also called from create_worker() */
| worker->flags |= WORKER_IDLE;
| pool->nr_idle++;
| worker->last_active = jiffies;
|
| /* idle_list is LIFO */
| list_add(&worker->entry, &pool->idle_list);
|
| /* start the idle timeout unless it is already ticking */
| if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
| mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
|
| /* Sanity check nr_running: if every worker is idle, none may be running. */
| WARN_ON_ONCE(pool->nr_workers == pool->nr_idle && pool->nr_running);
| }
| |
| /**
|  * worker_leave_idle - leave idle state
|  * @worker: worker which is leaving idle state
|  *
|  * Take @worker out of the idle state: clear WORKER_IDLE, drop the pool's
|  * idle count and unlink @worker from the idle list. Warns once and does
|  * nothing if @worker isn't marked idle.
|  *
|  * LOCKING:
|  * raw_spin_lock_irq(pool->lock).
|  */
| static void worker_leave_idle(struct worker *worker)
| {
| 	bool is_idle = worker->flags & WORKER_IDLE;
|
| 	if (WARN_ON_ONCE(!is_idle))
| 		return;
|
| 	worker_clr_flags(worker, WORKER_IDLE);
| 	worker->pool->nr_idle--;
| 	list_del_init(&worker->entry);
| }
| |
| /**
|  * find_worker_executing_work - find worker which is executing a work
|  * @pool: pool of interest
|  * @work: work to find worker for
|  *
|  * Look through @pool->busy_hash, which is keyed by work item address, for
|  * a worker that is currently executing @work. A worker only matches when
|  * both the address of its current work item and its current work function
|  * agree with @work.
|  *
|  * The function comparison exists because a work item may be freed as soon
|  * as its execution starts, and the freed memory may be recycled for an
|  * unrelated work item. If the address alone were compared, a recycled item
|  * would be mistaken for the one still executing and would be made to wait
|  * for it, creating a dependency between unrelated executions.
|  *
|  * The check is not airtight: a work function can still create a dependency
|  * on itself through a recycled work item. Such a deadlock, should it ever
|  * happen, ought to be easy to trace back to the offending work function.
|  *
|  * CONTEXT:
|  * raw_spin_lock_irq(pool->lock).
|  *
|  * Return:
|  * Pointer to the worker executing @work if there is one, %NULL otherwise.
|  */
| static struct worker *find_worker_executing_work(struct worker_pool *pool,
| 						 struct work_struct *work)
| {
| 	struct worker *busy;
|
| 	hash_for_each_possible(pool->busy_hash, busy, hentry,
| 			       (unsigned long)work) {
| 		if (busy->current_work != work)
| 			continue;
| 		if (busy->current_func != work->func)
| 			continue;
| 		return busy;
| 	}
|
| 	return NULL;
| }
| |
| /**
| * move_linked_works - move linked works to a list
| * @work: start of series of works to be scheduled
| * @head: target list to append @work to
| * @nextp: out parameter for nested worklist walking
| *
| * Schedule linked works starting from @work to @head. Work series to be
| * scheduled starts at @work and includes any consecutive work with
| * WORK_STRUCT_LINKED set in its predecessor. See assign_work() for details on
| * @nextp.
| *
| * Each moved item is appended to the tail of @head, so the series keeps its
| * relative order. If @nextp is not NULL, it receives the entry that followed
| * the last moved work item on the source list.
| *
| * CONTEXT:
| * raw_spin_lock_irq(pool->lock).
| */
| static void move_linked_works(struct work_struct *work, struct list_head *head,
| struct work_struct **nextp)
| {
| struct work_struct *n;
|
| /*
| * Linked worklist will always end before the end of the list,
| * use NULL for list head.
| *
| * With a NULL head the iteration has no natural end condition; the
| * loop is left only through the break below, once an item without
| * WORK_STRUCT_LINKED has been moved.
| */
| list_for_each_entry_safe_from(work, n, NULL, entry) {
| list_move_tail(&work->entry, head);
| if (!(*work_data_bits(work) & WORK_STRUCT_LINKED))
| break;
| }
|
| /*
| * If we're already inside safe list traversal and have moved
| * multiple works to the scheduled queue, the next position
| * needs to be updated.
| */
| if (nextp)
| *nextp = n;
| }
| |
| /**
|  * assign_work - assign a work item and its linked work items to a worker
|  * @work: work to assign
|  * @worker: worker to assign to
|  * @nextp: out parameter for nested worklist walking
|  *
|  * Hand @work, together with the work items linked to it, to @worker. If
|  * another worker of the same pool is already executing @work, the items
|  * are queued on that worker instead.
|  *
|  * A non-NULL @nextp is updated to point to the work item following the
|  * last one that was moved, which lets assign_work() be used from inside
|  * list_for_each_entry_safe().
|  *
|  * Returns %true if @work was assigned to @worker, %false if it was punted
|  * to a worker that is already executing it.
|  */
| static bool assign_work(struct work_struct *work, struct worker *worker,
| 			struct work_struct **nextp)
| {
| 	struct worker *busy;
| 	struct list_head *target;
|
| 	lockdep_assert_held(&worker->pool->lock);
|
| 	/*
| 	 * One work item must not run on several workers at once.
| 	 * __queue_work() keeps @work from moving to a different pool while
| 	 * it still runs in the previous one; within a pool, it is up to us.
| 	 * If some worker of this pool is already processing @work, defer to
| 	 * that worker rather than starting a concurrent execution.
| 	 */
| 	busy = find_worker_executing_work(worker->pool, work);
| 	if (unlikely(busy))
| 		target = &busy->scheduled;
| 	else
| 		target = &worker->scheduled;
|
| 	move_linked_works(work, target, nextp);
|
| 	return !busy;
| }
| |
| /*
|  * Return the irq_work slot for BH @pool: the per-CPU bh_pool_irq_works
|  * entry of @pool->cpu, index 1 for a HIGHPRI_NICE_LEVEL pool and index 0
|  * otherwise.
|  */
| static struct irq_work *bh_pool_irq_work(struct worker_pool *pool)
| {
| 	int idx = 0;
|
| 	if (pool->attrs->nice == HIGHPRI_NICE_LEVEL)
| 		idx = 1;
|
| 	return &per_cpu(bh_pool_irq_works, pool->cpu)[idx];
| }
| |
| /*
| * Trigger processing of BH @pool.
| *
| * On SMP, when @pool belongs to a CPU other than the current one and is not
| * flagged POOL_BH_DRAINING, the pool's irq_work is queued on @pool->cpu and
| * nothing is raised locally. In every other case the matching softirq is
| * raised on the current CPU: HI_SOFTIRQ for a HIGHPRI_NICE_LEVEL pool,
| * TASKLET_SOFTIRQ otherwise.
| */
| static void kick_bh_pool(struct worker_pool *pool)
| {
| #ifdef CONFIG_SMP
| /* see drain_dead_softirq_workfn() for BH_DRAINING */
| if (unlikely(pool->cpu != smp_processor_id() &&
| !(pool->flags & POOL_BH_DRAINING))) {
| irq_work_queue_on(bh_pool_irq_work(pool), pool->cpu);
| return;
| }
| #endif
| /* local (or draining) pool: pick the softirq by pool priority */
| if (pool->attrs->nice == HIGHPRI_NICE_LEVEL)
| raise_softirq_irqoff(HI_SOFTIRQ);
| else
| raise_softirq_irqoff(TASKLET_SOFTIRQ);
| }
| |
| /**
| * kick_pool - wake up an idle worker if necessary
| * @pool: pool to kick
| *
| * @pool may have pending work items. Wake up worker if necessary. Returns
| * whether a worker was woken up.
| *
| * For POOL_BH pools no task is woken; kick_bh_pool() is called instead and
| * %true is returned. Must be called with @pool->lock held.
| */
| static bool kick_pool(struct worker_pool *pool)
| {
| struct worker *worker = first_idle_worker(pool);
| struct task_struct *p;
|
| lockdep_assert_held(&pool->lock);
|
| /*
| * Nothing to do unless work is pending with no worker running, and
| * there is an idle worker available to pick it up.
| */
| if (!need_more_worker(pool) || !worker)
| return false;
|
| if (pool->flags & POOL_BH) {
| kick_bh_pool(pool);
| return true;
| }
|
| p = worker->task;
|
| #ifdef CONFIG_SMP
| /*
| * Idle @worker is about to execute @work and waking up provides an
| * opportunity to migrate @worker at a lower cost by setting the task's
| * wake_cpu field. Let's see if we want to move @worker to improve
| * execution locality.
| *
| * We're waking the worker that went idle the latest and there's some
| * chance that @worker is marked idle but hasn't gone off CPU yet. If
| * so, setting the wake_cpu won't do anything. As this is a best-effort
| * optimization and the race window is narrow, let's leave as-is for
| * now. If this becomes pronounced, we can skip over workers which are
| * still on cpu when picking an idle worker.
| *
| * If @pool has non-strict affinity, @worker might have ended up outside
| * its affinity scope. Repatriate.
| */
| if (!pool->attrs->affn_strict &&
| !cpumask_test_cpu(p->wake_cpu, pool->attrs->__pod_cpumask)) {
| /* worklist is non-empty here: need_more_worker() returned true above */
| struct work_struct *work = list_first_entry(&pool->worklist,
| struct work_struct, entry);
| int wake_cpu = cpumask_any_and_distribute(pool->attrs->__pod_cpumask,
| cpu_online_mask);
| /* only redirect the wakeup if an online CPU of the pod was found */
| if (wake_cpu < nr_cpu_ids) {
| p->wake_cpu = wake_cpu;
| /*
| * NOTE(review): get_work_pwq() returns NULL when WORK_STRUCT_PWQ
| * is not set; presumably every item on pool->worklist has it set
| * - confirm.
| */
| get_work_pwq(work)->stats[PWQ_STAT_REPATRIATED]++;
| }
| }
| #endif
| wake_up_process(p);
| return true;
| }
| |
| #ifdef CONFIG_WQ_CPU_INTENSIVE_REPORT |
| |
| /* |
| * Concurrency-managed per-cpu work items that hog CPU for longer than |
| * wq_cpu_intensive_thresh_us trigger the automatic CPU_INTENSIVE mechanism, |
| * which prevents them from stalling other concurrency-managed work items. If a |
| * work function keeps triggering this mechanism, it's likely that the work item |
| * should be using an unbound workqueue instead. |
| * |
| * wq_cpu_intensive_report() tracks work functions which trigger such conditions
| * and reports them so that they can be examined and converted to use unbound
| * workqueues as appropriate. To avoid flooding the console, each violating work |
| * function is tracked and reported with exponential backoff. |
| */ |
| /* Capacity of wci_ents[]; wci_hash is sized from it via ilog2(). */
| #define WCI_MAX_ENTS 128
|
| /* Tracking record for one work function reported as CPU intensive. */
| struct wci_ent {
| /* work function this entry tracks; compared against on lookup */
| work_func_t func;
| /* number of times @func has been reported */
| atomic64_t cnt;
| /* link in wci_hash */
| struct hlist_node hash_node;
| };
|
| /* Backing storage for entries; slots are handed out in index order. */
| static struct wci_ent wci_ents[WCI_MAX_ENTS];
| /* Number of wci_ents[] slots handed out so far. */
| static int wci_nr_ents;
| /* Held while a new entry is allocated and added to wci_hash. */
| static DEFINE_RAW_SPINLOCK(wci_lock);
| /* Hash table keyed by work function address. */
| static DEFINE_HASHTABLE(wci_hash, ilog2(WCI_MAX_ENTS));
| |
| /*
|  * Look up the tracking entry for @func in wci_hash. Returns the matching
|  * entry, or %NULL if @func has no entry yet.
|  */
| static struct wci_ent *wci_find_ent(work_func_t func)
| {
| 	struct wci_ent *pos;
| 	struct wci_ent *found = NULL;
|
| 	hash_for_each_possible_rcu(wci_hash, pos, hash_node,
| 				   (unsigned long)func) {
| 		if (pos->func != func)
| 			continue;
| 		found = pos;
| 		break;
| 	}
|
| 	return found;
| }
| |
| /*
| * Account one CPU-intensive violation by work function @func and print a
| * warning with exponential backoff.
| *
| * The entry for @func is looked up without wci_lock. If none exists, one is
| * allocated from wci_ents[] under wci_lock and the function restarts so that
| * the violation is counted through the regular lookup path. Nothing is
| * recorded once wci_ents[] is full.
| */
| static void wq_cpu_intensive_report(work_func_t func)
| {
| struct wci_ent *ent;
|
| restart:
| ent = wci_find_ent(func);
| if (ent) {
| u64 cnt;
|
| /*
| * Start reporting from the warning_thresh and back off
| * exponentially.
| *
| * A warning is printed when cnt + 1 - thresh is a power of two,
| * i.e. on hit number thresh, thresh + 1, thresh + 3, thresh + 7
| * and so on. A zero threshold disables the warning entirely.
| */
| cnt = atomic64_inc_return_relaxed(&ent->cnt);
| if (wq_cpu_intensive_warning_thresh &&
| cnt >= wq_cpu_intensive_warning_thresh &&
| is_power_of_2(cnt + 1 - wq_cpu_intensive_warning_thresh))
| printk_deferred(KERN_WARNING "workqueue: %ps hogged CPU for >%luus %llu times, consider switching to WQ_UNBOUND\n",
| ent->func, wq_cpu_intensive_thresh_us,
| atomic64_read(&ent->cnt));
| return;
| }
|
| /*
| * @func is a new violation. Allocate a new entry for it. If wci_ents[]
| * is exhausted, something went really wrong and we probably made enough
| * noise already.
| */
| /* cheap unlocked check first; repeated under wci_lock below */
| if (wci_nr_ents >= WCI_MAX_ENTS)
| return;
|
| raw_spin_lock(&wci_lock);
|
| if (wci_nr_ents >= WCI_MAX_ENTS) {
| raw_spin_unlock(&wci_lock);
| return;
| }
|
| /* an entry for @func appeared since the lockless lookup: use that one */
| if (wci_find_ent(func)) {
| raw_spin_unlock(&wci_lock);
| goto restart;
| }
|
| ent = &wci_ents[wci_nr_ents++];
| ent->func = func;
| atomic64_set(&ent->cnt, 0);
| hash_add_rcu(wci_hash, &ent->hash_node, (unsigned long)func);
|
| raw_spin_unlock(&wci_lock);
|
| /* count this violation against the freshly added entry */
| goto restart;
| }
| |
| #else /* CONFIG_WQ_CPU_INTENSIVE_REPORT */ |
| /* No-op stand-in used when CONFIG_WQ_CPU_INTENSIVE_REPORT is not enabled. */
| static void wq_cpu_intensive_report(work_func_t func) {}
| #endif /* CONFIG_WQ_CPU_INTENSIVE_REPORT */ |
| |
| /**
| * wq_worker_running - a worker is running again
| * @task: task waking up
| *
| * This function is called when a worker returns from schedule()
| *
| * If the worker had been marked sleeping, it is counted in its pool's
| * nr_running again (unless a WORKER_NOT_RUNNING flag is set), its
| * CPU-intensive timestamp is reset and the sleeping mark is cleared.
| */
| void wq_worker_running(struct task_struct *task)
| {
| struct worker *worker = kthread_data(task);
|
| /* nothing to undo unless wq_worker_sleeping() marked the worker */
| if (!READ_ONCE(worker->sleeping))
| return;
|
| /*
| * If preempted by unbind_workers() between the WORKER_NOT_RUNNING check
| * and the nr_running increment below, we may ruin the nr_running reset
| * and leave with an unexpected pool->nr_running == 1 on the newly unbound
| * pool. Protect against such race.
| */
| preempt_disable();
| if (!(worker->flags & WORKER_NOT_RUNNING))
| worker->pool->nr_running++;
| preempt_enable();
|
| /*
| * CPU intensive auto-detection cares about how long a work item hogged
| * CPU without sleeping. Reset the starting timestamp on wakeup.
| */
| worker->current_at = worker->task->se.sum_exec_runtime;
|
| /* cleared last, after nr_running and the timestamp have been updated */
| WRITE_ONCE(worker->sleeping, 0);
| }
| |
| /**
| * wq_worker_sleeping - a worker is going to sleep
| * @task: task going to sleep
| *
| * This function is called from schedule() when a busy worker is
| * going to sleep.
| *
| * Marks the worker as sleeping, removes it from the pool's nr_running count
| * and kicks the pool so that another worker may be woken up. A wakeup
| * caused this way is accounted in PWQ_STAT_CM_WAKEUP of the worker's
| * current pwq.
| */
| void wq_worker_sleeping(struct task_struct *task)
| {
| struct worker *worker = kthread_data(task);
| struct worker_pool *pool;
|
| /*
| * Rescuers, which may not have all the fields set up like normal
| * workers, also reach here, let's not access anything before
| * checking NOT_RUNNING.
| */
| if (worker->flags & WORKER_NOT_RUNNING)
| return;
|
| pool = worker->pool;
|
| /* Return if preempted before wq_worker_running() was reached */
| if (READ_ONCE(worker->sleeping))
| return;
|
| /* set before taking the lock; wq_worker_running() clears it again */
| WRITE_ONCE(worker->sleeping, 1);
| raw_spin_lock_irq(&pool->lock);
|
| /*
| * Recheck in case unbind_workers() preempted us. We don't
| * want to decrement nr_running after the worker is unbound
| * and nr_running has been reset.
| */
| if (worker->flags & WORKER_NOT_RUNNING) {
| raw_spin_unlock_irq(&pool->lock);
| return;
| }
|
| pool->nr_running--;
| /* kick_pool() returns true only if it actually kicked a worker */
| if (kick_pool(pool))
| worker->current_pwq->stats[PWQ_STAT_CM_WAKEUP]++;
|
| raw_spin_unlock_irq(&pool->lock);
| }
| |
| /**
| * wq_worker_tick - a scheduler tick occurred while a kworker is running
| * @task: task currently running
| *
| * Called from sched_tick(). We're in the IRQ context and the current
| * worker's fields which follow the 'K' locking rule can be accessed safely.
| *
| * Charges TICK_USEC of CPU time to the worker's current pwq and, when the
| * worker has exceeded wq_cpu_intensive_thresh_us without sleeping, marks it
| * WORKER_CPU_INTENSIVE, reports the work function and kicks the pool.
| */
| void wq_worker_tick(struct task_struct *task)
| {
| struct worker *worker = kthread_data(task);
| struct pool_workqueue *pwq = worker->current_pwq;
| struct worker_pool *pool = worker->pool;
|
| /* no current pwq: nothing to account the tick to */
| if (!pwq)
| return;
|
| pwq->stats[PWQ_STAT_CPU_TIME] += TICK_USEC;
|
| /* a zero threshold turns CPU-intensive auto-detection off */
| if (!wq_cpu_intensive_thresh_us)
| return;
|
| /*
| * If the current worker is concurrency managed and hogged the CPU for
| * longer than wq_cpu_intensive_thresh_us, it's automatically marked
| * CPU_INTENSIVE to avoid stalling other concurrency-managed work items.
| *
| * Set @worker->sleeping means that @worker is in the process of
| * switching out voluntarily and won't be contributing to
| * @pool->nr_running until it wakes up. As wq_worker_sleeping() also
| * decrements ->nr_running, setting CPU_INTENSIVE here can lead to
| * double decrements. The task is releasing the CPU anyway. Let's skip.
| * We probably want to make this prettier in the future.
| */
| if ((worker->flags & WORKER_NOT_RUNNING) || READ_ONCE(worker->sleeping) ||
| worker->task->se.sum_exec_runtime - worker->current_at <
| wq_cpu_intensive_thresh_us * NSEC_PER_USEC)
| return;
|
| raw_spin_lock(&pool->lock);
|
| /* worker_set_flags() also takes the worker out of nr_running */
| worker_set_flags(worker, WORKER_CPU_INTENSIVE);
| wq_cpu_intensive_report(worker->current_func);
| pwq->stats[PWQ_STAT_CPU_INTENSIVE]++;
|
| /* nr_running dropped above, so another worker may need waking */
| if (kick_pool(pool))
| pwq->stats[PWQ_STAT_CM_WAKEUP]++;
|
| raw_spin_unlock(&pool->lock);
| }
| |
| /**
|  * wq_worker_last_func - retrieve worker's last work function
|  * @task: Task to retrieve last work function of.
|  *
|  * Report the last function that the worker behind @task executed. The
|  * scheduler calls this to obtain a worker's last known identity.
|  *
|  * CONTEXT:
|  * raw_spin_lock_irq(rq->lock)
|  *
|  * The call is made during schedule() when a kworker is about to sleep.
|  * psi uses it while dequeuing to recognize aggregation workers, so that
|  * periodic aggregation can shut off when such a worker is the last task
|  * in the system or cgroup to go to sleep.
|  *
|  * No workqueue locking is involved here. The value is therefore only
|  * stable when this is called from the scheduler's queuing and dequeuing
|  * paths, where @task, which must be a kworker, is guaranteed not to be
|  * processing any works.
|  *
|  * Return:
|  * The last work function @task executed as a worker, NULL if it hasn't
|  * executed any work yet.
|  */
| work_func_t wq_worker_last_func(struct task_struct *task)
| {
| 	return ((struct worker *)kthread_data(task))->last_func;
| }
| |
| /**
|  * wq_node_nr_active - Determine wq_node_nr_active to use
|  * @wq: workqueue of interest
|  * @node: NUMA node, can be %NUMA_NO_NODE
|  *
|  * Pick the wq_node_nr_active that @wq should use on @node:
|  *
|  * - %NULL if @wq is not %WQ_UNBOUND; such workqueues don't use the shared
|  *   nr_active.
|  *
|  * - node_nr_active[nr_node_ids] if @node is %NUMA_NO_NODE.
|  *
|  * - node_nr_active[@node] in all other cases.
|  */
| static struct wq_node_nr_active *wq_node_nr_active(struct workqueue_struct *wq,
| 						   int node)
| {
| 	int idx = node;
|
| 	if (wq->flags & WQ_UNBOUND) {
| 		/* the slot past the last node serves NUMA_NO_NODE */
| 		if (idx == NUMA_NO_NODE)
| 			idx = nr_node_ids;
| 		return wq->node_nr_active[idx];
| 	}
|
| 	return NULL;
| }
| |
| /**
|  * wq_update_node_max_active - Update per-node max_actives to use
|  * @wq: workqueue to update
|  * @off_cpu: CPU that's going down, -1 if a CPU is not going down
|  *
|  * Recompute @wq->node_nr_active[]->max. @wq must be unbound. max_active is
|  * split between nodes in proportion to how many online CPUs of @wq's
|  * effective cpumask each node has, and every per-node result is kept
|  * between @wq->min_active and max_active. The %NUMA_NO_NODE entry always
|  * receives the full max_active.
|  */
| static void wq_update_node_max_active(struct workqueue_struct *wq, int off_cpu)
| {
| 	struct cpumask *effective = unbound_effective_cpumask(wq);
| 	int min_active = READ_ONCE(wq->min_active);
| 	int max_active = READ_ONCE(wq->max_active);
| 	int total_cpus, node;
|
| 	lockdep_assert_held(&wq->mutex);
|
| 	if (!wq_topo_initialized)
| 		return;
|
| 	/* a departing CPU outside the effective mask makes no difference */
| 	if (off_cpu >= 0 && !cpumask_test_cpu(off_cpu, effective))
| 		off_cpu = -1;
|
| 	total_cpus = cpumask_weight_and(effective, cpu_online_mask);
| 	if (off_cpu >= 0)
| 		total_cpus--;
|
| 	for_each_node(node) {
| 		/* default used when all CPUs of the wq are offline */
| 		int node_max = min_active;
|
| 		if (likely(total_cpus)) {
| 			int node_cpus = cpumask_weight_and(effective,
| 							   cpumask_of_node(node));
|
| 			if (off_cpu >= 0 && cpu_to_node(off_cpu) == node)
| 				node_cpus--;
|
| 			node_max = clamp(DIV_ROUND_UP(max_active * node_cpus,
| 						      total_cpus),
| 					 min_active, max_active);
| 		}
|
| 		wq_node_nr_active(wq, node)->max = node_max;
| 	}
|
| 	wq_node_nr_active(wq, NUMA_NO_NODE)->max = max_active;
| }
| |
| /**
|  * get_pwq - get an extra reference on the specified pool_workqueue
|  * @pwq: pool_workqueue to get
|  *
|  * Take one more reference on @pwq. It is the caller's job to make sure
|  * that @pwq already has a positive refcnt and that the matching
|  * pool->lock is held.
|  */
| static void get_pwq(struct pool_workqueue *pwq)
| {
| 	struct worker_pool *pool = pwq->pool;
|
| 	lockdep_assert_held(&pool->lock);
| 	WARN_ON_ONCE(pwq->refcnt <= 0);
| 	pwq->refcnt += 1;
| }
| |
| /**
|  * put_pwq - put a pool_workqueue reference
|  * @pwq: pool_workqueue to put
|  *
|  * Give up one reference on @pwq. When the refcnt drops to zero, the
|  * destruction of @pwq is scheduled. The matching pool->lock must be held
|  * by the caller.
|  */
| static void put_pwq(struct pool_workqueue *pwq)
| {
| 	lockdep_assert_held(&pwq->pool->lock);
|
| 	pwq->refcnt--;
|
| 	/*
| 	 * @pwq can't be released under pool->lock, so the release is bounced
| 	 * to a dedicated kthread_worker to avoid A-A deadlocks.
| 	 */
| 	if (unlikely(!pwq->refcnt))
| 		kthread_queue_work(pwq_release_worker, &pwq->release_work);
| }
| |
| /**
|  * put_pwq_unlocked - put_pwq() with surrounding pool lock/unlock
|  * @pwq: pool_workqueue to put (can be %NULL)
|  *
|  * Same as put_pwq(), but takes and releases the pool lock around the
|  * call. A %NULL @pwq is accepted and ignored.
|  */
| static void put_pwq_unlocked(struct pool_workqueue *pwq)
| {
| 	if (!pwq)
| 		return;
|
| 	/*
| 	 * pwqs and pools are both RCU protected, which makes the lock
| 	 * operations below safe.
| 	 */
| 	raw_spin_lock_irq(&pwq->pool->lock);
| 	put_pwq(pwq);
| 	raw_spin_unlock_irq(&pwq->pool->lock);
| }
| |
| /* True when @pwq has no active work items and no inactive ones queued. */
| static bool pwq_is_empty(struct pool_workqueue *pwq)
| {
| 	if (pwq->nr_active)
| 		return false;
|
| 	return list_empty(&pwq->inactive_works);
| }
| |
| /*
| * Move inactive @work, along with the work items linked to it, onto the
| * worklist of @pwq's pool and clear its WORK_STRUCT_INACTIVE bit. Warns once
| * if @work isn't marked inactive. nr_active is not touched here.
| */
| static void __pwq_activate_work(struct pool_workqueue *pwq,
| struct work_struct *work)
| {
| unsigned long *wdb = work_data_bits(work);
|
| WARN_ON_ONCE(!(*wdb & WORK_STRUCT_INACTIVE));
| trace_workqueue_activate_work(work);
| /* worklist going from empty to non-empty: restart the watchdog timestamp */
| if (list_empty(&pwq->pool->worklist))
| pwq->pool->watchdog_ts = jiffies;
| move_linked_works(work, &pwq->pool->worklist, NULL);
| __clear_bit(WORK_STRUCT_INACTIVE_BIT, wdb);
| }
| |
| /*
|  * Try to take one count from @nna: increment @nna->nr unless it has already
|  * reached @nna->max. Returns %true if the count was taken, %false if the
|  * limit was hit. The increment is retried until it either succeeds or the
|  * limit is observed.
|  */
| static bool tryinc_node_nr_active(struct wq_node_nr_active *nna)
| {
| 	int limit = READ_ONCE(nna->max);
| 	int cur;
|
| 	do {
| 		cur = atomic_read(&nna->nr);
| 		if (cur >= limit)
| 			return false;
| 	} while (atomic_cmpxchg_relaxed(&nna->nr, cur, cur + 1) != cur);
|
| 	return true;
| }
| |
| /**
| * pwq_tryinc_nr_active - Try to increment nr_active for a pwq
| * @pwq: pool_workqueue of interest
| * @fill: max_active may have increased, try to increase concurrency level
| *
| * Try to increment nr_active for @pwq. Returns %true if an nr_active count is
| * successfully obtained. %false otherwise.
| *
| * When wq_node_nr_active() yields no shared counter for @pwq's workqueue,
| * only @pwq->nr_active is compared against max_active. Otherwise a count
| * must also be obtained from the shared per-node counter. Must be called
| * with @pwq->pool->lock held.
| */
| static bool pwq_tryinc_nr_active(struct pool_workqueue *pwq, bool fill)
| {
| struct workqueue_struct *wq = pwq->wq;
| struct worker_pool *pool = pwq->pool;
| struct wq_node_nr_active *nna = wq_node_nr_active(wq, pool->node);
| bool obtained = false;
|
| lockdep_assert_held(&pool->lock);
|
| if (!nna) {
| /* BH or per-cpu workqueue, pwq->nr_active is sufficient */
| obtained = pwq->nr_active < READ_ONCE(wq->max_active);
| goto out;
| }
|
| /* a plugged pwq never obtains a count */
| if (unlikely(pwq->plugged))
| return false;
|
| /*
| * Unbound workqueue uses per-node shared nr_active $nna. If @pwq is
| * already waiting on $nna, pwq_dec_nr_active() will maintain the
| * concurrency level. Don't jump the line.
| *
| * We need to ignore the pending test after max_active has increased as
| * pwq_dec_nr_active() can only maintain the concurrency level but not
| * increase it. This is indicated by @fill.
| */
| if (!list_empty(&pwq->pending_node) && likely(!fill))
| goto out;
|
| /* fast path: lockless attempt on the shared counter */
| obtained = tryinc_node_nr_active(nna);
| if (obtained)
| goto out;
|
| /*
| * Lockless acquisition failed. Lock, add ourself to $nna->pending_pwqs
| * and try again. The smp_mb() is paired with the implied memory barrier
| * of atomic_dec_return() in pwq_dec_nr_active() to ensure that either
| * we see the decremented $nna->nr or they see non-empty
| * $nna->pending_pwqs.
| */
| raw_spin_lock(&nna->lock);
|
| if (list_empty(&pwq->pending_node))
| list_add_tail(&pwq->pending_node, &nna->pending_pwqs);
| else if (likely(!fill))
| goto out_unlock;
|
| smp_mb();
|
| obtained = tryinc_node_nr_active(nna);
|
| /*
| * If @fill, @pwq might have already been pending. Being spuriously
| * pending in cold paths doesn't affect anything. Let's leave it be.
| */
| if (obtained && likely(!fill))
| list_del_init(&pwq->pending_node);
|
| out_unlock:
| raw_spin_unlock(&nna->lock);
| out:
| /* every successful path accounts the count in @pwq here */
| if (obtained)
| pwq->nr_active++;
| return obtained;
| }
| |
| /**
|  * pwq_activate_first_inactive - Activate the first inactive work item on a pwq
|  * @pwq: pool_workqueue of interest
|  * @fill: max_active may have increased, try to increase concurrency level
|  *
|  * If @pwq has an inactive work item and an nr_active count can be
|  * obtained for it, activate the first such item.
|  *
|  * Returns %true if an inactive work item has been activated. %false if
|  * there is no inactive work item or the max_active limit is reached.
|  */
| static bool pwq_activate_first_inactive(struct pool_workqueue *pwq, bool fill)
| {
| 	struct work_struct *first;
|
| 	first = list_first_entry_or_null(&pwq->inactive_works,
| 					 struct work_struct, entry);
| 	if (!first)
| 		return false;
|
| 	if (!pwq_tryinc_nr_active(pwq, fill))
| 		return false;
|
| 	__pwq_activate_work(pwq, first);
| 	return true;
| }
| |
| /**
| * unplug_oldest_pwq - unplug the oldest pool_workqueue
| * @wq: workqueue_struct where its oldest pwq is to be unplugged
| *
| * This function should only be called for ordered workqueues where only the
| * oldest pwq is unplugged, the others are plugged to suspend execution to
| * ensure proper work item ordering::
| *
| * dfl_pwq --------------+ [P] - plugged
| * |
| * v
| * pwqs -> A -> B [P] -> C [P] (newest)
| * | | |
| * 1 3 5
| * | | |
| * 2 4 6
| *
| * When the oldest pwq is drained and removed, this function should be called
| * to unplug the next oldest one to start its work item execution. Note that
| * pwq's are linked into wq->pwqs with the oldest first, so the first one in
| * the list is the oldest.
| *
| * Must be called with @wq->mutex held. The pool lock of the oldest pwq is
| * taken and released internally.
| */
| static void unplug_oldest_pwq(struct workqueue_struct *wq)
| {
| struct pool_workqueue *pwq;
|
| lockdep_assert_held(&wq->mutex);
|
| /* Caller should make sure that pwqs isn't empty before calling */
| /*
| * NOTE(review): list_first_entry_or_null() yields NULL for an empty
| * list and @pwq is dereferenced right below without a check, so the
| * caller guarantee above is load-bearing.
| */
| pwq = list_first_entry_or_null(&wq->pwqs, struct pool_workqueue,
| pwqs_node);
| raw_spin_lock_irq(&pwq->pool->lock);
| if (pwq->plugged) {
| pwq->plugged = false;
| /* activate with fill == true and wake the pool if something was activated */
| if (pwq_activate_first_inactive(pwq, true))
| kick_pool(pwq->pool);
| }
| raw_spin_unlock_irq(&pwq->pool->lock);
| }
| |
| /**
| * node_activate_pending_pwq - Activate a pending pwq on a wq_node_nr_active
| * @nna: wq_node_nr_active to activate a pending pwq for
| * @caller_pool: worker_pool the caller is locking
| *
| * Activate a pwq in @nna->pending_pwqs. Called with @caller_pool locked.
| * @caller_pool may be unlocked and relocked to lock other worker_pools.
| *
| * At most one inactive work item is activated per call. On return,
| * @caller_pool->lock is held again and no other pool lock is held.
| */
| static void node_activate_pending_pwq(struct wq_node_nr_active *nna,
| struct worker_pool *caller_pool)
| {
| /* pool whose lock is currently held; starts out as the caller's */
| struct worker_pool *locked_pool = caller_pool;
| struct pool_workqueue *pwq;
| struct work_struct *work;
|
| lockdep_assert_held(&caller_pool->lock);
|
| raw_spin_lock(&nna->lock);
| retry:
| pwq = list_first_entry_or_null(&nna->pending_pwqs,
| struct pool_workqueue, pending_node);
| if (!pwq)
| goto out_unlock;
|
| /*
| * If @pwq is for a different pool than @locked_pool, we need to lock
| * @pwq->pool->lock. Let's trylock first. If unsuccessful, do the unlock
| * / lock dance. For that, we also need to release @nna->lock as it's
| * nested inside pool locks.
| */
| if (pwq->pool != locked_pool) {
| raw_spin_unlock(&locked_pool->lock);
| locked_pool = pwq->pool;
| if (!raw_spin_trylock(&locked_pool->lock)) {
| raw_spin_unlock(&nna->lock);
| raw_spin_lock(&locked_pool->lock);
| raw_spin_lock(&nna->lock);
| /* @nna->lock was dropped in between: look up the first pwq again */
| goto retry;
| }
| }
|
| /*
| * $pwq may not have any inactive work items due to e.g. cancellations.
| * Drop it from pending_pwqs and see if there's another one.
| */
| work = list_first_entry_or_null(&pwq->inactive_works,
| struct work_struct, entry);
| if (!work) {
| list_del_init(&pwq->pending_node);
| goto retry;
| }
|
| /*
| * Acquire an nr_active count and activate the inactive work item. If
| * $pwq still has inactive work items, rotate it to the end of the
| * pending_pwqs so that we round-robin through them. This means that
| * inactive work items are not activated in queueing order which is fine
| * given that there has never been any ordering across different pwqs.
| */
| if (likely(tryinc_node_nr_active(nna))) {
| pwq->nr_active++;
| __pwq_activate_work(pwq, work);
|
| if (list_empty(&pwq->inactive_works))
| list_del_init(&pwq->pending_node);
| else
| list_move_tail(&pwq->pending_node, &nna->pending_pwqs);
|
| /* if activating a foreign pool, make sure it's running */
| if (pwq->pool != caller_pool)
| kick_pool(pwq->pool);
| }
|
| out_unlock:
| raw_spin_unlock(&nna->lock);
| /* hand back the lock state the caller expects */
| if (locked_pool != caller_pool) {
| raw_spin_unlock(&locked_pool->lock);
| raw_spin_lock(&caller_pool->lock);
| }
| }
| |
| /** |
| * pwq_dec_nr_active - Retire an active count |
| * @pwq: pool_workqueue of interest |
| * |
| * Decrement @pwq's nr_active and try to activate the first inactive work item. |
| * For unbound workqueues, this function may temporarily drop @pwq->pool->lock. |
| */ |
| static void pwq_dec_nr_active(struct pool_workqueue *pwq) |
| { |
| struct worker_pool *pool = pwq->pool; |
| struct wq_node_nr_active *nna = wq_node_nr_active(pwq->wq, pool->node); |
| |
| lockdep_assert_held(&pool->lock); |
| |
| /* |
| * @pwq->nr_active should be decremented for both percpu and unbound |
| * workqueues. |
| */ |
| pwq->nr_active--; |
| |
| /* |
| * For a percpu workqueue, it's simple. Just need to kick the first |
| * inactive work item on @pwq itself. |
| */ |
| if (!nna) { |
| pwq_activate_first_inactive(pwq, false); |
| return; |
| } |
| |
| /* |
| * If @pwq is for an unbound workqueue, it's more complicated because |
| * multiple pwqs and pools may be sharing the nr_active count. When a |
| * pwq needs to wait for an nr_active count, it puts itself on |
| * $nna->pending_pwqs. The following atomic_dec_return()'s implied |
| * memory barrier is paired with smp_mb() in pwq_tryinc_nr_active() to |
| * guarantee that either we see non-empty pending_pwqs or they see |
| * decremented $nna->nr. |
| * |
| * $nna->max may change as CPUs come online/offline and @pwq->wq's |
| * max_active gets updated. However, it is guaranteed to be equal to or |
| * larger than @pwq->wq->min_active which is above zero unless freezing. |
| * This maintains the forward progress guarantee. |
| */ |
| if (atomic_dec_return(&nna->nr) >= READ_ONCE(nna->max)) |
| return; |
| |
| if (!list_empty(&nna->pending_pwqs)) |
| node_activate_pending_pwq(nna, pool); |
| } |
| |
| /** |
| * pwq_dec_nr_in_flight - decrement pwq's nr_in_flight |
| * @pwq: pwq of interest |
| * @work_data: work_data of work which left the queue |
| * |
| * A work either has completed or is removed from pending queue, |
| * decrement nr_in_flight of its pwq and handle workqueue flushing. |
| * |
| * NOTE: |
| * For unbound workqueues, this function may temporarily drop @pwq->pool->lock |
| * and thus should be called after all other state updates for the in-flight |
| * work item is complete. |
| * |
| * CONTEXT: |
| * raw_spin_lock_irq(pool->lock). |
| */ |
| static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, unsigned long work_data) |
| { |
| int color = get_work_color(work_data); |
| |
| if (!(work_data & WORK_STRUCT_INACTIVE)) |
| pwq_dec_nr_active(pwq); |
| |
| pwq->nr_in_flight[color]--; |
| |
| /* is flush in progress and are we at the flushing tip? */ |
| if (likely(pwq->flush_color != color)) |
| goto out_put; |
| |
| /* are there still in-flight works? */ |
| if (pwq->nr_in_flight[color]) |
| goto out_put; |
| |
| /* this pwq is done, clear flush_color */ |
| pwq->flush_color = -1; |
| |
| /* |
| * If this was the last pwq, wake up the first flusher. It |
| * will handle the rest. |
| */ |
| if (atomic_dec_and_test(&pwq->wq->nr_pwqs_to_flush)) |
| complete(&pwq->wq->first_flusher->done); |
| out_put: |
| put_pwq(pwq); |
| } |
| |
| /** |
| * try_to_grab_pending - steal work item from worklist and disable irq |
| * @work: work item to steal |
| * @cflags: %WORK_CANCEL_ flags |
| * @irq_flags: place to store irq state |
| * |
| * Try to grab PENDING bit of @work. This function can handle @work in any |
| * stable state - idle, on timer or on worklist. |
| * |
| * Return: |
| * |
| * ======== ================================================================ |
| * 1 if @work was pending and we successfully stole PENDING |
| * 0 if @work was idle and we claimed PENDING |
| * -EAGAIN if PENDING couldn't be grabbed at the moment, safe to busy-retry |
| * ======== ================================================================ |
| * |
| * Note: |
| * On >= 0 return, the caller owns @work's PENDING bit. To avoid getting |
| * interrupted while holding PENDING and @work off queue, irq must be |
| * disabled on entry. This, combined with delayed_work->timer being |
| * irqsafe, ensures that we return -EAGAIN for finite short period of time. |
| * |
| * On successful return, >= 0, irq is disabled and the caller is |
| * responsible for releasing it using local_irq_restore(*@irq_flags). |
| * |
| * This function is safe to call from any context including IRQ handler. |
| */ |
| static int try_to_grab_pending(struct work_struct *work, u32 cflags, |
| unsigned long *irq_flags) |
| { |
| struct worker_pool *pool; |
| struct pool_workqueue *pwq; |
| |
| local_irq_save(*irq_flags); |
| |
| /* try to steal the timer if it exists */ |
| if (cflags & WORK_CANCEL_DELAYED) { |
| struct delayed_work *dwork = to_delayed_work(work); |
| |
| /* |
| * dwork->timer is irqsafe. If del_timer() fails, it's |
| * guaranteed that the timer is not queued anywhere and not |
| * running on the local CPU. |
| */ |
| if (likely(del_timer(&dwork->timer))) |
| return 1; |
| } |
| |
| /* try to claim PENDING the normal way */ |
| if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) |
| return 0; |
| |
| rcu_read_lock(); |
| /* |
| * The queueing is in progress, or it is already queued. Try to |
| * steal it from ->worklist without clearing WORK_STRUCT_PENDING. |
| */ |
| pool = get_work_pool(work); |
| if (!pool) |
| goto fail; |
| |
| raw_spin_lock(&pool->lock); |
| /* |
| * work->data is guaranteed to point to pwq only while the work |
| * item is queued on pwq->wq, and both updating work->data to point |
| * to pwq on queueing and to pool on dequeueing are done under |
| * pwq->pool->lock. This in turn guarantees that, if work->data |
| * points to pwq which is associated with a locked pool, the work |
| * item is currently queued on that pool. |
| */ |
| pwq = get_work_pwq(work); |
| if (pwq && pwq->pool == pool) { |
| unsigned long work_data = *work_data_bits(work); |
| |
| debug_work_deactivate(work); |
| |
| /* |
| * A cancelable inactive work item must be in the |
| * pwq->inactive_works since a queued barrier can't be |
| * canceled (see the comments in insert_wq_barrier()). |
| * |
| * An inactive work item cannot be deleted directly because |
| * it might have linked barrier work items which, if left |
| * on the inactive_works list, will confuse pwq->nr_active |
| * management later on and cause stall. Move the linked |
| * barrier work items to the worklist when deleting the grabbed |
| * item. Also keep WORK_STRUCT_INACTIVE in work_data, so that |
| * it doesn't participate in nr_active management in later |
| * pwq_dec_nr_in_flight(). |
| */ |
| if (work_data & WORK_STRUCT_INACTIVE) |
| move_linked_works(work, &pwq->pool->worklist, NULL); |
| |
| list_del_init(&work->entry); |
| |
| /* |
| * work->data points to pwq iff queued. Let's point to pool. As |
| * this destroys work->data needed by the next step, stash it. |
| */ |
| set_work_pool_and_keep_pending(work, pool->id, |
| pool_offq_flags(pool)); |
| |
| /* must be the last step, see the function comment */ |
| pwq_dec_nr_in_flight(pwq, work_data); |
| |
| raw_spin_unlock(&pool->lock); |
| rcu_read_unlock(); |
| return 1; |
| } |
| raw_spin_unlock(&pool->lock); |
| fail: |
| rcu_read_unlock(); |
| local_irq_restore(*irq_flags); |
| return -EAGAIN; |
| } |
| |
| /** |
| * work_grab_pending - steal work item from worklist and disable irq |
| * @work: work item to steal |
| * @cflags: %WORK_CANCEL_ flags |
| * @irq_flags: place to store IRQ state |
| * |
| * Grab PENDING bit of @work. @work can be in any stable state - idle, on timer |
| * or on worklist. |
| * |
| * Can be called from any context. IRQ is disabled on return with IRQ state |
| * stored in *@irq_flags. The caller is responsible for re-enabling it using |
| * local_irq_restore(). |
| * |
| * Returns %true if @work was pending. %false if idle. |
| */ |
| static bool work_grab_pending(struct work_struct *work, u32 cflags, |
| unsigned long *irq_flags) |
| { |
| int ret; |
| |
| while (true) { |
| ret = try_to_grab_pending(work, cflags, irq_flags); |
| if (ret >= 0) |
| return ret; |
| cpu_relax(); |
| } |
| } |
| |
| /** |
| * insert_work - insert a work into a pool |
| * @pwq: pwq @work belongs to |
| * @work: work to insert |
| * @head: insertion point |
| * @extra_flags: extra WORK_STRUCT_* flags to set |
| * |
| * Insert @work which belongs to @pwq after @head. @extra_flags is or'd to |
| * work_struct flags. |
| * |
| * CONTEXT: |
| * raw_spin_lock_irq(pool->lock). |
| */ |
| static void insert_work(struct pool_workqueue *pwq, struct work_struct *work, |
| struct list_head *head, unsigned int extra_flags) |
| { |
| debug_work_activate(work); |
| |
| /* record the work call stack in order to print it in KASAN reports */ |
| kasan_record_aux_stack_noalloc(work); |
| |
| /* we own @work, set data and link */ |
| set_work_pwq(work, pwq, extra_flags); |
| list_add_tail(&work->entry, head); |
| get_pwq(pwq); |
| } |
| |
| /* |
| * Test whether @work is being queued from another work executing on the |
| * same workqueue. |
| */ |
| static bool is_chained_work(struct workqueue_struct *wq) |
| { |
| struct worker *worker; |
| |
| worker = current_wq_worker(); |
| /* |
| * Return %true iff I'm a worker executing a work item on @wq. If |
| * I'm @worker, it's safe to dereference it without locking. |
| */ |
| return worker && worker->current_pwq->wq == wq; |
| } |
| |
| /* |
| * When queueing an unbound work item to a wq, prefer local CPU if allowed |
| * by wq_unbound_cpumask. Otherwise, round robin among the allowed ones to |
| * avoid perturbing sensitive tasks. |
| */ |
| static int wq_select_unbound_cpu(int cpu) |
| { |
| int new_cpu; |
| |
| if (likely(!wq_debug_force_rr_cpu)) { |
| if (cpumask_test_cpu(cpu, wq_unbound_cpumask)) |
| return cpu; |
| } else { |
| pr_warn_once("workqueue: round-robin CPU selection forced, expect performance impact\n"); |
| } |
| |
| new_cpu = __this_cpu_read(wq_rr_cpu_last); |
| new_cpu = cpumask_next_and(new_cpu, wq_unbound_cpumask, cpu_online_mask); |
| if (unlikely(new_cpu >= nr_cpu_ids)) { |
| new_cpu = cpumask_first_and(wq_unbound_cpumask, cpu_online_mask); |
| if (unlikely(new_cpu >= nr_cpu_ids)) |
| return cpu; |
| } |
| __this_cpu_write(wq_rr_cpu_last, new_cpu); |
| |
| return new_cpu; |
| } |
| |
| static void __queue_work(int cpu, struct workqueue_struct *wq, |
| struct work_struct *work) |
| { |
| struct pool_workqueue *pwq; |
| struct worker_pool *last_pool, *pool; |
| unsigned int work_flags; |
| unsigned int req_cpu = cpu; |
| |
| /* |
| * While a work item is PENDING && off queue, a task trying to |
| * steal the PENDING will busy-loop waiting for it to either get |
| * queued or lose PENDING. Grabbing PENDING and queueing should |
| * happen with IRQ disabled. |
| */ |
| lockdep_assert_irqs_disabled(); |
| |
| /* |
| * For a draining wq, only works from the same workqueue are |
| * allowed. The __WQ_DESTROYING helps to spot the issue that |
| * queues a new work item to a wq after destroy_workqueue(wq). |
| */ |
| if (unlikely(wq->flags & (__WQ_DESTROYING | __WQ_DRAINING) && |
| WARN_ON_ONCE(!is_chained_work(wq)))) |
| return; |
| rcu_read_lock(); |
| retry: |
| /* pwq which will be used unless @work is executing elsewhere */ |
| if (req_cpu == WORK_CPU_UNBOUND) { |
| if (wq->flags & WQ_UNBOUND) |
| cpu = wq_select_unbound_cpu(raw_smp_processor_id()); |
| else |
| cpu = raw_smp_processor_id(); |
| } |
| |
| pwq = rcu_dereference(*per_cpu_ptr(wq->cpu_pwq, cpu)); |
| pool = pwq->pool; |
| |
| /* |
| * If @work was previously on a different pool, it might still be |
| * running there, in which case the work needs to be queued on that |
| * pool to guarantee non-reentrancy. |
| * |
| * For ordered workqueue, work items must be queued on the newest pwq |
| * for accurate order management. Guaranteed order also guarantees |
| * non-reentrancy. See the comments above unplug_oldest_pwq(). |
| */ |
| last_pool = get_work_pool(work); |
| if (last_pool && last_pool != pool && !(wq->flags & __WQ_ORDERED)) { |
| struct worker *worker; |
| |
| raw_spin_lock(&last_pool->lock); |
| |
| worker = find_worker_executing_work(last_pool, work); |
| |
| if (worker && worker->current_pwq->wq == wq) { |
| pwq = worker->current_pwq; |
| pool = pwq->pool; |
| WARN_ON_ONCE(pool != last_pool); |
| } else { |
| /* meh... not running there, queue here */ |
| raw_spin_unlock(&last_pool->lock); |
| raw_spin_lock(&pool->lock); |
| } |
| } else { |
| raw_spin_lock(&pool->lock); |
| } |
| |
| /* |
| * pwq is determined and locked. For unbound pools, we could have raced |
| * with pwq release and it could already be dead. If its refcnt is zero, |
| * repeat pwq selection. Note that unbound pwqs never die without |
| * another pwq replacing it in cpu_pwq or while work items are executing |
| * on it, so the retrying is guaranteed to make forward-progress. |
| */ |
| if (unlikely(!pwq->refcnt)) { |
| if (wq->flags & WQ_UNBOUND) { |
| raw_spin_unlock(&pool->lock); |
| cpu_relax(); |
| goto retry; |
| } |
| /* oops */ |
| WARN_ONCE(true, "workqueue: per-cpu pwq for %s on cpu%d has 0 refcnt", |
| wq->name, cpu); |
| } |
| |
| /* pwq determined, queue */ |
| trace_workqueue_queue_work(req_cpu, pwq, work); |
| |
| if (WARN_ON(!list_empty(&work->entry))) |
| goto out; |
| |
| pwq->nr_in_flight[pwq->work_color]++; |
| work_flags = work_color_to_flags(pwq->work_color); |
| |
| /* |
| * Limit the number of concurrently active work items to max_active. |
| * @work must also queue behind existing inactive work items to maintain |
| * ordering when max_active changes. See wq_adjust_max_active(). |
| */ |
| if (list_empty(&pwq->inactive_works) && pwq_tryinc_nr_active(pwq, false)) { |
| if (list_empty(&pool->worklist)) |
| pool->watchdog_ts = jiffies; |
| |
| trace_workqueue_activate_work(work); |
| insert_work(pwq, work, &pool->worklist, work_flags); |
| kick_pool(pool); |
| } else { |
| work_flags |= WORK_STRUCT_INACTIVE; |
| insert_work(pwq, work, &pwq->inactive_works, work_flags); |
| } |
| |
| out: |
| raw_spin_unlock(&pool->lock); |
| rcu_read_unlock(); |
| } |
| |
| static bool clear_pending_if_disabled(struct work_struct *work) |
| { |
| unsigned long data = *work_data_bits(work); |
| struct work_offq_data offqd; |
| |
| if (likely((data & WORK_STRUCT_PWQ) || |
| !(data & WORK_OFFQ_DISABLE_MASK))) |
| return false; |
| |
| work_offqd_unpack(&offqd, data); |
| set_work_pool_and_clear_pending(work, offqd.pool_id, |
| work_offqd_pack_flags(&offqd)); |
| return true; |
| } |
| |
| /** |
| * queue_work_on - queue work on specific cpu |
| * @cpu: CPU number to execute work on |
| * @wq: workqueue to use |
| * @work: work to queue |
| * |
| * We queue the work to a specific CPU, the caller must ensure it |
| * can't go away. Callers that fail to ensure that the specified |
| * CPU cannot go away will execute on a randomly chosen CPU. |
| * But note well that callers specifying a CPU that never has been |
| * online will get a splat. |
| * |
| * Return: %false if @work was already on a queue, %true otherwise. |
| */ |
| bool queue_work_on(int cpu, struct workqueue_struct *wq, |
| struct work_struct *work) |
| { |
| bool ret = false; |
| unsigned long irq_flags; |
| |
| local_irq_save(irq_flags); |
| |
| if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)) && |
| !clear_pending_if_disabled(work)) { |
| __queue_work(cpu, wq, work); |
| ret = true; |
| } |
| |
| local_irq_restore(irq_flags); |
| return ret; |
| } |
| EXPORT_SYMBOL(queue_work_on); |
| |
| /** |
| * select_numa_node_cpu - Select a CPU based on NUMA node |
| * @node: NUMA node ID that we want to select a CPU from |
| * |
| * This function will attempt to find a "random" cpu available on a given |
| * node. If there are no CPUs available on the given node it will return |
| * WORK_CPU_UNBOUND indicating that we should just schedule to any |
| * available CPU if we need to schedule this work. |
| */ |
| static int select_numa_node_cpu(int node) |
| { |
| int cpu; |
| |
| /* Delay binding to CPU if node is not valid or online */ |
| if (node < 0 || node >= MAX_NUMNODES || !node_online(node)) |
| return WORK_CPU_UNBOUND; |
| |
| /* Use local node/cpu if we are already there */ |
| cpu = raw_smp_processor_id(); |
| if (node == cpu_to_node(cpu)) |
| return cpu; |
| |
| /* Use "random" otherwise know as "first" online CPU of node */ |
| cpu = cpumask_any_and(cpumask_of_node(node), cpu_online_mask); |
| |
| /* If CPU is valid return that, otherwise just defer */ |
| return cpu < nr_cpu_ids ? cpu : WORK_CPU_UNBOUND; |
| } |
| |
| /** |
| * queue_work_node - queue work on a "random" cpu for a given NUMA node |
| * @node: NUMA node that we are targeting the work for |
| * @wq: workqueue to use |
| * @work: work to queue |
| * |
| * We queue the work to a "random" CPU within a given NUMA node. The basic |
| * idea here is to provide a way to somehow associate work with a given |
| * NUMA node. |
| * |
| * This function will only make a best effort attempt at getting this onto |
| * the right NUMA node. If no node is requested or the requested node is |
| * offline then we just fall back to standard queue_work behavior. |
| * |
| * Currently the "random" CPU ends up being the first available CPU in the |
| * intersection of cpu_online_mask and the cpumask of the node, unless we |
| * are running on the node. In that case we just use the current CPU. |
| * |
| * Return: %false if @work was already on a queue, %true otherwise. |
| */ |
| bool queue_work_node(int node, struct workqueue_struct *wq, |
| struct work_struct *work) |
| { |
| unsigned long irq_flags; |
| bool ret = false; |
| |
| /* |
| * This current implementation is specific to unbound workqueues. |
| * Specifically we only return the first available CPU for a given |
| * node instead of cycling through individual CPUs within the node. |
| * |
| * If this is used with a per-cpu workqueue then the logic in |
| * workqueue_select_cpu_near would need to be updated to allow for |
| * some round robin type logic. |
| */ |
| WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND)); |
| |
| local_irq_save(irq_flags); |
| |
| if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)) && |
| !clear_pending_if_disabled(work)) { |
| int cpu = select_numa_node_cpu(node); |
| |
| __queue_work(cpu, wq, work); |
| ret = true; |
| } |
| |
| local_irq_restore(irq_flags); |
| return ret; |
| } |
| EXPORT_SYMBOL_GPL(queue_work_node); |
| |
| void delayed_work_timer_fn(struct timer_list *t) |
| { |
| struct delayed_work *dwork = from_timer(dwork, t, timer); |
| |
| /* should have been called from irqsafe timer with irq already off */ |
| __queue_work(dwork->cpu, dwork->wq, &dwork->work); |
| } |
| EXPORT_SYMBOL(delayed_work_timer_fn); |
| |
| static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, |
| struct delayed_work *dwork, unsigned long delay) |
| { |
| struct timer_list *timer = &dwork->timer; |
| struct work_struct *work = &dwork->work; |
| |
| WARN_ON_ONCE(!wq); |
| WARN_ON_ONCE(timer->function != delayed_work_timer_fn); |
| WARN_ON_ONCE(timer_pending(timer)); |
| WARN_ON_ONCE(!list_empty(&work->entry)); |
| |
| /* |
| * If @delay is 0, queue @dwork->work immediately. This is for |
| * both optimization and correctness. The earliest @timer can |
| * expire is on the closest next tick and delayed_work users depend |
| * on that there's no such delay when @delay is 0. |
| */ |
| if (!delay) { |
| __queue_work(cpu, wq, &dwork->work); |
| return; |
| } |
| |
| dwork->wq = wq; |
| dwork->cpu = cpu; |
| timer->expires = jiffies + delay; |
| |
| if (housekeeping_enabled(HK_TYPE_TIMER)) { |
| /* If the current cpu is a housekeeping cpu, use it. */ |
| cpu = smp_processor_id(); |
| if (!housekeeping_test_cpu(cpu, HK_TYPE_TIMER)) |
| cpu = housekeeping_any_cpu(HK_TYPE_TIMER); |
| add_timer_on(timer, cpu); |
| } else { |
| if (likely(cpu == WORK_CPU_UNBOUND)) |
| add_timer_global(timer); |
| else |
| add_timer_on(timer, cpu); |
| } |
| } |
| |
| /** |
| * queue_delayed_work_on - queue work on specific CPU after delay |
| * @cpu: CPU number to execute work on |
| * @wq: workqueue to use |
| * @dwork: work to queue |
| * @delay: number of jiffies to wait before queueing |
| * |
| * Return: %false if @work was already on a queue, %true otherwise. If |
| * @delay is zero and @dwork is idle, it will be scheduled for immediate |
| * execution. |
| */ |
| bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq, |
| struct delayed_work *dwork, unsigned long delay) |
| { |
| struct work_struct *work = &dwork->work; |
| bool ret = false; |
| unsigned long irq_flags; |
| |
| /* read the comment in __queue_work() */ |
| local_irq_save(irq_flags); |
| |
| if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)) && |
| !clear_pending_if_disabled(work)) { |
| __queue_delayed_work(cpu, wq, dwork, delay); |
| ret = true; |
| } |
| |
| local_irq_restore(irq_flags); |
| return ret; |
| } |
| EXPORT_SYMBOL(queue_delayed_work_on); |
| |
| /** |
| * mod_delayed_work_on - modify delay of or queue a delayed work on specific CPU |
| * @cpu: CPU number to execute work on |
| * @wq: workqueue to use |
| * @dwork: work to queue |
| * @delay: number of jiffies to wait before queueing |
| * |
| * If @dwork is idle, equivalent to queue_delayed_work_on(); otherwise, |
| * modify @dwork's timer so that it expires after @delay. If @delay is |
| * zero, @work is guaranteed to be scheduled immediately regardless of its |
| * current state. |
| * |
| * Return: %false if @dwork was idle and queued, %true if @dwork was |
| * pending and its timer was modified. |
| * |
| * This function is safe to call from any context including IRQ handler. |
| * See try_to_grab_pending() for details. |
| */ |
| bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq, |
| struct delayed_work *dwork, unsigned long delay) |
| { |
| unsigned long irq_flags; |
| bool ret; |
| |
| ret = work_grab_pending(&dwork->work, WORK_CANCEL_DELAYED, &irq_flags); |
| |
| if (!clear_pending_if_disabled(&dwork->work)) |
| __queue_delayed_work(cpu, wq, dwork, delay); |
| |
| local_irq_restore(irq_flags); |
| return ret; |
| } |
| EXPORT_SYMBOL_GPL(mod_delayed_work_on); |
| |
| static void rcu_work_rcufn(struct rcu_head *rcu) |
| { |
| struct rcu_work *rwork = container_of(rcu, struct rcu_work, rcu); |
| |
| /* read the comment in __queue_work() */ |
| local_irq_disable(); |
| __queue_work(WORK_CPU_UNBOUND, rwork->wq, &rwork->work); |
| local_irq_enable(); |
| } |
| |
| /** |
| * queue_rcu_work - queue work after a RCU grace period |
| * @wq: workqueue to use |
| * @rwork: work to queue |
| * |
| * Return: %false if @rwork was already pending, %true otherwise. Note |
| * that a full RCU grace period is guaranteed only after a %true return. |
| * While @rwork is guaranteed to be executed after a %false return, the |
| * execution may happen before a full RCU grace period has passed. |
| */ |
| bool queue_rcu_work(struct workqueue_struct *wq, struct rcu_work *rwork) |
| { |
| struct work_struct *work = &rwork->work; |
| |
| /* |
| * rcu_work can't be canceled or disabled. Warn if the user reached |
| * inside @rwork and disabled the inner work. |
| */ |
| if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)) && |
| !WARN_ON_ONCE(clear_pending_if_disabled(work))) { |
| rwork->wq = wq; |
| call_rcu_hurry(&rwork->rcu, rcu_work_rcufn); |
| return true; |
| } |
| |
| return false; |
| } |
| EXPORT_SYMBOL(queue_rcu_work); |
| |
| static struct worker *alloc_worker(int node) |
| { |
| struct worker *worker; |
| |
| worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, node); |
| if (worker) { |
| INIT_LIST_HEAD(&worker->entry); |
| INIT_LIST_HEAD(&worker->scheduled); |
| INIT_LIST_HEAD(&worker->node); |
| /* on creation a worker is in !idle && prep state */ |
| worker->flags = WORKER_PREP; |
| } |
| return worker; |
| } |
| |
| static cpumask_t *pool_allowed_cpus(struct worker_pool *pool) |
| { |
| if (pool->cpu < 0 && pool->attrs->affn_strict) |
| return pool->attrs->__pod_cpumask; |
| else |
| return pool->attrs->cpumask; |
| } |
| |
| /** |
| * worker_attach_to_pool() - attach a worker to a pool |
| * @worker: worker to be attached |
| * @pool: the target pool |
| * |
| * Attach @worker to @pool. Once attached, the %WORKER_UNBOUND flag and |
| * cpu-binding of @worker are kept coordinated with the pool across |
| * cpu-[un]hotplugs. |
| */ |
| static void worker_attach_to_pool(struct worker *worker, |
| struct worker_pool *pool) |
| { |
| mutex_lock(&wq_pool_attach_mutex); |
| |
| /* |
| * The wq_pool_attach_mutex ensures %POOL_DISASSOCIATED remains stable |
| * across this function. See the comments above the flag definition for |
| * details. BH workers are, while per-CPU, always DISASSOCIATED. |
| */ |
| if (pool->flags & POOL_DISASSOCIATED) { |
| worker->flags |= WORKER_UNBOUND; |
| } else { |
| WARN_ON_ONCE(pool->flags & POOL_BH); |
| kthread_set_per_cpu(worker->task, pool->cpu); |
| } |
| |
| if (worker->rescue_wq) |
| set_cpus_allowed_ptr(worker->task, pool_allowed_cpus(pool)); |
| |
| list_add_tail(&worker->node, &pool->workers); |
| worker->pool = pool; |
| |
| mutex_unlock(&wq_pool_attach_mutex); |
| } |
| |
| static void unbind_worker(struct worker *worker) |
| { |
| lockdep_assert_held(&wq_pool_attach_mutex); |
| |
| kthread_set_per_cpu(worker->task, -1); |
| if (cpumask_intersects(wq_unbound_cpumask, cpu_active_mask)) |
| WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, wq_unbound_cpumask) < 0); |
| else |
| WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0); |
| } |
| |
| |
| static void detach_worker(struct worker *worker) |
| { |
| lockdep_assert_held(&wq_pool_attach_mutex); |
| |
| unbind_worker(worker); |
| list_del(&worker->node); |
| } |
| |
| /** |
| * worker_detach_from_pool() - detach a worker from its pool |
| * @worker: worker which is attached to its pool |
| * |
| * Undo the attaching which had been done in worker_attach_to_pool(). The |
| * caller worker shouldn't access to the pool after detached except it has |
| * other reference to the pool. |
| */ |
| static void worker_detach_from_pool(struct worker *worker) |
| { |
| struct worker_pool *pool = worker->pool; |
| |
| /* there is one permanent BH worker per CPU which should never detach */ |
| WARN_ON_ONCE(pool->flags & POOL_BH); |
| |
| mutex_lock(&wq_pool_attach_mutex); |
| detach_worker(worker); |
| worker->pool = NULL; |
| mutex_unlock(&wq_pool_attach_mutex); |
| |
| /* clear leftover flags without pool->lock after it is detached */ |
| worker->flags &= ~(WORKER_UNBOUND | WORKER_REBOUND); |
| } |
| |
| static int format_worker_id(char *buf, size_t size, struct worker *worker, |
| struct worker_pool *pool) |
| { |
| if (worker->rescue_wq) |
| return scnprintf(buf, size, "kworker/R-%s", |
| worker->rescue_wq->name); |
| |
| if (pool) { |
| if (pool->cpu >= 0) |
| return scnprintf(buf, size, "kworker/%d:%d%s", |
| pool->cpu, worker->id, |
| pool->attrs->nice < 0 ? "H" : ""); |
| else |
| return scnprintf(buf, size, "kworker/u%d:%d", |
| pool->id, worker->id); |
| } else { |
| return scnprintf(buf, size, "kworker/dying"); |
| } |
| } |
| |
| /** |
| * create_worker - create a new workqueue worker |
| * @pool: pool the new worker will belong to |
| * |
| * Create and start a new worker which is attached to @pool. |
| * |
| * CONTEXT: |
| * Might sleep. Does GFP_KERNEL allocations. |
| * |
| * Return: |
| * Pointer to the newly created worker. |
| */ |
| static struct worker *create_worker(struct worker_pool *pool) |
| { |
| struct worker *worker; |
| int id; |
| |
| /* ID is needed to determine kthread name */ |
| id = ida_alloc(&pool->worker_ida, GFP_KERNEL); |
| if (id < 0) { |
| pr_err_once("workqueue: Failed to allocate a worker ID: %pe\n", |
| ERR_PTR(id)); |
| return NULL; |
| } |
| |
| worker = alloc_worker(pool->node); |
| if (!worker) { |
| pr_err_once("workqueue: Failed to allocate a worker\n"); |
| goto fail; |
| } |
| |
| worker->id = id; |
| |
| if (!(pool->flags & POOL_BH)) { |
| char id_buf[WORKER_ID_LEN]; |
| |
| format_worker_id(id_buf, sizeof(id_buf), worker, pool); |
| worker->task = kthread_create_on_node(worker_thread, worker, |
| pool->node, "%s", id_buf); |
| if (IS_ERR(worker->task)) { |
| if (PTR_ERR(worker->task) == -EINTR) { |
| pr_err("workqueue: Interrupted when creating a worker thread \"%s\"\n", |
| id_buf); |
| } else { |
| pr_err_once("workqueue: Failed to create a worker thread: %pe", |
| worker->task); |
| } |
| goto fail; |
| } |
| |
| set_user_nice(worker->task, pool->attrs->nice); |
| kthread_bind_mask(worker->task, pool_allowed_cpus(pool)); |
| } |
| |
| /* successful, attach the worker to the pool */ |
| worker_attach_to_pool(worker, pool); |
| |
| /* start the newly created worker */ |
| raw_spin_lock_irq(&pool->lock); |
| |
| worker->pool->nr_workers++; |
| worker_enter_idle(worker); |
| |
| /* |
| * @worker is waiting on a completion in kthread() and will trigger hung |
| * check if not woken up soon. As kick_pool() is noop if @pool is empty, |
| * wake it up explicitly. |
| */ |
| if (worker->task) |
| wake_up_process(worker->task); |
| |
| raw_spin_unlock_irq(&pool->lock); |
| |
| return worker; |
| |
| fail: |
| ida_free(&pool->worker_ida, id); |
| kfree(worker); |
| return NULL; |
| } |
| |
| static void detach_dying_workers(struct list_head *cull_list) |
| { |
| struct worker *worker; |
| |
| list_for_each_entry(worker, cull_list, entry) |
| detach_worker(worker); |
| } |
| |
| static void reap_dying_workers(struct list_head *cull_list) |
| { |
| struct worker *worker, *tmp; |
| |
| list_for_each_entry_safe(worker, tmp, cull_list, entry) { |
| list_del_init(&worker->entry); |
| kthread_stop_put(worker->task); |
| kfree(worker); |
| } |
| } |
| |
| /** |
| * set_worker_dying - Tag a worker for destruction |
| * @worker: worker to be destroyed |
| * @list: transfer worker away from its pool->idle_list and into list |
| * |
| * Tag @worker for destruction and adjust @pool stats accordingly. The worker |
| * should be idle. |
| * |
| * CONTEXT: |
| * raw_spin_lock_irq(pool->lock). |
| */ |
| static void set_worker_dying(struct worker *worker, struct list_head *list) |
| { |
| struct worker_pool *pool = worker->pool; |
| |
| lockdep_assert_held(&pool->lock); |
| lockdep_assert_held(&wq_pool_attach_mutex); |
| |
| /* sanity check frenzy */ |
| if (WARN_ON(worker->current_work) || |
| WARN_ON(!list_empty(&worker->scheduled)) || |
| WARN_ON(!(worker->flags & WORKER_IDLE))) |
| return; |
| |
| pool->nr_workers--; |
| pool->nr_idle--; |
| |
| worker->flags |= WORKER_DIE; |
| |
| list_move(&worker->entry, list); |
| |
| /* get an extra task struct reference for later kthread_stop_put() */ |
| get_task_struct(worker->task); |
| } |
| |
| /** |
| * idle_worker_timeout - check if some idle workers can now be deleted. |
| * @t: The pool's idle_timer that just expired |
| * |
| * The timer is armed in worker_enter_idle(). Note that it isn't disarmed in |
| * worker_leave_idle(), as a worker flicking between idle and active while its |
| * pool is at the too_many_workers() tipping point would cause too much timer |
| * housekeeping overhead. Since IDLE_WORKER_TIMEOUT is long enough, we just let |
| * it expire and re-evaluate things from there. |
| */ |
| static void idle_worker_timeout(struct timer_list *t) |
| { |
| struct worker_pool *pool = from_timer(pool, t, idle_timer); |
| bool do_cull = false; |
| |
| if (work_pending(&pool->idle_cull_work)) |
| return; |
| |
| raw_spin_lock_irq(&pool->lock); |
| |
| if (too_many_workers(pool)) { |
| struct worker *worker; |
| unsigned long expires; |
| |
| /* idle_list is kept in LIFO order, check the last one */ |
| worker = list_last_entry(&pool->idle_list, struct worker, entry); |
| expires = worker->last_active + IDLE_WORKER_TIMEOUT; |
| do_cull = !time_before(jiffies, expires); |
| |
| if (!do_cull) |
| mod_timer(&pool->idle_timer, expires); |
| } |
| raw_spin_unlock_irq(&pool->lock); |
| |
| if (do_cull) |
| queue_work(system_unbound_wq, &pool->idle_cull_work); |
| } |
| |
| /** |
| * idle_cull_fn - cull workers that have been idle for too long. |
| * @work: the pool's work for handling these idle workers |
| * |
| * This goes through a pool's idle workers and gets rid of those that have been |
| * idle for at least IDLE_WORKER_TIMEOUT seconds. |
| * |
| * We don't want to disturb isolated CPUs because of a pcpu kworker being |
| * culled, so this also resets worker affinity. This requires a sleepable |
| * context, hence the split between timer callback and work item. |
| */ |
| static void idle_cull_fn(struct work_struct *work) |
| { |
| struct worker_pool *pool = container_of(work, struct worker_pool, idle_cull_work); |
| LIST_HEAD(cull_list); |
| |
| /* |
| * Grabbing wq_pool_attach_mutex here ensures an already-running worker |
| * cannot proceed beyong set_pf_worker() in its self-destruct path. |
| * This is required as a previously-preempted worker could run after |
| * set_worker_dying() has happened but before detach_dying_workers() did. |
| */ |
| mutex_lock(&wq_pool_attach_mutex); |
| raw_spin_lock_irq(&pool->lock); |
| |
| while (too_many_workers(pool)) { |
| struct worker *worker; |
| unsigned long expires; |
| |
| worker = list_last_entry(&pool->idle_list, struct worker, entry); |
| expires = worker->last_active + IDLE_WORKER_TIMEOUT; |
| |
| if (time_before(jiffies, expires)) { |
| mod_timer(&pool->idle_timer, expires); |
| break; |
| } |
| |
| set_worker_dying(worker, &cull_list); |
| } |
| |
| raw_spin_unlock_irq(&pool->lock); |
| detach_dying_workers(&cull_list); |
| mutex_unlock(&wq_pool_attach_mutex); |
| |
| reap_dying_workers(&cull_list); |
| } |
| |
| static void send_mayday(struct work_struct *work) |
| { |
| struct pool_workqueue *pwq = get_work_pwq(work); |
| struct workqueue_struct *wq = pwq->wq; |
| |
| lockdep_assert_held(&wq_mayday_lock); |
| |
| if (!wq->rescuer) |
| return; |
| |
| /* mayday mayday mayday */ |
| if (list_empty(&pwq->mayday_node)) { |
| /* |
| * If @pwq is for an unbound wq, its base ref may be put at |
| * any time due to an attribute change. Pin @pwq until the |
| * rescuer is done with it. |
| */ |
| get_pwq(pwq); |
| list_add_tail(&pwq->mayday_node, &wq->maydays); |
| wake_up_process(wq->rescuer->task); |
| pwq->stats[PWQ_STAT_MAYDAY]++; |
| } |
| } |
| |
| static void pool_mayday_timeout(struct timer_list *t) |
| { |
| struct worker_pool *pool = from_timer(pool, t, mayday_timer); |
| struct work_struct *work; |
| |
| raw_spin_lock_irq(&pool->lock); |
| raw_spin_lock(&wq_mayday_lock); /* for wq->maydays */ |
| |
| if (need_to_create_worker(pool)) { |
| /* |
| * We've been trying to create a new worker but |
| * haven't been successful. We might be hitting an |
| * allocation deadlock. Send distress signals to |
| * rescuers. |
| */ |
| list_for_each_entry(work, &pool->worklist, entry) |
| send_mayday(work); |
| } |
| |
| raw_spin_unlock(&wq_mayday_lock); |
| raw_spin_unlock_irq(&pool->lock); |
| |
| mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL); |
| } |
| |
| /** |
| * maybe_create_worker - create a new worker if necessary |
| * @pool: pool to create a new worker for |
| * |
| * Create a new worker for @pool if necessary. @pool is guaranteed to |
| * have at least one idle worker on return from this function. If |
| * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is |
| * sent to all rescuers with works scheduled on @pool to resolve |
| * possible allocation deadlock. |
| * |
| * On return, need_to_create_worker() is guaranteed to be %false and |
| * may_start_working() %true. |
| * |
| * LOCKING: |
| * raw_spin_lock_irq(pool->lock) which may be released and regrabbed |
| * multiple times. Does GFP_KERNEL allocations. Called only from |
| * manager. |
| */ |
| static void maybe_create_worker(struct worker_pool *pool) |
| __releases(&pool->lock) |
| __acquires(&pool->lock) |
| { |
| restart: |
| raw_spin_unlock_irq(&pool->lock); |
| |
| /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */ |
| mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT); |
| |
| while (true) { |
| if (create_worker(pool) || !need_to_create_worker(pool)) |
| break; |
| |
| schedule_timeout_interruptible(CREATE_COOLDOWN); |
| |
| if (!need_to_create_worker(pool)) |
| break; |
| } |
| |
| del_timer_sync(&pool->mayday_timer); |
| raw_spin_lock_irq(&pool->lock); |
| /* |
| * This is necessary even after a new worker was just successfully |
| * created as @pool->lock was dropped and the new worker might have |
| * already become busy. |
| */ |
| if (need_to_create_worker(pool)) |
| goto restart; |
| } |
| |
| /** |
| * manage_workers - manage worker pool |
| * @worker: self |
| * |
| * Assume the manager role and manage the worker pool @worker belongs |
| * to. At any given time, there can be only zero or one manager per |
| * pool. The exclusion is handled automatically by this function. |
| * |
| * The caller can safely start processing works on false return. On |
| * true return, it's guaranteed that need_to_create_worker() is false |
| * and may_start_working() is true. |
| * |
| * CONTEXT: |
| * raw_spin_lock_irq(pool->lock) which may be released and regrabbed |
| * multiple times. Does GFP_KERNEL allocations. |
| * |
| * Return: |
| * %false if the pool doesn't need management and the caller can safely |
| * start processing works, %true if management function was performed and |
| * the conditions that the caller verified before calling the function may |
| * no longer be true. |
| */ |
| static bool manage_workers(struct worker *worker) |
| { |
| struct worker_pool *pool = worker->pool; |
| |
| if (pool->flags & POOL_MANAGER_ACTIVE) |
| return false; |
| |
| pool->flags |= POOL_MANAGER_ACTIVE; |
| pool->manager = worker; |
| |
| maybe_create_worker(pool); |
| |
| pool->manager = NULL; |
| pool->flags &= ~POOL_MANAGER_ACTIVE; |
| rcuwait_wake_up(&manager_wait); |
| return true; |
| } |
| |
| /** |
| * process_one_work - process single work |
| * @worker: self |
| * @work: work to process |
| * |
| * Process @work. This function contains all the logics necessary to |
| * process a single work including synchronization against and |
| * interaction with other workers on the same cpu, queueing and |
| * flushing. As long as context requirement is met, any worker can |
| * call this function to process a work. |
| * |
| * CONTEXT: |
| * raw_spin_lock_irq(pool->lock) which is released and regrabbed. |
| */ |
| static void process_one_work(struct worker *worker, struct work_struct *work) |
| __releases(&pool->lock) |
| __acquires(&pool->lock) |
| { |
| struct pool_workqueue *pwq = get_work_pwq(work); |
| struct worker_pool *pool = worker->pool; |
| unsigned long work_data; |
| int lockdep_start_depth, rcu_start_depth; |
| bool bh_draining = pool->flags & POOL_BH_DRAINING; |
| #ifdef CONFIG_LOCKDEP |
| /* |
| * It is permissible to free the struct work_struct from |
| * inside the function that is called from it, this we need to |
| * take into account for lockdep too. To avoid bogus "held |
| * lock freed" warnings as well as problems when looking into |
| * work->lockdep_map, make a copy and use that here. |
| */ |
| struct lockdep_map lockdep_map; |
| |
| lockdep_copy_map(&lockdep_map, &work->lockdep_map); |
| #endif |
| /* ensure we're on the correct CPU */ |
| WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) && |
| raw_smp_processor_id() != pool->cpu); |
| |
| /* claim and dequeue */ |
| debug_work_deactivate(work); |
| hash_add(pool->busy_hash, &worker->hentry, (unsigned long)work); |
| worker->current_work = work; |
| worker->current_func = work->func; |
| worker->current_pwq = pwq; |
| if (worker->task) |
| worker->current_at = worker->task->se.sum_exec_runtime; |
| work_data = *work_data_bits(work); |
| worker->current_color = get_work_color(work_data); |
| |
| /* |
| * Record wq name for cmdline and debug reporting, may get |
| * overridden through set_worker_desc(). |
| */ |
| strscpy(worker->desc, pwq->wq->name, WORKER_DESC_LEN); |
| |
| list_del_init(&work->entry); |
| |
| /* |
| * CPU intensive works don't participate in concurrency management. |
| * They're the scheduler's responsibility. This takes @worker out |
| * of concurrency management and the next code block will chain |
| * execution of the pending work items. |
| */ |
| if (unlikely(pwq->wq->flags & WQ_CPU_INTENSIVE)) |
| worker_set_flags(worker, WORKER_CPU_INTENSIVE); |
| |
| /* |
| * Kick @pool if necessary. It's always noop for per-cpu worker pools |
| * since nr_running would always be >= 1 at this point. This is used to |
| * chain execution of the pending work items for WORKER_NOT_RUNNING |
| * workers such as the UNBOUND and CPU_INTENSIVE ones. |
| */ |
| kick_pool(pool); |
| |
| /* |
| * Record the last pool and clear PENDING which should be the last |
| * update to @work. Also, do this inside @pool->lock so that |
| * PENDING and queued state changes happen together while IRQ is |
| * disabled. |
| */ |
| set_work_pool_and_clear_pending(work, pool->id, pool_offq_flags(pool)); |
| |
| pwq->stats[PWQ_STAT_STARTED]++; |
| raw_spin_unlock_irq(&pool->lock); |
| |
| rcu_start_depth = rcu_preempt_depth(); |
| lockdep_start_depth = lockdep_depth(current); |
| /* see drain_dead_softirq_workfn() */ |
| if (!bh_draining) |
| lock_map_acquire(pwq->wq->lockdep_map); |
| lock_map_acquire(&lockdep_map); |
| /* |
| * Strictly speaking we should mark the invariant state without holding |
| * any locks, that is, before these two lock_map_acquire()'s. |
| * |
| * However, that would result in: |
| * |
| * A(W1) |
| * WFC(C) |
| * A(W1) |
| * C(C) |
| * |
| * Which would create W1->C->W1 dependencies, even though there is no |
| * actual deadlock possible. There are two solutions, using a |
| * read-recursive acquire on the work(queue) 'locks', but this will then |
| * hit the lockdep limitation on recursive locks, or simply discard |
| * these locks. |
| * |
| * AFAICT there is no possible deadlock scenario between the |
| * flush_work() and complete() primitives (except for single-threaded |
| * workqueues), so hiding them isn't a problem. |
| */ |
| lockdep_invariant_state(true); |
| trace_workqueue_execute_start(work); |
| worker->current_func(work); |
| /* |
| * While we must be careful to not use "work" after this, the trace |
| * point will only record its address. |
| */ |
| trace_workqueue_execute_end(work, worker->current_func); |
| pwq->stats[PWQ_STAT_COMPLETED]++; |
| lock_map_release(&lockdep_map); |
| if (!bh_draining) |
| lock_map_release(pwq->wq->lockdep_map); |
| |
| if (unlikely((worker->task && in_atomic()) || |
| lockdep_depth(current) != lockdep_start_depth || |
| rcu_preempt_depth() != rcu_start_depth)) { |
| pr_err("BUG: workqueue leaked atomic, lock or RCU: %s[%d]\n" |
| " preempt=0x%08x lock=%d->%d RCU=%d->%d workfn=%ps\n", |
| current->comm, task_pid_nr(current), preempt_count(), |
| lockdep_start_depth, lockdep_depth(current), |
| rcu_start_depth, rcu_preempt_depth(), |
| worker->current_func); |
| debug_show_held_locks(current); |
| dump_stack(); |
| } |
| |
| /* |
| * The following prevents a kworker from hogging CPU on !PREEMPTION |
| * kernels, where a requeueing work item waiting for something to |
| * happen could deadlock with stop_machine as such work item could |
| * indefinitely requeue itself while all other CPUs are trapped in |
| * stop_machine. At the same time, report a quiescent RCU state so |
| * the same condition doesn't freeze RCU. |
| */ |
| if (worker->task) |
| cond_resched(); |
| |
| raw_spin_lock_irq(&pool->lock); |
| |
| /* |
| * In addition to %WQ_CPU_INTENSIVE, @worker may also have been marked |
| * CPU intensive by wq_worker_tick() if @work hogged CPU longer than |
| * wq_cpu_intensive_thresh_us. Clear it. |
| */ |
| worker_clr_flags(worker, WORKER_CPU_INTENSIVE); |
| |
| /* tag the worker for identification in schedule() */ |
| worker->last_func = worker->current_func; |
| |
| /* we're done with it, release */ |
| hash_del(&worker->hentry); |
| worker->current_work = NULL; |
| worker->current_func = NULL; |
| worker->current_pwq = NULL; |
| worker->current_color = INT_MAX; |
| |
| /* must be the last step, see the function comment */ |
| pwq_dec_nr_in_flight(pwq, work_data); |
| } |
| |
| /** |
| * process_scheduled_works - process scheduled works |
| * @worker: self |
| * |
| * Process all scheduled works. Please note that the scheduled list |
| * may change while processing a work, so this function repeatedly |
| * fetches a work from the top and executes it. |
| * |
| * CONTEXT: |
| * raw_spin_lock_irq(pool->lock) which may be released and regrabbed |
| * multiple times. |
| */ |
| static void process_scheduled_works(struct worker *worker) |
| { |
| struct work_struct *work; |
| bool first = true; |
| |
| while ((work = list_first_entry_or_null(&worker->scheduled, |
| struct work_struct, entry))) { |
| if (first) { |
| worker->pool->watchdog_ts = jiffies; |
| first = false; |
| } |
| process_one_work(worker, work); |
| } |
| } |
| |
| static void set_pf_worker(bool val) |
| { |
| mutex_lock(&wq_pool_attach_mutex); |
| if (val) |
| current->flags |= PF_WQ_WORKER; |
| else |
| current->flags &= ~PF_WQ_WORKER; |
| mutex_unlock(&wq_pool_attach_mutex); |
| } |
| |
| /** |
| * worker_thread - the worker thread function |
| * @__worker: self |
| * |
| * The worker thread function. All workers belong to a worker_pool - |
| * either a per-cpu one or dynamic unbound one. These workers process all |
| * work items regardless of their specific target workqueue. The only |
| * exception is work items which belong to workqueues with a rescuer which |
| * will be explained in rescuer_thread(). |
| * |
| * Return: 0 |
| */ |
| static int worker_thread(void *__worker) |
| { |
| struct worker *worker = __worker; |
| struct worker_pool *pool = worker->pool; |
| |
| /* tell the scheduler that this is a workqueue worker */ |
| set_pf_worker(true); |
| woke_up: |
| raw_spin_lock_irq(&pool->lock); |
| |
| /* am I supposed to die? */ |
| if (unlikely(worker->flags & WORKER_DIE)) { |
| raw_spin_unlock_irq(&pool->lock); |
| set_pf_worker(false); |
| /* |
| * The worker is dead and PF_WQ_WORKER is cleared, worker->pool |
| * shouldn't be accessed, reset it to NULL in case otherwise. |
| */ |
| worker->pool = NULL; |
| ida_free(&pool->worker_ida, worker->id); |
| return 0; |
| } |
| |
| worker_leave_idle(worker); |
| recheck: |
| /* no more worker necessary? */ |
| if (!need_more_worker(pool)) |
| goto sleep; |
| |
| /* do we need to manage? */ |
| if (unlikely(!may_start_working(pool)) && manage_workers(worker)) |
| goto recheck; |
| |
| /* |
| * ->scheduled list can only be filled while a worker is |
| * preparing to process a work or actually processing it. |
| * Make sure nobody diddled with it while I was sleeping. |
| */ |
| WARN_ON_ONCE(!list_empty(&worker->scheduled)); |
| |
| /* |
| * Finish PREP stage. We're guaranteed to have at least one idle |
| * worker or that someone else has already assumed the manager |
| * role. This is where @worker starts participating in concurrency |
| * management if applicable and concurrency management is restored |
| * after being rebound. See rebind_workers() for details. |
| */ |
| worker_clr_flags(worker, WORKER_PREP | WORKER_REBOUND); |
| |
| do { |
| struct work_struct *work = |
| list_first_entry(&pool->worklist, |
| struct work_struct, entry); |
| |
| if (assign_work(work, worker, NULL)) |
| process_scheduled_works(worker); |
| } while (keep_working(pool)); |
| |
| worker_set_flags(worker, WORKER_PREP); |
| sleep: |
| /* |
| * pool->lock is held and there's no work to process and no need to |
| * manage, sleep. Workers are woken up only while holding |
| * pool->lock or from local cpu, so setting the current state |
| * before releasing pool->lock is enough to prevent losing any |
| * event. |
| */ |
| worker_enter_idle(worker); |
| __set_current_state(TASK_IDLE); |
| raw_spin_unlock_irq(&pool->lock); |
| schedule(); |
| goto woke_up; |
| } |
| |
| /** |
| * rescuer_thread - the rescuer thread function |
| * @__rescuer: self |
| * |
| * Workqueue rescuer thread function. There's one rescuer for each |
| * workqueue which has WQ_MEM_RECLAIM set. |
| * |
| * Regular work processing on a pool may block trying to create a new |
| * worker which uses GFP_KERNEL allocation which has slight chance of |
| * developing into deadlock if some works currently on the same queue |
| * need to be processed to satisfy the GFP_KERNEL allocation. This is |
| * the problem rescuer solves. |
| * |
| * When such condition is possible, the pool summons rescuers of all |
| * workqueues which have works queued on the pool and let them process |
| * those works so that forward progress can be guaranteed. |
| * |
| * This should happen rarely. |
|