tools/sched_ext/scx_qmap.bpf.c - linux - Git at Google

 /* SPDX-License-Identifier: GPL-2.0 */
 /*
  * A simple five-level FIFO queue scheduler.
  *
  * There are five FIFOs implemented using BPF_MAP_TYPE_QUEUE. A task gets
  * assigned to one depending on its compound weight. Each CPU round robins
  * through the FIFOs and dispatches more from FIFOs with higher indices - 1 from
  * queue0, 2 from queue1, 4 from queue2 and so on.
  *
  * This scheduler demonstrates:
  *
  * - BPF-side queueing using PIDs.
  * - Sleepable per-task storage allocation using ops.prep_enable().
  * - Using ops.cpu_release() to handle a higher priority scheduling class taking
  *   the CPU away.
  * - Core-sched support.
  *
  * This scheduler is primarily for demonstration and testing of sched_ext
  * features and unlikely to be useful for actual workloads.
  *
  * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
  * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
  * Copyright (c) 2022 David Vernet <dvernet@meta.com>
  */
 #include <scx/common.bpf.h>

 enum consts {
 	ONE_SEC_IN_NS		= 1000000000,
 	SHARED_DSQ		= 0,
 	HIGHPRI_DSQ		= 1,
 	HIGHPRI_WEIGHT		= 8668,		/* this is what -20 maps to */
 };

 char _license[] SEC("license") = "GPL";

 const volatile u64 slice_ns = SCX_SLICE_DFL;
 const volatile u32 stall_user_nth;
 const volatile u32 stall_kernel_nth;
 const volatile u32 dsp_inf_loop_after;
 const volatile u32 dsp_batch;
 const volatile bool highpri_boosting;
 const volatile bool print_shared_dsq;
 const volatile s32 disallow_tgid;
 const volatile bool suppress_dump;

 u64 nr_highpri_queued;
 u32 test_error_cnt;

 UEI_DEFINE(uei);

 struct qmap {
 	__uint(type, BPF_MAP_TYPE_QUEUE);
 	__uint(max_entries, 4096);
 	__type(value, u32);
 } queue0 SEC(".maps"),
   queue1 SEC(".maps"),
   queue2 SEC(".maps"),
   queue3 SEC(".maps"),
   queue4 SEC(".maps");

 struct {
 	__uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
 	__uint(max_entries, 5);
 	__type(key, int);
 	__array(values, struct qmap);
 } queue_arr SEC(".maps") = {
 	.values = {
 		[0] = &queue0,
 		[1] = &queue1,
 		[2] = &queue2,
 		[3] = &queue3,
 		[4] = &queue4,
 	},
 };

 /*
  * If enabled, CPU performance target is set according to the queue index
  * according to the following table.
  */
 static const u32 qidx_to_cpuperf_target[] = {
 	[0] = SCX_CPUPERF_ONE * 0 / 4,
 	[1] = SCX_CPUPERF_ONE * 1 / 4,
 	[2] = SCX_CPUPERF_ONE * 2 / 4,
 	[3] = SCX_CPUPERF_ONE * 3 / 4,
 	[4] = SCX_CPUPERF_ONE * 4 / 4,
 };

 /*
  * Per-queue sequence numbers to implement core-sched ordering.
  *
  * Tail seq is assigned to each queued task and incremented. Head seq tracks the
  * sequence number of the latest dispatched task. The distance between the a
  * task's seq and the associated queue's head seq is called the queue distance
  * and used when comparing two tasks for ordering. See qmap_core_sched_before().
  */
 static u64 core_sched_head_seqs[5];
 static u64 core_sched_tail_seqs[5];

 /* Per-task scheduling context */
 struct task_ctx {
 	bool	force_local;	/* Dispatch directly to local_dsq */
 	bool	highpri;
 	u64	core_sched_seq;
 };

 struct {
 	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
 	__uint(map_flags, BPF_F_NO_PREALLOC);
 	__type(key, int);
 	__type(value, struct task_ctx);
 } task_ctx_stor SEC(".maps");

 struct cpu_ctx {
 	u64	dsp_idx;	/* dispatch index */
 	u64	dsp_cnt;	/* remaining count */
 	u32	avg_weight;
 	u32	cpuperf_target;
 };

 struct {
 	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
 	__uint(max_entries, 1);
 	__type(key, u32);
 	__type(value, struct cpu_ctx);
 } cpu_ctx_stor SEC(".maps");

 /* Statistics */
 u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued, nr_ddsp_from_enq;
 u64 nr_core_sched_execed;
 u64 nr_expedited_local, nr_expedited_remote, nr_expedited_lost, nr_expedited_from_timer;
 u32 cpuperf_min, cpuperf_avg, cpuperf_max;
 u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max;

 static s32 pick_direct_dispatch_cpu(struct task_struct *p, s32 prev_cpu)
 {
 	s32 cpu;

 	if (p->nr_cpus_allowed == 1 ||
 	    scx_bpf_test_and_clear_cpu_idle(prev_cpu))
 		return prev_cpu;

 	cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
 	if (cpu >= 0)
 		return cpu;

 	return -1;
 }

 static struct task_ctx *lookup_task_ctx(struct task_struct *p)
 {
 	struct task_ctx *tctx;

 	if (!(tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0))) {
 		scx_bpf_error("task_ctx lookup failed");
 		return NULL;
 	}
 	return tctx;
 }

 s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
 		   s32 prev_cpu, u64 wake_flags)
 {
 	struct task_ctx *tctx;
 	s32 cpu;

 	if (!(tctx = lookup_task_ctx(p)))
 		return -ESRCH;

 	cpu = pick_direct_dispatch_cpu(p, prev_cpu);

 	if (cpu >= 0) {
 		tctx->force_local = true;
 		return cpu;
 	} else {
 		return prev_cpu;
 	}
 }

 static int weight_to_idx(u32 weight)
 {
 	/* Coarsely map the compound weight to a FIFO. */
 	if (weight <= 25)
 		return 0;
 	else if (weight <= 50)
 		return 1;
 	else if (weight < 200)
 		return 2;
 	else if (weight < 400)
 		return 3;
 	else
 		return 4;
 }

 void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
 {
 	static u32 user_cnt, kernel_cnt;
 	struct task_ctx *tctx;
 	u32 pid = p->pid;
 	int idx = weight_to_idx(p->scx.weight);
 	void *ring;
 	s32 cpu;

 	if (p->flags & PF_KTHREAD) {
 		if (stall_kernel_nth && !(++kernel_cnt % stall_kernel_nth))
 			return;
 	} else {
 		if (stall_user_nth && !(++user_cnt % stall_user_nth))
 			return;
 	}

 	if (test_error_cnt && !--test_error_cnt)
 		scx_bpf_error("test triggering error");

 	if (!(tctx = lookup_task_ctx(p)))
 		return;

 	/*
 	 * All enqueued tasks must have their core_sched_seq updated for correct
 	 * core-sched ordering. Also, take a look at the end of qmap_dispatch().
 	 */
 	tctx->core_sched_seq = core_sched_tail_seqs[idx]++;

 	/*
 	 * If qmap_select_cpu() is telling us to or this is the last runnable
 	 * task on the CPU, enqueue locally.
 	 */
 	if (tctx->force_local) {
 		tctx->force_local = false;
 		scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, enq_flags);
 		return;
 	}

 	/* if select_cpu() wasn't called, try direct dispatch */
 	if (!(enq_flags & SCX_ENQ_CPU_SELECTED) &&
 	    (cpu = pick_direct_dispatch_cpu(p, scx_bpf_task_cpu(p))) >= 0) {
 		__sync_fetch_and_add(&nr_ddsp_from_enq, 1);
 		scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | cpu, slice_ns, enq_flags);
 		return;
 	}

 	/*
 	 * If the task was re-enqueued due to the CPU being preempted by a
 	 * higher priority scheduling class, just re-enqueue the task directly
 	 * on the global DSQ. As we want another CPU to pick it up, find and
 	 * kick an idle CPU.
 	 */
 	if (enq_flags & SCX_ENQ_REENQ) {
 		s32 cpu;

 		scx_bpf_dispatch(p, SHARED_DSQ, 0, enq_flags);
 		cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
 		if (cpu >= 0)
 			scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
 		return;
 	}

 	ring = bpf_map_lookup_elem(&queue_arr, &idx);
 	if (!ring) {
 		scx_bpf_error("failed to find ring %d", idx);
 		return;
 	}

 	/* Queue on the selected FIFO. If the FIFO overflows, punt to global. */
 	if (bpf_map_push_elem(ring, &pid, 0)) {
 		scx_bpf_dispatch(p, SHARED_DSQ, slice_ns, enq_flags);
 		return;
 	}

 	if (highpri_boosting && p->scx.weight >= HIGHPRI_WEIGHT) {
 		tctx->highpri = true;
 		__sync_fetch_and_add(&nr_highpri_queued, 1);
 	}
 	__sync_fetch_and_add(&nr_enqueued, 1);
 }

 /*
  * The BPF queue map doesn't support removal and sched_ext can handle spurious
  * dispatches. qmap_dequeue() is only used to collect statistics.
  */
 void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
 {
 	__sync_fetch_and_add(&nr_dequeued, 1);
 	if (deq_flags & SCX_DEQ_CORE_SCHED_EXEC)
 		__sync_fetch_and_add(&nr_core_sched_execed, 1);
 }

 static void update_core_sched_head_seq(struct task_struct *p)
 {
 	int idx = weight_to_idx(p->scx.weight);
 	struct task_ctx *tctx;

 	if ((tctx = lookup_task_ctx(p)))
 		core_sched_head_seqs[idx] = tctx->core_sched_seq;
 }

 /*
  * To demonstrate the use of scx_bpf_dispatch_from_dsq(), implement silly
  * selective priority boosting mechanism by scanning SHARED_DSQ looking for
  * highpri tasks, moving them to HIGHPRI_DSQ and then consuming them first. This
  * makes minor difference only when dsp_batch is larger than 1.
  *
  * scx_bpf_dispatch[_vtime]_from_dsq() are allowed both from ops.dispatch() and
  * non-rq-lock holding BPF programs. As demonstration, this function is called
  * from qmap_dispatch() and monitor_timerfn().
  */
 static bool dispatch_highpri(bool from_timer)
 {
 	struct task_struct *p;
 	s32 this_cpu = bpf_get_smp_processor_id();

 	/* scan SHARED_DSQ and move highpri tasks to HIGHPRI_DSQ */
 	bpf_for_each(scx_dsq, p, SHARED_DSQ, 0) {
 		static u64 highpri_seq;
 		struct task_ctx *tctx;

 		if (!(tctx = lookup_task_ctx(p)))
 			return false;

 		if (tctx->highpri) {
 			/* exercise the set_*() and vtime interface too */
 			__COMPAT_scx_bpf_dispatch_from_dsq_set_slice(
 				BPF_FOR_EACH_ITER, slice_ns * 2);
 			__COMPAT_scx_bpf_dispatch_from_dsq_set_vtime(
 				BPF_FOR_EACH_ITER, highpri_seq++);
 			__COMPAT_scx_bpf_dispatch_vtime_from_dsq(
 				BPF_FOR_EACH_ITER, p, HIGHPRI_DSQ, 0);
 		}
 	}

 	/*
 	 * Scan HIGHPRI_DSQ and dispatch until a task that can run on this CPU
 	 * is found.
 	 */
 	bpf_for_each(scx_dsq, p, HIGHPRI_DSQ, 0) {
 		bool dispatched = false;
 		s32 cpu;

 		if (bpf_cpumask_test_cpu(this_cpu, p->cpus_ptr))
 			cpu = this_cpu;
 		else
 			cpu = scx_bpf_pick_any_cpu(p->cpus_ptr, 0);

 		if (__COMPAT_scx_bpf_dispatch_from_dsq(BPF_FOR_EACH_ITER, p,
 						       SCX_DSQ_LOCAL_ON | cpu,
 						       SCX_ENQ_PREEMPT)) {
 			if (cpu == this_cpu) {
 				dispatched = true;
 				__sync_fetch_and_add(&nr_expedited_local, 1);
 			} else {
 				__sync_fetch_and_add(&nr_expedited_remote, 1);
 			}
 			if (from_timer)
 				__sync_fetch_and_add(&nr_expedited_from_timer, 1);
 		} else {
 			__sync_fetch_and_add(&nr_expedited_lost, 1);
 		}

 		if (dispatched)
 			return true;
 	}

 	return false;
 }

 void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
 {
 	struct task_struct *p;
 	struct cpu_ctx *cpuc;
 	struct task_ctx *tctx;
 	u32 zero = 0, batch = dsp_batch ?: 1;
 	void *fifo;
 	s32 i, pid;

 	if (dispatch_highpri(false))
 		return;

 	if (!nr_highpri_queued && scx_bpf_consume(SHARED_DSQ))
 		return;

 	if (dsp_inf_loop_after && nr_dispatched > dsp_inf_loop_after) {
 		/*
 		 * PID 2 should be kthreadd which should mostly be idle and off
 		 * the scheduler. Let's keep dispatching it to force the kernel
 		 * to call this function over and over again.
 		 */
 		p = bpf_task_from_pid(2);
 		if (p) {
 			scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, 0);
 			bpf_task_release(p);
 			return;
 		}
 	}

 	if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
 		scx_bpf_error("failed to look up cpu_ctx");
 		return;
 	}

 	for (i = 0; i < 5; i++) {
 		/* Advance the dispatch cursor and pick the fifo. */
 		if (!cpuc->dsp_cnt) {
 			cpuc->dsp_idx = (cpuc->dsp_idx + 1) % 5;
 			cpuc->dsp_cnt = 1 << cpuc->dsp_idx;
 		}

 		fifo = bpf_map_lookup_elem(&queue_arr, &cpuc->dsp_idx);
 		if (!fifo) {
 			scx_bpf_error("failed to find ring %llu", cpuc->dsp_idx);
 			return;
 		}

 		/* Dispatch or advance. */
 		bpf_repeat(BPF_MAX_LOOPS) {
 			struct task_ctx *tctx;

 			if (bpf_map_pop_elem(fifo, &pid))
 				break;

 			p = bpf_task_from_pid(pid);
 			if (!p)
 				continue;

 			if (!(tctx = lookup_task_ctx(p))) {
 				bpf_task_release(p);
 				return;
 			}

 			if (tctx->highpri)
 				__sync_fetch_and_sub(&nr_highpri_queued, 1);

 			update_core_sched_head_seq(p);
 			__sync_fetch_and_add(&nr_dispatched, 1);

 			scx_bpf_dispatch(p, SHARED_DSQ, slice_ns, 0);
 			bpf_task_release(p);

 			batch--;
 			cpuc->dsp_cnt--;
 			if (!batch || !scx_bpf_dispatch_nr_slots()) {
 				if (dispatch_highpri(false))
 					return;
 				scx_bpf_consume(SHARED_DSQ);
 				return;
 			}
 			if (!cpuc->dsp_cnt)
 				break;
 		}

 		cpuc->dsp_cnt = 0;
 	}

 	/*
 	 * No other tasks. @prev will keep running. Update its core_sched_seq as
 	 * if the task were enqueued and dispatched immediately.
 	 */
 	if (prev) {
 		tctx = bpf_task_storage_get(&task_ctx_stor, prev, 0, 0);
 		if (!tctx) {
 			scx_bpf_error("task_ctx lookup failed");
 			return;
 		}

 		tctx->core_sched_seq =
 			core_sched_tail_seqs[weight_to_idx(prev->scx.weight)]++;
 	}
 }

 void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p)
 {
 	struct cpu_ctx *cpuc;
 	u32 zero = 0;
 	int idx;

 	if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
 		scx_bpf_error("failed to look up cpu_ctx");
 		return;
 	}

 	/*
 	 * Use the running avg of weights to select the target cpuperf level.
 	 * This is a demonstration of the cpuperf feature rather than a
 	 * practical strategy to regulate CPU frequency.
 	 */
 	cpuc->avg_weight = cpuc->avg_weight * 3 / 4 + p->scx.weight / 4;
 	idx = weight_to_idx(cpuc->avg_weight);
 	cpuc->cpuperf_target = qidx_to_cpuperf_target[idx];

 	scx_bpf_cpuperf_set(scx_bpf_task_cpu(p), cpuc->cpuperf_target);
 }

 /*
  * The distance from the head of the queue scaled by the weight of the queue.
  * The lower the number, the older the task and the higher the priority.
  */
 static s64 task_qdist(struct task_struct *p)
 {
 	int idx = weight_to_idx(p->scx.weight);
 	struct task_ctx *tctx;
 	s64 qdist;

 	tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
 	if (!tctx) {
 		scx_bpf_error("task_ctx lookup failed");
 		return 0;
 	}

 	qdist = tctx->core_sched_seq - core_sched_head_seqs[idx];

 	/*
 	 * As queue index increments, the priority doubles. The queue w/ index 3
 	 * is dispatched twice more frequently than 2. Reflect the difference by
 	 * scaling qdists accordingly. Note that the shift amount needs to be
 	 * flipped depending on the sign to avoid flipping priority direction.
 	 */
 	if (qdist >= 0)
 		return qdist << (4 - idx);
 	else
 		return qdist << idx;
 }

 /*
  * This is called to determine the task ordering when core-sched is picking
  * tasks to execute on SMT siblings and should encode about the same ordering as
  * the regular scheduling path. Use the priority-scaled distances from the head
  * of the queues to compare the two tasks which should be consistent with the
  * dispatch path behavior.
  */
 bool BPF_STRUCT_OPS(qmap_core_sched_before,
 		    struct task_struct *a, struct task_struct *b)
 {
 	return task_qdist(a) > task_qdist(b);
 }

 void BPF_STRUCT_OPS(qmap_cpu_release, s32 cpu, struct scx_cpu_release_args *args)
 {
 	u32 cnt;

 	/*
 	 * Called when @cpu is taken by a higher priority scheduling class. This
 	 * makes @cpu no longer available for executing sched_ext tasks. As we
 	 * don't want the tasks in @cpu's local dsq to sit there until @cpu
 	 * becomes available again, re-enqueue them into the global dsq. See
 	 * %SCX_ENQ_REENQ handling in qmap_enqueue().
 	 */
 	cnt = scx_bpf_reenqueue_local();
 	if (cnt)
 		__sync_fetch_and_add(&nr_reenqueued, cnt);
 }

 s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p,
 		   struct scx_init_task_args *args)
 {
 	if (p->tgid == disallow_tgid)
 		p->scx.disallow = true;

 	/*
 	 * @p is new. Let's ensure that its task_ctx is available. We can sleep
 	 * in this function and the following will automatically use GFP_KERNEL.
 	 */
 	if (bpf_task_storage_get(&task_ctx_stor, p, 0,
 				 BPF_LOCAL_STORAGE_GET_F_CREATE))
 		return 0;
 	else
 		return -ENOMEM;
 }

 void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx)
 {
 	s32 i, pid;

 	if (suppress_dump)
 		return;

 	bpf_for(i, 0, 5) {
 		void *fifo;

 		if (!(fifo = bpf_map_lookup_elem(&queue_arr, &i)))
 			return;

 		scx_bpf_dump("QMAP FIFO[%d]:", i);
 		bpf_repeat(4096) {
 			if (bpf_map_pop_elem(fifo, &pid))
 				break;
 			scx_bpf_dump(" %d", pid);
 		}
 		scx_bpf_dump("\n");
 	}
 }

 void BPF_STRUCT_OPS(qmap_dump_cpu, struct scx_dump_ctx *dctx, s32 cpu, bool idle)
 {
 	u32 zero = 0;
 	struct cpu_ctx *cpuc;

 	if (suppress_dump || idle)
 		return;
 	if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, cpu)))
 		return;

 	scx_bpf_dump("QMAP: dsp_idx=%llu dsp_cnt=%llu avg_weight=%u cpuperf_target=%u",
 		     cpuc->dsp_idx, cpuc->dsp_cnt, cpuc->avg_weight,
 		     cpuc->cpuperf_target);
 }

 void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struct *p)
 {
 	struct task_ctx *taskc;

 	if (suppress_dump)
 		return;
 	if (!(taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0)))
 		return;

 	scx_bpf_dump("QMAP: force_local=%d core_sched_seq=%llu",
 		     taskc->force_local, taskc->core_sched_seq);
 }

 /*
  * Print out the online and possible CPU map using bpf_printk() as a
  * demonstration of using the cpumask kfuncs and ops.cpu_on/offline().
  */
 static void print_cpus(void)
 {
 	const struct cpumask *possible, *online;
 	s32 cpu;
 	char buf[128] = "", *p;
 	int idx;

 	possible = scx_bpf_get_possible_cpumask();
 	online = scx_bpf_get_online_cpumask();

 	idx = 0;
 	bpf_for(cpu, 0, scx_bpf_nr_cpu_ids()) {
 		if (!(p = MEMBER_VPTR(buf, [idx++])))
 			break;
 		if (bpf_cpumask_test_cpu(cpu, online))
 			*p++ = 'O';
 		else if (bpf_cpumask_test_cpu(cpu, possible))
 			*p++ = 'X';
 		else
 			*p++ = ' ';

 		if ((cpu & 7) == 7) {
 			if (!(p = MEMBER_VPTR(buf, [idx++])))
 				break;
 			*p++ = '|';
 		}
 	}
 	buf[sizeof(buf) - 1] = '\0';

 	scx_bpf_put_cpumask(online);
 	scx_bpf_put_cpumask(possible);

 	bpf_printk("CPUS: |%s", buf);
 }

 void BPF_STRUCT_OPS(qmap_cpu_online, s32 cpu)
 {
 	bpf_printk("CPU %d coming online", cpu);
 	/* @cpu is already online at this point */
 	print_cpus();
 }

 void BPF_STRUCT_OPS(qmap_cpu_offline, s32 cpu)
 {
 	bpf_printk("CPU %d going offline", cpu);
 	/* @cpu is still online at this point */
 	print_cpus();
 }

 struct monitor_timer {
 	struct bpf_timer timer;
 };

 struct {
 	__uint(type, BPF_MAP_TYPE_ARRAY);
 	__uint(max_entries, 1);
 	__type(key, u32);
 	__type(value, struct monitor_timer);
 } monitor_timer SEC(".maps");

 /*
  * Print out the min, avg and max performance levels of CPUs every second to
  * demonstrate the cpuperf interface.
  */
 static void monitor_cpuperf(void)
 {
 	u32 zero = 0, nr_cpu_ids;
 	u64 cap_sum = 0, cur_sum = 0, cur_min = SCX_CPUPERF_ONE, cur_max = 0;
 	u64 target_sum = 0, target_min = SCX_CPUPERF_ONE, target_max = 0;
 	const struct cpumask *online;
 	int i, nr_online_cpus = 0;

 	nr_cpu_ids = scx_bpf_nr_cpu_ids();
 	online = scx_bpf_get_online_cpumask();

 	bpf_for(i, 0, nr_cpu_ids) {
 		struct cpu_ctx *cpuc;
 		u32 cap, cur;

 		if (!bpf_cpumask_test_cpu(i, online))
 			continue;
 		nr_online_cpus++;

 		/* collect the capacity and current cpuperf */
 		cap = scx_bpf_cpuperf_cap(i);
 		cur = scx_bpf_cpuperf_cur(i);

 		cur_min = cur < cur_min ? cur : cur_min;
 		cur_max = cur > cur_max ? cur : cur_max;

 		/*
 		 * $cur is relative to $cap. Scale it down accordingly so that
 		 * it's in the same scale as other CPUs and $cur_sum/$cap_sum
 		 * makes sense.
 		 */
 		cur_sum += cur * cap / SCX_CPUPERF_ONE;
 		cap_sum += cap;

 		if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, i))) {
 			scx_bpf_error("failed to look up cpu_ctx");
 			goto out;
 		}

 		/* collect target */
 		cur = cpuc->cpuperf_target;
 		target_sum += cur;
 		target_min = cur < target_min ? cur : target_min;
 		target_max = cur > target_max ? cur : target_max;
 	}

 	cpuperf_min = cur_min;
 	cpuperf_avg = cur_sum * SCX_CPUPERF_ONE / cap_sum;
 	cpuperf_max = cur_max;

 	cpuperf_target_min = target_min;
 	cpuperf_target_avg = target_sum / nr_online_cpus;
 	cpuperf_target_max = target_max;
 out:
 	scx_bpf_put_cpumask(online);
 }

 /*
  * Dump the currently queued tasks in the shared DSQ to demonstrate the usage of
  * scx_bpf_dsq_nr_queued() and DSQ iterator. Raise the dispatch batch count to
  * see meaningful dumps in the trace pipe.
  */
 static void dump_shared_dsq(void)
 {
 	struct task_struct *p;
 	s32 nr;

 	if (!(nr = scx_bpf_dsq_nr_queued(SHARED_DSQ)))
 		return;

 	bpf_printk("Dumping %d tasks in SHARED_DSQ in reverse order", nr);

 	bpf_rcu_read_lock();
 	bpf_for_each(scx_dsq, p, SHARED_DSQ, SCX_DSQ_ITER_REV)
 		bpf_printk("%s[%d]", p->comm, p->pid);
 	bpf_rcu_read_unlock();
 }

 static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer)
 {
 	bpf_rcu_read_lock();
 	dispatch_highpri(true);
 	bpf_rcu_read_unlock();

 	monitor_cpuperf();

 	if (print_shared_dsq)
 		dump_shared_dsq();

 	bpf_timer_start(timer, ONE_SEC_IN_NS, 0);
 	return 0;
 }

 s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
 {
 	u32 key = 0;
 	struct bpf_timer *timer;
 	s32 ret;

 	print_cpus();

 	ret = scx_bpf_create_dsq(SHARED_DSQ, -1);
 	if (ret)
 		return ret;

 	ret = scx_bpf_create_dsq(HIGHPRI_DSQ, -1);
 	if (ret)
 		return ret;

 	timer = bpf_map_lookup_elem(&monitor_timer, &key);
 	if (!timer)
 		return -ESRCH;

 	bpf_timer_init(timer, &monitor_timer, CLOCK_MONOTONIC);
 	bpf_timer_set_callback(timer, monitor_timerfn);

 	return bpf_timer_start(timer, ONE_SEC_IN_NS, 0);
 }

 void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei)
 {
 	UEI_RECORD(uei, ei);
 }

 SCX_OPS_DEFINE(qmap_ops,
 	       .select_cpu		= (void *)qmap_select_cpu,
 	       .enqueue			= (void *)qmap_enqueue,
 	       .dequeue			= (void *)qmap_dequeue,
 	       .dispatch		= (void *)qmap_dispatch,
 	       .tick			= (void *)qmap_tick,
 	       .core_sched_before	= (void *)qmap_core_sched_before,
 	       .cpu_release		= (void *)qmap_cpu_release,
 	       .init_task		= (void *)qmap_init_task,
 	       .dump			= (void *)qmap_dump,
 	       .dump_cpu		= (void *)qmap_dump_cpu,
 	       .dump_task		= (void *)qmap_dump_task,
 	       .cpu_online		= (void *)qmap_cpu_online,
 	       .cpu_offline		= (void *)qmap_cpu_offline,
 	       .init			= (void *)qmap_init,
 	       .exit			= (void *)qmap_exit,
 	       .timeout_ms		= 5000U,
 	       .name			= "qmap");
	/* SPDX-License-Identifier: GPL-2.0 */
	/*
	* A simple five-level FIFO queue scheduler.
	*
	* There are five FIFOs implemented using BPF_MAP_TYPE_QUEUE. A task gets
	* assigned to one depending on its compound weight. Each CPU round robins
	* through the FIFOs and dispatches more from FIFOs with higher indices - 1 from
	* queue0, 2 from queue1, 4 from queue2 and so on.
	*
	* This scheduler demonstrates:
	*
	* - BPF-side queueing using PIDs.
	* - Sleepable per-task storage allocation using ops.prep_enable().
	* - Using ops.cpu_release() to handle a higher priority scheduling class taking
	* the CPU away.
	* - Core-sched support.
	*
	* This scheduler is primarily for demonstration and testing of sched_ext
	* features and unlikely to be useful for actual workloads.
	*
	* Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
	* Copyright (c) 2022 Tejun Heo <tj@kernel.org>
	* Copyright (c) 2022 David Vernet <dvernet@meta.com>
	*/
	#include <scx/common.bpf.h>

	enum consts {
	ONE_SEC_IN_NS = 1000000000,
	SHARED_DSQ = 0,
	HIGHPRI_DSQ = 1,
	HIGHPRI_WEIGHT = 8668, /* this is what -20 maps to */
	};

	char _license[] SEC("license") = "GPL";

	const volatile u64 slice_ns = SCX_SLICE_DFL;
	const volatile u32 stall_user_nth;
	const volatile u32 stall_kernel_nth;
	const volatile u32 dsp_inf_loop_after;
	const volatile u32 dsp_batch;
	const volatile bool highpri_boosting;
	const volatile bool print_shared_dsq;
	const volatile s32 disallow_tgid;
	const volatile bool suppress_dump;

	u64 nr_highpri_queued;
	u32 test_error_cnt;

	UEI_DEFINE(uei);

	struct qmap {
	__uint(type, BPF_MAP_TYPE_QUEUE);
	__uint(max_entries, 4096);
	__type(value, u32);
	} queue0 SEC(".maps"),
	queue1 SEC(".maps"),
	queue2 SEC(".maps"),
	queue3 SEC(".maps"),
	queue4 SEC(".maps");

	struct {
	__uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
	__uint(max_entries, 5);
	__type(key, int);
	__array(values, struct qmap);
	} queue_arr SEC(".maps") = {
	.values = {
	[0] = &queue0,
	[1] = &queue1,
	[2] = &queue2,
	[3] = &queue3,
	[4] = &queue4,
	},
	};

	/*
	* If enabled, CPU performance target is set according to the queue index
	* according to the following table.
	*/
	static const u32 qidx_to_cpuperf_target[] = {
	[0] = SCX_CPUPERF_ONE * 0 / 4,
	[1] = SCX_CPUPERF_ONE * 1 / 4,
	[2] = SCX_CPUPERF_ONE * 2 / 4,
	[3] = SCX_CPUPERF_ONE * 3 / 4,
	[4] = SCX_CPUPERF_ONE * 4 / 4,
	};

	/*
	* Per-queue sequence numbers to implement core-sched ordering.
	*
	* Tail seq is assigned to each queued task and incremented. Head seq tracks the
	* sequence number of the latest dispatched task. The distance between the a
	* task's seq and the associated queue's head seq is called the queue distance
	* and used when comparing two tasks for ordering. See qmap_core_sched_before().
	*/
	static u64 core_sched_head_seqs[5];
	static u64 core_sched_tail_seqs[5];

	/* Per-task scheduling context */
	struct task_ctx {
	bool force_local; /* Dispatch directly to local_dsq */
	bool highpri;
	u64 core_sched_seq;
	};

	struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct task_ctx);
	} task_ctx_stor SEC(".maps");

	struct cpu_ctx {
	u64 dsp_idx; /* dispatch index */
	u64 dsp_cnt; /* remaining count */
	u32 avg_weight;
	u32 cpuperf_target;
	};

	struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(max_entries, 1);
	__type(key, u32);
	__type(value, struct cpu_ctx);
	} cpu_ctx_stor SEC(".maps");

	/* Statistics */
	u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued, nr_ddsp_from_enq;
	u64 nr_core_sched_execed;
	u64 nr_expedited_local, nr_expedited_remote, nr_expedited_lost, nr_expedited_from_timer;
	u32 cpuperf_min, cpuperf_avg, cpuperf_max;
	u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max;

	static s32 pick_direct_dispatch_cpu(struct task_struct *p, s32 prev_cpu)
	{
	s32 cpu;

	if (p->nr_cpus_allowed == 1 \|\|
	scx_bpf_test_and_clear_cpu_idle(prev_cpu))
	return prev_cpu;

	cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
	if (cpu >= 0)
	return cpu;

	return -1;
	}

	static struct task_ctx lookup_task_ctx(struct task_struct p)
	{
	struct task_ctx *tctx;

	if (!(tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0))) {
	scx_bpf_error("task_ctx lookup failed");
	return NULL;
	}
	return tctx;
	}

	s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
	s32 prev_cpu, u64 wake_flags)
	{
	struct task_ctx *tctx;
	s32 cpu;

	if (!(tctx = lookup_task_ctx(p)))
	return -ESRCH;

	cpu = pick_direct_dispatch_cpu(p, prev_cpu);

	if (cpu >= 0) {
	tctx->force_local = true;
	return cpu;
	} else {
	return prev_cpu;
	}
	}

	static int weight_to_idx(u32 weight)
	{
	/* Coarsely map the compound weight to a FIFO. */
	if (weight <= 25)
	return 0;
	else if (weight <= 50)
	return 1;
	else if (weight < 200)
	return 2;
	else if (weight < 400)
	return 3;
	else
	return 4;
	}

	void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
	{
	static u32 user_cnt, kernel_cnt;
	struct task_ctx *tctx;
	u32 pid = p->pid;
	int idx = weight_to_idx(p->scx.weight);
	void *ring;
	s32 cpu;

	if (p->flags & PF_KTHREAD) {
	if (stall_kernel_nth && !(++kernel_cnt % stall_kernel_nth))
	return;
	} else {
	if (stall_user_nth && !(++user_cnt % stall_user_nth))
	return;
	}

	if (test_error_cnt && !--test_error_cnt)
	scx_bpf_error("test triggering error");

	if (!(tctx = lookup_task_ctx(p)))
	return;

	/*
	* All enqueued tasks must have their core_sched_seq updated for correct
	* core-sched ordering. Also, take a look at the end of qmap_dispatch().
	*/
	tctx->core_sched_seq = core_sched_tail_seqs[idx]++;

	/*
	* If qmap_select_cpu() is telling us to or this is the last runnable
	* task on the CPU, enqueue locally.
	*/
	if (tctx->force_local) {
	tctx->force_local = false;
	scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, enq_flags);
	return;
	}

	/* if select_cpu() wasn't called, try direct dispatch */
	if (!(enq_flags & SCX_ENQ_CPU_SELECTED) &&
	(cpu = pick_direct_dispatch_cpu(p, scx_bpf_task_cpu(p))) >= 0) {
	__sync_fetch_and_add(&nr_ddsp_from_enq, 1);
	scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON \| cpu, slice_ns, enq_flags);
	return;
	}

	/*
	* If the task was re-enqueued due to the CPU being preempted by a
	* higher priority scheduling class, just re-enqueue the task directly
	* on the global DSQ. As we want another CPU to pick it up, find and
	* kick an idle CPU.
	*/
	if (enq_flags & SCX_ENQ_REENQ) {
	s32 cpu;

	scx_bpf_dispatch(p, SHARED_DSQ, 0, enq_flags);
	cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
	if (cpu >= 0)
	scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
	return;
	}

	ring = bpf_map_lookup_elem(&queue_arr, &idx);
	if (!ring) {
	scx_bpf_error("failed to find ring %d", idx);
	return;
	}

	/* Queue on the selected FIFO. If the FIFO overflows, punt to global. */
	if (bpf_map_push_elem(ring, &pid, 0)) {
	scx_bpf_dispatch(p, SHARED_DSQ, slice_ns, enq_flags);
	return;
	}

	if (highpri_boosting && p->scx.weight >= HIGHPRI_WEIGHT) {
	tctx->highpri = true;
	__sync_fetch_and_add(&nr_highpri_queued, 1);
	}
	__sync_fetch_and_add(&nr_enqueued, 1);
	}

	/*
	* The BPF queue map doesn't support removal and sched_ext can handle spurious
	* dispatches. qmap_dequeue() is only used to collect statistics.
	*/
	void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
	{
	__sync_fetch_and_add(&nr_dequeued, 1);
	if (deq_flags & SCX_DEQ_CORE_SCHED_EXEC)
	__sync_fetch_and_add(&nr_core_sched_execed, 1);
	}

	static void update_core_sched_head_seq(struct task_struct *p)
	{
	int idx = weight_to_idx(p->scx.weight);
	struct task_ctx *tctx;

	if ((tctx = lookup_task_ctx(p)))
	core_sched_head_seqs[idx] = tctx->core_sched_seq;
	}

	/*
	* To demonstrate the use of scx_bpf_dispatch_from_dsq(), implement silly
	* selective priority boosting mechanism by scanning SHARED_DSQ looking for
	* highpri tasks, moving them to HIGHPRI_DSQ and then consuming them first. This
	* makes minor difference only when dsp_batch is larger than 1.
	*
	* scx_bpf_dispatch[_vtime]_from_dsq() are allowed both from ops.dispatch() and
	* non-rq-lock holding BPF programs. As demonstration, this function is called
	* from qmap_dispatch() and monitor_timerfn().
	*/
	static bool dispatch_highpri(bool from_timer)
	{
	struct task_struct *p;
	s32 this_cpu = bpf_get_smp_processor_id();

	/* scan SHARED_DSQ and move highpri tasks to HIGHPRI_DSQ */
	bpf_for_each(scx_dsq, p, SHARED_DSQ, 0) {
	static u64 highpri_seq;
	struct task_ctx *tctx;

	if (!(tctx = lookup_task_ctx(p)))
	return false;

	if (tctx->highpri) {
	/* exercise the set_() and vtime interface too /
	__COMPAT_scx_bpf_dispatch_from_dsq_set_slice(
	BPF_FOR_EACH_ITER, slice_ns * 2);
	__COMPAT_scx_bpf_dispatch_from_dsq_set_vtime(
	BPF_FOR_EACH_ITER, highpri_seq++);
	__COMPAT_scx_bpf_dispatch_vtime_from_dsq(
	BPF_FOR_EACH_ITER, p, HIGHPRI_DSQ, 0);
	}
	}

	/*
	* Scan HIGHPRI_DSQ and dispatch until a task that can run on this CPU
	* is found.
	*/
	bpf_for_each(scx_dsq, p, HIGHPRI_DSQ, 0) {
	bool dispatched = false;
	s32 cpu;

	if (bpf_cpumask_test_cpu(this_cpu, p->cpus_ptr))
	cpu = this_cpu;
	else
	cpu = scx_bpf_pick_any_cpu(p->cpus_ptr, 0);

	if (__COMPAT_scx_bpf_dispatch_from_dsq(BPF_FOR_EACH_ITER, p,
	SCX_DSQ_LOCAL_ON \| cpu,
	SCX_ENQ_PREEMPT)) {
	if (cpu == this_cpu) {
	dispatched = true;
	__sync_fetch_and_add(&nr_expedited_local, 1);
	} else {
	__sync_fetch_and_add(&nr_expedited_remote, 1);
	}
	if (from_timer)
	__sync_fetch_and_add(&nr_expedited_from_timer, 1);
	} else {
	__sync_fetch_and_add(&nr_expedited_lost, 1);
	}

	if (dispatched)
	return true;
	}

	return false;
	}

	void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
	{
	struct task_struct *p;
	struct cpu_ctx *cpuc;
	struct task_ctx *tctx;
	u32 zero = 0, batch = dsp_batch ?: 1;
	void *fifo;
	s32 i, pid;

	if (dispatch_highpri(false))
	return;

	if (!nr_highpri_queued && scx_bpf_consume(SHARED_DSQ))
	return;

	if (dsp_inf_loop_after && nr_dispatched > dsp_inf_loop_after) {
	/*
	* PID 2 should be kthreadd which should mostly be idle and off
	* the scheduler. Let's keep dispatching it to force the kernel
	* to call this function over and over again.
	*/
	p = bpf_task_from_pid(2);
	if (p) {
	scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, 0);
	bpf_task_release(p);
	return;
	}
	}

	if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
	scx_bpf_error("failed to look up cpu_ctx");
	return;
	}

	for (i = 0; i < 5; i++) {
	/* Advance the dispatch cursor and pick the fifo. */
	if (!cpuc->dsp_cnt) {
	cpuc->dsp_idx = (cpuc->dsp_idx + 1) % 5;
	cpuc->dsp_cnt = 1 << cpuc->dsp_idx;
	}

	fifo = bpf_map_lookup_elem(&queue_arr, &cpuc->dsp_idx);
	if (!fifo) {
	scx_bpf_error("failed to find ring %llu", cpuc->dsp_idx);
	return;
	}

	/* Dispatch or advance. */
	bpf_repeat(BPF_MAX_LOOPS) {
	struct task_ctx *tctx;

	if (bpf_map_pop_elem(fifo, &pid))
	break;

	p = bpf_task_from_pid(pid);
	if (!p)
	continue;

	if (!(tctx = lookup_task_ctx(p))) {
	bpf_task_release(p);
	return;
	}

	if (tctx->highpri)
	__sync_fetch_and_sub(&nr_highpri_queued, 1);

	update_core_sched_head_seq(p);
	__sync_fetch_and_add(&nr_dispatched, 1);

	scx_bpf_dispatch(p, SHARED_DSQ, slice_ns, 0);
	bpf_task_release(p);

	batch--;
	cpuc->dsp_cnt--;
	if (!batch \|\| !scx_bpf_dispatch_nr_slots()) {
	if (dispatch_highpri(false))
	return;
	scx_bpf_consume(SHARED_DSQ);
	return;
	}
	if (!cpuc->dsp_cnt)
	break;
	}

	cpuc->dsp_cnt = 0;
	}

	/*
	* No other tasks. @prev will keep running. Update its core_sched_seq as
	* if the task were enqueued and dispatched immediately.
	*/
	if (prev) {
	tctx = bpf_task_storage_get(&task_ctx_stor, prev, 0, 0);
	if (!tctx) {
	scx_bpf_error("task_ctx lookup failed");
	return;
	}

	tctx->core_sched_seq =
	core_sched_tail_seqs[weight_to_idx(prev->scx.weight)]++;
	}
	}

	void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p)
	{
	struct cpu_ctx *cpuc;
	u32 zero = 0;
	int idx;

	if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
	scx_bpf_error("failed to look up cpu_ctx");
	return;
	}

	/*
	* Use the running avg of weights to select the target cpuperf level.
	* This is a demonstration of the cpuperf feature rather than a
	* practical strategy to regulate CPU frequency.
	*/
	cpuc->avg_weight = cpuc->avg_weight * 3 / 4 + p->scx.weight / 4;
	idx = weight_to_idx(cpuc->avg_weight);
	cpuc->cpuperf_target = qidx_to_cpuperf_target[idx];

	scx_bpf_cpuperf_set(scx_bpf_task_cpu(p), cpuc->cpuperf_target);
	}

	/*
	* The distance from the head of the queue scaled by the weight of the queue.
	* The lower the number, the older the task and the higher the priority.
	*/
	static s64 task_qdist(struct task_struct *p)
	{
	int idx = weight_to_idx(p->scx.weight);
	struct task_ctx *tctx;
	s64 qdist;

	tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
	if (!tctx) {
	scx_bpf_error("task_ctx lookup failed");
	return 0;
	}

	qdist = tctx->core_sched_seq - core_sched_head_seqs[idx];

	/*
	* As queue index increments, the priority doubles. The queue w/ index 3
	* is dispatched twice more frequently than 2. Reflect the difference by
	* scaling qdists accordingly. Note that the shift amount needs to be
	* flipped depending on the sign to avoid flipping priority direction.
	*/
	if (qdist >= 0)
	return qdist << (4 - idx);
	else
	return qdist << idx;
	}

	/*
	* This is called to determine the task ordering when core-sched is picking
	* tasks to execute on SMT siblings and should encode about the same ordering as
	* the regular scheduling path. Use the priority-scaled distances from the head
	* of the queues to compare the two tasks which should be consistent with the
	* dispatch path behavior.
	*/
	bool BPF_STRUCT_OPS(qmap_core_sched_before,
	struct task_struct a, struct task_struct b)
	{
	return task_qdist(a) > task_qdist(b);
	}

	void BPF_STRUCT_OPS(qmap_cpu_release, s32 cpu, struct scx_cpu_release_args *args)
	{
	u32 cnt;

	/*
	* Called when @cpu is taken by a higher priority scheduling class. This
	* makes @cpu no longer available for executing sched_ext tasks. As we
	* don't want the tasks in @cpu's local dsq to sit there until @cpu
	* becomes available again, re-enqueue them into the global dsq. See
	* %SCX_ENQ_REENQ handling in qmap_enqueue().
	*/
	cnt = scx_bpf_reenqueue_local();
	if (cnt)
	__sync_fetch_and_add(&nr_reenqueued, cnt);
	}

	s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p,
	struct scx_init_task_args *args)
	{
	if (p->tgid == disallow_tgid)
	p->scx.disallow = true;

	/*
	* @p is new. Let's ensure that its task_ctx is available. We can sleep
	* in this function and the following will automatically use GFP_KERNEL.
	*/
	if (bpf_task_storage_get(&task_ctx_stor, p, 0,
	BPF_LOCAL_STORAGE_GET_F_CREATE))
	return 0;
	else
	return -ENOMEM;
	}

	void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx)
	{
	s32 i, pid;

	if (suppress_dump)
	return;

	bpf_for(i, 0, 5) {
	void *fifo;

	if (!(fifo = bpf_map_lookup_elem(&queue_arr, &i)))
	return;

	scx_bpf_dump("QMAP FIFO[%d]:", i);
	bpf_repeat(4096) {
	if (bpf_map_pop_elem(fifo, &pid))
	break;
	scx_bpf_dump(" %d", pid);
	}
	scx_bpf_dump("\n");
	}
	}

	void BPF_STRUCT_OPS(qmap_dump_cpu, struct scx_dump_ctx *dctx, s32 cpu, bool idle)
	{
	u32 zero = 0;
	struct cpu_ctx *cpuc;

	if (suppress_dump \|\| idle)
	return;
	if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, cpu)))
	return;

	scx_bpf_dump("QMAP: dsp_idx=%llu dsp_cnt=%llu avg_weight=%u cpuperf_target=%u",
	cpuc->dsp_idx, cpuc->dsp_cnt, cpuc->avg_weight,
	cpuc->cpuperf_target);
	}

	void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx dctx, struct task_struct p)
	{
	struct task_ctx *taskc;

	if (suppress_dump)
	return;
	if (!(taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0)))
	return;

	scx_bpf_dump("QMAP: force_local=%d core_sched_seq=%llu",
	taskc->force_local, taskc->core_sched_seq);
	}

	/*
	* Print out the online and possible CPU map using bpf_printk() as a
	* demonstration of using the cpumask kfuncs and ops.cpu_on/offline().
	*/
	static void print_cpus(void)
	{
	const struct cpumask possible, online;
	s32 cpu;
	char buf[128] = "", *p;
	int idx;

	possible = scx_bpf_get_possible_cpumask();
	online = scx_bpf_get_online_cpumask();

	idx = 0;
	bpf_for(cpu, 0, scx_bpf_nr_cpu_ids()) {
	if (!(p = MEMBER_VPTR(buf, [idx++])))
	break;
	if (bpf_cpumask_test_cpu(cpu, online))
	*p++ = 'O';
	else if (bpf_cpumask_test_cpu(cpu, possible))
	*p++ = 'X';
	else
	*p++ = ' ';

	if ((cpu & 7) == 7) {
	if (!(p = MEMBER_VPTR(buf, [idx++])))
	break;
	*p++ = '\|';
	}
	}
	buf[sizeof(buf) - 1] = '\0';

	scx_bpf_put_cpumask(online);
	scx_bpf_put_cpumask(possible);

	bpf_printk("CPUS: \|%s", buf);
	}

	void BPF_STRUCT_OPS(qmap_cpu_online, s32 cpu)
	{
	bpf_printk("CPU %d coming online", cpu);
	/* @cpu is already online at this point */
	print_cpus();
	}

	void BPF_STRUCT_OPS(qmap_cpu_offline, s32 cpu)
	{
	bpf_printk("CPU %d going offline", cpu);
	/* @cpu is still online at this point */
	print_cpus();
	}

	struct monitor_timer {
	struct bpf_timer timer;
	};

	struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 1);
	__type(key, u32);
	__type(value, struct monitor_timer);
	} monitor_timer SEC(".maps");

	/*
	* Print out the min, avg and max performance levels of CPUs every second to
	* demonstrate the cpuperf interface.
	*/
	static void monitor_cpuperf(void)
	{
	u32 zero = 0, nr_cpu_ids;
	u64 cap_sum = 0, cur_sum = 0, cur_min = SCX_CPUPERF_ONE, cur_max = 0;
	u64 target_sum = 0, target_min = SCX_CPUPERF_ONE, target_max = 0;
	const struct cpumask *online;
	int i, nr_online_cpus = 0;

	nr_cpu_ids = scx_bpf_nr_cpu_ids();
	online = scx_bpf_get_online_cpumask();

	bpf_for(i, 0, nr_cpu_ids) {
	struct cpu_ctx *cpuc;
	u32 cap, cur;

	if (!bpf_cpumask_test_cpu(i, online))
	continue;
	nr_online_cpus++;

	/* collect the capacity and current cpuperf */
	cap = scx_bpf_cpuperf_cap(i);
	cur = scx_bpf_cpuperf_cur(i);

	cur_min = cur < cur_min ? cur : cur_min;
	cur_max = cur > cur_max ? cur : cur_max;

	/*
	* $cur is relative to $cap. Scale it down accordingly so that
	* it's in the same scale as other CPUs and $cur_sum/$cap_sum
	* makes sense.
	*/
	cur_sum += cur * cap / SCX_CPUPERF_ONE;
	cap_sum += cap;

	if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, i))) {
	scx_bpf_error("failed to look up cpu_ctx");
	goto out;
	}

	/* collect target */
	cur = cpuc->cpuperf_target;
	target_sum += cur;
	target_min = cur < target_min ? cur : target_min;
	target_max = cur > target_max ? cur : target_max;
	}

	cpuperf_min = cur_min;
	cpuperf_avg = cur_sum * SCX_CPUPERF_ONE / cap_sum;
	cpuperf_max = cur_max;

	cpuperf_target_min = target_min;
	cpuperf_target_avg = target_sum / nr_online_cpus;
	cpuperf_target_max = target_max;
	out:
	scx_bpf_put_cpumask(online);
	}

	/*
	* Dump the currently queued tasks in the shared DSQ to demonstrate the usage of
	* scx_bpf_dsq_nr_queued() and DSQ iterator. Raise the dispatch batch count to
	* see meaningful dumps in the trace pipe.
	*/
	static void dump_shared_dsq(void)
	{
	struct task_struct *p;
	s32 nr;

	if (!(nr = scx_bpf_dsq_nr_queued(SHARED_DSQ)))
	return;

	bpf_printk("Dumping %d tasks in SHARED_DSQ in reverse order", nr);

	bpf_rcu_read_lock();
	bpf_for_each(scx_dsq, p, SHARED_DSQ, SCX_DSQ_ITER_REV)
	bpf_printk("%s[%d]", p->comm, p->pid);
	bpf_rcu_read_unlock();
	}

	static int monitor_timerfn(void map, int key, struct bpf_timer *timer)
	{
	bpf_rcu_read_lock();
	dispatch_highpri(true);
	bpf_rcu_read_unlock();

	monitor_cpuperf();

	if (print_shared_dsq)
	dump_shared_dsq();

	bpf_timer_start(timer, ONE_SEC_IN_NS, 0);
	return 0;
	}

	s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
	{
	u32 key = 0;
	struct bpf_timer *timer;
	s32 ret;

	print_cpus();

	ret = scx_bpf_create_dsq(SHARED_DSQ, -1);
	if (ret)
	return ret;

	ret = scx_bpf_create_dsq(HIGHPRI_DSQ, -1);
	if (ret)
	return ret;

	timer = bpf_map_lookup_elem(&monitor_timer, &key);
	if (!timer)
	return -ESRCH;

	bpf_timer_init(timer, &monitor_timer, CLOCK_MONOTONIC);
	bpf_timer_set_callback(timer, monitor_timerfn);

	return bpf_timer_start(timer, ONE_SEC_IN_NS, 0);
	}

	void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei)
	{
	UEI_RECORD(uei, ei);
	}

	SCX_OPS_DEFINE(qmap_ops,
	.select_cpu = (void *)qmap_select_cpu,
	.enqueue = (void *)qmap_enqueue,
	.dequeue = (void *)qmap_dequeue,
	.dispatch = (void *)qmap_dispatch,
	.tick = (void *)qmap_tick,
	.core_sched_before = (void *)qmap_core_sched_before,
	.cpu_release = (void *)qmap_cpu_release,
	.init_task = (void *)qmap_init_task,
	.dump = (void *)qmap_dump,
	.dump_cpu = (void *)qmap_dump_cpu,
	.dump_task = (void *)qmap_dump_task,
	.cpu_online = (void *)qmap_cpu_online,
	.cpu_offline = (void *)qmap_cpu_offline,
	.init = (void *)qmap_init,
	.exit = (void *)qmap_exit,
	.timeout_ms = 5000U,
	.name = "qmap");