cfq-iosched: rework the whole round-robin list concept

Drawing on some inspiration from the CFS CPU scheduler design, overhaul
the pending cfq_queue concept list management. Currently CFQ uses a
doubly linked list per priority level for sorting and service uses.
Kill those lists and maintain an rbtree of cfq_queue's, sorted by when
to service them.

This unfortunately means that the ionice levels aren't as strong
anymore, will work on improving those later. We only scale the slice
time now, not the number of times we service. This means that latency
is better (for all priority levels), but that the distinction between
the highest and lower levels aren't as big.

The diffstat speaks for itself.

 cfq-iosched.c |  363 +++++++++++++++++---------------------------------
 1 file changed, 125 insertions(+), 238 deletions(-)

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 9d6f0410..4838c2b 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -26,7 +26,16 @@
 static const int cfq_slice_async_rq = 2;
 static int cfq_slice_idle = HZ / 125;
 
+/*
+ * grace period before allowing idle class to get disk access
+ */
 #define CFQ_IDLE_GRACE		(HZ / 10)
+
+/*
+ * below this threshold, we consider thinktime immediate
+ */
+#define CFQ_MIN_TT		(2)
+
 #define CFQ_SLICE_SCALE		(5)
 
 #define CFQ_KEY_ASYNC		(0)
@@ -69,10 +78,9 @@
 	/*
 	 * rr list of queues with requests and the count of them
 	 */
-	struct list_head rr_list[CFQ_PRIO_LISTS];
+	struct rb_root service_tree;
 	struct list_head cur_rr;
 	struct list_head idle_rr;
-	unsigned long cur_rr_tick;
 	unsigned int busy_queues;
 
 	/*
@@ -91,8 +99,6 @@
 
 	struct cfq_queue *active_queue;
 	struct cfq_io_context *active_cic;
-	int cur_prio, cur_end_prio;
-	unsigned long prio_time;
 	unsigned int dispatch_slice;
 
 	struct timer_list idle_class_timer;
@@ -131,8 +137,10 @@
 	unsigned int key;
 	/* member of the rr/busy/cur/idle cfqd list */
 	struct list_head cfq_list;
-	/* in what tick we were last serviced */
-	unsigned long rr_tick;
+	/* service_tree member */
+	struct rb_node rb_node;
+	/* service_tree key */
+	unsigned long rb_key;
 	/* sorted list of pending requests */
 	struct rb_root sort_list;
 	/* if fifo isn't expired, next request to serve */
@@ -147,8 +155,6 @@
 	struct list_head fifo;
 
 	unsigned long slice_end;
-	unsigned long service_last;
-	unsigned long slice_start;
 	long slice_resid;
 
 	/* number of requests that are on the dispatch list or inside driver */
@@ -240,30 +246,26 @@
  * if a queue is marked sync and has sync io queued. A sync queue with async
  * io only, should not get full sync slice length.
  */
+static inline int cfq_prio_slice(struct cfq_data *cfqd, int sync,
+				 unsigned short prio)
+{
+	const int base_slice = cfqd->cfq_slice[sync];
+
+	WARN_ON(prio >= IOPRIO_BE_NR);
+
+	return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - prio));
+}
+
 static inline int
 cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
-	const int base_slice = cfqd->cfq_slice[cfq_cfqq_sync(cfqq)];
-
-	WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
-
-	return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - cfqq->ioprio));
+	return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
 }
 
 static inline void
 cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
 	cfqq->slice_end = cfq_prio_to_slice(cfqd, cfqq) + jiffies;
-	cfqq->slice_end += cfqq->slice_resid;
-
-	/*
-	 * Don't carry over residual for more than one slice, we only want
-	 * to slightly correct the fairness. Carrying over forever would
-	 * easily introduce oscillations.
-	 */
-	cfqq->slice_resid = 0;
-
-	cfqq->slice_start = jiffies;
 }
 
 /*
@@ -403,33 +405,50 @@
 	return cfq_choose_req(cfqd, next, prev);
 }
 
-/*
- * This function finds out where to insert a BE queue in the service hierarchy
- */
-static void cfq_resort_be_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq,
-				int preempted)
+static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
+				      struct cfq_queue *cfqq)
 {
-	if (!cfq_cfqq_sync(cfqq))
-		list_add_tail(&cfqq->cfq_list, &cfqd->rr_list[cfqq->ioprio]);
-	else {
-		struct list_head *n = &cfqd->rr_list[cfqq->ioprio];
+	/*
+	 * just an approximation, should be ok.
+	 */
+	return ((cfqd->busy_queues - 1) * cfq_prio_slice(cfqd, 1, 0));
+}
 
+static void cfq_service_tree_add(struct cfq_data *cfqd,
+				    struct cfq_queue *cfqq)
+{
+	struct rb_node **p = &cfqd->service_tree.rb_node;
+	struct rb_node *parent = NULL;
+	struct cfq_queue *__cfqq;
+	unsigned long rb_key;
+
+	rb_key = cfq_slice_offset(cfqd, cfqq) + jiffies;
+	rb_key += cfqq->slice_resid;
+	cfqq->slice_resid = 0;
+
+	if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
 		/*
-		 * sort by last service, but don't cross a new or async
-		 * queue. we don't cross a new queue because it hasn't
-		 * been service before, and we don't cross an async
-		 * queue because it gets added to the end on expire.
+		 * same position, nothing more to do
 		 */
-		while ((n = n->prev) != &cfqd->rr_list[cfqq->ioprio]) {
-			struct cfq_queue *__c = list_entry_cfqq(n);
+		if (rb_key == cfqq->rb_key)
+			return;
 
-			if (!cfq_cfqq_sync(__c) || !__c->service_last)
-				break;
-			if (time_before(__c->service_last, cfqq->service_last))
-				break;
-		}
-		list_add(&cfqq->cfq_list, n);
+		rb_erase(&cfqq->rb_node, &cfqd->service_tree);
 	}
+
+	while (*p) {
+		parent = *p;
+		__cfqq = rb_entry(parent, struct cfq_queue, rb_node);
+
+		if (rb_key < __cfqq->rb_key)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+
+	cfqq->rb_key = rb_key;
+	rb_link_node(&cfqq->rb_node, parent, p);
+	rb_insert_color(&cfqq->rb_node, &cfqd->service_tree);
 }
 
 static void cfq_resort_rr_list(struct cfq_queue *cfqq, int preempted)
@@ -443,7 +462,7 @@
 	if (!cfq_cfqq_on_rr(cfqq))
 		return;
 
-	list_del(&cfqq->cfq_list);
+	list_del_init(&cfqq->cfq_list);
 
 	if (cfq_class_rt(cfqq)) {
 		/*
@@ -465,7 +484,7 @@
 		/*
 		 * So we get here, ergo the queue is a regular best-effort queue
 		 */
-		cfq_resort_be_queue(cfqd, cfqq, preempted);
+		cfq_service_tree_add(cfqd, cfqq);
 	}
 }
 
@@ -490,6 +509,11 @@
 	cfq_clear_cfqq_on_rr(cfqq);
 	list_del_init(&cfqq->cfq_list);
 
+	if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
+		rb_erase(&cfqq->rb_node, &cfqd->service_tree);
+		RB_CLEAR_NODE(&cfqq->rb_node);
+	}
+
 	BUG_ON(!cfqd->busy_queues);
 	cfqd->busy_queues--;
 }
@@ -684,7 +708,6 @@
 		cfq_clear_cfqq_fifo_expire(cfqq);
 		cfq_mark_cfqq_slice_new(cfqq);
 		cfq_clear_cfqq_queue_new(cfqq);
-		cfqq->rr_tick = cfqd->cur_rr_tick;
 	}
 
 	cfqd->active_queue = cfqq;
@@ -732,114 +755,19 @@
 		__cfq_slice_expired(cfqd, cfqq, preempted, timed_out);
 }
 
-/*
- * 0
- * 0,1
- * 0,1,2
- * 0,1,2,3
- * 0,1,2,3,4
- * 0,1,2,3,4,5
- * 0,1,2,3,4,5,6
- * 0,1,2,3,4,5,6,7
- */
-static int cfq_get_next_prio_level(struct cfq_data *cfqd)
-{
-	int prio, wrap;
-
-	prio = -1;
-	wrap = 0;
-	do {
-		int p;
-
-		for (p = cfqd->cur_prio; p <= cfqd->cur_end_prio; p++) {
-			if (!list_empty(&cfqd->rr_list[p])) {
-				prio = p;
-				break;
-			}
-		}
-
-		if (prio != -1)
-			break;
-		cfqd->cur_prio = 0;
-		if (++cfqd->cur_end_prio == CFQ_PRIO_LISTS) {
-			cfqd->cur_end_prio = 0;
-			if (wrap)
-				break;
-			wrap = 1;
-		}
-	} while (1);
-
-	if (unlikely(prio == -1))
-		return -1;
-
-	BUG_ON(prio >= CFQ_PRIO_LISTS);
-
-	list_splice_init(&cfqd->rr_list[prio], &cfqd->cur_rr);
-
-	cfqd->cur_prio = prio + 1;
-	if (cfqd->cur_prio > cfqd->cur_end_prio) {
-		cfqd->cur_end_prio = cfqd->cur_prio;
-		cfqd->cur_prio = 0;
-	}
-	if (cfqd->cur_end_prio == CFQ_PRIO_LISTS) {
-		cfqd->cur_prio = 0;
-		cfqd->cur_end_prio = 0;
-	}
-
-	cfqd->cur_rr_tick++;
-	cfqd->prio_time = jiffies;
-	return prio;
-}
-
-static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd,
-					  struct request *rq)
-{
-	if (rq->sector >= cfqd->last_position)
-		return rq->sector - cfqd->last_position;
-	else
-		return cfqd->last_position - rq->sector;
-}
-
-static struct cfq_queue *cfq_get_best_queue(struct cfq_data *cfqd)
-{
-	struct cfq_queue *cfqq = NULL, *__cfqq;
-	sector_t best = -1, first = -1, dist;
-
-	list_for_each_entry(__cfqq, &cfqd->cur_rr, cfq_list) {
-		if (!__cfqq->next_rq || !cfq_cfqq_sync(__cfqq))
-			continue;
-
-		dist = cfq_dist_from_last(cfqd, __cfqq->next_rq);
-		if (first == -1)
-			first = dist;
-		if (dist < best) {
-			best = dist;
-			cfqq = __cfqq;
-		}
-	}
-
-	/*
-	 * Only async queue(s) available, grab first entry. Do the same
-	 * if the difference between the first and best isn't more than
-	 * twice, to obey fairness.
-	 */
-	if (!cfqq || (best && first != best && ((first / best) < 4)))
-		cfqq = list_entry_cfqq(cfqd->cur_rr.next);
-
-	return cfqq;
-}
-
 static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
 {
 	struct cfq_queue *cfqq = NULL;
 
-	if (!list_empty(&cfqd->cur_rr) || cfq_get_next_prio_level(cfqd) != -1) {
+	if (!list_empty(&cfqd->cur_rr)) {
 		/*
-		 * if current list is non-empty, grab first entry. if it is
-		 * empty, get next prio level and grab first entry then if any
-		 * are spliced
+		 * if current list is non-empty, grab first entry.
 		 */
-		cfqq = cfq_get_best_queue(cfqd);
+		cfqq = list_entry_cfqq(cfqd->cur_rr.next);
+	} else if (!RB_EMPTY_ROOT(&cfqd->service_tree)) {
+		struct rb_node *n = rb_first(&cfqd->service_tree);
+
+		cfqq = rb_entry(n, struct cfq_queue, rb_node);
 	} else if (!list_empty(&cfqd->idle_rr)) {
 		/*
 		 * if we have idle queues and no rt or be queues had pending
@@ -861,27 +789,20 @@
 {
 	struct cfq_queue *cfqq;
 
-	do {
-		long prio;
-
-		cfqq = cfq_get_next_queue(cfqd);
-		if (!cfqq)
-			break;
-
-		prio = cfq_prio_to_slice(cfqd, cfqq);
-		if (cfqq->slice_resid > -prio)
-			break;
-
-		cfqq->slice_resid += prio;
-		list_del_init(&cfqq->cfq_list);
-		list_add_tail(&cfqq->cfq_list, &cfqd->rr_list[cfqq->ioprio]);
-		cfqq = NULL;
-	} while (1);
-
+	cfqq = cfq_get_next_queue(cfqd);
 	__cfq_set_active_queue(cfqd, cfqq);
 	return cfqq;
 }
 
+static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd,
+					  struct request *rq)
+{
+	if (rq->sector >= cfqd->last_position)
+		return rq->sector - cfqd->last_position;
+	else
+		return cfqd->last_position - rq->sector;
+}
+
 static inline int cfq_rq_close(struct cfq_data *cfqd, struct request *rq)
 {
 	struct cfq_io_context *cic = cfqd->active_cic;
@@ -892,45 +813,15 @@
 	return cfq_dist_from_last(cfqd, rq) <= cic->seek_mean;
 }
 
-static struct cfq_queue *__cfq_close_cooperator(struct cfq_data *cfqd,
-						struct cfq_queue *cur_cfqq,
-						struct list_head *list)
+static int cfq_close_cooperator(struct cfq_data *cfq_data,
+				struct cfq_queue *cfqq)
 {
-	struct cfq_queue *cfqq;
-
-	list_for_each_entry(cfqq, list, cfq_list) {
-		if (cfqq == cur_cfqq || !cfq_cfqq_sync(cfqq))
-			continue;
-
-		BUG_ON(!cfqq->next_rq);
-
-		if (cfq_rq_close(cfqd, cfqq->next_rq))
-			return cfqq;
-	}
-
-	return NULL;
-}
-
-static int cfq_close_cooperator(struct cfq_data *cfqd,
-				struct cfq_queue *cur_cfqq)
-{
-	struct cfq_queue *cfqq;
-
-	if (!cfqd->busy_queues)
-		return 0;
-
 	/*
-	 * check cur_rr and same-prio rr_list for candidates
+	 * We should notice if some of the queues are cooperating, eg
+	 * working closely on the same area of the disk. In that case,
+	 * we can group them together and don't waste time idling.
 	 */
-	cfqq = __cfq_close_cooperator(cfqd, cur_cfqq, &cfqd->cur_rr);
-	if (cfqq)
-		return 1;
-
-	cfqq = __cfq_close_cooperator(cfqd, cur_cfqq, &cfqd->rr_list[cur_cfqq->ioprio]);
-	if (cfqq && (cfqq->rr_tick == cfqd->cur_rr_tick))
-		cfqq = NULL;
-
-	return cfqq != NULL;
+	return 0;
 }
 
 #define CIC_SEEKY(cic) ((cic)->seek_mean > (8 * 1024))
@@ -974,7 +865,7 @@
 	 */
 	sl = cfqd->cfq_slice_idle;
 	if (sample_valid(cic->seek_samples) && CIC_SEEKY(cic))
-		sl = min(sl, msecs_to_jiffies(2));
+		sl = min(sl, msecs_to_jiffies(CFQ_MIN_TT));
 
 	mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
 }
@@ -1115,31 +1006,41 @@
 	return dispatched;
 }
 
-static int
-cfq_forced_dispatch_cfqqs(struct list_head *list)
+static inline int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq)
+{
+	int dispatched = 0;
+
+	while (cfqq->next_rq) {
+		cfq_dispatch_insert(cfqq->cfqd->queue, cfqq->next_rq);
+		dispatched++;
+	}
+
+	BUG_ON(!list_empty(&cfqq->fifo));
+	return dispatched;
+}
+
+static int cfq_forced_dispatch_cfqqs(struct list_head *list)
 {
 	struct cfq_queue *cfqq, *next;
 	int dispatched;
 
 	dispatched = 0;
-	list_for_each_entry_safe(cfqq, next, list, cfq_list) {
-		while (cfqq->next_rq) {
-			cfq_dispatch_insert(cfqq->cfqd->queue, cfqq->next_rq);
-			dispatched++;
-		}
-		BUG_ON(!list_empty(&cfqq->fifo));
-	}
+	list_for_each_entry_safe(cfqq, next, list, cfq_list)
+		dispatched += __cfq_forced_dispatch_cfqq(cfqq);
 
 	return dispatched;
 }
 
-static int
-cfq_forced_dispatch(struct cfq_data *cfqd)
+static int cfq_forced_dispatch(struct cfq_data *cfqd)
 {
-	int i, dispatched = 0;
+	int dispatched = 0;
+	struct rb_node *n;
 
-	for (i = 0; i < CFQ_PRIO_LISTS; i++)
-		dispatched += cfq_forced_dispatch_cfqqs(&cfqd->rr_list[i]);
+	while ((n = rb_first(&cfqd->service_tree)) != NULL) {
+		struct cfq_queue *cfqq = rb_entry(n, struct cfq_queue, rb_node);
+
+		dispatched += __cfq_forced_dispatch_cfqq(cfqq);
+	}
 
 	dispatched += cfq_forced_dispatch_cfqqs(&cfqd->cur_rr);
 	dispatched += cfq_forced_dispatch_cfqqs(&cfqd->idle_rr);
@@ -1151,8 +1052,7 @@
 	return dispatched;
 }
 
-static int
-cfq_dispatch_requests(request_queue_t *q, int force)
+static int cfq_dispatch_requests(request_queue_t *q, int force)
 {
 	struct cfq_data *cfqd = q->elevator->elevator_data;
 	struct cfq_queue *cfqq;
@@ -1222,7 +1122,6 @@
 	/*
 	 * it's on the empty list and still hashed
 	 */
-	list_del(&cfqq->cfq_list);
 	hlist_del(&cfqq->cfq_hash);
 	kmem_cache_free(cfq_pool, cfqq);
 }
@@ -1391,8 +1290,6 @@
 	 */
 	cfqq->org_ioprio = cfqq->ioprio;
 	cfqq->org_ioprio_class = cfqq->ioprio_class;
-
-	cfq_resort_rr_list(cfqq, 0);
 	cfq_clear_cfqq_prio_changed(cfqq);
 }
 
@@ -1478,6 +1375,7 @@
 
 		INIT_HLIST_NODE(&cfqq->cfq_hash);
 		INIT_LIST_HEAD(&cfqq->cfq_list);
+		RB_CLEAR_NODE(&cfqq->rb_node);
 		INIT_LIST_HEAD(&cfqq->fifo);
 
 		cfqq->key = key;
@@ -1752,7 +1650,8 @@
 	 * so we know that it will be selected next.
 	 */
 	BUG_ON(!cfq_cfqq_on_rr(cfqq));
-	list_move(&cfqq->cfq_list, &cfqd->cur_rr);
+	list_del_init(&cfqq->cfq_list);
+	list_add(&cfqq->cfq_list, &cfqd->cur_rr);
 
 	cfqq->slice_end = 0;
 	cfq_mark_cfqq_slice_new(cfqq);
@@ -1828,13 +1727,10 @@
 	WARN_ON(!cfqq->dispatched);
 	cfqd->rq_in_driver--;
 	cfqq->dispatched--;
-	cfqq->service_last = now;
 
 	if (!cfq_class_idle(cfqq))
 		cfqd->last_end_request = now;
 
-	cfq_resort_rr_list(cfqq, 0);
-
 	if (sync)
 		RQ_CIC(rq)->last_end_request = now;
 
@@ -1863,9 +1759,6 @@
  */
 static void cfq_prio_boost(struct cfq_queue *cfqq)
 {
-	const int ioprio_class = cfqq->ioprio_class;
-	const int ioprio = cfqq->ioprio;
-
 	if (has_fs_excl()) {
 		/*
 		 * boost idle prio on transactions that would lock out other
@@ -1884,12 +1777,6 @@
 		if (cfqq->ioprio != cfqq->org_ioprio)
 			cfqq->ioprio = cfqq->org_ioprio;
 	}
-
-	/*
-	 * refile between round-robin lists if we moved the priority class
-	 */
-	if ((ioprio_class != cfqq->ioprio_class || ioprio != cfqq->ioprio))
-		cfq_resort_rr_list(cfqq, 0);
 }
 
 static inline int __cfq_may_queue(struct cfq_queue *cfqq)
@@ -2127,9 +2014,7 @@
 
 	memset(cfqd, 0, sizeof(*cfqd));
 
-	for (i = 0; i < CFQ_PRIO_LISTS; i++)
-		INIT_LIST_HEAD(&cfqd->rr_list[i]);
-
+	cfqd->service_tree = RB_ROOT;
 	INIT_LIST_HEAD(&cfqd->cur_rr);
 	INIT_LIST_HEAD(&cfqd->idle_rr);
 	INIT_LIST_HEAD(&cfqd->cic_list);