exofs: convert io_state to use pages array instead of bio at input

* inode.c operations are full-pages based, and not actually
  true scatter-gather
* Lets us use more pages at once upto 512 (from 249) in 64 bit
* Brings us much much closer to be able to use exofs's io_state engine
  from objlayout driver. (Once I decide where to put the common code)

After RAID0 patch the outer (input) bio was never used as a bio, but
was simply a page carrier into the raid engine. Even in the simple
mirror/single-dev arrangement pages info was copied into a second bio.
It is now easer to just pass a pages array into the io_state and prepare
bio(s) once.

Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index 0d8a34b..acfebd3 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -128,7 +128,10 @@
 	loff_t			offset;
 	unsigned long		length;
 	void			*kern_buff;
-	struct bio		*bio;
+
+	struct page		**pages;
+	unsigned		nr_pages;
+	unsigned		pgbase;
 
 	/* Attributes */
 	unsigned		in_attr_len;
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 2b3163e..6ca0b01 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -41,16 +41,18 @@
 
 enum { BIO_MAX_PAGES_KMALLOC =
 		(PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),
+	MAX_PAGES_KMALLOC =
+		PAGE_SIZE / sizeof(struct page *),
 };
 
 struct page_collect {
 	struct exofs_sb_info *sbi;
-	struct request_queue *req_q;
 	struct inode *inode;
 	unsigned expected_pages;
 	struct exofs_io_state *ios;
 
-	struct bio *bio;
+	struct page **pages;
+	unsigned alloc_pages;
 	unsigned nr_pages;
 	unsigned long length;
 	loff_t pg_first; /* keep 64bit also in 32-arches */
@@ -62,15 +64,12 @@
 	struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
 
 	pcol->sbi = sbi;
-	/* Create master bios on first Q, later on cloning, each clone will be
-	 * allocated on it's destination Q
-	 */
-	pcol->req_q = osd_request_queue(sbi->layout.s_ods[0]);
 	pcol->inode = inode;
 	pcol->expected_pages = expected_pages;
 
 	pcol->ios = NULL;
-	pcol->bio = NULL;
+	pcol->pages = NULL;
+	pcol->alloc_pages = 0;
 	pcol->nr_pages = 0;
 	pcol->length = 0;
 	pcol->pg_first = -1;
@@ -80,7 +79,8 @@
 {
 	pcol->expected_pages -= min(pcol->nr_pages, pcol->expected_pages);
 
-	pcol->bio = NULL;
+	pcol->pages = NULL;
+	pcol->alloc_pages = 0;
 	pcol->nr_pages = 0;
 	pcol->length = 0;
 	pcol->pg_first = -1;
@@ -90,13 +90,13 @@
 	 * it might not end here. don't be left with nothing
 	 */
 	if (!pcol->expected_pages)
-		pcol->expected_pages = BIO_MAX_PAGES_KMALLOC;
+		pcol->expected_pages = MAX_PAGES_KMALLOC;
 }
 
 static int pcol_try_alloc(struct page_collect *pcol)
 {
-	int pages = min_t(unsigned, pcol->expected_pages,
-			  BIO_MAX_PAGES_KMALLOC);
+	unsigned pages = min_t(unsigned, pcol->expected_pages,
+			  MAX_PAGES_KMALLOC);
 
 	if (!pcol->ios) { /* First time allocate io_state */
 		int ret = exofs_get_io_state(&pcol->sbi->layout, &pcol->ios);
@@ -105,23 +105,28 @@
 			return ret;
 	}
 
+	/* TODO: easily support bio chaining */
+	pages =  min_t(unsigned, pages,
+		       pcol->sbi->layout.group_width * BIO_MAX_PAGES_KMALLOC);
+
 	for (; pages; pages >>= 1) {
-		pcol->bio = bio_kmalloc(GFP_KERNEL, pages);
-		if (likely(pcol->bio))
+		pcol->pages = kmalloc(pages * sizeof(struct page *),
+				      GFP_KERNEL);
+		if (likely(pcol->pages)) {
+			pcol->alloc_pages = pages;
 			return 0;
+		}
 	}
 
-	EXOFS_ERR("Failed to bio_kmalloc expected_pages=%u\n",
+	EXOFS_ERR("Failed to kmalloc expected_pages=%u\n",
 		  pcol->expected_pages);
 	return -ENOMEM;
 }
 
 static void pcol_free(struct page_collect *pcol)
 {
-	if (pcol->bio) {
-		bio_put(pcol->bio);
-		pcol->bio = NULL;
-	}
+	kfree(pcol->pages);
+	pcol->pages = NULL;
 
 	if (pcol->ios) {
 		exofs_put_io_state(pcol->ios);
@@ -132,11 +137,10 @@
 static int pcol_add_page(struct page_collect *pcol, struct page *page,
 			 unsigned len)
 {
-	int added_len = bio_add_pc_page(pcol->req_q, pcol->bio, page, len, 0);
-	if (unlikely(len != added_len))
+	if (unlikely(pcol->nr_pages >= pcol->alloc_pages))
 		return -ENOMEM;
 
-	++pcol->nr_pages;
+	pcol->pages[pcol->nr_pages++] = page;
 	pcol->length += len;
 	return 0;
 }
@@ -181,7 +185,6 @@
  */
 static int __readpages_done(struct page_collect *pcol, bool do_unlock)
 {
-	struct bio_vec *bvec;
 	int i;
 	u64 resid;
 	u64 good_bytes;
@@ -198,8 +201,8 @@
 		     pcol->inode->i_ino, _LLU(good_bytes), pcol->length,
 		     pcol->nr_pages);
 
-	__bio_for_each_segment(bvec, pcol->bio, i, 0) {
-		struct page *page = bvec->bv_page;
+	for (i = 0; i < pcol->nr_pages; i++) {
+		struct page *page = pcol->pages[i];
 		struct inode *inode = page->mapping->host;
 		int page_stat;
 
@@ -218,7 +221,7 @@
 		ret = update_read_page(page, page_stat);
 		if (do_unlock)
 			unlock_page(page);
-		length += bvec->bv_len;
+		length += PAGE_SIZE;
 	}
 
 	pcol_free(pcol);
@@ -238,11 +241,10 @@
 
 static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw)
 {
-	struct bio_vec *bvec;
 	int i;
 
-	__bio_for_each_segment(bvec, pcol->bio, i, 0) {
-		struct page *page = bvec->bv_page;
+	for (i = 0; i < pcol->nr_pages; i++) {
+		struct page *page = pcol->pages[i];
 
 		if (rw == READ)
 			update_read_page(page, ret);
@@ -260,13 +262,14 @@
 	struct page_collect *pcol_copy = NULL;
 	int ret;
 
-	if (!pcol->bio)
+	if (!pcol->pages)
 		return 0;
 
 	/* see comment in _readpage() about sync reads */
 	WARN_ON(is_sync && (pcol->nr_pages != 1));
 
-	ios->bio = pcol->bio;
+	ios->pages = pcol->pages;
+	ios->nr_pages = pcol->nr_pages;
 	ios->length = pcol->length;
 	ios->offset = pcol->pg_first << PAGE_CACHE_SHIFT;
 
@@ -366,7 +369,7 @@
 		goto try_again;
 	}
 
-	if (!pcol->bio) {
+	if (!pcol->pages) {
 		ret = pcol_try_alloc(pcol);
 		if (unlikely(ret))
 			goto fail;
@@ -448,7 +451,6 @@
 static void writepages_done(struct exofs_io_state *ios, void *p)
 {
 	struct page_collect *pcol = p;
-	struct bio_vec *bvec;
 	int i;
 	u64 resid;
 	u64  good_bytes;
@@ -467,8 +469,8 @@
 		     pcol->inode->i_ino, _LLU(good_bytes), pcol->length,
 		     pcol->nr_pages);
 
-	__bio_for_each_segment(bvec, pcol->bio, i, 0) {
-		struct page *page = bvec->bv_page;
+	for (i = 0; i < pcol->nr_pages; i++) {
+		struct page *page = pcol->pages[i];
 		struct inode *inode = page->mapping->host;
 		int page_stat;
 
@@ -485,7 +487,7 @@
 		EXOFS_DBGMSG2("    writepages_done(0x%lx, 0x%lx) status=%d\n",
 			     inode->i_ino, page->index, page_stat);
 
-		length += bvec->bv_len;
+		length += PAGE_SIZE;
 	}
 
 	pcol_free(pcol);
@@ -500,7 +502,7 @@
 	struct page_collect *pcol_copy = NULL;
 	int ret;
 
-	if (!pcol->bio)
+	if (!pcol->pages)
 		return 0;
 
 	pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
@@ -512,9 +514,8 @@
 
 	*pcol_copy = *pcol;
 
-	pcol_copy->bio->bi_rw |= (1 << BIO_RW); /* FIXME: bio_set_dir() */
-
-	ios->bio = pcol_copy->bio;
+	ios->pages = pcol_copy->pages;
+	ios->nr_pages = pcol_copy->nr_pages;
 	ios->offset = pcol_copy->pg_first << PAGE_CACHE_SHIFT;
 	ios->length = pcol_copy->length;
 	ios->done = writepages_done;
@@ -605,7 +606,7 @@
 		goto try_again;
 	}
 
-	if (!pcol->bio) {
+	if (!pcol->pages) {
 		ret = pcol_try_alloc(pcol);
 		if (unlikely(ret))
 			goto fail;
diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c
index 6e446b2..263052c 100644
--- a/fs/exofs/ios.c
+++ b/fs/exofs/ios.c
@@ -283,10 +283,11 @@
 	*dev = stripe_mod / stripe_unit * ios->layout->mirrors_p1;
 }
 
-static int _add_stripe_unit(struct exofs_io_state *ios,  unsigned *cur_bvec,
-		     struct exofs_per_dev_state *per_dev, int cur_len)
+static int _add_stripe_unit(struct exofs_io_state *ios,  unsigned *cur_pg,
+		unsigned pgbase, struct exofs_per_dev_state *per_dev,
+		int cur_len)
 {
-	unsigned bv = *cur_bvec;
+	unsigned pg = *cur_pg;
 	struct request_queue *q =
 			osd_request_queue(exofs_ios_od(ios, per_dev->dev));
 
@@ -295,7 +296,7 @@
 	if (per_dev->bio == NULL) {
 		unsigned pages_in_stripe = ios->layout->group_width *
 					(ios->layout->stripe_unit / PAGE_SIZE);
-		unsigned bio_size = (ios->bio->bi_vcnt + pages_in_stripe) /
+		unsigned bio_size = (ios->nr_pages + pages_in_stripe) /
 						ios->layout->group_width;
 
 		per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size);
@@ -307,21 +308,22 @@
 	}
 
 	while (cur_len > 0) {
-		int added_len;
-		struct bio_vec *bvec = &ios->bio->bi_io_vec[bv];
+		unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len);
+		unsigned added_len;
 
-		BUG_ON(ios->bio->bi_vcnt <= bv);
-		cur_len -= bvec->bv_len;
+		BUG_ON(ios->nr_pages <= pg);
+		cur_len -= pglen;
 
-		added_len = bio_add_pc_page(q, per_dev->bio, bvec->bv_page,
-					    bvec->bv_len, bvec->bv_offset);
-		if (unlikely(bvec->bv_len != added_len))
+		added_len = bio_add_pc_page(q, per_dev->bio, ios->pages[pg],
+					    pglen, pgbase);
+		if (unlikely(pglen != added_len))
 			return -ENOMEM;
-		++bv;
+		pgbase = 0;
+		++pg;
 	}
 	BUG_ON(cur_len);
 
-	*cur_bvec = bv;
+	*cur_pg = pg;
 	return 0;
 }
 
@@ -332,10 +334,10 @@
 	unsigned stripe_unit = ios->layout->stripe_unit;
 	unsigned comp = 0;
 	unsigned stripes = 0;
-	unsigned cur_bvec = 0;
-	int ret;
+	unsigned cur_pg = 0;
+	int ret = 0;
 
-	if (!ios->bio) {
+	if (!ios->pages) {
 		if (ios->kern_buff) {
 			struct exofs_per_dev_state *per_dev = &ios->per_dev[0];
 			unsigned unit_off;
@@ -352,7 +354,7 @@
 
 	while (length) {
 		struct exofs_per_dev_state *per_dev = &ios->per_dev[comp];
-		unsigned cur_len;
+		unsigned cur_len, page_off;
 
 		if (!per_dev->length) {
 			unsigned unit_off;
@@ -362,11 +364,15 @@
 			stripes++;
 			cur_len = min_t(u64, stripe_unit - unit_off, length);
 			offset += cur_len;
+			page_off = unit_off & ~PAGE_MASK;
+			BUG_ON(page_off != ios->pgbase);
 		} else {
 			cur_len = min_t(u64, stripe_unit, length);
+			page_off = 0;
 		}
 
-		ret = _add_stripe_unit(ios, &cur_bvec, per_dev, cur_len);
+		ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev,
+				       cur_len);
 		if (unlikely(ret))
 			goto out;
 
@@ -448,7 +454,7 @@
 		per_dev->or = or;
 		per_dev->offset = master_dev->offset;
 
-		if (ios->bio) {
+		if (ios->pages) {
 			struct bio *bio;
 
 			if (per_dev != master_dev) {
@@ -541,7 +547,7 @@
 	}
 	per_dev->or = or;
 
-	if (ios->bio) {
+	if (ios->pages) {
 		osd_req_read(or, &ios->obj, per_dev->offset,
 				per_dev->bio, per_dev->length);
 		EXOFS_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx"