Btrfs: delay commits during fsync to allow more writers

Signed-off-by: Chris Mason <chris.mason@oracle.com>
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 6bce46b..f446f16 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -25,6 +25,11 @@
 	struct btrfs_block_group_cache *block_group;
 	struct btrfs_key location;
 	struct inode vfs_inode;
+
+	/*
+	 * transid of the trans_handle that last modified this inode
+	 */
+	u64 last_trans;
 };
 static inline struct btrfs_inode *BTRFS_I(struct inode *inode)
 {
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index d3cd564..c7f5161 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -301,6 +301,7 @@
 	struct radix_tree_root extent_map_radix;
 	struct radix_tree_root extent_ins_radix;
 	u64 generation;
+	u64 last_trans_committed;
 	struct btrfs_transaction *running_transaction;
 	struct btrfs_super_block *disk_super;
 	struct btrfs_super_block super_copy;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 1922112..b2f7987 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -435,6 +435,7 @@
 	INIT_LIST_HEAD(&fs_info->dead_roots);
 	sb_set_blocksize(sb, 4096);
 	fs_info->running_transaction = NULL;
+	fs_info->last_trans_committed = 0;
 	fs_info->tree_root = tree_root;
 	fs_info->extent_root = extent_root;
 	fs_info->sb = sb;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 00b118a..6933ab1 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -694,22 +694,36 @@
 {
 	struct inode *inode = dentry->d_inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	int ret;
+	int ret = 0;
 	struct btrfs_trans_handle *trans;
 
 	/*
-	 * FIXME, use inode generation number to check if we can skip the
-	 * commit
+	 * check the transaction that last modified this inode
+	 * and see if it has already been committed
 	 */
 	mutex_lock(&root->fs_info->fs_mutex);
+	if (!BTRFS_I(inode)->last_trans)
+		goto out;
+	mutex_lock(&root->fs_info->trans_mutex);
+	if (BTRFS_I(inode)->last_trans <=
+	    root->fs_info->last_trans_committed) {
+		BTRFS_I(inode)->last_trans = 0;
+		mutex_unlock(&root->fs_info->trans_mutex);
+		goto out;
+	}
+	mutex_unlock(&root->fs_info->trans_mutex);
+
+	/*
+	 * ok, we haven't committed the transaction yet, so let's do a commit
+	 */
 	trans = btrfs_start_transaction(root, 1);
 	if (!trans) {
 		ret = -ENOMEM;
 		goto out;
 	}
 	ret = btrfs_commit_transaction(trans, root);
-	mutex_unlock(&root->fs_info->fs_mutex);
 out:
+	mutex_unlock(&root->fs_info->fs_mutex);
 	return ret > 0 ? EIO : ret;
 }
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5c05ecb..3984841 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -193,6 +193,7 @@
 
 	fill_inode_item(inode_item, inode);
 	btrfs_mark_buffer_dirty(path->nodes[0]);
+	btrfs_set_inode_last_trans(trans, inode);
 	ret = 0;
 failed:
 	btrfs_release_path(root, path);
@@ -2234,6 +2235,7 @@
 	ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
 	if (!ei)
 		return NULL;
+	ei->last_trans = 0;
 	return &ei->vfs_inode;
 }
 
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index c9d52dc..18abea8 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -55,7 +55,8 @@
 		BUG_ON(!cur_trans);
 		root->fs_info->generation++;
 		root->fs_info->running_transaction = cur_trans;
-		cur_trans->num_writers = 0;
+		cur_trans->num_writers = 1;
+		cur_trans->num_joined = 0;
 		cur_trans->transid = root->fs_info->generation;
 		init_waitqueue_head(&cur_trans->writer_wait);
 		init_waitqueue_head(&cur_trans->commit_wait);
@@ -65,8 +66,11 @@
 		cur_trans->start_time = get_seconds();
 		list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
 		init_bit_radix(&cur_trans->dirty_pages);
+	} else {
+		cur_trans->num_writers++;
+		cur_trans->num_joined++;
 	}
-	cur_trans->num_writers++;
+
 	return 0;
 }
 
@@ -428,12 +432,14 @@
 int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root)
 {
-	int ret = 0;
+	unsigned long joined = 0;
+	unsigned long timeout = 1;
 	struct btrfs_transaction *cur_trans;
 	struct btrfs_transaction *prev_trans = NULL;
 	struct list_head dirty_fs_roots;
 	struct radix_tree_root pinned_copy;
 	DEFINE_WAIT(wait);
+	int ret;
 
 	init_bit_radix(&pinned_copy);
 	INIT_LIST_HEAD(&dirty_fs_roots);
@@ -448,7 +454,11 @@
 		mutex_unlock(&root->fs_info->fs_mutex);
 		ret = wait_for_commit(root, cur_trans);
 		BUG_ON(ret);
+
+		mutex_lock(&root->fs_info->trans_mutex);
 		put_transaction(cur_trans);
+		mutex_unlock(&root->fs_info->trans_mutex);
+
 		mutex_lock(&root->fs_info->fs_mutex);
 		return 0;
 	}
@@ -463,26 +473,35 @@
 			mutex_unlock(&root->fs_info->trans_mutex);
 
 			wait_for_commit(root, prev_trans);
-			put_transaction(prev_trans);
 
 			mutex_lock(&root->fs_info->fs_mutex);
 			mutex_lock(&root->fs_info->trans_mutex);
+			put_transaction(prev_trans);
 		}
 	}
-	while (trans->transaction->num_writers > 1) {
+
+	do {
+		joined = cur_trans->num_joined;
 		WARN_ON(cur_trans != trans->transaction);
-		prepare_to_wait(&trans->transaction->writer_wait, &wait,
+		prepare_to_wait(&cur_trans->writer_wait, &wait,
 				TASK_UNINTERRUPTIBLE);
-		if (trans->transaction->num_writers <= 1)
-			break;
+
+		if (cur_trans->num_writers > 1)
+			timeout = MAX_SCHEDULE_TIMEOUT;
+		else
+			timeout = 1;
+
 		mutex_unlock(&root->fs_info->fs_mutex);
 		mutex_unlock(&root->fs_info->trans_mutex);
-		schedule();
+
+		schedule_timeout(timeout);
+
 		mutex_lock(&root->fs_info->fs_mutex);
 		mutex_lock(&root->fs_info->trans_mutex);
-		finish_wait(&trans->transaction->writer_wait, &wait);
-	}
-	finish_wait(&trans->transaction->writer_wait, &wait);
+		finish_wait(&cur_trans->writer_wait, &wait);
+	} while (cur_trans->num_writers > 1 ||
+		 (cur_trans->num_joined != joined));
+
 	WARN_ON(cur_trans != trans->transaction);
 	ret = add_dirty_roots(trans, &root->fs_info->fs_roots_radix,
 			      &dirty_fs_roots);
@@ -511,6 +530,7 @@
 	btrfs_finish_extent_commit(trans, root, &pinned_copy);
 	mutex_lock(&root->fs_info->trans_mutex);
 	cur_trans->commit_done = 1;
+	root->fs_info->last_trans_committed = cur_trans->transid;
 	wake_up(&cur_trans->commit_wait);
 	put_transaction(cur_trans);
 	put_transaction(cur_trans);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index d5f491d..e451783 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -23,6 +23,7 @@
 struct btrfs_transaction {
 	u64 transid;
 	unsigned long num_writers;
+	unsigned long num_joined;
 	int in_commit;
 	int use_count;
 	int commit_done;
@@ -57,6 +58,12 @@
 	BTRFS_I(inode)->block_group = trans->block_group;
 }
 
+static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
+					      struct inode *inode)
+{
+	BTRFS_I(inode)->last_trans = trans->transaction->transid;
+}
+
 int btrfs_end_transaction(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root);
 struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,