drivers/md/dm-kcopyd.c - linux - Git at Google

 // SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (C) 2002 Sistina Software (UK) Limited.
  * Copyright (C) 2006 Red Hat GmbH
  *
  * This file is released under the GPL.
  *
  * Kcopyd provides a simple interface for copying an area of one
  * block-device to one or more other block-devices, with an asynchronous
  * completion notification.
  */

 #include <linux/types.h>
 #include <linux/atomic.h>
 #include <linux/blkdev.h>
 #include <linux/fs.h>
 #include <linux/init.h>
 #include <linux/list.h>
 #include <linux/mempool.h>
 #include <linux/module.h>
 #include <linux/pagemap.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/workqueue.h>
 #include <linux/mutex.h>
 #include <linux/delay.h>
 #include <linux/device-mapper.h>
 #include <linux/dm-kcopyd.h>

 #include "dm-core.h"

 /* Number of sub jobs a large copy is split into (see split_job()). */
 #define SPLIT_COUNT	8
 /* Minimum number of elements requested for each client's job mempool. */
 #define MIN_JOBS	8

 /* Default and maximum sub-job size, in KiB. */
 #define DEFAULT_SUB_JOB_SIZE_KB 512
 #define MAX_SUB_JOB_SIZE_KB     1024

 /* Sub-job size tunable in KiB; read when a client is created. */
 static unsigned int kcopyd_subjob_size_kb = DEFAULT_SUB_JOB_SIZE_KB;

 module_param(kcopyd_subjob_size_kb, uint, 0644);
 MODULE_PARM_DESC(kcopyd_subjob_size_kb, "Sub-job size for dm-kcopyd clients");

 /*
  * Return the sub-job size for a new client, in sectors.
  *
  * The tunable is expressed in KiB.  __dm_get_module_param() is handed the
  * default and the maximum (presumably to sanitise the value - confirm
  * against its definition).
  */
 static unsigned int dm_get_kcopyd_subjob_size(void)
 {
 	unsigned int size_kb = __dm_get_module_param(&kcopyd_subjob_size_kb,
 						     DEFAULT_SUB_JOB_SIZE_KB,
 						     MAX_SUB_JOB_SIZE_KB);

 	/* Two sectors per KiB. */
 	return size_kb << 1;
 }

 /*
  *----------------------------------------------------------------
  * Each kcopyd client has its own little pool of preallocated
  * pages for kcopyd io.
  *---------------------------------------------------------------
  */
 struct dm_kcopyd_client {
 	/* Singly linked list of reserved pages that are currently free. */
 	struct page_list *pages;
 	/* Number of pages this client keeps in reserve. */
 	unsigned int nr_reserved_pages;
 	/* Number of entries currently on the 'pages' list. */
 	unsigned int nr_free_pages;
 	/* Largest amount copied by a single job, in sectors. */
 	unsigned int sub_job_size;

 	/* dm-io client used to issue the reads and writes. */
 	struct dm_io_client *io_client;

 	/* Woken when nr_jobs drops to zero; client destruction waits here. */
 	wait_queue_head_t destroyq;

 	/* Pool of job arrays, backed by _job_cache. */
 	mempool_t job_pool;

 	/* Workqueue and work item that run do_work() for this client. */
 	struct workqueue_struct *kcopyd_wq;
 	struct work_struct kcopyd_work;

 	/* Optional throttle; NULL means no throttle accounting is done. */
 	struct dm_kcopyd_throttle *throttle;

 	/* Jobs dispatched or prepared whose callback has not run yet. */
 	atomic_t nr_jobs;

 /*
  * We maintain four lists of jobs:
  *
  * i)   jobs waiting for pages
  * ii)  jobs that have pages, and are waiting for the io to be issued.
  * iii) jobs that don't need to do any IO and just run a callback
  * iv) jobs that have completed.
  *
  * All four of these are protected by job_lock.
  */
 	spinlock_t job_lock;
 	struct list_head callback_jobs;
 	struct list_head complete_jobs;
 	struct list_head io_jobs;
 	struct list_head pages_jobs;
 };

 /* Self-linked entry wrapping the zero page; set up in dm_kcopyd_init(). */
 static struct page_list zero_page_list;

 /* Held around the throttle accounting in io_job_start()/io_job_finish(). */
 static DEFINE_SPINLOCK(throttle_spinlock);

 /*
  * IO/IDLE accounting slowly decays after (1 << ACCOUNT_INTERVAL_SHIFT) period.
  * When total_period >= (1 << ACCOUNT_INTERVAL_SHIFT) the counters are divided
  * by 2.
  */
 #define ACCOUNT_INTERVAL_SHIFT		SHIFT_HZ

 /*
  * Sleep this number of microseconds.
  *
  * The value was decided experimentally.
  * Smaller values seem to cause an increased copy rate above the limit.
  * The reason for this is unknown but possibly due to jiffies rounding errors
  * or read/write cache inside the disk.
  */
 #define SLEEP_USEC			100000

 /*
  * Maximum number of sleep events. There is a theoretical livelock if more
  * kcopyd clients do work simultaneously which this limit avoids.
  */
 #define MAX_SLEEPS			10

 /*
  * Called before an IO job is submitted.
  *
  * When the throttle value is below 100, bring the IO/total period
  * accounting up to date and sleep (at most MAX_SLEEPS times, SLEEP_USEC
  * each) while the time spent doing IO exceeds throttle percent of the
  * total.  Finally count the job as in flight.
  *
  * A NULL @t is a no-op.  May sleep.
  */
 static void io_job_start(struct dm_kcopyd_throttle *t)
 {
 	unsigned int throttle, now, difference;
 	int slept = 0, skew;

 	if (unlikely(!t))
 		return;

 try_again:
 	spin_lock_irq(&throttle_spinlock);

 	throttle = READ_ONCE(t->throttle);

 	/* 100 or more means no limit: skip the accounting entirely. */
 	if (likely(throttle >= 100))
 		goto skip_limit;

 	/*
 	 * Charge the time since the last update.  It only counts as IO
 	 * time if at least one job was in flight.
 	 */
 	now = jiffies;
 	difference = now - t->last_jiffies;
 	t->last_jiffies = now;
 	if (t->num_io_jobs)
 		t->io_period += difference;
 	t->total_period += difference;

 	/*
 	 * Maintain sane values if we got a temporary overflow.
 	 */
 	if (unlikely(t->io_period > t->total_period))
 		t->io_period = t->total_period;

 	/* Decay: scale both counters so total_period falls below the interval. */
 	if (unlikely(t->total_period >= (1 << ACCOUNT_INTERVAL_SHIFT))) {
 		int shift = fls(t->total_period >> ACCOUNT_INTERVAL_SHIFT);

 		t->total_period >>= shift;
 		t->io_period >>= shift;
 	}

 	/* Positive skew: more IO time than the throttle percentage allows. */
 	skew = t->io_period - throttle * t->total_period / 100;

 	if (unlikely(skew > 0) && slept < MAX_SLEEPS) {
 		slept++;
 		/* Drop the lock before sleeping, then re-evaluate from the top. */
 		spin_unlock_irq(&throttle_spinlock);
 		fsleep(SLEEP_USEC);
 		goto try_again;
 	}

 skip_limit:
 	t->num_io_jobs++;

 	spin_unlock_irq(&throttle_spinlock);
 }

 /*
  * Called when an IO job has completed: drop the in-flight count and, if
  * throttling is active and this was the last job in flight, charge the
  * elapsed time to both the IO and the total period.
  *
  * A NULL @t is a no-op.  Uses irqsave locking, so the caller's interrupt
  * state is preserved.
  */
 static void io_job_finish(struct dm_kcopyd_throttle *t)
 {
 	unsigned long irq_flags;
 	unsigned int now, elapsed;

 	if (unlikely(!t))
 		return;

 	spin_lock_irqsave(&throttle_spinlock, irq_flags);

 	t->num_io_jobs--;

 	if (unlikely(READ_ONCE(t->throttle) < 100) && !t->num_io_jobs) {
 		now = jiffies;
 		elapsed = now - t->last_jiffies;
 		t->last_jiffies = now;

 		t->io_period += elapsed;
 		t->total_period += elapsed;

 		/* Keep io_period <= total_period after a temporary overflow. */
 		if (unlikely(t->io_period > t->total_period))
 			t->io_period = t->total_period;
 	}

 	spin_unlock_irqrestore(&throttle_spinlock, irq_flags);
 }


 /* Queue the client's work item so that do_work() runs on its workqueue. */
 static void wake(struct dm_kcopyd_client *kc)
 {
 	queue_work(kc->kcopyd_wq, &kc->kcopyd_work);
 }

 /*
  * Obtain one page for the use of kcopyd.
  *
  * Allocates a page_list entry together with its page (highmem allowed)
  * using @gfp.  Returns NULL if either allocation fails.  The 'next'
  * member of the returned entry is left uninitialised.
  */
 static struct page_list *alloc_pl(gfp_t gfp)
 {
 	struct page_list *entry = kmalloc(sizeof(*entry), gfp);

 	if (!entry)
 		return NULL;

 	entry->page = alloc_page(gfp | __GFP_HIGHMEM);
 	if (entry->page)
 		return entry;

 	kfree(entry);
 	return NULL;
 }

 /* Release one page_list entry: first its page, then the entry itself. */
 static void free_pl(struct page_list *pl)
 {
 	__free_page(pl->page);
 	kfree(pl);
 }

 /*
  * Add the provided pages to a client's free page list, releasing
  * back to the system any beyond the reserved_pages limit.
  *
  * @pl must not be NULL: the first entry is dereferenced unconditionally.
  */
 static void kcopyd_put_pages(struct dm_kcopyd_client *kc, struct page_list *pl)
 {
 	struct page_list *rest;

 	do {
 		rest = pl->next;

 		if (kc->nr_free_pages < kc->nr_reserved_pages) {
 			/* Reserve not full yet: keep this page. */
 			pl->next = kc->pages;
 			kc->pages = pl;
 			kc->nr_free_pages++;
 		} else {
 			free_pl(pl);
 		}

 		pl = rest;
 	} while (pl);
 }

 static int kcopyd_get_pages(struct dm_kcopyd_client *kc,
 			    unsigned int nr, struct page_list **pages)
 {
 	struct page_list *pl;

 	*pages = NULL;

 	do {
 		pl = alloc_pl(__GFP_NOWARN | __GFP_NORETRY | __GFP_KSWAPD_RECLAIM);
 		if (unlikely(!pl)) {
 			/* Use reserved pages */
 			pl = kc->pages;
 			if (unlikely(!pl))
 				goto out_of_memory;
 			kc->pages = pl->next;
 			kc->nr_free_pages--;
 		}
 		pl->next = *pages;
 		*pages = pl;
 	} while (--nr);

 	return 0;

 out_of_memory:
 	if (*pages)
 		kcopyd_put_pages(kc, *pages);
 	return -ENOMEM;
 }

 /*
  * These three functions resize the page pool.
  */
 /* Free every entry of the list starting at @pl; a NULL list is a no-op. */
 static void drop_pages(struct page_list *pl)
 {
 	struct page_list *following;

 	for (; pl; pl = following) {
 		/* Fetch the link first: free_pl() releases the entry. */
 		following = pl->next;
 		free_pl(pl);
 	}
 }

 /*
  * Allocate and reserve nr_pages for the use of a specific client.
  *
  * All pages are allocated up front with GFP_KERNEL; only if every
  * allocation succeeds is the client's reserve limit raised and the pages
  * added to its free list.  On failure nothing is reserved, the pages
  * already allocated are freed and -ENOMEM is returned.
  *
  * Reserving zero pages succeeds without touching the free list.
  */
 static int client_reserve_pages(struct dm_kcopyd_client *kc, unsigned int nr_pages)
 {
 	unsigned int i;
 	struct page_list *pl = NULL, *next;

 	for (i = 0; i < nr_pages; i++) {
 		next = alloc_pl(GFP_KERNEL);
 		if (!next) {
 			/* drop_pages() copes with an empty list. */
 			drop_pages(pl);
 			return -ENOMEM;
 		}
 		next->next = pl;
 		pl = next;
 	}

 	kc->nr_reserved_pages += nr_pages;

 	/*
 	 * kcopyd_put_pages() dereferences its list head unconditionally,
 	 * so it must not be called with the empty list of nr_pages == 0.
 	 */
 	if (pl)
 		kcopyd_put_pages(kc, pl);

 	return 0;
 }

 /*
  * Free the client's whole page reserve and reset the counters.
  * It is a BUG if any reserved page is still in use at this point.
  */
 static void client_free_pages(struct dm_kcopyd_client *kc)
 {
 	BUG_ON(kc->nr_free_pages != kc->nr_reserved_pages);

 	drop_pages(kc->pages);
 	kc->pages = NULL;

 	kc->nr_reserved_pages = 0;
 	kc->nr_free_pages = 0;
 }

 /*
  *---------------------------------------------------------------
  * kcopyd_jobs need to be allocated by the *clients* of kcopyd,
  * for this reason we use a mempool to prevent the client from
  * ever having to do io (which could cause a deadlock).
  *---------------------------------------------------------------
  */
 struct kcopyd_job {
 	/* Owning client. */
 	struct dm_kcopyd_client *kc;
 	/* Links the job into one of the client's job lists. */
 	struct list_head list;
 	/* DM_KCOPYD_* flag bits (tested with BIT()). */
 	unsigned int flags;

 	/*
 	 * Error state of the job.
 	 */
 	int read_err;
 	unsigned long write_err;

 	/*
 	 * REQ_OP_READ, REQ_OP_WRITE or REQ_OP_WRITE_ZEROES.
 	 */
 	enum req_op op;
 	struct dm_io_region source;

 	/*
 	 * The destinations for the transfer.
 	 */
 	unsigned int num_dests;
 	struct dm_io_region dests[DM_KCOPYD_MAX_REGIONS];

 	/*
 	 * Data pages for the transfer: NULL until pages are obtained,
 	 * or &zero_page_list when zeroing.
 	 */
 	struct page_list *pages;

 	/*
 	 * Set this to ensure you are notified when the job has
 	 * completed.  'context' is for callback to use.
 	 */
 	dm_kcopyd_notify_fn fn;
 	void *context;

 	/*
 	 * These fields are only used if the job has been split
 	 * into more manageable parts.
 	 */
 	/* Taken in segment_complete() around error merging and 'progress'. */
 	struct mutex lock;
 	/* Sub jobs that have not yet run out of work. */
 	atomic_t sub_jobs;
 	/* Sectors of the source already handed out to sub jobs. */
 	sector_t progress;
 	/*
 	 * Sub job: offset of its chunk within the whole copy.
 	 * Master job: offset of the next chunk allowed to be written
 	 * when writes must be sequential (see pop_io_job()).
 	 */
 	sector_t write_offset;

 	/* The master job; a master or unsplit job points at itself. */
 	struct kcopyd_job *master_job;
 };

 /* Slab cache; each object holds one master job plus SPLIT_COUNT sub jobs. */
 static struct kmem_cache *_job_cache;

 /*
  * Module-wide setup: create the job slab cache and initialise the shared
  * zero page list.  Returns 0 on success or -ENOMEM if the cache cannot
  * be created.
  */
 int __init dm_kcopyd_init(void)
 {
 	/* One master job followed by SPLIT_COUNT sub jobs per object. */
 	const size_t job_array_size =
 		sizeof(struct kcopyd_job) * (SPLIT_COUNT + 1);

 	_job_cache = kmem_cache_create("kcopyd_job", job_array_size,
 				       __alignof__(struct kcopyd_job), 0, NULL);
 	if (_job_cache) {
 		/* The entry links to itself, so the list never runs out. */
 		zero_page_list.next = &zero_page_list;
 		zero_page_list.page = ZERO_PAGE(0);
 		return 0;
 	}

 	return -ENOMEM;
 }

 /* Module-wide teardown: destroy the job slab cache and clear the pointer. */
 void dm_kcopyd_exit(void)
 {
 	kmem_cache_destroy(_job_cache);
 	_job_cache = NULL;
 }

 /*
  * Functions to pop a job from a given job list, and to push a job
  * onto its tail (push) or its head (push_head).
  */
 /*
  * Pick the first job on @jobs that may be issued now, unlink it and
  * return it; return NULL if none qualifies.  Called from pop() with
  * job_lock held.
  *
  * Reads, and writes without the sequential-write flag, always qualify.
  * A sequential write qualifies only when its offset equals the master
  * job's current write offset, which is then advanced past it.
  */
 static struct kcopyd_job *pop_io_job(struct list_head *jobs,
 				     struct dm_kcopyd_client *kc)
 {
 	struct kcopyd_job *job;

 	list_for_each_entry(job, jobs, list) {
 		bool unordered = job->op == REQ_OP_READ ||
 				 !(job->flags & BIT(DM_KCOPYD_WRITE_SEQ));

 		if (!unordered) {
 			if (job->write_offset != job->master_job->write_offset)
 				continue;
 			job->master_job->write_offset += job->source.count;
 		}

 		list_del(&job->list);
 		return job;
 	}

 	return NULL;
 }

 /*
  * Remove and return the next job from @jobs under job_lock, or NULL if
  * there is nothing to take.  The io_jobs list is filtered through
  * pop_io_job(); every other list simply yields its first entry.
  */
 static struct kcopyd_job *pop(struct list_head *jobs,
 			      struct dm_kcopyd_client *kc)
 {
 	struct kcopyd_job *found = NULL;

 	spin_lock_irq(&kc->job_lock);

 	if (list_empty(jobs))
 		goto unlock;

 	if (jobs != &kc->io_jobs) {
 		found = list_first_entry(jobs, struct kcopyd_job, list);
 		list_del(&found->list);
 	} else {
 		found = pop_io_job(jobs, kc);
 	}

 unlock:
 	spin_unlock_irq(&kc->job_lock);

 	return found;
 }

 /*
  * Append @job to the tail of @jobs under the owning client's job_lock.
  * Uses irqsave locking, so the caller's interrupt state is preserved.
  */
 static void push(struct list_head *jobs, struct kcopyd_job *job)
 {
 	unsigned long flags;
 	struct dm_kcopyd_client *kc = job->kc;

 	spin_lock_irqsave(&kc->job_lock, flags);
 	list_add_tail(&job->list, jobs);
 	spin_unlock_irqrestore(&kc->job_lock, flags);
 }


 /*
  * Put @job back at the head of @jobs under the owning client's job_lock.
  * Unlike push(), this unconditionally re-enables interrupts on unlock
  * (spin_lock_irq/spin_unlock_irq).
  */
 static void push_head(struct list_head *jobs, struct kcopyd_job *job)
 {
 	struct dm_kcopyd_client *kc = job->kc;

 	spin_lock_irq(&kc->job_lock);
 	list_add(&job->list, jobs);
 	spin_unlock_irq(&kc->job_lock);
 }

 /*
  * These three functions process 1 item from the corresponding
  * job list.
  *
  * They return:
  * < 0: error
  *   0: success
  * > 0: can't process yet.
  */
 /*
  * Finish one job: hand back its pages, free it if it is a master job,
  * run the notify callback and drop the client's outstanding-job count.
  * Always returns 0.
  */
 static int run_complete_job(struct kcopyd_job *job)
 {
 	/* Snapshot everything used after the free below. */
 	void *context = job->context;
 	int read_err = job->read_err;
 	unsigned long write_err = job->write_err;
 	dm_kcopyd_notify_fn fn = job->fn;
 	struct dm_kcopyd_client *kc = job->kc;

 	/* The shared zero page list is never returned to the pool. */
 	if (job->pages && job->pages != &zero_page_list)
 		kcopyd_put_pages(kc, job->pages);
 	/*
 	 * If this is the master job, the sub jobs have already
 	 * completed so we can free everything.
 	 */
 	if (job->master_job == job) {
 		mutex_destroy(&job->lock);
 		mempool_free(job, &kc->job_pool);
 	}
 	fn(read_err, write_err, context);

 	/* Last outstanding job: let a waiting client destroy proceed. */
 	if (atomic_dec_and_test(&kc->nr_jobs))
 		wake_up(&kc->destroyq);

 	cond_resched();

 	return 0;
 }

 /*
  * dm-io completion callback for a job's read or write.
  *
  * Records any error on the job.  A failed job goes straight to the
  * complete list unless errors are being ignored.  Otherwise a finished
  * write is complete, while a finished read is turned into a write and
  * queued for io again.  The worker is woken in every case.
  */
 static void complete_io(unsigned long error, void *context)
 {
 	struct kcopyd_job *job = context;
 	struct dm_kcopyd_client *kc = job->kc;
 	struct list_head *next_list = &kc->complete_jobs;

 	io_job_finish(kc->throttle);

 	if (error) {
 		if (op_is_write(job->op))
 			job->write_err |= error;
 		else
 			job->read_err = 1;

 		if (!(job->flags & BIT(DM_KCOPYD_IGNORE_ERROR)))
 			goto queue;
 	}

 	if (!op_is_write(job->op)) {
 		/* The read phase is done: the same job now does the write. */
 		job->op = REQ_OP_WRITE;
 		next_list = &kc->io_jobs;
 	}

 queue:
 	push(next_list, job);
 	wake(kc);
 }

 /*
  * Request io on as many buffer heads as we can currently get for
  * a particular job.
  */
 static int run_io_job(struct kcopyd_job *job)
 {
 	int r;
 	struct dm_io_request io_req = {
 		.bi_opf = job->op,
 		.mem.type = DM_IO_PAGE_LIST,
 		.mem.ptr.pl = job->pages,
 		.mem.offset = 0,
 		.notify.fn = complete_io,
 		.notify.context = job,
 		.client = job->kc->io_client,
 	};

 	/*
 	 * If we need to write sequentially and some reads or writes failed,
 	 * no point in continuing.
 	 */
 	if (job->flags & BIT(DM_KCOPYD_WRITE_SEQ) &&
 	    job->master_job->write_err) {
 		job->write_err = job->master_job->write_err;
 		return -EIO;
 	}

 	io_job_start(job->kc->throttle);

 	if (job->op == REQ_OP_READ)
 		r = dm_io(&io_req, 1, &job->source, NULL);
 	else
 		r = dm_io(&io_req, job->num_dests, job->dests, NULL);

 	return r;
 }

 /*
  * Try to obtain the pages a job needs, sized from its first destination.
  *
  * Returns 0 after moving the job to the io list, 1 when no memory is
  * available right now (the caller requeues the job), or any other
  * error from kcopyd_get_pages() unchanged.
  */
 static int run_pages_job(struct kcopyd_job *job)
 {
 	unsigned int nr_pages = dm_div_up(job->dests[0].count, PAGE_SIZE >> 9);
 	int r = kcopyd_get_pages(job->kc, nr_pages, &job->pages);

 	if (r == -ENOMEM)
 		/* can't complete now */
 		return 1;

 	if (r)
 		return r;

 	/* this job is ready for io */
 	push(&job->kc->io_jobs, job);
 	return 0;
 }

 /*
  * Run through a list for as long as possible.  Returns the count
  * of successful jobs.
  */
 static int process_jobs(struct list_head *jobs, struct dm_kcopyd_client *kc,
 			int (*fn)(struct kcopyd_job *))
 {
 	struct kcopyd_job *job;
 	int r, count = 0;

 	while ((job = pop(jobs, kc))) {

 		r = fn(job);

 		if (r < 0) {
 			/* error this rogue job */
 			if (op_is_write(job->op))
 				job->write_err = (unsigned long) -1L;
 			else
 				job->read_err = 1;
 			push(&kc->complete_jobs, job);
 			wake(kc);
 			break;
 		}

 		if (r > 0) {
 			/*
 			 * We couldn't service this job ATM, so
 			 * push this job back onto the list.
 			 */
 			push_head(jobs, job);
 			break;
 		}

 		count++;
 	}

 	return count;
 }

 /*
  * kcopyd does this every time it's woken up.
  *
  * Work function of kc->kcopyd_work: moves the callback-only jobs onto
  * the complete list and then services the complete, pages and io lists
  * in that order, with block-layer plugging around the whole pass.
  */
 static void do_work(struct work_struct *work)
 {
 	struct dm_kcopyd_client *kc = container_of(work,
 					struct dm_kcopyd_client, kcopyd_work);
 	struct blk_plug plug;

 	/*
 	 * The order that these are called is *very* important.
 	 * complete jobs can free some pages for pages jobs.
 	 * Pages jobs when successful will jump onto the io jobs
 	 * list.  io jobs call wake when they complete and it all
 	 * starts again.
 	 */
 	/* Callback-only jobs need no io: treat them as already complete. */
 	spin_lock_irq(&kc->job_lock);
 	list_splice_tail_init(&kc->callback_jobs, &kc->complete_jobs);
 	spin_unlock_irq(&kc->job_lock);

 	blk_start_plug(&plug);
 	process_jobs(&kc->complete_jobs, kc, run_complete_job);
 	process_jobs(&kc->pages_jobs, kc, run_pages_job);
 	process_jobs(&kc->io_jobs, kc, run_io_job);
 	blk_finish_plug(&plug);
 }

 /*
  * Hand a single job (an unsplit copy or one sub job) to the worker.
  *
  * The job is counted as outstanding and queued on the list matching
  * what it still needs: nothing but its callback (empty source), io
  * only (zeroing, which uses the shared zero page list), or pages first.
  */
 static void dispatch_job(struct kcopyd_job *job)
 {
 	struct dm_kcopyd_client *kc = job->kc;
 	struct list_head *target;

 	atomic_inc(&kc->nr_jobs);

 	if (unlikely(!job->source.count))
 		target = &kc->callback_jobs;
 	else if (job->pages == &zero_page_list)
 		target = &kc->io_jobs;
 	else
 		target = &kc->pages_jobs;

 	push(target, job);
 	wake(kc);
 }

 /*
  * Notify callback of every sub job; also called directly by split_job()
  * to start each sub job slot.
  *
  * Merges the reported errors into the master job, then either re-uses
  * the sub job slot passed in @context for the next chunk of the copy
  * or, once no chunk is handed out and this was the last active slot,
  * queues the master job for completion.
  */
 static void segment_complete(int read_err, unsigned long write_err,
 			     void *context)
 {
 	/* FIXME: tidy this function */
 	sector_t progress = 0;
 	sector_t count = 0;
 	struct kcopyd_job *sub_job = context;
 	struct kcopyd_job *job = sub_job->master_job;
 	struct dm_kcopyd_client *kc = job->kc;

 	mutex_lock(&job->lock);

 	/* update the error */
 	if (read_err)
 		job->read_err = 1;

 	if (write_err)
 		job->write_err |= write_err;

 	/*
 	 * Only dispatch more work if there hasn't been an error.
 	 */
 	if ((!job->read_err && !job->write_err) ||
 	    job->flags & BIT(DM_KCOPYD_IGNORE_ERROR)) {
 		/* get the next chunk of work */
 		progress = job->progress;
 		count = job->source.count - progress;
 		if (count) {
 			/* Hand out at most one sub-job's worth of sectors. */
 			if (count > kc->sub_job_size)
 				count = kc->sub_job_size;

 			job->progress += count;
 		}
 	}
 	mutex_unlock(&job->lock);

 	if (count) {
 		int i;

 		/*
 		 * Clone the master job, then narrow the clone to this chunk.
 		 * The copy also carries master_job over, which still points
 		 * at the master because a master job points at itself.
 		 */
 		*sub_job = *job;
 		sub_job->write_offset = progress;
 		sub_job->source.sector += progress;
 		sub_job->source.count = count;

 		for (i = 0; i < job->num_dests; i++) {
 			sub_job->dests[i].sector += progress;
 			sub_job->dests[i].count = count;
 		}

 		/* Come back here when this chunk is done. */
 		sub_job->fn = segment_complete;
 		sub_job->context = sub_job;
 		dispatch_job(sub_job);

 	} else if (atomic_dec_and_test(&job->sub_jobs)) {

 		/*
 		 * Queue the completion callback to the kcopyd thread.
 		 *
 		 * Some callers assume that all the completions are called
 		 * from a single thread and don't race with each other.
 		 *
 		 * We must not call the callback directly here because this
 		 * code may not be executing in the thread.
 		 */
 		push(&kc->complete_jobs, job);
 		wake(kc);
 	}
 }

 /*
  * Create some sub jobs to share the work between them.
  */
 static void split_job(struct kcopyd_job *master_job)
 {
 	int i;

 	atomic_inc(&master_job->kc->nr_jobs);

 	atomic_set(&master_job->sub_jobs, SPLIT_COUNT);
 	for (i = 0; i < SPLIT_COUNT; i++) {
 		master_job[i + 1].master_job = master_job;
 		segment_complete(0, 0u, &master_job[i + 1]);
 	}
 }

 /*
  * Start an asynchronous copy of @from to the @num_dests regions in
  * @dests, or zero those regions when @from is NULL.
  *
  * @kc:      client the job belongs to
  * @flags:   DM_KCOPYD_* flag bits; sequential writing is forced when a
  *           destination is zoned, and it overrides ignoring errors
  * @fn:      called with the read/write error state when the job is done
  * @context: passed through to @fn
  *
  * Sources no larger than the client's sub-job size go out as a single
  * job; anything larger is split into sub jobs.
  */
 void dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
 		    unsigned int num_dests, struct dm_io_region *dests,
 		    unsigned int flags, dm_kcopyd_notify_fn fn, void *context)
 {
 	struct kcopyd_job *job;
 	int i;

 	/*
 	 * Allocate an array of jobs consisting of one master job
 	 * followed by SPLIT_COUNT sub jobs.
 	 */
 	/*
 	 * NOTE(review): the result is used unchecked; presumably this
 	 * relies on mempool_alloc() not failing with GFP_NOIO - confirm.
 	 */
 	job = mempool_alloc(&kc->job_pool, GFP_NOIO);
 	mutex_init(&job->lock);

 	/*
 	 * set up for the read.
 	 */
 	job->kc = kc;
 	job->flags = flags;
 	job->read_err = 0;
 	job->write_err = 0;

 	/*
 	 * NOTE(review): num_dests is not checked against
 	 * DM_KCOPYD_MAX_REGIONS here; callers are presumably trusted.
 	 */
 	job->num_dests = num_dests;
 	memcpy(&job->dests, dests, sizeof(*dests) * num_dests);

 	/*
 	 * If one of the destination is a host-managed zoned block device,
 	 * we need to write sequentially. If one of the destination is a
 	 * host-aware device, then leave it to the caller to choose what to do.
 	 */
 	if (!(job->flags & BIT(DM_KCOPYD_WRITE_SEQ))) {
 		for (i = 0; i < job->num_dests; i++) {
 			if (bdev_is_zoned(dests[i].bdev)) {
 				job->flags |= BIT(DM_KCOPYD_WRITE_SEQ);
 				break;
 			}
 		}
 	}

 	/*
 	 * If we need to write sequentially, errors cannot be ignored.
 	 */
 	if (job->flags & BIT(DM_KCOPYD_WRITE_SEQ) &&
 	    job->flags & BIT(DM_KCOPYD_IGNORE_ERROR))
 		job->flags &= ~BIT(DM_KCOPYD_IGNORE_ERROR);

 	if (from) {
 		job->source = *from;
 		job->pages = NULL;
 		job->op = REQ_OP_READ;
 	} else {
 		/* Zeroing: no source device, length taken from the first dest. */
 		memset(&job->source, 0, sizeof(job->source));
 		job->source.count = job->dests[0].count;
 		job->pages = &zero_page_list;

 		/*
 		 * Use WRITE ZEROES to optimize zeroing if all dests support it.
 		 */
 		job->op = REQ_OP_WRITE_ZEROES;
 		for (i = 0; i < job->num_dests; i++)
 			if (!bdev_write_zeroes_sectors(job->dests[i].bdev)) {
 				job->op = REQ_OP_WRITE;
 				break;
 			}
 	}

 	job->fn = fn;
 	job->context = context;
 	/* A master (or unsplit) job is its own master. */
 	job->master_job = job;
 	job->write_offset = 0;

 	if (job->source.count <= kc->sub_job_size)
 		dispatch_job(job);
 	else {
 		job->progress = 0;
 		split_job(job);
 	}
 }
 EXPORT_SYMBOL(dm_kcopyd_copy);

 /*
  * Zero the @num_dests regions in @dests asynchronously.  This is
  * dm_kcopyd_copy() with no source region.
  */
 void dm_kcopyd_zero(struct dm_kcopyd_client *kc,
 		    unsigned int num_dests, struct dm_io_region *dests,
 		    unsigned int flags, dm_kcopyd_notify_fn fn, void *context)
 {
 	dm_kcopyd_copy(kc, NULL, num_dests, dests, flags, fn, context);
 }
 EXPORT_SYMBOL(dm_kcopyd_zero);

 /*
  * Allocate a job that performs no io and only runs @fn(@context) once
  * dm_kcopyd_do_callback() is called on the returned handle.
  *
  * The job is counted as outstanding from this point on.
  */
 void *dm_kcopyd_prepare_callback(struct dm_kcopyd_client *kc,
 				 dm_kcopyd_notify_fn fn, void *context)
 {
 	struct kcopyd_job *job = mempool_alloc(&kc->job_pool, GFP_NOIO);

 	/* Only the master slot is cleared; no sub jobs are ever used. */
 	memset(job, 0, sizeof(*job));
 	job->master_job = job;
 	job->kc = kc;
 	job->context = context;
 	job->fn = fn;

 	atomic_inc(&kc->nr_jobs);

 	return job;
 }
 EXPORT_SYMBOL(dm_kcopyd_prepare_callback);

 /*
  * Fire a callback prepared with dm_kcopyd_prepare_callback().
  *
  * @j is the handle returned by the prepare call.  The given error state
  * is stored in the job, which is then queued on the callback list so
  * that the worker runs its callback.
  */
 void dm_kcopyd_do_callback(void *j, int read_err, unsigned long write_err)
 {
 	struct kcopyd_job *job = j;
 	struct dm_kcopyd_client *kc = job->kc;

 	job->read_err = read_err;
 	job->write_err = write_err;

 	push(&kc->callback_jobs, job);
 	wake(kc);
 }
 EXPORT_SYMBOL(dm_kcopyd_do_callback);

 /*
  * Cancels a kcopyd job, eg. someone might be deactivating a
  * mirror.
  */
 #if 0
 /* Unimplemented stub; compiled out by the enclosing #if 0. */
 int kcopyd_cancel(struct kcopyd_job *job, int block)
 {
 	/* FIXME: finish */
 	return -1;
 }
 #endif  /*  0  */

 /*
  *---------------------------------------------------------------
  * Client setup
  *---------------------------------------------------------------
  */
 /*
  * Create a kcopyd client.
  *
  * @throttle: optional throttle to account io against; may be NULL.
  *
  * Sets up the job lists, job mempool, workqueue, page reserve (enough
  * pages for one sub job) and dm-io client.  Returns the new client, or
  * an ERR_PTR() after undoing whatever had been set up.
  */
 struct dm_kcopyd_client *dm_kcopyd_client_create(struct dm_kcopyd_throttle *throttle)
 {
 	int r;
 	unsigned int reserve_pages;
 	struct dm_kcopyd_client *kc;

 	kc = kzalloc(sizeof(*kc), GFP_KERNEL);
 	if (!kc)
 		return ERR_PTR(-ENOMEM);

 	spin_lock_init(&kc->job_lock);
 	INIT_LIST_HEAD(&kc->callback_jobs);
 	INIT_LIST_HEAD(&kc->complete_jobs);
 	INIT_LIST_HEAD(&kc->io_jobs);
 	INIT_LIST_HEAD(&kc->pages_jobs);
 	kc->throttle = throttle;

 	r = mempool_init_slab_pool(&kc->job_pool, MIN_JOBS, _job_cache);
 	if (r)
 		goto bad_slab;

 	INIT_WORK(&kc->kcopyd_work, do_work);
 	kc->kcopyd_wq = alloc_workqueue("kcopyd", WQ_MEM_RECLAIM, 0);
 	if (!kc->kcopyd_wq) {
 		r = -ENOMEM;
 		goto bad_workqueue;
 	}

 	/* sub_job_size is in sectors; reserve the pages one sub job needs. */
 	kc->sub_job_size = dm_get_kcopyd_subjob_size();
 	reserve_pages = DIV_ROUND_UP(kc->sub_job_size << SECTOR_SHIFT, PAGE_SIZE);

 	kc->pages = NULL;
 	kc->nr_reserved_pages = kc->nr_free_pages = 0;
 	r = client_reserve_pages(kc, reserve_pages);
 	if (r)
 		goto bad_client_pages;

 	kc->io_client = dm_io_client_create();
 	if (IS_ERR(kc->io_client)) {
 		r = PTR_ERR(kc->io_client);
 		goto bad_io_client;
 	}

 	init_waitqueue_head(&kc->destroyq);
 	atomic_set(&kc->nr_jobs, 0);

 	return kc;

 	/* Error unwinding: each label undoes the steps completed before it. */
 bad_io_client:
 	client_free_pages(kc);
 bad_client_pages:
 	destroy_workqueue(kc->kcopyd_wq);
 bad_workqueue:
 	mempool_exit(&kc->job_pool);
 bad_slab:
 	kfree(kc);

 	return ERR_PTR(r);
 }
 EXPORT_SYMBOL(dm_kcopyd_client_create);

 /*
  * Destroy a client created by dm_kcopyd_client_create().
  *
  * Blocks until the client's outstanding-job count reaches zero, insists
  * (BUG_ON) that all four job lists are empty, then releases the
  * workqueue, dm-io client, page reserve, job mempool and the client.
  */
 void dm_kcopyd_client_destroy(struct dm_kcopyd_client *kc)
 {
 	/* Wait for completion of all jobs submitted by this client. */
 	wait_event(kc->destroyq, !atomic_read(&kc->nr_jobs));

 	BUG_ON(!list_empty(&kc->callback_jobs));
 	BUG_ON(!list_empty(&kc->complete_jobs));
 	BUG_ON(!list_empty(&kc->io_jobs));
 	BUG_ON(!list_empty(&kc->pages_jobs));
 	destroy_workqueue(kc->kcopyd_wq);
 	dm_io_client_destroy(kc->io_client);
 	client_free_pages(kc);
 	mempool_exit(&kc->job_pool);
 	kfree(kc);
 }
 EXPORT_SYMBOL(dm_kcopyd_client_destroy);

 /* Flush the client's workqueue (flush_workqueue() on kc->kcopyd_wq). */
 void dm_kcopyd_client_flush(struct dm_kcopyd_client *kc)
 {
 	flush_workqueue(kc->kcopyd_wq);
 }
 EXPORT_SYMBOL(dm_kcopyd_client_flush);
	// SPDX-License-Identifier: GPL-2.0-only
	/*
	* Copyright (C) 2002 Sistina Software (UK) Limited.
	* Copyright (C) 2006 Red Hat GmbH
	*
	* This file is released under the GPL.
	*
	* Kcopyd provides a simple interface for copying an area of one
	* block-device to one or more other block-devices, with an asynchronous
	* completion notification.
	*/

	#include <linux/types.h>
	#include <linux/atomic.h>
	#include <linux/blkdev.h>
	#include <linux/fs.h>
	#include <linux/init.h>
	#include <linux/list.h>
	#include <linux/mempool.h>
	#include <linux/module.h>
	#include <linux/pagemap.h>
	#include <linux/slab.h>
	#include <linux/vmalloc.h>
	#include <linux/workqueue.h>
	#include <linux/mutex.h>
	#include <linux/delay.h>
	#include <linux/device-mapper.h>
	#include <linux/dm-kcopyd.h>

	#include "dm-core.h"

	/* Number of sub jobs a large copy is split into (see split_job()). */
	#define SPLIT_COUNT 8
	/* Minimum number of elements requested for each client's job mempool. */
	#define MIN_JOBS 8

	/* Default and maximum sub-job size, in KiB. */
	#define DEFAULT_SUB_JOB_SIZE_KB 512
	#define MAX_SUB_JOB_SIZE_KB 1024

	/* Sub-job size tunable in KiB. */
	static unsigned int kcopyd_subjob_size_kb = DEFAULT_SUB_JOB_SIZE_KB;

	module_param(kcopyd_subjob_size_kb, uint, 0644);
	MODULE_PARM_DESC(kcopyd_subjob_size_kb, "Sub-job size for dm-kcopyd clients");

	/*
	 * Return the sub-job size for a new client: the KiB tunable, passed
	 * through __dm_get_module_param() together with its default and
	 * maximum, then doubled (presumably KiB to 512-byte sectors - confirm).
	 */
	static unsigned int dm_get_kcopyd_subjob_size(void)
	{
		unsigned int size_kb = __dm_get_module_param(&kcopyd_subjob_size_kb,
							     DEFAULT_SUB_JOB_SIZE_KB,
							     MAX_SUB_JOB_SIZE_KB);

		return size_kb << 1;
	}

	/*
	*----------------------------------------------------------------
	* Each kcopyd client has its own little pool of preallocated
	* pages for kcopyd io.
	*---------------------------------------------------------------
	*/
	struct dm_kcopyd_client {
	/* Singly linked list of reserved pages that are currently free. */
	struct page_list *pages;
	/* Number of pages this client keeps in reserve. */
	unsigned int nr_reserved_pages;
	/* Number of entries currently on the 'pages' list. */
	unsigned int nr_free_pages;
	/* Largest amount copied by a single job. */
	unsigned int sub_job_size;

	struct dm_io_client *io_client;

	/* Woken when nr_jobs drops to zero. */
	wait_queue_head_t destroyq;

	/* Pool that jobs are allocated from. */
	mempool_t job_pool;

	/* Workqueue and work item used by wake(). */
	struct workqueue_struct *kcopyd_wq;
	struct work_struct kcopyd_work;

	/* Optional throttle; NULL means no throttle accounting is done. */
	struct dm_kcopyd_throttle *throttle;

	/* Jobs whose callback has not run yet. */
	atomic_t nr_jobs;

	/*
	* We maintain four lists of jobs:
	*
	* i) jobs waiting for pages
	* ii) jobs that have pages, and are waiting for the io to be issued.
	* iii) jobs that don't need to do any IO and just run a callback
	* iv) jobs that have completed.
	*
	* All four of these are protected by job_lock.
	*/
	spinlock_t job_lock;
	struct list_head callback_jobs;
	struct list_head complete_jobs;
	struct list_head io_jobs;
	struct list_head pages_jobs;
	};

	/* Self-linked entry wrapping the zero page; set up in dm_kcopyd_init(). */
	static struct page_list zero_page_list;

	/* Held around the throttle accounting in io_job_start()/io_job_finish(). */
	static DEFINE_SPINLOCK(throttle_spinlock);

	/*
	* IO/IDLE accounting slowly decays after (1 << ACCOUNT_INTERVAL_SHIFT) period.
	* When total_period >= (1 << ACCOUNT_INTERVAL_SHIFT) the counters are divided
	* by 2.
	*/
	#define ACCOUNT_INTERVAL_SHIFT SHIFT_HZ

	/*
	* Sleep this number of microseconds.
	*
	* The value was decided experimentally.
	* Smaller values seem to cause an increased copy rate above the limit.
	* The reason for this is unknown but possibly due to jiffies rounding errors
	* or read/write cache inside the disk.
	*/
	#define SLEEP_USEC 100000

	/*
	* Maximum number of sleep events. There is a theoretical livelock if more
	* kcopyd clients do work simultaneously which this limit avoids.
	*/
	#define MAX_SLEEPS 10

	/*
	 * Called before an IO job is submitted.  When the throttle value is
	 * below 100, bring the IO/total period accounting up to date and sleep
	 * (at most MAX_SLEEPS times, SLEEP_USEC each) while the time spent
	 * doing IO exceeds throttle percent of the total.  Finally count the
	 * job as in flight.  A NULL @t is a no-op.  May sleep.
	 */
	static void io_job_start(struct dm_kcopyd_throttle *t)
	{
	unsigned int throttle, now, difference;
	int slept = 0, skew;

	if (unlikely(!t))
	return;

	try_again:
	spin_lock_irq(&throttle_spinlock);

	throttle = READ_ONCE(t->throttle);

	/* 100 or more means no limit: skip the accounting entirely. */
	if (likely(throttle >= 100))
	goto skip_limit;

	/* Charge the time since the last update; IO time only if jobs were in flight. */
	now = jiffies;
	difference = now - t->last_jiffies;
	t->last_jiffies = now;
	if (t->num_io_jobs)
	t->io_period += difference;
	t->total_period += difference;

	/*
	* Maintain sane values if we got a temporary overflow.
	*/
	if (unlikely(t->io_period > t->total_period))
	t->io_period = t->total_period;

	/* Decay: scale both counters so total_period falls below the interval. */
	if (unlikely(t->total_period >= (1 << ACCOUNT_INTERVAL_SHIFT))) {
	int shift = fls(t->total_period >> ACCOUNT_INTERVAL_SHIFT);

	t->total_period >>= shift;
	t->io_period >>= shift;
	}

	/* Positive skew: more IO time than the throttle percentage allows. */
	skew = t->io_period - throttle * t->total_period / 100;

	if (unlikely(skew > 0) && slept < MAX_SLEEPS) {
	slept++;
	/* Drop the lock before sleeping, then re-evaluate from the top. */
	spin_unlock_irq(&throttle_spinlock);
	fsleep(SLEEP_USEC);
	goto try_again;
	}

	skip_limit:
	t->num_io_jobs++;

	spin_unlock_irq(&throttle_spinlock);
	}

	/*
	 * Called when an IO job has completed: drop the in-flight count and, if
	 * throttling is active and this was the last job in flight, charge the
	 * elapsed time to both the IO and the total period.
	 * A NULL @t is a no-op.
	 */
	static void io_job_finish(struct dm_kcopyd_throttle *t)
	{
		unsigned long irq_flags;
		unsigned int now, elapsed;

		if (unlikely(!t))
			return;

		spin_lock_irqsave(&throttle_spinlock, irq_flags);

		t->num_io_jobs--;

		if (unlikely(READ_ONCE(t->throttle) < 100) && !t->num_io_jobs) {
			now = jiffies;
			elapsed = now - t->last_jiffies;
			t->last_jiffies = now;

			t->io_period += elapsed;
			t->total_period += elapsed;

			/* Keep io_period <= total_period after a temporary overflow. */
			if (unlikely(t->io_period > t->total_period))
				t->io_period = t->total_period;
		}

		spin_unlock_irqrestore(&throttle_spinlock, irq_flags);
	}


	/* Queue the client's work item on its workqueue. */
	static void wake(struct dm_kcopyd_client *kc)
	{
	queue_work(kc->kcopyd_wq, &kc->kcopyd_work);
	}

	/*
	 * Obtain one page for the use of kcopyd.
	 *
	 * Allocates a page_list entry together with its page (highmem allowed)
	 * using @gfp.  Returns NULL if either allocation fails.  The 'next'
	 * member of the returned entry is left uninitialised.
	 */
	static struct page_list *alloc_pl(gfp_t gfp)
	{
		struct page_list *pl;

		pl = kmalloc(sizeof(*pl), gfp);
		if (!pl)
			return NULL;

		/* Plain bitwise OR; a stray backslash here does not compile. */
		pl->page = alloc_page(gfp | __GFP_HIGHMEM);
		if (!pl->page) {
			kfree(pl);
			return NULL;
		}

		return pl;
	}

	/* Release one page_list entry: first its page, then the entry itself. */
	static void free_pl(struct page_list *pl)
	{
	__free_page(pl->page);
	kfree(pl);
	}

	/*
	* Add the provided pages to a client's free page list, releasing
	* back to the system any beyond the reserved_pages limit.
	*/
	static void kcopyd_put_pages(struct dm_kcopyd_client kc, struct page_list pl)
	{
	struct page_list *next;

	do {
	next = pl->next;

	if (kc->nr_free_pages >= kc->nr_reserved_pages)
	free_pl(pl);
	else {
	pl->next = kc->pages;
	kc->pages = pl;
	kc->nr_free_pages++;
	}

	pl = next;
	} while (pl);
	}

	/*
	 * Build a list of @nr pages in *@pages.
	 *
	 * Each page is first tried with a non-retrying, non-warning allocation;
	 * if that fails, one is taken from the client's reserve.  If the reserve
	 * is empty as well, the pages gathered so far are handed back through
	 * kcopyd_put_pages() and -ENOMEM is returned.  Returns 0 on success.
	 *
	 * @nr must be at least 1: the loop body runs before the count is tested.
	 */
	static int kcopyd_get_pages(struct dm_kcopyd_client *kc,
				    unsigned int nr, struct page_list **pages)
	{
		struct page_list *pl;

		*pages = NULL;

		do {
			pl = alloc_pl(__GFP_NOWARN | __GFP_NORETRY | __GFP_KSWAPD_RECLAIM);
			if (unlikely(!pl)) {
				/* Use reserved pages */
				pl = kc->pages;
				if (unlikely(!pl))
					goto out_of_memory;
				kc->pages = pl->next;
				kc->nr_free_pages--;
			}
			pl->next = *pages;
			*pages = pl;
		} while (--nr);

		return 0;

	out_of_memory:
		if (*pages)
			kcopyd_put_pages(kc, *pages);
		return -ENOMEM;
	}

	/*
	* These three functions resize the page pool.
	*/
	/* Free every entry of the list starting at @pl; a NULL list is a no-op. */
	static void drop_pages(struct page_list *pl)
	{
		struct page_list *following;

		for (; pl; pl = following) {
			/* Fetch the link first: free_pl() releases the entry. */
			following = pl->next;
			free_pl(pl);
		}
	}

	/*
	* Allocate and reserve nr_pages for the use of a specific client.
	*/
	static int client_reserve_pages(struct dm_kcopyd_client *kc, unsigned int nr_pages)
	{
	unsigned int i;
	struct page_list pl = NULL, next;

	for (i = 0; i < nr_pages; i++) {
	next = alloc_pl(GFP_KERNEL);
	if (!next) {
	if (pl)
	drop_pages(pl);
	return -ENOMEM;
	}
	next->next = pl;
	pl = next;
	}

	kc->nr_reserved_pages += nr_pages;
	kcopyd_put_pages(kc, pl);

	return 0;
	}

	/*
	 * Free the client's whole page reserve and reset the counters.
	 * It is a BUG if any reserved page is still in use at this point.
	 */
	static void client_free_pages(struct dm_kcopyd_client *kc)
	{
		BUG_ON(kc->nr_free_pages != kc->nr_reserved_pages);

		drop_pages(kc->pages);
		kc->pages = NULL;

		kc->nr_reserved_pages = 0;
		kc->nr_free_pages = 0;
	}

	/*
	*---------------------------------------------------------------
	* kcopyd_jobs need to be allocated by the clients of kcopyd,
	* for this reason we use a mempool to prevent the client from
	* ever having to do io (which could cause a deadlock).
	*---------------------------------------------------------------
	*/
	struct kcopyd_job {
	/* Owning client. */
	struct dm_kcopyd_client *kc;
	/* Links the job into one of the client's job lists. */
	struct list_head list;
	/* DM_KCOPYD_* flag bits (tested with BIT()). */
	unsigned int flags;

	/*
	* Error state of the job.
	*/
	int read_err;
	unsigned long write_err;

	/*
	* REQ_OP_READ, REQ_OP_WRITE or REQ_OP_WRITE_ZEROES.
	*/
	enum req_op op;
	struct dm_io_region source;

	/*
	* The destinations for the transfer.
	*/
	unsigned int num_dests;
	struct dm_io_region dests[DM_KCOPYD_MAX_REGIONS];

	/* Data pages for the transfer; may be &zero_page_list. */
	struct page_list *pages;

	/*
	* Set this to ensure you are notified when the job has
	* completed. 'context' is for callback to use.
	*/
	dm_kcopyd_notify_fn fn;
	void *context;

	/*
	* These fields are only used if the job has been split
	* into more manageable parts.
	*/
	struct mutex lock;
	atomic_t sub_jobs;
	sector_t progress;
	/* Compared against the master's write_offset in pop_io_job(). */
	sector_t write_offset;

	/* The master job this job belongs to. */
	struct kcopyd_job *master_job;
	};

	/* Slab cache; each object holds one master job plus SPLIT_COUNT sub jobs. */
	static struct kmem_cache *_job_cache;

	/*
	 * Module-wide setup: create the job slab cache and initialise the shared
	 * zero page list.  Returns 0 on success or -ENOMEM if the cache cannot
	 * be created.
	 */
	int __init dm_kcopyd_init(void)
	{
		/* One master job followed by SPLIT_COUNT sub jobs per object. */
		const size_t job_array_size =
			sizeof(struct kcopyd_job) * (SPLIT_COUNT + 1);

		_job_cache = kmem_cache_create("kcopyd_job", job_array_size,
					       __alignof__(struct kcopyd_job), 0, NULL);
		if (_job_cache) {
			/* The entry links to itself, so the list never runs out. */
			zero_page_list.next = &zero_page_list;
			zero_page_list.page = ZERO_PAGE(0);
			return 0;
		}

		return -ENOMEM;
	}

	/* Module-wide teardown: destroy the job slab cache and clear the pointer. */
	void dm_kcopyd_exit(void)
	{
	kmem_cache_destroy(_job_cache);
	_job_cache = NULL;
	}

	/*
	* Functions to pop a job from a given job list, and to push a job
	* onto its tail (push) or its head (push_head).
	*/
	/*
	 * Pick the first job on @jobs that may be issued now, unlink it and
	 * return a pointer to it; return NULL if none qualifies.  Called from
	 * pop() with job_lock held.
	 */
	static struct kcopyd_job *pop_io_job(struct list_head *jobs,
					     struct dm_kcopyd_client *kc)
	{
		struct kcopyd_job *job;

		/*
		 * For I/O jobs, pop any read, any write without sequential write
		 * constraint and sequential writes that are at the right position.
		 */
		list_for_each_entry(job, jobs, list) {
			if (job->op == REQ_OP_READ ||
			    !(job->flags & BIT(DM_KCOPYD_WRITE_SEQ))) {
				list_del(&job->list);
				return job;
			}

			if (job->write_offset == job->master_job->write_offset) {
				/* Advance the master past this chunk. */
				job->master_job->write_offset += job->source.count;
				list_del(&job->list);
				return job;
			}
		}

		return NULL;
	}

	static struct kcopyd_job pop(struct list_head jobs,
	struct dm_kcopyd_client *kc)
	{
	struct kcopyd_job *job = NULL;

	spin_lock_irq(&kc->job_lock);

	if (!list_empty(jobs)) {
	if (jobs == &kc->io_jobs)
	job = pop_io_job(jobs, kc);
	else {
	job = list_entry(jobs->next, struct kcopyd_job, list);
	list_del(&job->list);
	}
	}
	spin_unlock_irq(&kc->job_lock);

	return job;
	}

	static void push(struct list_head jobs, struct kcopyd_job job)
	{
	unsigned long flags;
	struct dm_kcopyd_client *kc = job->kc;

	spin_lock_irqsave(&kc->job_lock, flags);
	list_add_tail(&job->list, jobs);
	spin_unlock_irqrestore(&kc->job_lock, flags);
	}


	static void push_head(struct list_head jobs, struct kcopyd_job job)
	{
	struct dm_kcopyd_client *kc = job->kc;

	spin_lock_irq(&kc->job_lock);
	list_add(&job->list, jobs);
	spin_unlock_irq(&kc->job_lock);
	}

	/*
	* These three functions process 1 item from the corresponding
	* job list.
	*
	* They return:
	* < 0: error
	* 0: success
	* > 0: can't process yet.
	*/
	static int run_complete_job(struct kcopyd_job *job)
	{
	void *context = job->context;
	int read_err = job->read_err;
	unsigned long write_err = job->write_err;
	dm_kcopyd_notify_fn fn = job->fn;
	struct dm_kcopyd_client *kc = job->kc;

	if (job->pages && job->pages != &zero_page_list)
	kcopyd_put_pages(kc, job->pages);
	/*
	* If this is the master job, the sub jobs have already
	* completed so we can free everything.
	*/
	if (job->master_job == job) {
	mutex_destroy(&job->lock);
	mempool_free(job, &kc->job_pool);
	}
	fn(read_err, write_err, context);

	if (atomic_dec_and_test(&kc->nr_jobs))
	wake_up(&kc->destroyq);

	cond_resched();

	return 0;
	}

	static void complete_io(unsigned long error, void *context)
	{
	struct kcopyd_job *job = context;
	struct dm_kcopyd_client *kc = job->kc;

	io_job_finish(kc->throttle);

	if (error) {
	if (op_is_write(job->op))
	job->write_err \|= error;
	else
	job->read_err = 1;

	if (!(job->flags & BIT(DM_KCOPYD_IGNORE_ERROR))) {
	push(&kc->complete_jobs, job);
	wake(kc);
	return;
	}
	}

	if (op_is_write(job->op))
	push(&kc->complete_jobs, job);

	else {
	job->op = REQ_OP_WRITE;
	push(&kc->io_jobs, job);
	}

	wake(kc);
	}

	/*
	* Request io on as many buffer heads as we can currently get for
	* a particular job.
	*/
	/*
	 * Issue the io for one job through dm-io: a read of the source region,
	 * or a write to all destination regions.  Completion is reported
	 * asynchronously through complete_io().
	 *
	 * Returns the dm_io() result, or -EIO without issuing anything when a
	 * sequential-write job's master has already recorded a write error.
	 */
	static int run_io_job(struct kcopyd_job *job)
	{
		struct dm_io_region *regions;
		unsigned int nr_regions;
		struct dm_io_request io_req = {
			.bi_opf = job->op,
			.mem.type = DM_IO_PAGE_LIST,
			.mem.ptr.pl = job->pages,
			.mem.offset = 0,
			.notify.fn = complete_io,
			.notify.context = job,
			.client = job->kc->io_client,
		};

		/*
		 * If we need to write sequentially and some reads or writes failed,
		 * no point in continuing.
		 */
		if ((job->flags & BIT(DM_KCOPYD_WRITE_SEQ)) &&
		    job->master_job->write_err) {
			job->write_err = job->master_job->write_err;
			return -EIO;
		}

		io_job_start(job->kc->throttle);

		/* A read has exactly one region; a write covers every destination. */
		if (job->op == REQ_OP_READ) {
			nr_regions = 1;
			regions = &job->source;
		} else {
			nr_regions = job->num_dests;
			regions = job->dests;
		}

		return dm_io(&io_req, nr_regions, regions, NULL);
	}

	/*
	 * Try to obtain the pages a job needs, sized from its first
	 * destination region, and move it to the io list on success.
	 *
	 * Returns 0 when the job was queued for io, 1 when no pages are
	 * available yet (-ENOMEM from kcopyd_get_pages()), or any other error
	 * code unchanged.
	 */
	static int run_pages_job(struct kcopyd_job *job)
	{
		/* PAGE_SIZE >> 9 is the number of 512-byte sectors per page. */
		unsigned int pages_needed = dm_div_up(job->dests[0].count,
						      PAGE_SIZE >> 9);
		int err = kcopyd_get_pages(job->kc, pages_needed, &job->pages);

		if (err == -ENOMEM)
			return 1;	/* can't complete now */

		if (err)
			return err;

		/* this job is ready for io */
		push(&job->kc->io_jobs, job);
		return 0;
	}

	/*
	* Run through a list for as long as possible. Returns the count
	* of successful jobs.
	*/
	static int process_jobs(struct list_head jobs, struct dm_kcopyd_client kc,
	int (fn)(struct kcopyd_job ))
	{
	struct kcopyd_job *job;
	int r, count = 0;

	while ((job = pop(jobs, kc))) {

	r = fn(job);

	if (r < 0) {
	/* error this rogue job */
	if (op_is_write(job->op))
	job->write_err = (unsigned long) -1L;
	else
	job->read_err = 1;
	push(&kc->complete_jobs, job);
	wake(kc);
	break;
	}

	if (r > 0) {
	/*
	* We couldn't service this job ATM, so
	* push this job back onto the list.
	*/
	push_head(jobs, job);
	break;
	}

	count++;
	}

	return count;
	}

	/*
	* kcopyd does this every time it's woken up.
	*/
	static void do_work(struct work_struct *work)
	{
	struct dm_kcopyd_client *kc = container_of(work,
	struct dm_kcopyd_client, kcopyd_work);
	struct blk_plug plug;

	/*
	* The order that these are called is very important.
	* complete jobs can free some pages for pages jobs.
	* Pages jobs when successful will jump onto the io jobs
	* list. io jobs call wake when they complete and it all
	* starts again.
	*/
	spin_lock_irq(&kc->job_lock);
	list_splice_tail_init(&kc->callback_jobs, &kc->complete_jobs);
	spin_unlock_irq(&kc->job_lock);

	blk_start_plug(&plug);
	process_jobs(&kc->complete_jobs, kc, run_complete_job);
	process_jobs(&kc->pages_jobs, kc, run_pages_job);
	process_jobs(&kc->io_jobs, kc, run_io_job);
	blk_finish_plug(&plug);
	}

	/*
	* If we are copying a small region we just dispatch a single job
	* to do the copy, otherwise the io has to be split up into many
	* jobs.
	*/
	/*
	 * Account a job against its client and queue it on the list matching
	 * its first processing step, then wake the client.
	 */
	static void dispatch_job(struct kcopyd_job *job)
	{
		struct dm_kcopyd_client *kc = job->kc;
		struct list_head *target;

		atomic_inc(&kc->nr_jobs);

		if (unlikely(!job->source.count)) {
			/* Nothing to transfer: only the callback is left to run. */
			target = &kc->callback_jobs;
		} else if (job->pages == &zero_page_list) {
			/* Zeroing uses the shared zero page list: no pages to fetch. */
			target = &kc->io_jobs;
		} else {
			target = &kc->pages_jobs;
		}

		push(target, job);
		wake(kc);
	}

	static void segment_complete(int read_err, unsigned long write_err,
	void *context)
	{
	/* FIXME: tidy this function */
	sector_t progress = 0;
	sector_t count = 0;
	struct kcopyd_job *sub_job = context;
	struct kcopyd_job *job = sub_job->master_job;
	struct dm_kcopyd_client *kc = job->kc;

	mutex_lock(&job->lock);

	/* update the error */
	if (read_err)
	job->read_err = 1;

	if (write_err)
	job->write_err \|= write_err;

	/*
	* Only dispatch more work if there hasn't been an error.
	*/
	if ((!job->read_err && !job->write_err) \|\|
	job->flags & BIT(DM_KCOPYD_IGNORE_ERROR)) {
	/* get the next chunk of work */
	progress = job->progress;
	count = job->source.count - progress;
	if (count) {
	if (count > kc->sub_job_size)
	count = kc->sub_job_size;

	job->progress += count;
	}
	}
	mutex_unlock(&job->lock);

	if (count) {
	int i;

	sub_job = job;
	sub_job->write_offset = progress;
	sub_job->source.sector += progress;
	sub_job->source.count = count;

	for (i = 0; i < job->num_dests; i++) {
	sub_job->dests[i].sector += progress;
	sub_job->dests[i].count = count;
	}

	sub_job->fn = segment_complete;
	sub_job->context = sub_job;
	dispatch_job(sub_job);

	} else if (atomic_dec_and_test(&job->sub_jobs)) {

	/*
	* Queue the completion callback to the kcopyd thread.
	*
	* Some callers assume that all the completions are called
	* from a single thread and don't race with each other.
	*
	* We must not call the callback directly here because this
	* code may not be executing in the thread.
	*/
	push(&kc->complete_jobs, job);
	wake(kc);
	}
	}

	/*
	* Create some sub jobs to share the work between them.
	*/
	/*
	 * Start a large job as SPLIT_COUNT sub jobs.  The sub job slots are the
	 * array elements that follow @master_job; each is tied to the master
	 * and started through segment_complete() with no errors reported.
	 */
	static void split_job(struct kcopyd_job *master_job)
	{
		int slot;

		/* The master job counts as one outstanding job on the client. */
		atomic_inc(&master_job->kc->nr_jobs);

		atomic_set(&master_job->sub_jobs, SPLIT_COUNT);

		for (slot = 1; slot <= SPLIT_COUNT; slot++) {
			struct kcopyd_job *sub_job = &master_job[slot];

			sub_job->master_job = master_job;
			segment_complete(0, 0u, sub_job);
		}
	}

	/*
	 * Submit an asynchronous copy of one region to one or more regions.
	 *
	 * @kc:        client to run the job on.
	 * @from:      source region, or NULL to zero the destinations instead.
	 * @num_dests: number of entries in @dests.
	 * @dests:     destination regions; copied into the job.
	 * @flags:     BIT(DM_KCOPYD_*) flags.  DM_KCOPYD_WRITE_SEQ is forced on
	 *             if any destination is zoned, and it clears
	 *             DM_KCOPYD_IGNORE_ERROR.
	 * @fn:        completion callback.
	 * @context:   opaque argument passed to @fn.
	 *
	 * Requests no larger than the client's sub_job_size are dispatched as a
	 * single job; larger ones are split into sub jobs.
	 */
	void dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
			    unsigned int num_dests, struct dm_io_region *dests,
			    unsigned int flags, dm_kcopyd_notify_fn fn, void *context)
	{
		struct kcopyd_job *job;
		int i;

		/*
		 * Allocate an array of jobs consisting of one master job
		 * followed by SPLIT_COUNT sub jobs.
		 */
		job = mempool_alloc(&kc->job_pool, GFP_NOIO);
		mutex_init(&job->lock);

		/*
		 * set up for the read.
		 */
		job->kc = kc;
		job->flags = flags;
		job->read_err = 0;
		job->write_err = 0;

		job->num_dests = num_dests;
		/* Copy num_dests region descriptors, not the size of a pointer. */
		memcpy(&job->dests, dests, sizeof(*dests) * num_dests);

		/*
		 * If one of the destination is a host-managed zoned block device,
		 * we need to write sequentially. If one of the destination is a
		 * host-aware device, then leave it to the caller to choose what to do.
		 */
		if (!(job->flags & BIT(DM_KCOPYD_WRITE_SEQ))) {
			for (i = 0; i < job->num_dests; i++) {
				if (bdev_is_zoned(dests[i].bdev)) {
					job->flags |= BIT(DM_KCOPYD_WRITE_SEQ);
					break;
				}
			}
		}

		/*
		 * If we need to write sequentially, errors cannot be ignored.
		 */
		if (job->flags & BIT(DM_KCOPYD_WRITE_SEQ) &&
		    job->flags & BIT(DM_KCOPYD_IGNORE_ERROR))
			job->flags &= ~BIT(DM_KCOPYD_IGNORE_ERROR);

		if (from) {
			job->source = *from;
			job->pages = NULL;
			job->op = REQ_OP_READ;
		} else {
			memset(&job->source, 0, sizeof(job->source));
			job->source.count = job->dests[0].count;
			job->pages = &zero_page_list;

			/*
			 * Use WRITE ZEROES to optimize zeroing if all dests support it.
			 */
			job->op = REQ_OP_WRITE_ZEROES;
			for (i = 0; i < job->num_dests; i++)
				if (!bdev_write_zeroes_sectors(job->dests[i].bdev)) {
					job->op = REQ_OP_WRITE;
					break;
				}
		}

		job->fn = fn;
		job->context = context;
		job->master_job = job;
		job->write_offset = 0;

		if (job->source.count <= kc->sub_job_size)
			dispatch_job(job);
		else {
			job->progress = 0;
			split_job(job);
		}
	}
	EXPORT_SYMBOL(dm_kcopyd_copy);

	/*
	* Zero the given destination regions.  Implemented as dm_kcopyd_copy()
	* with a NULL source; every other argument is passed through unchanged.
	*/
	void dm_kcopyd_zero(struct dm_kcopyd_client *kc,
	unsigned int num_dests, struct dm_io_region *dests,
	unsigned int flags, dm_kcopyd_notify_fn fn, void *context)
	{
	dm_kcopyd_copy(kc, NULL, num_dests, dests, flags, fn, context);
	}
	EXPORT_SYMBOL(dm_kcopyd_zero);

	/*
	 * Allocate a job that carries only a completion callback, to be fired
	 * later with dm_kcopyd_do_callback().
	 *
	 * @kc:      client the callback will run on.
	 * @fn:      callback to invoke.
	 * @context: opaque argument passed to @fn.
	 *
	 * Returns an opaque handle (the job) for dm_kcopyd_do_callback().  The
	 * job is counted in kc->nr_jobs from this point on.
	 */
	void *dm_kcopyd_prepare_callback(struct dm_kcopyd_client *kc,
					 dm_kcopyd_notify_fn fn, void *context)
	{
		struct kcopyd_job *job;

		job = mempool_alloc(&kc->job_pool, GFP_NOIO);

		/* Only the master job slot is used, so only it is cleared. */
		memset(job, 0, sizeof(struct kcopyd_job));
		job->kc = kc;
		job->fn = fn;
		job->context = context;
		job->master_job = job;

		atomic_inc(&kc->nr_jobs);

		return job;
	}
	EXPORT_SYMBOL(dm_kcopyd_prepare_callback);

	/*
	 * Fire a callback-only job: record the error values to report and
	 * queue the job on the client's callback list.
	 *
	 * @j:         job handle, treated as a struct kcopyd_job.
	 * @read_err:  read error value to pass to the callback.
	 * @write_err: write error value to pass to the callback.
	 */
	void dm_kcopyd_do_callback(void *j, int read_err, unsigned long write_err)
	{
		struct kcopyd_job *cb_job = j;
		/* Read the client now; the job is not touched again once queued. */
		struct dm_kcopyd_client *client = cb_job->kc;

		cb_job->write_err = write_err;
		cb_job->read_err = read_err;

		push(&client->callback_jobs, cb_job);
		wake(client);
	}
	EXPORT_SYMBOL(dm_kcopyd_do_callback);

	/*
	* Cancels a kcopyd job, eg. someone might be deactivating a
	* mirror.
	*/
	#if 0
	/* Unfinished stub, compiled out by the surrounding #if 0; it only returns -1. */
	int kcopyd_cancel(struct kcopyd_job *job, int block)
	{
	/* FIXME: finish */
	return -1;
	}
	#endif /* 0 */

	/*
	*---------------------------------------------------------------
	* Client setup
	*---------------------------------------------------------------
	*/
	struct dm_kcopyd_client dm_kcopyd_client_create(struct dm_kcopyd_throttle throttle)
	{
	int r;
	unsigned int reserve_pages;
	struct dm_kcopyd_client *kc;

	kc = kzalloc(sizeof(*kc), GFP_KERNEL);
	if (!kc)
	return ERR_PTR(-ENOMEM);

	spin_lock_init(&kc->job_lock);
	INIT_LIST_HEAD(&kc->callback_jobs);
	INIT_LIST_HEAD(&kc->complete_jobs);
	INIT_LIST_HEAD(&kc->io_jobs);
	INIT_LIST_HEAD(&kc->pages_jobs);
	kc->throttle = throttle;

	r = mempool_init_slab_pool(&kc->job_pool, MIN_JOBS, _job_cache);
	if (r)
	goto bad_slab;

	INIT_WORK(&kc->kcopyd_work, do_work);
	kc->kcopyd_wq = alloc_workqueue("kcopyd", WQ_MEM_RECLAIM, 0);
	if (!kc->kcopyd_wq) {
	r = -ENOMEM;
	goto bad_workqueue;
	}

	kc->sub_job_size = dm_get_kcopyd_subjob_size();
	reserve_pages = DIV_ROUND_UP(kc->sub_job_size << SECTOR_SHIFT, PAGE_SIZE);

	kc->pages = NULL;
	kc->nr_reserved_pages = kc->nr_free_pages = 0;
	r = client_reserve_pages(kc, reserve_pages);
	if (r)
	goto bad_client_pages;

	kc->io_client = dm_io_client_create();
	if (IS_ERR(kc->io_client)) {
	r = PTR_ERR(kc->io_client);
	goto bad_io_client;
	}

	init_waitqueue_head(&kc->destroyq);
	atomic_set(&kc->nr_jobs, 0);

	return kc;

	bad_io_client:
	client_free_pages(kc);
	bad_client_pages:
	destroy_workqueue(kc->kcopyd_wq);
	bad_workqueue:
	mempool_exit(&kc->job_pool);
	bad_slab:
	kfree(kc);

	return ERR_PTR(r);
	}
	EXPORT_SYMBOL(dm_kcopyd_client_create);

	/*
	* Tear down a kcopyd client.  Blocks until kc->nr_jobs drops to zero,
	* then releases the workqueue, the dm-io client, the reserved pages,
	* the job mempool and finally 'kc' itself.
	*/
	void dm_kcopyd_client_destroy(struct dm_kcopyd_client *kc)
	{
	/* Wait for completion of all jobs submitted by this client. */
	wait_event(kc->destroyq, !atomic_read(&kc->nr_jobs));

	/* With no jobs outstanding, every job list is expected to be empty. */
	BUG_ON(!list_empty(&kc->callback_jobs));
	BUG_ON(!list_empty(&kc->complete_jobs));
	BUG_ON(!list_empty(&kc->io_jobs));
	BUG_ON(!list_empty(&kc->pages_jobs));
	destroy_workqueue(kc->kcopyd_wq);
	dm_io_client_destroy(kc->io_client);
	client_free_pages(kc);
	mempool_exit(&kc->job_pool);
	kfree(kc);
	}
	EXPORT_SYMBOL(dm_kcopyd_client_destroy);

	/* Flush the client's kcopyd workqueue by calling flush_workqueue() on it. */
	void dm_kcopyd_client_flush(struct dm_kcopyd_client *kc)
	{
	flush_workqueue(kc->kcopyd_wq);
	}
	EXPORT_SYMBOL(dm_kcopyd_client_flush);