| #include "kvm/qcow.h" |
| |
| #include "kvm/disk-image.h" |
| #include "kvm/read-write.h" |
| #include "kvm/mutex.h" |
| #include "kvm/util.h" |
| |
| #include <sys/types.h> |
| #include <sys/stat.h> |
| #include <stdbool.h> |
| #include <stdlib.h> |
| #include <string.h> |
| #include <unistd.h> |
| #include <fcntl.h> |
| #include <errno.h> |
| #ifdef CONFIG_HAS_ZLIB |
| #include <zlib.h> |
| #endif |
| |
| #include <linux/err.h> |
| #include <linux/byteorder.h> |
| #include <linux/kernel.h> |
| #include <linux/types.h> |
| |
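| /* |
| * QCOW1/QCOW2 image support. A guest offset is translated through a |
| * two-level table (L1 -> L2 -> data cluster); QCOW2 additionally keeps |
| * a per-cluster reference count in refcount blocks addressed by a |
| * refcount table. L2 tables and refcount blocks are cached in memory |
| * with a small RB-tree + LRU scheme (see below). |
| */ |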
| static int update_cluster_refcount(struct qcow *q, u64 clust_idx, u16 append); |
| static int qcow_write_refcount_table(struct qcow *q); |
| static u64 qcow_alloc_clusters(struct qcow *q, u64 size, int update_ref); |
| static void qcow_free_clusters(struct qcow *q, u64 clust_start, u64 size); |
| |
| static inline int qcow_pwrite_sync(int fd, |
| void *buf, size_t count, off_t offset) |
| { |
| if (pwrite_in_full(fd, buf, count, offset) < 0) |
| return -1; |
| |
| return fdatasync(fd); |
| } |
| |
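| /* |
| * L2 table cache: cached tables live in an RB tree keyed by the |
| * table's offset within the image file (for lookup) and on an LRU |
| * list capped at MAX_CACHE_NODES (for eviction, see cache_table()). |
| */ |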
| static int l2_table_insert(struct rb_root *root, struct qcow_l2_table *new) |
| { |
| struct rb_node **link = &(root->rb_node), *parent = NULL; |
| u64 offset = new->offset; |
| |
| /* search the tree */ |
| while (*link) { |
| struct qcow_l2_table *t; |
| |
| t = rb_entry(*link, struct qcow_l2_table, node); |
| if (!t) |
| goto error; |
| |
| parent = *link; |
| |
| if (t->offset > offset) |
| link = &(*link)->rb_left; |
| else if (t->offset < offset) |
| link = &(*link)->rb_right; |
| else |
| goto out; |
| } |
| |
| /* add new node */ |
| rb_link_node(&new->node, parent, link); |
| rb_insert_color(&new->node, root); |
| out: |
| return 0; |
| error: |
| return -1; |
| } |
| |
| static struct qcow_l2_table *l2_table_lookup(struct rb_root *root, u64 offset) |
| { |
| struct rb_node *link = root->rb_node; |
| |
| while (link) { |
| struct qcow_l2_table *t; |
| |
| t = rb_entry(link, struct qcow_l2_table, node); |
| if (!t) |
| goto out; |
| |
| if (t->offset > offset) |
| link = link->rb_left; |
| else if (t->offset < offset) |
| link = link->rb_right; |
| else |
| return t; |
| } |
| out: |
| return NULL; |
| } |
| |
| static void l1_table_free_cache(struct qcow_l1_table *l1t) |
| { |
| struct rb_root *r = &l1t->root; |
| struct list_head *pos, *n; |
| struct qcow_l2_table *t; |
| |
| list_for_each_safe(pos, n, &l1t->lru_list) { |
| /* Remove cache table from the list and RB tree */ |
| list_del(pos); |
| t = list_entry(pos, struct qcow_l2_table, list); |
| rb_erase(&t->node, r); |
| |
| /* Free the cached node */ |
| free(t); |
| } |
| } |
| |
| static int qcow_l2_cache_write(struct qcow *q, struct qcow_l2_table *c) |
| { |
| struct qcow_header *header = q->header; |
| u64 size; |
| |
| if (!c->dirty) |
| return 0; |
| |
| size = 1 << header->l2_bits; |
| |
| if (qcow_pwrite_sync(q->fd, c->table, |
| size * sizeof(u64), c->offset) < 0) |
| return -1; |
| |
| c->dirty = 0; |
| |
| return 0; |
| } |
| |
| static int cache_table(struct qcow *q, struct qcow_l2_table *c) |
| { |
| struct qcow_l1_table *l1t = &q->table; |
| struct rb_root *r = &l1t->root; |
| struct qcow_l2_table *lru; |
| |
| if (l1t->nr_cached == MAX_CACHE_NODES) { |
| /* |
| * The node at the head of the list is the least recently used |
| * node. Remove it from the list; it is replaced by the new node. |
| */ |
| lru = list_first_entry(&l1t->lru_list, struct qcow_l2_table, list); |
| |
| /* Remove the node from the cache */ |
| rb_erase(&lru->node, r); |
| list_del_init(&lru->list); |
| l1t->nr_cached--; |
| |
| /* Free the evicted node */ |
| free(lru); |
| } |
| |
| /* Add the new node to the RB tree for fast lookups */ |
| if (l2_table_insert(r, c) < 0) |
| goto error; |
| |
| /* Add to the LRU replacement list */ |
| list_add_tail(&c->list, &l1t->lru_list); |
| l1t->nr_cached++; |
| |
| return 0; |
| error: |
| return -1; |
| } |
| |
| static struct qcow_l2_table *l2_table_search(struct qcow *q, u64 offset) |
| { |
| struct qcow_l1_table *l1t = &q->table; |
| struct qcow_l2_table *l2t; |
| |
| l2t = l2_table_lookup(&l1t->root, offset); |
| if (!l2t) |
| return NULL; |
| |
| /* Update the LRU state by moving the found node to the list tail */ |
| list_move_tail(&l2t->list, &l1t->lru_list); |
| |
| return l2t; |
| } |
| |
| /* Allocate a new node for caching an L2 table */ |
| static struct qcow_l2_table *new_cache_table(struct qcow *q, u64 offset) |
| { |
| struct qcow_header *header = q->header; |
| struct qcow_l2_table *c; |
| u64 l2t_sz; |
| u64 size; |
| |
| l2t_sz = 1 << header->l2_bits; |
| size = sizeof(*c) + l2t_sz * sizeof(u64); |
| c = calloc(1, size); |
| if (!c) |
| goto out; |
| |
| c->offset = offset; |
| RB_CLEAR_NODE(&c->node); |
| INIT_LIST_HEAD(&c->list); |
| out: |
| return c; |
| } |
| |
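| /* |
| * A guest offset is decomposed into three fields: |
| * |
| * offset = [ l1 index | l2 index (l2_bits) | in-cluster offset (cluster_bits) ] |
| * |
| * For example, a QCOW2 image with 64 KiB clusters has cluster_bits = 16 |
| * and l2_bits = 13 (cluster_bits - 3), so each L2 table maps |
| * 2^13 * 64 KiB = 512 MiB of guest address space. |
| */ |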
| static inline u64 get_l1_index(struct qcow *q, u64 offset) |
| { |
| struct qcow_header *header = q->header; |
| |
| return offset >> (header->l2_bits + header->cluster_bits); |
| } |
| |
| static inline u64 get_l2_index(struct qcow *q, u64 offset) |
| { |
| struct qcow_header *header = q->header; |
| |
| return (offset >> (header->cluster_bits)) & ((1 << header->l2_bits)-1); |
| } |
| |
| static inline u64 get_cluster_offset(struct qcow *q, u64 offset) |
| { |
| struct qcow_header *header = q->header; |
| |
| return offset & ((1 << header->cluster_bits)-1); |
| } |
| |
| static struct qcow_l2_table *qcow_read_l2_table(struct qcow *q, u64 offset) |
| { |
| struct qcow_header *header = q->header; |
| struct qcow_l2_table *l2t; |
| u64 size; |
| |
| size = 1 << header->l2_bits; |
| |
| /* search an entry for offset in cache */ |
| l2t = l2_table_search(q, offset); |
| if (l2t) |
| return l2t; |
| |
| /* allocate new node for caching l2 table */ |
| l2t = new_cache_table(q, offset); |
| if (!l2t) |
| goto error; |
| |
| /* table not cached: read from the disk */ |
| if (pread_in_full(q->fd, l2t->table, size * sizeof(u64), offset) < 0) |
| goto error; |
| |
| /* cache the table */ |
| if (cache_table(q, l2t) < 0) |
| goto error; |
| |
| return l2t; |
| error: |
| free(l2t); |
| return NULL; |
| } |
| |
| static int qcow_decompress_buffer(u8 *out_buf, int out_buf_size, |
| const u8 *buf, int buf_size) |
| { |
| #ifdef CONFIG_HAS_ZLIB |
| z_stream strm1, *strm = &strm1; |
| int ret, out_len; |
| |
| memset(strm, 0, sizeof(*strm)); |
| |
| strm->next_in = (u8 *)buf; |
| strm->avail_in = buf_size; |
| strm->next_out = out_buf; |
| strm->avail_out = out_buf_size; |
| |
| ret = inflateInit2(strm, -12); |
| if (ret != Z_OK) |
| return -1; |
| |
| ret = inflate(strm, Z_FINISH); |
| out_len = strm->next_out - out_buf; |
| if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) || |
| out_len != out_buf_size) { |
| inflateEnd(strm); |
| return -1; |
| } |
| |
| inflateEnd(strm); |
| return 0; |
| #else |
| return -1; |
| #endif |
| } |
| |
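| /* |
| * QCOW1 read path. A compressed L2 entry (QCOW1_OFLAG_COMPRESSED set) |
| * stores the compressed data offset in its low bits (cluster_offset_mask) |
| * and the compressed size in the bits above it; the data is inflated |
| * into cluster_cache before the requested range is copied out. |
| */ |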
| static ssize_t qcow1_read_cluster(struct qcow *q, u64 offset, |
| void *dst, u32 dst_len) |
| { |
| struct qcow_header *header = q->header; |
| struct qcow_l1_table *l1t = &q->table; |
| struct qcow_l2_table *l2t; |
| u64 clust_offset; |
| u64 clust_start; |
| u64 l2t_offset; |
| size_t length; |
| u64 l2t_size; |
| u64 l1_idx; |
| u64 l2_idx; |
| u64 coffset; |
| int csize; |
| |
| l1_idx = get_l1_index(q, offset); |
| if (l1_idx >= l1t->table_size) |
| return -1; |
| |
| clust_offset = get_cluster_offset(q, offset); |
| if (clust_offset >= q->cluster_size) |
| return -1; |
| |
| length = q->cluster_size - clust_offset; |
| if (length > dst_len) |
| length = dst_len; |
| |
| mutex_lock(&q->mutex); |
| |
| l2t_offset = be64_to_cpu(l1t->l1_table[l1_idx]); |
| if (!l2t_offset) |
| goto zero_cluster; |
| |
| l2t_size = 1 << header->l2_bits; |
| |
| /* read and cache level 2 table */ |
| l2t = qcow_read_l2_table(q, l2t_offset); |
| if (!l2t) |
| goto out_error; |
| |
| l2_idx = get_l2_index(q, offset); |
| if (l2_idx >= l2t_size) |
| goto out_error; |
| |
| clust_start = be64_to_cpu(l2t->table[l2_idx]); |
| if (clust_start & QCOW1_OFLAG_COMPRESSED) { |
| coffset = clust_start & q->cluster_offset_mask; |
| csize = clust_start >> (63 - q->header->cluster_bits); |
| csize &= (q->cluster_size - 1); |
| |
| if (pread_in_full(q->fd, q->cluster_data, csize, |
| coffset) < 0) |
| goto out_error; |
| |
| if (qcow_decompress_buffer(q->cluster_cache, q->cluster_size, |
| q->cluster_data, csize) < 0) |
| goto out_error; |
| |
| memcpy(dst, q->cluster_cache + clust_offset, length); |
| mutex_unlock(&q->mutex); |
| } else { |
| if (!clust_start) |
| goto zero_cluster; |
| |
| mutex_unlock(&q->mutex); |
| |
| if (pread_in_full(q->fd, dst, length, |
| clust_start + clust_offset) < 0) |
| return -1; |
| } |
| |
| return length; |
| |
| zero_cluster: |
| mutex_unlock(&q->mutex); |
| memset(dst, 0, length); |
| return length; |
| |
| out_error: |
| mutex_unlock(&q->mutex); |
| return -1; |
| } |
| |
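| /* |
| * QCOW2 read path. A compressed L2 entry packs the data offset into the |
| * low csize_shift bits and the compressed size, as a 512-byte sector |
| * count minus one, into the csize_mask field above it; the covering |
| * sectors are read, inflated into cluster_cache, and copied out. |
| */ |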
| static ssize_t qcow2_read_cluster(struct qcow *q, u64 offset, |
| void *dst, u32 dst_len) |
| { |
| struct qcow_header *header = q->header; |
| struct qcow_l1_table *l1t = &q->table; |
| struct qcow_l2_table *l2t; |
| u64 clust_offset; |
| u64 clust_start; |
| u64 l2t_offset; |
| size_t length; |
| u64 l2t_size; |
| u64 l1_idx; |
| u64 l2_idx; |
| u64 coffset; |
| int sector_offset; |
| int nb_csectors; |
| int csize; |
| |
| l1_idx = get_l1_index(q, offset); |
| if (l1_idx >= l1t->table_size) |
| return -1; |
| |
| clust_offset = get_cluster_offset(q, offset); |
| if (clust_offset >= q->cluster_size) |
| return -1; |
| |
| length = q->cluster_size - clust_offset; |
| if (length > dst_len) |
| length = dst_len; |
| |
| mutex_lock(&q->mutex); |
| |
| l2t_offset = be64_to_cpu(l1t->l1_table[l1_idx]); |
| |
| l2t_offset &= ~QCOW2_OFLAG_COPIED; |
| if (!l2t_offset) |
| goto zero_cluster; |
| |
| l2t_size = 1 << header->l2_bits; |
| |
| /* read and cache level 2 table */ |
| l2t = qcow_read_l2_table(q, l2t_offset); |
| if (!l2t) |
| goto out_error; |
| |
| l2_idx = get_l2_index(q, offset); |
| if (l2_idx >= l2t_size) |
| goto out_error; |
| |
| clust_start = be64_to_cpu(l2t->table[l2_idx]); |
| if (clust_start & QCOW2_OFLAG_COMPRESSED) { |
| coffset = clust_start & q->cluster_offset_mask; |
| nb_csectors = ((clust_start >> q->csize_shift) |
| & q->csize_mask) + 1; |
| sector_offset = coffset & (SECTOR_SIZE - 1); |
| csize = nb_csectors * SECTOR_SIZE - sector_offset; |
| |
| if (pread_in_full(q->fd, q->cluster_data, |
| nb_csectors * SECTOR_SIZE, |
| coffset & ~(SECTOR_SIZE - 1)) < 0) { |
| goto out_error; |
| } |
| |
| if (qcow_decompress_buffer(q->cluster_cache, q->cluster_size, |
| q->cluster_data + sector_offset, |
| csize) < 0) { |
| goto out_error; |
| } |
| |
| memcpy(dst, q->cluster_cache + clust_offset, length); |
| mutex_unlock(&q->mutex); |
| } else { |
| clust_start &= QCOW2_OFFSET_MASK; |
| if (!clust_start) |
| goto zero_cluster; |
| |
| mutex_unlock(&q->mutex); |
| |
| if (pread_in_full(q->fd, dst, length, |
| clust_start + clust_offset) < 0) |
| return -1; |
| } |
| |
| return length; |
| |
| zero_cluster: |
| mutex_unlock(&q->mutex); |
| memset(dst, 0, length); |
| return length; |
| |
| out_error: |
| mutex_unlock(&q->mutex); |
| return -1; |
| } |
| |
| static ssize_t qcow_read_sector_single(struct disk_image *disk, u64 sector, |
| void *dst, u32 dst_len) |
| { |
| struct qcow *q = disk->priv; |
| struct qcow_header *header = q->header; |
| u32 nr_read; |
| u64 offset; |
| char *buf; |
| ssize_t nr; |
| |
| buf = dst; |
| nr_read = 0; |
| |
| while (nr_read < dst_len) { |
| offset = sector << SECTOR_SHIFT; |
| if (offset >= header->size) |
| return -1; |
| |
| if (q->version == QCOW1_VERSION) |
| nr = qcow1_read_cluster(q, offset, buf, |
| dst_len - nr_read); |
| else |
| nr = qcow2_read_cluster(q, offset, buf, |
| dst_len - nr_read); |
| |
| if (nr <= 0) |
| return -1; |
| |
| nr_read += nr; |
| buf += nr; |
| sector += (nr >> SECTOR_SHIFT); |
| } |
| |
| return dst_len; |
| } |
| |
| static ssize_t qcow_read_sector(struct disk_image *disk, u64 sector, |
| const struct iovec *iov, int iovcount, void *param) |
| { |
| ssize_t nr, total = 0; |
| |
| while (iovcount--) { |
| nr = qcow_read_sector_single(disk, sector, iov->iov_base, iov->iov_len); |
| if (nr != (ssize_t)iov->iov_len) { |
| pr_info("qcow_read_sector error: nr=%ld iov_len=%ld\n", (long)nr, (long)iov->iov_len); |
| return -1; |
| } |
| |
| sector += iov->iov_len >> SECTOR_SHIFT; |
| total += nr; |
| iov++; |
| } |
| |
| return total; |
| } |
| |
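| /* |
| * QCOW2 refcount handling. The refcount table (rf_table) holds one u64 |
| * file offset per refcount block; each block is a cluster of u16 |
| * per-cluster reference counts. Blocks are cached with the same |
| * RB-tree + LRU scheme used for L2 tables. |
| */ |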
| static void refcount_table_free_cache(struct qcow_refcount_table *rft) |
| { |
| struct rb_root *r = &rft->root; |
| struct list_head *pos, *n; |
| struct qcow_refcount_block *t; |
| |
| list_for_each_safe(pos, n, &rft->lru_list) { |
| list_del(pos); |
| t = list_entry(pos, struct qcow_refcount_block, list); |
| rb_erase(&t->node, r); |
| |
| free(t); |
| } |
| } |
| |
| static int refcount_block_insert(struct rb_root *root, struct qcow_refcount_block *new) |
| { |
| struct rb_node **link = &(root->rb_node), *parent = NULL; |
| u64 offset = new->offset; |
| |
| /* search the tree */ |
| while (*link) { |
| struct qcow_refcount_block *t; |
| |
| t = rb_entry(*link, struct qcow_refcount_block, node); |
| if (!t) |
| goto error; |
| |
| parent = *link; |
| |
| if (t->offset > offset) |
| link = &(*link)->rb_left; |
| else if (t->offset < offset) |
| link = &(*link)->rb_right; |
| else |
| goto out; |
| } |
| |
| /* add new node */ |
| rb_link_node(&new->node, parent, link); |
| rb_insert_color(&new->node, root); |
| out: |
| return 0; |
| error: |
| return -1; |
| } |
| |
| static int write_refcount_block(struct qcow *q, struct qcow_refcount_block *rfb) |
| { |
| if (!rfb->dirty) |
| return 0; |
| |
| if (qcow_pwrite_sync(q->fd, rfb->entries, |
| rfb->size * sizeof(u16), rfb->offset) < 0) |
| return -1; |
| |
| rfb->dirty = 0; |
| |
| return 0; |
| } |
| |
| static int cache_refcount_block(struct qcow *q, struct qcow_refcount_block *c) |
| { |
| struct qcow_refcount_table *rft = &q->refcount_table; |
| struct rb_root *r = &rft->root; |
| struct qcow_refcount_block *lru; |
| |
| if (rft->nr_cached == MAX_CACHE_NODES) { |
| lru = list_first_entry(&rft->lru_list, struct qcow_refcount_block, list); |
| |
| rb_erase(&lru->node, r); |
| list_del_init(&lru->list); |
| rft->nr_cached--; |
| |
| free(lru); |
| } |
| |
| if (refcount_block_insert(r, c) < 0) |
| goto error; |
| |
| list_add_tail(&c->list, &rft->lru_list); |
| rft->nr_cached++; |
| |
| return 0; |
| error: |
| return -1; |
| } |
| |
| static struct qcow_refcount_block *new_refcount_block(struct qcow *q, u64 rfb_offset) |
| { |
| struct qcow_refcount_block *rfb; |
| |
| rfb = malloc(sizeof *rfb + q->cluster_size); |
| if (!rfb) |
| return NULL; |
| |
| rfb->offset = rfb_offset; |
| rfb->size = q->cluster_size / sizeof(u16); |
| RB_CLEAR_NODE(&rfb->node); |
| INIT_LIST_HEAD(&rfb->list); |
| |
| return rfb; |
| } |
| |
| static struct qcow_refcount_block *refcount_block_lookup(struct rb_root *root, u64 offset) |
| { |
| struct rb_node *link = root->rb_node; |
| |
| while (link) { |
| struct qcow_refcount_block *t; |
| |
| t = rb_entry(link, struct qcow_refcount_block, node); |
| if (!t) |
| goto out; |
| |
| if (t->offset > offset) |
| link = link->rb_left; |
| else if (t->offset < offset) |
| link = link->rb_right; |
| else |
| return t; |
| } |
| out: |
| return NULL; |
| } |
| |
| static struct qcow_refcount_block *refcount_block_search(struct qcow *q, u64 offset) |
| { |
| struct qcow_refcount_table *rft = &q->refcount_table; |
| struct qcow_refcount_block *rfb; |
| |
| rfb = refcount_block_lookup(&rft->root, offset); |
| if (!rfb) |
| return NULL; |
| |
| /* Update the LRU state by moving the found node to the list tail */ |
| list_move_tail(&rfb->list, &rft->lru_list); |
| |
| return rfb; |
| } |
| |
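| /* |
| * Allocate and install a new refcount block for clust_idx. The backing |
| * cluster is allocated with update_ref == 0 so the allocation does not |
| * recurse into the refcount block that is still missing; its refcount |
| * is bumped explicitly once the block is hooked into rf_table. |
| */ |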
| static struct qcow_refcount_block *qcow_grow_refcount_block(struct qcow *q, |
| u64 clust_idx) |
| { |
| struct qcow_header *header = q->header; |
| struct qcow_refcount_table *rft = &q->refcount_table; |
| struct qcow_refcount_block *rfb; |
| u64 new_block_offset; |
| u64 rft_idx; |
| |
| rft_idx = clust_idx >> (header->cluster_bits - |
| QCOW_REFCOUNT_BLOCK_SHIFT); |
| |
| if (rft_idx >= rft->rf_size) { |
| pr_warning("Don't support grow refcount block table"); |
| return NULL; |
| } |
| |
| new_block_offset = qcow_alloc_clusters(q, q->cluster_size, 0); |
| if (new_block_offset == (u64)-1) |
| return NULL; |
| |
| rfb = new_refcount_block(q, new_block_offset); |
| if (!rfb) |
| return NULL; |
| |
| memset(rfb->entries, 0x00, q->cluster_size); |
| rfb->dirty = 1; |
| |
| /* write refcount block */ |
| if (write_refcount_block(q, rfb) < 0) |
| goto free_rfb; |
| |
| if (cache_refcount_block(q, rfb) < 0) |
| goto free_rfb; |
| |
| rft->rf_table[rft_idx] = cpu_to_be64(new_block_offset); |
| if (update_cluster_refcount(q, new_block_offset >> |
| header->cluster_bits, 1) < 0) |
| goto recover_rft; |
| |
| if (qcow_write_refcount_table(q) < 0) |
| goto recover_rft; |
| |
| return rfb; |
| |
| recover_rft: |
| rft->rf_table[rft_idx] = 0; |
| free_rfb: |
| free(rfb); |
| return NULL; |
| } |
| |
| static struct qcow_refcount_block *qcow_read_refcount_block(struct qcow *q, u64 clust_idx) |
| { |
| struct qcow_header *header = q->header; |
| struct qcow_refcount_table *rft = &q->refcount_table; |
| struct qcow_refcount_block *rfb; |
| u64 rfb_offset; |
| u64 rft_idx; |
| |
| rft_idx = clust_idx >> (header->cluster_bits - QCOW_REFCOUNT_BLOCK_SHIFT); |
| if (rft_idx >= rft->rf_size) |
| return ERR_PTR(-ENOSPC); |
| |
| rfb_offset = be64_to_cpu(rft->rf_table[rft_idx]); |
| if (!rfb_offset) |
| return ERR_PTR(-ENOSPC); |
| |
| rfb = refcount_block_search(q, rfb_offset); |
| if (rfb) |
| return rfb; |
| |
| rfb = new_refcount_block(q, rfb_offset); |
| if (!rfb) |
| return NULL; |
| |
| if (pread_in_full(q->fd, rfb->entries, rfb->size * sizeof(u16), rfb_offset) < 0) |
| goto error_free_rfb; |
| |
| if (cache_refcount_block(q, rfb) < 0) |
| goto error_free_rfb; |
| |
| return rfb; |
| |
| error_free_rfb: |
| free(rfb); |
| |
| return NULL; |
| } |
| |
| static u16 qcow_get_refcount(struct qcow *q, u64 clust_idx) |
| { |
| struct qcow_refcount_block *rfb = NULL; |
| struct qcow_header *header = q->header; |
| u64 rfb_idx; |
| |
| rfb = qcow_read_refcount_block(q, clust_idx); |
| if (PTR_ERR(rfb) == -ENOSPC) |
| return 0; |
| else if (IS_ERR_OR_NULL(rfb)) { |
| pr_warning("Error while reading refcount table"); |
| return -1; |
| } |
| |
| rfb_idx = clust_idx & (((1ULL << |
| (header->cluster_bits - QCOW_REFCOUNT_BLOCK_SHIFT)) - 1)); |
| |
| if (rfb_idx >= rfb->size) { |
| pr_warning("L1: refcount block index out of bounds"); |
| return -1; |
| } |
| |
| return be16_to_cpu(rfb->entries[rfb_idx]); |
| } |
| |
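| /* |
| * Adjust the refcount of clust_idx by 'append'. Note that append is a |
| * u16 delta: callers pass 1 to take a reference and (u16)-1 to drop |
| * one, relying on 16-bit wraparound (see qcow_free_clusters()). |
| */ |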
| static int update_cluster_refcount(struct qcow *q, u64 clust_idx, u16 append) |
| { |
| struct qcow_refcount_block *rfb = NULL; |
| struct qcow_header *header = q->header; |
| u16 refcount; |
| u64 rfb_idx; |
| |
| rfb = qcow_read_refcount_block(q, clust_idx); |
| if (PTR_ERR(rfb) == -ENOSPC) { |
| rfb = qcow_grow_refcount_block(q, clust_idx); |
| if (!rfb) { |
| pr_warning("error while growing refcount table"); |
| return -1; |
| } |
| } else if (IS_ERR_OR_NULL(rfb)) { |
| pr_warning("error while reading refcount table"); |
| return -1; |
| } |
| |
| rfb_idx = clust_idx & (((1ULL << |
| (header->cluster_bits - QCOW_REFCOUNT_BLOCK_SHIFT)) - 1)); |
| if (rfb_idx >= rfb->size) { |
| pr_warning("refcount block index out of bounds"); |
| return -1; |
| } |
| |
| refcount = be16_to_cpu(rfb->entries[rfb_idx]) + append; |
| rfb->entries[rfb_idx] = cpu_to_be16(refcount); |
| rfb->dirty = 1; |
| |
| /* write refcount block */ |
| if (write_refcount_block(q, rfb) < 0) { |
| pr_warning("refcount block index out of bounds"); |
| return -1; |
| } |
| |
| /* if the refcount dropped to zero, remember this cluster as free */ |
| if (!refcount && clust_idx < q->free_clust_idx) |
| q->free_clust_idx = clust_idx; |
| |
| return 0; |
| } |
| |
| static void qcow_free_clusters(struct qcow *q, u64 clust_start, u64 size) |
| { |
| struct qcow_header *header = q->header; |
| u64 start, end, offset; |
| |
| start = clust_start & ~(q->cluster_size - 1); |
| end = (clust_start + size - 1) & ~(q->cluster_size - 1); |
| for (offset = start; offset <= end; offset += q->cluster_size) |
| update_cluster_refcount(q, offset >> header->cluster_bits, -1); |
| } |
| |
| /* |
| * Allocate enough clusters to hold 'size' bytes. The search starts at |
| * free_clust_idx (initialized to zero) and records the last examined |
| * position there, so later allocations resume where the previous one |
| * left off. Returns the byte offset of the first allocated cluster, |
| * or (u64)-1 on failure. |
| */ |
| static u64 qcow_alloc_clusters(struct qcow *q, u64 size, int update_ref) |
| { |
| struct qcow_header *header = q->header; |
| u16 clust_refcount; |
| u32 clust_idx = 0, i; |
| u64 clust_num; |
| |
| clust_num = (size + (q->cluster_size - 1)) >> header->cluster_bits; |
| |
| again: |
| for (i = 0; i < clust_num; i++) { |
| clust_idx = q->free_clust_idx++; |
| clust_refcount = qcow_get_refcount(q, clust_idx); |
| if (clust_refcount == (u16)-1) |
| return -1; |
| else if (clust_refcount > 0) |
| goto again; |
| } |
| |
| clust_idx++; |
| |
| if (update_ref) |
| for (i = 0; i < clust_num; i++) |
| if (update_cluster_refcount(q, |
| clust_idx - clust_num + i, 1)) |
| return -1; |
| |
| return (clust_idx - clust_num) << header->cluster_bits; |
| } |
| |
| static int qcow_write_l1_table(struct qcow *q) |
| { |
| struct qcow_l1_table *l1t = &q->table; |
| struct qcow_header *header = q->header; |
| |
| if (qcow_pwrite_sync(q->fd, l1t->l1_table, |
| l1t->table_size * sizeof(u64), |
| header->l1_table_offset) < 0) |
| return -1; |
| |
| return 0; |
| } |
| |
| /* |
| * Get the L2 table covering 'offset'. If the table has already been |
| * copied (QCOW2_OFLAG_COPIED is set), read it directly. Otherwise |
| * allocate a new cluster and, if an old table exists, copy its |
| * contents into the new cluster (copy-on-write of the L2 table). |
| */ |
| static int get_cluster_table(struct qcow *q, u64 offset, |
| struct qcow_l2_table **result_l2t, u64 *result_l2_index) |
| { |
| struct qcow_header *header = q->header; |
| struct qcow_l1_table *l1t = &q->table; |
| struct qcow_l2_table *l2t; |
| u64 l1t_idx; |
| u64 l2t_offset; |
| u64 l2t_idx; |
| u64 l2t_size; |
| u64 l2t_new_offset; |
| |
| l2t_size = 1 << header->l2_bits; |
| |
| l1t_idx = get_l1_index(q, offset); |
| if (l1t_idx >= l1t->table_size) |
| return -1; |
| |
| l2t_idx = get_l2_index(q, offset); |
| if (l2t_idx >= l2t_size) |
| return -1; |
| |
| l2t_offset = be64_to_cpu(l1t->l1_table[l1t_idx]); |
| if (l2t_offset & QCOW2_OFLAG_COPIED) { |
| l2t_offset &= ~QCOW2_OFLAG_COPIED; |
| l2t = qcow_read_l2_table(q, l2t_offset); |
| if (!l2t) |
| goto error; |
| } else { |
| l2t_new_offset = qcow_alloc_clusters(q, |
| l2t_size*sizeof(u64), 1); |
| |
| if (l2t_new_offset == (u64)-1) |
| goto error; |
| |
| l2t = new_cache_table(q, l2t_new_offset); |
| if (!l2t) |
| goto free_cluster; |
| |
| if (l2t_offset) { |
| /* copy the old table into the newly allocated cluster */ |
| if (pread_in_full(q->fd, l2t->table, |
| l2t_size * sizeof(u64), l2t_offset) < 0) |
| goto free_cache; |
| } else |
| memset(l2t->table, 0x00, l2t_size * sizeof(u64)); |
| |
| /* write l2 table */ |
| l2t->dirty = 1; |
| if (qcow_l2_cache_write(q, l2t) < 0) |
| goto free_cache; |
| |
| /* cache l2 table */ |
| if (cache_table(q, l2t)) |
| goto free_cache; |
| |
| /* update the l1 table */ |
| l1t->l1_table[l1t_idx] = cpu_to_be64(l2t_new_offset |
| | QCOW2_OFLAG_COPIED); |
| if (qcow_write_l1_table(q)) { |
| pr_warning("Update l1 table error"); |
| goto free_cache; |
| } |
| |
| /* free old cluster */ |
| qcow_free_clusters(q, l2t_offset, q->cluster_size); |
| } |
| |
| *result_l2t = l2t; |
| *result_l2_index = l2t_idx; |
| |
| return 0; |
| |
| free_cache: |
| free(l2t); |
| |
| free_cluster: |
| qcow_free_clusters(q, l2t_new_offset, q->cluster_size); |
| |
| error: |
| return -1; |
| } |
| |
| /* |
| * Write one cluster's worth of data. If the cluster has already been |
| * copied (QCOW2_OFLAG_COPIED), write the data in place. Otherwise |
| * allocate a new cluster, merge the original data with the new data, |
| * and write the result to the new cluster (copy-on-write). |
| */ |
| static ssize_t qcow_write_cluster(struct qcow *q, u64 offset, |
| void *buf, u32 src_len) |
| { |
| struct qcow_l2_table *l2t; |
| u64 clust_new_start; |
| u64 clust_start; |
| u64 clust_flags; |
| u64 clust_off; |
| u64 l2t_idx; |
| u64 len; |
| |
| l2t = NULL; |
| |
| clust_off = get_cluster_offset(q, offset); |
| if (clust_off >= q->cluster_size) |
| return -1; |
| |
| len = q->cluster_size - clust_off; |
| if (len > src_len) |
| len = src_len; |
| |
| mutex_lock(&q->mutex); |
| |
| if (get_cluster_table(q, offset, &l2t, &l2t_idx)) { |
| pr_warning("Get l2 table error"); |
| goto error; |
| } |
| |
| clust_start = be64_to_cpu(l2t->table[l2t_idx]); |
| clust_flags = clust_start & QCOW2_OFLAGS_MASK; |
| |
| clust_start &= QCOW2_OFFSET_MASK; |
| if (!(clust_flags & QCOW2_OFLAG_COPIED)) { |
| clust_new_start = qcow_alloc_clusters(q, q->cluster_size, 1); |
| if (clust_new_start == (u64)-1) { |
| pr_warning("Cluster alloc error"); |
| goto error; |
| } |
| |
| offset &= ~(q->cluster_size - 1); |
| |
| /* if clust_start is not zero, read the original data */ |
| if (clust_start) { |
| mutex_unlock(&q->mutex); |
| if (qcow2_read_cluster(q, offset, q->copy_buff, |
| q->cluster_size) < 0) { |
| pr_warning("Read copy cluster error"); |
| qcow_free_clusters(q, clust_new_start, |
| q->cluster_size); |
| return -1; |
| } |
| mutex_lock(&q->mutex); |
| } else |
| memset(q->copy_buff, 0x00, q->cluster_size); |
| |
| memcpy(q->copy_buff + clust_off, buf, len); |
| |
| /* Write actual data */ |
| if (pwrite_in_full(q->fd, q->copy_buff, q->cluster_size, |
| clust_new_start) < 0) |
| goto free_cluster; |
| |
| /* update l2 table */ |
| l2t->table[l2t_idx] = cpu_to_be64(clust_new_start |
| | QCOW2_OFLAG_COPIED); |
| l2t->dirty = 1; |
| |
| if (qcow_l2_cache_write(q, l2t)) |
| goto free_cluster; |
| |
| /* free old cluster */ |
| if (clust_flags & QCOW2_OFLAG_COMPRESSED) { |
| int size; |
| size = ((clust_start >> q->csize_shift) & |
| q->csize_mask) + 1; |
| size *= 512; |
| clust_start &= q->cluster_offset_mask; |
| clust_start &= ~511; |
| |
| qcow_free_clusters(q, clust_start, size); |
| } else if (clust_start) |
| qcow_free_clusters(q, clust_start, q->cluster_size); |
| |
| } else { |
| /* Write actual data */ |
| if (pwrite_in_full(q->fd, buf, len, |
| clust_start + clust_off) < 0) |
| goto error; |
| } |
| mutex_unlock(&q->mutex); |
| return len; |
| |
| free_cluster: |
| qcow_free_clusters(q, clust_new_start, q->cluster_size); |
| |
| error: |
| mutex_unlock(&q->mutex); |
| return -1; |
| } |
| |
| static ssize_t qcow_write_sector_single(struct disk_image *disk, u64 sector, void *src, u32 src_len) |
| { |
| struct qcow *q = disk->priv; |
| struct qcow_header *header = q->header; |
| u32 nr_written; |
| char *buf; |
| u64 offset; |
| ssize_t nr; |
| |
| buf = src; |
| nr_written = 0; |
| offset = sector << SECTOR_SHIFT; |
| |
| while (nr_written < src_len) { |
| if (offset >= header->size) |
| return -1; |
| |
| nr = qcow_write_cluster(q, offset, buf, src_len - nr_written); |
| if (nr < 0) |
| return -1; |
| |
| nr_written += nr; |
| buf += nr; |
| offset += nr; |
| } |
| |
| return nr_written; |
| } |
| |
| static ssize_t qcow_write_sector(struct disk_image *disk, u64 sector, |
| const struct iovec *iov, int iovcount, void *param) |
| { |
| ssize_t nr, total = 0; |
| |
| while (iovcount--) { |
| nr = qcow_write_sector_single(disk, sector, iov->iov_base, iov->iov_len); |
| if (nr != (ssize_t)iov->iov_len) { |
| pr_info("qcow_write_sector error: nr=%ld iov_len=%ld\n", (long)nr, (long)iov->iov_len); |
| return -1; |
| } |
| |
| sector += iov->iov_len >> SECTOR_SHIFT; |
| iov++; |
| total += nr; |
| } |
| |
| return total; |
| } |
| |
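| /* |
| * Flush path: write back every cached (dirty) refcount block and L2 |
| * table, then the L1 table, and finally fsync() the backing file so |
| * the on-disk metadata is consistent. |
| */ |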
| static int qcow_disk_flush(struct disk_image *disk) |
| { |
| struct qcow *q = disk->priv; |
| struct qcow_refcount_table *rft; |
| struct list_head *pos, *n; |
| struct qcow_l1_table *l1t; |
| |
| l1t = &q->table; |
| rft = &q->refcount_table; |
| |
| mutex_lock(&q->mutex); |
| |
| list_for_each_safe(pos, n, &rft->lru_list) { |
| struct qcow_refcount_block *c = list_entry(pos, struct qcow_refcount_block, list); |
| |
| if (write_refcount_block(q, c) < 0) |
| goto error_unlock; |
| } |
| |
| list_for_each_safe(pos, n, &l1t->lru_list) { |
| struct qcow_l2_table *c = list_entry(pos, struct qcow_l2_table, list); |
| |
| if (qcow_l2_cache_write(q, c) < 0) |
| goto error_unlock; |
| } |
| |
| if (qcow_write_l1_table(q) < 0) |
| goto error_unlock; |
| |
| mutex_unlock(&q->mutex); |
| |
| return fsync(disk->fd); |
| |
| error_unlock: |
| mutex_unlock(&q->mutex); |
| return -1; |
| } |
| |
| static int qcow_disk_close(struct disk_image *disk) |
| { |
| struct qcow *q; |
| |
| if (!disk) |
| return 0; |
| |
| q = disk->priv; |
| |
| refcount_table_free_cache(&q->refcount_table); |
| l1_table_free_cache(&q->table); |
| free(q->copy_buff); |
| free(q->cluster_data); |
| free(q->cluster_cache); |
| free(q->refcount_table.rf_table); |
| free(q->table.l1_table); |
| free(q->header); |
| free(q); |
| |
| return 0; |
| } |
| |
| static struct disk_image_operations qcow_disk_readonly_ops = { |
| .read = qcow_read_sector, |
| .close = qcow_disk_close, |
| }; |
| |
| static struct disk_image_operations qcow_disk_ops = { |
| .read = qcow_read_sector, |
| .write = qcow_write_sector, |
| .flush = qcow_disk_flush, |
| .close = qcow_disk_close, |
| }; |
| |
| static int qcow_read_refcount_table(struct qcow *q) |
| { |
| struct qcow_header *header = q->header; |
| struct qcow_refcount_table *rft = &q->refcount_table; |
| |
| rft->rf_size = (header->refcount_table_size * q->cluster_size) |
| / sizeof(u64); |
| |
| rft->rf_table = calloc(rft->rf_size, sizeof(u64)); |
| if (!rft->rf_table) |
| return -1; |
| |
| rft->root = (struct rb_root) RB_ROOT; |
| INIT_LIST_HEAD(&rft->lru_list); |
| |
| return pread_in_full(q->fd, rft->rf_table, sizeof(u64) * rft->rf_size, header->refcount_table_offset); |
| } |
| |
| static int qcow_write_refcount_table(struct qcow *q) |
| { |
| struct qcow_header *header = q->header; |
| struct qcow_refcount_table *rft = &q->refcount_table; |
| |
| return qcow_pwrite_sync(q->fd, rft->rf_table, |
| rft->rf_size * sizeof(u64), header->refcount_table_offset); |
| } |
| |
| static int qcow_read_l1_table(struct qcow *q) |
| { |
| struct qcow_header *header = q->header; |
| struct qcow_l1_table *table = &q->table; |
| |
| table->table_size = header->l1_size; |
| |
| table->l1_table = calloc(table->table_size, sizeof(u64)); |
| if (!table->l1_table) |
| return -1; |
| |
| return pread_in_full(q->fd, table->l1_table, sizeof(u64) * table->table_size, header->l1_table_offset); |
| } |
| |
| static void *qcow2_read_header(int fd) |
| { |
| struct qcow2_header_disk f_header; |
| struct qcow_header *header; |
| |
| header = malloc(sizeof(struct qcow_header)); |
| if (!header) |
| return NULL; |
| |
| if (pread_in_full(fd, &f_header, sizeof(struct qcow2_header_disk), 0) < 0) { |
| free(header); |
| return NULL; |
| } |
| |
| be32_to_cpus(&f_header.magic); |
| be32_to_cpus(&f_header.version); |
| be64_to_cpus(&f_header.backing_file_offset); |
| be32_to_cpus(&f_header.backing_file_size); |
| be32_to_cpus(&f_header.cluster_bits); |
| be64_to_cpus(&f_header.size); |
| be32_to_cpus(&f_header.crypt_method); |
| be32_to_cpus(&f_header.l1_size); |
| be64_to_cpus(&f_header.l1_table_offset); |
| be64_to_cpus(&f_header.refcount_table_offset); |
| be32_to_cpus(&f_header.refcount_table_clusters); |
| be32_to_cpus(&f_header.nb_snapshots); |
| be64_to_cpus(&f_header.snapshots_offset); |
| |
| *header = (struct qcow_header) { |
| .size = f_header.size, |
| .l1_table_offset = f_header.l1_table_offset, |
| .l1_size = f_header.l1_size, |
| .cluster_bits = f_header.cluster_bits, |
| .l2_bits = f_header.cluster_bits - 3, |
| .refcount_table_offset = f_header.refcount_table_offset, |
| .refcount_table_size = f_header.refcount_table_clusters, |
| }; |
| |
| return header; |
| } |
| |
| static struct disk_image *qcow2_probe(int fd, bool readonly) |
| { |
| struct disk_image *disk_image; |
| struct qcow_l1_table *l1t; |
| struct qcow_header *h; |
| struct qcow *q; |
| |
| q = calloc(1, sizeof(struct qcow)); |
| if (!q) |
| return NULL; |
| |
| mutex_init(&q->mutex); |
| q->fd = fd; |
| |
| l1t = &q->table; |
| |
| l1t->root = (struct rb_root) RB_ROOT; |
| INIT_LIST_HEAD(&l1t->lru_list); |
| |
| h = q->header = qcow2_read_header(fd); |
| if (!h) |
| goto free_qcow; |
| |
| q->version = QCOW2_VERSION; |
| q->csize_shift = (62 - (q->header->cluster_bits - 8)); |
| q->csize_mask = (1 << (q->header->cluster_bits - 8)) - 1; |
| q->cluster_offset_mask = (1LL << q->csize_shift) - 1; |
| q->cluster_size = 1 << q->header->cluster_bits; |
| |
| q->copy_buff = malloc(q->cluster_size); |
| if (!q->copy_buff) { |
| pr_warning("copy buff malloc error"); |
| goto free_header; |
| } |
| |
| q->cluster_data = malloc(q->cluster_size); |
| if (!q->cluster_data) { |
| pr_warning("cluster data malloc error"); |
| goto free_copy_buff; |
| } |
| |
| q->cluster_cache = malloc(q->cluster_size); |
| if (!q->cluster_cache) { |
| pr_warning("cluster cache malloc error"); |
| goto free_cluster_data; |
| } |
| |
| if (qcow_read_l1_table(q) < 0) |
| goto free_cluster_cache; |
| |
| if (qcow_read_refcount_table(q) < 0) |
| goto free_l1_table; |
| |
| /* |
| * Do not use mmap; use read/write instead |
| */ |
| if (readonly) |
| disk_image = disk_image__new(fd, h->size, &qcow_disk_readonly_ops, DISK_IMAGE_REGULAR); |
| else |
| disk_image = disk_image__new(fd, h->size, &qcow_disk_ops, DISK_IMAGE_REGULAR); |
| |
| if (IS_ERR_OR_NULL(disk_image)) |
| goto free_refcount_table; |
| |
| disk_image->priv = q; |
| |
| return disk_image; |
| |
| free_refcount_table: |
| if (q->refcount_table.rf_table) |
| free(q->refcount_table.rf_table); |
| free_l1_table: |
| if (q->table.l1_table) |
| free(q->table.l1_table); |
| free_cluster_cache: |
| if (q->cluster_cache) |
| free(q->cluster_cache); |
| free_cluster_data: |
| if (q->cluster_data) |
| free(q->cluster_data); |
| free_copy_buff: |
| if (q->copy_buff) |
| free(q->copy_buff); |
| free_header: |
| if (q->header) |
| free(q->header); |
| free_qcow: |
| free(q); |
| |
| return NULL; |
| } |
| |
| static bool qcow2_check_image(int fd) |
| { |
| struct qcow2_header_disk f_header; |
| |
| if (pread_in_full(fd, &f_header, sizeof(struct qcow2_header_disk), 0) < 0) |
| return false; |
| |
| be32_to_cpus(&f_header.magic); |
| be32_to_cpus(&f_header.version); |
| |
| if (f_header.magic != QCOW_MAGIC) |
| return false; |
| |
| if (f_header.version != QCOW2_VERSION) |
| return false; |
| |
| return true; |
| } |
| |
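| /* |
| * The QCOW1 header has no l1_size field, so it is derived from the |
| * image size: each L1 entry covers (1 << l2_bits) * (1 << cluster_bits) |
| * bytes of guest address space. |
| */ |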
| static void *qcow1_read_header(int fd) |
| { |
| struct qcow1_header_disk f_header; |
| struct qcow_header *header; |
| |
| header = malloc(sizeof(struct qcow_header)); |
| if (!header) |
| return NULL; |
| |
| if (pread_in_full(fd, &f_header, sizeof(struct qcow1_header_disk), 0) < 0) { |
| free(header); |
| return NULL; |
| } |
| |
| be32_to_cpus(&f_header.magic); |
| be32_to_cpus(&f_header.version); |
| be64_to_cpus(&f_header.backing_file_offset); |
| be32_to_cpus(&f_header.backing_file_size); |
| be32_to_cpus(&f_header.mtime); |
| be64_to_cpus(&f_header.size); |
| be32_to_cpus(&f_header.crypt_method); |
| be64_to_cpus(&f_header.l1_table_offset); |
| |
| *header = (struct qcow_header) { |
| .size = f_header.size, |
| .l1_table_offset = f_header.l1_table_offset, |
| .l1_size = f_header.size / ((1 << f_header.l2_bits) * (1 << f_header.cluster_bits)), |
| .cluster_bits = f_header.cluster_bits, |
| .l2_bits = f_header.l2_bits, |
| }; |
| |
| return header; |
| } |
| |
| static struct disk_image *qcow1_probe(int fd, bool readonly) |
| { |
| struct disk_image *disk_image; |
| struct qcow_l1_table *l1t; |
| struct qcow_header *h; |
| struct qcow *q; |
| |
| q = calloc(1, sizeof(struct qcow)); |
| if (!q) |
| return NULL; |
| |
| mutex_init(&q->mutex); |
| q->fd = fd; |
| |
| l1t = &q->table; |
| |
| l1t->root = (struct rb_root)RB_ROOT; |
| INIT_LIST_HEAD(&l1t->lru_list); |
| INIT_LIST_HEAD(&q->refcount_table.lru_list); |
| |
| h = q->header = qcow1_read_header(fd); |
| if (!h) |
| goto free_qcow; |
| |
| q->version = QCOW1_VERSION; |
| q->cluster_size = 1 << q->header->cluster_bits; |
| q->cluster_offset_mask = (1LL << (63 - q->header->cluster_bits)) - 1; |
| q->free_clust_idx = 0; |
| |
| q->cluster_data = malloc(q->cluster_size); |
| if (!q->cluster_data) { |
| pr_warning("cluster data malloc error"); |
| goto free_header; |
| } |
| |
| q->cluster_cache = malloc(q->cluster_size); |
| if (!q->cluster_cache) { |
| pr_warning("cluster cache malloc error"); |
| goto free_cluster_data; |
| } |
| |
| if (qcow_read_l1_table(q) < 0) |
| goto free_cluster_cache; |
| |
| /* |
| * Do not use mmap; use read/write instead |
| */ |
| if (readonly) |
| disk_image = disk_image__new(fd, h->size, &qcow_disk_readonly_ops, DISK_IMAGE_REGULAR); |
| else |
| disk_image = disk_image__new(fd, h->size, &qcow_disk_ops, DISK_IMAGE_REGULAR); |
| |
| if (!disk_image) |
| goto free_l1_table; |
| |
| disk_image->priv = q; |
| |
| return disk_image; |
| |
| free_l1_table: |
| if (q->table.l1_table) |
| free(q->table.l1_table); |
| free_cluster_cache: |
| if (q->cluster_cache) |
| free(q->cluster_cache); |
| free_cluster_data: |
| if (q->cluster_data) |
| free(q->cluster_data); |
| free_header: |
| if (q->header) |
| free(q->header); |
| free_qcow: |
| free(q); |
| |
| return NULL; |
| } |
| |
| static bool qcow1_check_image(int fd) |
| { |
| struct qcow1_header_disk f_header; |
| |
| if (pread_in_full(fd, &f_header, sizeof(struct qcow1_header_disk), 0) < 0) |
| return false; |
| |
| be32_to_cpus(&f_header.magic); |
| be32_to_cpus(&f_header.version); |
| |
| if (f_header.magic != QCOW_MAGIC) |
| return false; |
| |
| if (f_header.version != QCOW1_VERSION) |
| return false; |
| |
| return true; |
| } |
| |
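| /* |
| * Usage sketch (hypothetical caller, for illustration only -- the real |
| * call site lives in the generic disk-image code): |
| * |
| * int fd = open(path, readonly ? O_RDONLY : O_RDWR); |
| * struct disk_image *disk = qcow_probe(fd, readonly); |
| * if (!disk) |
| * pr_warning("not a QCOW1/QCOW2 image, or setup failed"); |
| */ |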
| struct disk_image *qcow_probe(int fd, bool readonly) |
| { |
| if (qcow1_check_image(fd)) |
| return qcow1_probe(fd, readonly); |
| |
| if (qcow2_check_image(fd)) |
| return qcow2_probe(fd, readonly); |
| |
| return NULL; |
| } |