Add kvmtool
diff --git a/kvmtool/.gitignore b/kvmtool/.gitignore
new file mode 100644
index 0000000..e935ee5
--- /dev/null
+++ b/kvmtool/.gitignore
@@ -0,0 +1,16 @@
+/lkvm
+/lkvm-static
+/vm
+*.o
+*.d
+*.swp
+/cscope.*
+tags
+tests/boot/boot_test.iso
+tests/boot/rootfs/
+guest/init
+guest/pre_init
+guest/init_stage2
+KVMTOOLS-VERSION-FILE
+/x86/bios/bios.bin
+/x86/bios/bios.bin.elf
diff --git a/kvmtool/COPYING b/kvmtool/COPYING
new file mode 100644
index 0000000..10828e0
--- /dev/null
+++ b/kvmtool/COPYING
@@ -0,0 +1,341 @@
+
+		    GNU GENERAL PUBLIC LICENSE
+		       Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.
+                       51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+			    Preamble
+
+  The licenses for most software are designed to take away your
+freedom to share and change it.  By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users.  This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it.  (Some other Free Software Foundation software is covered by
+the GNU Library General Public License instead.)  You can apply it to
+your programs, too.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+  To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have.  You must make sure that they, too, receive or can get the
+source code.  And you must show them these terms so they know their
+rights.
+
+  We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+  Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software.  If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+  Finally, any free program is threatened constantly by software
+patents.  We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary.  To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.
+
+		    GNU GENERAL PUBLIC LICENSE
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+  0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License.  The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language.  (Hereinafter, translation is included without limitation in
+the term "modification".)  Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope.  The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+  1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+  2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+    a) You must cause the modified files to carry prominent notices
+    stating that you changed the files and the date of any change.
+
+    b) You must cause any work that you distribute or publish, that in
+    whole or in part contains or is derived from the Program or any
+    part thereof, to be licensed as a whole at no charge to all third
+    parties under the terms of this License.
+
+    c) If the modified program normally reads commands interactively
+    when run, you must cause it, when started running for such
+    interactive use in the most ordinary way, to print or display an
+    announcement including an appropriate copyright notice and a
+    notice that there is no warranty (or else, saying that you provide
+    a warranty) and that users may redistribute the program under
+    these conditions, and telling the user how to view a copy of this
+    License.  (Exception: if the Program itself is interactive but
+    does not normally print such an announcement, your work based on
+    the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole.  If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works.  But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+  3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+    a) Accompany it with the complete corresponding machine-readable
+    source code, which must be distributed under the terms of Sections
+    1 and 2 above on a medium customarily used for software interchange; or,
+
+    b) Accompany it with a written offer, valid for at least three
+    years, to give any third party, for a charge no more than your
+    cost of physically performing source distribution, a complete
+    machine-readable copy of the corresponding source code, to be
+    distributed under the terms of Sections 1 and 2 above on a medium
+    customarily used for software interchange; or,
+
+    c) Accompany it with the information you received as to the offer
+    to distribute corresponding source code.  (This alternative is
+    allowed only for noncommercial distribution and only if you
+    received the program in object code or executable form with such
+    an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it.  For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable.  However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+  4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License.  Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+  5. You are not required to accept this License, since you have not
+signed it.  However, nothing else grants you permission to modify or
+distribute the Program or its derivative works.  These actions are
+prohibited by law if you do not accept this License.  Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+  6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions.  You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+  7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all.  For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices.  Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+  8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded.  In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+  9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time.  Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number.  If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation.  If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+  10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission.  For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this.  Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+			    NO WARRANTY
+
+  11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+  12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+		     END OF TERMS AND CONDITIONS
+
+	    How to Apply These Terms to Your New Programs
+
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+    Gnomovision version 69, Copyright (C) year name of author
+    Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary.  Here is a sample; alter the names:
+
+  Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+  `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+  <signature of Ty Coon>, 1 April 1989
+  Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs.  If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library.  If this is what you want to do, use the GNU Library General
+Public License instead of this License.
diff --git a/kvmtool/CREDITS-Git b/kvmtool/CREDITS-Git
new file mode 100644
index 0000000..c2ddcb3
--- /dev/null
+++ b/kvmtool/CREDITS-Git
@@ -0,0 +1,30 @@
+Most of the infrastructure that 'perf' uses here has been reused
+from the Git project, as of version:
+
+    66996ec: Sync with 1.6.2.4
+
+Here is an (incomplete!) list of main contributors to those files
+in util/* and elsewhere:
+
+ Alex Riesen
+ Christian Couder
+ Dmitry Potapov
+ Jeff King
+ Johannes Schindelin
+ Johannes Sixt
+ Junio C Hamano
+ Linus Torvalds
+ Matthias Kestenholz
+ Michal Ostrowski
+ Miklos Vajna
+ Petr Baudis
+ Pierre Habouzit
+ René Scharfe
+ Samuel Tardieu
+ Shawn O. Pearce
+ Steffen Prohaska
+ Steve Haslam
+
+Thanks guys!
+
+The full history of the files can be found in the upstream Git commits.
diff --git a/kvmtool/Documentation/kernel-debugging.txt b/kvmtool/Documentation/kernel-debugging.txt
new file mode 100644
index 0000000..98b9438
--- /dev/null
+++ b/kvmtool/Documentation/kernel-debugging.txt
@@ -0,0 +1,15 @@
+This document explains how to debug a guest's kernel using KGDB.
+
+1. Run the guest:
+        'lkvm run -k [vmlinuz] -p "kgdboc=ttyS1 kgdbwait" --tty 1'
+
+And see which PTY got assigned to ttyS1 (you'll see:
+'  Info: Assigned terminal 1 to pty /dev/pts/X').
+
+2. Run GDB on the host:
+        'gdb [vmlinuz]'
+
+3. Connect to the guest (from within GDB):
+        'target remote /dev/pts/X'
+
+4. Start debugging! (enter 'continue' to continue boot).
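+
+For example, assuming ttyS1 was assigned /dev/pts/3 in step 1, a session
+inside GDB might look like this (the breakpoint is only an illustration):
+
+        'target remote /dev/pts/3'
+        'break panic'
+        'continue'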
diff --git a/kvmtool/Documentation/kvmtool.1 b/kvmtool/Documentation/kvmtool.1
new file mode 100644
index 0000000..2b8c274
--- /dev/null
+++ b/kvmtool/Documentation/kvmtool.1
@@ -0,0 +1,226 @@
+.\" Manpage for kvmtool
+.\" Copyright (C) 2015 by Andre Przywara <andre.przywara@arm.com>
+.TH kvmtool 1 "11 Nov 2015" "0.1" "kvmtool man page"
+.SH NAME
+kvmtool \- run KVM guests
+.SH SYNOPSIS
+lkvm COMMAND [ARGS]
+.SH DESCRIPTION
+kvmtool is a userland tool for creating and controlling KVM guests.
+.SH "KVMTOOL COMMANDS"
+.sp
+.PP
+.B run -k <kernel\-image> [\-c <cores>] [\-m <MiB>] [\-p <command line>]
+.br
+.B [\-i <initrd>] [\-d <image file>] [\-\-console serial|virtio|hv]
+.br
+.B [\-\-dev <node>] [\-\-debug] [\-\-debug\-single\-step] [\-\-debug\-ioport]
+.RS 4
+Run a guest.
+.sp
+.B \-k, \-\-kernel <image file>
+.RS 4
+The virtual machine kernel.
+.RE
+.sp
+.B \-c, \-\-cpus <n>
+.RS 4
+The number of virtual CPUs to run.
+.RE
+.sp
+.B \-m, \-\-mem <n>
+.RS 4
+Virtual machine memory size in MiB.
+.RE
+.sp
+.B \-p, \-\-params <parameters>
+.RS 4
+Additional kernel command line arguments.
+.RE
+.sp
+.B \-i, \-\-initrd <image file>
+.RS 4
+Initial RAM disk image.
+.RE
+.sp
+.B \-d, \-\-disk <image file|directory>
+.RS 4
+A disk image file or a rootfs directory.
+.RE
+.sp
+.B \-\-console serial|virtio|hv
+.RS 4
+Console to use.
+.RE
+.sp
+.B \-\-dev <device node>
+.RS 4
+KVM device file (instead of the default /dev/kvm).
+.RE
+.sp
+.B \-\-debug
+.RS 4
+Enable debug messages.
+.RE
+.sp
+.B \-\-debug-single-step
+.RS 4
+Enable single stepping.
+.RE
+.sp
+.B \-\-debug-ioport
+.RS 4
+Enable ioport debugging.
+.RE
+.RE
+.PP
+.B setup <name>
+.RS 4
+Setup a new virtual machine. This creates a new rootfs in the .lkvm
+folder of your home directory.
+.RE
+.PP
+.B pause \-\-all|\-\-name <name>
+.RS 4
+Pause a virtual machine.
+.sp
+.B \-a, \-\-all
+.RS 4
+Pause all running instances.
+.RE
+.sp
+.B \-n, \-\-name <name>
+.RS 4
+Pause the specified instance. For a list of running instances, see \fIlkvm list\fR.
+.RE
+.RE
+.PP
+.B resume --all|--name <name>
+.RS 4
+Resume a previously paused virtual machine.
+.sp
+.B \-a, \-\-all
+.RS 4
+Resume all running instances.
+.RE
+.sp
+.B \-n, \-\-name <name>
+.RS 4
+Resume the specified instance. For a list of running instances, see \fIlkvm list\fR.
+.RE
+.RE
+.PP
+.B list [\-i] [\-r]
+.RS 4
+Print a list of running instances on the host. This is restricted to instances
+started by the current user, as it looks in the .lkvm folder in your home
+directory to find the socket files.
+.sp
+.B \-i, \-\-run
+.RS 4
+List all running instances.
+.RE
+.sp
+.B \-r, \-\-rootfs
+.RS 4
+List rootfs instances.
+.RE
+.RE
+.PP
+.B debug --all|--name <guest name> [--dump] [--nmi <n>] [--sysrq <rq>]
+.RS 4
+Print debug information from a running VM instance.
+.sp
+.B \-a, \-\-all
+.RS 4
+Debug all running instances.
+.RE
+.PP
+.B \-n, \-\-name <guest name>
+.RS 4
+Debug the specified instance.
+.RE
+.sp
+.B \-d, \-\-dump
+.RS 4
+Generate a debug dump from guest.
+.RE
+.PP
+.B \-m, \-\-nmi <VCPU nr>
+.RS 4
+Generate an NMI on the specified virtual CPU.
+.RE
+.PP
+.B \-s, \-\-sysrq <sysrq>
+.RS 4
+Inject a Linux sysrq into the guest.
+.RE
+.RE
+.PP
+.B balloon \-\-name <guest name> \-\-inflate|\-\-deflate <amount in MB>
+.RS 4
+This command inflates or deflates the virtio balloon located in the
+specified instance.
+\-\-inflate increases the size of the balloon, thus \fIdecreasing\fR the
+amount of virtual RAM available for the guest. \-\-deflate returns previously
+inflated memory back to the guest.
+.sp
+.B \-n, \-\-name <guest name>
+.RS 4
+Balloon the specified instance. For a list of all instances, see \fI"lkvm list"\fR.
+.RE
+.PP
+.B \-i, \-\-inflate <n>
+.RS 4
+Inflates the balloon by the specified number of megabytes. This decreases the
+amount of usable memory in the guest.
+.RE
+.PP
+.B \-d, \-\-deflate <n>
+.RS 4
+Deflates the balloon by the specified number of megabytes. This increases the
+amount of usable memory in the guest.
+.RE
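+.sp
+For example, to take 64 MB of memory away from a guest named "foo" (the
+name is illustrative):
+.br
+\fB$\fR lkvm balloon --name foo --inflate 64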
+.RE
+.PP
+.B stop --all|--name <name>
+.RS 4
+Stop a running instance.
+.sp
+.B \-a, \-\-all
+.RS 4
+Stop all running instances.
+.RE
+.sp
+.B \-n, \-\-name <name>
+.RS 4
+Stop the specified instance. For a list of running instances, see \fIlkvm list\fR.
+.RE
+.RE
+.PP
+.B stat \-\-all|\-\-name <name> [\-m]
+.RS 4
+Print statistics about a running instance.
+.sp
+.B \-m, \-\-memory
+.RS 4
+Display memory statistics.
+.RE
+.RE
+.PP
+.B sandbox (\fIlkvm run arguments\fR) \-\- [sandboxed command]
+.RS 4
+Run a command in a sandboxed guest. Kvmtool will inject a special init
+binary which performs an initial setup of the guest Linux and then
+launches a shell script with the specified command. Once this command
+ends, the guest will be shut down.
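+.sp
+For example, to run a single command in a throwaway guest (the kernel
+image name is illustrative):
+.br
+\fB$\fR lkvm sandbox -k bzImage -- uname -r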
+.RE
+.SH EXAMPLES
+.RS 4
+\fB$\fR lkvm run -k bzImage
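+.sp
+A guest with a disk image, two VCPUs and 512 MiB of RAM (the file names
+are examples only):
+.br
+\fB$\fR lkvm run -k bzImage -d rootfs.img -c 2 -m 512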
+.RE
+.SH SEE ALSO
+qemu(1), kvm(4)
+.SH BUGS
+.SH AUTHOR
+Andre Przywara <andre.przywara@arm.com>
diff --git a/kvmtool/Documentation/virtio-console.txt b/kvmtool/Documentation/virtio-console.txt
new file mode 100644
index 0000000..4a58d56
--- /dev/null
+++ b/kvmtool/Documentation/virtio-console.txt
@@ -0,0 +1,41 @@
+General
+--------
+
+virtio-console, as the name implies, is a console over the virtio transport.
+Here is a simple head-to-head comparison of virtio-console vs the regular
+8250 console:
+
+8250 serial console:
+
+ - Requires CONFIG_SERIAL_8250=y and CONFIG_SERIAL_8250_CONSOLE=y kernel configs,
+which are enabled almost everywhere.
+ - Doesn't require guest-side changes.
+ - Compatible with older guests.
+
+virtio-console:
+
+ - Requires CONFIG_VIRTIO_CONSOLE=y (along with all other virtio dependencies),
+which is available only in recent kernels (and not enabled in all of them).
+ - Much faster.
+ - Consumes fewer processing resources.
+ - Requires guest-side changes.
+
+Enabling virtio-console
+------------------------
+
+First, make sure the guest kernel is built with CONFIG_VIRTIO_CONSOLE=y. Once
+this is done, the following has to be done inside the guest image:
+
+ - Add the following line to /etc/inittab:
+	'hvc0:2345:respawn:/sbin/agetty -L 9600 hvc0'
+ - Add 'hvc0' to /etc/securetty (so you can actually log in)
+ - Start the guest with '--console virtio'
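+
+Putting it together, a minimal invocation (the kernel image name is just an
+example) would be:
+	'lkvm run -k bzImage --console virtio'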
+
+Common errors
+--------------
+
+Q: I don't see anything on the screen!
+A: Make sure CONFIG_VIRTIO_CONSOLE=y is enabled in the *guest* kernel; also
+make sure you've updated /etc/inittab.
+
+Q: It won't accept my username/password, but I enter them correctly!
+A: You didn't add 'hvc0' to /etc/securetty
diff --git a/kvmtool/INSTALL b/kvmtool/INSTALL
new file mode 100644
index 0000000..ca8e022
--- /dev/null
+++ b/kvmtool/INSTALL
@@ -0,0 +1,99 @@
+ Installation instructions for kvmtool
+---------------------------------------
+
+==========================================================================
+For the impatient:
+Just typing "make" should do the trick most of the times.
+You will get a binary called "lkvm" which is self-contained.
+No extra libraries or files need to be installed.
+==========================================================================
+
+"make install" will copy the compiled file into $HOME/bin, this can be
+changed by providing "prefix=" on the make command-line. DESTDIR will be
+honoured.
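+
+For example (the prefix and staging directory are only illustrations):
+	$ make prefix=/usr DESTDIR=/tmp/stage install
+This would install the binary as /tmp/stage/usr/bin/lkvm.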
+
+Prerequisites
+--------------
+For compilation you will need a recent GNU toolchain (binutils, gcc, make)
+as well as the standard C library.
+
+For deb based systems:
+	$ sudo apt-get install build-essential
+	On x86-64 systems you have to add the 32-bit compat headers:
+	$ sudo apt-get install libc6-dev-i386
+For Fedora based systems:
+	# yum install glibc-static
+For OpenSUSE based systems:
+	# zypper install glibc-devel-static
+
+Architectures which require device tree (PowerPC, ARM, ARM64) also require
+libfdt.
+	deb: $ sudo apt-get install libfdt-dev
+	Fedora: # yum install libfdt-devel
+	OpenSUSE: # zypper install libfdt1-devel
+Also see "Cross compiling" below.
+
+Optional libraries
+-------------------
+By running "make" some checks are invoked that determine the availability
+of certain optional libraries. Those are:
+- libbfd: enable symbol look-up support in debug mode
+- gtk3: enable support for displaying the guest framebuffer in a GTK+-3 window
+- vncserver: enable support for exporting the guest framebuffer in a VNC session
+- SDL: enable support for displaying the guest framebuffer in an SDL window
+- zlib: enable support for compressed QCOW images
+- aio: enable support for asynchronous I/O
+(Note that a guest framebuffer is currently only supported on x86.)
+So for the full glory you would need:
+(on a .deb based system):
+$ sudo apt-get install binutils-dev libgtk-3-dev libvncserver-dev libsdl2-dev \
+		       zlib1g-dev libaio-dev
+(on RPM based systems):
+# $TOOL install binutils-devel gtk3-devel libvncserver-devel SDL-devel \
+	      zlib-devel libaio-devel
+$TOOL is "yum" for Fedora and "zypper" for OpenSUSE.
+
+Cross compiling
+----------------
+The Makefile will honour the CROSS_COMPILE environment variable when calling
+the compiler and the linker binary. To trigger cross compilation, also set ARCH
+to the Linux name of the architecture. Architectures supported:
+- i386
+- x86_64
+- powerpc
+- arm
+- arm64
+- mips
+If ARCH is not provided, the target architecture will be automatically
+determined by running "uname -m" on your host, resulting in a native build.
+
+To cross-compile to ARM for instance, install a cross-compiler, put the
+required libraries in the cross-compiler's SYSROOT and type:
+$ make CROSS_COMPILE=arm-linux-gnueabihf- ARCH=arm
+
+Missing libraries when cross-compiling
+---------------------------------------
+The cross-compiler will look for target libraries in its SYSROOT directory,
+so you need to put the header and library files (.so) there.
+While most cross-compiler packages come with the target's glibc already
+installed, optional libraries (or libfdt) may not be.
+On a multiarch system you should be able to install those by appending
+the architecture name to the package name (example for ARM64):
+$ sudo apt-get install libfdt-dev:arm64
+
+PowerPC and ARM/ARM64 require libfdt to be installed. If you cannot use
+precompiled multiarch packages, you could either copy the required header and
+library files from an installed target system into the SYSROOT (you will need
+/usr/include/*fdt*.h and /usr/lib64/libfdt-v.v.v.so and its symlinks), or you
+can cross-compile the libfdt library yourself:
+
+$ git clone git://git.kernel.org/pub/scm/utils/dtc/dtc.git
+$ cd dtc
+$ export CC=${CROSS_COMPILE}gcc
+$ TRIPLET=$($CC -dumpmachine)
+$ SYSROOT=$($CC -print-sysroot)
+$ make libfdt
+$ sudo make DESTDIR=$SYSROOT PREFIX=/usr LIBDIR=/usr/lib/$TRIPLET install-lib install-includes
+
+This assumes a multiarch-enabled system; if there is no per-arch directory for
+libraries, replace the LIBDIR paths above with LIBDIR=/usr/lib or /usr/lib64.
diff --git a/kvmtool/Makefile b/kvmtool/Makefile
new file mode 100644
index 0000000..35bb118
--- /dev/null
+++ b/kvmtool/Makefile
@@ -0,0 +1,563 @@
+#
+# Define WERROR=0 to disable -Werror.
+#
+
+ifeq ($(strip $(V)),)
+	ifeq ($(findstring s,$(filter-out --%,$(MAKEFLAGS))),)
+		E = @echo
+	else
+		E = @\#
+	endif
+	Q = @
+else
+	E = @\#
+	Q =
+endif
+export E Q
+
+include config/utilities.mak
+include config/feature-tests.mak
+
+CC	:= $(CROSS_COMPILE)gcc
+CFLAGS	:=
+LD	:= $(CROSS_COMPILE)ld
+LDFLAGS	:=
+OBJCOPY	:= $(CROSS_COMPILE)objcopy
+
+FIND	:= find
+CSCOPE	:= cscope
+TAGS	:= ctags
+INSTALL := install
+
+prefix = $(HOME)
+bindir_relative = bin
+bindir = $(prefix)/$(bindir_relative)
+
+DESTDIR_SQ = $(subst ','\'',$(DESTDIR))
+bindir_SQ = $(subst ','\'',$(bindir))
+
+PROGRAM	:= lkvm
+PROGRAM_ALIAS := vm
+
+OBJS	+= builtin-balloon.o
+OBJS	+= builtin-debug.o
+OBJS	+= builtin-help.o
+OBJS	+= builtin-list.o
+OBJS	+= builtin-stat.o
+OBJS	+= builtin-pause.o
+OBJS	+= builtin-resume.o
+OBJS	+= builtin-run.o
+OBJS	+= builtin-setup.o
+OBJS	+= builtin-stop.o
+OBJS	+= builtin-version.o
+OBJS	+= devices.o
+OBJS	+= disk/core.o
+OBJS	+= framebuffer.o
+OBJS	+= guest_compat.o
+OBJS	+= hw/rtc.o
+OBJS	+= hw/serial.o
+OBJS	+= ioport.o
+OBJS	+= irq.o
+OBJS	+= kvm-cpu.o
+OBJS	+= kvm.o
+OBJS	+= main.o
+OBJS	+= mmio.o
+OBJS	+= pci.o
+OBJS	+= term.o
+OBJS	+= vfio/core.o
+OBJS	+= vfio/pci.o
+OBJS	+= virtio/blk.o
+OBJS	+= virtio/scsi.o
+OBJS	+= virtio/console.o
+OBJS	+= virtio/core.o
+OBJS	+= virtio/net.o
+OBJS	+= virtio/rng.o
+OBJS    += virtio/balloon.o
+OBJS	+= virtio/pci.o
+OBJS	+= disk/blk.o
+OBJS	+= disk/qcow.o
+OBJS	+= disk/raw.o
+OBJS	+= ioeventfd.o
+OBJS	+= net/uip/core.o
+OBJS	+= net/uip/arp.o
+OBJS	+= net/uip/icmp.o
+OBJS	+= net/uip/ipv4.o
+OBJS	+= net/uip/tcp.o
+OBJS	+= net/uip/udp.o
+OBJS	+= net/uip/buf.o
+OBJS	+= net/uip/csum.o
+OBJS	+= net/uip/dhcp.o
+OBJS	+= kvm-cmd.o
+OBJS	+= util/init.o
+OBJS    += util/iovec.o
+OBJS	+= util/rbtree.o
+OBJS	+= util/threadpool.o
+OBJS	+= util/parse-options.o
+OBJS	+= util/rbtree-interval.o
+OBJS	+= util/strbuf.o
+OBJS	+= util/read-write.o
+OBJS	+= util/util.o
+OBJS	+= virtio/9p.o
+OBJS	+= virtio/9p-pdu.o
+OBJS	+= kvm-ipc.o
+OBJS	+= builtin-sandbox.o
+OBJS	+= virtio/mmio.o
+
+# Translate uname -m into ARCH string
+ARCH ?= $(shell uname -m | sed -e s/i.86/i386/ -e s/ppc.*/powerpc/ \
+	  -e s/armv.*/arm/ -e s/aarch64.*/arm64/ -e s/mips64/mips/)
+
+ifeq ($(ARCH),i386)
+	ARCH         := x86
+	DEFINES      += -DCONFIG_X86_32
+endif
+ifeq ($(ARCH),x86_64)
+	ARCH         := x86
+	DEFINES      += -DCONFIG_X86_64
+	ARCH_PRE_INIT = x86/init.S
+endif
+
+### Arch-specific stuff
+
+#x86
+ifeq ($(ARCH),x86)
+	DEFINES += -DCONFIG_X86
+	OBJS	+= hw/i8042.o
+	OBJS	+= x86/boot.o
+	OBJS	+= x86/cpuid.o
+	OBJS	+= x86/interrupt.o
+	OBJS	+= x86/ioport.o
+	OBJS	+= x86/irq.o
+	OBJS	+= x86/kvm.o
+	OBJS	+= x86/kvm-cpu.o
+	OBJS	+= x86/mptable.o
+# Exclude BIOS object files from header dependencies.
+	OTHEROBJS	+= x86/bios.o
+	OTHEROBJS	+= x86/bios/bios-rom.o
+	ARCH_INCLUDE := x86/include
+	ARCH_HAS_FRAMEBUFFER := y
+endif
+# POWER/ppc: currently only ppc64 is supported.
+ifeq ($(ARCH), powerpc)
+	DEFINES += -DCONFIG_PPC
+	OBJS	+= powerpc/boot.o
+	OBJS	+= powerpc/ioport.o
+	OBJS	+= powerpc/kvm.o
+	OBJS	+= powerpc/cpu_info.o
+	OBJS	+= powerpc/kvm-cpu.o
+	OBJS	+= powerpc/spapr_hcall.o
+	OBJS	+= powerpc/spapr_rtas.o
+	OBJS	+= powerpc/spapr_hvcons.o
+	OBJS	+= powerpc/spapr_pci.o
+	OBJS	+= powerpc/xics.o
+	ARCH_INCLUDE := powerpc/include
+
+	ARCH_WANT_LIBFDT := y
+endif
+
+# ARM
+OBJS_ARM_COMMON		:= arm/fdt.o arm/gic.o arm/gicv2m.o arm/ioport.o \
+			   arm/kvm.o arm/kvm-cpu.o arm/pci.o arm/timer.o \
+			   arm/pmu.o
+HDRS_ARM_COMMON		:= arm/include
+ifeq ($(ARCH), arm)
+	DEFINES		+= -DCONFIG_ARM
+	OBJS		+= $(OBJS_ARM_COMMON)
+	OBJS		+= arm/aarch32/arm-cpu.o
+	OBJS		+= arm/aarch32/kvm-cpu.o
+	ARCH_INCLUDE	:= $(HDRS_ARM_COMMON)
+	ARCH_INCLUDE	+= -Iarm/aarch32/include
+	CFLAGS		+= -march=armv7-a
+
+	ARCH_WANT_LIBFDT := y
+	ARCH_HAS_FLASH_MEM := y
+endif
+
+# ARM64
+ifeq ($(ARCH), arm64)
+	DEFINES		+= -DCONFIG_ARM64
+	OBJS		+= $(OBJS_ARM_COMMON)
+	OBJS		+= arm/aarch64/arm-cpu.o
+	OBJS		+= arm/aarch64/kvm-cpu.o
+	OBJS		+= arm/aarch64/kvm.o
+	ARCH_INCLUDE	:= $(HDRS_ARM_COMMON)
+	ARCH_INCLUDE	+= -Iarm/aarch64/include
+
+	ARCH_WANT_LIBFDT := y
+	ARCH_HAS_FLASH_MEM := y
+endif
+
+ifeq ($(ARCH),mips)
+	DEFINES		+= -DCONFIG_MIPS
+	ARCH_INCLUDE	:= mips/include
+	OBJS		+= mips/kvm.o
+	OBJS		+= mips/kvm-cpu.o
+endif
+###
+
+ifeq (,$(ARCH_INCLUDE))
+        $(error This architecture ($(ARCH)) is not supported in kvmtool)
+endif
+
+###
+
+# Detect optional features.
+# On a given system, some libs may link statically, some may not; so, check
+# both and only build those that link!
+
+ifeq ($(call try-build,$(SOURCE_STRLCPY),$(CFLAGS),$(LDFLAGS)),y)
+	CFLAGS_DYNOPT	+= -DHAVE_STRLCPY
+	CFLAGS_STATOPT	+= -DHAVE_STRLCPY
+endif
+
+ifeq ($(call try-build,$(SOURCE_BFD),$(CFLAGS),$(LDFLAGS) -lbfd -static),y)
+	CFLAGS_STATOPT	+= -DCONFIG_HAS_BFD
+	OBJS_STATOPT	+= symbol.o
+	LIBS_STATOPT	+= -lbfd
+else
+	ifeq ($(call try-build,$(SOURCE_BFD),$(CFLAGS),$(LDFLAGS) -lbfd),y)
+		CFLAGS_DYNOPT	+= -DCONFIG_HAS_BFD
+		OBJS_DYNOPT	+= symbol.o
+		LIBS_DYNOPT	+= -lbfd
+	else
+		NOTFOUND	+= bfd
+	endif
+endif
+
+ifeq (y,$(ARCH_HAS_FRAMEBUFFER))
+	OBJS	+= hw/vesa.o
+
+	CFLAGS_GTK3 := $(shell pkg-config --cflags gtk+-3.0 2>/dev/null)
+	LDFLAGS_GTK3 := $(shell pkg-config --libs gtk+-3.0 2>/dev/null)
+	ifeq ($(call try-build,$(SOURCE_GTK3),$(CFLAGS) $(CFLAGS_GTK3),$(LDFLAGS) $(LDFLAGS_GTK3)),y)
+		OBJS_DYNOPT	+= ui/gtk3.o
+		CFLAGS_DYNOPT	+= -DCONFIG_HAS_GTK3 $(CFLAGS_GTK3)
+		LIBS_DYNOPT	+= $(LDFLAGS_GTK3)
+	else
+		NOTFOUND	+= GTK3
+	endif
+
+	ifeq ($(call try-build,$(SOURCE_VNCSERVER),$(CFLAGS),$(LDFLAGS) -lvncserver),y)
+		OBJS_DYNOPT	+= ui/vnc.o
+		CFLAGS_DYNOPT	+= -DCONFIG_HAS_VNCSERVER
+		LIBS_DYNOPT	+= -lvncserver
+	else
+		NOTFOUND	+= vncserver
+	endif
+	ifeq ($(call try-build,$(SOURCE_VNCSERVER),$(CFLAGS),$(LDFLAGS) -lvncserver -static),y)
+		OBJS_STATOPT	+= ui/vnc.o
+		CFLAGS_STATOPT	+= -DCONFIG_HAS_VNCSERVER
+		LIBS_STATOPT	+= -lvncserver
+	endif
+
+	ifeq ($(call try-build,$(SOURCE_SDL),$(CFLAGS),$(LDFLAGS) -lSDL),y)
+		OBJS_DYNOPT	+= ui/sdl.o
+		CFLAGS_DYNOPT	+= -DCONFIG_HAS_SDL
+		LIBS_DYNOPT	+= -lSDL
+	else
+		NOTFOUND	+= SDL
+	endif
+	ifeq ($(call try-build,$(SOURCE_SDL),$(CFLAGS),$(LDFLAGS) -lSDL -static), y)
+		OBJS_STATOPT	+= ui/sdl.o
+		CFLAGS_STATOPT	+= -DCONFIG_HAS_SDL
+		LIBS_STATOPT	+= -lSDL
+	endif
+endif
+
+ifeq (y,$(ARCH_HAS_FLASH_MEM))
+	OBJS	+= hw/cfi_flash.o
+endif
+
+ifeq ($(call try-build,$(SOURCE_ZLIB),$(CFLAGS),$(LDFLAGS) -lz),y)
+	CFLAGS_DYNOPT	+= -DCONFIG_HAS_ZLIB
+	LIBS_DYNOPT	+= -lz
+else
+	ifeq ($(call try-build,$(SOURCE_ZLIB),$(CFLAGS),$(LDFLAGS) -lz -static),y)
+		CFLAGS_STATOPT	+= -DCONFIG_HAS_ZLIB
+		LIBS_STATOPT	+= -lz
+	else
+		NOTFOUND	+= zlib
+	endif
+endif
+
+ifeq ($(call try-build,$(SOURCE_AIO),$(CFLAGS),$(LDFLAGS) -laio),y)
+	CFLAGS_DYNOPT	+= -DCONFIG_HAS_AIO
+	LIBS_DYNOPT	+= -laio
+	OBJS_DYNOPT	+= disk/aio.o
+else
+	ifeq ($(call try-build,$(SOURCE_AIO),$(CFLAGS),$(LDFLAGS) -laio -static),y)
+		CFLAGS_STATOPT	+= -DCONFIG_HAS_AIO
+		LIBS_STATOPT	+= -laio
+		OBJS_STATOPT	+= disk/aio.o
+	else
+		NOTFOUND	+= aio
+	endif
+endif
+
+ifeq ($(LTO),1)
+	FLAGS_LTO := -flto
+	ifeq ($(call try-build,$(SOURCE_HELLO),$(CFLAGS),$(LDFLAGS) $(FLAGS_LTO)),y)
+		CFLAGS		+= $(FLAGS_LTO)
+	endif
+endif
+
+ifeq ($(call try-build,$(SOURCE_STATIC),$(CFLAGS),$(LDFLAGS) -static),y)
+	CFLAGS		+= -DCONFIG_GUEST_INIT
+	GUEST_INIT	:= guest/init
+	GUEST_OBJS	= guest/guest_init.o
+	ifeq ($(ARCH_PRE_INIT),)
+		GUEST_INIT_FLAGS	+= -static $(PIE_FLAGS)
+	else
+		CFLAGS			+= -DCONFIG_GUEST_PRE_INIT
+		GUEST_INIT_FLAGS	+= -DCONFIG_GUEST_PRE_INIT
+		GUEST_PRE_INIT		:= guest/pre_init
+		GUEST_OBJS		+= guest/guest_pre_init.o
+	endif
+else
+$(warning No static libc found. Skipping guest init)
+	NOTFOUND        += static-libc
+endif
+
+ifeq (y,$(ARCH_WANT_LIBFDT))
+	ifneq ($(call try-build,$(SOURCE_LIBFDT),$(CFLAGS),-lfdt),y)
+          $(error No libfdt found. Please install libfdt-dev package)
+	else
+		CFLAGS_DYNOPT	+= -DCONFIG_HAS_LIBFDT
+		CFLAGS_STATOPT	+= -DCONFIG_HAS_LIBFDT
+		LIBS_DYNOPT	+= -lfdt
+		LIBS_STATOPT	+= -lfdt
+	endif
+endif
+
+ifeq ($(call try-build,$(SOURCE_HELLO),$(CFLAGS),-no-pie),y)
+	PIE_FLAGS	+= -no-pie
+endif
+
+ifneq ($(NOTFOUND),)
+        $(warning Skipping optional libraries: $(NOTFOUND))
+endif
+
+###
+
+LIBS	+= -lrt
+LIBS	+= -lpthread
+LIBS	+= -lutil
+
+
+comma = ,
+
+# The dependency file for the current target
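+# (commas in the object path are replaced with underscores because the path
+# is passed inside the comma-separated -Wp,-MD option below)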
+depfile = $(subst $(comma),_,$(dir $@).$(notdir $@).d)
+
+DEPS	:= $(foreach obj,$(OBJS),\
+		$(subst $(comma),_,$(dir $(obj)).$(notdir $(obj)).d))
+
+DEFINES	+= -D_FILE_OFFSET_BITS=64
+DEFINES	+= -D_GNU_SOURCE
+DEFINES	+= -DKVMTOOLS_VERSION='"$(KVMTOOLS_VERSION)"'
+DEFINES	+= -DBUILD_ARCH='"$(ARCH)"'
+
+KVM_INCLUDE := include
+CFLAGS	+= $(CPPFLAGS) $(DEFINES) -I$(KVM_INCLUDE) -I$(ARCH_INCLUDE) -O2 -fno-strict-aliasing -g
+
+WARNINGS += -Wall
+WARNINGS += -Wformat=2
+WARNINGS += -Winit-self
+WARNINGS += -Wmissing-declarations
+WARNINGS += -Wmissing-prototypes
+WARNINGS += -Wnested-externs
+WARNINGS += -Wno-system-headers
+WARNINGS += -Wold-style-definition
+WARNINGS += -Wredundant-decls
+WARNINGS += -Wsign-compare
+WARNINGS += -Wstrict-prototypes
+WARNINGS += -Wundef
+WARNINGS += -Wvolatile-register-var
+WARNINGS += -Wwrite-strings
+WARNINGS += -Wno-format-nonliteral
+
+CFLAGS	+= $(WARNINGS)
+
+ifneq ($(WERROR),0)
+	CFLAGS += -Werror
+endif
+
+all: $(PROGRAM) $(PROGRAM_ALIAS)
+
+# CFLAGS used when building objects
+# This is intentionally not assigned using :=
+c_flags	= -Wp,-MD,$(depfile) -Wp,-MT,$@ $(CFLAGS)
+
+# When building -static all objects are built with appropriate flags, which
+# may differ between static & dynamic .o.  The objects are separated into
+# .o and .static.o.  See the %.o: %.c rules below.
+#
+# $(OTHEROBJS) are things that do not get substituted like this.
+#
+STATIC_OBJS = $(patsubst %.o,%.static.o,$(OBJS) $(OBJS_STATOPT))
+
+STATIC_DEPS	:= $(foreach obj,$(STATIC_OBJS),\
+		$(subst $(comma),_,$(dir $(obj)).$(notdir $(obj)).d))
+
+$(PROGRAM)-static:  $(STATIC_OBJS) $(OTHEROBJS) $(GUEST_OBJS)
+	$(E) "  LINK    " $@
+	$(Q) $(CC) -static $(CFLAGS) $(STATIC_OBJS) $(OTHEROBJS) $(GUEST_OBJS) $(LDFLAGS) $(LIBS) $(LIBS_STATOPT) -o $@
+
+$(PROGRAM): $(OBJS) $(OBJS_DYNOPT) $(OTHEROBJS) $(GUEST_OBJS)
+	$(E) "  LINK    " $@
+	$(Q) $(CC) $(CFLAGS) $(OBJS) $(OBJS_DYNOPT) $(OTHEROBJS) $(GUEST_OBJS) $(LDFLAGS) $(LIBS) $(LIBS_DYNOPT) -o $@
+
+$(PROGRAM_ALIAS): $(PROGRAM)
+	$(E) "  LN      " $@
+	$(Q) ln -f $(PROGRAM) $@
+
+ifneq ($(ARCH_PRE_INIT),)
+$(GUEST_PRE_INIT): $(ARCH_PRE_INIT)
+	$(E) "  COMPILE " $@
+	$(Q) $(CC) -s $(PIE_FLAGS) -nostdlib $< -o $@
+
+guest/guest_pre_init.c: $(GUEST_PRE_INIT)
+	$(E) "  CONVERT " $@
+	$(Q) $(call binary-to-C,$<,pre_init_binary,$@)
+endif
+
+$(GUEST_INIT): guest/init.c
+	$(E) "  COMPILE " $@
+	$(Q) $(CC) $(GUEST_INIT_FLAGS) $< -o $@
+
+guest/guest_init.c: $(GUEST_INIT)
+	$(E) "  CONVERT " $@
+	$(Q) $(call binary-to-C,$<,init_binary,$@)
+
+%.s: %.c
+	$(Q) $(CC) -o $@ -S $(CFLAGS) -fverbose-asm $<
+
+$(OBJS):
+
+util/rbtree.static.o util/rbtree.o: util/rbtree.c
+ifeq ($(C),1)
+	$(E) "  CHECK   " $@
+	$(Q) $(CHECK) -c $(CFLAGS) $< -o $@
+endif
+	$(E) "  CC      " $@
+	$(Q) $(CC) -c $(c_flags) $< -o $@
+
+%.static.o: %.c
+ifeq ($(C),1)
+	$(E) "  CHECK   " $@
+	$(Q) $(CHECK) -c $(CFLAGS) $(CFLAGS_STATOPT) $< -o $@
+endif
+	$(E) "  CC      " $@
+	$(Q) $(CC) -c $(c_flags) $(CFLAGS_STATOPT)  $< -o $@
+
+%.o: %.c
+ifeq ($(C),1)
+	$(E) "  CHECK   " $@
+	$(Q) $(CHECK) -c $(CFLAGS) $(CFLAGS_DYNOPT) $< -o $@
+endif
+	$(E) "  CC      " $@
+	$(Q) $(CC) -c $(c_flags) $(CFLAGS_DYNOPT) $< -o $@
+
+
+#
+# BIOS assembly weirdness
+#
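+# The BIOS runs in 16-bit real mode but is compiled as 32-bit code with
+# code16gcc.h prepended (see the rules below), hence the i386 flags and
+# the lack of PIC and stack-protector support.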
+BIOS_CFLAGS += -m32
+BIOS_CFLAGS += -march=i386
+BIOS_CFLAGS += -mregparm=3
+
+BIOS_CFLAGS += -fno-stack-protector
+BIOS_CFLAGS += -fno-pic
+
+x86/bios.o: x86/bios/bios.bin x86/bios/bios-rom.h
+
+x86/bios/bios.bin.elf: x86/bios/entry.S x86/bios/e820.c x86/bios/int10.c x86/bios/int15.c x86/bios/rom.ld.S
+	$(E) "  CC       x86/bios/memcpy.o"
+	$(Q) $(CC) -include code16gcc.h $(CFLAGS) $(BIOS_CFLAGS) -c x86/bios/memcpy.c -o x86/bios/memcpy.o
+	$(E) "  CC       x86/bios/e820.o"
+	$(Q) $(CC) -include code16gcc.h $(CFLAGS) $(BIOS_CFLAGS) -c x86/bios/e820.c -o x86/bios/e820.o
+	$(E) "  CC       x86/bios/int10.o"
+	$(Q) $(CC) -include code16gcc.h $(CFLAGS) $(BIOS_CFLAGS) -c x86/bios/int10.c -o x86/bios/int10.o
+	$(E) "  CC       x86/bios/int15.o"
+	$(Q) $(CC) -include code16gcc.h $(CFLAGS) $(BIOS_CFLAGS) -c x86/bios/int15.c -o x86/bios/int15.o
+	$(E) "  CC       x86/bios/entry.o"
+	$(Q) $(CC) $(CFLAGS) $(BIOS_CFLAGS) -c x86/bios/entry.S -o x86/bios/entry.o
+	$(E) "  LD      " $@
+	$(Q) $(LD) -T x86/bios/rom.ld.S -o x86/bios/bios.bin.elf x86/bios/memcpy.o x86/bios/entry.o x86/bios/e820.o x86/bios/int10.o x86/bios/int15.o
+
+x86/bios/bios.bin: x86/bios/bios.bin.elf
+	$(E) "  OBJCOPY " $@
+	$(Q) $(OBJCOPY) -O binary -j .text x86/bios/bios.bin.elf x86/bios/bios.bin
+
+x86/bios/bios-rom.o: x86/bios/bios-rom.S x86/bios/bios.bin x86/bios/bios-rom.h
+	$(E) "  CC      " $@
+	$(Q) $(CC) -c $(CFLAGS) x86/bios/bios-rom.S -o x86/bios/bios-rom.o
+
+x86/bios/bios-rom.h: x86/bios/bios.bin.elf
+	$(E) "  NM      " $@
+	$(Q) cd x86/bios && sh gen-offsets.sh > bios-rom.h && cd ..
+
+check: all
+	$(MAKE) -C tests
+	./$(PROGRAM) run tests/pit/tick.bin
+	./$(PROGRAM) run -d tests/boot/boot_test.iso -p "init=init"
+.PHONY: check
+
+install: all
+	$(E) "  INSTALL"
+	$(Q) $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(bindir_SQ)' 
+	$(Q) $(INSTALL) $(PROGRAM) '$(DESTDIR_SQ)$(bindir_SQ)' 
+.PHONY: install
+
+clean:
+	$(E) "  CLEAN"
+	$(Q) rm -f x86/bios/*.bin
+	$(Q) rm -f x86/bios/*.elf
+	$(Q) rm -f x86/bios/*.o
+	$(Q) rm -f x86/bios/bios-rom.h
+	$(Q) rm -f tests/boot/boot_test.iso
+	$(Q) rm -rf tests/boot/rootfs/
+	$(Q) rm -f $(DEPS) $(STATIC_DEPS) $(OBJS) $(OTHEROBJS) $(OBJS_DYNOPT) $(STATIC_OBJS) $(PROGRAM) $(PROGRAM_ALIAS) $(PROGRAM)-static $(GUEST_INIT) $(GUEST_PRE_INIT) $(GUEST_OBJS)
+	$(Q) rm -f guest/guest_init.c guest/guest_pre_init.c
+	$(Q) rm -f cscope.*
+	$(Q) rm -f tags
+	$(Q) rm -f TAGS
+	$(Q) rm -f KVMTOOLS-VERSION-FILE
+.PHONY: clean
+
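+# Convenience target to create the /dev/kvm device node (char major 10,
+# minor 232) if it is missing; this typically requires root privileges.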
+KVM_DEV	?= /dev/kvm
+
+$(KVM_DEV):
+	$(E) "  MKNOD " $@
+	$(Q) mknod $@ char 10 232
+
+devices: $(KVM_DEV)
+.PHONY: devices
+
+TAGS:
+	$(E) "  GEN" $@
+	$(Q) $(RM) -f TAGS
+	$(Q) $(FIND) . -name '*.[hcS]' -print | xargs etags -a
+.PHONY: TAGS
+
+tags:
+	$(E) "  GEN" $@
+	$(Q) $(RM) -f tags
+	$(Q) $(FIND) . -name '*.[hcS]' -print | xargs ctags -a
+.PHONY: tags
+
+cscope:
+	$(E) "  GEN" $@
+	$(Q) $(FIND) . -name '*.[hcS]' -print > cscope.files
+	$(Q) $(CSCOPE) -bkqu
+.PHONY: cscope
+
+#
+# Avoid redundant work when cleaning up
+ifneq ($(MAKECMDGOALS),clean)
+-include $(DEPS)
+
+KVMTOOLS-VERSION-FILE:
+	@$(SHELL_PATH) util/KVMTOOLS-VERSION-GEN $(OUTPUT)
+-include $(OUTPUT)KVMTOOLS-VERSION-FILE
+endif
diff --git a/kvmtool/README b/kvmtool/README
new file mode 100644
index 0000000..d3c2d3a
--- /dev/null
+++ b/kvmtool/README
@@ -0,0 +1,119 @@
+Native Linux KVM tool
+=====================
+
+kvmtool is a lightweight tool for hosting KVM guests. As a pure virtualization
+tool it only supports guests using the same architecture, though it supports
+running 32-bit guests on those 64-bit architectures that allow this.
+
+From the original announcement email:
+-------------------------------------------------------
+The goal of this tool is to provide a clean, from-scratch, lightweight
+KVM host tool implementation that can boot Linux guest images (just a
+hobby, won't be big and professional like QEMU) with no BIOS
+dependencies and with only the minimal amount of legacy device
+emulation.
+
+It's great as a learning tool if you want to get your feet wet in
+virtualization land: it's only 5 KLOC of clean C code that can already
+boot a guest Linux image.
+
+Right now it can boot a Linux image and provide you output via a serial
+console, over the host terminal, i.e. you can use it to boot a guest
+Linux image in a terminal or over ssh and log into the guest without
+much guest or host side setup work needed.
+--------------------------
+
+This is the stand-alone version which does not live inside a Linux
+kernel tree.
+1. To check it out, clone the main git repository:
+
+  git clone git://git.kernel.org/pub/scm/linux/kernel/git/will/kvmtool.git
+
+2. Compile the tool (for more elaborate instructions see INSTALL):
+
+  cd kvmtool && make
+
+3. Download a raw userspace image:
+
+  wget http://wiki.qemu.org/download/linux-0.2.img.bz2 && \
+  bunzip2 linux-0.2.img.bz2
+
+4. The guest kernel has to be built with the following configuration:
+
+ - For the default console output:
+	CONFIG_SERIAL_8250=y
+	CONFIG_SERIAL_8250_CONSOLE=y
+
+ - For running 32-bit images on 64-bit hosts:
+	CONFIG_IA32_EMULATION=y
+
+ - Proper FS options according to image FS (e.g. CONFIG_EXT2_FS, CONFIG_EXT4_FS).
+
+ - For all virtio devices listed below:
+	CONFIG_VIRTIO=y
+	CONFIG_VIRTIO_RING=y
+	CONFIG_VIRTIO_PCI=y
+
+ - For virtio-blk devices (--disk, -d):
+	CONFIG_VIRTIO_BLK=y
+
+ - For virtio-net devices ([--network, -n] virtio):
+	CONFIG_VIRTIO_NET=y
+
+ - For virtio-9p devices (--virtio-9p):
+	CONFIG_NET_9P=y
+	CONFIG_NET_9P_VIRTIO=y
+	CONFIG_9P_FS=y
+
+ - For virtio-balloon device (--balloon):
+	CONFIG_VIRTIO_BALLOON=y
+
+ - For virtio-console device (--console virtio):
+	CONFIG_VIRTIO_CONSOLE=y
+
+ - For virtio-rng device (--rng):
+	CONFIG_HW_RANDOM_VIRTIO=y
+
+ - For vesa device (--sdl or --vnc):
+	CONFIG_FB_VESA=y
+
+
+5. And finally, launch the hypervisor:
+
+  ./lkvm run --disk linux-0.2.img \
+	    --kernel ../../arch/x86/boot/bzImage
+or
+
+  sudo ./lkvm run --disk linux-0.2.img \
+		 --kernel ../../arch/x86/boot/bzImage \
+		 --network virtio
+
+The tool has been written by Pekka Enberg, Cyrill Gorcunov, Asias He,
+Sasha Levin and Prasad Joshi. Special thanks to Avi Kivity for his help
+on KVM internals and Ingo Molnar for all-around support and encouragement!
+
+See the following thread for the original discussion and motivation of
+this project:
+
+http://thread.gmane.org/gmane.linux.kernel/962051/focus=962620
+
+Another detailed example can be found in the lwn.net article:
+
+http://lwn.net/Articles/658511/
+
+Contributing
+------------
+
+Please send patches for kvmtool to kvm@vger.kernel.org, in the usual git
+patch format, including "kvmtool" in the mail subject. "kvmtool" can be
+added automatically by issuing the command
+
+ git config format.subjectprefix "PATCH kvmtool"
+
+in the git repository.
+
+Maintainers
+-----------
+
+kvmtool is maintained by Will Deacon <will@kernel.org> and Julien Thierry
+<julien.thierry.kdev@gmail.com>.
diff --git a/kvmtool/arm/aarch32/arm-cpu.c b/kvmtool/arm/aarch32/arm-cpu.c
new file mode 100644
index 0000000..16bba55
--- /dev/null
+++ b/kvmtool/arm/aarch32/arm-cpu.c
@@ -0,0 +1,50 @@
+#include "kvm/kvm.h"
+#include "kvm/kvm-cpu.h"
+#include "kvm/util.h"
+
+#include "arm-common/gic.h"
+#include "arm-common/timer.h"
+
+#include <linux/byteorder.h>
+#include <linux/types.h>
+
+static void generate_fdt_nodes(void *fdt, struct kvm *kvm)
+{
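+	/*
+	 * PPIs used for the arch timer: secure physical, non-secure physical,
+	 * virtual and hypervisor timer, in that order.
+	 */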
+	int timer_interrupts[4] = {13, 14, 11, 10};
+
+	gic__generate_fdt_nodes(fdt, kvm->cfg.arch.irqchip);
+	timer__generate_fdt_nodes(fdt, kvm, timer_interrupts);
+}
+
+static int arm_cpu__vcpu_init(struct kvm_cpu *vcpu)
+{
+	vcpu->generate_fdt_nodes = generate_fdt_nodes;
+	return 0;
+}
+
+static struct kvm_arm_target target_generic_v7 = {
+	.id		= UINT_MAX,
+	.compatible	= "arm,arm-v7",
+	.init		= arm_cpu__vcpu_init,
+};
+
+static struct kvm_arm_target target_cortex_a15 = {
+	.id		= KVM_ARM_TARGET_CORTEX_A15,
+	.compatible	= "arm,cortex-a15",
+	.init		= arm_cpu__vcpu_init,
+};
+
+static struct kvm_arm_target target_cortex_a7 = {
+	.id		= KVM_ARM_TARGET_CORTEX_A7,
+	.compatible	= "arm,cortex-a7",
+	.init		= arm_cpu__vcpu_init,
+};
+
+static int arm_cpu__core_init(struct kvm *kvm)
+{
+	kvm_cpu__set_kvm_arm_generic_target(&target_generic_v7);
+
+	return (kvm_cpu__register_kvm_arm_target(&target_cortex_a15) ||
+		kvm_cpu__register_kvm_arm_target(&target_cortex_a7));
+}
+core_init(arm_cpu__core_init);
diff --git a/kvmtool/arm/aarch32/include/asm/kvm.h b/kvmtool/arm/aarch32/include/asm/kvm.h
new file mode 100644
index 0000000..a4217c1
--- /dev/null
+++ b/kvmtool/arm/aarch32/include/asm/kvm.h
@@ -0,0 +1,311 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * Copyright (C) 2012 - Virtual Open Systems and Columbia University
+ * Author: Christoffer Dall <c.dall@virtualopensystems.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+#ifndef __ARM_KVM_H__
+#define __ARM_KVM_H__
+
+#include <linux/types.h>
+#include <linux/psci.h>
+#include <asm/ptrace.h>
+
+#define __KVM_HAVE_GUEST_DEBUG
+#define __KVM_HAVE_IRQ_LINE
+#define __KVM_HAVE_READONLY_MEM
+#define __KVM_HAVE_VCPU_EVENTS
+
+#define KVM_COALESCED_MMIO_PAGE_OFFSET 1
+
+#define KVM_REG_SIZE(id)						\
+	(1U << (((id) & KVM_REG_SIZE_MASK) >> KVM_REG_SIZE_SHIFT))
+
+/* Valid for svc_regs, abt_regs, und_regs, irq_regs in struct kvm_regs */
+#define KVM_ARM_SVC_sp		svc_regs[0]
+#define KVM_ARM_SVC_lr		svc_regs[1]
+#define KVM_ARM_SVC_spsr	svc_regs[2]
+#define KVM_ARM_ABT_sp		abt_regs[0]
+#define KVM_ARM_ABT_lr		abt_regs[1]
+#define KVM_ARM_ABT_spsr	abt_regs[2]
+#define KVM_ARM_UND_sp		und_regs[0]
+#define KVM_ARM_UND_lr		und_regs[1]
+#define KVM_ARM_UND_spsr	und_regs[2]
+#define KVM_ARM_IRQ_sp		irq_regs[0]
+#define KVM_ARM_IRQ_lr		irq_regs[1]
+#define KVM_ARM_IRQ_spsr	irq_regs[2]
+
+/* Valid only for fiq_regs in struct kvm_regs */
+#define KVM_ARM_FIQ_r8		fiq_regs[0]
+#define KVM_ARM_FIQ_r9		fiq_regs[1]
+#define KVM_ARM_FIQ_r10		fiq_regs[2]
+#define KVM_ARM_FIQ_fp		fiq_regs[3]
+#define KVM_ARM_FIQ_ip		fiq_regs[4]
+#define KVM_ARM_FIQ_sp		fiq_regs[5]
+#define KVM_ARM_FIQ_lr		fiq_regs[6]
+#define KVM_ARM_FIQ_spsr	fiq_regs[7]
+
+struct kvm_regs {
+	struct pt_regs usr_regs;	/* R0_usr - R14_usr, PC, CPSR */
+	unsigned long svc_regs[3];	/* SP_svc, LR_svc, SPSR_svc */
+	unsigned long abt_regs[3];	/* SP_abt, LR_abt, SPSR_abt */
+	unsigned long und_regs[3];	/* SP_und, LR_und, SPSR_und */
+	unsigned long irq_regs[3];	/* SP_irq, LR_irq, SPSR_irq */
+	unsigned long fiq_regs[8];	/* R8_fiq - R14_fiq, SPSR_fiq */
+};
+
+/* Supported Processor Types */
+#define KVM_ARM_TARGET_CORTEX_A15	0
+#define KVM_ARM_TARGET_CORTEX_A7	1
+#define KVM_ARM_NUM_TARGETS		2
+
+/* KVM_ARM_SET_DEVICE_ADDR ioctl id encoding */
+#define KVM_ARM_DEVICE_TYPE_SHIFT	0
+#define KVM_ARM_DEVICE_TYPE_MASK	(0xffff << KVM_ARM_DEVICE_TYPE_SHIFT)
+#define KVM_ARM_DEVICE_ID_SHIFT		16
+#define KVM_ARM_DEVICE_ID_MASK		(0xffff << KVM_ARM_DEVICE_ID_SHIFT)
+
+/* Supported device IDs */
+#define KVM_ARM_DEVICE_VGIC_V2		0
+
+/* Supported VGIC address types  */
+#define KVM_VGIC_V2_ADDR_TYPE_DIST	0
+#define KVM_VGIC_V2_ADDR_TYPE_CPU	1
+
+#define KVM_VGIC_V2_DIST_SIZE		0x1000
+#define KVM_VGIC_V2_CPU_SIZE		0x2000
+
+/* Supported VGICv3 address types  */
+#define KVM_VGIC_V3_ADDR_TYPE_DIST	2
+#define KVM_VGIC_V3_ADDR_TYPE_REDIST	3
+#define KVM_VGIC_ITS_ADDR_TYPE		4
+#define KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION	5
+
+#define KVM_VGIC_V3_DIST_SIZE		SZ_64K
+#define KVM_VGIC_V3_REDIST_SIZE		(2 * SZ_64K)
+#define KVM_VGIC_V3_ITS_SIZE		(2 * SZ_64K)
+
+#define KVM_ARM_VCPU_POWER_OFF		0 /* CPU is started in OFF state */
+#define KVM_ARM_VCPU_PSCI_0_2		1 /* CPU uses PSCI v0.2 */
+
+struct kvm_vcpu_init {
+	__u32 target;
+	__u32 features[7];
+};
+
+struct kvm_sregs {
+};
+
+struct kvm_fpu {
+};
+
+struct kvm_guest_debug_arch {
+};
+
+struct kvm_debug_exit_arch {
+};
+
+struct kvm_sync_regs {
+	/* Used with KVM_CAP_ARM_USER_IRQ */
+	__u64 device_irq_level;
+};
+
+struct kvm_arch_memory_slot {
+};
+
+/* for KVM_GET/SET_VCPU_EVENTS */
+struct kvm_vcpu_events {
+	struct {
+		__u8 serror_pending;
+		__u8 serror_has_esr;
+		/* Align it to 8 bytes */
+		__u8 pad[6];
+		__u64 serror_esr;
+	} exception;
+	__u32 reserved[12];
+};
+
+/* If you need to interpret the index values, here is the key: */
+#define KVM_REG_ARM_COPROC_MASK		0x000000000FFF0000
+#define KVM_REG_ARM_COPROC_SHIFT	16
+#define KVM_REG_ARM_32_OPC2_MASK	0x0000000000000007
+#define KVM_REG_ARM_32_OPC2_SHIFT	0
+#define KVM_REG_ARM_OPC1_MASK		0x0000000000000078
+#define KVM_REG_ARM_OPC1_SHIFT		3
+#define KVM_REG_ARM_CRM_MASK		0x0000000000000780
+#define KVM_REG_ARM_CRM_SHIFT		7
+#define KVM_REG_ARM_32_CRN_MASK		0x0000000000007800
+#define KVM_REG_ARM_32_CRN_SHIFT	11
+/*
+ * For KVM currently all guest registers are nonsecure, but we reserve a bit
+ * in the encoding to distinguish secure from nonsecure for AArch32 system
+ * registers that are banked by security. This is 1 for the secure banked
+ * register, and 0 for the nonsecure banked register or if the register is
+ * not banked by security.
+ */
+#define KVM_REG_ARM_SECURE_MASK	0x0000000010000000
+#define KVM_REG_ARM_SECURE_SHIFT	28
+
+#define ARM_CP15_REG_SHIFT_MASK(x,n) \
+	(((x) << KVM_REG_ARM_ ## n ## _SHIFT) & KVM_REG_ARM_ ## n ## _MASK)
+
+#define __ARM_CP15_REG(op1,crn,crm,op2) \
+	(KVM_REG_ARM | (15 << KVM_REG_ARM_COPROC_SHIFT) | \
+	ARM_CP15_REG_SHIFT_MASK(op1, OPC1) | \
+	ARM_CP15_REG_SHIFT_MASK(crn, 32_CRN) | \
+	ARM_CP15_REG_SHIFT_MASK(crm, CRM) | \
+	ARM_CP15_REG_SHIFT_MASK(op2, 32_OPC2))
+
+#define ARM_CP15_REG32(...) (__ARM_CP15_REG(__VA_ARGS__) | KVM_REG_SIZE_U32)
+
+#define __ARM_CP15_REG64(op1,crm) \
+	(__ARM_CP15_REG(op1, 0, crm, 0) | KVM_REG_SIZE_U64)
+#define ARM_CP15_REG64(...) __ARM_CP15_REG64(__VA_ARGS__)
+
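+/*
+ * Worked example (editor's illustration, not part of the original header):
+ * the AArch32 MPIDR is op1=0, CRn=c0, CRm=c0, op2=5, so its index is
+ *
+ *	ARM_CP15_REG32(0, 0, 0, 5)
+ *		== KVM_REG_ARM | KVM_REG_SIZE_U32 |
+ *		   (15 << KVM_REG_ARM_COPROC_SHIFT) | 5
+ *
+ * which is how arm/aarch32/kvm-cpu.c builds reg.id for MPIDR reads.
+ */
+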
+/* PL1 Physical Timer Registers */
+#define KVM_REG_ARM_PTIMER_CTL		ARM_CP15_REG32(0, 14, 2, 1)
+#define KVM_REG_ARM_PTIMER_CNT		ARM_CP15_REG64(0, 14)
+#define KVM_REG_ARM_PTIMER_CVAL		ARM_CP15_REG64(2, 14)
+
+/* Virtual Timer Registers */
+#define KVM_REG_ARM_TIMER_CTL		ARM_CP15_REG32(0, 14, 3, 1)
+#define KVM_REG_ARM_TIMER_CNT		ARM_CP15_REG64(1, 14)
+#define KVM_REG_ARM_TIMER_CVAL		ARM_CP15_REG64(3, 14)
+
+/* Normal registers are mapped as coprocessor 16. */
+#define KVM_REG_ARM_CORE		(0x0010 << KVM_REG_ARM_COPROC_SHIFT)
+#define KVM_REG_ARM_CORE_REG(name)	(offsetof(struct kvm_regs, name) / 4)
+
+/* Some registers need more space to represent values. */
+#define KVM_REG_ARM_DEMUX		(0x0011 << KVM_REG_ARM_COPROC_SHIFT)
+#define KVM_REG_ARM_DEMUX_ID_MASK	0x000000000000FF00
+#define KVM_REG_ARM_DEMUX_ID_SHIFT	8
+#define KVM_REG_ARM_DEMUX_ID_CCSIDR	(0x00 << KVM_REG_ARM_DEMUX_ID_SHIFT)
+#define KVM_REG_ARM_DEMUX_VAL_MASK	0x00000000000000FF
+#define KVM_REG_ARM_DEMUX_VAL_SHIFT	0
+
+/* VFP registers: we could overload CP10 like ARM does, but that's ugly. */
+#define KVM_REG_ARM_VFP			(0x0012 << KVM_REG_ARM_COPROC_SHIFT)
+#define KVM_REG_ARM_VFP_MASK		0x000000000000FFFF
+#define KVM_REG_ARM_VFP_BASE_REG	0x0
+#define KVM_REG_ARM_VFP_FPSID		0x1000
+#define KVM_REG_ARM_VFP_FPSCR		0x1001
+#define KVM_REG_ARM_VFP_MVFR1		0x1006
+#define KVM_REG_ARM_VFP_MVFR0		0x1007
+#define KVM_REG_ARM_VFP_FPEXC		0x1008
+#define KVM_REG_ARM_VFP_FPINST		0x1009
+#define KVM_REG_ARM_VFP_FPINST2		0x100A
+
+/* KVM-as-firmware specific pseudo-registers */
+#define KVM_REG_ARM_FW			(0x0014 << KVM_REG_ARM_COPROC_SHIFT)
+#define KVM_REG_ARM_FW_REG(r)		(KVM_REG_ARM | KVM_REG_SIZE_U64 | \
+					 KVM_REG_ARM_FW | ((r) & 0xffff))
+#define KVM_REG_ARM_PSCI_VERSION	KVM_REG_ARM_FW_REG(0)
+#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1	KVM_REG_ARM_FW_REG(1)
+	/* Higher values mean better protection. */
+#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL		0
+#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_AVAIL		1
+#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_REQUIRED	2
+#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2	KVM_REG_ARM_FW_REG(2)
+	/* Higher values mean better protection. */
+#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL		0
+#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_UNKNOWN		1
+#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL		2
+#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED	3
+#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED	(1U << 4)
+
+/* Device Control API: ARM VGIC */
+#define KVM_DEV_ARM_VGIC_GRP_ADDR	0
+#define KVM_DEV_ARM_VGIC_GRP_DIST_REGS	1
+#define KVM_DEV_ARM_VGIC_GRP_CPU_REGS	2
+#define   KVM_DEV_ARM_VGIC_CPUID_SHIFT	32
+#define   KVM_DEV_ARM_VGIC_CPUID_MASK	(0xffULL << KVM_DEV_ARM_VGIC_CPUID_SHIFT)
+#define   KVM_DEV_ARM_VGIC_V3_MPIDR_SHIFT 32
+#define   KVM_DEV_ARM_VGIC_V3_MPIDR_MASK \
+			(0xffffffffULL << KVM_DEV_ARM_VGIC_V3_MPIDR_SHIFT)
+#define   KVM_DEV_ARM_VGIC_OFFSET_SHIFT	0
+#define   KVM_DEV_ARM_VGIC_OFFSET_MASK	(0xffffffffULL << KVM_DEV_ARM_VGIC_OFFSET_SHIFT)
+#define   KVM_DEV_ARM_VGIC_SYSREG_INSTR_MASK (0xffff)
+#define KVM_DEV_ARM_VGIC_GRP_NR_IRQS	3
+#define KVM_DEV_ARM_VGIC_GRP_CTRL       4
+#define KVM_DEV_ARM_VGIC_GRP_REDIST_REGS 5
+#define KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS 6
+#define KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO  7
+#define KVM_DEV_ARM_VGIC_GRP_ITS_REGS	8
+#define KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT	10
+#define KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_MASK \
+			(0x3fffffULL << KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT)
+#define KVM_DEV_ARM_VGIC_LINE_LEVEL_INTID_MASK 0x3ff
+#define VGIC_LEVEL_INFO_LINE_LEVEL	0
+
+/* Device Control API on vcpu fd */
+#define KVM_ARM_VCPU_PMU_V3_CTRL	0
+#define   KVM_ARM_VCPU_PMU_V3_IRQ	0
+#define   KVM_ARM_VCPU_PMU_V3_INIT	1
+#define KVM_ARM_VCPU_TIMER_CTRL		1
+#define   KVM_ARM_VCPU_TIMER_IRQ_VTIMER		0
+#define   KVM_ARM_VCPU_TIMER_IRQ_PTIMER		1
+
+#define   KVM_DEV_ARM_VGIC_CTRL_INIT		0
+#define   KVM_DEV_ARM_ITS_SAVE_TABLES		1
+#define   KVM_DEV_ARM_ITS_RESTORE_TABLES	2
+#define   KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES	3
+#define   KVM_DEV_ARM_ITS_CTRL_RESET		4
+
+/* KVM_IRQ_LINE irq field index values */
+#define KVM_ARM_IRQ_TYPE_SHIFT		24
+#define KVM_ARM_IRQ_TYPE_MASK		0xff
+#define KVM_ARM_IRQ_VCPU_SHIFT		16
+#define KVM_ARM_IRQ_VCPU_MASK		0xff
+#define KVM_ARM_IRQ_NUM_SHIFT		0
+#define KVM_ARM_IRQ_NUM_MASK		0xffff
+
+/* irq_type field */
+#define KVM_ARM_IRQ_TYPE_CPU		0
+#define KVM_ARM_IRQ_TYPE_SPI		1
+#define KVM_ARM_IRQ_TYPE_PPI		2
+
+/* out-of-kernel GIC cpu interrupt injection irq_number field */
+#define KVM_ARM_IRQ_CPU_IRQ		0
+#define KVM_ARM_IRQ_CPU_FIQ		1
+
+/*
+ * This used to hold the highest supported SPI, but it is now obsolete
+ * and only here to provide source code level compatibility with older
+ * userland. The highest SPI number can be set via KVM_DEV_ARM_VGIC_GRP_NR_IRQS.
+ */
+#ifndef __KERNEL__
+#define KVM_ARM_IRQ_GIC_MAX		127
+#endif
+
+/* One single KVM irqchip, i.e. the VGIC */
+#define KVM_NR_IRQCHIPS          1
+
+/* PSCI interface */
+#define KVM_PSCI_FN_BASE		0x95c1ba5e
+#define KVM_PSCI_FN(n)			(KVM_PSCI_FN_BASE + (n))
+
+#define KVM_PSCI_FN_CPU_SUSPEND		KVM_PSCI_FN(0)
+#define KVM_PSCI_FN_CPU_OFF		KVM_PSCI_FN(1)
+#define KVM_PSCI_FN_CPU_ON		KVM_PSCI_FN(2)
+#define KVM_PSCI_FN_MIGRATE		KVM_PSCI_FN(3)
+
+#define KVM_PSCI_RET_SUCCESS		PSCI_RET_SUCCESS
+#define KVM_PSCI_RET_NI			PSCI_RET_NOT_SUPPORTED
+#define KVM_PSCI_RET_INVAL		PSCI_RET_INVALID_PARAMS
+#define KVM_PSCI_RET_DENIED		PSCI_RET_DENIED
+
+#endif /* __ARM_KVM_H__ */
diff --git a/kvmtool/arm/aarch32/include/kvm/barrier.h b/kvmtool/arm/aarch32/include/kvm/barrier.h
new file mode 100644
index 0000000..94913a9
--- /dev/null
+++ b/kvmtool/arm/aarch32/include/kvm/barrier.h
@@ -0,0 +1,10 @@
+#ifndef KVM__KVM_BARRIER_H
+#define KVM__KVM_BARRIER_H
+
+#define dmb()	asm volatile ("dmb" : : : "memory")
+
+#define mb()	dmb()
+#define rmb()	dmb()
+#define wmb()	dmb()
+
+#endif /* KVM__KVM_BARRIER_H */
diff --git a/kvmtool/arm/aarch32/include/kvm/fdt-arch.h b/kvmtool/arm/aarch32/include/kvm/fdt-arch.h
new file mode 100644
index 0000000..e448bf1
--- /dev/null
+++ b/kvmtool/arm/aarch32/include/kvm/fdt-arch.h
@@ -0,0 +1,6 @@
+#ifndef KVM__KVM_FDT_H
+#define KVM__KVM_FDT_H
+
+#include "arm-common/fdt-arch.h"
+
+#endif /* KVM__KVM_FDT_H */
diff --git a/kvmtool/arm/aarch32/include/kvm/kvm-arch.h b/kvmtool/arm/aarch32/include/kvm/kvm-arch.h
new file mode 100644
index 0000000..a772bb1
--- /dev/null
+++ b/kvmtool/arm/aarch32/include/kvm/kvm-arch.h
@@ -0,0 +1,10 @@
+#ifndef KVM__KVM_ARCH_H
+#define KVM__KVM_ARCH_H
+
+#define kvm__arch_get_kern_offset(...)	0x8000
+
+#define ARM_MAX_MEMORY(...)	ARM_LOMAP_MAX_MEMORY
+
+#include "arm-common/kvm-arch.h"
+
+#endif /* KVM__KVM_ARCH_H */
diff --git a/kvmtool/arm/aarch32/include/kvm/kvm-config-arch.h b/kvmtool/arm/aarch32/include/kvm/kvm-config-arch.h
new file mode 100644
index 0000000..acf0d23
--- /dev/null
+++ b/kvmtool/arm/aarch32/include/kvm/kvm-config-arch.h
@@ -0,0 +1,8 @@
+#ifndef KVM__KVM_CONFIG_ARCH_H
+#define KVM__KVM_CONFIG_ARCH_H
+
+#define ARM_OPT_ARCH_RUN(...)
+
+#include "arm-common/kvm-config-arch.h"
+
+#endif /* KVM__KVM_CONFIG_ARCH_H */
diff --git a/kvmtool/arm/aarch32/include/kvm/kvm-cpu-arch.h b/kvmtool/arm/aarch32/include/kvm/kvm-cpu-arch.h
new file mode 100644
index 0000000..780e0e2
--- /dev/null
+++ b/kvmtool/arm/aarch32/include/kvm/kvm-cpu-arch.h
@@ -0,0 +1,23 @@
+#ifndef KVM__KVM_CPU_ARCH_H
+#define KVM__KVM_CPU_ARCH_H
+
+#include "kvm/kvm.h"
+
+#include "arm-common/kvm-cpu-arch.h"
+
+#define ARM_VCPU_FEATURE_FLAGS(kvm, cpuid)	{			\
+	[0] = (!!(cpuid) << KVM_ARM_VCPU_POWER_OFF),			\
+}
+
+#define ARM_MPIDR_HWID_BITMASK	0xFFFFFF
+#define ARM_CPU_ID		0, 0, 0
+#define ARM_CPU_ID_MPIDR	5
+
+static inline void kvm_cpu__select_features(struct kvm *kvm,
+					    struct kvm_vcpu_init *init) { }
+static inline int kvm_cpu__configure_features(struct kvm_cpu *vcpu)
+{
+	return 0;
+}
+
+#endif /* KVM__KVM_CPU_ARCH_H */
diff --git a/kvmtool/arm/aarch32/kvm-cpu.c b/kvmtool/arm/aarch32/kvm-cpu.c
new file mode 100644
index 0000000..95fb1da
--- /dev/null
+++ b/kvmtool/arm/aarch32/kvm-cpu.c
@@ -0,0 +1,132 @@
+#include "kvm/kvm-cpu.h"
+#include "kvm/kvm.h"
+#include "kvm/virtio.h"
+
+#include <asm/ptrace.h>
+
+#define ARM_CORE_REG(x)	(KVM_REG_ARM | KVM_REG_SIZE_U32 | KVM_REG_ARM_CORE | \
+			 KVM_REG_ARM_CORE_REG(x))
+
+unsigned long kvm_cpu__get_vcpu_mpidr(struct kvm_cpu *vcpu)
+{
+	struct kvm_one_reg reg;
+	u32 mpidr;
+
+	reg.id = ARM_CP15_REG32(ARM_CPU_ID, ARM_CPU_ID_MPIDR);
+	reg.addr = (u64)(unsigned long)&mpidr;
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0)
+		die("KVM_GET_ONE_REG failed (get_mpidr vcpu%ld)", vcpu->cpu_id);
+
+	return mpidr;
+}
+
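+/*
+ * The fill-kvm_one_reg/ioctl/die pattern in this file repeats for every
+ * register access; a small wrapper (editor's sketch only, not a kvmtool
+ * helper) could factor it out:
+ *
+ *	static u32 get_reg32(struct kvm_cpu *vcpu, u64 id)
+ *	{
+ *		u32 data;
+ *		struct kvm_one_reg reg = {
+ *			.id	= id,
+ *			.addr	= (u64)(unsigned long)&data,
+ *		};
+ *
+ *		if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0)
+ *			die_perror("KVM_GET_ONE_REG");
+ *		return data;
+ *	}
+ */
+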
+void kvm_cpu__reset_vcpu(struct kvm_cpu *vcpu)
+{
+	struct kvm *kvm	= vcpu->kvm;
+	struct kvm_one_reg reg;
+	u32 data;
+
+	/* Who said future-proofing was a good idea? */
+	reg.addr = (u64)(unsigned long)&data;
+
+	/* cpsr = IRQs/FIQs masked */
+	data	= PSR_I_BIT | PSR_F_BIT | SVC_MODE;
+	reg.id	= ARM_CORE_REG(usr_regs.ARM_cpsr);
+	if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0)
+		die_perror("KVM_SET_ONE_REG failed (cpsr)");
+
+	/* Secondary cores are stopped awaiting PSCI wakeup */
+	if (vcpu->cpu_id != 0)
+		return;
+
+	/* r0 = 0 */
+	data	= 0;
+	reg.id	= ARM_CORE_REG(usr_regs.ARM_r0);
+	if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0)
+		die_perror("KVM_SET_ONE_REG failed (r0)");
+
+	/* r1 = machine type (-1) */
+	data	= -1;
+	reg.id	= ARM_CORE_REG(usr_regs.ARM_r1);
+	if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0)
+		die_perror("KVM_SET_ONE_REG failed (r1)");
+
+	/* r2 = physical address of the device tree blob */
+	data	= kvm->arch.dtb_guest_start;
+	reg.id	= ARM_CORE_REG(usr_regs.ARM_r2);
+	if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0)
+		die_perror("KVM_SET_ONE_REG failed (r2)");
+
+	/* pc = start of kernel image */
+	data	= kvm->arch.kern_guest_start;
+	reg.id	= ARM_CORE_REG(usr_regs.ARM_pc);
+	if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0)
+		die_perror("KVM_SET_ONE_REG failed (pc)");
+}
+
+int kvm_cpu__get_endianness(struct kvm_cpu *vcpu)
+{
+	struct kvm_one_reg reg;
+	u32 data;
+
+	reg.id = ARM_CORE_REG(usr_regs.ARM_cpsr);
+	reg.addr = (u64)(unsigned long)&data;
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0)
+		die("KVM_GET_ONE_REG failed (cpsr)");
+
+	return (data & PSR_E_BIT) ? VIRTIO_ENDIAN_BE : VIRTIO_ENDIAN_LE;
+}
+
+void kvm_cpu__show_code(struct kvm_cpu *vcpu)
+{
+	struct kvm_one_reg reg;
+	u32 data;
+	int debug_fd = kvm_cpu__get_debug_fd();
+
+	reg.addr = (u64)(unsigned long)&data;
+
+	dprintf(debug_fd, "\n*pc:\n");
+	reg.id = ARM_CORE_REG(usr_regs.ARM_pc);
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0)
+		die("KVM_GET_ONE_REG failed (show_code @ PC)");
+
+	kvm__dump_mem(vcpu->kvm, data, 32, debug_fd);
+
+	dprintf(debug_fd, "\n*lr (svc):\n");
+	reg.id = ARM_CORE_REG(svc_regs[1]);
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0)
+		die("KVM_GET_ONE_REG failed (show_code @ LR_svc)");
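+	/* Bit 0 of LR is the Thumb state bit; clear it before dumping */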
+	data &= ~0x1;
+
+	kvm__dump_mem(vcpu->kvm, data, 32, debug_fd);
+}
+
+void kvm_cpu__show_registers(struct kvm_cpu *vcpu)
+{
+	struct kvm_one_reg reg;
+	u32 data;
+	int debug_fd = kvm_cpu__get_debug_fd();
+
+	reg.addr	= (u64)(unsigned long)&data;
+	dprintf(debug_fd, "\n Registers:\n");
+
+	reg.id		= ARM_CORE_REG(usr_regs.ARM_pc);
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0)
+		die("KVM_GET_ONE_REG failed (pc)");
+	dprintf(debug_fd, " PC:    0x%x\n", data);
+
+	reg.id		= ARM_CORE_REG(usr_regs.ARM_cpsr);
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0)
+		die("KVM_GET_ONE_REG failed (cpsr)");
+	dprintf(debug_fd, " CPSR:  0x%x\n", data);
+
+	reg.id		= ARM_CORE_REG(svc_regs[0]);
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0)
+		die("KVM_GET_ONE_REG failed (SP_svc)");
+	dprintf(debug_fd, " SP_svc:  0x%x\n", data);
+
+	reg.id		= ARM_CORE_REG(svc_regs[1]);
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0)
+		die("KVM_GET_ONE_REG failed (LR_svc)");
+	dprintf(debug_fd, " LR_svc:  0x%x\n", data);
+}
diff --git a/kvmtool/arm/aarch64/arm-cpu.c b/kvmtool/arm/aarch64/arm-cpu.c
new file mode 100644
index 0000000..d7572b7
--- /dev/null
+++ b/kvmtool/arm/aarch64/arm-cpu.c
@@ -0,0 +1,72 @@
+#include "kvm/fdt.h"
+#include "kvm/kvm.h"
+#include "kvm/kvm-cpu.h"
+#include "kvm/util.h"
+
+#include "arm-common/gic.h"
+#include "arm-common/timer.h"
+#include "arm-common/pmu.h"
+
+#include <linux/byteorder.h>
+#include <linux/types.h>
+
+static void generate_fdt_nodes(void *fdt, struct kvm *kvm)
+{
+	int timer_interrupts[4] = {13, 14, 11, 10};
+
+	gic__generate_fdt_nodes(fdt, kvm->cfg.arch.irqchip);
+	timer__generate_fdt_nodes(fdt, kvm, timer_interrupts);
+	pmu__generate_fdt_nodes(fdt, kvm);
+}
+
+static int arm_cpu__vcpu_init(struct kvm_cpu *vcpu)
+{
+	vcpu->generate_fdt_nodes = generate_fdt_nodes;
+	return 0;
+}
+
+static struct kvm_arm_target target_generic_v8 = {
+	.id		= UINT_MAX,
+	.compatible	= "arm,arm-v8",
+	.init		= arm_cpu__vcpu_init,
+};
+
+static struct kvm_arm_target target_aem_v8 = {
+	.id		= KVM_ARM_TARGET_AEM_V8,
+	.compatible	= "arm,arm-v8",
+	.init		= arm_cpu__vcpu_init,
+};
+
+static struct kvm_arm_target target_foundation_v8 = {
+	.id		= KVM_ARM_TARGET_FOUNDATION_V8,
+	.compatible	= "arm,arm-v8",
+	.init		= arm_cpu__vcpu_init,
+};
+
+static struct kvm_arm_target target_cortex_a57 = {
+	.id		= KVM_ARM_TARGET_CORTEX_A57,
+	.compatible	= "arm,cortex-a57",
+	.init		= arm_cpu__vcpu_init,
+};
+
+/*
+ * We really don't need to register a target for every
+ * new CPU. The target for the Potenza CPU is only registered
+ * to enable compatibility with older host kernels.
+ */
+static struct kvm_arm_target target_potenza = {
+	.id		= KVM_ARM_TARGET_XGENE_POTENZA,
+	.compatible	= "arm,arm-v8",
+	.init		= arm_cpu__vcpu_init,
+};
+
+static int arm_cpu__core_init(struct kvm *kvm)
+{
+	kvm_cpu__set_kvm_arm_generic_target(&target_generic_v8);
+
+	return (kvm_cpu__register_kvm_arm_target(&target_aem_v8) ||
+		kvm_cpu__register_kvm_arm_target(&target_foundation_v8) ||
+		kvm_cpu__register_kvm_arm_target(&target_cortex_a57) ||
+		kvm_cpu__register_kvm_arm_target(&target_potenza));
+}
+core_init(arm_cpu__core_init);
diff --git a/kvmtool/arm/aarch64/include/asm/image.h b/kvmtool/arm/aarch64/include/asm/image.h
new file mode 100644
index 0000000..c2b1321
--- /dev/null
+++ b/kvmtool/arm/aarch64/include/asm/image.h
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __ASM_IMAGE_H
+#define __ASM_IMAGE_H
+
+#define ARM64_IMAGE_MAGIC	"ARM\x64"
+
+#define ARM64_IMAGE_FLAG_BE_SHIFT		0
+#define ARM64_IMAGE_FLAG_PAGE_SIZE_SHIFT	(ARM64_IMAGE_FLAG_BE_SHIFT + 1)
+#define ARM64_IMAGE_FLAG_PHYS_BASE_SHIFT \
+					(ARM64_IMAGE_FLAG_PAGE_SIZE_SHIFT + 2)
+#define ARM64_IMAGE_FLAG_BE_MASK		0x1
+#define ARM64_IMAGE_FLAG_PAGE_SIZE_MASK		0x3
+#define ARM64_IMAGE_FLAG_PHYS_BASE_MASK		0x1
+
+#define ARM64_IMAGE_FLAG_LE			0
+#define ARM64_IMAGE_FLAG_BE			1
+#define ARM64_IMAGE_FLAG_PAGE_SIZE_4K		1
+#define ARM64_IMAGE_FLAG_PAGE_SIZE_16K		2
+#define ARM64_IMAGE_FLAG_PAGE_SIZE_64K		3
+#define ARM64_IMAGE_FLAG_PHYS_BASE		1
+
+#ifndef __ASSEMBLY__
+
+#define arm64_image_flag_field(flags, field) \
+				(((flags) >> field##_SHIFT) & field##_MASK)
+
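+/*
+ * Editor's illustration, derived from the definitions above: a
+ * little-endian kernel built for 4K pages (PHYS_BASE bit clear) has
+ * flags == 0x2, so
+ *
+ *	arm64_image_flag_field(0x2, ARM64_IMAGE_FLAG_PAGE_SIZE)
+ *		== (0x2 >> 1) & 0x3 == ARM64_IMAGE_FLAG_PAGE_SIZE_4K
+ *
+ * and bit 0 reads back as ARM64_IMAGE_FLAG_LE.
+ */
+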
+/*
+ * struct arm64_image_header - arm64 kernel image header
+ * See Documentation/arm64/booting.rst for details
+ *
+ * @code0:		Executable code, or
+ *   @mz_header		  alternatively used for part of MZ header
+ * @code1:		Executable code
+ * @text_offset:	Image load offset
+ * @image_size:		Effective Image size
+ * @flags:		kernel flags
+ * @res2 - @res4:	reserved
+ * @magic:		Magic number
+ * @res5:		reserved, or
+ *   @pe_header:	  alternatively used for PE COFF offset
+ */
+
+struct arm64_image_header {
+	__le32 code0;
+	__le32 code1;
+	__le64 text_offset;
+	__le64 image_size;
+	__le64 flags;
+	__le64 res2;
+	__le64 res3;
+	__le64 res4;
+	__le32 magic;
+	__le32 res5;
+};
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* __ASM_IMAGE_H */
diff --git a/kvmtool/arm/aarch64/include/asm/kvm.h b/kvmtool/arm/aarch64/include/asm/kvm.h
new file mode 100644
index 0000000..9a50771
--- /dev/null
+++ b/kvmtool/arm/aarch64/include/asm/kvm.h
@@ -0,0 +1,372 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * Copyright (C) 2012,2013 - ARM Ltd
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ *
+ * Derived from arch/arm/include/uapi/asm/kvm.h:
+ * Copyright (C) 2012 - Virtual Open Systems and Columbia University
+ * Author: Christoffer Dall <c.dall@virtualopensystems.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __ARM_KVM_H__
+#define __ARM_KVM_H__
+
+#define KVM_SPSR_EL1	0
+#define KVM_SPSR_SVC	KVM_SPSR_EL1
+#define KVM_SPSR_ABT	1
+#define KVM_SPSR_UND	2
+#define KVM_SPSR_IRQ	3
+#define KVM_SPSR_FIQ	4
+#define KVM_NR_SPSR	5
+
+#ifndef __ASSEMBLY__
+#include <linux/psci.h>
+#include <linux/types.h>
+#include <asm/ptrace.h>
+#include <asm/sve_context.h>
+
+#define __KVM_HAVE_GUEST_DEBUG
+#define __KVM_HAVE_IRQ_LINE
+#define __KVM_HAVE_READONLY_MEM
+#define __KVM_HAVE_VCPU_EVENTS
+
+#define KVM_COALESCED_MMIO_PAGE_OFFSET 1
+
+#define KVM_REG_SIZE(id)						\
+	(1U << (((id) & KVM_REG_SIZE_MASK) >> KVM_REG_SIZE_SHIFT))
+
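+/*
+ * Editor's note: KVM_REG_SIZE() turns the size field of a register ID
+ * back into bytes; a KVM_REG_SIZE_U64 ID carries 3 in that field, so
+ * KVM_REG_SIZE(id) == 1U << 3 == 8 bytes.
+ */
+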
+struct kvm_regs {
+	struct user_pt_regs regs;	/* sp = sp_el0 */
+
+	__u64	sp_el1;
+	__u64	elr_el1;
+
+	__u64	spsr[KVM_NR_SPSR];
+
+	struct user_fpsimd_state fp_regs;
+};
+
+/*
+ * Supported CPU Targets - Adding a new target type is not recommended,
+ * unless there are some special registers not supported by the
+ * genericv8 sysreg table.
+ */
+#define KVM_ARM_TARGET_AEM_V8		0
+#define KVM_ARM_TARGET_FOUNDATION_V8	1
+#define KVM_ARM_TARGET_CORTEX_A57	2
+#define KVM_ARM_TARGET_XGENE_POTENZA	3
+#define KVM_ARM_TARGET_CORTEX_A53	4
+/* Generic ARM v8 target */
+#define KVM_ARM_TARGET_GENERIC_V8	5
+
+#define KVM_ARM_NUM_TARGETS		6
+
+/* KVM_ARM_SET_DEVICE_ADDR ioctl id encoding */
+#define KVM_ARM_DEVICE_TYPE_SHIFT	0
+#define KVM_ARM_DEVICE_TYPE_MASK	(0xffff << KVM_ARM_DEVICE_TYPE_SHIFT)
+#define KVM_ARM_DEVICE_ID_SHIFT		16
+#define KVM_ARM_DEVICE_ID_MASK		(0xffff << KVM_ARM_DEVICE_ID_SHIFT)
+
+/* Supported device IDs */
+#define KVM_ARM_DEVICE_VGIC_V2		0
+
+/* Supported VGIC address types  */
+#define KVM_VGIC_V2_ADDR_TYPE_DIST	0
+#define KVM_VGIC_V2_ADDR_TYPE_CPU	1
+
+#define KVM_VGIC_V2_DIST_SIZE		0x1000
+#define KVM_VGIC_V2_CPU_SIZE		0x2000
+
+/* Supported VGICv3 address types  */
+#define KVM_VGIC_V3_ADDR_TYPE_DIST	2
+#define KVM_VGIC_V3_ADDR_TYPE_REDIST	3
+#define KVM_VGIC_ITS_ADDR_TYPE		4
+#define KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION	5
+
+#define KVM_VGIC_V3_DIST_SIZE		SZ_64K
+#define KVM_VGIC_V3_REDIST_SIZE		(2 * SZ_64K)
+#define KVM_VGIC_V3_ITS_SIZE		(2 * SZ_64K)
+
+#define KVM_ARM_VCPU_POWER_OFF		0 /* CPU is started in OFF state */
+#define KVM_ARM_VCPU_EL1_32BIT		1 /* CPU running a 32bit VM */
+#define KVM_ARM_VCPU_PSCI_0_2		2 /* CPU uses PSCI v0.2 */
+#define KVM_ARM_VCPU_PMU_V3		3 /* Support guest PMUv3 */
+#define KVM_ARM_VCPU_SVE		4 /* enable SVE for this CPU */
+#define KVM_ARM_VCPU_PTRAUTH_ADDRESS	5 /* VCPU uses address authentication */
+#define KVM_ARM_VCPU_PTRAUTH_GENERIC	6 /* VCPU uses generic authentication */
+
+struct kvm_vcpu_init {
+	__u32 target;
+	__u32 features[7];
+};
+
+struct kvm_sregs {
+};
+
+struct kvm_fpu {
+};
+
+/*
+ * See v8 ARM ARM D7.3: Debug Registers
+ *
+ * The architectural limit is 16 debug registers of each type although
+ * in practice there are usually fewer (see ID_AA64DFR0_EL1).
+ *
+ * Although the control registers are architecturally defined as 32
+ * bits wide we use a 64 bit structure here to keep parity with
+ * KVM_GET/SET_ONE_REG behaviour which treats all system registers as
+ * 64 bit values. It also allows for the possibility of the
+ * architecture expanding the control registers without having to
+ * change the userspace ABI.
+ */
+#define KVM_ARM_MAX_DBG_REGS 16
+struct kvm_guest_debug_arch {
+	__u64 dbg_bcr[KVM_ARM_MAX_DBG_REGS];
+	__u64 dbg_bvr[KVM_ARM_MAX_DBG_REGS];
+	__u64 dbg_wcr[KVM_ARM_MAX_DBG_REGS];
+	__u64 dbg_wvr[KVM_ARM_MAX_DBG_REGS];
+};
+
+struct kvm_debug_exit_arch {
+	__u32 hsr;
+	__u64 far;	/* used for watchpoints */
+};
+
+/*
+ * Architecture specific defines for kvm_guest_debug->control
+ */
+
+#define KVM_GUESTDBG_USE_SW_BP		(1 << 16)
+#define KVM_GUESTDBG_USE_HW		(1 << 17)
+
+struct kvm_sync_regs {
+	/* Used with KVM_CAP_ARM_USER_IRQ */
+	__u64 device_irq_level;
+};
+
+struct kvm_arch_memory_slot {
+};
+
+/* for KVM_GET/SET_VCPU_EVENTS */
+struct kvm_vcpu_events {
+	struct {
+		__u8 serror_pending;
+		__u8 serror_has_esr;
+		/* Align it to 8 bytes */
+		__u8 pad[6];
+		__u64 serror_esr;
+	} exception;
+	__u32 reserved[12];
+};
+
+/* If you need to interpret the index values, here is the key: */
+#define KVM_REG_ARM_COPROC_MASK		0x000000000FFF0000
+#define KVM_REG_ARM_COPROC_SHIFT	16
+
+/* Normal registers are mapped as coprocessor 16. */
+#define KVM_REG_ARM_CORE		(0x0010 << KVM_REG_ARM_COPROC_SHIFT)
+#define KVM_REG_ARM_CORE_REG(name)	(offsetof(struct kvm_regs, name) / sizeof(__u32))
+
+/* Some registers need more space to represent values. */
+#define KVM_REG_ARM_DEMUX		(0x0011 << KVM_REG_ARM_COPROC_SHIFT)
+#define KVM_REG_ARM_DEMUX_ID_MASK	0x000000000000FF00
+#define KVM_REG_ARM_DEMUX_ID_SHIFT	8
+#define KVM_REG_ARM_DEMUX_ID_CCSIDR	(0x00 << KVM_REG_ARM_DEMUX_ID_SHIFT)
+#define KVM_REG_ARM_DEMUX_VAL_MASK	0x00000000000000FF
+#define KVM_REG_ARM_DEMUX_VAL_SHIFT	0
+
+/* AArch64 system registers */
+#define KVM_REG_ARM64_SYSREG		(0x0013 << KVM_REG_ARM_COPROC_SHIFT)
+#define KVM_REG_ARM64_SYSREG_OP0_MASK	0x000000000000c000
+#define KVM_REG_ARM64_SYSREG_OP0_SHIFT	14
+#define KVM_REG_ARM64_SYSREG_OP1_MASK	0x0000000000003800
+#define KVM_REG_ARM64_SYSREG_OP1_SHIFT	11
+#define KVM_REG_ARM64_SYSREG_CRN_MASK	0x0000000000000780
+#define KVM_REG_ARM64_SYSREG_CRN_SHIFT	7
+#define KVM_REG_ARM64_SYSREG_CRM_MASK	0x0000000000000078
+#define KVM_REG_ARM64_SYSREG_CRM_SHIFT	3
+#define KVM_REG_ARM64_SYSREG_OP2_MASK	0x0000000000000007
+#define KVM_REG_ARM64_SYSREG_OP2_SHIFT	0
+
+#define ARM64_SYS_REG_SHIFT_MASK(x,n) \
+	(((x) << KVM_REG_ARM64_SYSREG_ ## n ## _SHIFT) & \
+	KVM_REG_ARM64_SYSREG_ ## n ## _MASK)
+
+#define __ARM64_SYS_REG(op0,op1,crn,crm,op2) \
+	(KVM_REG_ARM64 | KVM_REG_ARM64_SYSREG | \
+	ARM64_SYS_REG_SHIFT_MASK(op0, OP0) | \
+	ARM64_SYS_REG_SHIFT_MASK(op1, OP1) | \
+	ARM64_SYS_REG_SHIFT_MASK(crn, CRN) | \
+	ARM64_SYS_REG_SHIFT_MASK(crm, CRM) | \
+	ARM64_SYS_REG_SHIFT_MASK(op2, OP2))
+
+#define ARM64_SYS_REG(...) (__ARM64_SYS_REG(__VA_ARGS__) | KVM_REG_SIZE_U64)
+
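+/*
+ * Worked example (editor's illustration): CNTV_CTL_EL0 is encoded as
+ * op0=3, op1=3, CRn=c14, CRm=c3, op2=1, i.e. exactly the
+ * ARM64_SYS_REG(3, 3, 14, 3, 1) used for KVM_REG_ARM_TIMER_CTL below.
+ */
+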
+/* Physical Timer EL0 Registers */
+#define KVM_REG_ARM_PTIMER_CTL		ARM64_SYS_REG(3, 3, 14, 2, 1)
+#define KVM_REG_ARM_PTIMER_CVAL		ARM64_SYS_REG(3, 3, 14, 2, 2)
+#define KVM_REG_ARM_PTIMER_CNT		ARM64_SYS_REG(3, 3, 14, 0, 1)
+
+/* EL0 Virtual Timer Registers */
+#define KVM_REG_ARM_TIMER_CTL		ARM64_SYS_REG(3, 3, 14, 3, 1)
+#define KVM_REG_ARM_TIMER_CNT		ARM64_SYS_REG(3, 3, 14, 3, 2)
+#define KVM_REG_ARM_TIMER_CVAL		ARM64_SYS_REG(3, 3, 14, 0, 2)
+
+/* KVM-as-firmware specific pseudo-registers */
+#define KVM_REG_ARM_FW			(0x0014 << KVM_REG_ARM_COPROC_SHIFT)
+#define KVM_REG_ARM_FW_REG(r)		(KVM_REG_ARM64 | KVM_REG_SIZE_U64 | \
+					 KVM_REG_ARM_FW | ((r) & 0xffff))
+#define KVM_REG_ARM_PSCI_VERSION	KVM_REG_ARM_FW_REG(0)
+#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1	KVM_REG_ARM_FW_REG(1)
+#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL		0
+#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_AVAIL		1
+#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_REQUIRED	2
+#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2	KVM_REG_ARM_FW_REG(2)
+#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL		0
+#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_UNKNOWN		1
+#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL		2
+#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED	3
+#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED	(1U << 4)
+
+/* SVE registers */
+#define KVM_REG_ARM64_SVE		(0x15 << KVM_REG_ARM_COPROC_SHIFT)
+
+/* Z- and P-regs occupy blocks at the following offsets within this range: */
+#define KVM_REG_ARM64_SVE_ZREG_BASE	0
+#define KVM_REG_ARM64_SVE_PREG_BASE	0x400
+#define KVM_REG_ARM64_SVE_FFR_BASE	0x600
+
+#define KVM_ARM64_SVE_NUM_ZREGS		__SVE_NUM_ZREGS
+#define KVM_ARM64_SVE_NUM_PREGS		__SVE_NUM_PREGS
+
+#define KVM_ARM64_SVE_MAX_SLICES	32
+
+#define KVM_REG_ARM64_SVE_ZREG(n, i)					\
+	(KVM_REG_ARM64 | KVM_REG_ARM64_SVE | KVM_REG_ARM64_SVE_ZREG_BASE | \
+	 KVM_REG_SIZE_U2048 |						\
+	 (((n) & (KVM_ARM64_SVE_NUM_ZREGS - 1)) << 5) |			\
+	 ((i) & (KVM_ARM64_SVE_MAX_SLICES - 1)))
+
+#define KVM_REG_ARM64_SVE_PREG(n, i)					\
+	(KVM_REG_ARM64 | KVM_REG_ARM64_SVE | KVM_REG_ARM64_SVE_PREG_BASE | \
+	 KVM_REG_SIZE_U256 |						\
+	 (((n) & (KVM_ARM64_SVE_NUM_PREGS - 1)) << 5) |			\
+	 ((i) & (KVM_ARM64_SVE_MAX_SLICES - 1)))
+
+#define KVM_REG_ARM64_SVE_FFR(i)					\
+	(KVM_REG_ARM64 | KVM_REG_ARM64_SVE | KVM_REG_ARM64_SVE_FFR_BASE | \
+	 KVM_REG_SIZE_U256 |						\
+	 ((i) & (KVM_ARM64_SVE_MAX_SLICES - 1)))
+
+/*
+ * Register values for KVM_REG_ARM64_SVE_ZREG(), KVM_REG_ARM64_SVE_PREG() and
+ * KVM_REG_ARM64_SVE_FFR() are represented in memory in an endianness-
+ * invariant layout which differs from the layout used for the FPSIMD
+ * V-registers on big-endian systems: see sigcontext.h for more explanation.
+ */
+
+#define KVM_ARM64_SVE_VQ_MIN __SVE_VQ_MIN
+#define KVM_ARM64_SVE_VQ_MAX __SVE_VQ_MAX
+
+/* Vector lengths pseudo-register: */
+#define KVM_REG_ARM64_SVE_VLS		(KVM_REG_ARM64 | KVM_REG_ARM64_SVE | \
+					 KVM_REG_SIZE_U512 | 0xffff)
+#define KVM_ARM64_SVE_VLS_WORDS	\
+	((KVM_ARM64_SVE_VQ_MAX - KVM_ARM64_SVE_VQ_MIN) / 64 + 1)
+
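+/*
+ * Editor's note: with __SVE_VQ_MIN == 1 and __SVE_VQ_MAX == 512, this
+ * works out to (512 - 1) / 64 + 1 == 8 64-bit words, matching the
+ * KVM_REG_SIZE_U512 size of the pseudo-register above.
+ */
+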
+/* Device Control API: ARM VGIC */
+#define KVM_DEV_ARM_VGIC_GRP_ADDR	0
+#define KVM_DEV_ARM_VGIC_GRP_DIST_REGS	1
+#define KVM_DEV_ARM_VGIC_GRP_CPU_REGS	2
+#define   KVM_DEV_ARM_VGIC_CPUID_SHIFT	32
+#define   KVM_DEV_ARM_VGIC_CPUID_MASK	(0xffULL << KVM_DEV_ARM_VGIC_CPUID_SHIFT)
+#define   KVM_DEV_ARM_VGIC_V3_MPIDR_SHIFT 32
+#define   KVM_DEV_ARM_VGIC_V3_MPIDR_MASK \
+			(0xffffffffULL << KVM_DEV_ARM_VGIC_V3_MPIDR_SHIFT)
+#define   KVM_DEV_ARM_VGIC_OFFSET_SHIFT	0
+#define   KVM_DEV_ARM_VGIC_OFFSET_MASK	(0xffffffffULL << KVM_DEV_ARM_VGIC_OFFSET_SHIFT)
+#define   KVM_DEV_ARM_VGIC_SYSREG_INSTR_MASK (0xffff)
+#define KVM_DEV_ARM_VGIC_GRP_NR_IRQS	3
+#define KVM_DEV_ARM_VGIC_GRP_CTRL	4
+#define KVM_DEV_ARM_VGIC_GRP_REDIST_REGS 5
+#define KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS 6
+#define KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO  7
+#define KVM_DEV_ARM_VGIC_GRP_ITS_REGS 8
+#define KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT	10
+#define KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_MASK \
+			(0x3fffffULL << KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT)
+#define KVM_DEV_ARM_VGIC_LINE_LEVEL_INTID_MASK	0x3ff
+#define VGIC_LEVEL_INFO_LINE_LEVEL	0
+
+#define   KVM_DEV_ARM_VGIC_CTRL_INIT		0
+#define   KVM_DEV_ARM_ITS_SAVE_TABLES		1
+#define   KVM_DEV_ARM_ITS_RESTORE_TABLES	2
+#define   KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES	3
+#define   KVM_DEV_ARM_ITS_CTRL_RESET		4
+
+/* Device Control API on vcpu fd */
+#define KVM_ARM_VCPU_PMU_V3_CTRL	0
+#define   KVM_ARM_VCPU_PMU_V3_IRQ	0
+#define   KVM_ARM_VCPU_PMU_V3_INIT	1
+#define KVM_ARM_VCPU_TIMER_CTRL		1
+#define   KVM_ARM_VCPU_TIMER_IRQ_VTIMER		0
+#define   KVM_ARM_VCPU_TIMER_IRQ_PTIMER		1
+
+/* KVM_IRQ_LINE irq field index values */
+#define KVM_ARM_IRQ_TYPE_SHIFT		24
+#define KVM_ARM_IRQ_TYPE_MASK		0xff
+#define KVM_ARM_IRQ_VCPU_SHIFT		16
+#define KVM_ARM_IRQ_VCPU_MASK		0xff
+#define KVM_ARM_IRQ_NUM_SHIFT		0
+#define KVM_ARM_IRQ_NUM_MASK		0xffff
+
+/* irq_type field */
+#define KVM_ARM_IRQ_TYPE_CPU		0
+#define KVM_ARM_IRQ_TYPE_SPI		1
+#define KVM_ARM_IRQ_TYPE_PPI		2
+
+/* out-of-kernel GIC cpu interrupt injection irq_number field */
+#define KVM_ARM_IRQ_CPU_IRQ		0
+#define KVM_ARM_IRQ_CPU_FIQ		1
+
+/*
+ * This used to hold the highest supported SPI, but it is now obsolete
+ * and only here to provide source code level compatibility with older
+ * userland. The highest SPI number can be set via KVM_DEV_ARM_VGIC_GRP_NR_IRQS.
+ */
+#ifndef __KERNEL__
+#define KVM_ARM_IRQ_GIC_MAX		127
+#endif
+
+/* One single KVM irqchip, i.e. the VGIC */
+#define KVM_NR_IRQCHIPS          1
+
+/* PSCI interface */
+#define KVM_PSCI_FN_BASE		0x95c1ba5e
+#define KVM_PSCI_FN(n)			(KVM_PSCI_FN_BASE + (n))
+
+#define KVM_PSCI_FN_CPU_SUSPEND		KVM_PSCI_FN(0)
+#define KVM_PSCI_FN_CPU_OFF		KVM_PSCI_FN(1)
+#define KVM_PSCI_FN_CPU_ON		KVM_PSCI_FN(2)
+#define KVM_PSCI_FN_MIGRATE		KVM_PSCI_FN(3)
+
+#define KVM_PSCI_RET_SUCCESS		PSCI_RET_SUCCESS
+#define KVM_PSCI_RET_NI			PSCI_RET_NOT_SUPPORTED
+#define KVM_PSCI_RET_INVAL		PSCI_RET_INVALID_PARAMS
+#define KVM_PSCI_RET_DENIED		PSCI_RET_DENIED
+
+#endif
+
+#endif /* __ARM_KVM_H__ */
diff --git a/kvmtool/arm/aarch64/include/asm/sve_context.h b/kvmtool/arm/aarch64/include/asm/sve_context.h
new file mode 100644
index 0000000..754ab75
--- /dev/null
+++ b/kvmtool/arm/aarch64/include/asm/sve_context.h
@@ -0,0 +1,53 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/* Copyright (C) 2017-2018 ARM Limited */
+
+/*
+ * For use by other UAPI headers only.
+ * Do not make direct use of this header or its definitions.
+ */
+
+#ifndef _UAPI__ASM_SVE_CONTEXT_H
+#define _UAPI__ASM_SVE_CONTEXT_H
+
+#include <linux/types.h>
+
+#define __SVE_VQ_BYTES		16	/* number of bytes per quadword */
+
+#define __SVE_VQ_MIN		1
+#define __SVE_VQ_MAX		512
+
+#define __SVE_VL_MIN		(__SVE_VQ_MIN * __SVE_VQ_BYTES)
+#define __SVE_VL_MAX		(__SVE_VQ_MAX * __SVE_VQ_BYTES)
+
+#define __SVE_NUM_ZREGS		32
+#define __SVE_NUM_PREGS		16
+
+#define __sve_vl_valid(vl)			\
+	((vl) % __SVE_VQ_BYTES == 0 &&		\
+	 (vl) >= __SVE_VL_MIN &&		\
+	 (vl) <= __SVE_VL_MAX)
+
+#define __sve_vq_from_vl(vl)	((vl) / __SVE_VQ_BYTES)
+#define __sve_vl_from_vq(vq)	((vq) * __SVE_VQ_BYTES)
+
+#define __SVE_ZREG_SIZE(vq)	((__u32)(vq) * __SVE_VQ_BYTES)
+#define __SVE_PREG_SIZE(vq)	((__u32)(vq) * (__SVE_VQ_BYTES / 8))
+#define __SVE_FFR_SIZE(vq)	__SVE_PREG_SIZE(vq)
+
+#define __SVE_ZREGS_OFFSET	0
+#define __SVE_ZREG_OFFSET(vq, n) \
+	(__SVE_ZREGS_OFFSET + __SVE_ZREG_SIZE(vq) * (n))
+#define __SVE_ZREGS_SIZE(vq) \
+	(__SVE_ZREG_OFFSET(vq, __SVE_NUM_ZREGS) - __SVE_ZREGS_OFFSET)
+
+#define __SVE_PREGS_OFFSET(vq) \
+	(__SVE_ZREGS_OFFSET + __SVE_ZREGS_SIZE(vq))
+#define __SVE_PREG_OFFSET(vq, n) \
+	(__SVE_PREGS_OFFSET(vq) + __SVE_PREG_SIZE(vq) * (n))
+#define __SVE_PREGS_SIZE(vq) \
+	(__SVE_PREG_OFFSET(vq, __SVE_NUM_PREGS) - __SVE_PREGS_OFFSET(vq))
+
+#define __SVE_FFR_OFFSET(vq) \
+	(__SVE_PREGS_OFFSET(vq) + __SVE_PREGS_SIZE(vq))
+
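+/*
+ * Worked example (editor's illustration): for vq == 2 (256-bit vectors),
+ * __SVE_ZREG_SIZE(2) == 32 bytes, so the 32 Z-regs span offsets 0..1023;
+ * __SVE_PREG_SIZE(2) == 4 bytes, so the 16 P-regs occupy 1024..1087 and
+ * __SVE_FFR_OFFSET(2) == 1088.
+ */
+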
+#endif /* ! _UAPI__ASM_SVE_CONTEXT_H */
diff --git a/kvmtool/arm/aarch64/include/kvm/barrier.h b/kvmtool/arm/aarch64/include/kvm/barrier.h
new file mode 100644
index 0000000..97ab252
--- /dev/null
+++ b/kvmtool/arm/aarch64/include/kvm/barrier.h
@@ -0,0 +1,8 @@
+#ifndef KVM__KVM_BARRIER_H
+#define KVM__KVM_BARRIER_H
+
+#define mb()	asm volatile ("dmb ish"		: : : "memory")
+#define rmb()	asm volatile ("dmb ishld"	: : : "memory")
+#define wmb()	asm volatile ("dmb ishst"	: : : "memory")
+
+#endif /* KVM__KVM_BARRIER_H */
diff --git a/kvmtool/arm/aarch64/include/kvm/fdt-arch.h b/kvmtool/arm/aarch64/include/kvm/fdt-arch.h
new file mode 100644
index 0000000..e448bf1
--- /dev/null
+++ b/kvmtool/arm/aarch64/include/kvm/fdt-arch.h
@@ -0,0 +1,6 @@
+#ifndef KVM__KVM_FDT_H
+#define KVM__KVM_FDT_H
+
+#include "arm-common/fdt-arch.h"
+
+#endif /* KVM__KVM_FDT_H */
diff --git a/kvmtool/arm/aarch64/include/kvm/kvm-arch.h b/kvmtool/arm/aarch64/include/kvm/kvm-arch.h
new file mode 100644
index 0000000..55ef8ed
--- /dev/null
+++ b/kvmtool/arm/aarch64/include/kvm/kvm-arch.h
@@ -0,0 +1,13 @@
+#ifndef KVM__KVM_ARCH_H
+#define KVM__KVM_ARCH_H
+
+struct kvm;
+unsigned long long kvm__arch_get_kern_offset(struct kvm *kvm, int fd);
+
+#define ARM_MAX_MEMORY(kvm)	((kvm)->cfg.arch.aarch32_guest	?	\
+				ARM_LOMAP_MAX_MEMORY		:	\
+				ARM_HIMAP_MAX_MEMORY)
+
+#include "arm-common/kvm-arch.h"
+
+#endif /* KVM__KVM_ARCH_H */
diff --git a/kvmtool/arm/aarch64/include/kvm/kvm-config-arch.h b/kvmtool/arm/aarch64/include/kvm/kvm-config-arch.h
new file mode 100644
index 0000000..04be43d
--- /dev/null
+++ b/kvmtool/arm/aarch64/include/kvm/kvm-config-arch.h
@@ -0,0 +1,15 @@
+#ifndef KVM__KVM_CONFIG_ARCH_H
+#define KVM__KVM_CONFIG_ARCH_H
+
+#define ARM_OPT_ARCH_RUN(cfg)						\
+	OPT_BOOLEAN('\0', "aarch32", &(cfg)->aarch32_guest,		\
+			"Run AArch32 guest"),				\
+	OPT_BOOLEAN('\0', "pmu", &(cfg)->has_pmuv3,			\
+			"Create PMUv3 device"),				\
+	OPT_U64('\0', "kaslr-seed", &(cfg)->kaslr_seed,			\
+			"Specify random seed for Kernel Address Space "	\
+			"Layout Randomization (KASLR)"),
+
+#include "arm-common/kvm-config-arch.h"
+
+#endif /* KVM__KVM_CONFIG_ARCH_H */
diff --git a/kvmtool/arm/aarch64/include/kvm/kvm-cpu-arch.h b/kvmtool/arm/aarch64/include/kvm/kvm-cpu-arch.h
new file mode 100644
index 0000000..8dfb82e
--- /dev/null
+++ b/kvmtool/arm/aarch64/include/kvm/kvm-cpu-arch.h
@@ -0,0 +1,23 @@
+#ifndef KVM__KVM_CPU_ARCH_H
+#define KVM__KVM_CPU_ARCH_H
+
+#include "kvm/kvm.h"
+
+#include "arm-common/kvm-cpu-arch.h"
+
+#define ARM_VCPU_FEATURE_FLAGS(kvm, cpuid)	{				\
+	[0] = ((!!(cpuid) << KVM_ARM_VCPU_POWER_OFF) |				\
+	       (!!(kvm)->cfg.arch.aarch32_guest << KVM_ARM_VCPU_EL1_32BIT) |	\
+	       (!!(kvm)->cfg.arch.has_pmuv3 << KVM_ARM_VCPU_PMU_V3))		\
+}
+
+#define ARM_MPIDR_HWID_BITMASK	0xFF00FFFFFFUL
+#define ARM_CPU_ID		3, 0, 0, 0
+#define ARM_CPU_ID_MPIDR	5
+#define ARM_CPU_CTRL		3, 0, 1, 0
+#define ARM_CPU_CTRL_SCTLR_EL1	0
+
+void kvm_cpu__select_features(struct kvm *kvm, struct kvm_vcpu_init *init);
+int kvm_cpu__configure_features(struct kvm_cpu *vcpu);
+
+#endif /* KVM__KVM_CPU_ARCH_H */
diff --git a/kvmtool/arm/aarch64/kvm-cpu.c b/kvmtool/arm/aarch64/kvm-cpu.c
new file mode 100644
index 0000000..9f3e858
--- /dev/null
+++ b/kvmtool/arm/aarch64/kvm-cpu.c
@@ -0,0 +1,256 @@
+#include "kvm/kvm-cpu.h"
+#include "kvm/kvm.h"
+#include "kvm/virtio.h"
+
+#include <asm/ptrace.h>
+
+#define COMPAT_PSR_F_BIT	0x00000040
+#define COMPAT_PSR_I_BIT	0x00000080
+#define COMPAT_PSR_E_BIT	0x00000200
+#define COMPAT_PSR_MODE_SVC	0x00000013
+
+#define SCTLR_EL1_E0E_MASK	(1 << 24)
+#define SCTLR_EL1_EE_MASK	(1 << 25)
+
+static __u64 __core_reg_id(__u64 offset)
+{
+	__u64 id = KVM_REG_ARM64 | KVM_REG_ARM_CORE | offset;
+
+	if (offset < KVM_REG_ARM_CORE_REG(fp_regs))
+		id |= KVM_REG_SIZE_U64;
+	else if (offset < KVM_REG_ARM_CORE_REG(fp_regs.fpsr))
+		id |= KVM_REG_SIZE_U128;
+	else
+		id |= KVM_REG_SIZE_U32;
+
+	return id;
+}
+
+#define ARM64_CORE_REG(x) __core_reg_id(KVM_REG_ARM_CORE_REG(x))
+
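+/*
+ * Editor's note: given struct kvm_regs, __core_reg_id() tags e.g.
+ * regs.regs[0] (before fp_regs) as KVM_REG_SIZE_U64, the 128-bit
+ * fp_regs.vregs[] as KVM_REG_SIZE_U128, and fp_regs.fpsr/fpcr as
+ * KVM_REG_SIZE_U32.
+ */
+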
+unsigned long kvm_cpu__get_vcpu_mpidr(struct kvm_cpu *vcpu)
+{
+	struct kvm_one_reg reg;
+	u64 mpidr;
+
+	reg.id = ARM64_SYS_REG(ARM_CPU_ID, ARM_CPU_ID_MPIDR);
+	reg.addr = (u64)&mpidr;
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0)
+		die("KVM_GET_ONE_REG failed (get_mpidr vcpu%ld)", vcpu->cpu_id);
+
+	return mpidr;
+}
+
+static void reset_vcpu_aarch32(struct kvm_cpu *vcpu)
+{
+	struct kvm *kvm = vcpu->kvm;
+	struct kvm_one_reg reg;
+	u64 data;
+
+	reg.addr = (u64)&data;
+
+	/* pstate = all interrupts masked */
+	data	= COMPAT_PSR_I_BIT | COMPAT_PSR_F_BIT | COMPAT_PSR_MODE_SVC;
+	reg.id	= ARM64_CORE_REG(regs.pstate);
+	if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0)
+		die_perror("KVM_SET_ONE_REG failed (spsr[EL1])");
+
+	/* Secondary cores are stopped awaiting PSCI wakeup */
+	if (vcpu->cpu_id != 0)
+		return;
+
+	/* r0 = 0 */
+	data	= 0;
+	reg.id	= ARM64_CORE_REG(regs.regs[0]);
+	if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0)
+		die_perror("KVM_SET_ONE_REG failed (r0)");
+
+	/* r1 = machine type (-1) */
+	data	= -1;
+	reg.id	= ARM64_CORE_REG(regs.regs[1]);
+	if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0)
+		die_perror("KVM_SET_ONE_REG failed (r1)");
+
+	/* r2 = physical address of the device tree blob */
+	data	= kvm->arch.dtb_guest_start;
+	reg.id	= ARM64_CORE_REG(regs.regs[2]);
+	if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0)
+		die_perror("KVM_SET_ONE_REG failed (r2)");
+
+	/* pc = start of kernel image */
+	data	= kvm->arch.kern_guest_start;
+	reg.id	= ARM64_CORE_REG(regs.pc);
+	if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0)
+		die_perror("KVM_SET_ONE_REG failed (pc)");
+}
+
+static void reset_vcpu_aarch64(struct kvm_cpu *vcpu)
+{
+	struct kvm *kvm = vcpu->kvm;
+	struct kvm_one_reg reg;
+	u64 data;
+
+	reg.addr = (u64)&data;
+
+	/* pstate = all interrupts masked */
+	data	= PSR_D_BIT | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT | PSR_MODE_EL1h;
+	reg.id	= ARM64_CORE_REG(regs.pstate);
+	if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0)
+		die_perror("KVM_SET_ONE_REG failed (spsr[EL1])");
+
+	/* x1...x3 = 0 */
+	data	= 0;
+	reg.id	= ARM64_CORE_REG(regs.regs[1]);
+	if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0)
+		die_perror("KVM_SET_ONE_REG failed (x1)");
+
+	reg.id	= ARM64_CORE_REG(regs.regs[2]);
+	if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0)
+		die_perror("KVM_SET_ONE_REG failed (x2)");
+
+	reg.id	= ARM64_CORE_REG(regs.regs[3]);
+	if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0)
+		die_perror("KVM_SET_ONE_REG failed (x3)");
+
+	/* Secondary cores are stopped awaiting PSCI wakeup */
+	if (vcpu->cpu_id == 0) {
+		/* x0 = physical address of the device tree blob */
+		data	= kvm->arch.dtb_guest_start;
+		reg.id	= ARM64_CORE_REG(regs.regs[0]);
+		if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0)
+			die_perror("KVM_SET_ONE_REG failed (x0)");
+
+		/* pc = start of kernel image */
+		data	= kvm->arch.kern_guest_start;
+		reg.id	= ARM64_CORE_REG(regs.pc);
+		if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0)
+			die_perror("KVM_SET_ONE_REG failed (pc)");
+	}
+}
+
+void kvm_cpu__select_features(struct kvm *kvm, struct kvm_vcpu_init *init)
+{
+	/* Enable pointer authentication if available */
+	if (kvm__supports_extension(kvm, KVM_CAP_ARM_PTRAUTH_ADDRESS) &&
+	    kvm__supports_extension(kvm, KVM_CAP_ARM_PTRAUTH_GENERIC)) {
+		init->features[0] |= 1UL << KVM_ARM_VCPU_PTRAUTH_ADDRESS;
+		init->features[0] |= 1UL << KVM_ARM_VCPU_PTRAUTH_GENERIC;
+	}
+
+	/* Enable SVE if available */
+	if (kvm__supports_extension(kvm, KVM_CAP_ARM_SVE))
+		init->features[0] |= 1UL << KVM_ARM_VCPU_SVE;
+}
+
+int kvm_cpu__configure_features(struct kvm_cpu *vcpu)
+{
+	if (kvm__supports_extension(vcpu->kvm, KVM_CAP_ARM_SVE)) {
+		int feature = KVM_ARM_VCPU_SVE;
+
+		if (ioctl(vcpu->vcpu_fd, KVM_ARM_VCPU_FINALIZE, &feature)) {
+			pr_err("KVM_ARM_VCPU_FINALIZE: %s", strerror(errno));
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+void kvm_cpu__reset_vcpu(struct kvm_cpu *vcpu)
+{
+	if (vcpu->kvm->cfg.arch.aarch32_guest)
+		return reset_vcpu_aarch32(vcpu);
+	else
+		return reset_vcpu_aarch64(vcpu);
+}
+
+int kvm_cpu__get_endianness(struct kvm_cpu *vcpu)
+{
+	struct kvm_one_reg reg;
+	u64 psr;
+	u64 sctlr;
+
+	/*
+	 * Quoting the definition given by Peter Maydell:
+	 *
+	 * "Endianness of the CPU which does the virtio reset at the
+	 * point when it does that reset"
+	 *
+	 * We first check for an AArch32 guest: its endianness can
+	 * change when using SETEND, which affects the CPSR.E bit.
+	 *
+	 * If we're AArch64, use SCTLR_EL1.E0E if access comes from
+	 * EL0, and SCTLR_EL1.EE if access comes from EL1.
+	 */
+	reg.id = ARM64_CORE_REG(regs.pstate);
+	reg.addr = (u64)&psr;
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0)
+		die("KVM_GET_ONE_REG failed (spsr[EL1])");
+
+	if (psr & PSR_MODE32_BIT)
+		return (psr & COMPAT_PSR_E_BIT) ? VIRTIO_ENDIAN_BE : VIRTIO_ENDIAN_LE;
+
+	reg.id = ARM64_SYS_REG(ARM_CPU_CTRL, ARM_CPU_CTRL_SCTLR_EL1);
+	reg.addr = (u64)&sctlr;
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0)
+		die("KVM_GET_ONE_REG failed (SCTLR_EL1)");
+
+	if ((psr & PSR_MODE_MASK) == PSR_MODE_EL0t)
+		sctlr &= SCTLR_EL1_E0E_MASK;
+	else
+		sctlr &= SCTLR_EL1_EE_MASK;
+	return sctlr ? VIRTIO_ENDIAN_BE : VIRTIO_ENDIAN_LE;
+}
+
+void kvm_cpu__show_code(struct kvm_cpu *vcpu)
+{
+	struct kvm_one_reg reg;
+	unsigned long data;
+	int debug_fd = kvm_cpu__get_debug_fd();
+
+	reg.addr = (u64)&data;
+
+	dprintf(debug_fd, "\n*pc:\n");
+	reg.id = ARM64_CORE_REG(regs.pc);
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0)
+		die("KVM_GET_ONE_REG failed (show_code @ PC)");
+
+	kvm__dump_mem(vcpu->kvm, data, 32, debug_fd);
+
+	dprintf(debug_fd, "\n*lr:\n");
+	reg.id = ARM64_CORE_REG(regs.regs[30]);
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0)
+		die("KVM_GET_ONE_REG failed (show_code @ LR)");
+
+	kvm__dump_mem(vcpu->kvm, data, 32, debug_fd);
+}
+
+void kvm_cpu__show_registers(struct kvm_cpu *vcpu)
+{
+	struct kvm_one_reg reg;
+	unsigned long data;
+	int debug_fd = kvm_cpu__get_debug_fd();
+
+	reg.addr = (u64)&data;
+	dprintf(debug_fd, "\n Registers:\n");
+
+	reg.id		= ARM64_CORE_REG(regs.pc);
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0)
+		die("KVM_GET_ONE_REG failed (pc)");
+	dprintf(debug_fd, " PC:    0x%lx\n", data);
+
+	reg.id		= ARM64_CORE_REG(regs.pstate);
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0)
+		die("KVM_GET_ONE_REG failed (pstate)");
+	dprintf(debug_fd, " PSTATE:    0x%lx\n", data);
+
+	reg.id		= ARM64_CORE_REG(sp_el1);
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0)
+		die("KVM_GET_ONE_REG failed (sp_el1)");
+	dprintf(debug_fd, " SP_EL1:    0x%lx\n", data);
+
+	reg.id		= ARM64_CORE_REG(regs.regs[30]);
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0)
+		die("KVM_GET_ONE_REG failed (lr)");
+	dprintf(debug_fd, " LR:    0x%lx\n", data);
+}
diff --git a/kvmtool/arm/aarch64/kvm.c b/kvmtool/arm/aarch64/kvm.c
new file mode 100644
index 0000000..49e1dd3
--- /dev/null
+++ b/kvmtool/arm/aarch64/kvm.c
@@ -0,0 +1,48 @@
+#include "kvm/kvm.h"
+
+#include <asm/image.h>
+
+#include <linux/byteorder.h>
+
+/*
+ * Return the TEXT_OFFSET value that the guest kernel expects. Note
+ * that pre-3.17 kernels expose this value using the native endianness
+ * instead of Little-Endian. BE kernels of this vintage may fail to
+ * boot. See Documentation/arm64/booting.rst in your local kernel tree.
+ */
+unsigned long long kvm__arch_get_kern_offset(struct kvm *kvm, int fd)
+{
+	struct arm64_image_header header;
+	off_t cur_offset;
+	ssize_t size;
+	const char *warn_str;
+
+	/* The 32-bit kernel offset is a well-known value */
+	if (kvm->cfg.arch.aarch32_guest)
+		return 0x8000;
+
+	cur_offset = lseek(fd, 0, SEEK_CUR);
+	if (cur_offset == (off_t)-1 ||
+	    lseek(fd, 0, SEEK_SET) == (off_t)-1) {
+		warn_str = "Failed to seek in kernel image file";
+		goto fail;
+	}
+
+	size = xread(fd, &header, sizeof(header));
+	if (size < 0 || (size_t)size < sizeof(header))
+		die("Failed to read kernel image header");
+
+	lseek(fd, cur_offset, SEEK_SET);
+
+	if (memcmp(&header.magic, ARM64_IMAGE_MAGIC, sizeof(header.magic)))
+		pr_warning("Kernel image magic does not match");
+
+	if (le64_to_cpu(header.image_size))
+		return le64_to_cpu(header.text_offset);
+
+	warn_str = "Image size is 0";
+fail:
+	pr_warning("%s, assuming TEXT_OFFSET to be 0x80000", warn_str);
+	return 0x80000;
+}
diff --git a/kvmtool/arm/fdt.c b/kvmtool/arm/fdt.c
new file mode 100644
index 0000000..02091e9
--- /dev/null
+++ b/kvmtool/arm/fdt.c
@@ -0,0 +1,230 @@
+#include "kvm/devices.h"
+#include "kvm/fdt.h"
+#include "kvm/kvm.h"
+#include "kvm/kvm-cpu.h"
+#include "kvm/virtio-mmio.h"
+
+#include "arm-common/gic.h"
+#include "arm-common/pci.h"
+
+#include <stdbool.h>
+
+#include <linux/byteorder.h>
+#include <linux/kernel.h>
+#include <linux/sizes.h>
+#include <linux/psci.h>
+
+static void dump_fdt(const char *dtb_file, void *fdt)
+{
+	int count, fd;
+
+	fd = open(dtb_file, O_CREAT | O_TRUNC | O_RDWR, 0666);
+	if (fd < 0)
+		die("Failed to write dtb to %s", dtb_file);
+
+	count = write(fd, fdt, FDT_MAX_SIZE);
+	if (count < 0)
+		die_perror("Failed to dump dtb");
+
+	pr_debug("Wrote %d bytes to dtb %s", count, dtb_file);
+	close(fd);
+}
+
+#define CPU_NAME_MAX_LEN 15
+static void generate_cpu_nodes(void *fdt, struct kvm *kvm)
+{
+	int cpu;
+
+	_FDT(fdt_begin_node(fdt, "cpus"));
+	_FDT(fdt_property_cell(fdt, "#address-cells", 0x1));
+	_FDT(fdt_property_cell(fdt, "#size-cells", 0x0));
+
+	for (cpu = 0; cpu < kvm->nrcpus; ++cpu) {
+		char cpu_name[CPU_NAME_MAX_LEN];
+		struct kvm_cpu *vcpu = kvm->cpus[cpu];
+		unsigned long mpidr = kvm_cpu__get_vcpu_mpidr(vcpu);
+
+		mpidr &= ARM_MPIDR_HWID_BITMASK;
+		snprintf(cpu_name, CPU_NAME_MAX_LEN, "cpu@%lx", mpidr);
+
+		_FDT(fdt_begin_node(fdt, cpu_name));
+		_FDT(fdt_property_string(fdt, "device_type", "cpu"));
+		_FDT(fdt_property_string(fdt, "compatible", vcpu->cpu_compatible));
+
+		if (kvm->nrcpus > 1)
+			_FDT(fdt_property_string(fdt, "enable-method", "psci"));
+
+		_FDT(fdt_property_cell(fdt, "reg", mpidr));
+		_FDT(fdt_end_node(fdt));
+	}
+
+	_FDT(fdt_end_node(fdt));
+}
+
+static void generate_irq_prop(void *fdt, u8 irq, enum irq_type irq_type)
+{
+	u32 irq_prop[] = {
+		cpu_to_fdt32(GIC_FDT_IRQ_TYPE_SPI),
+		cpu_to_fdt32(irq - GIC_SPI_IRQ_BASE),
+		cpu_to_fdt32(irq_type)
+	};
+
+	_FDT(fdt_property(fdt, "interrupts", irq_prop, sizeof(irq_prop)));
+}
+
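+/*
+ * Editor's illustration (assuming GIC_SPI_IRQ_BASE is 32, the first SPI
+ * number): a device on irq 36 with a level-high trigger
+ * (IRQ_TYPE_LEVEL_HIGH == 4) gets a property equivalent to the DTS
+ * fragment:
+ *
+ *	interrupts = <GIC_FDT_IRQ_TYPE_SPI 4 4>;
+ */
+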
+struct psci_fns {
+	u32 cpu_suspend;
+	u32 cpu_off;
+	u32 cpu_on;
+	u32 migrate;
+};
+
+static struct psci_fns psci_0_1_fns = {
+	.cpu_suspend = KVM_PSCI_FN_CPU_SUSPEND,
+	.cpu_off = KVM_PSCI_FN_CPU_OFF,
+	.cpu_on = KVM_PSCI_FN_CPU_ON,
+	.migrate = KVM_PSCI_FN_MIGRATE,
+};
+
+static struct psci_fns psci_0_2_aarch32_fns = {
+	.cpu_suspend = PSCI_0_2_FN_CPU_SUSPEND,
+	.cpu_off = PSCI_0_2_FN_CPU_OFF,
+	.cpu_on = PSCI_0_2_FN_CPU_ON,
+	.migrate = PSCI_0_2_FN_MIGRATE,
+};
+
+static struct psci_fns psci_0_2_aarch64_fns = {
+	.cpu_suspend = PSCI_0_2_FN64_CPU_SUSPEND,
+	.cpu_off = PSCI_0_2_FN_CPU_OFF,
+	.cpu_on = PSCI_0_2_FN64_CPU_ON,
+	.migrate = PSCI_0_2_FN64_MIGRATE,
+};
+
+static int setup_fdt(struct kvm *kvm)
+{
+	struct device_header *dev_hdr;
+	u8 staging_fdt[FDT_MAX_SIZE];
+	u64 mem_reg_prop[]	= {
+		cpu_to_fdt64(kvm->arch.memory_guest_start),
+		cpu_to_fdt64(kvm->ram_size),
+	};
+	struct psci_fns *fns;
+	void *fdt		= staging_fdt;
+	void *fdt_dest		= guest_flat_to_host(kvm,
+						     kvm->arch.dtb_guest_start);
+	void (*generate_mmio_fdt_nodes)(void *, struct device_header *,
+					void (*)(void *, u8, enum irq_type));
+	void (*generate_cpu_peripheral_fdt_nodes)(void *, struct kvm *)
+					= kvm->cpus[0]->generate_fdt_nodes;
+
+	/* Create new tree without a reserve map */
+	_FDT(fdt_create(fdt, FDT_MAX_SIZE));
+	_FDT(fdt_finish_reservemap(fdt));
+
+	/* Header */
+	_FDT(fdt_begin_node(fdt, ""));
+	_FDT(fdt_property_cell(fdt, "interrupt-parent", PHANDLE_GIC));
+	_FDT(fdt_property_string(fdt, "compatible", "linux,dummy-virt"));
+	_FDT(fdt_property_cell(fdt, "#address-cells", 0x2));
+	_FDT(fdt_property_cell(fdt, "#size-cells", 0x2));
+
+	/* /chosen */
+	_FDT(fdt_begin_node(fdt, "chosen"));
+
+	/* Pass on our amended command line to a Linux kernel only. */
+	if (kvm->cfg.firmware_filename) {
+		if (kvm->cfg.kernel_cmdline)
+			_FDT(fdt_property_string(fdt, "bootargs",
+						 kvm->cfg.kernel_cmdline));
+	} else
+		_FDT(fdt_property_string(fdt, "bootargs",
+					 kvm->cfg.real_cmdline));
+
+	_FDT(fdt_property_u64(fdt, "kaslr-seed", kvm->cfg.arch.kaslr_seed));
+	_FDT(fdt_property_string(fdt, "stdout-path", "serial0"));
+
+	/* Initrd */
+	if (kvm->arch.initrd_size != 0) {
+		u64 ird_st_prop = cpu_to_fdt64(kvm->arch.initrd_guest_start);
+		u64 ird_end_prop = cpu_to_fdt64(kvm->arch.initrd_guest_start +
+					       kvm->arch.initrd_size);
+
+		_FDT(fdt_property(fdt, "linux,initrd-start",
+				   &ird_st_prop, sizeof(ird_st_prop)));
+		_FDT(fdt_property(fdt, "linux,initrd-end",
+				   &ird_end_prop, sizeof(ird_end_prop)));
+	}
+	_FDT(fdt_end_node(fdt));
+
+	/* Memory */
+	_FDT(fdt_begin_node(fdt, "memory"));
+	_FDT(fdt_property_string(fdt, "device_type", "memory"));
+	_FDT(fdt_property(fdt, "reg", mem_reg_prop, sizeof(mem_reg_prop)));
+	_FDT(fdt_end_node(fdt));
+
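+	/*
+	 * Editor's illustration: with the root node's #address-cells and
+	 * #size-cells both set to 2 above, the node just generated is
+	 * equivalent to a DTS fragment like (addresses are examples only):
+	 *
+	 *	memory {
+	 *		device_type = "memory";
+	 *		reg = <0x0 0x80000000 0x0 0x40000000>;
+	 *	};
+	 */
+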
+	/* CPU and peripherals (interrupt controller, timers, etc) */
+	generate_cpu_nodes(fdt, kvm);
+	if (generate_cpu_peripheral_fdt_nodes)
+		generate_cpu_peripheral_fdt_nodes(fdt, kvm);
+
+	/* Virtio MMIO devices */
+	dev_hdr = device__first_dev(DEVICE_BUS_MMIO);
+	while (dev_hdr) {
+		generate_mmio_fdt_nodes = dev_hdr->data;
+		generate_mmio_fdt_nodes(fdt, dev_hdr, generate_irq_prop);
+		dev_hdr = device__next_dev(dev_hdr);
+	}
+
+	/* IOPORT devices (!) */
+	dev_hdr = device__first_dev(DEVICE_BUS_IOPORT);
+	while (dev_hdr) {
+		generate_mmio_fdt_nodes = dev_hdr->data;
+		generate_mmio_fdt_nodes(fdt, dev_hdr, generate_irq_prop);
+		dev_hdr = device__next_dev(dev_hdr);
+	}
+
+	/* PCI host controller */
+	pci__generate_fdt_nodes(fdt);
+
+	/* PSCI firmware */
+	_FDT(fdt_begin_node(fdt, "psci"));
+	if (kvm__supports_extension(kvm, KVM_CAP_ARM_PSCI_0_2)) {
+		const char compatible[] = "arm,psci-0.2\0arm,psci";
+		_FDT(fdt_property(fdt, "compatible",
+				  compatible, sizeof(compatible)));
+		if (kvm->cfg.arch.aarch32_guest)
+			fns = &psci_0_2_aarch32_fns;
+		else
+			fns = &psci_0_2_aarch64_fns;
+	} else {
+		_FDT(fdt_property_string(fdt, "compatible", "arm,psci"));
+		fns = &psci_0_1_fns;
+	}
+	_FDT(fdt_property_string(fdt, "method", "hvc"));
+	_FDT(fdt_property_cell(fdt, "cpu_suspend", fns->cpu_suspend));
+	_FDT(fdt_property_cell(fdt, "cpu_off", fns->cpu_off));
+	_FDT(fdt_property_cell(fdt, "cpu_on", fns->cpu_on));
+	_FDT(fdt_property_cell(fdt, "migrate", fns->migrate));
+	_FDT(fdt_end_node(fdt));
+
+	if (fdt_stdout_path) {
+		_FDT(fdt_begin_node(fdt, "aliases"));
+		_FDT(fdt_property_string(fdt, "serial0", fdt_stdout_path));
+		_FDT(fdt_end_node(fdt));
+
+		free(fdt_stdout_path);
+		fdt_stdout_path = NULL;
+	}
+
+	/* Finalise. */
+	_FDT(fdt_end_node(fdt));
+	_FDT(fdt_finish(fdt));
+
+	_FDT(fdt_open_into(fdt, fdt_dest, FDT_MAX_SIZE));
+	_FDT(fdt_pack(fdt_dest));
+
+	if (kvm->cfg.arch.dump_dtb_filename)
+		dump_fdt(kvm->cfg.arch.dump_dtb_filename, fdt_dest);
+	return 0;
+}
+late_init(setup_fdt);
diff --git a/kvmtool/arm/gic.c b/kvmtool/arm/gic.c
new file mode 100644
index 0000000..26be4b4
--- /dev/null
+++ b/kvmtool/arm/gic.c
@@ -0,0 +1,464 @@
+#include "kvm/fdt.h"
+#include "kvm/irq.h"
+#include "kvm/kvm.h"
+#include "kvm/virtio.h"
+
+#include "arm-common/gic.h"
+
+#include <linux/byteorder.h>
+#include <linux/kernel.h>
+#include <linux/kvm.h>
+#include <linux/sizes.h>
+
+#define IRQCHIP_GIC 0
+
+static int gic_fd = -1;
+static u64 gic_redists_base;
+static u64 gic_redists_size;
+static u64 gic_msi_base;
+static u64 gic_msi_size = 0;
+static bool vgic_is_init = false;
+
+struct kvm_irqfd_line {
+	unsigned int		gsi;
+	int			trigger_fd;
+	int			resample_fd;
+	struct list_head	list;
+};
+
+static LIST_HEAD(irqfd_lines);
+
+int irqchip_parser(const struct option *opt, const char *arg, int unset)
+{
+	enum irqchip_type *type = opt->value;
+
+	if (!strcmp(arg, "gicv2")) {
+		*type = IRQCHIP_GICV2;
+	} else if (!strcmp(arg, "gicv2m")) {
+		*type = IRQCHIP_GICV2M;
+	} else if (!strcmp(arg, "gicv3")) {
+		*type = IRQCHIP_GICV3;
+	} else if (!strcmp(arg, "gicv3-its")) {
+		*type = IRQCHIP_GICV3_ITS;
+	} else {
+		pr_err("irqchip: unknown type \"%s\"\n", arg);
+		return -1;
+	}
+
+	return 0;
+}
+
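+/*
+ * Editor's note: this parser backs the --irqchip command line option, so
+ * the GIC flavour can be selected at run time, e.g. (usage sketch):
+ *
+ *	lkvm run --irqchip=gicv3-its ...
+ */
+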
+static int irq__setup_irqfd_lines(struct kvm *kvm)
+{
+	int ret;
+	struct kvm_irqfd_line *line, *tmp;
+
+	list_for_each_entry_safe(line, tmp, &irqfd_lines, list) {
+		ret = irq__common_add_irqfd(kvm, line->gsi, line->trigger_fd,
+					    line->resample_fd);
+		if (ret < 0) {
+			pr_err("Failed to register IRQFD");
+			return ret;
+		}
+
+		list_del(&line->list);
+		free(line);
+	}
+
+	return 0;
+}
+
+static int irq__routing_init(struct kvm *kvm)
+{
+	int r;
+	int irqlines = ALIGN(irq__get_nr_allocated_lines(), 32);
+
+	/*
+	 * This describes the default routing that the kernel uses without
+	 * any routing explicitly set up via KVM_SET_GSI_ROUTING. So we
+	 * don't need to commit these settings right now. The first actual
+	 * user (MSI routing) will engage these mappings then.
+	 */
+	for (next_gsi = 0; next_gsi < irqlines; next_gsi++) {
+		r = irq__allocate_routing_entry();
+		if (r)
+			return r;
+
+		irq_routing->entries[irq_routing->nr++] =
+			(struct kvm_irq_routing_entry) {
+				.gsi = next_gsi,
+				.type = KVM_IRQ_ROUTING_IRQCHIP,
+				.u.irqchip.irqchip = IRQCHIP_GIC,
+				.u.irqchip.pin = next_gsi,
+		};
+	}
+
+	return 0;
+}
+
+static int gic__create_its_frame(struct kvm *kvm, u64 its_frame_addr)
+{
+	struct kvm_create_device its_device = {
+		.type = KVM_DEV_TYPE_ARM_VGIC_ITS,
+		.flags	= 0,
+	};
+	struct kvm_device_attr its_attr = {
+		.group	= KVM_DEV_ARM_VGIC_GRP_ADDR,
+		.attr	= KVM_VGIC_ITS_ADDR_TYPE,
+		.addr	= (u64)(unsigned long)&its_frame_addr,
+	};
+	struct kvm_device_attr its_init_attr = {
+		.group	= KVM_DEV_ARM_VGIC_GRP_CTRL,
+		.attr	= KVM_DEV_ARM_VGIC_CTRL_INIT,
+	};
+	int err;
+
+	err = ioctl(kvm->vm_fd, KVM_CREATE_DEVICE, &its_device);
+	if (err) {
+		fprintf(stderr,
+			"GICv3 ITS requested, but kernel does not support it.\n");
+		fprintf(stderr, "Try --irqchip=gicv3 instead\n");
+		return err;
+	}
+
+	err = ioctl(its_device.fd, KVM_HAS_DEVICE_ATTR, &its_attr);
+	if (err) {
+		close(its_device.fd);
+		its_device.fd = -1;
+		return err;
+	}
+
+	err = ioctl(its_device.fd, KVM_SET_DEVICE_ATTR, &its_attr);
+	if (err)
+		return err;
+
+	return ioctl(its_device.fd, KVM_SET_DEVICE_ATTR, &its_init_attr);
+}
+
+static int gic__create_msi_frame(struct kvm *kvm, enum irqchip_type type,
+				 u64 msi_frame_addr)
+{
+	switch (type) {
+	case IRQCHIP_GICV2M:
+		return gic__create_gicv2m_frame(kvm, msi_frame_addr);
+	case IRQCHIP_GICV3_ITS:
+		return gic__create_its_frame(kvm, msi_frame_addr);
+	default:	/* No MSI frame needed */
+		return 0;
+	}
+}
+
+static int gic__create_device(struct kvm *kvm, enum irqchip_type type)
+{
+	int err;
+	u64 cpu_if_addr = ARM_GIC_CPUI_BASE;
+	u64 dist_addr = ARM_GIC_DIST_BASE;
+	struct kvm_create_device gic_device = {
+		.flags	= 0,
+	};
+	struct kvm_device_attr cpu_if_attr = {
+		.group	= KVM_DEV_ARM_VGIC_GRP_ADDR,
+		.attr	= KVM_VGIC_V2_ADDR_TYPE_CPU,
+		.addr	= (u64)(unsigned long)&cpu_if_addr,
+	};
+	struct kvm_device_attr dist_attr = {
+		.group	= KVM_DEV_ARM_VGIC_GRP_ADDR,
+		.addr	= (u64)(unsigned long)&dist_addr,
+	};
+	struct kvm_device_attr redist_attr = {
+		.group	= KVM_DEV_ARM_VGIC_GRP_ADDR,
+		.attr	= KVM_VGIC_V3_ADDR_TYPE_REDIST,
+		.addr	= (u64)(unsigned long)&gic_redists_base,
+	};
+
+	switch (type) {
+	case IRQCHIP_GICV2M:
+	case IRQCHIP_GICV2:
+		gic_device.type = KVM_DEV_TYPE_ARM_VGIC_V2;
+		dist_attr.attr  = KVM_VGIC_V2_ADDR_TYPE_DIST;
+		break;
+	case IRQCHIP_GICV3:
+	case IRQCHIP_GICV3_ITS:
+		gic_device.type = KVM_DEV_TYPE_ARM_VGIC_V3;
+		dist_attr.attr  = KVM_VGIC_V3_ADDR_TYPE_DIST;
+		break;
+	case IRQCHIP_AUTO:
+		return -ENODEV;
+	}
+
+	err = ioctl(kvm->vm_fd, KVM_CREATE_DEVICE, &gic_device);
+	if (err)
+		return err;
+
+	gic_fd = gic_device.fd;
+
+	switch (type) {
+	case IRQCHIP_GICV2M:
+	case IRQCHIP_GICV2:
+		err = ioctl(gic_fd, KVM_SET_DEVICE_ATTR, &cpu_if_attr);
+		break;
+	case IRQCHIP_GICV3_ITS:
+	case IRQCHIP_GICV3:
+		err = ioctl(gic_fd, KVM_SET_DEVICE_ATTR, &redist_attr);
+		break;
+	case IRQCHIP_AUTO:
+		return -ENODEV;
+	}
+	if (err)
+		goto out_err;
+
+	err = ioctl(gic_fd, KVM_SET_DEVICE_ATTR, &dist_attr);
+	if (err)
+		goto out_err;
+
+	err = gic__create_msi_frame(kvm, type, gic_msi_base);
+	if (err)
+		goto out_err;
+
+	return 0;
+
+out_err:
+	close(gic_fd);
+	gic_fd = -1;
+	return err;
+}
+
+static int gic__create_irqchip(struct kvm *kvm)
+{
+	int err;
+	struct kvm_arm_device_addr gic_addr[] = {
+		[0] = {
+			.id = KVM_VGIC_V2_ADDR_TYPE_DIST |
+			(KVM_ARM_DEVICE_VGIC_V2 << KVM_ARM_DEVICE_ID_SHIFT),
+			.addr = ARM_GIC_DIST_BASE,
+		},
+		[1] = {
+			.id = KVM_VGIC_V2_ADDR_TYPE_CPU |
+			(KVM_ARM_DEVICE_VGIC_V2 << KVM_ARM_DEVICE_ID_SHIFT),
+			.addr = ARM_GIC_CPUI_BASE,
+		}
+	};
+
+	err = ioctl(kvm->vm_fd, KVM_CREATE_IRQCHIP);
+	if (err)
+		return err;
+
+	err = ioctl(kvm->vm_fd, KVM_ARM_SET_DEVICE_ADDR, &gic_addr[0]);
+	if (err)
+		return err;
+
+	err = ioctl(kvm->vm_fd, KVM_ARM_SET_DEVICE_ADDR, &gic_addr[1]);
+	return err;
+}
+
+int gic__create(struct kvm *kvm, enum irqchip_type type)
+{
+	enum irqchip_type try;
+	int err;
+
+	switch (type) {
+	case IRQCHIP_AUTO:
+		for (try = IRQCHIP_GICV3_ITS; try >= IRQCHIP_GICV2; try--) {
+			err = gic__create(kvm, try);
+			if (!err)
+				break;
+		}
+		if (err)
+			return err;
+
+		kvm->cfg.arch.irqchip = try;
+		return 0;
+	case IRQCHIP_GICV2M:
+		gic_msi_size = KVM_VGIC_V2M_SIZE;
+		gic_msi_base = ARM_GIC_CPUI_BASE - gic_msi_size;
+		break;
+	case IRQCHIP_GICV2:
+		break;
+	case IRQCHIP_GICV3_ITS:
+		/* The 64K page with the doorbell is included. */
+		gic_msi_size = KVM_VGIC_V3_ITS_SIZE;
+		/* fall through */
+	case IRQCHIP_GICV3:
+		gic_redists_size = kvm->cfg.nrcpus * ARM_GIC_REDIST_SIZE;
+		gic_redists_base = ARM_GIC_DIST_BASE - gic_redists_size;
+		gic_msi_base = gic_redists_base - gic_msi_size;
+		break;
+	default:
+		return -ENODEV;
+	}
+
+	/* Try the new way first, and fall back to the legacy method otherwise */
+	err = gic__create_device(kvm, type);
+	if (err && type == IRQCHIP_GICV2)
+		err = gic__create_irqchip(kvm);
+
+	return err;
+}
+
+/*
+ * Sets the number of used interrupts and finalizes the GIC init explicitly.
+ */
+static int gic__init_gic(struct kvm *kvm)
+{
+	int ret;
+
+	int lines = irq__get_nr_allocated_lines();
+	u32 nr_irqs = ALIGN(lines, 32) + GIC_SPI_IRQ_BASE;
+	struct kvm_device_attr nr_irqs_attr = {
+		.group	= KVM_DEV_ARM_VGIC_GRP_NR_IRQS,
+		.addr	= (u64)(unsigned long)&nr_irqs,
+	};
+	struct kvm_device_attr vgic_init_attr = {
+		.group	= KVM_DEV_ARM_VGIC_GRP_CTRL,
+		.attr	= KVM_DEV_ARM_VGIC_CTRL_INIT,
+	};
+
+	/*
+	 * If we didn't use the KVM_CREATE_DEVICE method, KVM will
+	 * give us some default number of interrupts. The GIC initialization
+	 * will be done automatically in this case.
+	 */
+	if (gic_fd < 0)
+		return 0;
+
+	if (!ioctl(gic_fd, KVM_HAS_DEVICE_ATTR, &nr_irqs_attr)) {
+		ret = ioctl(gic_fd, KVM_SET_DEVICE_ATTR, &nr_irqs_attr);
+		if (ret)
+			return ret;
+	}
+
+	irq__routing_init(kvm);
+
+	if (!ioctl(gic_fd, KVM_HAS_DEVICE_ATTR, &vgic_init_attr)) {
+		ret = ioctl(gic_fd, KVM_SET_DEVICE_ATTR, &vgic_init_attr);
+		if (ret)
+			return ret;
+	}
+
+	kvm->msix_needs_devid = kvm__supports_vm_extension(kvm,
+							   KVM_CAP_MSI_DEVID);
+
+	vgic_is_init = true;
+
+	return irq__setup_irqfd_lines(kvm);
+}
+late_init(gic__init_gic)
+
+void gic__generate_fdt_nodes(void *fdt, enum irqchip_type type)
+{
+	const char *compatible, *msi_compatible = NULL;
+	u64 msi_prop[2];
+	u64 reg_prop[] = {
+		cpu_to_fdt64(ARM_GIC_DIST_BASE), cpu_to_fdt64(ARM_GIC_DIST_SIZE),
+		0, 0,				/* to be filled */
+	};
+
+	switch (type) {
+	case IRQCHIP_GICV2M:
+		msi_compatible = "arm,gic-v2m-frame";
+		/* fall-through */
+	case IRQCHIP_GICV2:
+		compatible = "arm,cortex-a15-gic";
+		reg_prop[2] = cpu_to_fdt64(ARM_GIC_CPUI_BASE);
+		reg_prop[3] = cpu_to_fdt64(ARM_GIC_CPUI_SIZE);
+		break;
+	case IRQCHIP_GICV3_ITS:
+		msi_compatible = "arm,gic-v3-its";
+		/* fall-through */
+	case IRQCHIP_GICV3:
+		compatible = "arm,gic-v3";
+		reg_prop[2] = cpu_to_fdt64(gic_redists_base);
+		reg_prop[3] = cpu_to_fdt64(gic_redists_size);
+		break;
+	default:
+		return;
+	}
+
+	_FDT(fdt_begin_node(fdt, "intc"));
+	_FDT(fdt_property_string(fdt, "compatible", compatible));
+	_FDT(fdt_property_cell(fdt, "#interrupt-cells", GIC_FDT_IRQ_NUM_CELLS));
+	_FDT(fdt_property(fdt, "interrupt-controller", NULL, 0));
+	_FDT(fdt_property(fdt, "reg", reg_prop, sizeof(reg_prop)));
+	_FDT(fdt_property_cell(fdt, "phandle", PHANDLE_GIC));
+	_FDT(fdt_property_cell(fdt, "#address-cells", 2));
+	_FDT(fdt_property_cell(fdt, "#size-cells", 2));
+
+	if (msi_compatible) {
+		_FDT(fdt_property(fdt, "ranges", NULL, 0));
+
+		_FDT(fdt_begin_node(fdt, "msic"));
+		_FDT(fdt_property_string(fdt, "compatible", msi_compatible));
+		_FDT(fdt_property(fdt, "msi-controller", NULL, 0));
+		_FDT(fdt_property_cell(fdt, "phandle", PHANDLE_MSI));
+		msi_prop[0] = cpu_to_fdt64(gic_msi_base);
+		msi_prop[1] = cpu_to_fdt64(gic_msi_size);
+		_FDT(fdt_property(fdt, "reg", msi_prop, sizeof(msi_prop)));
+		_FDT(fdt_end_node(fdt));
+	}
+
+	_FDT(fdt_end_node(fdt));
+}
+
+#define KVM_IRQCHIP_IRQ(x) ((KVM_ARM_IRQ_TYPE_SPI << KVM_ARM_IRQ_TYPE_SHIFT) |\
+			    ((x) & KVM_ARM_IRQ_NUM_MASK))
+
+void kvm__irq_line(struct kvm *kvm, int irq, int level)
+{
+	struct kvm_irq_level irq_level = {
+		.irq	= KVM_IRQCHIP_IRQ(irq),
+		.level	= !!level,
+	};
+
+	if (irq < GIC_SPI_IRQ_BASE || irq > GIC_MAX_IRQ)
+		pr_warning("Ignoring invalid GIC IRQ %d", irq);
+	else if (ioctl(kvm->vm_fd, KVM_IRQ_LINE, &irq_level) < 0)
+		pr_warning("Could not KVM_IRQ_LINE for irq %d", irq);
+}
+
+void kvm__irq_trigger(struct kvm *kvm, int irq)
+{
+	kvm__irq_line(kvm, irq, VIRTIO_IRQ_HIGH);
+	kvm__irq_line(kvm, irq, VIRTIO_IRQ_LOW);
+}
+
+int gic__add_irqfd(struct kvm *kvm, unsigned int gsi, int trigger_fd,
+		   int resample_fd)
+{
+	struct kvm_irqfd_line *line;
+
+	if (vgic_is_init)
+		return irq__common_add_irqfd(kvm, gsi, trigger_fd, resample_fd);
+
+	/* Postpone the routing setup until we have a distributor */
+	line = malloc(sizeof(*line));
+	if (!line)
+		return -ENOMEM;
+
+	*line = (struct kvm_irqfd_line) {
+		.gsi		= gsi,
+		.trigger_fd	= trigger_fd,
+		.resample_fd	= resample_fd,
+	};
+	list_add(&line->list, &irqfd_lines);
+
+	return 0;
+}
+
+void gic__del_irqfd(struct kvm *kvm, unsigned int gsi, int trigger_fd)
+{
+	struct kvm_irqfd_line *line;
+
+	if (vgic_is_init) {
+		irq__common_del_irqfd(kvm, gsi, trigger_fd);
+		return;
+	}
+
+	list_for_each_entry(line, &irqfd_lines, list) {
+		if (line->gsi != gsi)
+			continue;
+
+		list_del(&line->list);
+		free(line);
+		break;
+	}
+}
diff --git a/kvmtool/arm/gicv2m.c b/kvmtool/arm/gicv2m.c
new file mode 100644
index 0000000..d7e6398
--- /dev/null
+++ b/kvmtool/arm/gicv2m.c
@@ -0,0 +1,153 @@
+#include <errno.h>
+#include <stdlib.h>
+
+#include "kvm/irq.h"
+#include "kvm/kvm.h"
+#include "kvm/util.h"
+
+#include "arm-common/gic.h"
+
+#define GICV2M_MSI_TYPER	0x008
+#define GICV2M_MSI_SETSPI	0x040
+#define GICV2M_MSI_IIDR		0xfcc
+
+#define GICV2M_SPI_MASK		0x3ff
+#define GICV2M_MSI_TYPER_VAL(start, nr)	\
+	(((start) & GICV2M_SPI_MASK) << 16 | ((nr) & GICV2M_SPI_MASK))
+
+struct gicv2m_chip {
+	int	first_spi;
+	int	num_spis;
+	int	*spis;
+	u64	base;
+	u64	size;
+};
+
+static struct gicv2m_chip v2m;
+
+/*
+ * MSI routing is set up lazily, when the guest writes the MSI tables. The
+ * guest writes the SPI associated with an MSI vector into the message data
+ * field. The IRQ code notifies us of any change to MSI routing via this
+ * callback. Store the MSI->SPI translation for later.
+ *
+ * The data field holds the GIC interrupt ID, which includes SGIs and PPIs:
+ * SGIs are 0-15, PPIs are 16-31 and SPIs are 32-1019. What we save for later
+ * is the MSI's GSI number, a logical ID used by KVM for routing. The GSI of
+ * an SPI is implicitly defined by KVM to be its pin number (SPI index), and
+ * the GSI of an MSI is allocated by kvmtool.
+ */
+static int gicv2m_update_routing(struct kvm *kvm,
+				 struct kvm_irq_routing_entry *entry)
+{
+	int spi;
+
+	if (entry->type != KVM_IRQ_ROUTING_MSI)
+		return -EINVAL;
+
+	if (!entry->u.msi.address_hi && !entry->u.msi.address_lo)
+		return 0;
+
+	spi = entry->u.msi.data & GICV2M_SPI_MASK;
+	if (spi < v2m.first_spi || spi >= v2m.first_spi + v2m.num_spis) {
+		pr_err("invalid SPI number %d", spi);
+		return -EINVAL;
+	}
+
+	v2m.spis[spi - v2m.first_spi] = entry->gsi;
+
+	return 0;
+}
+
+/*
+ * Find SPI bound to the given MSI and return the associated GSI.
+ */
+static int gicv2m_translate_gsi(struct kvm *kvm, u32 gsi)
+{
+	int i;
+
+	for (i = 0; i < v2m.num_spis; i++) {
+		if (v2m.spis[i] == (int)gsi)
+			return i + v2m.first_spi - KVM_IRQ_OFFSET;
+	}
+
+	/* Not an MSI */
+	return gsi;
+}
+
+static bool gicv2m_can_signal_msi(struct kvm *kvm)
+{
+	return true;
+}
+
+/*
+ * Instead of setting up MSI routes, virtual devices can also trigger them
+ * manually (like a direct write to MSI_SETSPI). In this case, trigger the SPI
+ * directly.
+ */
+static int gicv2m_signal_msi(struct kvm *kvm, struct kvm_msi *msi)
+{
+	int spi = msi->data & GICV2M_SPI_MASK;
+
+	if (spi < v2m.first_spi || spi >= v2m.first_spi + v2m.num_spis) {
+		pr_err("invalid SPI number %d", spi);
+		return -EINVAL;
+	}
+
+	kvm__irq_trigger(kvm, spi);
+	return 0;
+}
+
+static struct msi_routing_ops gicv2m_routing = {
+	.update_route	= gicv2m_update_routing,
+	.translate_gsi	= gicv2m_translate_gsi,
+	.can_signal_msi	= gicv2m_can_signal_msi,
+	.signal_msi	= gicv2m_signal_msi,
+};
+
+static void gicv2m_mmio_callback(struct kvm_cpu *vcpu, u64 addr, u8 *data,
+				  u32 len, u8 is_write, void *ptr)
+{
+	if (is_write)
+		return;
+
+	addr -= v2m.base;
+
+	switch (addr) {
+	case GICV2M_MSI_TYPER:
+		*(u32 *)data = GICV2M_MSI_TYPER_VAL(v2m.first_spi,
+						    v2m.num_spis);
+		break;
+	case GICV2M_MSI_IIDR:
+		*(u32 *)data = 0x0;
+		break;
+	}
+}
+
+int gic__create_gicv2m_frame(struct kvm *kvm, u64 base)
+{
+	int i;
+	int irq = irq__alloc_line();
+
+	v2m = (struct gicv2m_chip) {
+		.first_spi	= irq,	/* Includes GIC_SPI_IRQ_BASE */
+		.num_spis	= 64,	/* arbitrary */
+		.base		= base,
+		.size		= KVM_VGIC_V2M_SIZE,
+	};
+
+	v2m.spis = calloc(v2m.num_spis, sizeof(int));
+	if (!v2m.spis)
+		return -ENOMEM;
+
+	v2m.spis[0] = -1;
+	for (i = 1; i < v2m.num_spis; i++) {
+		irq__alloc_line();
+		v2m.spis[i] = -1;
+	}
+
+	msi_routing_ops = &gicv2m_routing;
+
+	return kvm__register_mmio(kvm, base, KVM_VGIC_V2M_SIZE, false,
+				  gicv2m_mmio_callback, kvm);
+}
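
GICV2M_MSI_TYPER_VAL packs the first SPI into bits [25:16] and the SPI count into bits [9:0], which is what a guest's gicv2m driver expects to read back from the TYPER register. A sketch of the inverse decoding, as a guest would perform it (illustrative only, not part of kvmtool):

	#include <stdint.h>

	/* Inverse of GICV2M_MSI_TYPER_VAL above: recover the SPI range. */
	static void gicv2m_decode_typer(uint32_t typer,
					unsigned int *first_spi,
					unsigned int *nr_spis)
	{
		*first_spi = (typer >> 16) & 0x3ff;	/* GICV2M_SPI_MASK */
		*nr_spis   = typer & 0x3ff;
	}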
diff --git a/kvmtool/arm/include/arm-common/fdt-arch.h b/kvmtool/arm/include/arm-common/fdt-arch.h
new file mode 100644
index 0000000..60c2d40
--- /dev/null
+++ b/kvmtool/arm/include/arm-common/fdt-arch.h
@@ -0,0 +1,6 @@
+#ifndef ARM__FDT_H
+#define ARM__FDT_H
+
+enum phandles {PHANDLE_RESERVED = 0, PHANDLE_GIC, PHANDLE_MSI, PHANDLES_MAX};
+
+#endif /* ARM__FDT_H */
diff --git a/kvmtool/arm/include/arm-common/gic.h b/kvmtool/arm/include/arm-common/gic.h
new file mode 100644
index 0000000..ec9cf31
--- /dev/null
+++ b/kvmtool/arm/include/arm-common/gic.h
@@ -0,0 +1,47 @@
+#ifndef ARM_COMMON__GIC_H
+#define ARM_COMMON__GIC_H
+
+#define GIC_SGI_IRQ_BASE		0
+#define GIC_PPI_IRQ_BASE		16
+#define GIC_SPI_IRQ_BASE		32
+
+#define GIC_FDT_IRQ_NUM_CELLS		3
+
+#define GIC_FDT_IRQ_TYPE_SPI		0
+#define GIC_FDT_IRQ_TYPE_PPI		1
+
+#define GIC_FDT_IRQ_PPI_CPU_SHIFT	8
+#define GIC_FDT_IRQ_PPI_CPU_MASK	(0xff << GIC_FDT_IRQ_PPI_CPU_SHIFT)
+
+#define GIC_CPUI_CTLR_EN		(1 << 0)
+#define GIC_CPUI_PMR_MIN_PRIO		0xff
+
+#define GIC_CPUI_OFF_PMR		4
+
+#define GIC_MAX_CPUS			8
+#define GIC_MAX_IRQ			255
+
+#define KVM_VGIC_V2M_SIZE		0x1000
+
+enum irqchip_type {
+	IRQCHIP_AUTO,
+	IRQCHIP_GICV2,
+	IRQCHIP_GICV2M,
+	IRQCHIP_GICV3,
+	IRQCHIP_GICV3_ITS,
+};
+
+struct kvm;
+
+int gic__alloc_irqnum(void);
+int gic__create(struct kvm *kvm, enum irqchip_type type);
+int gic__create_gicv2m_frame(struct kvm *kvm, u64 msi_frame_addr);
+void gic__generate_fdt_nodes(void *fdt, enum irqchip_type type);
+
+int gic__add_irqfd(struct kvm *kvm, unsigned int gsi, int trigger_fd,
+		   int resample_fd);
+void gic__del_irqfd(struct kvm *kvm, unsigned int gsi, int trigger_fd);
+#define irq__add_irqfd gic__add_irqfd
+#define irq__del_irqfd gic__del_irqfd
+
+#endif /* ARM_COMMON__GIC_H */
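
The three base defines at the top of this header mirror the GIC interrupt ID spaces: SGIs occupy INTIDs 0-15, PPIs 16-31, and SPIs start at 32. Device tree cells, by contrast, number PPIs and SPIs from 0 within their own class. A sketch of the conversions implied by these constants (helper names are illustrative; gic.c and pci.c open-code the same arithmetic):

	/* INTID <-> FDT cell conversions implied by the defines above. */
	static inline int spi_to_fdt_cell(int intid)
	{
		return intid - GIC_SPI_IRQ_BASE;	/* SPI cells count from 0 */
	}

	static inline int ppi_to_fdt_cell(int intid)
	{
		return intid - GIC_PPI_IRQ_BASE;	/* PPI cells count from 0 */
	}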
diff --git a/kvmtool/arm/include/arm-common/kvm-arch.h b/kvmtool/arm/include/arm-common/kvm-arch.h
new file mode 100644
index 0000000..d84e50c
--- /dev/null
+++ b/kvmtool/arm/include/arm-common/kvm-arch.h
@@ -0,0 +1,81 @@
+#ifndef ARM_COMMON__KVM_ARCH_H
+#define ARM_COMMON__KVM_ARCH_H
+
+#include <stdbool.h>
+#include <linux/const.h>
+#include <linux/types.h>
+
+#include "arm-common/gic.h"
+
+#define ARM_IOPORT_AREA		_AC(0x0000000000000000, UL)
+#define ARM_FLASH_AREA		_AC(0x0000000002000000, UL)
+#define ARM_MMIO_AREA		_AC(0x0000000003000000, UL)
+#define ARM_AXI_AREA		_AC(0x0000000040000000, UL)
+#define ARM_MEMORY_AREA		_AC(0x0000000080000000, UL)
+
+#define ARM_LOMAP_MAX_MEMORY	((1ULL << 32) - ARM_MEMORY_AREA)
+#define ARM_HIMAP_MAX_MEMORY	((1ULL << 40) - ARM_MEMORY_AREA)
+
+#define ARM_GIC_DIST_BASE	(ARM_AXI_AREA - ARM_GIC_DIST_SIZE)
+#define ARM_GIC_CPUI_BASE	(ARM_GIC_DIST_BASE - ARM_GIC_CPUI_SIZE)
+#define ARM_GIC_SIZE		(ARM_GIC_DIST_SIZE + ARM_GIC_CPUI_SIZE)
+#define ARM_GIC_DIST_SIZE	0x10000
+#define ARM_GIC_CPUI_SIZE	0x20000
+
+#define KVM_FLASH_MMIO_BASE	ARM_FLASH_AREA
+#define KVM_FLASH_MAX_SIZE	(ARM_MMIO_AREA - ARM_FLASH_AREA)
+
+#define ARM_IOPORT_SIZE		(1U << 16)
+#define ARM_VIRTIO_MMIO_SIZE	(ARM_AXI_AREA - (ARM_MMIO_AREA + ARM_GIC_SIZE))
+#define ARM_PCI_CFG_SIZE	(1ULL << 24)
+#define ARM_PCI_MMIO_SIZE	(ARM_MEMORY_AREA - \
+				(ARM_AXI_AREA + ARM_PCI_CFG_SIZE))
+
+#define KVM_IOPORT_AREA		ARM_IOPORT_AREA
+#define KVM_PCI_CFG_AREA	ARM_AXI_AREA
+#define KVM_PCI_MMIO_AREA	(KVM_PCI_CFG_AREA + ARM_PCI_CFG_SIZE)
+#define KVM_VIRTIO_MMIO_AREA	ARM_MMIO_AREA
+
+#define KVM_IOEVENTFD_HAS_PIO	0
+
+/*
+ * On a GICv3 there must be one redistributor per vCPU.
+ * The value here is the size of a single one; at runtime we multiply it
+ * by the number of requested vCPUs to get the total size.
+ */
+#define ARM_GIC_REDIST_SIZE	0x20000
+
+#define KVM_IRQ_OFFSET		GIC_SPI_IRQ_BASE
+
+#define KVM_VM_TYPE		0
+
+#define VIRTIO_DEFAULT_TRANS(kvm)	\
+	((kvm)->cfg.arch.virtio_trans_pci ? VIRTIO_PCI : VIRTIO_MMIO)
+
+#define VIRTIO_RING_ENDIAN	(VIRTIO_ENDIAN_LE | VIRTIO_ENDIAN_BE)
+
+static inline bool arm_addr_in_ioport_region(u64 phys_addr)
+{
+	u64 limit = KVM_IOPORT_AREA + ARM_IOPORT_SIZE;
+	return phys_addr >= KVM_IOPORT_AREA && phys_addr < limit;
+}
+
+struct kvm_arch {
+	/*
+	 * We may have to align the guest memory for virtio, so keep the
+	 * original pointers here for munmap.
+	 */
+	void	*ram_alloc_start;
+	u64	ram_alloc_size;
+
+	/*
+	 * Guest addresses for memory layout.
+	 */
+	u64	memory_guest_start;
+	u64	kern_guest_start;
+	u64	initrd_guest_start;
+	u64	initrd_size;
+	u64	dtb_guest_start;
+};
+
+#endif /* ARM_COMMON__KVM_ARCH_H */
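
Since the GIC frames are carved out downwards from ARM_AXI_AREA, the derived constants are easy to get wrong when editing. A compile-time sketch that spells out the addresses these defines work out to (C11 _Static_assert, for review purposes only):

	_Static_assert(ARM_GIC_DIST_BASE == 0x3fff0000UL,
		       "distributor sits 64K below the AXI area");
	_Static_assert(ARM_GIC_CPUI_BASE == 0x3ffd0000UL,
		       "CPU interface sits 128K below the distributor");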
diff --git a/kvmtool/arm/include/arm-common/kvm-config-arch.h b/kvmtool/arm/include/arm-common/kvm-config-arch.h
new file mode 100644
index 0000000..5734c46
--- /dev/null
+++ b/kvmtool/arm/include/arm-common/kvm-config-arch.h
@@ -0,0 +1,38 @@
+#ifndef ARM_COMMON__KVM_CONFIG_ARCH_H
+#define ARM_COMMON__KVM_CONFIG_ARCH_H
+
+#include "kvm/parse-options.h"
+
+struct kvm_config_arch {
+	const char	*dump_dtb_filename;
+	unsigned int	force_cntfrq;
+	bool		virtio_trans_pci;
+	bool		aarch32_guest;
+	bool		has_pmuv3;
+	u64		kaslr_seed;
+	enum irqchip_type irqchip;
+	u64		fw_addr;
+};
+
+int irqchip_parser(const struct option *opt, const char *arg, int unset);
+
+#define OPT_ARCH_RUN(pfx, cfg)							\
+	pfx,									\
+	ARM_OPT_ARCH_RUN(cfg)							\
+	OPT_STRING('\0', "dump-dtb", &(cfg)->dump_dtb_filename,			\
+		   ".dtb file", "Dump generated .dtb to specified file"),	\
+	OPT_UINTEGER('\0', "override-bad-firmware-cntfrq", &(cfg)->force_cntfrq,\
+		     "Specify Generic Timer frequency in guest DT to "		\
+		     "work around buggy secure firmware *Firmware should be "	\
+		     "updated to program CNTFRQ correctly*"),			\
+	OPT_BOOLEAN('\0', "force-pci", &(cfg)->virtio_trans_pci,		\
+		    "Force virtio devices to use PCI as their default "		\
+		    "transport"),						\
+        OPT_CALLBACK('\0', "irqchip", &(cfg)->irqchip,				\
+		     "[gicv2|gicv2m|gicv3|gicv3-its]",				\
+		     "Type of interrupt controller to emulate in the guest",	\
+		     irqchip_parser, NULL),					\
+	OPT_U64('\0', "firmware-address", &(cfg)->fw_addr,			\
+		"Address where firmware should be loaded"),
+
+#endif /* ARM_COMMON__KVM_CONFIG_ARCH_H */
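
irqchip_parser is only declared here; its definition lives elsewhere in the tree. A plausible sketch of what it does, mapping the option strings advertised above onto enum irqchip_type (a hypothetical reconstruction, not the actual implementation):

	#include <string.h>

	int irqchip_parser(const struct option *opt, const char *arg, int unset)
	{
		enum irqchip_type *type = opt->value;

		if (!strcmp(arg, "gicv2"))
			*type = IRQCHIP_GICV2;
		else if (!strcmp(arg, "gicv2m"))
			*type = IRQCHIP_GICV2M;
		else if (!strcmp(arg, "gicv3"))
			*type = IRQCHIP_GICV3;
		else if (!strcmp(arg, "gicv3-its"))
			*type = IRQCHIP_GICV3_ITS;
		else
			return -1;	/* unknown irqchip name */

		return 0;
	}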
diff --git a/kvmtool/arm/include/arm-common/kvm-cpu-arch.h b/kvmtool/arm/include/arm-common/kvm-cpu-arch.h
new file mode 100644
index 0000000..923d2c4
--- /dev/null
+++ b/kvmtool/arm/include/arm-common/kvm-cpu-arch.h
@@ -0,0 +1,62 @@
+#ifndef ARM_COMMON__KVM_CPU_ARCH_H
+#define ARM_COMMON__KVM_CPU_ARCH_H
+
+#include <linux/kvm.h>
+#include <pthread.h>
+#include <stdbool.h>
+
+struct kvm;
+
+struct kvm_cpu {
+	pthread_t	thread;
+
+	unsigned long	cpu_id;
+	unsigned long	cpu_type;
+	const char	*cpu_compatible;
+
+	struct kvm	*kvm;
+	int		vcpu_fd;
+	struct kvm_run	*kvm_run;
+	struct kvm_cpu_task	*task;
+
+	u8		is_running;
+	u8		paused;
+	u8		needs_nmi;
+
+	struct kvm_coalesced_mmio_ring	*ring;
+
+	void		(*generate_fdt_nodes)(void *fdt, struct kvm* kvm);
+};
+
+struct kvm_arm_target {
+	u32		id;
+	const char	*compatible;
+	int		(*init)(struct kvm_cpu *vcpu);
+};
+
+void kvm_cpu__set_kvm_arm_generic_target(struct kvm_arm_target *target);
+
+int kvm_cpu__register_kvm_arm_target(struct kvm_arm_target *target);
+
+static inline bool kvm_cpu__emulate_io(struct kvm_cpu *vcpu, u16 port, void *data,
+				       int direction, int size, u32 count)
+{
+	return false;
+}
+
+static inline bool kvm_cpu__emulate_mmio(struct kvm_cpu *vcpu, u64 phys_addr,
+					 u8 *data, u32 len, u8 is_write)
+{
+	if (arm_addr_in_ioport_region(phys_addr)) {
+		int direction = is_write ? KVM_EXIT_IO_OUT : KVM_EXIT_IO_IN;
+		u16 port = (phys_addr - KVM_IOPORT_AREA) & USHRT_MAX;
+
+		return kvm__emulate_io(vcpu, port, data, direction, len, 1);
+	}
+
+	return kvm__emulate_mmio(vcpu, phys_addr, data, len, is_write);
+}
+
+unsigned long kvm_cpu__get_vcpu_mpidr(struct kvm_cpu *vcpu);
+
+#endif /* ARM_COMMON__KVM_CPU_ARCH_H */
diff --git a/kvmtool/arm/include/arm-common/pci.h b/kvmtool/arm/include/arm-common/pci.h
new file mode 100644
index 0000000..9008a0e
--- /dev/null
+++ b/kvmtool/arm/include/arm-common/pci.h
@@ -0,0 +1,6 @@
+#ifndef ARM_COMMON__PCI_H
+#define ARM_COMMON__PCI_H
+
+void pci__generate_fdt_nodes(void *fdt);
+
+#endif /* ARM_COMMON__PCI_H */
diff --git a/kvmtool/arm/include/arm-common/pmu.h b/kvmtool/arm/include/arm-common/pmu.h
new file mode 100644
index 0000000..7a170a5
--- /dev/null
+++ b/kvmtool/arm/include/arm-common/pmu.h
@@ -0,0 +1,10 @@
+#ifndef ARM_COMMON__PMU_H
+#define ARM_COMMON__PMU_H
+
+#define KVM_ARM_PMUv3_PPI			23
+
+struct kvm;
+
+void pmu__generate_fdt_nodes(void *fdt, struct kvm *kvm);
+
+#endif /* ARM_COMMON__PMU_H */
diff --git a/kvmtool/arm/include/arm-common/timer.h b/kvmtool/arm/include/arm-common/timer.h
new file mode 100644
index 0000000..928e9ea
--- /dev/null
+++ b/kvmtool/arm/include/arm-common/timer.h
@@ -0,0 +1,6 @@
+#ifndef ARM_COMMON__TIMER_H
+#define ARM_COMMON__TIMER_H
+
+void timer__generate_fdt_nodes(void *fdt, struct kvm *kvm, int *irqs);
+
+#endif /* ARM_COMMON__TIMER_H */
diff --git a/kvmtool/arm/ioport.c b/kvmtool/arm/ioport.c
new file mode 100644
index 0000000..2f0feb9
--- /dev/null
+++ b/kvmtool/arm/ioport.c
@@ -0,0 +1,12 @@
+#include "kvm/ioport.h"
+#include "kvm/irq.h"
+
+int ioport__setup_arch(struct kvm *kvm)
+{
+	return 0;
+}
+
+void ioport__map_irq(u8 *irq)
+{
+	*irq = irq__alloc_line();
+}
diff --git a/kvmtool/arm/kvm-cpu.c b/kvmtool/arm/kvm-cpu.c
new file mode 100644
index 0000000..2acecae
--- /dev/null
+++ b/kvmtool/arm/kvm-cpu.c
@@ -0,0 +1,153 @@
+#include "kvm/kvm.h"
+#include "kvm/kvm-cpu.h"
+
+static int debug_fd;
+
+void kvm_cpu__set_debug_fd(int fd)
+{
+	debug_fd = fd;
+}
+
+int kvm_cpu__get_debug_fd(void)
+{
+	return debug_fd;
+}
+
+static struct kvm_arm_target *kvm_arm_generic_target;
+static struct kvm_arm_target *kvm_arm_targets[KVM_ARM_NUM_TARGETS];
+
+void kvm_cpu__set_kvm_arm_generic_target(struct kvm_arm_target *target)
+{
+	kvm_arm_generic_target = target;
+}
+
+int kvm_cpu__register_kvm_arm_target(struct kvm_arm_target *target)
+{
+	unsigned int i = 0;
+
+	for (i = 0; i < ARRAY_SIZE(kvm_arm_targets); ++i) {
+		if (!kvm_arm_targets[i]) {
+			kvm_arm_targets[i] = target;
+			return 0;
+		}
+	}
+
+	return -ENOSPC;
+}
+
+struct kvm_cpu *kvm_cpu__arch_init(struct kvm *kvm, unsigned long cpu_id)
+{
+	struct kvm_arm_target *target;
+	struct kvm_cpu *vcpu;
+	int coalesced_offset, mmap_size, err = -1;
+	unsigned int i;
+	struct kvm_vcpu_init preferred_init;
+	struct kvm_vcpu_init vcpu_init = {
+		.features = ARM_VCPU_FEATURE_FLAGS(kvm, cpu_id)
+	};
+
+	if (kvm->cfg.arch.aarch32_guest &&
+	    !kvm__supports_extension(kvm, KVM_CAP_ARM_EL1_32BIT))
+		die("32bit guests are not supported\n");
+
+	vcpu = calloc(1, sizeof(struct kvm_cpu));
+	if (!vcpu)
+		return NULL;
+
+	vcpu->vcpu_fd = ioctl(kvm->vm_fd, KVM_CREATE_VCPU, cpu_id);
+	if (vcpu->vcpu_fd < 0)
+		die_perror("KVM_CREATE_VCPU ioctl");
+
+	mmap_size = ioctl(kvm->sys_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
+	if (mmap_size < 0)
+		die_perror("KVM_GET_VCPU_MMAP_SIZE ioctl");
+
+	vcpu->kvm_run = mmap(NULL, mmap_size, PROT_RW, MAP_SHARED,
+			     vcpu->vcpu_fd, 0);
+	if (vcpu->kvm_run == MAP_FAILED)
+		die("unable to mmap vcpu fd");
+
+	/* Set KVM_ARM_VCPU_PSCI_0_2 if available */
+	if (kvm__supports_extension(kvm, KVM_CAP_ARM_PSCI_0_2)) {
+		vcpu_init.features[0] |= (1UL << KVM_ARM_VCPU_PSCI_0_2);
+	}
+
+	kvm_cpu__select_features(kvm, &vcpu_init);
+
+	/*
+	 * If the preferred-target ioctl succeeds, use the preferred target;
+	 * otherwise try each and every registered target type.
+	 */
+	err = ioctl(kvm->vm_fd, KVM_ARM_PREFERRED_TARGET, &preferred_init);
+	if (!err) {
+		/* Match preferred target CPU type. */
+		target = NULL;
+		for (i = 0; i < ARRAY_SIZE(kvm_arm_targets); ++i) {
+			if (!kvm_arm_targets[i])
+				continue;
+			if (kvm_arm_targets[i]->id == preferred_init.target) {
+				target = kvm_arm_targets[i];
+				break;
+			}
+		}
+		if (!target) {
+			target = kvm_arm_generic_target;
+			vcpu_init.target = preferred_init.target;
+		} else {
+			vcpu_init.target = target->id;
+		}
+		err = ioctl(vcpu->vcpu_fd, KVM_ARM_VCPU_INIT, &vcpu_init);
+	} else {
+		/* Find an appropriate target CPU type. */
+		for (i = 0; i < ARRAY_SIZE(kvm_arm_targets); ++i) {
+			if (!kvm_arm_targets[i])
+				continue;
+			target = kvm_arm_targets[i];
+			vcpu_init.target = target->id;
+			err = ioctl(vcpu->vcpu_fd, KVM_ARM_VCPU_INIT, &vcpu_init);
+			if (!err)
+				break;
+		}
+		if (err)
+			die("Unable to find matching target");
+	}
+
+	if (err || target->init(vcpu))
+		die("Unable to initialise vcpu");
+
+	coalesced_offset = ioctl(kvm->sys_fd, KVM_CHECK_EXTENSION,
+				 KVM_CAP_COALESCED_MMIO);
+	if (coalesced_offset)
+		vcpu->ring = (void *)vcpu->kvm_run +
+			     (coalesced_offset * PAGE_SIZE);
+
+	/* Populate the vcpu structure. */
+	vcpu->kvm		= kvm;
+	vcpu->cpu_id		= cpu_id;
+	vcpu->cpu_type		= vcpu_init.target;
+	vcpu->cpu_compatible	= target->compatible;
+	vcpu->is_running	= true;
+
+	if (kvm_cpu__configure_features(vcpu))
+		die("Unable to configure requested vcpu features");
+
+	return vcpu;
+}
+
+void kvm_cpu__arch_nmi(struct kvm_cpu *cpu)
+{
+}
+
+void kvm_cpu__delete(struct kvm_cpu *vcpu)
+{
+	free(vcpu);
+}
+
+bool kvm_cpu__handle_exit(struct kvm_cpu *vcpu)
+{
+	return false;
+}
+
+void kvm_cpu__show_page_tables(struct kvm_cpu *vcpu)
+{
+}
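
The target-selection logic above boils down to a two-ioctl handshake with KVM. Stripped of the target-table lookups, the core of it is (a sketch, error handling elided):

	struct kvm_vcpu_init init;

	/* Ask KVM which CPU model it would prefer to emulate... */
	ioctl(vm_fd, KVM_ARM_PREFERRED_TARGET, &init);
	/* ...then initialise the vcpu as exactly that model. */
	ioctl(vcpu_fd, KVM_ARM_VCPU_INIT, &init);

On kernels predating KVM_ARM_PREFERRED_TARGET, the code instead probes every registered target until one of the KVM_ARM_VCPU_INIT calls succeeds.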
diff --git a/kvmtool/arm/kvm.c b/kvmtool/arm/kvm.c
new file mode 100644
index 0000000..5aea18f
--- /dev/null
+++ b/kvmtool/arm/kvm.c
@@ -0,0 +1,239 @@
+#include "kvm/kvm.h"
+#include "kvm/term.h"
+#include "kvm/util.h"
+#include "kvm/8250-serial.h"
+#include "kvm/virtio-console.h"
+#include "kvm/fdt.h"
+
+#include "arm-common/gic.h"
+
+#include <linux/kernel.h>
+#include <linux/kvm.h>
+#include <linux/sizes.h>
+
+struct kvm_ext kvm_req_ext[] = {
+	{ DEFINE_KVM_EXT(KVM_CAP_IRQCHIP) },
+	{ DEFINE_KVM_EXT(KVM_CAP_ONE_REG) },
+	{ DEFINE_KVM_EXT(KVM_CAP_ARM_PSCI) },
+	{ 0, 0 },
+};
+
+bool kvm__arch_cpu_supports_vm(void)
+{
+	/* The KVM capability check is enough. */
+	return true;
+}
+
+void kvm__init_ram(struct kvm *kvm)
+{
+	int err;
+	u64 phys_start, phys_size;
+	void *host_mem;
+
+	phys_start	= ARM_MEMORY_AREA;
+	phys_size	= kvm->ram_size;
+	host_mem	= kvm->ram_start;
+
+	err = kvm__register_ram(kvm, phys_start, phys_size, host_mem);
+	if (err)
+		die("Failed to register %lld bytes of memory at physical "
+		    "address 0x%llx [err %d]", phys_size, phys_start, err);
+
+	kvm->arch.memory_guest_start = phys_start;
+}
+
+void kvm__arch_delete_ram(struct kvm *kvm)
+{
+	munmap(kvm->arch.ram_alloc_start, kvm->arch.ram_alloc_size);
+}
+
+void kvm__arch_read_term(struct kvm *kvm)
+{
+	serial8250__update_consoles(kvm);
+	virtio_console__inject_interrupt(kvm);
+}
+
+void kvm__arch_set_cmdline(char *cmdline, bool video)
+{
+}
+
+void kvm__arch_init(struct kvm *kvm, const char *hugetlbfs_path, u64 ram_size)
+{
+	/*
+	 * Allocate guest memory. We must align our buffer to 64K to
+	 * match the maximum guest page size for virtio-mmio.
+	 * If using THP, then our minimum alignment becomes 2M.
+	 * 2M trumps 64K, so let's go with that.
+	 */
+	kvm->ram_size = min(ram_size, (u64)ARM_MAX_MEMORY(kvm));
+	kvm->arch.ram_alloc_size = kvm->ram_size + SZ_2M;
+	kvm->arch.ram_alloc_start = mmap_anon_or_hugetlbfs(kvm, hugetlbfs_path,
+						kvm->arch.ram_alloc_size);
+
+	if (kvm->arch.ram_alloc_start == MAP_FAILED)
+		die("Failed to map %lld bytes for guest memory (%d)",
+		    kvm->arch.ram_alloc_size, errno);
+
+	kvm->ram_start = (void *)ALIGN((unsigned long)kvm->arch.ram_alloc_start,
+					SZ_2M);
+
+	madvise(kvm->arch.ram_alloc_start, kvm->arch.ram_alloc_size,
+		MADV_MERGEABLE);
+
+	madvise(kvm->arch.ram_alloc_start, kvm->arch.ram_alloc_size,
+		MADV_HUGEPAGE);
+
+	/* Create the virtual GIC. */
+	if (gic__create(kvm, kvm->cfg.arch.irqchip))
+		die("Failed to create virtual GIC");
+}
+
+#define FDT_ALIGN	SZ_2M
+#define INITRD_ALIGN	4
+bool kvm__arch_load_kernel_image(struct kvm *kvm, int fd_kernel, int fd_initrd,
+				 const char *kernel_cmdline)
+{
+	void *pos, *kernel_end, *limit;
+	unsigned long guest_addr;
+	ssize_t file_size;
+
+	/*
+	 * Linux requires the initrd and dtb to be mapped inside lowmem,
+	 * so we can't just place them at the top of memory.
+	 */
+	limit = kvm->ram_start + min(kvm->ram_size, (u64)SZ_256M) - 1;
+
+	pos = kvm->ram_start + kvm__arch_get_kern_offset(kvm, fd_kernel);
+	kvm->arch.kern_guest_start = host_to_guest_flat(kvm, pos);
+	file_size = read_file(fd_kernel, pos, limit - pos);
+	if (file_size < 0) {
+		if (errno == ENOMEM)
+			die("kernel image too big to contain in guest memory.");
+
+		die_perror("kernel read");
+	}
+	kernel_end = pos + file_size;
+	pr_debug("Loaded kernel to 0x%llx (%zd bytes)",
+		 kvm->arch.kern_guest_start, file_size);
+
+	/*
+	 * Now load backwards from the end of memory so the kernel
+	 * decompressor has plenty of space to work with. First up is
+	 * the device tree blob...
+	 */
+	pos = limit;
+	pos -= (FDT_MAX_SIZE + FDT_ALIGN);
+	guest_addr = ALIGN(host_to_guest_flat(kvm, pos), FDT_ALIGN);
+	pos = guest_flat_to_host(kvm, guest_addr);
+	if (pos < kernel_end)
+		die("fdt overlaps with kernel image.");
+
+	kvm->arch.dtb_guest_start = guest_addr;
+	pr_debug("Placing fdt at 0x%llx - 0x%llx",
+		 kvm->arch.dtb_guest_start,
+		 host_to_guest_flat(kvm, limit));
+	limit = pos;
+
+	/* ... and finally the initrd, if we have one. */
+	if (fd_initrd != -1) {
+		struct stat sb;
+		unsigned long initrd_start;
+
+		if (fstat(fd_initrd, &sb))
+			die_perror("fstat");
+
+		pos -= (sb.st_size + INITRD_ALIGN);
+		guest_addr = ALIGN(host_to_guest_flat(kvm, pos), INITRD_ALIGN);
+		pos = guest_flat_to_host(kvm, guest_addr);
+		if (pos < kernel_end)
+			die("initrd overlaps with kernel image.");
+
+		initrd_start = guest_addr;
+		file_size = read_file(fd_initrd, pos, limit - pos);
+		if (file_size == -1) {
+			if (errno == ENOMEM)
+				die("initrd too big to contain in guest memory.");
+
+			die_perror("initrd read");
+		}
+
+		kvm->arch.initrd_guest_start = initrd_start;
+		kvm->arch.initrd_size = file_size;
+		pr_debug("Loaded initrd to 0x%llx (%llu bytes)",
+			 kvm->arch.initrd_guest_start,
+			 kvm->arch.initrd_size);
+	} else {
+		kvm->arch.initrd_size = 0;
+	}
+
+	return true;
+}
+
+static bool validate_fw_addr(struct kvm *kvm, u64 fw_addr)
+{
+	u64 ram_phys;
+
+	ram_phys = host_to_guest_flat(kvm, kvm->ram_start);
+
+	if (fw_addr < ram_phys || fw_addr >= ram_phys + kvm->ram_size) {
+		pr_err("Provide --firmware-address an address in RAM: "
+		       "0x%016llx - 0x%016llx",
+		       ram_phys, ram_phys + kvm->ram_size);
+
+		return false;
+	}
+
+	return true;
+}
+
+bool kvm__load_firmware(struct kvm *kvm, const char *firmware_filename)
+{
+	u64 fw_addr = kvm->cfg.arch.fw_addr;
+	void *host_pos;
+	void *limit;
+	ssize_t fw_sz;
+	int fd;
+
+	limit = kvm->ram_start + kvm->ram_size;
+
+	/* If no firmware address was given, load it at the beginning of RAM */
+	if (fw_addr == 0)
+		fw_addr = ARM_MEMORY_AREA;
+
+	if (!validate_fw_addr(kvm, fw_addr))
+		die("Bad firmware destination: 0x%016llx", fw_addr);
+
+	fd = open(firmware_filename, O_RDONLY);
+	if (fd < 0)
+		return false;
+
+	host_pos = guest_flat_to_host(kvm, fw_addr);
+	if (!host_pos || host_pos < kvm->ram_start)
+		return false;
+
+	fw_sz = read_file(fd, host_pos, limit - host_pos);
+	if (fw_sz < 0)
+		die("failed to load firmware");
+	close(fd);
+
+	/* Kernel isn't loaded by kvm, point start address to firmware */
+	kvm->arch.kern_guest_start = fw_addr;
+
+	/* Load the dtb just after the firmware image */
+	host_pos += fw_sz;
+	if (host_pos + FDT_MAX_SIZE > limit)
+		die("not enough space to load fdt");
+
+	kvm->arch.dtb_guest_start = ALIGN(host_to_guest_flat(kvm, host_pos),
+					  FDT_ALIGN);
+	pr_info("Placing fdt at 0x%llx - 0x%llx",
+		kvm->arch.dtb_guest_start,
+		kvm->arch.dtb_guest_start + FDT_MAX_SIZE);
+
+	return true;
+}
+
+int kvm__arch_setup_firmware(struct kvm *kvm)
+{
+	return 0;
+}
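
To make the placement logic concrete: for a guest with 512 MiB of RAM at ARM_MEMORY_AREA (0x80000000), and assuming FDT_MAX_SIZE is 2 MiB (an assumption for the sake of the example), kvm__arch_load_kernel_image produces approximately this guest-physical layout:

	0x80000000  ARM_MEMORY_AREA: kernel image, at its arch-specific offset
	       ...  free space for the kernel decompressor to work in
	0x8fbfxxxx  initrd, 4-byte aligned, ending just below the dtb
	0x8fc00000  dtb, aligned to FDT_ALIGN (2 MiB)
	0x90000000  lowmem ceiling: base + min(ram_size, 256 MiB)

The dtb and initrd are filled in from the top down so that the kernel, loaded near the bottom, keeps the largest possible contiguous gap above it.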
diff --git a/kvmtool/arm/pci.c b/kvmtool/arm/pci.c
new file mode 100644
index 0000000..ed325fa
--- /dev/null
+++ b/kvmtool/arm/pci.c
@@ -0,0 +1,125 @@
+#include "kvm/devices.h"
+#include "kvm/fdt.h"
+#include "kvm/kvm.h"
+#include "kvm/of_pci.h"
+#include "kvm/pci.h"
+#include "kvm/util.h"
+
+#include "arm-common/pci.h"
+
+/*
+ * An entry in the interrupt-map table looks like:
+ * <pci unit address> <pci interrupt pin> <gic phandle> <gic interrupt>
+ */
+
+struct of_gic_irq {
+	u32 type, num, flags;
+} __attribute__((packed));
+
+struct of_interrupt_map_entry {
+	struct of_pci_irq_mask		pci_irq_mask;
+	u32				gic_phandle;
+	u32				gic_addr_hi;
+	u32				gic_addr_lo;
+	struct of_gic_irq		gic_irq;
+} __attribute__((packed));
+
+void pci__generate_fdt_nodes(void *fdt)
+{
+	struct device_header *dev_hdr;
+	struct of_interrupt_map_entry irq_map[OF_PCI_IRQ_MAP_MAX];
+	unsigned nentries = 0;
+	/* Bus range */
+	u32 bus_range[] = { cpu_to_fdt32(0), cpu_to_fdt32(0), };
+	/* Configuration Space */
+	u64 cfg_reg_prop[] = { cpu_to_fdt64(KVM_PCI_CFG_AREA),
+			       cpu_to_fdt64(ARM_PCI_CFG_SIZE), };
+	/* Describe the memory ranges */
+	struct of_pci_ranges_entry ranges[] = {
+		{
+			.pci_addr = {
+				.hi	= cpu_to_fdt32(of_pci_b_ss(OF_PCI_SS_IO)),
+				.mid	= 0,
+				.lo	= 0,
+			},
+			.cpu_addr	= cpu_to_fdt64(KVM_IOPORT_AREA),
+			.length		= cpu_to_fdt64(ARM_IOPORT_SIZE),
+		},
+		{
+			.pci_addr = {
+				.hi	= cpu_to_fdt32(of_pci_b_ss(OF_PCI_SS_M32)),
+				.mid	= cpu_to_fdt32(KVM_PCI_MMIO_AREA >> 32),
+				.lo	= cpu_to_fdt32(KVM_PCI_MMIO_AREA),
+			},
+			.cpu_addr	= cpu_to_fdt64(KVM_PCI_MMIO_AREA),
+			.length		= cpu_to_fdt64(ARM_PCI_MMIO_SIZE),
+		},
+	};
+
+	/* Boilerplate PCI properties */
+	_FDT(fdt_begin_node(fdt, "pci"));
+	_FDT(fdt_property_string(fdt, "device_type", "pci"));
+	_FDT(fdt_property_cell(fdt, "#address-cells", 0x3));
+	_FDT(fdt_property_cell(fdt, "#size-cells", 0x2));
+	_FDT(fdt_property_cell(fdt, "#interrupt-cells", 0x1));
+	_FDT(fdt_property_string(fdt, "compatible", "pci-host-cam-generic"));
+	_FDT(fdt_property(fdt, "dma-coherent", NULL, 0));
+
+	_FDT(fdt_property(fdt, "bus-range", bus_range, sizeof(bus_range)));
+	_FDT(fdt_property(fdt, "reg", &cfg_reg_prop, sizeof(cfg_reg_prop)));
+	_FDT(fdt_property(fdt, "ranges", ranges, sizeof(ranges)));
+	_FDT(fdt_property_cell(fdt, "msi-parent", PHANDLE_MSI));
+
+	/* Generate the interrupt map ... */
+	dev_hdr = device__first_dev(DEVICE_BUS_PCI);
+	while (dev_hdr && nentries < ARRAY_SIZE(irq_map)) {
+		struct of_interrupt_map_entry *entry = &irq_map[nentries];
+		struct pci_device_header *pci_hdr = dev_hdr->data;
+		u8 dev_num = dev_hdr->dev_num;
+		u8 pin = pci_hdr->irq_pin;
+		u8 irq = pci_hdr->irq_line;
+		u32 irq_flags = pci_hdr->irq_type;
+
+		*entry = (struct of_interrupt_map_entry) {
+			.pci_irq_mask = {
+				.pci_addr = {
+					.hi	= cpu_to_fdt32(of_pci_b_ddddd(dev_num)),
+					.mid	= 0,
+					.lo	= 0,
+				},
+				.pci_pin	= cpu_to_fdt32(pin),
+			},
+			.gic_phandle	= cpu_to_fdt32(PHANDLE_GIC),
+			.gic_addr_hi	= 0,
+			.gic_addr_lo	= 0,
+			.gic_irq = {
+				.type	= cpu_to_fdt32(GIC_FDT_IRQ_TYPE_SPI),
+				.num	= cpu_to_fdt32(irq - GIC_SPI_IRQ_BASE),
+				.flags	= cpu_to_fdt32(irq_flags),
+			},
+		};
+
+		nentries++;
+		dev_hdr = device__next_dev(dev_hdr);
+	}
+
+	_FDT(fdt_property(fdt, "interrupt-map", irq_map,
+			  sizeof(struct of_interrupt_map_entry) * nentries));
+
+	/* ... and the corresponding mask. */
+	if (nentries) {
+		struct of_pci_irq_mask irq_mask = {
+			.pci_addr = {
+				.hi	= cpu_to_fdt32(of_pci_b_ddddd(-1)),
+				.mid	= 0,
+				.lo	= 0,
+			},
+			.pci_pin	= cpu_to_fdt32(7),
+		};
+
+		_FDT(fdt_property(fdt, "interrupt-map-mask", &irq_mask,
+				  sizeof(irq_mask)));
+	}
+
+	_FDT(fdt_end_node(fdt));
+}
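
As a worked example of one interrupt-map entry: a device in slot 2 using pin INTA (1) that was assigned SPI 66 would be encoded roughly as follows. Values are shown before the cpu_to_fdt32() byte swapping, and of_pci_b_ddddd() is assumed to place the device number at bits 15:11 of the PCI address hi cell:

	struct of_interrupt_map_entry example = {
		.pci_irq_mask = {
			.pci_addr	= { .hi = 2 << 11 },	/* of_pci_b_ddddd(2) */
			.pci_pin	= 1,			/* INTA */
		},
		.gic_phandle	= PHANDLE_GIC,
		.gic_irq = {
			.type	= GIC_FDT_IRQ_TYPE_SPI,
			.num	= 66 - GIC_SPI_IRQ_BASE,	/* FDT cell 34 */
			.flags	= IRQ_TYPE_LEVEL_HIGH,
		},
	};

The matching interrupt-map-mask (device bits all ones, pin mask 7) tells the guest to compare only the slot number and pin when looking up an entry.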
diff --git a/kvmtool/arm/pmu.c b/kvmtool/arm/pmu.c
new file mode 100644
index 0000000..ffd152e
--- /dev/null
+++ b/kvmtool/arm/pmu.c
@@ -0,0 +1,81 @@
+#include "kvm/fdt.h"
+#include "kvm/kvm.h"
+#include "kvm/kvm-cpu.h"
+#include "kvm/util.h"
+
+#include "arm-common/gic.h"
+#include "arm-common/pmu.h"
+
+#ifdef CONFIG_ARM64
+static int set_pmu_attr(struct kvm *kvm, int vcpu_idx,
+			struct kvm_device_attr *attr)
+{
+	int ret, fd;
+
+	fd = kvm->cpus[vcpu_idx]->vcpu_fd;
+
+	ret = ioctl(fd, KVM_HAS_DEVICE_ATTR, attr);
+	if (!ret) {
+		ret = ioctl(fd, KVM_SET_DEVICE_ATTR, attr);
+		if (ret)
+			perror("PMU KVM_SET_DEVICE_ATTR failed");
+	} else {
+		pr_err("Unsupported PMU on vcpu%d\n", vcpu_idx);
+	}
+
+	return ret;
+}
+
+void pmu__generate_fdt_nodes(void *fdt, struct kvm *kvm)
+{
+	const char compatible[] = "arm,armv8-pmuv3";
+	int irq = KVM_ARM_PMUv3_PPI;
+	int i, ret;
+
+	u32 cpu_mask = (((1 << kvm->nrcpus) - 1) << GIC_FDT_IRQ_PPI_CPU_SHIFT) \
+		       & GIC_FDT_IRQ_PPI_CPU_MASK;
+	u32 irq_prop[] = {
+		cpu_to_fdt32(GIC_FDT_IRQ_TYPE_PPI),
+		cpu_to_fdt32(irq - 16),
+		cpu_to_fdt32(cpu_mask | IRQ_TYPE_LEVEL_HIGH),
+	};
+
+	if (!kvm->cfg.arch.has_pmuv3)
+		return;
+
+	if (!kvm__supports_extension(kvm, KVM_CAP_ARM_PMU_V3)) {
+		pr_info("PMU unsupported\n");
+		return;
+	}
+
+	for (i = 0; i < kvm->nrcpus; i++) {
+		struct kvm_device_attr pmu_attr;
+
+		pmu_attr = (struct kvm_device_attr){
+			.group	= KVM_ARM_VCPU_PMU_V3_CTRL,
+			.addr	= (u64)(unsigned long)&irq,
+			.attr	= KVM_ARM_VCPU_PMU_V3_IRQ,
+		};
+
+		ret = set_pmu_attr(kvm, i, &pmu_attr);
+		if (ret)
+			return;
+
+		pmu_attr = (struct kvm_device_attr){
+			.group	= KVM_ARM_VCPU_PMU_V3_CTRL,
+			.attr	= KVM_ARM_VCPU_PMU_V3_INIT,
+		};
+
+		ret = set_pmu_attr(kvm, i, &pmu_attr);
+		if (ret)
+			return;
+	}
+
+	_FDT(fdt_begin_node(fdt, "pmu"));
+	_FDT(fdt_property(fdt, "compatible", compatible, sizeof(compatible)));
+	_FDT(fdt_property(fdt, "interrupts", irq_prop, sizeof(irq_prop)));
+	_FDT(fdt_end_node(fdt));
+}
+#else
+void pmu__generate_fdt_nodes(void *fdt, struct kvm *kvm) { }
+#endif
diff --git a/kvmtool/arm/timer.c b/kvmtool/arm/timer.c
new file mode 100644
index 0000000..71bfe8d
--- /dev/null
+++ b/kvmtool/arm/timer.c
@@ -0,0 +1,41 @@
+#include "kvm/fdt.h"
+#include "kvm/kvm.h"
+#include "kvm/kvm-cpu.h"
+#include "kvm/util.h"
+
+#include "arm-common/gic.h"
+#include "arm-common/timer.h"
+
+void timer__generate_fdt_nodes(void *fdt, struct kvm *kvm, int *irqs)
+{
+	const char compatible[] = "arm,armv8-timer\0arm,armv7-timer";
+
+	u32 cpu_mask = (((1 << kvm->nrcpus) - 1) << GIC_FDT_IRQ_PPI_CPU_SHIFT) \
+		       & GIC_FDT_IRQ_PPI_CPU_MASK;
+	u32 irq_prop[] = {
+		cpu_to_fdt32(GIC_FDT_IRQ_TYPE_PPI),
+		cpu_to_fdt32(irqs[0]),
+		cpu_to_fdt32(cpu_mask | IRQ_TYPE_LEVEL_LOW),
+
+		cpu_to_fdt32(GIC_FDT_IRQ_TYPE_PPI),
+		cpu_to_fdt32(irqs[1]),
+		cpu_to_fdt32(cpu_mask | IRQ_TYPE_LEVEL_LOW),
+
+		cpu_to_fdt32(GIC_FDT_IRQ_TYPE_PPI),
+		cpu_to_fdt32(irqs[2]),
+		cpu_to_fdt32(cpu_mask | IRQ_TYPE_LEVEL_LOW),
+
+		cpu_to_fdt32(GIC_FDT_IRQ_TYPE_PPI),
+		cpu_to_fdt32(irqs[3]),
+		cpu_to_fdt32(cpu_mask | IRQ_TYPE_LEVEL_LOW),
+	};
+
+	_FDT(fdt_begin_node(fdt, "timer"));
+	_FDT(fdt_property(fdt, "compatible", compatible, sizeof(compatible)));
+	_FDT(fdt_property(fdt, "interrupts", irq_prop, sizeof(irq_prop)));
+	_FDT(fdt_property(fdt, "always-on", NULL, 0));
+	if (kvm->cfg.arch.force_cntfrq > 0)
+		_FDT(fdt_property_cell(fdt, "clock-frequency", kvm->cfg.arch.force_cntfrq));
+	_FDT(fdt_end_node(fdt));
+}
+
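
The caller supplies the four PPI cells in the order the ARM architected-timer binding expects. A sketch of a plausible call site, assuming the conventional timer INTIDs (29, 30, 27, 26; the FDT PPI cell is INTID - 16):

	int timer_irqs[] = {
		13,	/* secure physical timer,	INTID 29 */
		14,	/* non-secure physical timer,	INTID 30 */
		11,	/* virtual timer,		INTID 27 */
		10,	/* hypervisor timer,		INTID 26 */
	};

	timer__generate_fdt_nodes(fdt, kvm, timer_irqs);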
diff --git a/kvmtool/builtin-balloon.c b/kvmtool/builtin-balloon.c
new file mode 100644
index 0000000..d158ace
--- /dev/null
+++ b/kvmtool/builtin-balloon.c
@@ -0,0 +1,80 @@
+#include <stdio.h>
+#include <string.h>
+#include <signal.h>
+
+#include <kvm/util.h>
+#include <kvm/kvm-cmd.h>
+#include <kvm/builtin-balloon.h>
+#include <kvm/parse-options.h>
+#include <kvm/kvm.h>
+#include <kvm/kvm-ipc.h>
+
+static const char *instance_name;
+static u64 inflate;
+static u64 deflate;
+
+static const char * const balloon_usage[] = {
+	"lkvm balloon [-n name] [-p pid] [-i amount] [-d amount]",
+	NULL
+};
+
+static const struct option balloon_options[] = {
+	OPT_GROUP("Instance options:"),
+	OPT_STRING('n', "name", &instance_name, "name", "Instance name"),
+	OPT_GROUP("Balloon options:"),
+	OPT_U64('i', "inflate", &inflate, "Amount to inflate (in MB)"),
+	OPT_U64('d', "deflate", &deflate, "Amount to deflate (in MB)"),
+	OPT_END(),
+};
+
+void kvm_balloon_help(void)
+{
+	usage_with_options(balloon_usage, balloon_options);
+}
+
+static void parse_balloon_options(int argc, const char **argv)
+{
+	while (argc != 0) {
+		argc = parse_options(argc, argv, balloon_options, balloon_usage,
+				PARSE_OPT_STOP_AT_NON_OPTION);
+		if (argc != 0)
+			kvm_balloon_help();
+	}
+}
+
+int kvm_cmd_balloon(int argc, const char **argv, const char *prefix)
+{
+	int instance;
+	int r;
+	int amount;
+
+	parse_balloon_options(argc, argv);
+
+	if (inflate == 0 && deflate == 0)
+		kvm_balloon_help();
+
+	if (instance_name == NULL)
+		kvm_balloon_help();
+
+	instance = kvm__get_sock_by_instance(instance_name);
+
+	if (instance <= 0)
+		die("Failed locating instance");
+
+	if (inflate)
+		amount = inflate;
+	else if (deflate)
+		amount = -deflate;
+	else
+		kvm_balloon_help();
+
+	r = kvm_ipc__send_msg(instance, KVM_IPC_BALLOON,
+			sizeof(amount), (u8 *)&amount);
+
+	close(instance);
+
+	if (r < 0)
+		return -1;
+
+	return 0;
+}
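
In practice the command is driven entirely over the instance's IPC socket: for example, lkvm balloon -n guest-4242 -i 256 asks the instance named guest-4242 to inflate its balloon by 256 MB (shrinking the memory available to the guest), while -d 256 deflates it again. The amount is sent as a single signed integer, negative for deflation, matching the sign convention in kvm_cmd_balloon above.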
diff --git a/kvmtool/builtin-debug.c b/kvmtool/builtin-debug.c
new file mode 100644
index 0000000..4ae51d2
--- /dev/null
+++ b/kvmtool/builtin-debug.c
@@ -0,0 +1,110 @@
+#include <kvm/util.h>
+#include <kvm/kvm-cmd.h>
+#include <kvm/builtin-debug.h>
+#include <kvm/kvm.h>
+#include <kvm/parse-options.h>
+#include <kvm/kvm-ipc.h>
+#include <kvm/read-write.h>
+
+#include <stdio.h>
+#include <string.h>
+#include <signal.h>
+
+#define BUFFER_SIZE 100
+
+static bool all;
+static int nmi = -1;
+static bool dump;
+static const char *instance_name;
+static const char *sysrq;
+
+static const char * const debug_usage[] = {
+	"lkvm debug [--all] [-n name] [-d] [-m vcpu]",
+	NULL
+};
+
+static const struct option debug_options[] = {
+	OPT_GROUP("General options:"),
+	OPT_BOOLEAN('d', "dump", &dump, "Generate a debug dump from guest"),
+	OPT_INTEGER('m', "nmi", &nmi, "Generate NMI on VCPU"),
+	OPT_STRING('s', "sysrq", &sysrq, "sysrq", "Inject a sysrq"),
+	OPT_GROUP("Instance options:"),
+	OPT_BOOLEAN('a', "all", &all, "Debug all instances"),
+	OPT_STRING('n', "name", &instance_name, "name", "Instance name"),
+	OPT_END()
+};
+
+static void parse_debug_options(int argc, const char **argv)
+{
+	while (argc != 0) {
+		argc = parse_options(argc, argv, debug_options, debug_usage,
+				PARSE_OPT_STOP_AT_NON_OPTION);
+		if (argc != 0)
+			kvm_debug_help();
+	}
+}
+
+void kvm_debug_help(void)
+{
+	usage_with_options(debug_usage, debug_options);
+}
+
+static int do_debug(const char *name, int sock)
+{
+	char buff[BUFFER_SIZE];
+	struct debug_cmd_params cmd = {.dbg_type = 0};
+	int r;
+
+	if (dump)
+		cmd.dbg_type |= KVM_DEBUG_CMD_TYPE_DUMP;
+
+	if (nmi != -1) {
+		cmd.dbg_type |= KVM_DEBUG_CMD_TYPE_NMI;
+		cmd.cpu = nmi;
+	}
+
+	if (sysrq) {
+		cmd.dbg_type |= KVM_DEBUG_CMD_TYPE_SYSRQ;
+		cmd.sysrq = sysrq[0];
+	}
+
+	r = kvm_ipc__send_msg(sock, KVM_IPC_DEBUG, sizeof(cmd), (u8 *)&cmd);
+	if (r < 0)
+		return r;
+
+	if (!dump)
+		return 0;
+
+	do {
+		r = xread(sock, buff, BUFFER_SIZE);
+		if (r < 0)
+			return 0;
+		printf("%.*s", r, buff);
+	} while (r > 0);
+
+	return 0;
+}
+
+int kvm_cmd_debug(int argc, const char **argv, const char *prefix)
+{
+	int instance;
+	int r;
+
+	parse_debug_options(argc, argv);
+
+	if (all)
+		return kvm__enumerate_instances(do_debug);
+
+	if (instance_name == NULL)
+		kvm_debug_help();
+
+	instance = kvm__get_sock_by_instance(instance_name);
+
+	if (instance <= 0)
+		die("Failed locating instance");
+
+	r = do_debug(instance_name, instance);
+
+	close(instance);
+
+	return r;
+}
diff --git a/kvmtool/builtin-help.c b/kvmtool/builtin-help.c
new file mode 100644
index 0000000..5970fb7
--- /dev/null
+++ b/kvmtool/builtin-help.c
@@ -0,0 +1,63 @@
+#include <stdio.h>
+#include <string.h>
+
+/* user defined headers */
+#include <common-cmds.h>
+
+#include <kvm/util.h>
+#include <kvm/kvm-cmd.h>
+#include <kvm/builtin-help.h>
+#include <kvm/kvm.h>
+
+
+const char kvm_usage_string[] =
+	"lkvm COMMAND [ARGS]";
+
+const char kvm_more_info_string[] =
+	"See 'lkvm help COMMAND' for more information on a specific command.";
+
+
+static void list_common_cmds_help(void)
+{
+	unsigned int i, longest = 0;
+
+	for (i = 0; i < ARRAY_SIZE(common_cmds); i++) {
+		if (longest < strlen(common_cmds[i].name))
+			longest = strlen(common_cmds[i].name);
+	}
+
+	puts(" The most commonly used lkvm commands are:");
+	for (i = 0; i < ARRAY_SIZE(common_cmds); i++) {
+		printf("   %-*s   ", longest, common_cmds[i].name);
+		puts(common_cmds[i].help);
+	}
+}
+
+static void kvm_help(void)
+{
+	printf("\n To start a simple non-privileged shell run '%s run'\n\n"
+		"usage: %s\n\n", KVM_BINARY_NAME, kvm_usage_string);
+	list_common_cmds_help();
+	printf("\n %s\n\n", kvm_more_info_string);
+}
+
+
+static void help_cmd(const char *cmd)
+{
+	struct cmd_struct *p;
+	p = kvm_get_command(kvm_commands, cmd);
+	if (!p)
+		kvm_help();
+	else if (p->help)
+		p->help();
+}
+
+int kvm_cmd_help(int argc, const char **argv, const char *prefix)
+{
+	if (!argv || !*argv) {
+		kvm_help();
+		return 0;
+	}
+	help_cmd(argv[0]);
+	return 0;
+}
diff --git a/kvmtool/builtin-list.c b/kvmtool/builtin-list.c
new file mode 100644
index 0000000..c35be93
--- /dev/null
+++ b/kvmtool/builtin-list.c
@@ -0,0 +1,155 @@
+#include <kvm/util.h>
+#include <kvm/kvm-cmd.h>
+#include <kvm/builtin-list.h>
+#include <kvm/kvm.h>
+#include <kvm/parse-options.h>
+#include <kvm/kvm-ipc.h>
+
+#include <dirent.h>
+#include <stdio.h>
+#include <string.h>
+#include <signal.h>
+#include <fcntl.h>
+
+static bool run;
+static bool rootfs;
+
+static const char * const list_usage[] = {
+	"lkvm list",
+	NULL
+};
+
+static const struct option list_options[] = {
+	OPT_GROUP("General options:"),
+	OPT_BOOLEAN('i', "run", &run, "List running instances"),
+	OPT_BOOLEAN('r', "rootfs", &rootfs, "List rootfs instances"),
+	OPT_END()
+};
+
+#define KVM_INSTANCE_RUNNING	"running"
+#define KVM_INSTANCE_PAUSED	"paused"
+#define KVM_INSTANCE_SHUTOFF	"shut off"
+
+void kvm_list_help(void)
+{
+	usage_with_options(list_usage, list_options);
+}
+
+static pid_t get_pid(int sock)
+{
+	pid_t pid;
+	int r;
+
+	r = kvm_ipc__send(sock, KVM_IPC_PID);
+	if (r < 0)
+		return r;
+
+	r = read(sock, &pid, sizeof(pid));
+	if (r < 0)
+		return r;
+
+	return pid;
+}
+
+int get_vmstate(int sock)
+{
+	int vmstate;
+	int r;
+
+	r = kvm_ipc__send(sock, KVM_IPC_VMSTATE);
+	if (r < 0)
+		return r;
+
+	r = read(sock, &vmstate, sizeof(vmstate));
+	if (r < 0)
+		return r;
+
+	return vmstate;
+}
+
+static int print_guest(const char *name, int sock)
+{
+	pid_t pid;
+	int vmstate;
+
+	pid = get_pid(sock);
+	vmstate = get_vmstate(sock);
+
+	if ((int)pid < 0 || vmstate < 0)
+		return -1;
+
+	if (vmstate == KVM_VMSTATE_PAUSED)
+		printf("%5d %-20s %s\n", pid, name, KVM_INSTANCE_PAUSED);
+	else
+		printf("%5d %-20s %s\n", pid, name, KVM_INSTANCE_RUNNING);
+
+	return 0;
+}
+
+static int kvm_list_running_instances(void)
+{
+	return kvm__enumerate_instances(print_guest);
+}
+
+static int kvm_list_rootfs(void)
+{
+	DIR *dir;
+	struct dirent *dirent;
+
+	dir = opendir(kvm__get_dir());
+	if (dir == NULL)
+		return -1;
+
+	while ((dirent = readdir(dir))) {
+		if (dirent->d_type == DT_DIR &&
+			strcmp(dirent->d_name, ".") &&
+			strcmp(dirent->d_name, ".."))
+			printf("%5s %-20s %s\n", "", dirent->d_name, KVM_INSTANCE_SHUTOFF);
+	}
+
+	return 0;
+}
+
+static void parse_setup_options(int argc, const char **argv)
+{
+	while (argc != 0) {
+		argc = parse_options(argc, argv, list_options, list_usage,
+				PARSE_OPT_STOP_AT_NON_OPTION);
+		if (argc != 0)
+			kvm_list_help();
+	}
+}
+
+int kvm_cmd_list(int argc, const char **argv, const char *prefix)
+{
+	int status, r;
+
+	parse_setup_options(argc, argv);
+
+	if (!run && !rootfs)
+		run = rootfs = true;
+
+	printf("%6s %-20s %s\n", "PID", "NAME", "STATE");
+	printf("------------------------------------\n");
+
+	status = 0;
+
+	if (run) {
+		r = kvm_list_running_instances();
+		if (r < 0)
+			perror("Error listing instances");
+
+		status |= r;
+	}
+
+	if (rootfs) {
+		r = kvm_list_rootfs();
+		if (r < 0)
+			perror("Error listing rootfs");
+
+		status |= r;
+	}
+
+	return status;
+}
diff --git a/kvmtool/builtin-pause.c b/kvmtool/builtin-pause.c
new file mode 100644
index 0000000..c08595a
--- /dev/null
+++ b/kvmtool/builtin-pause.c
@@ -0,0 +1,88 @@
+#include <kvm/util.h>
+#include <kvm/kvm-cmd.h>
+#include <kvm/builtin-pause.h>
+#include <kvm/builtin-list.h>
+#include <kvm/kvm.h>
+#include <kvm/parse-options.h>
+#include <kvm/kvm-ipc.h>
+
+#include <stdio.h>
+#include <string.h>
+#include <signal.h>
+
+static bool all;
+static const char *instance_name;
+
+static const char * const pause_usage[] = {
+	"lkvm pause [--all] [-n name]",
+	NULL
+};
+
+static const struct option pause_options[] = {
+	OPT_GROUP("General options:"),
+	OPT_BOOLEAN('a', "all", &all, "Pause all instances"),
+	OPT_STRING('n', "name", &instance_name, "name", "Instance name"),
+	OPT_END()
+};
+
+static void parse_pause_options(int argc, const char **argv)
+{
+	while (argc != 0) {
+		argc = parse_options(argc, argv, pause_options, pause_usage,
+				PARSE_OPT_STOP_AT_NON_OPTION);
+		if (argc != 0)
+			kvm_pause_help();
+	}
+}
+
+void kvm_pause_help(void)
+{
+	usage_with_options(pause_usage, pause_options);
+}
+
+static int do_pause(const char *name, int sock)
+{
+	int r;
+	int vmstate;
+
+	vmstate = get_vmstate(sock);
+	if (vmstate < 0)
+		return vmstate;
+	if (vmstate == KVM_VMSTATE_PAUSED) {
+		printf("Guest %s is already paused.\n", name);
+		return 0;
+	}
+
+	r = kvm_ipc__send(sock, KVM_IPC_PAUSE);
+	if (r)
+		return r;
+
+	printf("Guest %s paused\n", name);
+
+	return 0;
+}
+
+int kvm_cmd_pause(int argc, const char **argv, const char *prefix)
+{
+	int instance;
+	int r;
+
+	parse_pause_options(argc, argv);
+
+	if (all)
+		return kvm__enumerate_instances(do_pause);
+
+	if (instance_name == NULL)
+		kvm_pause_help();
+
+	instance = kvm__get_sock_by_instance(instance_name);
+
+	if (instance <= 0)
+		die("Failed locating instance");
+
+	r = do_pause(instance_name, instance);
+
+	close(instance);
+
+	return r;
+}
diff --git a/kvmtool/builtin-resume.c b/kvmtool/builtin-resume.c
new file mode 100644
index 0000000..0e954b4
--- /dev/null
+++ b/kvmtool/builtin-resume.c
@@ -0,0 +1,88 @@
+#include <kvm/util.h>
+#include <kvm/kvm-cmd.h>
+#include <kvm/builtin-resume.h>
+#include <kvm/builtin-list.h>
+#include <kvm/kvm.h>
+#include <kvm/parse-options.h>
+#include <kvm/kvm-ipc.h>
+
+#include <stdio.h>
+#include <string.h>
+#include <signal.h>
+
+static bool all;
+static const char *instance_name;
+
+static const char * const resume_usage[] = {
+	"lkvm resume [--all] [-n name]",
+	NULL
+};
+
+static const struct option resume_options[] = {
+	OPT_GROUP("General options:"),
+	OPT_BOOLEAN('a', "all", &all, "Resume all instances"),
+	OPT_STRING('n', "name", &instance_name, "name", "Instance name"),
+	OPT_END()
+};
+
+static void parse_resume_options(int argc, const char **argv)
+{
+	while (argc != 0) {
+		argc = parse_options(argc, argv, resume_options, resume_usage,
+				PARSE_OPT_STOP_AT_NON_OPTION);
+		if (argc != 0)
+			kvm_resume_help();
+	}
+}
+
+void kvm_resume_help(void)
+{
+	usage_with_options(resume_usage, resume_options);
+}
+
+static int do_resume(const char *name, int sock)
+{
+	int r;
+	int vmstate;
+
+	vmstate = get_vmstate(sock);
+	if (vmstate < 0)
+		return vmstate;
+	if (vmstate == KVM_VMSTATE_RUNNING) {
+		printf("Guest %s is still running.\n", name);
+		return 0;
+	}
+
+	r = kvm_ipc__send(sock, KVM_IPC_RESUME);
+	if (r)
+		return r;
+
+	printf("Guest %s resumed\n", name);
+
+	return 0;
+}
+
+int kvm_cmd_resume(int argc, const char **argv, const char *prefix)
+{
+	int instance;
+	int r;
+
+	parse_resume_options(argc, argv);
+
+	if (all)
+		return kvm__enumerate_instances(do_resume);
+
+	if (instance_name == NULL)
+		kvm_resume_help();
+
+	instance = kvm__get_sock_by_instance(instance_name);
+
+	if (instance <= 0)
+		die("Failed locating instance");
+
+	r = do_resume(instance_name, instance);
+
+	close(instance);
+
+	return r;
+}
diff --git a/kvmtool/builtin-run.c b/kvmtool/builtin-run.c
new file mode 100644
index 0000000..c23e7a2
--- /dev/null
+++ b/kvmtool/builtin-run.c
@@ -0,0 +1,704 @@
+#include "kvm/builtin-run.h"
+
+#include "kvm/builtin-setup.h"
+#include "kvm/virtio-balloon.h"
+#include "kvm/virtio-console.h"
+#include "kvm/parse-options.h"
+#include "kvm/8250-serial.h"
+#include "kvm/framebuffer.h"
+#include "kvm/disk-image.h"
+#include "kvm/threadpool.h"
+#include "kvm/virtio-scsi.h"
+#include "kvm/virtio-blk.h"
+#include "kvm/virtio-net.h"
+#include "kvm/virtio-rng.h"
+#include "kvm/ioeventfd.h"
+#include "kvm/virtio-9p.h"
+#include "kvm/barrier.h"
+#include "kvm/kvm-cpu.h"
+#include "kvm/ioport.h"
+#include "kvm/symbol.h"
+#include "kvm/i8042.h"
+#include "kvm/mutex.h"
+#include "kvm/term.h"
+#include "kvm/util.h"
+#include "kvm/strbuf.h"
+#include "kvm/vesa.h"
+#include "kvm/irq.h"
+#include "kvm/kvm.h"
+#include "kvm/pci.h"
+#include "kvm/rtc.h"
+#include "kvm/sdl.h"
+#include "kvm/vnc.h"
+#include "kvm/guest_compat.h"
+#include "kvm/kvm-ipc.h"
+#include "kvm/builtin-debug.h"
+
+#include <linux/types.h>
+#include <linux/err.h>
+
+#include <sys/utsname.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <termios.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <ctype.h>
+#include <stdio.h>
+
+#define MB_SHIFT		(20)
+#define KB_SHIFT		(10)
+#define GB_SHIFT		(30)
+
+__thread struct kvm_cpu *current_kvm_cpu;
+
+static int  kvm_run_wrapper;
+
+bool do_debug_print = false;
+
+static const char * const run_usage[] = {
+	"lkvm run [<options>] [<kernel image>]",
+	NULL
+};
+
+enum {
+	KVM_RUN_DEFAULT,
+	KVM_RUN_SANDBOX,
+};
+
+static int img_name_parser(const struct option *opt, const char *arg, int unset)
+{
+	char path[PATH_MAX];
+	struct stat st;
+
+	snprintf(path, PATH_MAX, "%s%s", kvm__get_dir(), arg);
+
+	if ((stat(arg, &st) == 0 && S_ISDIR(st.st_mode)) ||
+	   (stat(path, &st) == 0 && S_ISDIR(st.st_mode)))
+		return virtio_9p_img_name_parser(opt, arg, unset);
+	return disk_img_name_parser(opt, arg, unset);
+}
+
+void kvm_run_set_wrapper_sandbox(void)
+{
+	kvm_run_wrapper = KVM_RUN_SANDBOX;
+}
+
+#ifndef OPT_ARCH_RUN
+#define OPT_ARCH_RUN(...)
+#endif
+
+#define BUILD_OPTIONS(name, cfg, kvm)					\
+	struct option name[] = {					\
+	OPT_GROUP("Basic options:"),					\
+	OPT_STRING('\0', "name", &(cfg)->guest_name, "guest name",	\
+			"A name for the guest"),			\
+	OPT_INTEGER('c', "cpus", &(cfg)->nrcpus, "Number of CPUs"),	\
+	OPT_U64('m', "mem", &(cfg)->ram_size, "Virtual machine memory"	\
+		" size in MiB."),					\
+	OPT_CALLBACK('d', "disk", kvm, "image or rootfs_dir", "Disk "	\
+			" image or rootfs directory", img_name_parser,	\
+			kvm),						\
+	OPT_BOOLEAN('\0', "balloon", &(cfg)->balloon, "Enable virtio"	\
+			" balloon"),					\
+	OPT_BOOLEAN('\0', "vnc", &(cfg)->vnc, "Enable VNC framebuffer"),\
+	OPT_BOOLEAN('\0', "gtk", &(cfg)->gtk, "Enable GTK framebuffer"),\
+	OPT_BOOLEAN('\0', "sdl", &(cfg)->sdl, "Enable SDL framebuffer"),\
+	OPT_BOOLEAN('\0', "rng", &(cfg)->virtio_rng, "Enable virtio"	\
+			" Random Number Generator"),			\
+	OPT_CALLBACK('\0', "9p", NULL, "dir_to_share,tag_name",		\
+		     "Enable virtio 9p to share files between host and"	\
+		     " guest", virtio_9p_rootdir_parser, kvm),		\
+	OPT_STRING('\0', "console", &(cfg)->console, "serial, virtio or"\
+			" hv", "Console to use"),			\
+	OPT_STRING('\0', "dev", &(cfg)->dev, "device_file",		\
+			"KVM device file"),				\
+	OPT_CALLBACK('\0', "tty", NULL, "tty id",			\
+		     "Remap guest TTY into a pty on the host",		\
+		     tty_parser, NULL),					\
+	OPT_STRING('\0', "sandbox", &(cfg)->sandbox, "script",		\
+			"Run this script when booting into custom"	\
+			" rootfs"),					\
+	OPT_STRING('\0', "hugetlbfs", &(cfg)->hugetlbfs_path, "path",	\
+			"Hugetlbfs path"),				\
+									\
+	OPT_GROUP("Kernel options:"),					\
+	OPT_STRING('k', "kernel", &(cfg)->kernel_filename, "kernel",	\
+			"Kernel to boot in virtual machine"),		\
+	OPT_STRING('i', "initrd", &(cfg)->initrd_filename, "initrd",	\
+			"Initial RAM disk image"),			\
+	OPT_STRING('p', "params", &(cfg)->kernel_cmdline, "params",	\
+			"Kernel command line arguments"),		\
+	OPT_STRING('f', "firmware", &(cfg)->firmware_filename, "firmware",\
+			"Firmware image to boot in virtual machine"),	\
+	OPT_STRING('F', "flash", &(cfg)->flash_filename, "flash",\
+			"Flash image to present to virtual machine"),	\
+									\
+	OPT_GROUP("Networking options:"),				\
+	OPT_CALLBACK_DEFAULT('n', "network", NULL, "network params",	\
+		     "Create a new guest NIC",				\
+		     netdev_parser, NULL, kvm),				\
+	OPT_BOOLEAN('\0', "no-dhcp", &(cfg)->no_dhcp, "Disable kernel"	\
+			" DHCP in rootfs mode"),			\
+									\
+	OPT_GROUP("VFIO options:"),					\
+	OPT_CALLBACK('\0', "vfio-pci", NULL, "[domain:]bus:dev.fn",	\
+		     "Assign a PCI device to the virtual machine",	\
+		     vfio_device_parser, kvm),				\
+									\
+	OPT_GROUP("Debug options:"),					\
+	OPT_BOOLEAN('\0', "debug", &do_debug_print,			\
+			"Enable debug messages"),			\
+	OPT_BOOLEAN('\0', "debug-single-step", &(cfg)->single_step,	\
+			"Enable single stepping"),			\
+	OPT_BOOLEAN('\0', "debug-ioport", &(cfg)->ioport_debug,		\
+			"Enable ioport debugging"),			\
+	OPT_BOOLEAN('\0', "debug-mmio", &(cfg)->mmio_debug,		\
+			"Enable MMIO debugging"),			\
+	OPT_INTEGER('\0', "debug-iodelay", &(cfg)->debug_iodelay,	\
+			"Delay IO by millisecond"),			\
+									\
+	OPT_ARCH(RUN, cfg)						\
+	OPT_END()							\
+	};
+
+static void *kvm_cpu_thread(void *arg)
+{
+	char name[16];
+
+	current_kvm_cpu = arg;
+
+	sprintf(name, "kvm-vcpu-%lu", current_kvm_cpu->cpu_id);
+	kvm__set_thread_name(name);
+
+	if (kvm_cpu__start(current_kvm_cpu))
+		goto panic_kvm;
+
+	return (void *) (intptr_t) 0;
+
+panic_kvm:
+	fprintf(stderr, "KVM exit reason: %u (\"%s\")\n",
+		current_kvm_cpu->kvm_run->exit_reason,
+		kvm_exit_reasons[current_kvm_cpu->kvm_run->exit_reason]);
+	if (current_kvm_cpu->kvm_run->exit_reason == KVM_EXIT_UNKNOWN)
+		fprintf(stderr, "KVM exit code: 0x%llu\n",
+			(unsigned long long)current_kvm_cpu->kvm_run->hw.hardware_exit_reason);
+
+	kvm_cpu__set_debug_fd(STDOUT_FILENO);
+	kvm_cpu__show_registers(current_kvm_cpu);
+	kvm_cpu__show_code(current_kvm_cpu);
+	kvm_cpu__show_page_tables(current_kvm_cpu);
+
+	return (void *) (intptr_t) 1;
+}
+
+static char kernel[PATH_MAX];
+
+static const char *host_kernels[] = {
+	"/boot/vmlinuz",
+	"/boot/bzImage",
+	NULL
+};
+
+static const char *default_kernels[] = {
+	"./bzImage",
+	"arch/" BUILD_ARCH "/boot/bzImage",
+	"../../arch/" BUILD_ARCH "/boot/bzImage",
+	NULL
+};
+
+static const char *default_vmlinux[] = {
+	"vmlinux",
+	"../../../vmlinux",
+	"../../vmlinux",
+	NULL
+};
+
+static void kernel_usage_with_options(void)
+{
+	const char **k;
+	struct utsname uts;
+
+	fprintf(stderr, "Fatal: could not find default kernel image in:\n");
+	k = &default_kernels[0];
+	while (*k) {
+		fprintf(stderr, "\t%s\n", *k);
+		k++;
+	}
+
+	if (uname(&uts) < 0)
+		return;
+
+	k = &host_kernels[0];
+	while (*k) {
+		if (snprintf(kernel, PATH_MAX, "%s-%s", *k, uts.release) < 0)
+			return;
+		fprintf(stderr, "\t%s\n", kernel);
+		k++;
+	}
+	fprintf(stderr, "\nPlease see '%s run --help' for more options.\n\n",
+		KVM_BINARY_NAME);
+}
+
+static u64 host_ram_size(void)
+{
+	long page_size;
+	long nr_pages;
+
+	nr_pages	= sysconf(_SC_PHYS_PAGES);
+	if (nr_pages < 0) {
+		pr_warning("sysconf(_SC_PHYS_PAGES) failed");
+		return 0;
+	}
+
+	page_size	= sysconf(_SC_PAGE_SIZE);
+	if (page_size < 0) {
+		pr_warning("sysconf(_SC_PAGE_SIZE) failed");
+		return 0;
+	}
+
+	return (nr_pages * page_size) >> MB_SHIFT;
+}
+
+/*
+ * If the user didn't specify how much memory to allocate for the guest,
+ * avoid filling the whole of the host RAM.
+ */
+#define RAM_SIZE_RATIO		0.8
+
+static u64 get_ram_size(int nr_cpus)
+{
+	u64 available;
+	u64 ram_size;
+
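+	/*
+	 * Default heuristic: 64MB per vCPU plus 192MB of slack, so a
+	 * 4-vCPU guest gets 64 * (4 + 3) = 448MB, capped below at 80%
+	 * of host RAM (RAM_SIZE_RATIO).
+	 */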
+	ram_size	= 64 * (nr_cpus + 3);
+
+	available	= host_ram_size() * RAM_SIZE_RATIO;
+	if (!available)
+		available = MIN_RAM_SIZE_MB;
+
+	if (ram_size > available)
+		ram_size	= available;
+
+	return ram_size;
+}
+
+static const char *find_kernel(void)
+{
+	const char **k;
+	struct stat st;
+	struct utsname uts;
+
+	k = &default_kernels[0];
+	while (*k) {
+		if (stat(*k, &st) < 0 || !S_ISREG(st.st_mode)) {
+			k++;
+			continue;
+		}
+		strlcpy(kernel, *k, PATH_MAX);
+		return kernel;
+	}
+
+	if (uname(&uts) < 0)
+		return NULL;
+
+	k = &host_kernels[0];
+	while (*k) {
+		if (snprintf(kernel, PATH_MAX, "%s-%s", *k, uts.release) < 0)
+			return NULL;
+
+		if (stat(kernel, &st) < 0 || !S_ISREG(st.st_mode)) {
+			k++;
+			continue;
+		}
+		return kernel;
+
+	}
+	return NULL;
+}
+
+static const char *find_vmlinux(void)
+{
+	const char **vmlinux;
+
+	vmlinux = &default_vmlinux[0];
+	while (*vmlinux) {
+		struct stat st;
+
+		if (stat(*vmlinux, &st) < 0 || !S_ISREG(st.st_mode)) {
+			vmlinux++;
+			continue;
+		}
+		return *vmlinux;
+	}
+	return NULL;
+}
+
+void kvm_run_help(void)
+{
+	struct kvm *kvm = NULL;
+
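+	/*
+	 * BUILD_OPTIONS only takes the addresses of the config fields;
+	 * nothing dereferences kvm here, so a NULL pointer is safe when
+	 * all we do is print the usage text.
+	 */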
+	BUILD_OPTIONS(options, &kvm->cfg, kvm);
+	usage_with_options(run_usage, options);
+}
+
+static int kvm_run_set_sandbox(struct kvm *kvm)
+{
+	const char *guestfs_name = kvm->cfg.custom_rootfs_name;
+	char path[PATH_MAX], script[PATH_MAX], *tmp;
+
+	snprintf(path, PATH_MAX, "%s%s/virt/sandbox.sh", kvm__get_dir(), guestfs_name);
+
+	remove(path);
+
+	if (kvm->cfg.sandbox == NULL)
+		return 0;
+
+	tmp = realpath(kvm->cfg.sandbox, NULL);
+	if (tmp == NULL)
+		return -ENOMEM;
+
+	snprintf(script, PATH_MAX, "/host/%s", tmp);
+	free(tmp);
+
+	return symlink(script, path);
+}
+
+static void kvm_write_sandbox_cmd_exactly(int fd, const char *arg)
+{
+	const char *single_quote;
+
+	if (!*arg) { /* zero length string */
+		if (write(fd, "''", 2) <= 0)
+			die("Failed writing sandbox script");
+		return;
+	}
+
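+	/*
+	 * Emit the argument as single-quoted chunks, wrapping each literal
+	 * single quote in double quotes; e.g. don't becomes 'don'"'"'t'.
+	 */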
+	while (*arg) {
+		single_quote = strchrnul(arg, '\'');
+
+		/* write non-single-quote string as #('string') */
+		if (arg != single_quote) {
+			if (write(fd, "'", 1) <= 0 ||
+			    write(fd, arg, single_quote - arg) <= 0 ||
+			    write(fd, "'", 1) <= 0)
+				die("Failed writing sandbox script");
+		}
+
+		/* write single quote as #("'") */
+		if (*single_quote) {
+			if (write(fd, "\"'\"", 3) <= 0)
+				die("Failed writing sandbox script");
+		} else
+			break;
+
+		arg = single_quote + 1;
+	}
+}
+
+static void resolve_program(const char *src, char *dst, size_t len)
+{
+	struct stat st;
+	int err;
+
+	err = stat(src, &st);
+
+	if (!err && S_ISREG(st.st_mode)) {
+		char resolved_path[PATH_MAX];
+
+		if (!realpath(src, resolved_path))
+			die("Unable to resolve program %s: %s\n", src, strerror(errno));
+
+		if (snprintf(dst, len, "/host%s", resolved_path) >= (int)len)
+			die("Pathname too long: %s -> %s\n", src, resolved_path);
+
+	} else
+		strlcpy(dst, src, len);
+}
+
+static void kvm_run_write_sandbox_cmd(struct kvm *kvm, const char **argv, int argc)
+{
+	const char script_hdr[] = "#! /bin/bash\n\n";
+	char program[PATH_MAX];
+	int fd;
+
+	remove(kvm->cfg.sandbox);
+
+	fd = open(kvm->cfg.sandbox, O_RDWR | O_CREAT, 0777);
+	if (fd < 0)
+		die("Failed creating sandbox script");
+
+	if (write(fd, script_hdr, sizeof(script_hdr) - 1) <= 0)
+		die("Failed writing sandbox script");
+
+	resolve_program(argv[0], program, PATH_MAX);
+	kvm_write_sandbox_cmd_exactly(fd, program);
+
+	argv++;
+	argc--;
+
+	while (argc) {
+		if (write(fd, " ", 1) <= 0)
+			die("Failed writing sandbox script");
+
+		kvm_write_sandbox_cmd_exactly(fd, argv[0]);
+		argv++;
+		argc--;
+	}
+	if (write(fd, "\n", 1) <= 0)
+		die("Failed writing sandbox script");
+
+	close(fd);
+}
+
+static struct kvm *kvm_cmd_run_init(int argc, const char **argv)
+{
+	static char real_cmdline[2048], default_name[20];
+	unsigned int nr_online_cpus;
+	struct kvm *kvm = kvm__new();
+	bool video;
+
+	if (IS_ERR(kvm))
+		return kvm;
+
+	nr_online_cpus = sysconf(_SC_NPROCESSORS_ONLN);
+	kvm->cfg.custom_rootfs_name = "default";
+
+	while (argc != 0) {
+		BUILD_OPTIONS(options, &kvm->cfg, kvm);
+		argc = parse_options(argc, argv, options, run_usage,
+				PARSE_OPT_STOP_AT_NON_OPTION |
+				PARSE_OPT_KEEP_DASHDASH);
+		if (argc != 0) {
+			/* Custom options should have been handled elsewhere */
+			if (strcmp(argv[0], "--") == 0) {
+				if (kvm_run_wrapper == KVM_RUN_SANDBOX) {
+					kvm->cfg.sandbox = DEFAULT_SANDBOX_FILENAME;
+					kvm_run_write_sandbox_cmd(kvm, argv+1, argc-1);
+					break;
+				}
+			}
+
+			if ((kvm_run_wrapper == KVM_RUN_DEFAULT && kvm->cfg.kernel_filename) ||
+				(kvm_run_wrapper == KVM_RUN_SANDBOX && kvm->cfg.sandbox)) {
+				fprintf(stderr, "Cannot handle parameter: "
+						"%s\n", argv[0]);
+				usage_with_options(run_usage, options);
+				free(kvm);
+				return ERR_PTR(-EINVAL);
+			}
+			if (kvm_run_wrapper == KVM_RUN_SANDBOX) {
+				/*
+				 * first unhandled parameter is treated as
+				 * sandbox command
+				 */
+				kvm->cfg.sandbox = DEFAULT_SANDBOX_FILENAME;
+				kvm_run_write_sandbox_cmd(kvm, argv, argc);
+			} else {
+				/*
+				 * first unhandled parameter is treated as a kernel
+				 * image
+				 */
+				kvm->cfg.kernel_filename = argv[0];
+			}
+			argv++;
+			argc--;
+		}
+	}
+
+	kvm->nr_disks = kvm->cfg.image_count;
+
+	if (!kvm->cfg.kernel_filename && !kvm->cfg.firmware_filename) {
+		kvm->cfg.kernel_filename = find_kernel();
+
+		if (!kvm->cfg.kernel_filename) {
+			kernel_usage_with_options();
+			return ERR_PTR(-EINVAL);
+		}
+	}
+
+	kvm->cfg.vmlinux_filename = find_vmlinux();
+	kvm->vmlinux = kvm->cfg.vmlinux_filename;
+
+	if (kvm->cfg.nrcpus == 0)
+		kvm->cfg.nrcpus = nr_online_cpus;
+
+	if (!kvm->cfg.ram_size)
+		kvm->cfg.ram_size = get_ram_size(kvm->cfg.nrcpus);
+
+	if (kvm->cfg.ram_size > host_ram_size())
+		pr_warning("Guest memory size %lluMB exceeds host physical RAM size %lluMB",
+			(unsigned long long)kvm->cfg.ram_size,
+			(unsigned long long)host_ram_size());
+
+	kvm->cfg.ram_size <<= MB_SHIFT;
+
+	if (!kvm->cfg.dev)
+		kvm->cfg.dev = DEFAULT_KVM_DEV;
+
+	if (!kvm->cfg.console)
+		kvm->cfg.console = DEFAULT_CONSOLE;
+
+	video = kvm->cfg.vnc || kvm->cfg.sdl || kvm->cfg.gtk;
+	if (video) {
+		if ((kvm->cfg.vnc && (kvm->cfg.sdl || kvm->cfg.gtk)) ||
+		    (kvm->cfg.sdl && kvm->cfg.gtk))
+			die("Only one of --vnc, --sdl or --gtk can be specified");
+	}
+
+	if (!strncmp(kvm->cfg.console, "virtio", 6))
+		kvm->cfg.active_console  = CONSOLE_VIRTIO;
+	else if (!strncmp(kvm->cfg.console, "serial", 6))
+		kvm->cfg.active_console  = CONSOLE_8250;
+	else if (!strncmp(kvm->cfg.console, "hv", 2))
+		kvm->cfg.active_console = CONSOLE_HV;
+	else
+		pr_warning("Unknown console type: %s", kvm->cfg.console);
+
+	if (!kvm->cfg.host_ip)
+		kvm->cfg.host_ip = DEFAULT_HOST_ADDR;
+
+	if (!kvm->cfg.guest_ip)
+		kvm->cfg.guest_ip = DEFAULT_GUEST_ADDR;
+
+	if (!kvm->cfg.guest_mac)
+		kvm->cfg.guest_mac = DEFAULT_GUEST_MAC;
+
+	if (!kvm->cfg.host_mac)
+		kvm->cfg.host_mac = DEFAULT_HOST_MAC;
+
+	if (!kvm->cfg.script)
+		kvm->cfg.script = DEFAULT_SCRIPT;
+
+	if (!kvm->cfg.network)
+		kvm->cfg.network = DEFAULT_NETWORK;
+
+	memset(real_cmdline, 0, sizeof(real_cmdline));
+	kvm__arch_set_cmdline(real_cmdline, video);
+
+	if (video) {
+		strcat(real_cmdline, " console=tty0");
+	} else {
+		switch (kvm->cfg.active_console) {
+		case CONSOLE_HV:
+			/* Fallthrough */
+		case CONSOLE_VIRTIO:
+			strcat(real_cmdline, " console=hvc0");
+			break;
+		case CONSOLE_8250:
+			strcat(real_cmdline, " console=ttyS0");
+			break;
+		}
+	}
+
+	if (!kvm->cfg.guest_name) {
+		if (kvm->cfg.custom_rootfs) {
+			kvm->cfg.guest_name = kvm->cfg.custom_rootfs_name;
+		} else {
+			sprintf(default_name, "guest-%u", getpid());
+			kvm->cfg.guest_name = default_name;
+		}
+	}
+
+	if (!kvm->cfg.using_rootfs && !kvm->cfg.disk_image[0].filename && !kvm->cfg.initrd_filename) {
+		char tmp[PATH_MAX];
+
+		kvm_setup_create_new(kvm->cfg.custom_rootfs_name);
+		kvm_setup_resolv(kvm->cfg.custom_rootfs_name);
+
+		snprintf(tmp, PATH_MAX, "%s%s", kvm__get_dir(), "default");
+		if (virtio_9p__register(kvm, tmp, "/dev/root") < 0)
+			die("Unable to initialize virtio 9p");
+		if (virtio_9p__register(kvm, "/", "hostfs") < 0)
+			die("Unable to initialize virtio 9p");
+		kvm->cfg.using_rootfs = kvm->cfg.custom_rootfs = 1;
+	}
+
+	if (kvm->cfg.using_rootfs) {
+		strcat(real_cmdline, " rw rootflags=trans=virtio,version=9p2000.L,cache=loose rootfstype=9p");
+		if (kvm->cfg.custom_rootfs) {
+			kvm_run_set_sandbox(kvm);
+
+#ifdef CONFIG_GUEST_PRE_INIT
+			strcat(real_cmdline, " init=/virt/pre_init");
+#else
+			strcat(real_cmdline, " init=/virt/init");
+#endif
+
+			if (!kvm->cfg.no_dhcp)
+				strcat(real_cmdline, " ip=dhcp");
+			if (kvm_setup_guest_init(kvm->cfg.custom_rootfs_name))
+				die("Failed to setup init for guest.");
+		}
+	} else if (!kvm->cfg.kernel_cmdline || !strstr(kvm->cfg.kernel_cmdline, "root=")) {
+		strlcat(real_cmdline, " root=/dev/vda rw ", sizeof(real_cmdline));
+	}
+
+	if (kvm->cfg.kernel_cmdline) {
+		strcat(real_cmdline, " ");
+		strlcat(real_cmdline, kvm->cfg.kernel_cmdline, sizeof(real_cmdline));
+	}
+
+	kvm->cfg.real_cmdline = real_cmdline;
+
+	if (kvm->cfg.kernel_filename) {
+		printf("  # %s run -k %s -m %llu -c %d --name %s\n", KVM_BINARY_NAME,
+		       kvm->cfg.kernel_filename,
+		       (unsigned long long)kvm->cfg.ram_size / 1024 / 1024,
+		       kvm->cfg.nrcpus, kvm->cfg.guest_name);
+	} else if (kvm->cfg.firmware_filename) {
+		printf("  # %s run --firmware %s -m %llu -c %d --name %s\n", KVM_BINARY_NAME,
+		       kvm->cfg.firmware_filename,
+		       (unsigned long long)kvm->cfg.ram_size / 1024 / 1024,
+		       kvm->cfg.nrcpus, kvm->cfg.guest_name);
+	}
+
+	if (init_list__init(kvm) < 0)
+		die("Initialisation failed");
+
+	return kvm;
+}
+
+static int kvm_cmd_run_work(struct kvm *kvm)
+{
+	int i;
+
+	for (i = 0; i < kvm->nrcpus; i++) {
+		if (pthread_create(&kvm->cpus[i]->thread, NULL, kvm_cpu_thread, kvm->cpus[i]) != 0)
+			die("unable to create KVM VCPU thread");
+	}
+
+	/* Only VCPU #0 is going to exit by itself when shutting down */
+	if (pthread_join(kvm->cpus[0]->thread, NULL) != 0)
+		die("unable to join with vcpu 0");
+
+	return kvm_cpu__exit(kvm);
+}
+
+static void kvm_cmd_run_exit(struct kvm *kvm, int guest_ret)
+{
+	compat__print_all_messages();
+
+	init_list__exit(kvm);
+
+	if (guest_ret == 0)
+		printf("\n  # KVM session ended normally.\n");
+}
+
+int kvm_cmd_run(int argc, const char **argv, const char *prefix)
+{
+	int ret = -EFAULT;
+	struct kvm *kvm;
+
+	kvm = kvm_cmd_run_init(argc, argv);
+	if (IS_ERR(kvm))
+		return PTR_ERR(kvm);
+
+	ret = kvm_cmd_run_work(kvm);
+	kvm_cmd_run_exit(kvm, ret);
+
+	return ret;
+}
diff --git a/kvmtool/builtin-sandbox.c b/kvmtool/builtin-sandbox.c
new file mode 100644
index 0000000..433f536
--- /dev/null
+++ b/kvmtool/builtin-sandbox.c
@@ -0,0 +1,9 @@
+#include "kvm/builtin-sandbox.h"
+#include "kvm/builtin-run.h"
+
+int kvm_cmd_sandbox(int argc, const char **argv, const char *prefix)
+{
+	kvm_run_set_wrapper_sandbox();
+
+	return kvm_cmd_run(argc, argv, prefix);
+}
diff --git a/kvmtool/builtin-setup.c b/kvmtool/builtin-setup.c
new file mode 100644
index 0000000..b24d2a1
--- /dev/null
+++ b/kvmtool/builtin-setup.c
@@ -0,0 +1,285 @@
+#include <kvm/util.h>
+#include <kvm/kvm-cmd.h>
+#include <kvm/builtin-setup.h>
+#include <kvm/kvm.h>
+#include <kvm/parse-options.h>
+#include <kvm/read-write.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <limits.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+
+static const char *instance_name;
+
+static const char * const setup_usage[] = {
+	"lkvm setup [name]",
+	NULL
+};
+
+static const struct option setup_options[] = {
+	OPT_END()
+};
+
+static void parse_setup_options(int argc, const char **argv)
+{
+	while (argc != 0) {
+		argc = parse_options(argc, argv, setup_options, setup_usage,
+				PARSE_OPT_STOP_AT_NON_OPTION);
+		if (argc != 0 && instance_name)
+			kvm_setup_help();
+		else
+			instance_name = argv[0];
+		argv++;
+		argc--;
+	}
+}
+
+void kvm_setup_help(void)
+{
+	printf("\n%s setup creates a new rootfs under %s.\n"
+		"This can be used later by the '-d' parameter of '%s run'.\n",
+		KVM_BINARY_NAME, kvm__get_dir(), KVM_BINARY_NAME);
+	usage_with_options(setup_usage, setup_options);
+}
+
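+/*
+ * Copy a file by mmap()ing both source and destination and doing one
+ * memcpy(); the destination is ftruncate()d to the source size first so
+ * the writable mapping is fully backed.
+ */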
+static int copy_file(const char *from, const char *to)
+{
+	int in_fd, out_fd;
+	void *src, *dst;
+	struct stat st;
+	int err = -1;
+
+	in_fd = open(from, O_RDONLY);
+	if (in_fd < 0)
+		return err;
+
+	if (fstat(in_fd, &st) < 0)
+		goto error_close_in;
+
+	out_fd = open(to, O_RDWR | O_CREAT | O_TRUNC, st.st_mode & (S_IRWXU|S_IRWXG|S_IRWXO));
+	if (out_fd < 0)
+		goto error_close_in;
+
+	src = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, in_fd, 0);
+	if (src == MAP_FAILED)
+		goto error_close_out;
+
+	if (ftruncate(out_fd, st.st_size) < 0)
+		goto error_munmap_src;
+
+	dst = mmap(NULL, st.st_size, PROT_READ|PROT_WRITE, MAP_SHARED, out_fd, 0);
+	if (dst == MAP_FAILED)
+		goto error_munmap_src;
+
+	memcpy(dst, src, st.st_size);
+
+	if (fsync(out_fd) < 0)
+		goto error_munmap_dst;
+
+	err = 0;
+
+error_munmap_dst:
+	munmap(dst, st.st_size);
+error_munmap_src:
+	munmap(src, st.st_size);
+error_close_out:
+	close(out_fd);
+error_close_in:
+	close(in_fd);
+
+	return err;
+}
+
+static const char *guestfs_dirs[] = {
+	"/dev",
+	"/etc",
+	"/home",
+	"/host",
+	"/proc",
+	"/root",
+	"/sys",
+	"/tmp",
+	"/var",
+	"/var/lib",
+	"/virt",
+	"/virt/home",
+};
+
+static const char *guestfs_symlinks[] = {
+	"/bin",
+	"/lib",
+	"/lib64",
+	"/sbin",
+	"/usr",
+	"/etc/ld.so.conf",
+};
+
+#ifdef CONFIG_GUEST_INIT
+static int extract_file(const char *guestfs_name, const char *filename,
+			const void *data, size_t size)
+{
+	char path[PATH_MAX];
+	int fd, ret;
+
+	snprintf(path, PATH_MAX, "%s%s/%s", kvm__get_dir(),
+				guestfs_name, filename);
+
+	fd = open(path, O_EXCL | O_CREAT | O_WRONLY, 0755);
+	if (fd < 0) {
+		if (errno == EEXIST)
+			return 0;
+		die("Failed to set up %s", path);
+	}
+
+	ret = xwrite(fd, data, size);
+	if (ret < 0)
+		die("Failed to set up %s", path);
+	close(fd);
+
+	return 0;
+}
+
+extern unsigned char init_binary[];
+extern unsigned long init_binary_size;
+extern unsigned char pre_init_binary[];
+extern unsigned long pre_init_binary_size;
+
+int kvm_setup_guest_init(const char *guestfs_name)
+{
+	int err;
+
+#ifdef CONFIG_GUEST_PRE_INIT
+	err = extract_file(guestfs_name, "virt/pre_init",
+			   pre_init_binary, pre_init_binary_size);
+	if (err)
+		return err;
+#endif
+	err = extract_file(guestfs_name, "virt/init",
+			   init_binary, init_binary_size);
+	return err;
+}
+#else
+int kvm_setup_guest_init(const char *guestfs_name)
+{
+	die("Guest init image not compiled in");
+	return 0;
+}
+#endif
+
+static int copy_passwd(const char *guestfs_name)
+{
+	char path[PATH_MAX];
+	FILE *file;
+	int ret;
+
+	snprintf(path, PATH_MAX, "%s%s/etc/passwd", kvm__get_dir(), guestfs_name);
+
+	file = fopen(path, "w");
+	if (!file)
+		return -1;
+
+	ret = fprintf(file, "root:x:0:0:root:/root:/bin/sh\n");
+	if (ret > 0)
+		ret = 0;
+
+	fclose(file);
+
+	return ret;
+}
+
+static int make_guestfs_symlink(const char *guestfs_name, const char *path)
+{
+	char target[PATH_MAX];
+	char name[PATH_MAX];
+
+	snprintf(name, PATH_MAX, "%s%s%s", kvm__get_dir(), guestfs_name, path);
+
+	snprintf(target, PATH_MAX, "/host%s", path);
+
+	return symlink(target, name);
+}
+
+static int make_dir(const char *dir)
+{
+	char name[PATH_MAX];
+
+	snprintf(name, PATH_MAX, "%s%s", kvm__get_dir(), dir);
+
+	return mkdir(name, 0777);
+}
+
+static void make_guestfs_dir(const char *guestfs_name, const char *dir)
+{
+	char name[PATH_MAX];
+
+	snprintf(name, PATH_MAX, "%s%s", guestfs_name, dir);
+
+	make_dir(name);
+}
+
+void kvm_setup_resolv(const char *guestfs_name)
+{
+	char path[PATH_MAX];
+
+	snprintf(path, PATH_MAX, "%s%s/etc/resolv.conf", kvm__get_dir(), guestfs_name);
+
+	copy_file("/etc/resolv.conf", path);
+}
+
+static int do_setup(const char *guestfs_name)
+{
+	unsigned int i;
+	int ret;
+
+	ret = make_dir(guestfs_name);
+	if (ret < 0)
+		return ret;
+
+	for (i = 0; i < ARRAY_SIZE(guestfs_dirs); i++)
+		make_guestfs_dir(guestfs_name, guestfs_dirs[i]);
+
+	for (i = 0; i < ARRAY_SIZE(guestfs_symlinks); i++)
+		make_guestfs_symlink(guestfs_name, guestfs_symlinks[i]);
+
+	ret = kvm_setup_guest_init(guestfs_name);
+	if (ret < 0)
+		return ret;
+
+	return copy_passwd(guestfs_name);
+}
+
+int kvm_setup_create_new(const char *guestfs_name)
+{
+	return do_setup(guestfs_name);
+}
+
+int kvm_cmd_setup(int argc, const char **argv, const char *prefix)
+{
+	int r;
+
+	parse_setup_options(argc, argv);
+
+	if (instance_name == NULL)
+		kvm_setup_help();
+
+	r = do_setup(instance_name);
+	if (r == 0)
+		printf("A new rootfs '%s' has been created in '%s%s'.\n\n"
+			"You can now start it by running the following command:\n\n"
+			"  %s run -d %s\n",
+			instance_name, kvm__get_dir(), instance_name,
+			KVM_BINARY_NAME, instance_name);
+	else
+		printf("Unable to create rootfs in %s%s: %s\n",
+			kvm__get_dir(), instance_name, strerror(errno));
+
+	return r;
+}
diff --git a/kvmtool/builtin-stat.c b/kvmtool/builtin-stat.c
new file mode 100644
index 0000000..5d6407e
--- /dev/null
+++ b/kvmtool/builtin-stat.c
@@ -0,0 +1,127 @@
+#include <kvm/util.h>
+#include <kvm/kvm-cmd.h>
+#include <kvm/builtin-stat.h>
+#include <kvm/kvm.h>
+#include <kvm/parse-options.h>
+#include <kvm/kvm-ipc.h>
+
+#include <sys/select.h>
+#include <stdio.h>
+#include <string.h>
+#include <signal.h>
+
+#include <linux/virtio_balloon.h>
+
+static bool mem;
+static bool all;
+static const char *instance_name;
+
+static const char * const stat_usage[] = {
+	"lkvm stat [command] [--all] [-n name]",
+	NULL
+};
+
+static const struct option stat_options[] = {
+	OPT_GROUP("Commands options:"),
+	OPT_BOOLEAN('m', "memory", &mem, "Display memory statistics"),
+	OPT_GROUP("Instance options:"),
+	OPT_BOOLEAN('a', "all", &all, "All instances"),
+	OPT_STRING('n', "name", &instance_name, "name", "Instance name"),
+	OPT_END()
+};
+
+static void parse_stat_options(int argc, const char **argv)
+{
+	while (argc != 0) {
+		argc = parse_options(argc, argv, stat_options, stat_usage,
+				PARSE_OPT_STOP_AT_NON_OPTION);
+		if (argc != 0)
+			kvm_stat_help();
+	}
+}
+
+void kvm_stat_help(void)
+{
+	usage_with_options(stat_usage, stat_options);
+}
+
+static int do_memstat(const char *name, int sock)
+{
+	struct virtio_balloon_stat stats[VIRTIO_BALLOON_S_NR];
+	fd_set fdset;
+	struct timeval t = { .tv_sec = 1 };
+	int r;
+	u8 i;
+
+	FD_ZERO(&fdset);
+	FD_SET(sock, &fdset);
+	r = kvm_ipc__send(sock, KVM_IPC_STAT);
+	if (r < 0)
+		return r;
+
+	r = select(sock + 1, &fdset, NULL, NULL, &t);
+	if (r < 0) {
+		pr_err("Could not retrieve mem stats from %s", name);
+		return r;
+	}
+	r = read(sock, &stats, sizeof(stats));
+	if (r < 0)
+		return r;
+
+	printf("\n\n\t*** Guest memory statistics ***\n\n");
+	for (i = 0; i < VIRTIO_BALLOON_S_NR; i++) {
+		switch (stats[i].tag) {
+		case VIRTIO_BALLOON_S_SWAP_IN:
+			printf("The amount of memory that has been swapped in (in bytes):");
+			break;
+		case VIRTIO_BALLOON_S_SWAP_OUT:
+			printf("The amount of memory that has been swapped out to disk (in bytes):");
+			break;
+		case VIRTIO_BALLOON_S_MAJFLT:
+			printf("The number of major page faults that have occurred:");
+			break;
+		case VIRTIO_BALLOON_S_MINFLT:
+			printf("The number of minor page faults that have occurred:");
+			break;
+		case VIRTIO_BALLOON_S_MEMFREE:
+			printf("The amount of memory not being used for any purpose (in bytes):");
+			break;
+		case VIRTIO_BALLOON_S_MEMTOT:
+			printf("The total amount of memory available (in bytes):");
+			break;
+		}
+		printf("%llu\n", (unsigned long long)stats[i].val);
+	}
+	printf("\n");
+
+	return 0;
+}
+
+int kvm_cmd_stat(int argc, const char **argv, const char *prefix)
+{
+	int instance;
+	int r = 0;
+
+	parse_stat_options(argc, argv);
+
+	if (!mem)
+		usage_with_options(stat_usage, stat_options);
+
+	if (mem && all)
+		return kvm__enumerate_instances(do_memstat);
+
+	if (instance_name == NULL)
+		kvm_stat_help();
+
+	instance = kvm__get_sock_by_instance(instance_name);
+
+	if (instance <= 0)
+		die("Failed locating instance");
+
+	if (mem)
+		r = do_memstat(instance_name, instance);
+
+	close(instance);
+
+	return r;
+}
diff --git a/kvmtool/builtin-stop.c b/kvmtool/builtin-stop.c
new file mode 100644
index 0000000..6067630
--- /dev/null
+++ b/kvmtool/builtin-stop.c
@@ -0,0 +1,70 @@
+#include <kvm/util.h>
+#include <kvm/kvm-cmd.h>
+#include <kvm/builtin-stop.h>
+#include <kvm/kvm.h>
+#include <kvm/parse-options.h>
+#include <kvm/kvm-ipc.h>
+
+#include <stdio.h>
+#include <string.h>
+#include <signal.h>
+
+static bool all;
+static const char *instance_name;
+
+static const char * const stop_usage[] = {
+	"lkvm stop [--all] [-n name]",
+	NULL
+};
+
+static const struct option stop_options[] = {
+	OPT_GROUP("General options:"),
+	OPT_BOOLEAN('a', "all", &all, "Stop all instances"),
+	OPT_STRING('n', "name", &instance_name, "name", "Instance name"),
+	OPT_END()
+};
+
+static void parse_stop_options(int argc, const char **argv)
+{
+	while (argc != 0) {
+		argc = parse_options(argc, argv, stop_options, stop_usage,
+				PARSE_OPT_STOP_AT_NON_OPTION);
+		if (argc != 0)
+			kvm_stop_help();
+	}
+}
+
+void kvm_stop_help(void)
+{
+	usage_with_options(stop_usage, stop_options);
+}
+
+static int do_stop(const char *name, int sock)
+{
+	return kvm_ipc__send(sock, KVM_IPC_STOP);
+}
+
+int kvm_cmd_stop(int argc, const char **argv, const char *prefix)
+{
+	int instance;
+	int r;
+
+	parse_stop_options(argc, argv);
+
+	if (all)
+		return kvm__enumerate_instances(do_stop);
+
+	if (instance_name == NULL)
+		kvm_stop_help();
+
+	instance = kvm__get_sock_by_instance(instance_name);
+
+	if (instance <= 0)
+		die("Failed locating instance");
+
+	r = do_stop(instance_name, instance);
+
+	close(instance);
+
+	return r;
+}
diff --git a/kvmtool/builtin-version.c b/kvmtool/builtin-version.c
new file mode 100644
index 0000000..b8bb859
--- /dev/null
+++ b/kvmtool/builtin-version.c
@@ -0,0 +1,15 @@
+#include <kvm/util.h>
+#include <kvm/kvm-cmd.h>
+#include <kvm/builtin-version.h>
+#include <kvm/kvm.h>
+
+#include <stdio.h>
+#include <string.h>
+#include <signal.h>
+
+int kvm_cmd_version(int argc, const char **argv, const char *prefix)
+{
+	printf("kvm tool %s\n", KVMTOOLS_VERSION);
+
+	return 0;
+}
diff --git a/kvmtool/code16gcc.h b/kvmtool/code16gcc.h
new file mode 100644
index 0000000..d93e480
--- /dev/null
+++ b/kvmtool/code16gcc.h
@@ -0,0 +1,15 @@
+/*
+ * code16gcc.h
+ *
+ * This file is -include'd when compiling 16-bit C code.
+ * Note: this asm() needs to be emitted before gcc emits any code.
+ * Depending on gcc version, this requires -fno-unit-at-a-time or
+ * -fno-toplevel-reorder.
+ *
+ * Hopefully gcc will eventually have a real -m16 option so we can
+ * drop this hack long term.
+ */
+
+#ifndef __ASSEMBLY__
+asm(".code16gcc");
+#endif
diff --git a/kvmtool/config/feature-tests.mak b/kvmtool/config/feature-tests.mak
new file mode 100644
index 0000000..03cdb42
--- /dev/null
+++ b/kvmtool/config/feature-tests.mak
@@ -0,0 +1,208 @@
+define SOURCE_HELLO
+#include <stdio.h>
+int main(void)
+{
+	return puts(\"hi\");
+}
+endef
+
+ifndef NO_DWARF
+define SOURCE_DWARF
+#include <dwarf.h>
+#include <elfutils/libdw.h>
+#include <elfutils/version.h>
+#ifndef _ELFUTILS_PREREQ
+#error
+#endif
+
+int main(void)
+{
+	Dwarf *dbg = dwarf_begin(0, DWARF_C_READ);
+	return (long)dbg;
+}
+endef
+endif
+
+define SOURCE_LIBELF
+#include <libelf.h>
+
+int main(void)
+{
+	Elf *elf = elf_begin(0, ELF_C_READ, 0);
+	return (long)elf;
+}
+endef
+
+define SOURCE_GLIBC
+#include <gnu/libc-version.h>
+
+int main(void)
+{
+	const char *version = gnu_get_libc_version();
+	return (long)version;
+}
+endef
+
+define SOURCE_ELF_MMAP
+#include <libelf.h>
+int main(void)
+{
+	Elf *elf = elf_begin(0, ELF_C_READ_MMAP, 0);
+	return (long)elf;
+}
+endef
+
+ifndef NO_NEWT
+define SOURCE_NEWT
+#include <newt.h>
+
+int main(void)
+{
+	newtInit();
+	newtCls();
+	return newtFinished();
+}
+endef
+endif
+
+ifndef NO_LIBPERL
+define SOURCE_PERL_EMBED
+#include <EXTERN.h>
+#include <perl.h>
+
+int main(void)
+{
+perl_alloc();
+return 0;
+}
+endef
+endif
+
+ifndef NO_LIBPYTHON
+define SOURCE_PYTHON_VERSION
+#include <Python.h>
+#if PY_VERSION_HEX >= 0x03000000
+	#error
+#endif
+int main(void){}
+endef
+define SOURCE_PYTHON_EMBED
+#include <Python.h>
+int main(void)
+{
+	Py_Initialize();
+	return 0;
+}
+endef
+endif
+
+define SOURCE_BFD
+#include <bfd.h>
+
+int main(void)
+{
+	bfd_demangle(0, 0, 0);
+	return 0;
+}
+endef
+
+define SOURCE_CPLUS_DEMANGLE
+extern char *cplus_demangle(const char *, int);
+
+int main(void)
+{
+	cplus_demangle(0, 0);
+	return 0;
+}
+endef
+
+define SOURCE_STRLCPY
+#include <stdlib.h>
+extern size_t strlcpy(char *dest, const char *src, size_t size);
+
+int main(void)
+{
+	strlcpy(NULL, NULL, 0);
+	return 0;
+}
+endef
+
+define SOURCE_VNCSERVER
+#include <rfb/rfb.h>
+
+int main(void)
+{
+	rfbIsActive((void *)0);
+	return 0;
+}
+endef
+
+define SOURCE_SDL
+#include <SDL/SDL.h>
+
+int main(void)
+{
+	SDL_Init(SDL_INIT_VIDEO);
+	return 0;
+}
+endef
+
+define SOURCE_ZLIB
+#include <zlib.h>
+
+int main(void)
+{
+	inflateInit2(NULL, 0);
+	return 0;
+}
+endef
+
+define SOURCE_AIO
+#include <libaio.h>
+
+int main(void)
+{
+	io_setup(0, NULL);
+	return 0;
+}
+endef
+
+define SOURCE_STATIC
+#include <stdlib.h>
+
+int main(void)
+{
+	return 0;
+}
+endef
+
+define SOURCE_GTK3
+#include <gtk/gtk.h>
+
+int main(void)
+{
+	gtk_main();
+
+	return 0;
+}
+endef
+
+define SOURCE_LIBFDT
+#include <libfdt.h>
+
+int main(void)
+{
+	fdt_check_header(NULL);
+	return 0;
+}
+endef
+
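+# Note: the SOURCE_STRLCPY definition below overrides the earlier one;
+# GNU make keeps the most recent `define' of a variable.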
+define SOURCE_STRLCPY
+#include <string.h>
+
+int main(void)
+{
+	strlcpy(NULL, NULL, 0);
+	return 0;
+}
+endef
diff --git a/kvmtool/config/utilities.mak b/kvmtool/config/utilities.mak
new file mode 100644
index 0000000..612ed1f
--- /dev/null
+++ b/kvmtool/config/utilities.mak
@@ -0,0 +1,204 @@
+# This allows us to work with the newline character:
+define newline
+
+
+endef
+newline := $(newline)
+
+# nl-escape
+#
+# Usage: escape = $(call nl-escape[,escape])
+#
+# This is used as the common way to specify
+# what should replace a newline when escaping
+# newlines; the default is a bizarre string.
+#
+nl-escape = $(or $(1),m822df3020w6a44id34bt574ctac44eb9f4n)
+
+# escape-nl
+#
+# Usage: escaped-text = $(call escape-nl,text[,escape])
+#
+# GNU make's $(shell ...) function converts to a
+# single space each newline character in the output
+# produced during the expansion; this may not be
+# desirable.
+#
+# The only solution is to change each newline into
+# something that won't be converted, so that the
+# information can be recovered later with
+# $(call unescape-nl...)
+#
+escape-nl = $(subst $(newline),$(call nl-escape,$(2)),$(1))
+
+# unescape-nl
+#
+# Usage: text = $(call unescape-nl,escaped-text[,escape])
+#
+# See escape-nl.
+#
+unescape-nl = $(subst $(call nl-escape,$(2)),$(newline),$(1))
+
+# shell-escape-nl
+#
+# Usage: $(shell some-command | $(call shell-escape-nl[,escape]))
+#
+# Use this to escape newlines from within a shell call;
+# the default escape is a bizarre string.
+#
+# NOTE: The escape is used directly as a string constant
+#       in an `awk' program that is delimited by shell
+#       single-quotes, so be wary of the characters
+#       that are chosen.
+#
+define shell-escape-nl
+awk 'NR==1 {t=$$0} NR>1 {t=t "$(nl-escape)" $$0} END {printf t}'
+endef
+
+# shell-unescape-nl
+#
+# Usage: $(shell some-command | $(call shell-unescape-nl[,escape]))
+#
+# Use this to unescape newlines from within a shell call;
+# the default escape is a bizarre string.
+#
+# NOTE: The escape is used directly as an extended regular
+#       expression constant in an `awk' program that is
+#       delimited by shell single-quotes, so be wary
+#       of the characters that are chosen.
+#
+# (The bash shell has a bug where `{gsub(...),...}' is
+#  misinterpreted as a brace expansion; this can be
+#  overcome by putting a space between `{' and `gsub').
+#
+define shell-unescape-nl
+awk 'NR==1 {t=$$0} NR>1 {t=t "\n" $$0} END { gsub(/$(nl-escape)/,"\n",t); printf t }'
+endef
+
+# escape-for-shell-sq
+#
+# Usage: embeddable-text = $(call escape-for-shell-sq,text)
+#
+# This function produces text that is suitable for
+# embedding in a shell string that is delimited by
+# single-quotes.
+#
+escape-for-shell-sq =  $(subst ','\'',$(1))
+
+# shell-sq
+#
+# Usage: single-quoted-and-escaped-text = $(call shell-sq,text)
+#
+shell-sq = '$(escape-for-shell-sq)'
+
+# shell-wordify
+#
+# Usage: wordified-text = $(call shell-wordify,text)
+#
+# For instance:
+#
+#  |define text
+#  |hello
+#  |world
+#  |endef
+#  |
+#  |target:
+#  |	echo $(call shell-wordify,$(text))
+#
+# At least GNU make gets confused by expanding a newline
+# within the context of a command line of a makefile rule
+# (this is in contrast to a `$(shell ...)' function call,
+# which can handle it just fine).
+#
+# This function avoids the problem by producing a string
+# that works as a shell word, regardless of whether or
+# not it contains a newline.
+#
+# If the text to be wordified contains a newline, then
+# an intricate shell command substitution is constructed
+# to render the text as a single line; when the shell
+# processes the resulting escaped text, it transforms
+# it into the original unescaped text.
+#
+# If the text does not contain a newline, then this function
+# produces the same results as the `$(shell-sq)' function.
+#
+shell-wordify = $(if $(findstring $(newline),$(1)),$(_sw-esc-nl),$(shell-sq))
+define _sw-esc-nl
+"$$(echo $(call escape-nl,$(shell-sq),$(2)) | $(call shell-unescape-nl,$(2)))"
+endef
+
+# is-absolute
+#
+# Usage: bool-value = $(call is-absolute,path)
+#
+is-absolute = $(shell echo $(shell-sq) | grep ^/ -q && echo y)
+
+# lookup
+#
+# Usage: absolute-executable-path-or-empty = $(call lookup,path)
+#
+# (It's necessary to use `sh -c' because GNU make messes up by
+#  trying too hard and getting things wrong).
+#
+lookup = $(call unescape-nl,$(shell sh -c $(_l-sh)))
+_l-sh = $(call shell-sq,command -v $(shell-sq) | $(call shell-escape-nl,))
+
+# is-executable
+#
+# Usage: bool-value = $(call is-executable,path)
+#
+# (It's necessary to use `sh -c' because GNU make messes up by
+#  trying too hard and getting things wrong).
+#
+is-executable = $(call _is-executable-helper,$(shell-sq))
+_is-executable-helper = $(shell sh -c $(_is-executable-sh))
+_is-executable-sh = $(call shell-sq,test -f $(1) -a -x $(1) && echo y)
+
+# get-executable
+#
+# Usage: absolute-executable-path-or-empty = $(call get-executable,path)
+#
+# The goal is to get an absolute path for an executable;
+# the `command -v' is defined by POSIX, but it's not
+# necessarily very portable, so it's only used if
+# relative path resolution is requested, as determined
+# by the presence of a leading `/'.
+#
+get-executable = $(if $(1),$(if $(is-absolute),$(_ge-abspath),$(lookup)))
+_ge-abspath = $(if $(is-executable),$(1))
+
+# get-supplied-or-default-executable
+#
+# Usage: absolute-executable-path-or-empty = $(call get-executable-or-default,variable,default)
+#
+define get-executable-or-default
+$(if $($(1)),$(call _ge_attempt,$($(1)),$(1)),$(call _ge_attempt,$(2)))
+endef
+_ge_attempt = $(or $(get-executable),$(_gea_warn),$(call _gea_err,$(2)))
+_gea_warn = $(warning The path '$(1)' is not executable.)
+_gea_err  = $(if $(1),$(error Please set '$(1)' appropriately))
+
+# try-cc
+# Usage: option = $(call try-cc, source-to-build, cc-options)
+try-cc = $(shell sh -c						  \
+	'TMP="$(OUTPUT)$(TMPOUT).$$$$";				  \
+	 echo "$(1)" |						  \
+	 $(CC) -x c -c - $(2) -o "$$TMP" > /dev/null 2>&1 && echo y; \
+	 rm -f "$$TMP"')
+
+# try-build
+# Usage: option = $(call try-build, source-to-build, cc-options, link-options)
+try-build = $(shell sh -c							\
+	'TMP="$(OUTPUT)$(TMPOUT).$$$$";						\
+	echo "$(1)" |								\
+	$(CC) -x c - $(2) $(3) -o "$$TMP" > /dev/null 2>&1 && echo y;		\
+	rm -f "$$TMP"')
+
+# binary-to-C
+# create a C source file describing the binary input file as an array
+# Usage: $(call binary-to-C,binary-file,C-symbol-name,C-output-file)
+binary-to-C = stat -c "unsigned long $(2)_size = %s;" $1 > $3;		\
+	echo "unsigned char $(2)[] = {" >> $3;				\
+	od -v -tx1 -An -w12 $1 | sed -e "s/ \(..\)/0x\1, /g" >> $3;	\
+	echo "};" >> $3
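+# For example, $(call binary-to-C,guest/init,init_binary,guest/init.c)
+# emits "unsigned long init_binary_size = <size>;" followed by the file
+# contents as an "unsigned char init_binary[]" initializer.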
diff --git a/kvmtool/devices.c b/kvmtool/devices.c
new file mode 100644
index 0000000..41cffdd
--- /dev/null
+++ b/kvmtool/devices.c
@@ -0,0 +1,93 @@
+#include "kvm/devices.h"
+#include "kvm/kvm.h"
+
+#include <linux/err.h>
+#include <linux/rbtree.h>
+
+struct device_bus {
+	struct rb_root	root;
+	int		dev_num;
+};
+
+static struct device_bus device_trees[DEVICE_BUS_MAX] = {
+	[0 ... (DEVICE_BUS_MAX - 1)] = { RB_ROOT, 0 },
+};
+
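+/*
+ * Devices are numbered in registration order and kept in a per-bus
+ * red-black tree keyed by dev_num, so lookups by device number are
+ * O(log n).
+ */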
+int device__register(struct device_header *dev)
+{
+	struct device_bus *bus;
+	struct rb_node **node, *parent = NULL;
+
+	if (dev->bus_type >= DEVICE_BUS_MAX) {
+		pr_warning("Ignoring device registration on unknown bus %d\n",
+			   dev->bus_type);
+		return -EINVAL;
+	}
+
+	bus = &device_trees[dev->bus_type];
+	dev->dev_num = bus->dev_num++;
+
+	node = &bus->root.rb_node;
+	while (*node) {
+		int num = rb_entry(*node, struct device_header, node)->dev_num;
+		int result = dev->dev_num - num;
+
+		parent = *node;
+		if (result < 0)
+			node = &((*node)->rb_left);
+		else if (result > 0)
+			node = &((*node)->rb_right);
+		else
+			return -EEXIST;
+	}
+
+	rb_link_node(&dev->node, parent, node);
+	rb_insert_color(&dev->node, &bus->root);
+	return 0;
+}
+
+void device__unregister(struct device_header *dev)
+{
+	struct device_bus *bus = &device_trees[dev->bus_type];
+	rb_erase(&dev->node, &bus->root);
+}
+
+struct device_header *device__find_dev(enum device_bus_type bus_type, u8 dev_num)
+{
+	struct rb_node *node;
+
+	if (bus_type >= DEVICE_BUS_MAX)
+		return ERR_PTR(-EINVAL);
+
+	node = device_trees[bus_type].root.rb_node;
+	while (node) {
+		struct device_header *dev = rb_entry(node, struct device_header,
+						     node);
+		if (dev_num < dev->dev_num) {
+			node = node->rb_left;
+		} else if (dev_num > dev->dev_num) {
+			node = node->rb_right;
+		} else {
+			return dev;
+		}
+	}
+
+	return NULL;
+}
+
+struct device_header *device__first_dev(enum device_bus_type bus_type)
+{
+	struct rb_node *node;
+
+	if (bus_type >= DEVICE_BUS_MAX)
+		return NULL;
+
+	node = rb_first(&device_trees[bus_type].root);
+	return node ? rb_entry(node, struct device_header, node) : NULL;
+}
+
+struct device_header *device__next_dev(struct device_header *dev)
+{
+	struct rb_node *node = rb_next(&dev->node);
+	return node ? rb_entry(node, struct device_header, node) : NULL;
+}
diff --git a/kvmtool/disk/aio.c b/kvmtool/disk/aio.c
new file mode 100644
index 0000000..a7418c8
--- /dev/null
+++ b/kvmtool/disk/aio.c
@@ -0,0 +1,150 @@
+#include <libaio.h>
+#include <pthread.h>
+#include <sys/eventfd.h>
+
+#include "kvm/brlock.h"
+#include "kvm/disk-image.h"
+#include "kvm/kvm.h"
+#include "linux/list.h"
+
+#define AIO_MAX 256
+
+static int aio_submit(struct disk_image *disk, int nr, struct iocb **ios)
+{
+	int ret;
+
+	__sync_fetch_and_add(&disk->aio_inflight, nr);
+	/*
+	 * A wmb() is needed here, to ensure disk_aio_thread() sees this
+	 * increase after receiving the events. It is included in the
+	 * __sync_fetch_and_add (as a full barrier).
+	 */
+restart:
+	ret = io_submit(disk->ctx, nr, ios);
+	if (ret == -EAGAIN)
+		goto restart;
+	else if (ret <= 0)
+		/* disk_aio_thread() is never going to see those */
+		__sync_fetch_and_sub(&disk->aio_inflight, nr);
+
+	return ret;
+}
+
+ssize_t raw_image__read_async(struct disk_image *disk, u64 sector,
+			      const struct iovec *iov, int iovcount,
+			      void *param)
+{
+	struct iocb iocb;
+	u64 offset = sector << SECTOR_SHIFT;
+	struct iocb *ios[1] = { &iocb };
+
+	io_prep_preadv(&iocb, disk->fd, iov, iovcount, offset);
+	io_set_eventfd(&iocb, disk->evt);
+	iocb.data = param;
+
+	return aio_submit(disk, 1, ios);
+}
+
+ssize_t raw_image__write_async(struct disk_image *disk, u64 sector,
+			       const struct iovec *iov, int iovcount,
+			       void *param)
+{
+	struct iocb iocb;
+	u64 offset = sector << SECTOR_SHIFT;
+	struct iocb *ios[1] = { &iocb };
+
+	io_prep_pwritev(&iocb, disk->fd, iov, iovcount, offset);
+	io_set_eventfd(&iocb, disk->evt);
+	iocb.data = param;
+
+	return aio_submit(disk, 1, ios);
+}
+
+/*
+ * When this function returns there is no in-flight I/O. The caller ensures
+ * that io_submit() isn't called concurrently.
+ *
+ * Returns an approximate count of the I/O requests that were in flight when
+ * the function was called.
+ */
+int raw_image__wait(struct disk_image *disk)
+{
+	u64 inflight = disk->aio_inflight;
+
+	while (disk->aio_inflight) {
+		usleep(100);
+		barrier();
+	}
+
+	return inflight;
+}
+
+static int disk_aio_get_events(struct disk_image *disk)
+{
+	struct io_event event[AIO_MAX];
+	struct timespec notime = {0};
+	int nr, i;
+
+	do {
+		nr = io_getevents(disk->ctx, 1, ARRAY_SIZE(event), event, &notime);
+		for (i = 0; i < nr; i++)
+			disk->disk_req_cb(event[i].data, event[i].res);
+
+		/* Pairs with wmb() in aio_submit() */
+		rmb();
+		__sync_fetch_and_sub(&disk->aio_inflight, nr);
+
+	} while (nr > 0);
+
+	return 0;
+}
+
+static void *disk_aio_thread(void *param)
+{
+	struct disk_image *disk = param;
+	u64 dummy;
+
+	kvm__set_thread_name("disk-image-io");
+
+	while (read(disk->evt, &dummy, sizeof(dummy)) > 0) {
+		if (disk_aio_get_events(disk))
+			break;
+	}
+
+	return NULL;
+}
+
+int disk_aio_setup(struct disk_image *disk)
+{
+	int r;
+
+	/* No need to setup AIO if the disk ops won't make use of it */
+	if (!disk->ops->async)
+		return 0;
+
+	disk->evt = eventfd(0, 0);
+	if (disk->evt < 0)
+		return -errno;
+
+	r = io_setup(AIO_MAX, &disk->ctx);
+	if (r) {
+		close(disk->evt);
+		return r;
+	}
+
+	r = pthread_create(&disk->thread, NULL, disk_aio_thread, disk);
+	if (r) {
+		/* pthread_create() returns the error number directly */
+		r = -r;
+		io_destroy(disk->ctx);
+		close(disk->evt);
+		return r;
+	}
+
+	disk->async = true;
+	return 0;
+}
+
+void disk_aio_destroy(struct disk_image *disk)
+{
+	if (!disk->async)
+		return;
+
+	pthread_cancel(disk->thread);
+	pthread_join(disk->thread, NULL);
+	close(disk->evt);
+	io_destroy(disk->ctx);
+}
diff --git a/kvmtool/disk/blk.c b/kvmtool/disk/blk.c
new file mode 100644
index 0000000..b4c9fba
--- /dev/null
+++ b/kvmtool/disk/blk.c
@@ -0,0 +1,72 @@
+#include "kvm/disk-image.h"
+
+#include <linux/err.h>
+#include <mntent.h>
+
+/*
+ * raw image and blk dev are similar, so reuse raw image ops.
+ */
+static struct disk_image_operations blk_dev_ops = {
+	.read	= raw_image__read,
+	.write	= raw_image__write,
+	.wait	= raw_image__wait,
+	.async	= true,
+};
+
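+/* Scan /proc/mounts to see if the block device in @st is currently mounted. */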
+static bool is_mounted(struct stat *st)
+{
+	struct stat st_buf;
+	struct mntent *mnt;
+	FILE *f;
+
+	f = setmntent("/proc/mounts", "r");
+	if (!f)
+		return false;
+
+	while ((mnt = getmntent(f)) != NULL) {
+		if (stat(mnt->mnt_fsname, &st_buf) == 0 &&
+		    S_ISBLK(st_buf.st_mode) && st->st_rdev == st_buf.st_rdev) {
+			fclose(f);
+			return true;
+		}
+	}
+
+	fclose(f);
+	return false;
+}
+
+struct disk_image *blkdev__probe(const char *filename, int flags, struct stat *st)
+{
+	int fd, r;
+	u64 size;
+
+	if (!S_ISBLK(st->st_mode))
+		return ERR_PTR(-EINVAL);
+
+	if (is_mounted(st)) {
+		pr_err("Block device %s is already mounted! Unmount before use.",
+		       filename);
+		return ERR_PTR(-EINVAL);
+	}
+
+	/*
+	 * Be careful: we are opening a host block device. The mounted
+	 * check above guards against corrupting a filesystem in use;
+	 * the caller chooses the open flags.
+	 */
+	fd = open(filename, flags);
+	if (fd < 0)
+		return ERR_PTR(-errno);
+
+	if (ioctl(fd, BLKGETSIZE64, &size) < 0) {
+		r = -errno;
+		close(fd);
+		return ERR_PTR(r);
+	}
+
+	/*
+	 * FIXME: mmap()ing a large disk fails on a 32-bit host because
+	 * there is not enough virtual address space; this only works on
+	 * 64-bit hosts.
+	 */
+	return disk_image__new(fd, size, &blk_dev_ops, DISK_IMAGE_REGULAR);
+}
diff --git a/kvmtool/disk/core.c b/kvmtool/disk/core.c
new file mode 100644
index 0000000..8d95c98
--- /dev/null
+++ b/kvmtool/disk/core.c
@@ -0,0 +1,345 @@
+#include "kvm/disk-image.h"
+#include "kvm/qcow.h"
+#include "kvm/virtio-blk.h"
+#include "kvm/kvm.h"
+
+#include <linux/err.h>
+#include <poll.h>
+
+int debug_iodelay;
+
+static int disk_image__close(struct disk_image *disk);
+
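+/*
+ * Parse a disk image argument such as "disk.img,ro,direct" or
+ * "scsi:<wwpn>:<tpgt>"; the ",ro" and ",direct" suffixes select
+ * read-only and O_DIRECT access respectively.
+ */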
+int disk_img_name_parser(const struct option *opt, const char *arg, int unset)
+{
+	const char *cur;
+	char *sep;
+	struct kvm *kvm = opt->ptr;
+
+	if (kvm->cfg.image_count >= MAX_DISK_IMAGES)
+		die("Only %d disk images are supported", MAX_DISK_IMAGES);
+
+	kvm->cfg.disk_image[kvm->cfg.image_count].filename = arg;
+	cur = arg;
+
+	if (strncmp(arg, "scsi:", 5) == 0) {
+		sep = strstr(arg, ":");
+		if (sep)
+			kvm->cfg.disk_image[kvm->cfg.image_count].wwpn = sep + 1;
+		sep = strstr(sep + 1, ":");
+		if (sep) {
+			*sep = 0;
+			kvm->cfg.disk_image[kvm->cfg.image_count].tpgt = sep + 1;
+		}
+		cur = sep + 1;
+	}
+
+	do {
+		sep = strstr(cur, ",");
+		if (sep) {
+			if (strncmp(sep + 1, "ro", 2) == 0)
+				kvm->cfg.disk_image[kvm->cfg.image_count].readonly = true;
+			else if (strncmp(sep + 1, "direct", 6) == 0)
+				kvm->cfg.disk_image[kvm->cfg.image_count].direct = true;
+			*sep = 0;
+			cur = sep + 1;
+		}
+	} while (sep);
+
+	kvm->cfg.image_count++;
+
+	return 0;
+}
+
+struct disk_image *disk_image__new(int fd, u64 size,
+				   struct disk_image_operations *ops,
+				   int use_mmap)
+{
+	struct disk_image *disk;
+	int r;
+
+	disk = malloc(sizeof *disk);
+	if (!disk)
+		return ERR_PTR(-ENOMEM);
+
+	*disk = (struct disk_image) {
+		.fd	= fd,
+		.size	= size,
+		.ops	= ops,
+	};
+
+	if (use_mmap == DISK_IMAGE_MMAP) {
+		/*
+		 * The write to disk image will be discarded
+		 * Writes to the disk image will be discarded (MAP_PRIVATE is copy-on-write)
+		disk->priv = mmap(NULL, size, PROT_RW, MAP_PRIVATE | MAP_NORESERVE, fd, 0);
+		if (disk->priv == MAP_FAILED) {
+			r = -errno;
+			goto err_free_disk;
+		}
+	}
+
+	r = disk_aio_setup(disk);
+	if (r)
+		goto err_unmap_disk;
+
+	return disk;
+
+err_unmap_disk:
+	if (disk->priv)
+		munmap(disk->priv, size);
+err_free_disk:
+	free(disk);
+	return ERR_PTR(r);
+}
+
+static struct disk_image *disk_image__open(const char *filename, bool readonly, bool direct)
+{
+	struct disk_image *disk;
+	struct stat st;
+	int fd, flags;
+
+	if (readonly)
+		flags = O_RDONLY;
+	else
+		flags = O_RDWR;
+	if (direct)
+		flags |= O_DIRECT;
+
+	if (stat(filename, &st) < 0)
+		return ERR_PTR(-errno);
+
+	/* block device? */
+	disk = blkdev__probe(filename, flags, &st);
+	if (!IS_ERR_OR_NULL(disk)) {
+		disk->readonly = readonly;
+		return disk;
+	}
+
+	fd = open(filename, flags);
+	if (fd < 0)
+		return ERR_PTR(fd);
+
+	/* qcow image? */
+	disk = qcow_probe(fd, true);
+	if (!IS_ERR_OR_NULL(disk)) {
+		pr_warning("Forcing read-only support for QCOW");
+		disk->readonly = true;
+		return disk;
+	}
+
+	/* raw image? */
+	disk = raw_image__probe(fd, &st, readonly);
+	if (!IS_ERR_OR_NULL(disk)) {
+		disk->readonly = readonly;
+		return disk;
+	}
+
+	if (close(fd) < 0)
+		pr_warning("close() failed");
+
+	return ERR_PTR(-ENOSYS);
+}
+
+static struct disk_image **disk_image__open_all(struct kvm *kvm)
+{
+	struct disk_image **disks;
+	const char *filename;
+	const char *wwpn;
+	const char *tpgt;
+	bool readonly;
+	bool direct;
+	void *err;
+	int i;
+	struct disk_image_params *params = (struct disk_image_params *)&kvm->cfg.disk_image;
+	int count = kvm->cfg.image_count;
+
+	if (!count)
+		return ERR_PTR(-EINVAL);
+	if (count > MAX_DISK_IMAGES)
+		return ERR_PTR(-ENOSPC);
+
+	disks = calloc(count, sizeof(*disks));
+	if (!disks)
+		return ERR_PTR(-ENOMEM);
+
+	for (i = 0; i < count; i++) {
+		filename = params[i].filename;
+		readonly = params[i].readonly;
+		direct = params[i].direct;
+		wwpn = params[i].wwpn;
+		tpgt = params[i].tpgt;
+
+		if (wwpn) {
+			disks[i] = malloc(sizeof(struct disk_image));
+			if (!disks[i])
+				return ERR_PTR(-ENOMEM);
+			disks[i]->wwpn = wwpn;
+			disks[i]->tpgt = tpgt;
+			continue;
+		}
+
+		if (!filename)
+			continue;
+
+		disks[i] = disk_image__open(filename, readonly, direct);
+		if (IS_ERR_OR_NULL(disks[i])) {
+			pr_err("Loading disk image '%s' failed", filename);
+			err = disks[i];
+			goto error;
+		}
+		disks[i]->debug_iodelay = kvm->cfg.debug_iodelay;
+	}
+
+	return disks;
+error:
+	for (i = 0; i < count; i++)
+		if (!IS_ERR_OR_NULL(disks[i]))
+			disk_image__close(disks[i]);
+
+	free(disks);
+	return err;
+}
+
+int disk_image__wait(struct disk_image *disk)
+{
+	if (disk->ops->wait)
+		return disk->ops->wait(disk);
+
+	return 0;
+}
+
+int disk_image__flush(struct disk_image *disk)
+{
+	if (disk->ops->flush)
+		return disk->ops->flush(disk);
+
+	return fsync(disk->fd);
+}
+
+static int disk_image__close(struct disk_image *disk)
+{
+	/* If there was no disk image then there's nothing to do: */
+	if (!disk)
+		return 0;
+
+	disk_aio_destroy(disk);
+
+	if (disk->ops->close)
+		return disk->ops->close(disk);
+
+	if (close(disk->fd) < 0)
+		pr_warning("close() failed");
+
+	free(disk);
+
+	return 0;
+}
+
+static int disk_image__close_all(struct disk_image **disks, int count)
+{
+	while (count)
+		disk_image__close(disks[--count]);
+
+	free(disks);
+
+	return 0;
+}
+
+/*
+ * Fill iov with disk data, starting from sector 'sector'.
+ * Returns the number of bytes read.
+ */
+ssize_t disk_image__read(struct disk_image *disk, u64 sector,
+			 const struct iovec *iov, int iovcount, void *param)
+{
+	ssize_t total = 0;
+
+	if (debug_iodelay)
+		msleep(debug_iodelay);
+
+	if (disk->ops->read) {
+		total = disk->ops->read(disk, sector, iov, iovcount, param);
+		if (total < 0) {
+			pr_info("disk_image__read error: total=%ld\n", (long)total);
+			return total;
+		}
+	}
+
+	if (!disk->async && disk->disk_req_cb)
+		disk->disk_req_cb(param, total);
+
+	return total;
+}
+
+/*
+ * Write iov to disk, starting from sector 'sector'.
+ * Returns the number of bytes written.
+ */
+ssize_t disk_image__write(struct disk_image *disk, u64 sector,
+			  const struct iovec *iov, int iovcount, void *param)
+{
+	ssize_t total = 0;
+
+	if (debug_iodelay)
+		msleep(debug_iodelay);
+
+	if (disk->ops->write) {
+		/*
+		 * Try writev based operation first
+		 */
+
+		total = disk->ops->write(disk, sector, iov, iovcount, param);
+		if (total < 0) {
+			pr_info("disk_image__write error: total=%ld\n", (long)total);
+			return total;
+		}
+	}
+
+	if (!disk->async && disk->disk_req_cb)
+		disk->disk_req_cb(param, total);
+
+	return total;
+}
+
+ssize_t disk_image__get_serial(struct disk_image *disk, void *buffer, ssize_t *len)
+{
+	struct stat st;
+	int r;
+
+	r = fstat(disk->fd, &st);
+	if (r)
+		return r;
+
+	*len = snprintf(buffer, *len, "%llu%llu%llu",
+			(unsigned long long)st.st_dev,
+			(unsigned long long)st.st_rdev,
+			(unsigned long long)st.st_ino);
+	return *len;
+}
+
+void disk_image__set_callback(struct disk_image *disk,
+			      void (*disk_req_cb)(void *param, long len))
+{
+	disk->disk_req_cb = disk_req_cb;
+}
+
+int disk_image__init(struct kvm *kvm)
+{
+	if (kvm->cfg.image_count) {
+		kvm->disks = disk_image__open_all(kvm);
+		if (IS_ERR(kvm->disks))
+			return PTR_ERR(kvm->disks);
+	}
+
+	return 0;
+}
+dev_base_init(disk_image__init);
+
+int disk_image__exit(struct kvm *kvm)
+{
+	return disk_image__close_all(kvm->disks, kvm->nr_disks);
+}
+dev_base_exit(disk_image__exit);
diff --git a/kvmtool/disk/qcow.c b/kvmtool/disk/qcow.c
new file mode 100644
index 0000000..dd6be62
--- /dev/null
+++ b/kvmtool/disk/qcow.c
@@ -0,0 +1,1526 @@
+#include "kvm/qcow.h"
+
+#include "kvm/disk-image.h"
+#include "kvm/read-write.h"
+#include "kvm/mutex.h"
+#include "kvm/util.h"
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <errno.h>
+#ifdef CONFIG_HAS_ZLIB
+#include <zlib.h>
+#endif
+
+#include <linux/err.h>
+#include <linux/byteorder.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+
+static int update_cluster_refcount(struct qcow *q, u64 clust_idx, u16 append);
+static int qcow_write_refcount_table(struct qcow *q);
+static u64 qcow_alloc_clusters(struct qcow *q, u64 size, int update_ref);
+static void qcow_free_clusters(struct qcow *q, u64 clust_start, u64 size);
+
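+/* Write and then fdatasync(), so that qcow metadata updates actually reach the disk. */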
+static inline int qcow_pwrite_sync(int fd,
+	void *buf, size_t count, off_t offset)
+{
+	if (pwrite_in_full(fd, buf, count, offset) < 0)
+		return -1;
+
+	return fdatasync(fd);
+}
+
+static int l2_table_insert(struct rb_root *root, struct qcow_l2_table *new)
+{
+	struct rb_node **link = &(root->rb_node), *parent = NULL;
+	u64 offset = new->offset;
+
+	/* search the tree */
+	while (*link) {
+		struct qcow_l2_table *t;
+
+		t = rb_entry(*link, struct qcow_l2_table, node);
+		if (!t)
+			goto error;
+
+		parent = *link;
+
+		if (t->offset > offset)
+			link = &(*link)->rb_left;
+		else if (t->offset < offset)
+			link = &(*link)->rb_right;
+		else
+			goto out;
+	}
+
+	/* add new node */
+	rb_link_node(&new->node, parent, link);
+	rb_insert_color(&new->node, root);
+out:
+	return 0;
+error:
+	return -1;
+}
+
+static struct qcow_l2_table *l2_table_lookup(struct rb_root *root, u64 offset)
+{
+	struct rb_node *link = root->rb_node;
+
+	while (link) {
+		struct qcow_l2_table *t;
+
+		t = rb_entry(link, struct qcow_l2_table, node);
+		if (!t)
+			goto out;
+
+		if (t->offset > offset)
+			link = link->rb_left;
+		else if (t->offset < offset)
+			link = link->rb_right;
+		else
+			return t;
+	}
+out:
+	return NULL;
+}
+
+static void l1_table_free_cache(struct qcow_l1_table *l1t)
+{
+	struct rb_root *r = &l1t->root;
+	struct list_head *pos, *n;
+	struct qcow_l2_table *t;
+
+	list_for_each_safe(pos, n, &l1t->lru_list) {
+		/* Remove cache table from the list and RB tree */
+		list_del(pos);
+		t = list_entry(pos, struct qcow_l2_table, list);
+		rb_erase(&t->node, r);
+
+		/* Free the cached node */
+		free(t);
+	}
+}
+
+static int qcow_l2_cache_write(struct qcow *q, struct qcow_l2_table *c)
+{
+	struct qcow_header *header = q->header;
+	u64 size;
+
+	if (!c->dirty)
+		return 0;
+
+	size = 1 << header->l2_bits;
+
+	if (qcow_pwrite_sync(q->fd, c->table,
+		size * sizeof(u64), c->offset) < 0)
+		return -1;
+
+	c->dirty = 0;
+
+	return 0;
+}
+
+static int cache_table(struct qcow *q, struct qcow_l2_table *c)
+{
+	struct qcow_l1_table *l1t = &q->table;
+	struct rb_root *r = &l1t->root;
+	struct qcow_l2_table *lru;
+
+	if (l1t->nr_cached == MAX_CACHE_NODES) {
+		/*
+		 * The node at the head of the list is the least recently
+		 * used node. Remove it from the list and replace it with
+		 * the new node.
+		 */
+		lru = list_first_entry(&l1t->lru_list, struct qcow_l2_table, list);
+
+		/* Remove the node from the cache */
+		rb_erase(&lru->node, r);
+		list_del_init(&lru->list);
+		l1t->nr_cached--;
+
+		/* Free the LRUed node */
+		free(lru);
+	}
+
+	/* Add the new node to the RB tree for faster lookups */
+	if (l2_table_insert(r, c) < 0)
+		goto error;
+
+	/* Add to the LRU replacement list */
+	list_add_tail(&c->list, &l1t->lru_list);
+	l1t->nr_cached++;
+
+	return 0;
+error:
+	return -1;
+}
+
+static struct qcow_l2_table *l2_table_search(struct qcow *q, u64 offset)
+{
+	struct qcow_l1_table *l1t = &q->table;
+	struct qcow_l2_table *l2t;
+
+	l2t = l2_table_lookup(&l1t->root, offset);
+	if (!l2t)
+		return NULL;
+
+	/* Update the LRU state, by moving the searched node to list tail */
+	list_move_tail(&l2t->list, &l1t->lru_list);
+
+	return l2t;
+}
+
+/* Allocates a new node for caching L2 table */
+static struct qcow_l2_table *new_cache_table(struct qcow *q, u64 offset)
+{
+	struct qcow_header *header = q->header;
+	struct qcow_l2_table *c;
+	u64 l2t_sz;
+	u64 size;
+
+	l2t_sz = 1 << header->l2_bits;
+	size   = sizeof(*c) + l2t_sz * sizeof(u64);
+	c      = calloc(1, size);
+	if (!c)
+		goto out;
+
+	c->offset = offset;
+	RB_CLEAR_NODE(&c->node);
+	INIT_LIST_HEAD(&c->list);
+out:
+	return c;
+}
+
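+/*
+ * A guest image offset decomposes into an L1 table index, an L2 table
+ * index and an intra-cluster offset, using l2_bits and cluster_bits
+ * from the image header.
+ */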
+static inline u64 get_l1_index(struct qcow *q, u64 offset)
+{
+	struct qcow_header *header = q->header;
+
+	return offset >> (header->l2_bits + header->cluster_bits);
+}
+
+static inline u64 get_l2_index(struct qcow *q, u64 offset)
+{
+	struct qcow_header *header = q->header;
+
+	return (offset >> (header->cluster_bits)) & ((1 << header->l2_bits)-1);
+}
+
+static inline u64 get_cluster_offset(struct qcow *q, u64 offset)
+{
+	struct qcow_header *header = q->header;
+
+	return offset & ((1 << header->cluster_bits)-1);
+}
+
+static struct qcow_l2_table *qcow_read_l2_table(struct qcow *q, u64 offset)
+{
+	struct qcow_header *header = q->header;
+	struct qcow_l2_table *l2t;
+	u64 size;
+
+	size = 1 << header->l2_bits;
+
+	/* search an entry for offset in cache */
+	l2t = l2_table_search(q, offset);
+	if (l2t)
+		return l2t;
+
+	/* allocate new node for caching l2 table */
+	l2t = new_cache_table(q, offset);
+	if (!l2t)
+		goto error;
+
+	/* table not cached: read from the disk */
+	if (pread_in_full(q->fd, l2t->table, size * sizeof(u64), offset) < 0)
+		goto error;
+
+	/* cache the table */
+	if (cache_table(q, l2t) < 0)
+		goto error;
+
+	return l2t;
+error:
+	free(l2t);
+	return NULL;
+}
+
+static int qcow_decompress_buffer(u8 *out_buf, int out_buf_size,
+	const u8 *buf, int buf_size)
+{
+#ifdef CONFIG_HAS_ZLIB
+	z_stream strm1, *strm = &strm1;
+	int ret, out_len;
+
+	memset(strm, 0, sizeof(*strm));
+
+	strm->next_in	= (u8 *)buf;
+	strm->avail_in	= buf_size;
+	strm->next_out	= out_buf;
+	strm->avail_out	= out_buf_size;
+
+	ret = inflateInit2(strm, -12);
+	if (ret != Z_OK)
+		return -1;
+
+	ret = inflate(strm, Z_FINISH);
+	out_len = strm->next_out - out_buf;
+	if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) ||
+		out_len != out_buf_size) {
+		inflateEnd(strm);
+		return -1;
+	}
+
+	inflateEnd(strm);
+	return 0;
+#else
+	return -1;
+#endif
+}
+
+static ssize_t qcow1_read_cluster(struct qcow *q, u64 offset,
+	void *dst, u32 dst_len)
+{
+	struct qcow_header *header = q->header;
+	struct qcow_l1_table *l1t = &q->table;
+	struct qcow_l2_table *l2t;
+	u64 clust_offset;
+	u64 clust_start;
+	u64 l2t_offset;
+	size_t length;
+	u64 l2t_size;
+	u64 l1_idx;
+	u64 l2_idx;
+	int coffset;
+	int csize;
+
+	l1_idx = get_l1_index(q, offset);
+	if (l1_idx >= l1t->table_size)
+		return -1;
+
+	clust_offset = get_cluster_offset(q, offset);
+	if (clust_offset >= q->cluster_size)
+		return -1;
+
+	length = q->cluster_size - clust_offset;
+	if (length > dst_len)
+		length = dst_len;
+
+	mutex_lock(&q->mutex);
+
+	l2t_offset = be64_to_cpu(l1t->l1_table[l1_idx]);
+	if (!l2t_offset)
+		goto zero_cluster;
+
+	l2t_size = 1 << header->l2_bits;
+
+	/* read and cache level 2 table */
+	l2t = qcow_read_l2_table(q, l2t_offset);
+	if (!l2t)
+		goto out_error;
+
+	l2_idx = get_l2_index(q, offset);
+	if (l2_idx >= l2t_size)
+		goto out_error;
+
+	clust_start = be64_to_cpu(l2t->table[l2_idx]);
+	if (clust_start & QCOW1_OFLAG_COMPRESSED) {
+		coffset	= clust_start & q->cluster_offset_mask;
+		csize	= clust_start >> (63 - q->header->cluster_bits);
+		csize	&= (q->cluster_size - 1);
+
+		if (pread_in_full(q->fd, q->cluster_data, csize,
+				  coffset) < 0)
+			goto out_error;
+
+		if (qcow_decompress_buffer(q->cluster_cache, q->cluster_size,
+					q->cluster_data, csize) < 0)
+			goto out_error;
+
+		memcpy(dst, q->cluster_cache + clust_offset, length);
+		mutex_unlock(&q->mutex);
+	} else {
+		if (!clust_start)
+			goto zero_cluster;
+
+		mutex_unlock(&q->mutex);
+
+		if (pread_in_full(q->fd, dst, length,
+				  clust_start + clust_offset) < 0)
+			return -1;
+	}
+
+	return length;
+
+zero_cluster:
+	mutex_unlock(&q->mutex);
+	memset(dst, 0, length);
+	return length;
+
+out_error:
+	mutex_unlock(&q->mutex);
+	return -1;
+}
+
+static ssize_t qcow2_read_cluster(struct qcow *q, u64 offset,
+	void *dst, u32 dst_len)
+{
+	struct qcow_header *header = q->header;
+	struct qcow_l1_table *l1t = &q->table;
+	struct qcow_l2_table *l2t;
+	u64 clust_offset;
+	u64 clust_start;
+	u64 l2t_offset;
+	size_t length;
+	u64 l2t_size;
+	u64 l1_idx;
+	u64 l2_idx;
+	int coffset;
+	int sector_offset;
+	int nb_csectors;
+	int csize;
+
+	l1_idx = get_l1_index(q, offset);
+	if (l1_idx >= l1t->table_size)
+		return -1;
+
+	clust_offset = get_cluster_offset(q, offset);
+	if (clust_offset >= q->cluster_size)
+		return -1;
+
+	length = q->cluster_size - clust_offset;
+	if (length > dst_len)
+		length = dst_len;
+
+	mutex_lock(&q->mutex);
+
+	l2t_offset = be64_to_cpu(l1t->l1_table[l1_idx]);
+
+	l2t_offset &= ~QCOW2_OFLAG_COPIED;
+	if (!l2t_offset)
+		goto zero_cluster;
+
+	l2t_size = 1 << header->l2_bits;
+
+	/* read and cache level 2 table */
+	l2t = qcow_read_l2_table(q, l2t_offset);
+	if (!l2t)
+		goto out_error;
+
+	l2_idx = get_l2_index(q, offset);
+	if (l2_idx >= l2t_size)
+		goto out_error;
+
+	clust_start = be64_to_cpu(l2t->table[l2_idx]);
+	if (clust_start & QCOW2_OFLAG_COMPRESSED) {
+		coffset = clust_start & q->cluster_offset_mask;
+		nb_csectors = ((clust_start >> q->csize_shift)
+			& q->csize_mask) + 1;
+		sector_offset = coffset & (SECTOR_SIZE - 1);
+		csize = nb_csectors * SECTOR_SIZE - sector_offset;
+
+		if (pread_in_full(q->fd, q->cluster_data,
+				  nb_csectors * SECTOR_SIZE,
+				  coffset & ~(SECTOR_SIZE - 1)) < 0) {
+			goto out_error;
+		}
+
+		if (qcow_decompress_buffer(q->cluster_cache, q->cluster_size,
+					q->cluster_data + sector_offset,
+					csize) < 0) {
+			goto out_error;
+		}
+
+		memcpy(dst, q->cluster_cache + clust_offset, length);
+		mutex_unlock(&q->mutex);
+	} else {
+		clust_start &= QCOW2_OFFSET_MASK;
+		if (!clust_start)
+			goto zero_cluster;
+
+		mutex_unlock(&q->mutex);
+
+		if (pread_in_full(q->fd, dst, length,
+				  clust_start + clust_offset) < 0)
+			return -1;
+	}
+
+	return length;
+
+zero_cluster:
+	mutex_unlock(&q->mutex);
+	memset(dst, 0, length);
+	return length;
+
+out_error:
+	mutex_unlock(&q->mutex);
+	return -1;
+}
+
+static ssize_t qcow_read_sector_single(struct disk_image *disk, u64 sector,
+	void *dst, u32 dst_len)
+{
+	struct qcow *q = disk->priv;
+	struct qcow_header *header = q->header;
+	u32 nr_read;
+	u64 offset;
+	char *buf;
+	ssize_t nr;
+
+	buf = dst;
+	nr_read = 0;
+
+	while (nr_read < dst_len) {
+		offset = sector << SECTOR_SHIFT;
+		if (offset >= header->size)
+			return -1;
+
+		if (q->version == QCOW1_VERSION)
+			nr = qcow1_read_cluster(q, offset, buf,
+				dst_len - nr_read);
+		else
+			nr = qcow2_read_cluster(q, offset, buf,
+				dst_len - nr_read);
+
+		if (nr <= 0)
+			return -1;
+
+		nr_read	+= nr;
+		buf	+= nr;
+		sector	+= (nr >> SECTOR_SHIFT);
+	}
+
+	return dst_len;
+}
+
+static ssize_t qcow_read_sector(struct disk_image *disk, u64 sector,
+				const struct iovec *iov, int iovcount, void *param)
+{
+	ssize_t nr, total = 0;
+
+	while (iovcount--) {
+		nr = qcow_read_sector_single(disk, sector, iov->iov_base, iov->iov_len);
+		if (nr != (ssize_t)iov->iov_len) {
+			pr_info("qcow_read_sector error: nr=%ld iov_len=%ld\n", (long)nr, (long)iov->iov_len);
+			return -1;
+		}
+
+		sector += iov->iov_len >> SECTOR_SHIFT;
+		total += nr;
+		iov++;
+	}
+
+	return total;
+}
+
+static void refcount_table_free_cache(struct qcow_refcount_table *rft)
+{
+	struct rb_root *r = &rft->root;
+	struct list_head *pos, *n;
+	struct qcow_refcount_block *t;
+
+	list_for_each_safe(pos, n, &rft->lru_list) {
+		list_del(pos);
+		t = list_entry(pos, struct qcow_refcount_block, list);
+		rb_erase(&t->node, r);
+
+		free(t);
+	}
+}
+
+static int refcount_block_insert(struct rb_root *root, struct qcow_refcount_block *new)
+{
+	struct rb_node **link = &(root->rb_node), *parent = NULL;
+	u64 offset = new->offset;
+
+	/* search the tree */
+	while (*link) {
+		struct qcow_refcount_block *t;
+
+		t = rb_entry(*link, struct qcow_refcount_block, node);
+		if (!t)
+			goto error;
+
+		parent = *link;
+
+		if (t->offset > offset)
+			link = &(*link)->rb_left;
+		else if (t->offset < offset)
+			link = &(*link)->rb_right;
+		else
+			goto out;
+	}
+
+	/* add new node */
+	rb_link_node(&new->node, parent, link);
+	rb_insert_color(&new->node, root);
+out:
+	return 0;
+error:
+	return -1;
+}
+
+static int write_refcount_block(struct qcow *q, struct qcow_refcount_block *rfb)
+{
+	if (!rfb->dirty)
+		return 0;
+
+	if (qcow_pwrite_sync(q->fd, rfb->entries,
+		rfb->size * sizeof(u16), rfb->offset) < 0)
+		return -1;
+
+	rfb->dirty = 0;
+
+	return 0;
+}
+
+static int cache_refcount_block(struct qcow *q, struct qcow_refcount_block *c)
+{
+	struct qcow_refcount_table *rft = &q->refcount_table;
+	struct rb_root *r = &rft->root;
+	struct qcow_refcount_block *lru;
+
+	if (rft->nr_cached == MAX_CACHE_NODES) {
+		lru = list_first_entry(&rft->lru_list, struct qcow_refcount_block, list);
+
+		rb_erase(&lru->node, r);
+		list_del_init(&lru->list);
+		rft->nr_cached--;
+
+		free(lru);
+	}
+
+	if (refcount_block_insert(r, c) < 0)
+		goto error;
+
+	list_add_tail(&c->list, &rft->lru_list);
+	rft->nr_cached++;
+
+	return 0;
+error:
+	return -1;
+}
+
+static struct qcow_refcount_block *new_refcount_block(struct qcow *q, u64 rfb_offset)
+{
+	struct qcow_refcount_block *rfb;
+
+	rfb = malloc(sizeof *rfb + q->cluster_size);
+	if (!rfb)
+		return NULL;
+
+	rfb->offset = rfb_offset;
+	rfb->size = q->cluster_size / sizeof(u16);
+	RB_CLEAR_NODE(&rfb->node);
+	INIT_LIST_HEAD(&rfb->list);
+
+	return rfb;
+}
+
+static struct qcow_refcount_block *refcount_block_lookup(struct rb_root *root, u64 offset)
+{
+	struct rb_node *link = root->rb_node;
+
+	while (link) {
+		struct qcow_refcount_block *t;
+
+		t = rb_entry(link, struct qcow_refcount_block, node);
+		if (!t)
+			goto out;
+
+		if (t->offset > offset)
+			link = link->rb_left;
+		else if (t->offset < offset)
+			link = link->rb_right;
+		else
+			return t;
+	}
+out:
+	return NULL;
+}
+
+static struct qcow_refcount_block *refcount_block_search(struct qcow *q, u64 offset)
+{
+	struct qcow_refcount_table *rft = &q->refcount_table;
+	struct qcow_refcount_block *rfb;
+
+	rfb = refcount_block_lookup(&rft->root, offset);
+	if (!rfb)
+		return NULL;
+
+	/* Update the LRU state by moving this node to the list tail */
+	list_move_tail(&rfb->list, &rft->lru_list);
+
+	return rfb;
+}
+
+static struct qcow_refcount_block *qcow_grow_refcount_block(struct qcow *q,
+	u64 clust_idx)
+{
+	struct qcow_header *header = q->header;
+	struct qcow_refcount_table *rft = &q->refcount_table;
+	struct qcow_refcount_block *rfb;
+	u64 new_block_offset;
+	u64 rft_idx;
+
+	rft_idx = clust_idx >> (header->cluster_bits -
+		QCOW_REFCOUNT_BLOCK_SHIFT);
+
+	if (rft_idx >= rft->rf_size) {
+		pr_warning("Don't support grow refcount block table");
+		return NULL;
+	}
+
+	new_block_offset = qcow_alloc_clusters(q, q->cluster_size, 0);
+	if (new_block_offset == (u64)-1)
+		return NULL;
+
+	rfb = new_refcount_block(q, new_block_offset);
+	if (!rfb)
+		return NULL;
+
+	memset(rfb->entries, 0x00, q->cluster_size);
+	rfb->dirty = 1;
+
+	/* write refcount block */
+	if (write_refcount_block(q, rfb) < 0)
+		goto free_rfb;
+
+	if (cache_refcount_block(q, rfb) < 0)
+		goto free_rfb;
+
+	rft->rf_table[rft_idx] = cpu_to_be64(new_block_offset);
+	if (update_cluster_refcount(q, new_block_offset >>
+		    header->cluster_bits, 1) < 0)
+		goto recover_rft;
+
+	if (qcow_write_refcount_table(q) < 0)
+		goto recover_rft;
+
+	return rfb;
+
+recover_rft:
+	rft->rf_table[rft_idx] = 0;
+free_rfb:
+	free(rfb);
+	return NULL;
+}
+
+static struct qcow_refcount_block *qcow_read_refcount_block(struct qcow *q, u64 clust_idx)
+{
+	struct qcow_header *header = q->header;
+	struct qcow_refcount_table *rft = &q->refcount_table;
+	struct qcow_refcount_block *rfb;
+	u64 rfb_offset;
+	u64 rft_idx;
+
+	rft_idx = clust_idx >> (header->cluster_bits - QCOW_REFCOUNT_BLOCK_SHIFT);
+	if (rft_idx >= rft->rf_size)
+		return ERR_PTR(-ENOSPC);
+
+	rfb_offset = be64_to_cpu(rft->rf_table[rft_idx]);
+	if (!rfb_offset)
+		return ERR_PTR(-ENOSPC);
+
+	rfb = refcount_block_search(q, rfb_offset);
+	if (rfb)
+		return rfb;
+
+	rfb = new_refcount_block(q, rfb_offset);
+	if (!rfb)
+		return NULL;
+
+	if (pread_in_full(q->fd, rfb->entries, rfb->size * sizeof(u16), rfb_offset) < 0)
+		goto error_free_rfb;
+
+	if (cache_refcount_block(q, rfb) < 0)
+		goto error_free_rfb;
+
+	return rfb;
+
+error_free_rfb:
+	free(rfb);
+
+	return NULL;
+}
+
+static u16 qcow_get_refcount(struct qcow *q, u64 clust_idx)
+{
+	struct qcow_refcount_block *rfb = NULL;
+	struct qcow_header *header = q->header;
+	u64 rfb_idx;
+
+	rfb = qcow_read_refcount_block(q, clust_idx);
+	if (PTR_ERR(rfb) == -ENOSPC)
+		return 0;
+	else if (IS_ERR_OR_NULL(rfb)) {
+		pr_warning("Error while reading refcount table");
+		return -1;
+	}
+
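+	/* each refcount is a u16, so a block holds cluster_size / 2 entries */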
+	rfb_idx = clust_idx & (((1ULL <<
+		(header->cluster_bits - QCOW_REFCOUNT_BLOCK_SHIFT)) - 1));
+
+	if (rfb_idx >= rfb->size) {
+		pr_warning("L1: refcount block index out of bounds");
+		return -1;
+	}
+
+	return be16_to_cpu(rfb->entries[rfb_idx]);
+}
+
+static int update_cluster_refcount(struct qcow *q, u64 clust_idx, u16 append)
+{
+	struct qcow_refcount_block *rfb = NULL;
+	struct qcow_header *header = q->header;
+	u16 refcount;
+	u64 rfb_idx;
+
+	rfb = qcow_read_refcount_block(q, clust_idx);
+	if (PTR_ERR(rfb) == -ENOSPC) {
+		rfb = qcow_grow_refcount_block(q, clust_idx);
+		if (!rfb) {
+			pr_warning("error while growing refcount table");
+			return -1;
+		}
+	} else if (IS_ERR_OR_NULL(rfb)) {
+		pr_warning("error while reading refcount table");
+		return -1;
+	}
+
+	rfb_idx = clust_idx & (((1ULL <<
+		(header->cluster_bits - QCOW_REFCOUNT_BLOCK_SHIFT)) - 1));
+	if (rfb_idx >= rfb->size) {
+		pr_warning("refcount block index out of bounds");
+		return -1;
+	}
+
+	refcount = be16_to_cpu(rfb->entries[rfb_idx]) + append;
+	rfb->entries[rfb_idx] = cpu_to_be16(refcount);
+	rfb->dirty = 1;
+
+	/* write refcount block */
+	if (write_refcount_block(q, rfb) < 0) {
+		pr_warning("refcount block index out of bounds");
+		return -1;
+	}
+
+	/* update free_clust_idx since refcount becomes zero */
+	if (!refcount && clust_idx < q->free_clust_idx)
+		q->free_clust_idx = clust_idx;
+
+	return 0;
+}
+
+static void qcow_free_clusters(struct qcow *q, u64 clust_start, u64 size)
+{
+	struct qcow_header *header = q->header;
+	u64 start, end, offset;
+
+	start = clust_start & ~(q->cluster_size - 1);
+	end = (clust_start + size - 1) & ~(q->cluster_size - 1);
+	for (offset = start; offset <= end; offset += q->cluster_size)
+		update_cluster_refcount(q, offset >> header->cluster_bits, -1);
+}
+
+/*
+ * Allocate clusters for the given size. Find a position that can
+ * satisfy the request; free_clust_idx is initialized to zero and
+ * records the last position scanned, so the search resumes from there.
+ */
+static u64 qcow_alloc_clusters(struct qcow *q, u64 size, int update_ref)
+{
+	struct qcow_header *header = q->header;
+	u16 clust_refcount;
+	u32 clust_idx = 0, i;
+	u64 clust_num;
+
+	clust_num = (size + (q->cluster_size - 1)) >> header->cluster_bits;
+
+again:
+	for (i = 0; i < clust_num; i++) {
+		clust_idx = q->free_clust_idx++;
+		clust_refcount = qcow_get_refcount(q, clust_idx);
+		if (clust_refcount == (u16)-1)
+			return -1;
+		else if (clust_refcount > 0)
+			goto again;
+	}
+
+	clust_idx++;
+
+	if (update_ref)
+		for (i = 0; i < clust_num; i++)
+			if (update_cluster_refcount(q,
+				clust_idx - clust_num + i, 1))
+				return -1;
+
+	return (clust_idx - clust_num) << header->cluster_bits;
+}
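+
+/*
+ * Illustrative pairing (a sketch, not a new call site): callers free the
+ * clusters again if a later step fails, as qcow_write_cluster() does:
+ *
+ *	u64 start = qcow_alloc_clusters(q, q->cluster_size, 1);
+ *	if (start == (u64)-1)
+ *		return -1;
+ *	if (pwrite_in_full(q->fd, buf, q->cluster_size, start) < 0)
+ *		qcow_free_clusters(q, start, q->cluster_size);
+ */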
+
+static int qcow_write_l1_table(struct qcow *q)
+{
+	struct qcow_l1_table *l1t = &q->table;
+	struct qcow_header *header = q->header;
+
+	if (qcow_pwrite_sync(q->fd, l1t->l1_table,
+		l1t->table_size * sizeof(u64),
+		header->l1_table_offset) < 0)
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Get the L2 table. If the table has been copied, read it directly.
+ * Otherwise allocate a new cluster, copy the existing table into it
+ * (or zero it if there is none), and point the L1 entry at the copy.
+ */
+static int get_cluster_table(struct qcow *q, u64 offset,
+	struct qcow_l2_table **result_l2t, u64 *result_l2_index)
+{
+	struct qcow_header *header = q->header;
+	struct qcow_l1_table *l1t = &q->table;
+	struct qcow_l2_table *l2t;
+	u64 l1t_idx;
+	u64 l2t_offset;
+	u64 l2t_idx;
+	u64 l2t_size;
+	u64 l2t_new_offset;
+
+	l2t_size = 1 << header->l2_bits;
+
+	l1t_idx = get_l1_index(q, offset);
+	if (l1t_idx >= l1t->table_size)
+		return -1;
+
+	l2t_idx = get_l2_index(q, offset);
+	if (l2t_idx >= l2t_size)
+		return -1;
+
+	l2t_offset = be64_to_cpu(l1t->l1_table[l1t_idx]);
+	if (l2t_offset & QCOW2_OFLAG_COPIED) {
+		l2t_offset &= ~QCOW2_OFLAG_COPIED;
+		l2t = qcow_read_l2_table(q, l2t_offset);
+		if (!l2t)
+			goto error;
+	} else {
+		l2t_new_offset = qcow_alloc_clusters(q,
+			l2t_size*sizeof(u64), 1);
+
+		if (l2t_new_offset == (u64)-1)
+			goto error;
+
+		l2t = new_cache_table(q, l2t_new_offset);
+		if (!l2t)
+			goto free_cluster;
+
+		if (l2t_offset) {
+			l2t = qcow_read_l2_table(q, l2t_offset);
+			if (!l2t)
+				goto free_cache;
+		} else
+			memset(l2t->table, 0x00, l2t_size * sizeof(u64));
+
+		/* write l2 table */
+		l2t->dirty = 1;
+		if (qcow_l2_cache_write(q, l2t) < 0)
+			goto free_cache;
+
+		/* cache l2 table */
+		if (cache_table(q, l2t))
+			goto free_cache;
+
+		/* update the l1 table */
+		l1t->l1_table[l1t_idx] = cpu_to_be64(l2t_new_offset
+			| QCOW2_OFLAG_COPIED);
+		if (qcow_write_l1_table(q)) {
+			pr_warning("Update l1 table error");
+			goto free_cache;
+		}
+
+		/* free old cluster */
+		qcow_free_clusters(q, l2t_offset, q->cluster_size);
+	}
+
+	*result_l2t = l2t;
+	*result_l2_index = l2t_idx;
+
+	return 0;
+
+free_cache:
+	free(l2t);
+
+free_cluster:
+	qcow_free_clusters(q, l2t_new_offset, q->cluster_size);
+
+error:
+	return -1;
+}
+
+/*
+ * If the cluster has already been copied, write the data directly.
+ * If not, allocate a new cluster, merge the original data with the
+ * new data, and write the result to the new cluster.
+ */
+static ssize_t qcow_write_cluster(struct qcow *q, u64 offset,
+		void *buf, u32 src_len)
+{
+	struct qcow_l2_table *l2t;
+	u64 clust_new_start;
+	u64 clust_start;
+	u64 clust_flags;
+	u64 clust_off;
+	u64 l2t_idx;
+	u64 len;
+
+	l2t = NULL;
+
+	clust_off = get_cluster_offset(q, offset);
+	if (clust_off >= q->cluster_size)
+		return -1;
+
+	len = q->cluster_size - clust_off;
+	if (len > src_len)
+		len = src_len;
+
+	mutex_lock(&q->mutex);
+
+	if (get_cluster_table(q, offset, &l2t, &l2t_idx)) {
+		pr_warning("Get l2 table error");
+		goto error;
+	}
+
+	clust_start = be64_to_cpu(l2t->table[l2t_idx]);
+	clust_flags = clust_start & QCOW2_OFLAGS_MASK;
+
+	clust_start &= QCOW2_OFFSET_MASK;
+	if (!(clust_flags & QCOW2_OFLAG_COPIED)) {
+		clust_new_start	= qcow_alloc_clusters(q, q->cluster_size, 1);
+		if (clust_new_start == (u64)-1) {
+			pr_warning("Cluster alloc error");
+			goto error;
+		}
+
+		offset &= ~(q->cluster_size - 1);
+
+		/* if clust_start is not zero, read the original data */
+		if (clust_start) {
+			mutex_unlock(&q->mutex);
+			if (qcow2_read_cluster(q, offset, q->copy_buff,
+				q->cluster_size) < 0) {
+				pr_warning("Read copy cluster error");
+				qcow_free_clusters(q, clust_new_start,
+					q->cluster_size);
+				return -1;
+			}
+			mutex_lock(&q->mutex);
+		} else
+			memset(q->copy_buff, 0x00, q->cluster_size);
+
+		memcpy(q->copy_buff + clust_off, buf, len);
+
+		 /* Write actual data */
+		if (pwrite_in_full(q->fd, q->copy_buff, q->cluster_size,
+			clust_new_start) < 0)
+			goto free_cluster;
+
+		/* update the L2 table */
+		l2t->table[l2t_idx] = cpu_to_be64(clust_new_start
+			| QCOW2_OFLAG_COPIED);
+		l2t->dirty = 1;
+
+		if (qcow_l2_cache_write(q, l2t))
+			goto free_cluster;
+
+		/* free the old cluster */
+		if (clust_flags & QCOW2_OFLAG_COMPRESSED) {
+			int size;
+			size = ((clust_start >> q->csize_shift) &
+				q->csize_mask) + 1;
+			size *= 512;
+			clust_start &= q->cluster_offset_mask;
+			clust_start &= ~511;
+
+			qcow_free_clusters(q, clust_start, size);
+		} else if (clust_start)
+			qcow_free_clusters(q, clust_start, q->cluster_size);
+
+	} else {
+		/* Write actual data */
+		if (pwrite_in_full(q->fd, buf, len,
+			clust_start + clust_off) < 0)
+			goto error;
+	}
+	mutex_unlock(&q->mutex);
+	return len;
+
+free_cluster:
+	qcow_free_clusters(q, clust_new_start, q->cluster_size);
+
+error:
+	mutex_unlock(&q->mutex);
+	return -1;
+}
+
+static ssize_t qcow_write_sector_single(struct disk_image *disk, u64 sector, void *src, u32 src_len)
+{
+	struct qcow *q = disk->priv;
+	struct qcow_header *header = q->header;
+	u32 nr_written;
+	char *buf;
+	u64 offset;
+	ssize_t nr;
+
+	buf		= src;
+	nr_written	= 0;
+	offset		= sector << SECTOR_SHIFT;
+
+	while (nr_written < src_len) {
+		if (offset >= header->size)
+			return -1;
+
+		nr = qcow_write_cluster(q, offset, buf, src_len - nr_written);
+		if (nr < 0)
+			return -1;
+
+		nr_written	+= nr;
+		buf		+= nr;
+		offset		+= nr;
+	}
+
+	return nr_written;
+}
+
+static ssize_t qcow_write_sector(struct disk_image *disk, u64 sector,
+				const struct iovec *iov, int iovcount, void *param)
+{
+	ssize_t nr, total = 0;
+
+	while (iovcount--) {
+		nr = qcow_write_sector_single(disk, sector, iov->iov_base, iov->iov_len);
+		if (nr != (ssize_t)iov->iov_len) {
+			pr_info("qcow_write_sector error: nr=%ld iov_len=%ld\n", (long)nr, (long)iov->iov_len);
+			return -1;
+		}
+
+		sector	+= iov->iov_len >> SECTOR_SHIFT;
+		iov++;
+		total	+= nr;
+	}
+
+	return total;
+}
+
+static int qcow_disk_flush(struct disk_image *disk)
+{
+	struct qcow *q = disk->priv;
+	struct qcow_refcount_table *rft;
+	struct list_head *pos, *n;
+	struct qcow_l1_table *l1t;
+
+	l1t = &q->table;
+	rft = &q->refcount_table;
+
+	mutex_lock(&q->mutex);
+
+	list_for_each_safe(pos, n, &rft->lru_list) {
+		struct qcow_refcount_block *c = list_entry(pos, struct qcow_refcount_block, list);
+
+		if (write_refcount_block(q, c) < 0)
+			goto error_unlock;
+	}
+
+	list_for_each_safe(pos, n, &l1t->lru_list) {
+		struct qcow_l2_table *c = list_entry(pos, struct qcow_l2_table, list);
+
+		if (qcow_l2_cache_write(q, c) < 0)
+			goto error_unlock;
+	}
+
+	if (qcow_write_l1_table(q) < 0)
+		goto error_unlock;
+
+	mutex_unlock(&q->mutex);
+
+	return fsync(disk->fd);
+
+error_unlock:
+	mutex_unlock(&q->mutex);
+	return -1;
+}
+
+static int qcow_disk_close(struct disk_image *disk)
+{
+	struct qcow *q;
+
+	if (!disk)
+		return 0;
+
+	q = disk->priv;
+
+	refcount_table_free_cache(&q->refcount_table);
+	l1_table_free_cache(&q->table);
+	free(q->copy_buff);
+	free(q->cluster_data);
+	free(q->cluster_cache);
+	free(q->refcount_table.rf_table);
+	free(q->table.l1_table);
+	free(q->header);
+	free(q);
+
+	return 0;
+}
+
+static struct disk_image_operations qcow_disk_readonly_ops = {
+	.read	= qcow_read_sector,
+	.close	= qcow_disk_close,
+};
+
+static struct disk_image_operations qcow_disk_ops = {
+	.read	= qcow_read_sector,
+	.write	= qcow_write_sector,
+	.flush	= qcow_disk_flush,
+	.close	= qcow_disk_close,
+};
+
+static int qcow_read_refcount_table(struct qcow *q)
+{
+	struct qcow_header *header = q->header;
+	struct qcow_refcount_table *rft = &q->refcount_table;
+
+	rft->rf_size = (header->refcount_table_size * q->cluster_size)
+		/ sizeof(u64);
+
+	rft->rf_table = calloc(rft->rf_size, sizeof(u64));
+	if (!rft->rf_table)
+		return -1;
+
+	rft->root = (struct rb_root) RB_ROOT;
+	INIT_LIST_HEAD(&rft->lru_list);
+
+	return pread_in_full(q->fd, rft->rf_table, sizeof(u64) * rft->rf_size, header->refcount_table_offset);
+}
+
+static int qcow_write_refcount_table(struct qcow *q)
+{
+	struct qcow_header *header = q->header;
+	struct qcow_refcount_table *rft = &q->refcount_table;
+
+	return qcow_pwrite_sync(q->fd, rft->rf_table,
+		rft->rf_size * sizeof(u64), header->refcount_table_offset);
+}
+
+static int qcow_read_l1_table(struct qcow *q)
+{
+	struct qcow_header *header = q->header;
+	struct qcow_l1_table *table = &q->table;
+
+	table->table_size = header->l1_size;
+
+	table->l1_table	= calloc(table->table_size, sizeof(u64));
+	if (!table->l1_table)
+		return -1;
+
+	return pread_in_full(q->fd, table->l1_table, sizeof(u64) * table->table_size, header->l1_table_offset);
+}
+
+static void *qcow2_read_header(int fd)
+{
+	struct qcow2_header_disk f_header;
+	struct qcow_header *header;
+
+	header = malloc(sizeof(struct qcow_header));
+	if (!header)
+		return NULL;
+
+	if (pread_in_full(fd, &f_header, sizeof(struct qcow2_header_disk), 0) < 0) {
+		free(header);
+		return NULL;
+	}
+
+	be32_to_cpus(&f_header.magic);
+	be32_to_cpus(&f_header.version);
+	be64_to_cpus(&f_header.backing_file_offset);
+	be32_to_cpus(&f_header.backing_file_size);
+	be32_to_cpus(&f_header.cluster_bits);
+	be64_to_cpus(&f_header.size);
+	be32_to_cpus(&f_header.crypt_method);
+	be32_to_cpus(&f_header.l1_size);
+	be64_to_cpus(&f_header.l1_table_offset);
+	be64_to_cpus(&f_header.refcount_table_offset);
+	be32_to_cpus(&f_header.refcount_table_clusters);
+	be32_to_cpus(&f_header.nb_snapshots);
+	be64_to_cpus(&f_header.snapshots_offset);
+
+	*header		= (struct qcow_header) {
+		.size			= f_header.size,
+		.l1_table_offset	= f_header.l1_table_offset,
+		.l1_size		= f_header.l1_size,
+		.cluster_bits		= f_header.cluster_bits,
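+		/* each L2 entry is a u64, so one cluster holds 2^(cluster_bits - 3) entries */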
+		.l2_bits		= f_header.cluster_bits - 3,
+		.refcount_table_offset	= f_header.refcount_table_offset,
+		.refcount_table_size	= f_header.refcount_table_clusters,
+	};
+
+	return header;
+}
+
+static struct disk_image *qcow2_probe(int fd, bool readonly)
+{
+	struct disk_image *disk_image;
+	struct qcow_l1_table *l1t;
+	struct qcow_header *h;
+	struct qcow *q;
+
+	q = calloc(1, sizeof(struct qcow));
+	if (!q)
+		return NULL;
+
+	mutex_init(&q->mutex);
+	q->fd = fd;
+
+	l1t = &q->table;
+
+	l1t->root = (struct rb_root) RB_ROOT;
+	INIT_LIST_HEAD(&l1t->lru_list);
+
+	h = q->header = qcow2_read_header(fd);
+	if (!h)
+		goto free_qcow;
+
+	q->version = QCOW2_VERSION;
+	q->csize_shift = (62 - (q->header->cluster_bits - 8));
+	q->csize_mask = (1 << (q->header->cluster_bits - 8)) - 1;
+	q->cluster_offset_mask = (1LL << q->csize_shift) - 1;
+	q->cluster_size = 1 << q->header->cluster_bits;
+
+	q->copy_buff = malloc(q->cluster_size);
+	if (!q->copy_buff) {
+		pr_warning("copy buff malloc error");
+		goto free_header;
+	}
+
+	q->cluster_data = malloc(q->cluster_size);
+	if (!q->cluster_data) {
+		pr_warning("cluster data malloc error");
+		goto free_copy_buff;
+	}
+
+	q->cluster_cache = malloc(q->cluster_size);
+	if (!q->cluster_cache) {
+		pr_warning("cluster cache malloc error");
+		goto free_cluster_data;
+	}
+
+	if (qcow_read_l1_table(q) < 0)
+		goto free_cluster_cache;
+
+	if (qcow_read_refcount_table(q) < 0)
+		goto free_l1_table;
+
+	/*
+	 * Do not use mmap; use read/write instead.
+	 */
+	if (readonly)
+		disk_image = disk_image__new(fd, h->size, &qcow_disk_readonly_ops, DISK_IMAGE_REGULAR);
+	else
+		disk_image = disk_image__new(fd, h->size, &qcow_disk_ops, DISK_IMAGE_REGULAR);
+
+	if (IS_ERR_OR_NULL(disk_image))
+		goto free_refcount_table;
+
+	disk_image->priv = q;
+
+	return disk_image;
+
+free_refcount_table:
+	if (q->refcount_table.rf_table)
+		free(q->refcount_table.rf_table);
+free_l1_table:
+	if (q->table.l1_table)
+		free(q->table.l1_table);
+free_cluster_cache:
+	if (q->cluster_cache)
+		free(q->cluster_cache);
+free_cluster_data:
+	if (q->cluster_data)
+		free(q->cluster_data);
+free_copy_buff:
+	if (q->copy_buff)
+		free(q->copy_buff);
+free_header:
+	if (q->header)
+		free(q->header);
+free_qcow:
+	free(q);
+
+	return NULL;
+}
+
+static bool qcow2_check_image(int fd)
+{
+	struct qcow2_header_disk f_header;
+
+	if (pread_in_full(fd, &f_header, sizeof(struct qcow2_header_disk), 0) < 0)
+		return false;
+
+	be32_to_cpus(&f_header.magic);
+	be32_to_cpus(&f_header.version);
+
+	if (f_header.magic != QCOW_MAGIC)
+		return false;
+
+	if (f_header.version != QCOW2_VERSION)
+		return false;
+
+	return true;
+}
+
+static void *qcow1_read_header(int fd)
+{
+	struct qcow1_header_disk f_header;
+	struct qcow_header *header;
+
+	header = malloc(sizeof(struct qcow_header));
+	if (!header)
+		return NULL;
+
+	if (pread_in_full(fd, &f_header, sizeof(struct qcow1_header_disk), 0) < 0) {
+		free(header);
+		return NULL;
+	}
+
+	be32_to_cpus(&f_header.magic);
+	be32_to_cpus(&f_header.version);
+	be64_to_cpus(&f_header.backing_file_offset);
+	be32_to_cpus(&f_header.backing_file_size);
+	be32_to_cpus(&f_header.mtime);
+	be64_to_cpus(&f_header.size);
+	be32_to_cpus(&f_header.crypt_method);
+	be64_to_cpus(&f_header.l1_table_offset);
+
+	*header		= (struct qcow_header) {
+		.size			= f_header.size,
+		.l1_table_offset	= f_header.l1_table_offset,
+		.l1_size		= f_header.size / ((1 << f_header.l2_bits) * (1 << f_header.cluster_bits)),
+		.cluster_bits		= f_header.cluster_bits,
+		.l2_bits		= f_header.l2_bits,
+	};
+
+	return header;
+}
+
+static struct disk_image *qcow1_probe(int fd, bool readonly)
+{
+	struct disk_image *disk_image;
+	struct qcow_l1_table *l1t;
+	struct qcow_header *h;
+	struct qcow *q;
+
+	q = calloc(1, sizeof(struct qcow));
+	if (!q)
+		return NULL;
+
+	mutex_init(&q->mutex);
+	q->fd = fd;
+
+	l1t = &q->table;
+
+	l1t->root = (struct rb_root)RB_ROOT;
+	INIT_LIST_HEAD(&l1t->lru_list);
+	INIT_LIST_HEAD(&q->refcount_table.lru_list);
+
+	h = q->header = qcow1_read_header(fd);
+	if (!h)
+		goto free_qcow;
+
+	q->version = QCOW1_VERSION;
+	q->cluster_size = 1 << q->header->cluster_bits;
+	q->cluster_offset_mask = (1LL << (63 - q->header->cluster_bits)) - 1;
+	q->free_clust_idx = 0;
+
+	q->cluster_data = malloc(q->cluster_size);
+	if (!q->cluster_data) {
+		pr_warning("cluster data malloc error");
+		goto free_header;
+	}
+
+	q->cluster_cache = malloc(q->cluster_size);
+	if (!q->cluster_cache) {
+		pr_warning("cluster cache malloc error");
+		goto free_cluster_data;
+	}
+
+	if (qcow_read_l1_table(q) < 0)
+		goto free_cluster_cache;
+
+	/*
+	 * Do not use mmap; use read/write instead.
+	 */
+	if (readonly)
+		disk_image = disk_image__new(fd, h->size, &qcow_disk_readonly_ops, DISK_IMAGE_REGULAR);
+	else
+		disk_image = disk_image__new(fd, h->size, &qcow_disk_ops, DISK_IMAGE_REGULAR);
+
+	if (IS_ERR_OR_NULL(disk_image))
+		goto free_l1_table;
+
+	disk_image->priv = q;
+
+	return disk_image;
+
+free_l1_table:
+	if (q->table.l1_table)
+		free(q->table.l1_table);
+free_cluster_cache:
+	if (q->cluster_cache)
+		free(q->cluster_cache);
+free_cluster_data:
+	if (q->cluster_data)
+		free(q->cluster_data);
+free_header:
+	if (q->header)
+		free(q->header);
+free_qcow:
+	free(q);
+
+	return NULL;
+}
+
+static bool qcow1_check_image(int fd)
+{
+	struct qcow1_header_disk f_header;
+
+	if (pread_in_full(fd, &f_header, sizeof(struct qcow1_header_disk), 0) < 0)
+		return false;
+
+	be32_to_cpus(&f_header.magic);
+	be32_to_cpus(&f_header.version);
+
+	if (f_header.magic != QCOW_MAGIC)
+		return false;
+
+	if (f_header.version != QCOW1_VERSION)
+		return false;
+
+	return true;
+}
+
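+/*
+ * Typical call sequence (sketch; the disk-image core is the real caller
+ * and falls back to other backends when this returns NULL):
+ *
+ *	int fd = open(filename, readonly ? O_RDONLY : O_RDWR);
+ *	struct disk_image *disk = qcow_probe(fd, readonly);
+ *	if (!disk)
+ *		... try the raw image backend instead ...
+ */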
+struct disk_image *qcow_probe(int fd, bool readonly)
+{
+	if (qcow1_check_image(fd))
+		return qcow1_probe(fd, readonly);
+
+	if (qcow2_check_image(fd))
+		return qcow2_probe(fd, readonly);
+
+	return NULL;
+}
diff --git a/kvmtool/disk/raw.c b/kvmtool/disk/raw.c
new file mode 100644
index 0000000..54b4e74
--- /dev/null
+++ b/kvmtool/disk/raw.c
@@ -0,0 +1,107 @@
+#include "kvm/disk-image.h"
+
+#include <linux/err.h>
+
+ssize_t raw_image__read_sync(struct disk_image *disk, u64 sector, const struct iovec *iov,
+				int iovcount, void *param)
+{
+	return preadv_in_full(disk->fd, iov, iovcount, sector << SECTOR_SHIFT);
+}
+
+ssize_t raw_image__write_sync(struct disk_image *disk, u64 sector,
+			      const struct iovec *iov, int iovcount,
+			      void *param)
+{
+	return pwritev_in_full(disk->fd, iov, iovcount, sector << SECTOR_SHIFT);
+}
+
+ssize_t raw_image__read_mmap(struct disk_image *disk, u64 sector, const struct iovec *iov,
+				int iovcount, void *param)
+{
+	u64 offset = sector << SECTOR_SHIFT;
+	ssize_t total = 0;
+
+	while (iovcount--) {
+		memcpy(iov->iov_base, disk->priv + offset, iov->iov_len);
+
+		sector	+= iov->iov_len >> SECTOR_SHIFT;
+		offset	+= iov->iov_len;
+		total	+= iov->iov_len;
+		iov++;
+	}
+
+	return total;
+}
+
+ssize_t raw_image__write_mmap(struct disk_image *disk, u64 sector, const struct iovec *iov,
+				int iovcount, void *param)
+{
+	u64 offset = sector << SECTOR_SHIFT;
+	ssize_t total = 0;
+
+	while (iovcount--) {
+		memcpy(disk->priv + offset, iov->iov_base, iov->iov_len);
+
+		sector	+= iov->iov_len >> SECTOR_SHIFT;
+		offset	+= iov->iov_len;
+		total	+= iov->iov_len;
+		iov++;
+	}
+
+	return total;
+}
+
+int raw_image__close(struct disk_image *disk)
+{
+	int ret = 0;
+
+	if (disk->priv != MAP_FAILED)
+		ret = munmap(disk->priv, disk->size);
+
+	return ret;
+}
+
+/*
+ * multiple buffer based disk image operations
+ */
+static struct disk_image_operations raw_image_regular_ops = {
+	.read	= raw_image__read,
+	.write	= raw_image__write,
+	.wait	= raw_image__wait,
+	.async	= true,
+};
+
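+/*
+ * "Read-only" refers to the backing file: guest writes land in the
+ * MAP_PRIVATE mapping set up for DISK_IMAGE_MMAP and are discarded
+ * when the VM exits.
+ */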
+struct disk_image_operations ro_ops = {
+	.read	= raw_image__read_mmap,
+	.write	= raw_image__write_mmap,
+	.close	= raw_image__close,
+};
+
+struct disk_image_operations ro_ops_nowrite = {
+	.read	= raw_image__read,
+	.wait	= raw_image__wait,
+	.async	= true,
+};
+
+struct disk_image *raw_image__probe(int fd, struct stat *st, bool readonly)
+{
+	if (readonly) {
+		/*
+		 * Use mmap's MAP_PRIVATE to implement non-persistent writes.
+		 * FIXME: This does not work on 32-bit hosts.
+		 */
+		struct disk_image *disk;
+
+		disk = disk_image__new(fd, st->st_size, &ro_ops, DISK_IMAGE_MMAP);
+		if (IS_ERR_OR_NULL(disk)) {
+			disk = disk_image__new(fd, st->st_size, &ro_ops_nowrite, DISK_IMAGE_REGULAR);
+		}
+
+		return disk;
+	} else {
+		/*
+		 * Use read/write instead of mmap
+		 */
+		return disk_image__new(fd, st->st_size, &raw_image_regular_ops, DISK_IMAGE_REGULAR);
+	}
+}
diff --git a/kvmtool/framebuffer.c b/kvmtool/framebuffer.c
new file mode 100644
index 0000000..fb8f51d
--- /dev/null
+++ b/kvmtool/framebuffer.c
@@ -0,0 +1,80 @@
+#include "kvm/framebuffer.h"
+#include "kvm/kvm.h"
+
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <errno.h>
+
+static LIST_HEAD(framebuffers);
+
+struct framebuffer *fb__register(struct framebuffer *fb)
+{
+	INIT_LIST_HEAD(&fb->node);
+	list_add(&fb->node, &framebuffers);
+
+	return fb;
+}
+
+int fb__attach(struct framebuffer *fb, struct fb_target_operations *ops)
+{
+	if (fb->nr_targets >= FB_MAX_TARGETS)
+		return -ENOSPC;
+
+	fb->targets[fb->nr_targets++] = ops;
+
+	return 0;
+}
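+
+/*
+ * Usage sketch (illustrative; the VESA device and the UI backends are
+ * the real users -- the ops name below is hypothetical):
+ *
+ *	fb__register(&device_fb);
+ *	fb__attach(&device_fb, &sdl_fb_ops);
+ */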
+
+static int start_targets(struct framebuffer *fb)
+{
+	unsigned long i;
+
+	for (i = 0; i < fb->nr_targets; i++) {
+		struct fb_target_operations *ops = fb->targets[i];
+		int err = 0;
+
+		if (ops->start)
+			err = ops->start(fb);
+
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+int fb__init(struct kvm *kvm)
+{
+	struct framebuffer *fb;
+
+	list_for_each_entry(fb, &framebuffers, node) {
+		int err;
+
+		err = start_targets(fb);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+firmware_init(fb__init);
+
+int fb__exit(struct kvm *kvm)
+{
+	struct framebuffer *fb;
+
+	list_for_each_entry(fb, &framebuffers, node) {
+		u32 i;
+
+		for (i = 0; i < fb->nr_targets; i++)
+			if (fb->targets[i]->stop)
+				fb->targets[i]->stop(fb);
+
+		munmap(fb->mem, fb->mem_size);
+	}
+
+	return 0;
+}
+firmware_exit(fb__exit);
diff --git a/kvmtool/guest/.gitignore b/kvmtool/guest/.gitignore
new file mode 100644
index 0000000..0ccdcbb
--- /dev/null
+++ b/kvmtool/guest/.gitignore
@@ -0,0 +1,2 @@
+guest_pre_init.c
+guest_init.c
diff --git a/kvmtool/guest/init.c b/kvmtool/guest/init.c
new file mode 100644
index 0000000..52f6567
--- /dev/null
+++ b/kvmtool/guest/init.c
@@ -0,0 +1,81 @@
+/*
+ * This is a simple init for shared rootfs guests. This part should be limited
+ * to doing mounts and running stage 2 of the init process.
+ */
+#include <sys/mount.h>
+#include <string.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <errno.h>
+#include <sys/stat.h>
+#include <sys/wait.h>
+#include <sys/reboot.h>
+#include <sys/ioctl.h>
+
+static int run_process(char *filename)
+{
+	char *new_argv[] = { filename, NULL };
+	char *new_env[] = { "TERM=linux", "DISPLAY=192.168.33.1:0",
+				"HOME=/virt/home", NULL };
+
+	return execve(filename, new_argv, new_env);
+}
+
+static int run_process_sandbox(char *filename)
+{
+	char *new_argv[] = { filename, "/virt/sandbox.sh", NULL };
+	char *new_env[] = { "TERM=linux", "HOME=/virt/home", NULL };
+
+	return execve(filename, new_argv, new_env);
+}
+
+static void do_mounts(void)
+{
+#ifndef CONFIG_GUEST_PRE_INIT
+	mount("hostfs", "/host", "9p", MS_RDONLY, "trans=virtio,version=9p2000.L");
+#endif
+	mount("sysfs", "/sys", "sysfs", 0, NULL);
+	mount("proc", "/proc", "proc", 0, NULL);
+	mount("devtmpfs", "/dev", "devtmpfs", 0, NULL);
+	mkdir("/dev/pts", 0755);
+	mount("devpts", "/dev/pts", "devpts", 0, NULL);
+}
+
+int main(int argc, char *argv[])
+{
+	pid_t child;
+	int status;
+
+	puts("Mounting...");
+
+	do_mounts();
+
+	/* get session leader */
+	setsid();
+
+	/* set controlling terminal */
+	ioctl(0, TIOCSCTTY, 1);
+
+	child = fork();
+	if (child < 0) {
+		printf("Fatal: fork() failed with %d\n", child);
+		return 0;
+	} else if (child == 0) {
+		if (access("/virt/sandbox.sh", R_OK) == 0)
+			run_process_sandbox("/bin/sh");
+		else
+			run_process("/bin/sh");
+	} else {
+		pid_t corpse;
+
+		do {
+			corpse = waitpid(-1, &status, 0);
+		} while (corpse != child);
+	}
+
+	sync();
+	reboot(RB_AUTOBOOT);
+
+	printf("Init failed: %s\n", strerror(errno));
+
+	return 0;
+}
diff --git a/kvmtool/guest_compat.c b/kvmtool/guest_compat.c
new file mode 100644
index 0000000..fd4704b
--- /dev/null
+++ b/kvmtool/guest_compat.c
@@ -0,0 +1,99 @@
+#include "kvm/guest_compat.h"
+
+#include "kvm/mutex.h"
+
+#include <linux/kernel.h>
+#include <linux/list.h>
+
+struct compat_message {
+	int id;
+	char *title;
+	char *desc;
+
+	struct list_head list;
+};
+
+static int id;
+static DEFINE_MUTEX(compat_mtx);
+static LIST_HEAD(messages);
+
+static void compat__free(struct compat_message *msg)
+{
+	free(msg->title);
+	free(msg->desc);
+	free(msg);
+}
+
+int compat__add_message(const char *title, const char *desc)
+{
+	struct compat_message *msg;
+	int msg_id;
+
+	msg = malloc(sizeof(*msg));
+	if (msg == NULL)
+		goto cleanup;
+
+	msg->title = strdup(title);
+	msg->desc = strdup(desc);
+
+	if (msg->title == NULL || msg->desc == NULL)
+		goto cleanup;
+
+	mutex_lock(&compat_mtx);
+
+	msg->id = msg_id = id++;
+	list_add_tail(&msg->list, &messages);
+
+	mutex_unlock(&compat_mtx);
+
+	return msg_id;
+
+cleanup:
+	if (msg)
+		compat__free(msg);
+
+	return -ENOMEM;
+}
+
+int compat__remove_message(int id)
+{
+	struct compat_message *pos, *n;
+
+	mutex_lock(&compat_mtx);
+
+	list_for_each_entry_safe(pos, n, &messages, list) {
+		if (pos->id == id) {
+			list_del(&pos->list);
+			compat__free(pos);
+
+			mutex_unlock(&compat_mtx);
+
+			return 0;
+		}
+	}
+
+	mutex_unlock(&compat_mtx);
+
+	return -ENOENT;
+}
+
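+/*
+ * Intended flow (sketch): a device queues a warning when the guest asks
+ * for something unsupported, withdraws it if the guest recovers, and
+ * whatever is still pending gets printed on shutdown:
+ *
+ *	int id = compat__add_message("virtio", "feature X not supported");
+ *	...
+ *	compat__remove_message(id);	// guest stopped using feature X
+ *	compat__print_all_messages();	// on VM exit
+ */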
+int compat__print_all_messages(void)
+{
+	mutex_lock(&compat_mtx);
+
+	while (!list_empty(&messages)) {
+		struct compat_message *msg;
+
+		msg = list_first_entry(&messages, struct compat_message, list);
+
+		printf("\n  # KVM compatibility warning.\n\t%s\n\t%s\n",
+			msg->title, msg->desc);
+
+		list_del(&msg->list);
+		compat__free(msg);
+	}
+
+	mutex_unlock(&compat_mtx);
+
+	return 0;
+}
diff --git a/kvmtool/hw/cfi_flash.c b/kvmtool/hw/cfi_flash.c
new file mode 100644
index 0000000..7faecdf
--- /dev/null
+++ b/kvmtool/hw/cfi_flash.c
@@ -0,0 +1,639 @@
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+#include <linux/bitops.h>
+#include <linux/err.h>
+#include <linux/sizes.h>
+#include <linux/types.h>
+
+#include "kvm/kvm.h"
+#include "kvm/kvm-arch.h"
+#include "kvm/kvm-cpu.h"
+#include "kvm/devices.h"
+#include "kvm/fdt.h"
+#include "kvm/mutex.h"
+#include "kvm/util.h"
+
+/*
+ * The EDK2 driver hardcodes two 16-bit chips on a 32-bit bus.
+ * This code supports one or two chips (enforced below).
+ */
+#define CFI_NR_FLASH_CHIPS			2
+
+/* We always emulate a 32-bit bus width. */
+#define CFI_BUS_WIDTH				4
+
+/* The *effective* size of an erase block (over all chips) */
+#define FLASH_BLOCK_SIZE			SZ_64K
+#define FLASH_BLOCK_SIZE_PER_CHIP					\
+	(FLASH_BLOCK_SIZE / CFI_NR_FLASH_CHIPS)
+
+#define PROGRAM_BUFF_SIZE_BITS			7
+#define PROGRAM_BUFF_SIZE			(1U << PROGRAM_BUFF_SIZE_BITS)
+#define PROGRAM_BUFF_SIZE_BITS_PER_CHIP					\
+	(PROGRAM_BUFF_SIZE_BITS + 1 - CFI_NR_FLASH_CHIPS)
+
+/* CFI commands */
+#define CFI_CMD_LOCK_BLOCK			0x01
+#define CFI_CMD_ALTERNATE_WORD_PROGRAM		0x10
+#define CFI_CMD_ERASE_BLOCK_SETUP		0x20
+#define CFI_CMD_WORD_PROGRAM			0x40
+#define CFI_CMD_CLEAR_STATUS_REG		0x50
+#define CFI_CMD_LOCK_BLOCK_SETUP		0x60
+#define CFI_CMD_READ_STATUS_REG			0x70
+#define CFI_CMD_READ_JEDEC_DEVID		0x90
+#define CFI_CMD_READ_CFI_QUERY			0x98
+#define CFI_CMD_CONFIRM				0xd0
+#define CFI_CMD_BUFFERED_PROGRAM_SETUP		0xe8
+#define CFI_CMD_READ_ARRAY			0xff
+
+#define CFI_STATUS_PROTECT_BIT		0x02
+#define CFI_STATUS_PROGRAM_LOCK_BIT	0x10
+#define CFI_STATUS_ERASE_CLEAR_LOCK_BIT	0x20
+#define CFI_STATUS_LOCK_ERROR		(CFI_STATUS_PROGRAM_LOCK_BIT |	\
+					 CFI_STATUS_PROTECT_BIT)
+#define CFI_STATUS_ERASE_ERROR		(CFI_STATUS_ERASE_CLEAR_LOCK_BIT | \
+					 CFI_STATUS_PROGRAM_LOCK_BIT)
+#define CFI_STATUS_READY		0x80
+
+/*
+ * CFI query table contents, as far as it is constant.
+ * The dynamic information (size, etc.) will be generated on the fly.
+ */
+#define CFI_GEOM_OFFSET				0x27
+static const u8 cfi_query_table[] = {
+		/* CFI query identification string */
+	[0x10] = 'Q', 'R', 'Y',		/* ID string */
+	0x01, 0x00,		/* primary command set: Intel/Sharp extended */
+	0x31, 0x00,		/* address of primary extended query table */
+	0x00, 0x00,		/* alternative command set: unused */
+	0x00, 0x00,		/* address of alternative extended query table*/
+		/* system interface information */
+	[0x1b] = 0x45,			/* minimum Vcc voltage: 4.5V */
+	0x55,			/* maximum Vcc voltage: 5.5V */
+	0x00,			/* minimum Vpp voltage: 0.0V (unused) */
+	0x00,			/* maximum Vpp voltage: 0.0V (unused) */
+	0x01,			/* timeout for single word program: 2 us */
+	0x01,			/* timeout for multi-byte program: 2 us */
+	0x01,			/* timeout for block erase: 2 ms */
+	0x00,			/* timeout for full chip erase: not supported */
+	0x00,			/* max timeout for single word program: 1x */
+	0x00,			/* max timeout for multi-byte program: 1x */
+	0x00,			/* max timeout for block erase: 1x */
+	0x00,			/* max timeout for chip erase: not supported */
+		/* flash geometry information */
+	[0x27] = 0x00,		/* size in power-of-2 bytes, filled later */
+	0x05, 0x00,		/* interface description: 32 and 16 bits */
+	PROGRAM_BUFF_SIZE_BITS_PER_CHIP, 0x00,
+				/* number of bytes in write buffer */
+	0x01,			/* one erase block region */
+	0x00, 0x00, 0x00, 0x00, /* number and size of erase blocks, generated */
+		/* Intel primary algorithm extended query table */
+	[0x31] = 'P', 'R', 'I',
+	'1', '0',		/* version 1.0 */
+	0xa0, 0x00, 0x00, 0x00, /* optional features: instant lock & pm-read */
+	0x00,			/* no functions after suspend */
+	0x01, 0x00,		/* only lock bit supported */
+	0x50,			/* best Vcc value: 5.0V */
+	0x00,			/* best Vpp value: 0.0V (unused) */
+	0x01,			/* number of protection register fields */
+	0x00, 0x00, 0x00, 0x00,	/* protection field 1 description */
+};
+
+/*
+ * These states represent a subset of the CFI flash state machine.
+ */
+enum cfi_flash_state {
+	READY,
+	LOCK_BLOCK_SETUP,
+	WORD_PROGRAM,
+	BUFFERED_PROGRAM_SETUP,
+	BUFFER_WRITE,
+	ERASE_BLOCK_SETUP,
+};
+
+/*
+ * The device can be in several read modes.
+ * We don't implement the asynchronous burst mode.
+ */
+enum cfi_read_mode {
+	READ_ARRAY,
+	READ_STATUS_REG,
+	READ_JEDEC_DEVID,
+	READ_CFI_QUERY,
+};
+
+struct cfi_flash_device {
+	struct device_header	dev_hdr;
+	/* Protects the CFI state machine variables in this data structure. */
+	struct mutex		mutex;
+	u64			base_addr;
+	u32			size;
+
+	void			*flash_memory;
+	u8			program_buffer[PROGRAM_BUFF_SIZE];
+	unsigned long		*lock_bm;
+	u64			block_address;
+	unsigned int		buff_written;
+	unsigned int		buffer_length;
+
+	enum cfi_flash_state	state;
+	enum cfi_read_mode	read_mode;
+	u8			sr;
+	bool			is_mapped;
+};
+
+static int nr_erase_blocks(struct cfi_flash_device *sfdev)
+{
+	return sfdev->size / FLASH_BLOCK_SIZE;
+}
+
+/*
+ * CFI queries always deal with one byte of information, possibly mirrored
+ * to other bytes on the bus. This is dealt with in the callers.
+ * The address provided is the one for 8-bit addressing, and would need to
+ * be adjusted for wider accesses.
+ */
+static u8 read_cfi(struct cfi_flash_device *sfdev, u64 faddr)
+{
+	if (faddr >= sizeof(cfi_query_table)) {
+		pr_debug("CFI query read access beyond the end of table");
+		return 0;
+	}
+
+	/* Fixup dynamic information in the geometry part of the table. */
+	switch (faddr) {
+	case 0x27:		/* device size in bytes, power of two */
+		return pow2_size(sfdev->size / CFI_NR_FLASH_CHIPS);
+	case 0x2d + 0:	/* number of erase blocks, minus one */
+		return (nr_erase_blocks(sfdev) - 1) & 0xff;
+	case 0x2d + 1:
+		return ((nr_erase_blocks(sfdev) - 1) >> 8) & 0xff;
+	case 0x2d + 2:	/* erase block size, in units of 256 */
+		return (FLASH_BLOCK_SIZE_PER_CHIP / 256) & 0xff;
+	case 0x2d + 3:
+		return ((FLASH_BLOCK_SIZE_PER_CHIP / 256) >> 8) & 0xff;
+	}
+
+	return cfi_query_table[faddr];
+}
+
+static bool block_is_locked(struct cfi_flash_device *sfdev, u64 faddr)
+{
+	int block_nr = faddr / FLASH_BLOCK_SIZE;
+
+	return test_bit(block_nr, sfdev->lock_bm);
+}
+
+#define DEV_ID_MASK 0x7ff
+static u16 read_dev_id(struct cfi_flash_device *sfdev, u64 faddr)
+{
+	switch ((faddr & DEV_ID_MASK) / CFI_BUS_WIDTH) {
+	case 0x0:				/* vendor ID */
+		return 0x0000;
+	case 0x1:				/* device ID */
+		return 0xffff;
+	case 0x2:
+		return block_is_locked(sfdev, faddr & ~DEV_ID_MASK);
+	default:			/* Ignore the other entries. */
+		return 0;
+	}
+}
+
+static void lock_block(struct cfi_flash_device *sfdev, u64 faddr, bool lock)
+{
+	int block_nr = faddr / FLASH_BLOCK_SIZE;
+
+	if (lock)
+		set_bit(block_nr, sfdev->lock_bm);
+	else
+		clear_bit(block_nr, sfdev->lock_bm);
+}
+
+static void word_program(struct cfi_flash_device *sfdev,
+			 u64 faddr, void *data, int len)
+{
+	if (block_is_locked(sfdev, faddr)) {
+		sfdev->sr |= CFI_STATUS_LOCK_ERROR;
+		return;
+	}
+
+	memcpy(sfdev->flash_memory + faddr, data, len);
+}
+
+/* Reset the program buffer state to prepare for follow-up writes. */
+static void buffer_setup(struct cfi_flash_device *sfdev)
+{
+	memset(sfdev->program_buffer, 0, sizeof(sfdev->program_buffer));
+	sfdev->block_address = ~0ULL;
+	sfdev->buff_written = 0;
+}
+
+static bool buffer_write(struct cfi_flash_device *sfdev,
+			 u64 faddr, void *buffer, int len)
+{
+	unsigned int buff_addr;
+
+	if (sfdev->buff_written >= sfdev->buffer_length)
+		return false;
+
+	/*
+	 * The first word written into the buffer after the setup command
+	 * happens to be the base address for the buffer.
+	 * All subsequent writes need to be within this address and this
+	 * address plus the buffer size, so keep this value around.
+	 */
+	if (sfdev->block_address == ~0ULL)
+		sfdev->block_address = faddr;
+
+	if (faddr < sfdev->block_address)
+		return false;
+	buff_addr = faddr - sfdev->block_address;
+	if (buff_addr >= PROGRAM_BUFF_SIZE)
+		return false;
+
+	memcpy(sfdev->program_buffer + buff_addr, buffer, len);
+	sfdev->buff_written += len;
+
+	return true;
+}
+
+static void buffer_confirm(struct cfi_flash_device *sfdev)
+{
+	if (block_is_locked(sfdev, sfdev->block_address)) {
+		sfdev->sr |= CFI_STATUS_LOCK_ERROR;
+		return;
+	}
+	memcpy(sfdev->flash_memory + sfdev->block_address,
+	       sfdev->program_buffer, sfdev->buff_written);
+}
+
+static void block_erase_confirm(struct cfi_flash_device *sfdev, u64 faddr)
+{
+	if (block_is_locked(sfdev, faddr)) {
+		sfdev->sr |= CFI_STATUS_LOCK_ERROR;
+		return;
+	}
+
+	memset(sfdev->flash_memory + faddr, 0xff, FLASH_BLOCK_SIZE);
+}
+
+static void cfi_flash_read(struct cfi_flash_device *sfdev,
+			   u64 faddr, u8 *data, u32 len)
+{
+	u16 cfi_value = 0;
+
+	switch (sfdev->read_mode) {
+	case READ_ARRAY:
+		/* just copy the requested bytes from the array */
+		memcpy(data, sfdev->flash_memory + faddr, len);
+		return;
+	case READ_STATUS_REG:
+		cfi_value = sfdev->sr;
+		break;
+	case READ_JEDEC_DEVID:
+		cfi_value = read_dev_id(sfdev, faddr);
+		break;
+	case READ_CFI_QUERY:
+		cfi_value = read_cfi(sfdev, faddr / CFI_BUS_WIDTH);
+		break;
+	}
+	switch (len) {
+	case 1:
+		*data = cfi_value;
+		break;
+	case 8: memset(data + 4, 0, 4);
+		/* fall-through */
+	case 4:
+		if (CFI_NR_FLASH_CHIPS == 2)
+			memcpy(data + 2, &cfi_value, 2);
+		else
+			memset(data + 2, 0, 2);
+		/* fall-through */
+	case 2:
+		memcpy(data, &cfi_value, 2);
+		break;
+	default:
+		pr_debug("CFI flash: illegal access length %d for read mode %d",
+			 len, sfdev->read_mode);
+		break;
+	}
+}
+
+/*
+ * Any writes happening in "READY" state don't actually write to the memory,
+ * but are really treated as commands to advance the state machine and select
+ * the next action.
+ * Change the state and modes according to the value written. The address
+ * that value is written to does not matter and is ignored.
+ */
+static void cfi_flash_write_ready(struct cfi_flash_device *sfdev, u8 command)
+{
+	switch (command) {
+	case CFI_CMD_READ_JEDEC_DEVID:
+		sfdev->read_mode = READ_JEDEC_DEVID;
+		break;
+	case CFI_CMD_READ_STATUS_REG:
+		sfdev->read_mode = READ_STATUS_REG;
+		break;
+	case CFI_CMD_READ_CFI_QUERY:
+		sfdev->read_mode = READ_CFI_QUERY;
+		break;
+	case CFI_CMD_CLEAR_STATUS_REG:
+		sfdev->sr = CFI_STATUS_READY;
+		break;
+	case CFI_CMD_WORD_PROGRAM:
+	case CFI_CMD_ALTERNATE_WORD_PROGRAM:
+		sfdev->state = WORD_PROGRAM;
+		sfdev->read_mode = READ_STATUS_REG;
+		break;
+	case CFI_CMD_LOCK_BLOCK_SETUP:
+		sfdev->state = LOCK_BLOCK_SETUP;
+		break;
+	case CFI_CMD_ERASE_BLOCK_SETUP:
+		sfdev->state = ERASE_BLOCK_SETUP;
+		sfdev->read_mode = READ_STATUS_REG;
+		break;
+	case CFI_CMD_BUFFERED_PROGRAM_SETUP:
+		buffer_setup(sfdev);
+		sfdev->state = BUFFERED_PROGRAM_SETUP;
+		sfdev->read_mode = READ_STATUS_REG;
+		break;
+	case CFI_CMD_CONFIRM:
+		pr_debug("CFI flash: unexpected confirm command 0xd0");
+		break;
+	default:
+		pr_debug("CFI flash: unknown command 0x%x", command);
+		/* fall-through */
+	case CFI_CMD_READ_ARRAY:
+		sfdev->read_mode = READ_ARRAY;
+		break;
+	}
+}
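+
+/*
+ * For reference, the guest-side buffered-program sequence this state
+ * machine accepts looks roughly like this (protocol sketch, not kvmtool
+ * code; addr is any address inside the flash window):
+ *
+ *	write8(addr, 0xe8);		// BUFFERED_PROGRAM_SETUP
+ *	write8(addr, nwords - 1);	// sizes the program buffer
+ *	write32(addr + i, data[i]);	// payload; first write fixes the base
+ *	write8(addr, 0xd0);		// CONFIRM commits the buffer
+ *	read8(addr);			// status register, 0x80 when ready
+ */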
+
+static void cfi_flash_write(struct cfi_flash_device *sfdev, u16 command,
+			    u64 faddr, u8 *data, u32 len)
+{
+	switch (sfdev->state) {
+	case READY:
+		cfi_flash_write_ready(sfdev, command & 0xff);
+		return;
+	case LOCK_BLOCK_SETUP:
+		switch (command & 0xff) {
+		case CFI_CMD_LOCK_BLOCK:
+			lock_block(sfdev, faddr, true);
+			sfdev->read_mode = READ_STATUS_REG;
+			break;
+		case CFI_CMD_CONFIRM:
+			lock_block(sfdev, faddr, false);
+			sfdev->read_mode = READ_STATUS_REG;
+			break;
+		default:
+			sfdev->sr |= CFI_STATUS_ERASE_ERROR;
+			break;
+		}
+		sfdev->state = READY;
+		break;
+
+	case WORD_PROGRAM:
+		word_program(sfdev, faddr, data, len);
+		sfdev->read_mode = READ_STATUS_REG;
+		sfdev->state = READY;
+		break;
+
+	case BUFFER_WRITE:
+		if (buffer_write(sfdev, faddr, data, len))
+			break;
+
+		if ((command & 0xff) == CFI_CMD_CONFIRM) {
+			buffer_confirm(sfdev);
+			sfdev->read_mode = READ_STATUS_REG;
+		} else {
+			pr_debug("CFI flash: BUFFER_WRITE: expected CONFIRM(0xd0), got 0x%x @ 0x%llx",
+				 command, faddr);
+			sfdev->sr |= CFI_STATUS_PROGRAM_LOCK_BIT;
+		}
+		sfdev->state = READY;
+		break;
+
+	case BUFFERED_PROGRAM_SETUP:
+		sfdev->buffer_length = (command + 1) * CFI_BUS_WIDTH;
+		if (sfdev->buffer_length > PROGRAM_BUFF_SIZE)
+			sfdev->buffer_length = PROGRAM_BUFF_SIZE;
+		sfdev->state = BUFFER_WRITE;
+		sfdev->read_mode = READ_STATUS_REG;
+		break;
+
+	case ERASE_BLOCK_SETUP:
+		if ((command & 0xff) == CFI_CMD_CONFIRM)
+			block_erase_confirm(sfdev, faddr);
+		else
+			sfdev->sr |= CFI_STATUS_ERASE_ERROR;
+
+		sfdev->state = READY;
+		sfdev->read_mode = READ_STATUS_REG;
+		break;
+	default:
+		pr_debug("CFI flash: unexpected/unknown command 0x%x", command);
+		break;
+	}
+}
+
+/*
+ * If we are in READ_ARRAY mode, we can map the flash array directly
+ * into the guest as read-only memory. This greatly improves read
+ * performance and avoids problems with exits due to accesses from
+ * load instructions without syndrome information (on ARM).
+ * It also allows code to be executed in place (XIP) from the flash.
+ */
+static int map_flash_memory(struct kvm *kvm, struct cfi_flash_device *sfdev)
+{
+	int ret;
+
+	ret = kvm__register_mem(kvm, sfdev->base_addr, sfdev->size,
+				sfdev->flash_memory,
+				KVM_MEM_TYPE_RAM | KVM_MEM_TYPE_READONLY);
+	if (!ret)
+		sfdev->is_mapped = true;
+
+	return ret;
+}
+
+/*
+ * Any write access changing the read mode would need to bring us back to
+ * "trap everything", as the CFI query read need proper handholding.
+ */
+static int unmap_flash_memory(struct kvm *kvm, struct cfi_flash_device *sfdev)
+{
+	int ret;
+
+	ret = kvm__destroy_mem(kvm, sfdev->base_addr, sfdev->size,
+			       sfdev->flash_memory);
+
+	if (!ret)
+		sfdev->is_mapped = false;
+
+	return ret;
+}
+
+static void cfi_flash_mmio(struct kvm_cpu *vcpu,
+			   u64 addr, u8 *data, u32 len, u8 is_write,
+			   void *context)
+{
+	struct cfi_flash_device *sfdev = context;
+	u64 faddr = addr - sfdev->base_addr;
+	u32 value;
+
+	if (!is_write) {
+		mutex_lock(&sfdev->mutex);
+
+		cfi_flash_read(sfdev, faddr, data, len);
+
+		mutex_unlock(&sfdev->mutex);
+
+		return;
+	}
+
+	if (len > 4) {
+		pr_info("CFI flash: MMIO %d-bit write access not supported",
+			 len * 8);
+		return;
+	}
+
+	memcpy(&value, data, len);
+
+	mutex_lock(&sfdev->mutex);
+
+	cfi_flash_write(sfdev, value & 0xffff, faddr, data, len);
+
+	/* Adjust our mapping status accordingly. */
+	if (!sfdev->is_mapped && sfdev->read_mode == READ_ARRAY)
+		map_flash_memory(vcpu->kvm, sfdev);
+	else if (sfdev->is_mapped && sfdev->read_mode != READ_ARRAY)
+		unmap_flash_memory(vcpu->kvm, sfdev);
+
+	mutex_unlock(&sfdev->mutex);
+}
+
+#ifdef CONFIG_HAS_LIBFDT
+static void generate_cfi_flash_fdt_node(void *fdt,
+					struct device_header *dev_hdr,
+					void (*generate_irq_prop)(void *fdt,
+								  u8 irq,
+								enum irq_type))
+{
+	struct cfi_flash_device *sfdev;
+	u64 reg_prop[2];
+
+	sfdev = container_of(dev_hdr, struct cfi_flash_device, dev_hdr);
+	reg_prop[0] = cpu_to_fdt64(sfdev->base_addr);
+	reg_prop[1] = cpu_to_fdt64(sfdev->size);
+
+	_FDT(fdt_begin_node(fdt, "flash"));
+	_FDT(fdt_property_cell(fdt, "bank-width", CFI_BUS_WIDTH));
+	_FDT(fdt_property_cell(fdt, "#address-cells", 0x1));
+	_FDT(fdt_property_cell(fdt, "#size-cells", 0x1));
+	_FDT(fdt_property_string(fdt, "compatible", "cfi-flash"));
+	_FDT(fdt_property_string(fdt, "label", "System-firmware"));
+	_FDT(fdt_property(fdt, "reg", &reg_prop, sizeof(reg_prop)));
+	_FDT(fdt_end_node(fdt));
+}
+#else
+#define generate_cfi_flash_fdt_node NULL
+#endif
+
+static struct cfi_flash_device *create_flash_device_file(struct kvm *kvm,
+							 const char *filename)
+{
+	struct cfi_flash_device *sfdev;
+	struct stat statbuf;
+	unsigned int value;
+	int ret;
+	int fd;
+
+	fd = open(filename, O_RDWR);
+	if (fd < 0)
+		return ERR_PTR(-errno);
+
+	if (fstat(fd, &statbuf) < 0) {
+		ret = -errno;
+		goto out_close;
+	}
+
+	sfdev = malloc(sizeof(struct cfi_flash_device));
+	if (!sfdev) {
+		ret = -ENOMEM;
+		goto out_close;
+	}
+
+	sfdev->size = statbuf.st_size;
+	/* Round down to nearest power-of-2 size value. */
+	sfdev->size = 1U << (pow2_size(sfdev->size + 1) - 1);
+	if (sfdev->size > KVM_FLASH_MAX_SIZE)
+		sfdev->size = KVM_FLASH_MAX_SIZE;
+	if (sfdev->size < statbuf.st_size) {
+		pr_info("flash file size (%llu bytes) is not a power of two",
+			(unsigned long long)statbuf.st_size);
+		pr_info("only using first %u bytes", sfdev->size);
+	}
+	sfdev->flash_memory = mmap(NULL, sfdev->size,
+				   PROT_READ | PROT_WRITE, MAP_SHARED,
+				   fd, 0);
+	if (sfdev->flash_memory == MAP_FAILED) {
+		ret = -errno;
+		goto out_free;
+	}
+	sfdev->base_addr = KVM_FLASH_MMIO_BASE;
+	sfdev->state = READY;
+	sfdev->read_mode = READ_ARRAY;
+	sfdev->sr = CFI_STATUS_READY;
+
+	map_flash_memory(kvm, sfdev);
+
+	value = roundup(nr_erase_blocks(sfdev), BITS_PER_LONG) / 8;
+	sfdev->lock_bm = calloc(1, value);
+	if (!sfdev->lock_bm) {
+		ret = -ENOMEM;
+		goto out_unmap;
+	}
+
+	sfdev->dev_hdr.bus_type = DEVICE_BUS_MMIO;
+	sfdev->dev_hdr.data = generate_cfi_flash_fdt_node;
+	mutex_init(&sfdev->mutex);
+	ret = device__register(&sfdev->dev_hdr);
+	if (ret)
+		goto out_unmap;
+
+	ret = kvm__register_mmio(kvm,
+				 sfdev->base_addr, sfdev->size,
+				 false, cfi_flash_mmio, sfdev);
+	if (ret) {
+		device__unregister(&sfdev->dev_hdr);
+		goto out_unmap;
+	}
+
+	return sfdev;
+
+out_unmap:
+	munmap(sfdev->flash_memory, sfdev->size);
+out_free:
+	free(sfdev);
+out_close:
+	close(fd);
+
+	return ERR_PTR(ret);
+}
+
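+/*
+ * Example invocation (sketch): back the flash with a 4 MiB file; the
+ * lkvm run option that fills kvm->cfg.flash_filename (--flash <file>)
+ * enables the device:
+ *
+ *	$ dd if=/dev/zero of=flash.img bs=64K count=64
+ *	$ lkvm run --flash flash.img ...
+ */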
+static int cfi_flash__init(struct kvm *kvm)
+{
+	struct cfi_flash_device *sfdev;
+
+	BUILD_BUG_ON(CFI_NR_FLASH_CHIPS != 1 && CFI_NR_FLASH_CHIPS != 2);
+
+	if (!kvm->cfg.flash_filename)
+		return 0;
+
+	sfdev = create_flash_device_file(kvm, kvm->cfg.flash_filename);
+	if (IS_ERR(sfdev))
+		return PTR_ERR(sfdev);
+
+	return 0;
+}
+dev_init(cfi_flash__init);
diff --git a/kvmtool/hw/i8042.c b/kvmtool/hw/i8042.c
new file mode 100644
index 0000000..37a99a2
--- /dev/null
+++ b/kvmtool/hw/i8042.c
@@ -0,0 +1,367 @@
+#include "kvm/read-write.h"
+#include "kvm/ioport.h"
+#include "kvm/mutex.h"
+#include "kvm/util.h"
+#include "kvm/term.h"
+#include "kvm/kvm.h"
+#include "kvm/i8042.h"
+#include "kvm/kvm-cpu.h"
+
+#include <stdint.h>
+
+/*
+ * IRQs
+ */
+#define KBD_IRQ			1
+#define AUX_IRQ			12
+
+/*
+ * Registers
+ */
+#define I8042_DATA_REG		0x60
+#define I8042_PORT_B_REG	0x61
+#define I8042_COMMAND_REG	0x64
+
+/*
+ * Commands
+ */
+#define I8042_CMD_CTL_RCTR	0x20
+#define I8042_CMD_CTL_WCTR	0x60
+#define I8042_CMD_AUX_LOOP	0xD3
+#define I8042_CMD_AUX_SEND	0xD4
+#define I8042_CMD_AUX_TEST	0xA9
+#define I8042_CMD_AUX_DISABLE	0xA7
+#define I8042_CMD_AUX_ENABLE	0xA8
+#define I8042_CMD_SYSTEM_RESET	0xFE
+
+#define RESPONSE_ACK		0xFA
+
+#define MODE_DISABLE_AUX	0x20
+
+#define AUX_ENABLE_REPORTING	0x20
+#define AUX_SCALING_FLAG	0x10
+#define AUX_DEFAULT_RESOLUTION	0x2
+#define AUX_DEFAULT_SAMPLE	100
+
+/*
+ * Status register bits
+ */
+#define I8042_STR_AUXDATA	0x20
+#define I8042_STR_KEYLOCK	0x10
+#define I8042_STR_CMDDAT	0x08
+#define I8042_STR_MUXERR	0x04
+#define I8042_STR_OBF		0x01
+
+#define KBD_MODE_KBD_INT	0x01
+#define KBD_MODE_SYS		0x02
+
+#define QUEUE_SIZE		128
+
+/*
+ * This represents the current state of the PS/2 keyboard system,
+ * including the AUX device (the mouse)
+ */
+struct kbd_state {
+	struct kvm		*kvm;
+
+	char			kq[QUEUE_SIZE];	/* Keyboard queue */
+	int			kread, kwrite;	/* Indexes into the queue */
+	int			kcount;		/* number of elements in queue */
+
+	char			mq[QUEUE_SIZE];
+	int			mread, mwrite;
+	int			mcount;
+
+	u8			mstatus;	/* Mouse status byte */
+	u8			mres;		/* Current mouse resolution */
+	u8			msample;	/* Current mouse samples/second */
+
+	u8			mode;		/* i8042 mode register */
+	u8			status;		/* i8042 status register */
+	/*
+	 * Some commands (on port 0x64) have arguments;
+	 * we store the command here while we wait for the argument
+	 */
+	u32			write_cmd;
+};
+
+static struct kbd_state		state;
+
+/*
+ * If there are packets to be read, set the appropriate IRQs high
+ */
+static void kbd_update_irq(void)
+{
+	u8 klevel = 0;
+	u8 mlevel = 0;
+
+	/* First, clear the kbd and aux output buffer full bits */
+	state.status &= ~(I8042_STR_OBF | I8042_STR_AUXDATA);
+
+	if (state.kcount > 0) {
+		state.status |= I8042_STR_OBF;
+		klevel = 1;
+	}
+
+	/* Keyboard has higher priority than mouse */
+	if (klevel == 0 && state.mcount != 0) {
+		state.status |= I8042_STR_OBF | I8042_STR_AUXDATA;
+		mlevel = 1;
+	}
+
+	kvm__irq_line(state.kvm, KBD_IRQ, klevel);
+	kvm__irq_line(state.kvm, AUX_IRQ, mlevel);
+}
+
+/*
+ * Add a byte to the mouse queue, then set IRQs
+ */
+void mouse_queue(u8 c)
+{
+	if (state.mcount >= QUEUE_SIZE)
+		return;
+
+	state.mq[state.mwrite++ % QUEUE_SIZE] = c;
+
+	state.mcount++;
+	kbd_update_irq();
+}
+
+/*
+ * Add a byte to the keyboard queue, then set IRQs
+ */
+void kbd_queue(u8 c)
+{
+	if (state.kcount >= QUEUE_SIZE)
+		return;
+
+	state.kq[state.kwrite++ % QUEUE_SIZE] = c;
+
+	state.kcount++;
+	kbd_update_irq();
+}
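+
+/*
+ * Usage sketch (illustrative scancode values, set 1): input backends
+ * translate their key events into bytes and push them here:
+ *
+ *	kbd_queue(0x1c);	// 'enter' make code
+ *	kbd_queue(0x9c);	// matching break code (bit 7 set)
+ */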
+
+static void kbd_write_command(struct kvm *kvm, u8 val)
+{
+	switch (val) {
+	case I8042_CMD_CTL_RCTR:
+		kbd_queue(state.mode);
+		break;
+	case I8042_CMD_CTL_WCTR:
+	case I8042_CMD_AUX_SEND:
+	case I8042_CMD_AUX_LOOP:
+		state.write_cmd = val;
+		break;
+	case I8042_CMD_AUX_TEST:
+		/* 0 means we're a normal PS/2 mouse */
+		mouse_queue(0);
+		break;
+	case I8042_CMD_AUX_DISABLE:
+		state.mode |= MODE_DISABLE_AUX;
+		break;
+	case I8042_CMD_AUX_ENABLE:
+		state.mode &= ~MODE_DISABLE_AUX;
+		break;
+	case I8042_CMD_SYSTEM_RESET:
+		kvm__reboot(kvm);
+		break;
+	default:
+		break;
+	}
+}
+
+/*
+ * Called when the OS reads from port 0x60 (PS/2 data)
+ */
+static u32 kbd_read_data(void)
+{
+	u32 ret;
+	int i;
+
+	if (state.kcount != 0) {
+		/* Keyboard data gets read first */
+		ret = state.kq[state.kread++ % QUEUE_SIZE];
+		state.kcount--;
+		kvm__irq_line(state.kvm, KBD_IRQ, 0);
+		kbd_update_irq();
+	} else if (state.mcount > 0) {
+		/* Followed by the mouse */
+		ret = state.mq[state.mread++ % QUEUE_SIZE];
+		state.mcount--;
+		kvm__irq_line(state.kvm, AUX_IRQ, 0);
+		kbd_update_irq();
+	} else {
+		i = state.kread - 1;
+		if (i < 0)
+			i = QUEUE_SIZE - 1;
+		ret = state.kq[i];
+	}
+	return ret;
+}
+
+/*
+ * Called when the OS reads from port 0x64 (the status register)
+ */
+static u32 kbd_read_status(void)
+{
+	return (u32)state.status;
+}
+
+/*
+ * Called when the OS writes to port 0x60 (data port)
+ * Things written here are generally arguments to commands previously
+ * written to port 0x64 and stored in state.write_cmd
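+ * (e.g. I8042_CMD_AUX_SEND written to 0x64, then the mouse command
+ * byte itself, such as 0xf4 "enable reporting", written to 0x60)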
+ */
+static void kbd_write_data(u32 val)
+{
+	switch (state.write_cmd) {
+	case I8042_CMD_CTL_WCTR:
+		state.mode = val;
+		kbd_update_irq();
+		break;
+	case I8042_CMD_AUX_LOOP:
+		mouse_queue(val);
+		mouse_queue(RESPONSE_ACK);
+		break;
+	case I8042_CMD_AUX_SEND:
+		/* The OS wants to send a command to the mouse */
+		mouse_queue(RESPONSE_ACK);
+		switch (val) {
+		case 0xe6:
+			/* set scaling = 1:1 */
+			state.mstatus &= ~AUX_SCALING_FLAG;
+			break;
+		case 0xe8:
+			/* set resolution */
+			state.mres = val;
+			break;
+		case 0xe9:
+			/* Report mouse status/config */
+			mouse_queue(state.mstatus);
+			mouse_queue(state.mres);
+			mouse_queue(state.msample);
+			break;
+		case 0xf2:
+			/* send ID */
+			mouse_queue(0); /* normal mouse */
+			break;
+		case 0xf3:
+			/* set sample rate */
+			state.msample = val;
+			break;
+		case 0xf4:
+			/* enable reporting */
+			state.mstatus |= AUX_ENABLE_REPORTING;
+			break;
+		case 0xf5:
+			state.mstatus &= ~AUX_ENABLE_REPORTING;
+			break;
+		case 0xf6:
+			/* set defaults, just fall through to reset */
+		case 0xff:
+			/* reset */
+			state.mstatus = 0x0;
+			state.mres = AUX_DEFAULT_RESOLUTION;
+			state.msample = AUX_DEFAULT_SAMPLE;
+			break;
+		default:
+			break;
+	}
+	break;
+	case 0:
+		/* Just send the ID */
+		kbd_queue(RESPONSE_ACK);
+		kbd_queue(0xab);
+		kbd_queue(0x41);
+		kbd_update_irq();
+		break;
+	default:
+		/* Unknown or unsupported command: ignore */
+		break;
+	}
+	state.write_cmd = 0;
+}
+
+static void kbd_reset(void)
+{
+	state = (struct kbd_state) {
+		.status		= I8042_STR_MUXERR | I8042_STR_CMDDAT | I8042_STR_KEYLOCK, /* 0x1c */
+		.mode		= KBD_MODE_KBD_INT | KBD_MODE_SYS, /* 0x3 */
+		.mres		= AUX_DEFAULT_RESOLUTION,
+		.msample	= AUX_DEFAULT_SAMPLE,
+	};
+}
+
+/*
+ * Called when the OS has written to one of the keyboard's ports (0x60 or 0x64)
+ */
+static bool kbd_in(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port, void *data, int size)
+{
+	switch (port) {
+	case I8042_COMMAND_REG: {
+		u8 value = kbd_read_status();
+		ioport__write8(data, value);
+		break;
+	}
+	case I8042_DATA_REG: {
+		u32 value = kbd_read_data();
+		ioport__write32(data, value);
+		break;
+	}
+	case I8042_PORT_B_REG: {
+		ioport__write8(data, 0x20);
+		break;
+	}
+	default:
+		return false;
+	}
+
+	return true;
+}
+
+static bool kbd_out(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port, void *data, int size)
+{
+	switch (port) {
+	case I8042_COMMAND_REG: {
+		u8 value = ioport__read8(data);
+		kbd_write_command(vcpu->kvm, value);
+		break;
+	}
+	case I8042_DATA_REG: {
+		u32 value = ioport__read32(data);
+		kbd_write_data(value);
+		break;
+	}
+	case I8042_PORT_B_REG: {
+		break;
+	}
+	default:
+		return false;
+	}
+
+	return true;
+}
+
+static struct ioport_operations kbd_ops = {
+	.io_in		= kbd_in,
+	.io_out		= kbd_out,
+};
+
+int kbd__init(struct kvm *kvm)
+{
+	int r;
+
+	kbd_reset();
+	state.kvm = kvm;
+	r = ioport__register(kvm, I8042_DATA_REG, &kbd_ops, 2, NULL);
+	if (r < 0)
+		return r;
+	r = ioport__register(kvm, I8042_COMMAND_REG, &kbd_ops, 2, NULL);
+	if (r < 0) {
+		ioport__unregister(kvm, I8042_DATA_REG);
+		return r;
+	}
+
+	return 0;
+}
+dev_init(kbd__init);
diff --git a/kvmtool/hw/rtc.c b/kvmtool/hw/rtc.c
new file mode 100644
index 0000000..5483879
--- /dev/null
+++ b/kvmtool/hw/rtc.c
@@ -0,0 +1,195 @@
+#include "kvm/rtc.h"
+
+#include "kvm/ioport.h"
+#include "kvm/kvm.h"
+
+#include <time.h>
+
+/*
+ * MC146818 RTC registers
+ */
+#define RTC_SECONDS			0x00
+#define RTC_SECONDS_ALARM		0x01
+#define RTC_MINUTES			0x02
+#define RTC_MINUTES_ALARM		0x03
+#define RTC_HOURS			0x04
+#define RTC_HOURS_ALARM			0x05
+#define RTC_DAY_OF_WEEK			0x06
+#define RTC_DAY_OF_MONTH		0x07
+#define RTC_MONTH			0x08
+#define RTC_YEAR			0x09
+#define RTC_CENTURY			0x32
+
+#define RTC_REG_A			0x0A
+#define RTC_REG_B			0x0B
+#define RTC_REG_C			0x0C
+#define RTC_REG_D			0x0D
+
+/*
+ * Register D Bits
+ */
+#define RTC_REG_D_VRT			(1 << 7)
+
+struct rtc_device {
+	u8			cmos_idx;
+	u8			cmos_data[128];
+};
+
+static struct rtc_device	rtc;
+
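+/* Pack a two-digit binary value into BCD, e.g. bin2bcd(59) == 0x59 */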
+static inline unsigned char bin2bcd(unsigned val)
+{
+	return ((val / 10) << 4) + val % 10;
+}
+
+static bool cmos_ram_data_in(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port, void *data, int size)
+{
+	struct tm *tm;
+	time_t ti;
+
+	time(&ti);
+
+	tm = gmtime(&ti);
+
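+	/* The emulated RTC follows host UTC (gmtime); time registers are BCD */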
+	switch (rtc.cmos_idx) {
+	case RTC_SECONDS:
+		ioport__write8(data, bin2bcd(tm->tm_sec));
+		break;
+	case RTC_MINUTES:
+		ioport__write8(data, bin2bcd(tm->tm_min));
+		break;
+	case RTC_HOURS:
+		ioport__write8(data, bin2bcd(tm->tm_hour));
+		break;
+	case RTC_DAY_OF_WEEK:
+		ioport__write8(data, bin2bcd(tm->tm_wday + 1));
+		break;
+	case RTC_DAY_OF_MONTH:
+		ioport__write8(data, bin2bcd(tm->tm_mday));
+		break;
+	case RTC_MONTH:
+		ioport__write8(data, bin2bcd(tm->tm_mon + 1));
+		break;
+	case RTC_YEAR: {
+		int year;
+
+		year = tm->tm_year + 1900;
+
+		ioport__write8(data, bin2bcd(year % 100));
+
+		break;
+	}
+	case RTC_CENTURY: {
+		int year;
+
+		year = tm->tm_year + 1900;
+
+		ioport__write8(data, bin2bcd(year / 100));
+
+		break;
+	}
+	default:
+		ioport__write8(data, rtc.cmos_data[rtc.cmos_idx]);
+		break;
+	}
+
+	return true;
+}
+
+static bool cmos_ram_data_out(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port, void *data, int size)
+{
+	switch (rtc.cmos_idx) {
+	case RTC_REG_C:
+	case RTC_REG_D:
+		/* Read-only */
+		break;
+	default:
+		rtc.cmos_data[rtc.cmos_idx] = ioport__read8(data);
+		break;
+	}
+
+	return true;
+}
+
+static struct ioport_operations cmos_ram_data_ioport_ops = {
+	.io_out		= cmos_ram_data_out,
+	.io_in		= cmos_ram_data_in,
+};
+
+static bool cmos_ram_index_out(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port, void *data, int size)
+{
+	u8 value = ioport__read8(data);
+
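+	/* Bit 7 of the index byte gates NMI delivery; bits 0-6 select the CMOS register */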
+	vcpu->kvm->nmi_disabled	= value & (1UL << 7);
+	rtc.cmos_idx		= value & ~(1UL << 7);
+
+	return true;
+}
+
+static struct ioport_operations cmos_ram_index_ioport_ops = {
+	.io_out		= cmos_ram_index_out,
+};
+
+#ifdef CONFIG_HAS_LIBFDT
+static void generate_rtc_fdt_node(void *fdt,
+				  struct device_header *dev_hdr,
+				  void (*generate_irq_prop)(void *fdt,
+							    u8 irq,
+							    enum irq_type))
+{
+	u64 reg_prop[2] = { cpu_to_fdt64(0x70), cpu_to_fdt64(2) };
+
+	_FDT(fdt_begin_node(fdt, "rtc"));
+	_FDT(fdt_property_string(fdt, "compatible", "motorola,mc146818"));
+	_FDT(fdt_property(fdt, "reg", reg_prop, sizeof(reg_prop)));
+	_FDT(fdt_end_node(fdt));
+}
+#else
+#define generate_rtc_fdt_node NULL
+#endif
+
+struct device_header rtc_dev_hdr = {
+	.bus_type = DEVICE_BUS_IOPORT,
+	.data = generate_rtc_fdt_node,
+};
+
+int rtc__init(struct kvm *kvm)
+{
+	int r;
+
+	r = device__register(&rtc_dev_hdr);
+	if (r < 0)
+		return r;
+
+	/* PORT 0070-007F - CMOS RAM/RTC (REAL TIME CLOCK) */
+	r = ioport__register(kvm, 0x0070, &cmos_ram_index_ioport_ops, 1, NULL);
+	if (r < 0)
+		goto out_device;
+
+	r = ioport__register(kvm, 0x0071, &cmos_ram_data_ioport_ops, 1, NULL);
+	if (r < 0)
+		goto out_ioport;
+
+	/* Set the VRT bit in Register D to indicate valid RAM and time */
+	rtc.cmos_data[RTC_REG_D] = RTC_REG_D_VRT;
+
+	return r;
+
+out_ioport:
+	ioport__unregister(kvm, 0x0070);
+out_device:
+	device__unregister(&rtc_dev_hdr);
+
+	return r;
+}
+dev_init(rtc__init);
+
+int rtc__exit(struct kvm *kvm)
+{
+	/* PORT 0070-007F - CMOS RAM/RTC (REAL TIME CLOCK) */
+	ioport__unregister(kvm, 0x0070);
+	ioport__unregister(kvm, 0x0071);
+
+	return 0;
+}
+dev_exit(rtc__exit);
diff --git a/kvmtool/hw/serial.c b/kvmtool/hw/serial.c
new file mode 100644
index 0000000..13c4663
--- /dev/null
+++ b/kvmtool/hw/serial.c
@@ -0,0 +1,462 @@
+#include "kvm/8250-serial.h"
+
+#include "kvm/read-write.h"
+#include "kvm/ioport.h"
+#include "kvm/mutex.h"
+#include "kvm/util.h"
+#include "kvm/term.h"
+#include "kvm/kvm.h"
+#include "kvm/fdt.h"
+
+#include <linux/types.h>
+#include <linux/serial_reg.h>
+
+#include <pthread.h>
+
+/*
+ * This fakes a U6_16550A. The FIFO length needs to be 64 as the kernel
+ * expects that for autodetection.
+ */
+#define FIFO_LEN		64
+#define FIFO_MASK		(FIFO_LEN - 1)
+
+#define UART_IIR_TYPE_BITS	0xc0
+
+struct serial8250_device {
+	struct mutex		mutex;
+	u8			id;
+
+	u16			iobase;
+	u8			irq;
+	u8			irq_state;
+	int			txcnt;
+	int			rxcnt;
+	int			rxdone;
+	char			txbuf[FIFO_LEN];
+	char			rxbuf[FIFO_LEN];
+
+	u8			dll;
+	u8			dlm;
+	u8			iir;
+	u8			ier;
+	u8			fcr;
+	u8			lcr;
+	u8			mcr;
+	u8			lsr;
+	u8			msr;
+	u8			scr;
+};
+
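+/*
+ * Reset defaults: no interrupt pending, transmitter idle (TEMT|THRE),
+ * and DCD/DSR/CTS asserted so the guest sees a live, connected line.
+ */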
+#define SERIAL_REGS_SETTING \
+	.iir			= UART_IIR_NO_INT, \
+	.lsr			= UART_LSR_TEMT | UART_LSR_THRE, \
+	.msr			= UART_MSR_DCD | UART_MSR_DSR | UART_MSR_CTS, \
+	.mcr			= UART_MCR_OUT2,
+
+static struct serial8250_device devices[] = {
+	/* ttyS0 */
+	[0]	= {
+		.mutex			= MUTEX_INITIALIZER,
+
+		.id			= 0,
+		.iobase			= 0x3f8,
+		.irq			= 4,
+
+		SERIAL_REGS_SETTING
+	},
+	/* ttyS1 */
+	[1]	= {
+		.mutex			= MUTEX_INITIALIZER,
+
+		.id			= 1,
+		.iobase			= 0x2f8,
+		.irq			= 3,
+
+		SERIAL_REGS_SETTING
+	},
+	/* ttyS2 */
+	[2]	= {
+		.mutex			= MUTEX_INITIALIZER,
+
+		.id			= 2,
+		.iobase			= 0x3e8,
+		.irq			= 4,
+
+		SERIAL_REGS_SETTING
+	},
+	/* ttyS3 */
+	[3]	= {
+		.mutex			= MUTEX_INITIALIZER,
+
+		.id			= 3,
+		.iobase			= 0x2e8,
+		.irq			= 3,
+
+		SERIAL_REGS_SETTING
+	},
+};
+
+static void serial8250_flush_tx(struct kvm *kvm, struct serial8250_device *dev)
+{
+	dev->lsr |= UART_LSR_TEMT | UART_LSR_THRE;
+
+	if (dev->txcnt) {
+		term_putc(dev->txbuf, dev->txcnt, dev->id);
+		dev->txcnt = 0;
+	}
+}
+
+static void serial8250_update_irq(struct kvm *kvm, struct serial8250_device *dev)
+{
+	u8 iir = 0;
+
+	/* Handle clear rx */
+	if (dev->fcr & UART_FCR_CLEAR_RCVR) {
+		dev->fcr &= ~UART_FCR_CLEAR_RCVR;
+		dev->rxcnt = dev->rxdone = 0;
+		dev->lsr &= ~UART_LSR_DR;
+	}
+
+	/* Handle clear tx */
+	if (dev->fcr & UART_FCR_CLEAR_XMIT) {
+		dev->fcr &= ~UART_FCR_CLEAR_XMIT;
+		dev->txcnt = 0;
+		dev->lsr |= UART_LSR_TEMT | UART_LSR_THRE;
+	}
+
+	/* Data ready and rcv interrupt enabled ? */
+	if ((dev->ier & UART_IER_RDI) && (dev->lsr & UART_LSR_DR))
+		iir |= UART_IIR_RDI;
+
+	/* Transmitter empty and interrupt enabled ? */
+	if ((dev->ier & UART_IER_THRI) && (dev->lsr & UART_LSR_TEMT))
+		iir |= UART_IIR_THRI;
+
+	/* Now update the irq line, if necessary */
+	if (!iir) {
+		dev->iir = UART_IIR_NO_INT;
+		if (dev->irq_state)
+			kvm__irq_line(kvm, dev->irq, 0);
+	} else {
+		dev->iir = iir;
+		if (!dev->irq_state)
+			kvm__irq_line(kvm, dev->irq, 1);
+	}
+	dev->irq_state = iir;
+
+	/*
+	 * If the kernel disabled the tx interrupt, we know that there
+	 * is nothing more to transmit, so we can reset our tx logic
+	 * here.
+	 */
+	if (!(dev->ier & UART_IER_THRI))
+		serial8250_flush_tx(kvm, dev);
+}
+
+#define SYSRQ_PENDING_NONE		0
+
+static int sysrq_pending;
+
+static void serial8250__sysrq(struct kvm *kvm, struct serial8250_device *dev)
+{
+	dev->lsr |= UART_LSR_DR | UART_LSR_BI;
+	dev->rxbuf[dev->rxcnt++] = sysrq_pending;
+	sysrq_pending	= SYSRQ_PENDING_NONE;
+}
+
+static void serial8250__receive(struct kvm *kvm, struct serial8250_device *dev,
+				bool handle_sysrq)
+{
+	int c;
+
+	if (dev->mcr & UART_MCR_LOOP)
+		return;
+
+	if ((dev->lsr & UART_LSR_DR) || dev->rxcnt)
+		return;
+
+	if (handle_sysrq && sysrq_pending) {
+		serial8250__sysrq(kvm, dev);
+		return;
+	}
+
+	if (kvm->cfg.active_console != CONSOLE_8250)
+		return;
+
+	while (term_readable(dev->id) &&
+	       dev->rxcnt < FIFO_LEN) {
+
+		c = term_getc(kvm, dev->id);
+
+		if (c < 0)
+			break;
+		dev->rxbuf[dev->rxcnt++] = c;
+		dev->lsr |= UART_LSR_DR;
+	}
+}
+
+void serial8250__update_consoles(struct kvm *kvm)
+{
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(devices); i++) {
+		struct serial8250_device *dev = &devices[i];
+
+		mutex_lock(&dev->mutex);
+
+		/* Restrict sysrq injection to the first port */
+		serial8250__receive(kvm, dev, i == 0);
+
+		serial8250_update_irq(kvm, dev);
+
+		mutex_unlock(&dev->mutex);
+	}
+}
+
+void serial8250__inject_sysrq(struct kvm *kvm, char sysrq)
+{
+	sysrq_pending = sysrq;
+}
+
+static bool serial8250_out(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port,
+			   void *data, int size)
+{
+	struct serial8250_device *dev = ioport->priv;
+	u16 offset;
+	bool ret = true;
+	char *addr = data;
+
+	mutex_lock(&dev->mutex);
+
+	offset = port - dev->iobase;
+
+	switch (offset) {
+	case UART_TX:
+		if (dev->lcr & UART_LCR_DLAB) {
+			dev->dll = ioport__read8(data);
+			break;
+		}
+
+		/* Loopback mode */
+		if (dev->mcr & UART_MCR_LOOP) {
+			if (dev->rxcnt < FIFO_LEN) {
+				dev->rxbuf[dev->rxcnt++] = *addr;
+				dev->lsr |= UART_LSR_DR;
+			}
+			break;
+		}
+
+		if (dev->txcnt < FIFO_LEN) {
+			dev->txbuf[dev->txcnt++] = *addr;
+			dev->lsr &= ~UART_LSR_TEMT;
+			if (dev->txcnt == FIFO_LEN / 2)
+				dev->lsr &= ~UART_LSR_THRE;
+			serial8250_flush_tx(vcpu->kvm, dev);
+		} else {
+			/* Should never happen */
+			dev->lsr &= ~(UART_LSR_TEMT | UART_LSR_THRE);
+		}
+		break;
+	case UART_IER:
+		if (!(dev->lcr & UART_LCR_DLAB))
+			dev->ier = ioport__read8(data) & 0x0f;
+		else
+			dev->dlm = ioport__read8(data);
+		break;
+	case UART_FCR:
+		dev->fcr = ioport__read8(data);
+		break;
+	case UART_LCR:
+		dev->lcr = ioport__read8(data);
+		break;
+	case UART_MCR:
+		dev->mcr = ioport__read8(data);
+		break;
+	case UART_LSR:
+		/* Factory test */
+		break;
+	case UART_MSR:
+		/* Not used */
+		break;
+	case UART_SCR:
+		dev->scr = ioport__read8(data);
+		break;
+	default:
+		ret = false;
+		break;
+	}
+
+	serial8250_update_irq(vcpu->kvm, dev);
+
+	mutex_unlock(&dev->mutex);
+
+	return ret;
+}
+
+static void serial8250_rx(struct serial8250_device *dev, void *data)
+{
+	if (dev->rxdone == dev->rxcnt)
+		return;
+
+	/* Break issued ? */
+	if (dev->lsr & UART_LSR_BI) {
+		dev->lsr &= ~UART_LSR_BI;
+		ioport__write8(data, 0);
+		return;
+	}
+
+	ioport__write8(data, dev->rxbuf[dev->rxdone++]);
+	if (dev->rxcnt == dev->rxdone) {
+		dev->lsr &= ~UART_LSR_DR;
+		dev->rxcnt = dev->rxdone = 0;
+	}
+}
+
+static bool serial8250_in(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port, void *data, int size)
+{
+	struct serial8250_device *dev = ioport->priv;
+	u16 offset;
+	bool ret = true;
+
+	mutex_lock(&dev->mutex);
+
+	offset = port - dev->iobase;
+
+	switch (offset) {
+	case UART_RX:
+		if (dev->lcr & UART_LCR_DLAB)
+			ioport__write8(data, dev->dll);
+		else
+			serial8250_rx(dev, data);
+		break;
+	case UART_IER:
+		if (dev->lcr & UART_LCR_DLAB)
+			ioport__write8(data, dev->dlm);
+		else
+			ioport__write8(data, dev->ier);
+		break;
+	case UART_IIR:
+		ioport__write8(data, dev->iir | UART_IIR_TYPE_BITS);
+		break;
+	case UART_LCR:
+		ioport__write8(data, dev->lcr);
+		break;
+	case UART_MCR:
+		ioport__write8(data, dev->mcr);
+		break;
+	case UART_LSR:
+		ioport__write8(data, dev->lsr);
+		break;
+	case UART_MSR:
+		ioport__write8(data, dev->msr);
+		break;
+	case UART_SCR:
+		ioport__write8(data, dev->scr);
+		break;
+	default:
+		ret = false;
+		break;
+	}
+
+	serial8250_update_irq(vcpu->kvm, dev);
+
+	mutex_unlock(&dev->mutex);
+
+	return ret;
+}
+
+#ifdef CONFIG_HAS_LIBFDT
+
+char *fdt_stdout_path = NULL;
+
+#define DEVICE_NAME_MAX_LEN 32
+static
+void serial8250_generate_fdt_node(struct ioport *ioport, void *fdt,
+				  void (*generate_irq_prop)(void *fdt,
+							    u8 irq,
+							    enum irq_type))
+{
+	char dev_name[DEVICE_NAME_MAX_LEN];
+	struct serial8250_device *dev = ioport->priv;
+	u64 addr = KVM_IOPORT_AREA + dev->iobase;
+	u64 reg_prop[] = {
+		cpu_to_fdt64(addr),
+		cpu_to_fdt64(8),
+	};
+
+	snprintf(dev_name, DEVICE_NAME_MAX_LEN, "U6_16550A@%llx", addr);
+
+	if (!fdt_stdout_path) {
+		fdt_stdout_path = malloc(strlen(dev_name) + 2);
+		/* Assumes that this node is a child of the root node. */
+		sprintf(fdt_stdout_path, "/%s", dev_name);
+	}
+
+	_FDT(fdt_begin_node(fdt, dev_name));
+	_FDT(fdt_property_string(fdt, "compatible", "ns16550a"));
+	_FDT(fdt_property(fdt, "reg", reg_prop, sizeof(reg_prop)));
+	generate_irq_prop(fdt, dev->irq, IRQ_TYPE_LEVEL_HIGH);
+	_FDT(fdt_property_cell(fdt, "clock-frequency", 1843200));
+	_FDT(fdt_end_node(fdt));
+}
+#else
+#define serial8250_generate_fdt_node	NULL
+#endif
+
+static struct ioport_operations serial8250_ops = {
+	.io_in			= serial8250_in,
+	.io_out			= serial8250_out,
+	.generate_fdt_node	= serial8250_generate_fdt_node,
+};
+
+static int serial8250__device_init(struct kvm *kvm, struct serial8250_device *dev)
+{
+	int r;
+
+	ioport__map_irq(&dev->irq);
+	r = ioport__register(kvm, dev->iobase, &serial8250_ops, 8, dev);
+
+	return r;
+}
+
+int serial8250__init(struct kvm *kvm)
+{
+	unsigned int i, j;
+	int r = 0;
+
+	for (i = 0; i < ARRAY_SIZE(devices); i++) {
+		struct serial8250_device *dev = &devices[i];
+
+		r = serial8250__device_init(kvm, dev);
+		if (r < 0)
+			goto cleanup;
+	}
+
+	return r;
+cleanup:
+	for (j = 0; j < i; j++) {
+		struct serial8250_device *dev = &devices[j];
+
+		ioport__unregister(kvm, dev->iobase);
+	}
+
+	return r;
+}
+dev_init(serial8250__init);
+
+int serial8250__exit(struct kvm *kvm)
+{
+	unsigned int i;
+	int r;
+
+	for (i = 0; i < ARRAY_SIZE(devices); i++) {
+		struct serial8250_device *dev = &devices[i];
+
+		r = ioport__unregister(kvm, dev->iobase);
+		if (r < 0)
+			return r;
+	}
+
+	return 0;
+}
+dev_exit(serial8250__exit);
diff --git a/kvmtool/hw/vesa.c b/kvmtool/hw/vesa.c
new file mode 100644
index 0000000..8659a00
--- /dev/null
+++ b/kvmtool/hw/vesa.c
@@ -0,0 +1,122 @@
+#include "kvm/vesa.h"
+
+#include "kvm/devices.h"
+#include "kvm/virtio-pci-dev.h"
+#include "kvm/framebuffer.h"
+#include "kvm/kvm-cpu.h"
+#include "kvm/ioport.h"
+#include "kvm/util.h"
+#include "kvm/irq.h"
+#include "kvm/kvm.h"
+#include "kvm/pci.h"
+
+#include <linux/byteorder.h>
+#include <sys/mman.h>
+#include <linux/err.h>
+#include <sys/types.h>
+#include <sys/ioctl.h>
+#include <inttypes.h>
+#include <unistd.h>
+
+static struct pci_device_header vesa_pci_device = {
+	.vendor_id	= cpu_to_le16(PCI_VENDOR_ID_REDHAT_QUMRANET),
+	.device_id	= cpu_to_le16(PCI_DEVICE_ID_VESA),
+	.header_type	= PCI_HEADER_TYPE_NORMAL,
+	.revision_id	= 0,
+	.class[2]	= 0x03,
+	.subsys_vendor_id = cpu_to_le16(PCI_SUBSYSTEM_VENDOR_ID_REDHAT_QUMRANET),
+	.subsys_id	= cpu_to_le16(PCI_SUBSYSTEM_ID_VESA),
+	.bar[1]		= cpu_to_le32(VESA_MEM_ADDR | PCI_BASE_ADDRESS_SPACE_MEMORY),
+	.bar_size[1]	= VESA_MEM_SIZE,
+};
+
+static struct device_header vesa_device = {
+	.bus_type	= DEVICE_BUS_PCI,
+	.data		= &vesa_pci_device,
+};
+
+static struct framebuffer vesafb = {
+	.width		= VESA_WIDTH,
+	.height		= VESA_HEIGHT,
+	.depth		= VESA_BPP,
+	.mem_addr	= VESA_MEM_ADDR,
+	.mem_size	= VESA_MEM_SIZE,
+};
+
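+/*
+ * The I/O BAR carries no registers; we claim the port range below and
+ * silently discard all accesses to it.
+ */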
+static bool vesa_pci_io_in(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port, void *data, int size)
+{
+	return true;
+}
+
+static bool vesa_pci_io_out(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port, void *data, int size)
+{
+	return true;
+}
+
+static struct ioport_operations vesa_io_ops = {
+	.io_in			= vesa_pci_io_in,
+	.io_out			= vesa_pci_io_out,
+};
+
+static int vesa__bar_activate(struct kvm *kvm, struct pci_device_header *pci_hdr,
+			      int bar_num, void *data)
+{
+	/* We don't support remapping of the framebuffer. */
+	return 0;
+}
+
+static int vesa__bar_deactivate(struct kvm *kvm, struct pci_device_header *pci_hdr,
+				int bar_num, void *data)
+{
+	/* We don't support remapping of the framebuffer. */
+	return -EINVAL;
+}
+
+struct framebuffer *vesa__init(struct kvm *kvm)
+{
+	u16 vesa_base_addr;
+	char *mem;
+	int r;
+
+	BUILD_BUG_ON(!is_power_of_two(VESA_MEM_SIZE));
+	BUILD_BUG_ON(VESA_MEM_SIZE < VESA_BPP/8 * VESA_WIDTH * VESA_HEIGHT);
+
+	vesa_base_addr = pci_get_io_port_block(PCI_IO_SIZE);
+	r = ioport__register(kvm, vesa_base_addr, &vesa_io_ops, PCI_IO_SIZE, NULL);
+	if (r < 0)
+		goto out_error;
+
+	vesa_pci_device.bar[0]		= cpu_to_le32(vesa_base_addr | PCI_BASE_ADDRESS_SPACE_IO);
+	vesa_pci_device.bar_size[0]	= PCI_IO_SIZE;
+	r = pci__register_bar_regions(kvm, &vesa_pci_device, vesa__bar_activate,
+				      vesa__bar_deactivate, NULL);
+	if (r < 0)
+		goto unregister_ioport;
+
+	r = device__register(&vesa_device);
+	if (r < 0)
+		goto unregister_ioport;
+
+	mem = mmap(NULL, VESA_MEM_SIZE, PROT_RW, MAP_ANON_NORESERVE, -1, 0);
+	if (mem == MAP_FAILED) {
+		r = -errno;
+		goto unregister_device;
+	}
+
+	r = kvm__register_dev_mem(kvm, VESA_MEM_ADDR, VESA_MEM_SIZE, mem);
+	if (r < 0)
+		goto unmap_dev;
+
+	vesafb.mem = mem;
+	vesafb.kvm = kvm;
+	return fb__register(&vesafb);
+
+unmap_dev:
+	munmap(mem, VESA_MEM_SIZE);
+unregister_device:
+	device__unregister(&vesa_device);
+unregister_ioport:
+	ioport__unregister(kvm, vesa_base_addr);
+out_error:
+	return ERR_PTR(r);
+}
diff --git a/kvmtool/include/asm/hweight.h b/kvmtool/include/asm/hweight.h
new file mode 100644
index 0000000..1a43977
--- /dev/null
+++ b/kvmtool/include/asm/hweight.h
@@ -0,0 +1,8 @@
+#ifndef _KVM_ASM_HWEIGHT_H_
+#define _KVM_ASM_HWEIGHT_H_
+
+#include <linux/types.h>
+
+unsigned int hweight32(unsigned int w);
+unsigned long hweight64(__u64 w);
+
+#endif /* _KVM_ASM_HWEIGHT_H_ */
diff --git a/kvmtool/include/common-cmds.h b/kvmtool/include/common-cmds.h
new file mode 100644
index 0000000..5b95676
--- /dev/null
+++ b/kvmtool/include/common-cmds.h
@@ -0,0 +1,19 @@
+struct cmdname_help {
+	char name[16];
+	char help[80];
+};
+
+static struct cmdname_help common_cmds[] = {
+	{"run", "Start the virtual machine"},
+	{"setup", "Set up a new virtual machine"},
+	{"pause", "Pause the virtual machine"},
+	{"resume", "Resume the virtual machine"},
+	{"version", "Print the version of the kernel tree kvm tools"},
+	{"list", "Print a list of running instances on the host"},
+	{"debug", "Print debug information from a running instance"},
+	{"balloon", "Inflate or deflate the virtio balloon"},
+	{"stop", "Stop a running instance"},
+	{"stat", "Print statistics about a running instance"},
+	{"sandbox", "Run a command in a sandboxed guest"},
+};
diff --git a/kvmtool/include/kvm/8250-serial.h b/kvmtool/include/kvm/8250-serial.h
new file mode 100644
index 0000000..e954551
--- /dev/null
+++ b/kvmtool/include/kvm/8250-serial.h
@@ -0,0 +1,11 @@
+#ifndef KVM__8250_SERIAL_H
+#define KVM__8250_SERIAL_H
+
+struct kvm;
+
+int serial8250__init(struct kvm *kvm);
+int serial8250__exit(struct kvm *kvm);
+void serial8250__update_consoles(struct kvm *kvm);
+void serial8250__inject_sysrq(struct kvm *kvm, char sysrq);
+
+#endif /* KVM__8250_SERIAL_H */
diff --git a/kvmtool/include/kvm/apic.h b/kvmtool/include/kvm/apic.h
new file mode 100644
index 0000000..2129997
--- /dev/null
+++ b/kvmtool/include/kvm/apic.h
@@ -0,0 +1,17 @@
+#ifndef KVM_APIC_H_
+#define KVM_APIC_H_
+
+#include <asm/apicdef.h>
+
+/*
+ * APIC, IOAPIC stuff
+ */
+#define APIC_BASE_ADDR_STEP	0x00400000
+#define IOAPIC_BASE_ADDR_STEP	0x00100000
+
+#define APIC_ADDR(apic)		(APIC_DEFAULT_PHYS_BASE + (apic) * APIC_BASE_ADDR_STEP)
+#define IOAPIC_ADDR(ioapic)	(IO_APIC_DEFAULT_PHYS_BASE + (ioapic) * IOAPIC_BASE_ADDR_STEP)
+
+#define KVM_APIC_VERSION	0x14 /* xAPIC */
+
+#endif /* KVM_APIC_H_ */
diff --git a/kvmtool/include/kvm/brlock.h b/kvmtool/include/kvm/brlock.h
new file mode 100644
index 0000000..1862210
--- /dev/null
+++ b/kvmtool/include/kvm/brlock.h
@@ -0,0 +1,39 @@
+#ifndef KVM__BRLOCK_H
+#define KVM__BRLOCK_H
+
+#include "kvm/kvm.h"
+#include "kvm/barrier.h"
+
+/*
+ * brlock is a lock which is very cheap for reads, but very expensive
+ * for writes.
+ * This lock will be used when updates are very rare and reads are common.
+ * This lock is currently implemented by stopping the guest while
+ * performing the updates. We assume that the only threads which read from
+ * the locked data are VCPU threads, and the only writer isn't a VCPU thread.
+ */
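+/*
+ * Usage sketch: VCPU threads bracket reads with br_read_lock(kvm) and
+ * br_read_unlock(kvm); the single non-VCPU updater wraps its writes in
+ * br_write_lock(kvm) and br_write_unlock(kvm), which pause and resume
+ * the guest.
+ */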
+
+#ifndef barrier
+#define barrier()		__asm__ __volatile__("": : :"memory")
+#endif
+
+#ifdef KVM_BRLOCK_DEBUG
+
+#include "kvm/rwsem.h"
+
+#define br_read_lock(kvm)	down_read(&(kvm)->brlock_sem);
+#define br_read_unlock(kvm)	up_read(&(kvm)->brlock_sem);
+
+#define br_write_lock(kvm)	down_write(&(kvm)->brlock_sem);
+#define br_write_unlock(kvm)	up_write(&(kvm)->brlock_sem);
+
+#else
+
+#define br_read_lock(kvm)	barrier()
+#define br_read_unlock(kvm)	barrier()
+
+#define br_write_lock(kvm)	kvm__pause(kvm)
+#define br_write_unlock(kvm)	kvm__continue(kvm)
+#endif
+
+#endif
diff --git a/kvmtool/include/kvm/builtin-balloon.h b/kvmtool/include/kvm/builtin-balloon.h
new file mode 100644
index 0000000..77ee656
--- /dev/null
+++ b/kvmtool/include/kvm/builtin-balloon.h
@@ -0,0 +1,9 @@
+#ifndef KVM__BALLOON_H
+#define KVM__BALLOON_H
+
+#include <kvm/util.h>
+
+int kvm_cmd_balloon(int argc, const char **argv, const char *prefix);
+void kvm_balloon_help(void) NORETURN;
+
+#endif
diff --git a/kvmtool/include/kvm/builtin-debug.h b/kvmtool/include/kvm/builtin-debug.h
new file mode 100644
index 0000000..efa0268
--- /dev/null
+++ b/kvmtool/include/kvm/builtin-debug.h
@@ -0,0 +1,20 @@
+#ifndef KVM__DEBUG_H
+#define KVM__DEBUG_H
+
+#include <kvm/util.h>
+#include <linux/types.h>
+
+#define KVM_DEBUG_CMD_TYPE_DUMP	(1 << 0)
+#define KVM_DEBUG_CMD_TYPE_NMI	(1 << 1)
+#define KVM_DEBUG_CMD_TYPE_SYSRQ (1 << 2)
+
+struct debug_cmd_params {
+	u32 dbg_type;
+	u32 cpu;
+	char sysrq;
+};
+
+int kvm_cmd_debug(int argc, const char **argv, const char *prefix);
+void kvm_debug_help(void) NORETURN;
+
+#endif
diff --git a/kvmtool/include/kvm/builtin-help.h b/kvmtool/include/kvm/builtin-help.h
new file mode 100644
index 0000000..2946743
--- /dev/null
+++ b/kvmtool/include/kvm/builtin-help.h
@@ -0,0 +1,6 @@
+#ifndef __KVM_HELP_H__
+#define __KVM_HELP_H__
+
+int kvm_cmd_help(int argc, const char **argv, const char *prefix);
+
+#endif
diff --git a/kvmtool/include/kvm/builtin-list.h b/kvmtool/include/kvm/builtin-list.h
new file mode 100644
index 0000000..47029ca
--- /dev/null
+++ b/kvmtool/include/kvm/builtin-list.h
@@ -0,0 +1,10 @@
+#ifndef KVM__LIST_H
+#define KVM__LIST_H
+
+#include <kvm/util.h>
+
+int kvm_cmd_list(int argc, const char **argv, const char *prefix);
+void kvm_list_help(void) NORETURN;
+int get_vmstate(int sock);
+
+#endif
diff --git a/kvmtool/include/kvm/builtin-pause.h b/kvmtool/include/kvm/builtin-pause.h
new file mode 100644
index 0000000..84aaee3
--- /dev/null
+++ b/kvmtool/include/kvm/builtin-pause.h
@@ -0,0 +1,9 @@
+#ifndef KVM__PAUSE_H
+#define KVM__PAUSE_H
+
+#include <kvm/util.h>
+
+int kvm_cmd_pause(int argc, const char **argv, const char *prefix);
+void kvm_pause_help(void) NORETURN;
+
+#endif
diff --git a/kvmtool/include/kvm/builtin-resume.h b/kvmtool/include/kvm/builtin-resume.h
new file mode 100644
index 0000000..7de999b
--- /dev/null
+++ b/kvmtool/include/kvm/builtin-resume.h
@@ -0,0 +1,9 @@
+#ifndef KVM__RESUME_H
+#define KVM__RESUME_H
+
+#include <kvm/util.h>
+
+int kvm_cmd_resume(int argc, const char **argv, const char *prefix);
+void kvm_resume_help(void) NORETURN;
+
+#endif
diff --git a/kvmtool/include/kvm/builtin-run.h b/kvmtool/include/kvm/builtin-run.h
new file mode 100644
index 0000000..91521a5
--- /dev/null
+++ b/kvmtool/include/kvm/builtin-run.h
@@ -0,0 +1,11 @@
+#ifndef __KVM_RUN_H__
+#define __KVM_RUN_H__
+
+#include <kvm/util.h>
+
+int kvm_cmd_run(int argc, const char **argv, const char *prefix);
+void kvm_run_help(void) NORETURN;
+
+void kvm_run_set_wrapper_sandbox(void);
+
+#endif
diff --git a/kvmtool/include/kvm/builtin-sandbox.h b/kvmtool/include/kvm/builtin-sandbox.h
new file mode 100644
index 0000000..98cd6be
--- /dev/null
+++ b/kvmtool/include/kvm/builtin-sandbox.h
@@ -0,0 +1,6 @@
+#ifndef KVM__SANDBOX_H
+#define KVM__SANDBOX_H
+
+int kvm_cmd_sandbox(int argc, const char **argv, const char *prefix);
+
+#endif
diff --git a/kvmtool/include/kvm/builtin-setup.h b/kvmtool/include/kvm/builtin-setup.h
new file mode 100644
index 0000000..239bbbd
--- /dev/null
+++ b/kvmtool/include/kvm/builtin-setup.h
@@ -0,0 +1,12 @@
+#ifndef KVM__SETUP_H
+#define KVM__SETUP_H
+
+#include <kvm/util.h>
+
+int kvm_cmd_setup(int argc, const char **argv, const char *prefix);
+void kvm_setup_help(void) NORETURN;
+int kvm_setup_create_new(const char *guestfs_name);
+void kvm_setup_resolv(const char *guestfs_name);
+int kvm_setup_guest_init(const char *guestfs_name);
+
+#endif
diff --git a/kvmtool/include/kvm/builtin-stat.h b/kvmtool/include/kvm/builtin-stat.h
new file mode 100644
index 0000000..4fecb37
--- /dev/null
+++ b/kvmtool/include/kvm/builtin-stat.h
@@ -0,0 +1,9 @@
+#ifndef KVM__STAT_H
+#define KVM__STAT_H
+
+#include <kvm/util.h>
+
+int kvm_cmd_stat(int argc, const char **argv, const char *prefix);
+void kvm_stat_help(void) NORETURN;
+
+#endif
diff --git a/kvmtool/include/kvm/builtin-stop.h b/kvmtool/include/kvm/builtin-stop.h
new file mode 100644
index 0000000..b26b275
--- /dev/null
+++ b/kvmtool/include/kvm/builtin-stop.h
@@ -0,0 +1,9 @@
+#ifndef KVM__STOP_H
+#define KVM__STOP_H
+
+#include <kvm/util.h>
+
+int kvm_cmd_stop(int argc, const char **argv, const char *prefix);
+void kvm_stop_help(void) NORETURN;
+
+#endif
diff --git a/kvmtool/include/kvm/builtin-version.h b/kvmtool/include/kvm/builtin-version.h
new file mode 100644
index 0000000..83cac4d
--- /dev/null
+++ b/kvmtool/include/kvm/builtin-version.h
@@ -0,0 +1,6 @@
+#ifndef KVM__VERSION_H
+#define KVM__VERSION_H
+
+int kvm_cmd_version(int argc, const char **argv, const char *prefix);
+
+#endif
diff --git a/kvmtool/include/kvm/compiler.h b/kvmtool/include/kvm/compiler.h
new file mode 100644
index 0000000..2013a83
--- /dev/null
+++ b/kvmtool/include/kvm/compiler.h
@@ -0,0 +1,10 @@
+#ifndef KVM_COMPILER_H_
+#define KVM_COMPILER_H_
+
+#ifndef __compiletime_error
+# define __compiletime_error(message)
+#endif
+
+#define notrace __attribute__((no_instrument_function))
+
+#endif /* KVM_COMPILER_H_ */
diff --git a/kvmtool/include/kvm/devices.h b/kvmtool/include/kvm/devices.h
new file mode 100644
index 0000000..e445db6
--- /dev/null
+++ b/kvmtool/include/kvm/devices.h
@@ -0,0 +1,30 @@
+#ifndef KVM__DEVICES_H
+#define KVM__DEVICES_H
+
+#include <linux/rbtree.h>
+#include <linux/types.h>
+#include <linux/compiler.h>
+
+enum device_bus_type {
+	DEVICE_BUS_PCI,
+	DEVICE_BUS_MMIO,
+	DEVICE_BUS_IOPORT,
+	DEVICE_BUS_MAX,
+};
+
+struct device_header {
+	enum device_bus_type	bus_type;
+	void			*data;
+	int			dev_num;
+	struct rb_node		node;
+};
+
+int __must_check device__register(struct device_header *dev);
+void device__unregister(struct device_header *dev);
+struct device_header *device__find_dev(enum device_bus_type bus_type,
+				       u8 dev_num);
+
+struct device_header *device__first_dev(enum device_bus_type bus_type);
+struct device_header *device__next_dev(struct device_header *dev);
+
+#endif /* KVM__DEVICES_H */
diff --git a/kvmtool/include/kvm/disk-image.h b/kvmtool/include/kvm/disk-image.h
new file mode 100644
index 0000000..27d4f7d
--- /dev/null
+++ b/kvmtool/include/kvm/disk-image.h
@@ -0,0 +1,137 @@
+#ifndef KVM__DISK_IMAGE_H
+#define KVM__DISK_IMAGE_H
+
+#include "kvm/read-write.h"
+#include "kvm/util.h"
+#include "kvm/parse-options.h"
+
+#include <linux/types.h>
+#include <linux/fs.h>	/* for BLKGETSIZE64 */
+#include <sys/ioctl.h>
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <stdbool.h>
+#include <sys/uio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+#ifdef CONFIG_HAS_AIO
+#include <libaio.h>
+#endif
+
+#define SECTOR_SHIFT		9
+#define SECTOR_SIZE		(1UL << SECTOR_SHIFT)
+
+enum {
+	DISK_IMAGE_REGULAR,
+	DISK_IMAGE_MMAP,
+};
+
+#define MAX_DISK_IMAGES         4
+
+struct disk_image;
+
+struct disk_image_operations {
+	ssize_t (*read)(struct disk_image *disk, u64 sector, const struct iovec *iov,
+			int iovcount, void *param);
+	ssize_t (*write)(struct disk_image *disk, u64 sector, const struct iovec *iov,
+			int iovcount, void *param);
+	int (*flush)(struct disk_image *disk);
+	int (*wait)(struct disk_image *disk);
+	int (*close)(struct disk_image *disk);
+	bool async;
+};
+
+struct disk_image_params {
+	const char *filename;
+	/*
+	 * wwpn == World Wide Port Number
+	 * tpgt == Target Portal Group Tag
+	 */
+	const char *wwpn;
+	const char *tpgt;
+	bool readonly;
+	bool direct;
+};
+
+struct disk_image {
+	int				fd;
+	u64				size;
+	struct disk_image_operations	*ops;
+	void				*priv;
+	void				*disk_req_cb_param;
+	void				(*disk_req_cb)(void *param, long len);
+	bool				readonly;
+	bool				async;
+#ifdef CONFIG_HAS_AIO
+	io_context_t			ctx;
+	int				evt;
+	pthread_t			thread;
+	u64				aio_inflight;
+#endif /* CONFIG_HAS_AIO */
+	const char			*wwpn;
+	const char			*tpgt;
+	int				debug_iodelay;
+};
+
+int disk_img_name_parser(const struct option *opt, const char *arg, int unset);
+int disk_image__init(struct kvm *kvm);
+int disk_image__exit(struct kvm *kvm);
+struct disk_image *disk_image__new(int fd, u64 size, struct disk_image_operations *ops, int mmap);
+int disk_image__flush(struct disk_image *disk);
+int disk_image__wait(struct disk_image *disk);
+ssize_t disk_image__read(struct disk_image *disk, u64 sector, const struct iovec *iov,
+				int iovcount, void *param);
+ssize_t disk_image__write(struct disk_image *disk, u64 sector, const struct iovec *iov,
+				int iovcount, void *param);
+ssize_t disk_image__get_serial(struct disk_image *disk, void *buffer, ssize_t *len);
+
+struct disk_image *raw_image__probe(int fd, struct stat *st, bool readonly);
+struct disk_image *blkdev__probe(const char *filename, int flags, struct stat *st);
+
+ssize_t raw_image__read_sync(struct disk_image *disk, u64 sector,
+			     const struct iovec *iov, int iovcount, void *param);
+ssize_t raw_image__write_sync(struct disk_image *disk, u64 sector,
+			      const struct iovec *iov, int iovcount, void *param);
+ssize_t raw_image__read_mmap(struct disk_image *disk, u64 sector,
+				const struct iovec *iov, int iovcount, void *param);
+ssize_t raw_image__write_mmap(struct disk_image *disk, u64 sector,
+				const struct iovec *iov, int iovcount, void *param);
+int raw_image__close(struct disk_image *disk);
+void disk_image__set_callback(struct disk_image *disk, void (*disk_req_cb)(void *param, long len));
+
+#ifdef CONFIG_HAS_AIO
+int disk_aio_setup(struct disk_image *disk);
+void disk_aio_destroy(struct disk_image *disk);
+ssize_t raw_image__read_async(struct disk_image *disk, u64 sector,
+			      const struct iovec *iov, int iovcount, void *param);
+ssize_t raw_image__write_async(struct disk_image *disk, u64 sector,
+			       const struct iovec *iov, int iovcount, void *param);
+int raw_image__wait(struct disk_image *disk);
+
+#define raw_image__read		raw_image__read_async
+#define raw_image__write	raw_image__write_async
+
+#else /* !CONFIG_HAS_AIO */
+static inline int disk_aio_setup(struct disk_image *disk)
+{
+	/* No-op */
+	return 0;
+}
+static inline void disk_aio_destroy(struct disk_image *disk)
+{
+}
+
+static inline int raw_image__wait(struct disk_image *disk)
+{
+	return 0;
+}
+#define raw_image__read		raw_image__read_sync
+#define raw_image__write	raw_image__write_sync
+#endif /* CONFIG_HAS_AIO */
+
+#endif /* KVM__DISK_IMAGE_H */
diff --git a/kvmtool/include/kvm/fdt.h b/kvmtool/include/kvm/fdt.h
new file mode 100644
index 0000000..4e61572
--- /dev/null
+++ b/kvmtool/include/kvm/fdt.h
@@ -0,0 +1,40 @@
+#ifndef KVM__FDT_H
+#define KVM__FDT_H
+
+#ifdef CONFIG_HAS_LIBFDT
+#include <libfdt.h>
+#endif
+
+#include <linux/types.h>
+
+#include "kvm/fdt-arch.h"
+
+#define FDT_MAX_SIZE	0x10000
+
+/* These definitions are generic FDT values for specifying IRQ
+ * types and are used in the Linux kernel internally as well as in
+ * the dts files and their documentation.
+ */
+enum irq_type {
+	IRQ_TYPE_NONE		= 0x00000000,
+	IRQ_TYPE_EDGE_RISING	= 0x00000001,
+	IRQ_TYPE_EDGE_FALLING	= 0x00000002,
+	IRQ_TYPE_EDGE_BOTH	= (IRQ_TYPE_EDGE_FALLING | IRQ_TYPE_EDGE_RISING),
+	IRQ_TYPE_LEVEL_HIGH	= 0x00000004,
+	IRQ_TYPE_LEVEL_LOW	= 0x00000008,
+	IRQ_TYPE_LEVEL_MASK	= (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH),
+};
+
+extern char *fdt_stdout_path;
+
+/* Helper for the various bits of code that generate FDT nodes */
+#define _FDT(exp)							\
+	do {								\
+		int ret = (exp);					\
+		if (ret < 0) {						\
+			die("Error creating device tree: %s: %s\n",	\
+			    #exp, fdt_strerror(ret));			\
+		}							\
+	} while (0)
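+/* Example: _FDT(fdt_begin_node(fdt, "rtc")) calls die() on any libfdt error */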
+
+#endif /* KVM__FDT_H */
diff --git a/kvmtool/include/kvm/framebuffer.h b/kvmtool/include/kvm/framebuffer.h
new file mode 100644
index 0000000..e3200e5
--- /dev/null
+++ b/kvmtool/include/kvm/framebuffer.h
@@ -0,0 +1,36 @@
+#ifndef KVM__FRAMEBUFFER_H
+#define KVM__FRAMEBUFFER_H
+
+#include <linux/types.h>
+#include <linux/list.h>
+
+struct framebuffer;
+
+struct fb_target_operations {
+	int (*start)(struct framebuffer *fb);
+	int (*stop)(struct framebuffer *fb);
+};
+
+#define FB_MAX_TARGETS			2
+
+struct framebuffer {
+	struct list_head		node;
+
+	u32				width;
+	u32				height;
+	u8				depth;
+	char				*mem;
+	u64				mem_addr;
+	u64				mem_size;
+	struct kvm			*kvm;
+
+	unsigned long			nr_targets;
+	struct fb_target_operations	*targets[FB_MAX_TARGETS];
+};
+
+struct framebuffer *fb__register(struct framebuffer *fb);
+int fb__attach(struct framebuffer *fb, struct fb_target_operations *ops);
+int fb__init(struct kvm *kvm);
+int fb__exit(struct kvm *kvm);
+
+#endif /* KVM__FRAMEBUFFER_H */
diff --git a/kvmtool/include/kvm/gtk3.h b/kvmtool/include/kvm/gtk3.h
new file mode 100644
index 0000000..b02dc13
--- /dev/null
+++ b/kvmtool/include/kvm/gtk3.h
@@ -0,0 +1,28 @@
+#ifndef KVM__GTK3_H
+#define KVM__GTK3_H
+
+#include "kvm/util.h"
+
+struct framebuffer;
+
+#ifdef CONFIG_HAS_GTK3
+int kvm_gtk_init(struct kvm *kvm);
+int kvm_gtk_exit(struct kvm *kvm);
+#else
+static inline int kvm_gtk_init(struct kvm *kvm)
+{
+	if (kvm->cfg.gtk)
+		die("GTK3 support not compiled in. (install the gtk3-devel or libgtk3.0-dev package)");
+
+	return 0;
+}
+static inline int kvm_gtk_exit(struct kvm *kvm)
+{
+	if (kvm->cfg.gtk)
+		die("GTK3 support not compiled in. (install the gtk3-devel or libgtk3.0-dev package)");
+
+	return 0;
+}
+#endif
+
+#endif /* KVM__GTK3_H */
diff --git a/kvmtool/include/kvm/guest_compat.h b/kvmtool/include/kvm/guest_compat.h
new file mode 100644
index 0000000..ae7abbd
--- /dev/null
+++ b/kvmtool/include/kvm/guest_compat.h
@@ -0,0 +1,9 @@
+#ifndef KVM__GUEST_COMPAT_H
+#define KVM__GUEST_COMPAT_H
+
+int compat__print_all_messages(void);
+int compat__remove_message(int id);
+int compat__add_message(const char *title, const char *description);
+
+#endif
\ No newline at end of file
diff --git a/kvmtool/include/kvm/i8042.h b/kvmtool/include/kvm/i8042.h
new file mode 100644
index 0000000..3b4ab68
--- /dev/null
+++ b/kvmtool/include/kvm/i8042.h
@@ -0,0 +1,12 @@
+#ifndef KVM__PCKBD_H
+#define KVM__PCKBD_H
+
+#include <linux/types.h>
+
+struct kvm;
+
+void mouse_queue(u8 c);
+void kbd_queue(u8 c);
+int kbd__init(struct kvm *kvm);
+
+#endif
diff --git a/kvmtool/include/kvm/ioeventfd.h b/kvmtool/include/kvm/ioeventfd.h
new file mode 100644
index 0000000..a1cb841
--- /dev/null
+++ b/kvmtool/include/kvm/ioeventfd.h
@@ -0,0 +1,32 @@
+#ifndef KVM__IOEVENTFD_H
+#define KVM__IOEVENTFD_H
+
+#include <linux/types.h>
+#include <linux/list.h>
+#include <sys/eventfd.h>
+#include "kvm/util.h"
+
+struct kvm;
+
+struct ioevent {
+	u64			io_addr;
+	u8			io_len;
+	void			(*fn)(struct kvm *kvm, void *ptr);
+	struct kvm		*fn_kvm;
+	void			*fn_ptr;
+	int			fd;
+	u64			datamatch;
+	u32			flags;
+
+	struct list_head	list;
+};
+
+#define IOEVENTFD_FLAG_PIO		(1 << 0)
+#define IOEVENTFD_FLAG_USER_POLL	(1 << 1)
+
+int ioeventfd__init(struct kvm *kvm);
+int ioeventfd__exit(struct kvm *kvm);
+int ioeventfd__add_event(struct ioevent *ioevent, int flags);
+int ioeventfd__del_event(u64 addr, u64 datamatch);
+
+#endif
diff --git a/kvmtool/include/kvm/ioport.h b/kvmtool/include/kvm/ioport.h
new file mode 100644
index 0000000..039633f
--- /dev/null
+++ b/kvmtool/include/kvm/ioport.h
@@ -0,0 +1,77 @@
+#ifndef KVM__IOPORT_H
+#define KVM__IOPORT_H
+
+#include "kvm/devices.h"
+#include "kvm/kvm-cpu.h"
+#include "kvm/rbtree-interval.h"
+#include "kvm/fdt.h"
+
+#include <stdbool.h>
+#include <limits.h>
+#include <asm/types.h>
+#include <linux/types.h>
+#include <linux/byteorder.h>
+
+/* some ports we reserve for our own use */
+#define IOPORT_DBG			0xe0
+
+struct kvm;
+
+struct ioport {
+	struct rb_int_node		node;
+	struct ioport_operations	*ops;
+	void				*priv;
+	struct device_header		dev_hdr;
+	u32				refcount;
+	bool				remove;
+};
+
+struct ioport_operations {
+	bool (*io_in)(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port, void *data, int size);
+	bool (*io_out)(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port, void *data, int size);
+	void (*generate_fdt_node)(struct ioport *ioport, void *fdt,
+				  void (*generate_irq_prop)(void *fdt,
+							    u8 irq,
+							    enum irq_type));
+};
+
+int ioport__setup_arch(struct kvm *kvm);
+void ioport__map_irq(u8 *irq);
+
+int __must_check ioport__register(struct kvm *kvm, u16 port, struct ioport_operations *ops,
+				  int count, void *param);
+int ioport__unregister(struct kvm *kvm, u16 port);
+int ioport__init(struct kvm *kvm);
+int ioport__exit(struct kvm *kvm);
+
+static inline u8 ioport__read8(u8 *data)
+{
+	return *data;
+}
+/* On BE platforms, PCI I/O is byteswapped, i.e. LE, so swap back. */
+static inline u16 ioport__read16(u16 *data)
+{
+	return le16_to_cpu(*data);
+}
+
+static inline u32 ioport__read32(u32 *data)
+{
+	return le32_to_cpu(*data);
+}
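+/*
+ * Example: a guest "outw" value arrives little-endian in *data;
+ * ioport__read16() converts it to host byte order (a no-op on LE hosts).
+ */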
+
+static inline void ioport__write8(u8 *data, u8 value)
+{
+	*data		 = value;
+}
+
+static inline void ioport__write16(u16 *data, u16 value)
+{
+	*data		 = cpu_to_le16(value);
+}
+
+static inline void ioport__write32(u32 *data, u32 value)
+{
+	*data		 = cpu_to_le32(value);
+}
+
+#endif /* KVM__IOPORT_H */
diff --git a/kvmtool/include/kvm/iovec.h b/kvmtool/include/kvm/iovec.h
new file mode 100644
index 0000000..fe79dd4
--- /dev/null
+++ b/kvmtool/include/kvm/iovec.h
@@ -0,0 +1,21 @@
+#ifndef KVM_UTIL_IOVEC_H_
+#define KVM_UTIL_IOVEC_H_
+
+extern int memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len);
+extern int memcpy_fromiovecend(unsigned char *kdata, const struct iovec *iov,
+				size_t offset, int len);
+extern int memcpy_toiovec(struct iovec *v, unsigned char *kdata, int len);
+extern int memcpy_toiovecend(const struct iovec *v, unsigned char *kdata,
+				size_t offset, int len);
+
+static inline size_t iov_size(const struct iovec *iovec, size_t len)
+{
+	size_t size = 0, i;
+
+	for (i = 0; i < len; i++)
+		size += iovec[i].iov_len;
+
+	return size;
+}
+
+#endif
diff --git a/kvmtool/include/kvm/irq.h b/kvmtool/include/kvm/irq.h
new file mode 100644
index 0000000..2a3f8c9
--- /dev/null
+++ b/kvmtool/include/kvm/irq.h
@@ -0,0 +1,55 @@
+#ifndef KVM__IRQ_H
+#define KVM__IRQ_H
+
+#include <stdbool.h>
+#include <linux/types.h>
+#include <linux/rbtree.h>
+#include <linux/list.h>
+#include <linux/kvm.h>
+
+#include "kvm/kvm-arch.h"
+#include "kvm/msi.h"
+
+struct kvm;
+
+struct msi_routing_ops {
+	int (*update_route)(struct kvm *kvm, struct kvm_irq_routing_entry *);
+	bool (*can_signal_msi)(struct kvm *kvm);
+	int (*signal_msi)(struct kvm *kvm, struct kvm_msi *msi);
+	int (*translate_gsi)(struct kvm *kvm, u32 gsi);
+};
+
+extern struct msi_routing_ops *msi_routing_ops;
+extern struct kvm_irq_routing *irq_routing;
+extern int next_gsi;
+
+int irq__alloc_line(void);
+int irq__get_nr_allocated_lines(void);
+
+int irq__init(struct kvm *kvm);
+int irq__exit(struct kvm *kvm);
+
+int irq__allocate_routing_entry(void);
+int irq__add_msix_route(struct kvm *kvm, struct msi_msg *msg, u32 device_id);
+void irq__update_msix_route(struct kvm *kvm, u32 gsi, struct msi_msg *msg);
+
+bool irq__can_signal_msi(struct kvm *kvm);
+int irq__signal_msi(struct kvm *kvm, struct kvm_msi *msi);
+
+/*
+ * The function takes two eventfd arguments, trigger_fd and resample_fd. If
+ * resample_fd is <= 0, resampling is disabled and the IRQ is edge-triggered
+ */
+int irq__common_add_irqfd(struct kvm *kvm, unsigned int gsi, int trigger_fd,
+			   int resample_fd);
+void irq__common_del_irqfd(struct kvm *kvm, unsigned int gsi, int trigger_fd);
+
+#ifndef irq__add_irqfd
+#define irq__add_irqfd irq__common_add_irqfd
+#endif
+
+#ifndef irq__del_irqfd
+#define irq__del_irqfd irq__common_del_irqfd
+#endif
+
+#endif
diff --git a/kvmtool/include/kvm/kvm-cmd.h b/kvmtool/include/kvm/kvm-cmd.h
new file mode 100644
index 0000000..0a73bce
--- /dev/null
+++ b/kvmtool/include/kvm/kvm-cmd.h
@@ -0,0 +1,17 @@
+#ifndef __KVM_CMD_H__
+#define __KVM_CMD_H__
+
+struct cmd_struct {
+	const char *cmd;
+	int (*fn)(int, const char **, const char *);
+	void (*help)(void);
+	int option;
+};
+
+extern struct cmd_struct kvm_commands[];
+struct cmd_struct *kvm_get_command(struct cmd_struct *command,
+				   const char *cmd);
+
+int handle_command(struct cmd_struct *command, int argc, const char **argv);
+
+#endif
diff --git a/kvmtool/include/kvm/kvm-config.h b/kvmtool/include/kvm/kvm-config.h
new file mode 100644
index 0000000..f4a8b83
--- /dev/null
+++ b/kvmtool/include/kvm/kvm-config.h
@@ -0,0 +1,66 @@
+#ifndef KVM_CONFIG_H_
+#define KVM_CONFIG_H_
+
+#include "kvm/disk-image.h"
+#include "kvm/vfio.h"
+#include "kvm/kvm-config-arch.h"
+
+#define DEFAULT_KVM_DEV		"/dev/kvm"
+#define DEFAULT_CONSOLE		"serial"
+#define DEFAULT_NETWORK		"user"
+#define DEFAULT_HOST_ADDR	"192.168.33.1"
+#define DEFAULT_GUEST_ADDR	"192.168.33.15"
+#define DEFAULT_GUEST_MAC	"02:15:15:15:15:15"
+#define DEFAULT_HOST_MAC	"02:01:01:01:01:01"
+#define DEFAULT_SCRIPT		"none"
+#define DEFAULT_SANDBOX_FILENAME "guest/sandbox.sh"
+
+#define MIN_RAM_SIZE_MB		(64ULL)
+#define MIN_RAM_SIZE_BYTE	(MIN_RAM_SIZE_MB << MB_SHIFT)
+
+struct kvm_config {
+	struct kvm_config_arch arch;
+	struct disk_image_params disk_image[MAX_DISK_IMAGES];
+	struct vfio_device_params *vfio_devices;
+	u64 ram_size;
+	u8  image_count;
+	u8 num_net_devices;
+	u8 num_vfio_devices;
+	bool virtio_rng;
+	int active_console;
+	int debug_iodelay;
+	int nrcpus;
+	const char *kernel_cmdline;
+	const char *kernel_filename;
+	const char *vmlinux_filename;
+	const char *initrd_filename;
+	const char *firmware_filename;
+	const char *flash_filename;
+	const char *console;
+	const char *dev;
+	const char *network;
+	const char *host_ip;
+	const char *guest_ip;
+	const char *guest_mac;
+	const char *host_mac;
+	const char *script;
+	const char *guest_name;
+	const char *sandbox;
+	const char *hugetlbfs_path;
+	const char *custom_rootfs_name;
+	const char *real_cmdline;
+	struct virtio_net_params *net_params;
+	bool single_step;
+	bool vnc;
+	bool gtk;
+	bool sdl;
+	bool balloon;
+	bool using_rootfs;
+	bool custom_rootfs;
+	bool no_net;
+	bool no_dhcp;
+	bool ioport_debug;
+	bool mmio_debug;
+};
+
+#endif
diff --git a/kvmtool/include/kvm/kvm-cpu.h b/kvmtool/include/kvm/kvm-cpu.h
new file mode 100644
index 0000000..0f16f8d
--- /dev/null
+++ b/kvmtool/include/kvm/kvm-cpu.h
@@ -0,0 +1,32 @@
+#ifndef KVM__KVM_CPU_H
+#define KVM__KVM_CPU_H
+
+#include "kvm/kvm-cpu-arch.h"
+#include <stdbool.h>
+
+struct kvm_cpu_task {
+	void (*func)(struct kvm_cpu *vcpu, void *data);
+	void *data;
+};
+
+int kvm_cpu__init(struct kvm *kvm);
+int kvm_cpu__exit(struct kvm *kvm);
+struct kvm_cpu *kvm_cpu__arch_init(struct kvm *kvm, unsigned long cpu_id);
+void kvm_cpu__delete(struct kvm_cpu *vcpu);
+void kvm_cpu__reset_vcpu(struct kvm_cpu *vcpu);
+void kvm_cpu__setup_cpuid(struct kvm_cpu *vcpu);
+void kvm_cpu__enable_singlestep(struct kvm_cpu *vcpu);
+void kvm_cpu__run(struct kvm_cpu *vcpu);
+int kvm_cpu__start(struct kvm_cpu *cpu);
+bool kvm_cpu__handle_exit(struct kvm_cpu *vcpu);
+int kvm_cpu__get_endianness(struct kvm_cpu *vcpu);
+
+int kvm_cpu__get_debug_fd(void);
+void kvm_cpu__set_debug_fd(int fd);
+void kvm_cpu__show_code(struct kvm_cpu *vcpu);
+void kvm_cpu__show_registers(struct kvm_cpu *vcpu);
+void kvm_cpu__show_page_tables(struct kvm_cpu *vcpu);
+void kvm_cpu__arch_nmi(struct kvm_cpu *cpu);
+void kvm_cpu__run_on_all_cpus(struct kvm *kvm, struct kvm_cpu_task *task);
+
+#endif /* KVM__KVM_CPU_H */
diff --git a/kvmtool/include/kvm/kvm-ipc.h b/kvmtool/include/kvm/kvm-ipc.h
new file mode 100644
index 0000000..5494da4
--- /dev/null
+++ b/kvmtool/include/kvm/kvm-ipc.h
@@ -0,0 +1,26 @@
+#ifndef KVM__IPC_H_
+#define KVM__IPC_H_
+
+#include <linux/types.h>
+#include "kvm/kvm.h"
+
+enum {
+	KVM_IPC_BALLOON	= 1,
+	KVM_IPC_DEBUG	= 2,
+	KVM_IPC_STAT	= 3,
+	KVM_IPC_PAUSE	= 4,
+	KVM_IPC_RESUME	= 5,
+	KVM_IPC_STOP	= 6,
+	KVM_IPC_PID	= 7,
+	KVM_IPC_VMSTATE	= 8,
+};
+
+int kvm_ipc__register_handler(u32 type, void (*cb)(struct kvm *kvm,
+				int fd, u32 type, u32 len, u8 *msg));
+int kvm_ipc__init(struct kvm *kvm);
+int kvm_ipc__exit(struct kvm *kvm);
+
+int kvm_ipc__send(int fd, u32 type);
+int kvm_ipc__send_msg(int fd, u32 type, u32 len, u8 *msg);
+
+#endif
diff --git a/kvmtool/include/kvm/kvm.h b/kvmtool/include/kvm/kvm.h
new file mode 100644
index 0000000..53373b0
--- /dev/null
+++ b/kvmtool/include/kvm/kvm.h
@@ -0,0 +1,205 @@
+#ifndef KVM__KVM_H
+#define KVM__KVM_H
+
+#include "kvm/mutex.h"
+#include "kvm/kvm-arch.h"
+#include "kvm/kvm-config.h"
+#include "kvm/util-init.h"
+
+#include <stdbool.h>
+#include <linux/types.h>
+#include <linux/compiler.h>
+#include <time.h>
+#include <signal.h>
+#include <sys/prctl.h>
+#include <limits.h>
+
+#define SIGKVMEXIT		(SIGRTMIN + 0)
+#define SIGKVMPAUSE		(SIGRTMIN + 1)
+#define SIGKVMTASK		(SIGRTMIN + 2)
+
+#define KVM_PID_FILE_PATH	"/.lkvm/"
+#define HOME_DIR		getenv("HOME")
+#define KVM_BINARY_NAME		"lkvm"
+
+#ifndef PAGE_SIZE
+#define PAGE_SIZE (sysconf(_SC_PAGE_SIZE))
+#endif
+
+#define DEFINE_KVM_EXT(ext)		\
+	.name = #ext,			\
+	.code = ext
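+/*
+ * e.g. { DEFINE_KVM_EXT(KVM_CAP_IRQCHIP) } expands to
+ * { .name = "KVM_CAP_IRQCHIP", .code = KVM_CAP_IRQCHIP }
+ */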
+
+enum {
+	KVM_VMSTATE_RUNNING,
+	KVM_VMSTATE_PAUSED,
+};
+
+enum kvm_mem_type {
+	KVM_MEM_TYPE_RAM	= 1 << 0,
+	KVM_MEM_TYPE_DEVICE	= 1 << 1,
+	KVM_MEM_TYPE_RESERVED	= 1 << 2,
+	KVM_MEM_TYPE_READONLY	= 1 << 3,
+
+	KVM_MEM_TYPE_ALL	= KVM_MEM_TYPE_RAM
+				| KVM_MEM_TYPE_DEVICE
+				| KVM_MEM_TYPE_RESERVED
+				| KVM_MEM_TYPE_READONLY
+};
+
+struct kvm_ext {
+	const char *name;
+	int code;
+};
+
+struct kvm_mem_bank {
+	struct list_head	list;
+	u64			guest_phys_addr;
+	void			*host_addr;
+	u64			size;
+	enum kvm_mem_type	type;
+	u32			slot;
+};
+
+struct kvm {
+	struct kvm_arch		arch;
+	struct kvm_config	cfg;
+	int			sys_fd;		/* For system ioctls(), i.e. /dev/kvm */
+	int			vm_fd;		/* For VM ioctls() */
+	timer_t			timerid;	/* Posix timer for interrupts */
+
+	int			nrcpus;		/* Number of cpus to run */
+	struct kvm_cpu		**cpus;
+
+	u32			mem_slots;	/* for KVM_SET_USER_MEMORY_REGION */
+	u64			ram_size;
+	void			*ram_start;
+	u64			ram_pagesize;
+	struct mutex		mem_banks_lock;
+	struct list_head	mem_banks;
+
+	bool			nmi_disabled;
+	bool			msix_needs_devid;
+
+	const char		*vmlinux;
+	struct disk_image       **disks;
+	int                     nr_disks;
+
+	int			vm_state;
+
+#ifdef KVM_BRLOCK_DEBUG
+	pthread_rwlock_t	brlock_sem;
+#endif
+};
+
+void kvm__set_dir(const char *fmt, ...);
+const char *kvm__get_dir(void);
+
+int kvm__init(struct kvm *kvm);
+struct kvm *kvm__new(void);
+int kvm__recommended_cpus(struct kvm *kvm);
+int kvm__max_cpus(struct kvm *kvm);
+void kvm__init_ram(struct kvm *kvm);
+int kvm__exit(struct kvm *kvm);
+bool kvm__load_firmware(struct kvm *kvm, const char *firmware_filename);
+bool kvm__load_kernel(struct kvm *kvm, const char *kernel_filename,
+			const char *initrd_filename, const char *kernel_cmdline);
+int kvm_timer__init(struct kvm *kvm);
+int kvm_timer__exit(struct kvm *kvm);
+void kvm__irq_line(struct kvm *kvm, int irq, int level);
+void kvm__irq_trigger(struct kvm *kvm, int irq);
+bool kvm__emulate_io(struct kvm_cpu *vcpu, u16 port, void *data, int direction, int size, u32 count);
+bool kvm__emulate_mmio(struct kvm_cpu *vcpu, u64 phys_addr, u8 *data, u32 len, u8 is_write);
+int kvm__destroy_mem(struct kvm *kvm, u64 guest_phys, u64 size, void *userspace_addr);
+int kvm__register_mem(struct kvm *kvm, u64 guest_phys, u64 size, void *userspace_addr,
+		      enum kvm_mem_type type);
+static inline int kvm__register_ram(struct kvm *kvm, u64 guest_phys, u64 size,
+				    void *userspace_addr)
+{
+	return kvm__register_mem(kvm, guest_phys, size, userspace_addr,
+				 KVM_MEM_TYPE_RAM);
+}
+
+static inline int kvm__register_dev_mem(struct kvm *kvm, u64 guest_phys,
+					u64 size, void *userspace_addr)
+{
+	return kvm__register_mem(kvm, guest_phys, size, userspace_addr,
+				 KVM_MEM_TYPE_DEVICE);
+}
+
+static inline int kvm__reserve_mem(struct kvm *kvm, u64 guest_phys, u64 size)
+{
+	return kvm__register_mem(kvm, guest_phys, size, NULL,
+				 KVM_MEM_TYPE_RESERVED);
+}
+
+int __must_check kvm__register_mmio(struct kvm *kvm, u64 phys_addr, u64 phys_addr_len, bool coalesce,
+				    void (*mmio_fn)(struct kvm_cpu *vcpu, u64 addr, u8 *data, u32 len, u8 is_write, void *ptr),
+				    void *ptr);
+bool kvm__deregister_mmio(struct kvm *kvm, u64 phys_addr);
+void kvm__reboot(struct kvm *kvm);
+void kvm__pause(struct kvm *kvm);
+void kvm__continue(struct kvm *kvm);
+void kvm__notify_paused(void);
+int kvm__get_sock_by_instance(const char *name);
+int kvm__enumerate_instances(int (*callback)(const char *name, int pid));
+void kvm__remove_socket(const char *name);
+
+void kvm__arch_set_cmdline(char *cmdline, bool video);
+void kvm__arch_init(struct kvm *kvm, const char *hugetlbfs_path, u64 ram_size);
+void kvm__arch_delete_ram(struct kvm *kvm);
+int kvm__arch_setup_firmware(struct kvm *kvm);
+int kvm__arch_free_firmware(struct kvm *kvm);
+bool kvm__arch_cpu_supports_vm(void);
+void kvm__arch_read_term(struct kvm *kvm);
+
+void *guest_flat_to_host(struct kvm *kvm, u64 offset);
+u64 host_to_guest_flat(struct kvm *kvm, void *ptr);
+
+bool kvm__arch_load_kernel_image(struct kvm *kvm, int fd_kernel, int fd_initrd,
+				 const char *kernel_cmdline);
+
+#define add_read_only(type, str)					\
+	(((type) & KVM_MEM_TYPE_READONLY) ? str " (read-only)" : str)
+static inline const char *kvm_mem_type_to_string(enum kvm_mem_type type)
+{
+	switch (type & ~KVM_MEM_TYPE_READONLY) {
+	case KVM_MEM_TYPE_ALL:
+		return "(all)";
+	case KVM_MEM_TYPE_RAM:
+		return add_read_only(type, "RAM");
+	case KVM_MEM_TYPE_DEVICE:
+		return add_read_only(type, "device");
+	case KVM_MEM_TYPE_RESERVED:
+		return add_read_only(type, "reserved");
+	}
+
+	return "???";
+}
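+/* e.g. KVM_MEM_TYPE_RAM | KVM_MEM_TYPE_READONLY maps to "RAM (read-only)" */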
+
+int kvm__for_each_mem_bank(struct kvm *kvm, enum kvm_mem_type type,
+			   int (*fun)(struct kvm *kvm, struct kvm_mem_bank *bank, void *data),
+			   void *data);
+
+/*
+ * Debugging
+ */
+void kvm__dump_mem(struct kvm *kvm, unsigned long addr, unsigned long size, int debug_fd);
+
+extern const char *kvm_exit_reasons[];
+
+static inline bool host_ptr_in_ram(struct kvm *kvm, void *p)
+{
+	return kvm->ram_start <= p && p < (kvm->ram_start + kvm->ram_size);
+}
+
+bool kvm__supports_extension(struct kvm *kvm, unsigned int extension);
+bool kvm__supports_vm_extension(struct kvm *kvm, unsigned int extension);
+
+static inline void kvm__set_thread_name(const char *name)
+{
+	prctl(PR_SET_NAME, name);
+}
+
+#endif /* KVM__KVM_H */
diff --git a/kvmtool/include/kvm/msi.h b/kvmtool/include/kvm/msi.h
new file mode 100644
index 0000000..885eb5b
--- /dev/null
+++ b/kvmtool/include/kvm/msi.h
@@ -0,0 +1,10 @@
+#ifndef LKVM_MSI_H
+#define LKVM_MSI_H
+
+struct msi_msg {
+	u32	address_lo;	/* low 32 bits of msi message address */
+	u32	address_hi;	/* high 32 bits of msi message address */
+	u32	data;		/* 16 bits of msi message data */
+};
+
+#endif /* LKVM_MSI_H */
diff --git a/kvmtool/include/kvm/mutex.h b/kvmtool/include/kvm/mutex.h
new file mode 100644
index 0000000..1f7d0f6
--- /dev/null
+++ b/kvmtool/include/kvm/mutex.h
@@ -0,0 +1,39 @@
+#ifndef KVM__MUTEX_H
+#define KVM__MUTEX_H
+
+#include <pthread.h>
+
+#include "kvm/util.h"
+
+/*
+ * Kernel-alike mutex API - to make it easier for kernel developers
+ * to write user-space code! :-)
+ */
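+/*
+ * Example: DEFINE_MUTEX(queue_lock); then bracket the critical section
+ * with mutex_lock(&queue_lock) and mutex_unlock(&queue_lock).
+ */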
+
+struct mutex {
+	pthread_mutex_t mutex;
+};
+#define MUTEX_INITIALIZER { .mutex = PTHREAD_MUTEX_INITIALIZER }
+
+#define DEFINE_MUTEX(mtx) struct mutex mtx = MUTEX_INITIALIZER
+
+static inline void mutex_init(struct mutex *lock)
+{
+	if (pthread_mutex_init(&lock->mutex, NULL) != 0)
+		die("unexpected pthread_mutex_init() failure!");
+}
+
+static inline void mutex_lock(struct mutex *lock)
+{
+	if (pthread_mutex_lock(&lock->mutex) != 0)
+		die("unexpected pthread_mutex_lock() failure!");
+
+}
+
+static inline void mutex_unlock(struct mutex *lock)
+{
+	if (pthread_mutex_unlock(&lock->mutex) != 0)
+		die("unexpected pthread_mutex_unlock() failure!");
+}
+
+#endif /* KVM__MUTEX_H */
diff --git a/kvmtool/include/kvm/of_pci.h b/kvmtool/include/kvm/of_pci.h
new file mode 100644
index 0000000..c8187ab
--- /dev/null
+++ b/kvmtool/include/kvm/of_pci.h
@@ -0,0 +1,44 @@
+#ifndef KVM__OF_PCI_H
+#define KVM__OF_PCI_H
+
+#include <linux/types.h>
+
+/*
+ * Definitions for implementing parts of the OpenFirmware PCI Bus Binding
+ * Specification (IEEE Std 1275-1994).
+ */
+
+struct of_pci_unit_address {
+	u32 hi, mid, lo;
+} __attribute__((packed));
+
+struct of_pci_irq_mask {
+	struct of_pci_unit_address	pci_addr;
+	u32				pci_pin;
+} __attribute__((packed));
+
+struct of_pci_ranges_entry {
+	struct of_pci_unit_address	pci_addr;
+	u64				cpu_addr;
+	u64				length;
+} __attribute__((packed));
+
+/* Macros to operate with address in OF binding to PCI */
+#define __b_x(x, p, l)		(((x) & ((1<<(l))-1)) << (p))
+#define of_pci_b_n(x)		__b_x((x), 31, 1)	/* 0 if relocatable */
+#define of_pci_b_p(x)		__b_x((x), 30, 1)	/* 1 if prefetchable */
+#define of_pci_b_t(x)		__b_x((x), 29, 1)	/* 1 if the address is aliased */
+#define of_pci_b_ss(x)		__b_x((x), 24, 2)	/* the space code */
+#define of_pci_b_bbbbbbbb(x)	__b_x((x), 16, 8)	/* bus number */
+#define of_pci_b_ddddd(x)	__b_x((x), 11, 5)	/* device number */
+#define of_pci_b_fff(x)		__b_x((x), 8, 3)	/* function number */
+#define of_pci_b_rrrrrrrr(x)	__b_x((x), 0, 8)	/* register number */
+
+#define OF_PCI_SS_CONFIG	0
+#define OF_PCI_SS_IO		1
+#define OF_PCI_SS_M32		2
+#define OF_PCI_SS_M64		3
+
+#define OF_PCI_IRQ_MAP_MAX	256	/* 5 bit device + 3 bit pin */
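+
+/*
+ * Example (sketch): the phys.hi cell of a hypothetical ranges entry
+ * describing relocatable 32-bit memory space for device 2, function 0
+ * on bus 0:
+ *
+ *	u32 hi = of_pci_b_ss(OF_PCI_SS_M32) | of_pci_b_ddddd(2);
+ *
+ * which encodes to 0x02001000.
+ */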
+
+#endif /* KVM__OF_PCI_H */
diff --git a/kvmtool/include/kvm/parse-options.h b/kvmtool/include/kvm/parse-options.h
new file mode 100644
index 0000000..b03f151
--- /dev/null
+++ b/kvmtool/include/kvm/parse-options.h
@@ -0,0 +1,230 @@
+#ifndef __PARSE_OPTIONS_H__
+#define __PARSE_OPTIONS_H__
+
+#include <inttypes.h>
+#include <kvm/util.h>
+
+enum parse_opt_type {
+	/* special types */
+	OPTION_END,
+	OPTION_ARGUMENT,
+	OPTION_GROUP,
+	/* options with no arguments */
+	OPTION_BIT,
+	OPTION_BOOLEAN,
+	OPTION_INCR,
+	OPTION_SET_UINT,
+	OPTION_SET_PTR,
+	/* options with arguments (usually) */
+	OPTION_STRING,
+	OPTION_INTEGER,
+	OPTION_LONG,
+	OPTION_CALLBACK,
+	OPTION_U64,
+	OPTION_UINTEGER,
+};
+
+enum parse_opt_flags {
+	PARSE_OPT_KEEP_DASHDASH = 1,
+	PARSE_OPT_STOP_AT_NON_OPTION = 2,
+	PARSE_OPT_KEEP_ARGV0 = 4,
+	PARSE_OPT_KEEP_UNKNOWN = 8,
+	PARSE_OPT_NO_INTERNAL_HELP = 16,
+};
+
+enum parse_opt_option_flags {
+	PARSE_OPT_OPTARG  = 1,
+	PARSE_OPT_NOARG   = 2,
+	PARSE_OPT_NONEG   = 4,
+	PARSE_OPT_HIDDEN  = 8,
+	PARSE_OPT_LASTARG_DEFAULT = 16,
+};
+
+struct option;
+typedef int parse_opt_cb(const struct option *, const char *arg, int unset);
+/*
+ * `type`::
+ *   holds the type of the option; you must have an OPTION_END last in your
+ *   array.
+ *
+ * `short_name`::
+ *   the character to use as a short option name, '\0' if none.
+ *
+ * `long_name`::
+ *   the long option name, without the leading dashes, NULL if none.
+ *
+ * `value`::
+ *   stores pointers to the values to be filled.
+ *
+ * `argh`::
+ *   token to explain the kind of argument this option wants. Keep it
+ *   homogeneous across the repository.
+ *
+ * `help`::
+ *   the short help associated to what the option does.
+ *   Must never be NULL (except for OPTION_END).
+ *   OPTION_GROUP uses this pointer to store the group header.
+ *
+ * `flags`::
+ *   mask of parse_opt_option_flags.
+ *   PARSE_OPT_OPTARG: says that the argument is optional (not for BOOLEANs)
+ *   PARSE_OPT_NOARG: says that this option takes no argument, for CALLBACKs
+ *   PARSE_OPT_NONEG: says that this option cannot be negated
+ *   PARSE_OPT_HIDDEN: this option is skipped in the default usage and shown
+ *                    only in the long one.
+ *
+ * `callback`::
+ *   pointer to the callback to use for OPTION_CALLBACK.
+ *
+ * `defval`::
+ *   default value to fill (*->value) with for PARSE_OPT_OPTARG.
+ *   OPTION_{BIT,SET_UINT,SET_PTR} store the {mask,integer,pointer} to put in
+ *   the value when met.
+ *   CALLBACKs can use it however they want.
+ */
+struct option {
+	enum parse_opt_type type;
+	int short_name;
+	const char *long_name;
+	void *value;
+	const char *argh;
+	const char *help;
+	void *ptr;
+
+	int flags;
+	parse_opt_cb *callback;
+	intptr_t defval;
+};
+
+#define BUILD_BUG_ON_ZERO(e) (sizeof(struct { int:-!!(e); }))
+#define check_vtype(v, type) \
+	(BUILD_BUG_ON_ZERO(!__builtin_types_compatible_p(typeof(v), type)) + v)
+
+#define OPT_INTEGER(s, l, v, h)             \
+{                                           \
+	.type = OPTION_INTEGER,             \
+	.short_name = (s),                  \
+	.long_name = (l),                   \
+	.value = check_vtype(v, int *),     \
+	.help = (h)                         \
+}
+
+#define OPT_UINTEGER(s, l, v, h)            \
+{                                           \
+	.type = OPTION_UINTEGER,            \
+	.short_name = (s),                  \
+	.long_name = (l),                   \
+	.value = check_vtype(v, unsigned int *), \
+	.help = (h)                         \
+}
+
+#define OPT_U64(s, l, v, h)                 \
+{                                           \
+	.type = OPTION_U64,                 \
+	.short_name = (s),                  \
+	.long_name = (l),                   \
+	.value = check_vtype(v, u64 *),     \
+	.help = (h)                         \
+}
+
+#define OPT_STRING(s, l, v, a, h)           \
+{                                           \
+	.type = OPTION_STRING,              \
+	.short_name = (s),                  \
+	.long_name = (l),                   \
+	.value = check_vtype(v, const char **), (a), \
+	.help = (h)                         \
+}
+
+#define OPT_BOOLEAN(s, l, v, h)             \
+{                                           \
+	.type = OPTION_BOOLEAN,             \
+	.short_name = (s),                  \
+	.long_name = (l),                   \
+	.value = check_vtype(v, bool *),    \
+	.help = (h)                         \
+}
+
+#define OPT_INCR(s, l, v, h)                \
+{                                           \
+	.type = OPTION_INCR,	            \
+	.short_name = (s),                  \
+	.long_name = (l),                   \
+	.value = check_vtype(v, int *),     \
+	.help = (h)                         \
+}
+
+#define OPT_GROUP(h)                        \
+{                                           \
+	.type = OPTION_GROUP,               \
+	.help = (h)                         \
+}
+
+#define OPT_CALLBACK(s, l, v, a, h, f, p)   \
+{					    \
+	.type = OPTION_CALLBACK,	    \
+	.short_name = (s),		    \
+	.long_name = (l),		    \
+	.value = (v),			    \
+	(a),				    \
+	.help = (h),			    \
+	.callback = (f),		    \
+	.ptr = (p),			    \
+}
+
+#define OPT_CALLBACK_NOOPT(s, l, v, a, h, f, p) \
+{					    \
+	.type = OPTION_CALLBACK,	    \
+	.short_name = (s),		    \
+	.long_name = (l),		    \
+	.value = (v),			    \
+	(a),				    \
+	.help = (h),			    \
+	.callback = (f),		    \
+	.flags = PARSE_OPT_NOARG,	    \
+	.ptr = (p),			    \
+}
+
+#define OPT_CALLBACK_DEFAULT(s, l, v, a, h, f, d, p) \
+{					    \
+	.type = OPTION_CALLBACK,	    \
+	.short_name = (s),		    \
+	.long_name = (l),		    \
+	.value = (v), (a),		    \
+	.help = (h),			    \
+	.callback = (f),		    \
+	.defval = (intptr_t)d,		    \
+	.flags = PARSE_OPT_LASTARG_DEFAULT, \
+	.ptr = (p)			    \
+}
+
+#define OPT_END() { .type = OPTION_END }
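+
+/*
+ * Example (sketch): a minimal, hypothetical option table; the array
+ * must be terminated with OPT_END():
+ *
+ *	static int nrcpus;
+ *	static const char *name;
+ *
+ *	struct option options[] = {
+ *		OPT_INTEGER('c', "cpus", &nrcpus, "Number of CPUs"),
+ *		OPT_STRING('n', "name", &name, "name", "Guest name"),
+ *		OPT_END()
+ *	};
+ *
+ *	argc = parse_options(argc, argv, options, usage, 0);
+ *
+ * where 'usage' is a NULL-terminated array of usage strings.
+ */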
+
+#define OPT_ARCH(cmd, cfg)		    \
+	OPT_ARCH_##cmd(OPT_GROUP("Arch-specific options:"), &(cfg)->arch)
+
+enum {
+	PARSE_OPT_HELP = -1,
+	PARSE_OPT_DONE,
+	PARSE_OPT_UNKNOWN,
+};
+
+/*
+ * It's okay for the caller to consume argv/argc in the usual way.
+ * Other fields of that structure are private to parse-options and should not
+ * be modified in any way.
+ */
+struct parse_opt_ctx_t {
+	const char **argv;
+	const char **out;
+	int argc, cpidx;
+	const char *opt;
+	int flags;
+};
+
+/* global functions */
+void usage_with_options(const char * const *usagestr,
+		const struct option *opts) NORETURN;
+int parse_options(int argc, const char **argv, const struct option *options,
+		const char * const usagestr[], int flags);
+#endif
diff --git a/kvmtool/include/kvm/pci.h b/kvmtool/include/kvm/pci.h
new file mode 100644
index 0000000..bf81323
--- /dev/null
+++ b/kvmtool/include/kvm/pci.h
@@ -0,0 +1,243 @@
+#ifndef KVM__PCI_H
+#define KVM__PCI_H
+
+#include <linux/types.h>
+#include <linux/kvm.h>
+#include <linux/pci_regs.h>
+#include <endian.h>
+#include <stdbool.h>
+
+#include "kvm/devices.h"
+#include "kvm/msi.h"
+#include "kvm/fdt.h"
+
+#define pci_dev_err(pci_hdr, fmt, ...) \
+	pr_err("[%04x:%04x] " fmt, pci_hdr->vendor_id, pci_hdr->device_id, ##__VA_ARGS__)
+#define pci_dev_warn(pci_hdr, fmt, ...) \
+	pr_warning("[%04x:%04x] " fmt, pci_hdr->vendor_id, pci_hdr->device_id, ##__VA_ARGS__)
+#define pci_dev_info(pci_hdr, fmt, ...) \
+	pr_info("[%04x:%04x] " fmt, pci_hdr->vendor_id, pci_hdr->device_id, ##__VA_ARGS__)
+#define pci_dev_dbg(pci_hdr, fmt, ...) \
+	pr_debug("[%04x:%04x] " fmt, pci_hdr->vendor_id, pci_hdr->device_id, ##__VA_ARGS__)
+#define pci_dev_die(pci_hdr, fmt, ...) \
+	die("[%04x:%04x] " fmt, pci_hdr->vendor_id, pci_hdr->device_id, ##__VA_ARGS__)
+
+/*
+ * PCI Configuration Mechanism #1 I/O ports. See Section 3.7.4.1.
+ * ("Configuration Mechanism #1") of the PCI Local Bus Specification 2.1 for
+ * details.
+ */
+#define PCI_CONFIG_ADDRESS	0xcf8
+#define PCI_CONFIG_DATA		0xcfc
+#define PCI_CONFIG_BUS_FORWARD	0xcfa
+#define PCI_IO_SIZE		0x100
+#define PCI_IOPORT_START	0x6200
+#define PCI_CFG_SIZE		(1ULL << 24)
+
+struct kvm;
+
+union pci_config_address {
+	struct {
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+		unsigned	reg_offset	: 2;		/* 1  .. 0  */
+		unsigned	register_number	: 6;		/* 7  .. 2  */
+		unsigned	function_number	: 3;		/* 10 .. 8  */
+		unsigned	device_number	: 5;		/* 15 .. 11 */
+		unsigned	bus_number	: 8;		/* 23 .. 16 */
+		unsigned	reserved	: 7;		/* 30 .. 24 */
+		unsigned	enable_bit	: 1;		/* 31       */
+#else
+		unsigned	enable_bit	: 1;		/* 31       */
+		unsigned	reserved	: 7;		/* 30 .. 24 */
+		unsigned	bus_number	: 8;		/* 23 .. 16 */
+		unsigned	device_number	: 5;		/* 15 .. 11 */
+		unsigned	function_number	: 3;		/* 10 .. 8  */
+		unsigned	register_number	: 6;		/* 7  .. 2  */
+		unsigned	reg_offset	: 2;		/* 1  .. 0  */
+#endif
+	};
+	u32 w;
+};
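+
+/*
+ * Example (sketch): a hypothetical Mechanism #1 access to config
+ * register 0x04 (the command register) of bus 0, device 2, function 0
+ * builds the address word as:
+ *
+ *	union pci_config_address addr = {
+ *		.enable_bit	 = 1,
+ *		.device_number	 = 2,
+ *		.register_number = 0x04 >> 2,
+ *	};
+ *
+ * and writes addr.w to PCI_CONFIG_ADDRESS before accessing
+ * PCI_CONFIG_DATA.
+ */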
+
+struct msix_table {
+	struct msi_msg msg;
+	u32 ctrl;
+};
+
+struct msix_cap {
+	u8 cap;
+	u8 next;
+	u16 ctrl;
+	u32 table_offset;
+	u32 pba_offset;
+};
+
+struct msi_cap_64 {
+	u8 cap;
+	u8 next;
+	u16 ctrl;
+	u32 address_lo;
+	u32 address_hi;
+	u16 data;
+	u16 _align;
+	u32 mask_bits;
+	u32 pend_bits;
+};
+
+struct msi_cap_32 {
+	u8 cap;
+	u8 next;
+	u16 ctrl;
+	u32 address_lo;
+	u16 data;
+	u16 _align;
+	u32 mask_bits;
+	u32 pend_bits;
+};
+
+struct pci_cap_hdr {
+	u8	type;
+	u8	next;
+};
+
+struct pci_device_header;
+
+typedef int (*bar_activate_fn_t)(struct kvm *kvm,
+				 struct pci_device_header *pci_hdr,
+				 int bar_num, void *data);
+typedef int (*bar_deactivate_fn_t)(struct kvm *kvm,
+				   struct pci_device_header *pci_hdr,
+				   int bar_num, void *data);
+
+#define PCI_BAR_OFFSET(b)	(offsetof(struct pci_device_header, bar[b]))
+#define PCI_DEV_CFG_SIZE	256
+#define PCI_DEV_CFG_MASK	(PCI_DEV_CFG_SIZE - 1)
+
+struct pci_config_operations {
+	void (*write)(struct kvm *kvm, struct pci_device_header *pci_hdr,
+		      u8 offset, void *data, int sz);
+	void (*read)(struct kvm *kvm, struct pci_device_header *pci_hdr,
+		     u8 offset, void *data, int sz);
+};
+
+struct pci_device_header {
+	/* Configuration space, as seen by the guest */
+	union {
+		struct {
+			u16		vendor_id;
+			u16		device_id;
+			u16		command;
+			u16		status;
+			u8		revision_id;
+			u8		class[3];
+			u8		cacheline_size;
+			u8		latency_timer;
+			u8		header_type;
+			u8		bist;
+			u32		bar[6];
+			u32		card_bus;
+			u16		subsys_vendor_id;
+			u16		subsys_id;
+			u32		exp_rom_bar;
+			u8		capabilities;
+			u8		reserved1[3];
+			u32		reserved2;
+			u8		irq_line;
+			u8		irq_pin;
+			u8		min_gnt;
+			u8		max_lat;
+			struct msix_cap msix;
+		} __attribute__((packed));
+		/* Pad to PCI config space size */
+		u8	__pad[PCI_DEV_CFG_SIZE];
+	};
+
+	/* Private to lkvm */
+	u32			bar_size[6];
+	bool			bar_active[6];
+	bar_activate_fn_t	bar_activate_fn;
+	bar_deactivate_fn_t	bar_deactivate_fn;
+	void *data;
+	struct pci_config_operations	cfg_ops;
+	/*
+	 * PCI INTx# lines are level-triggered, but virtual devices often
+	 * feature edge-triggered INTx# for convenience.
+	 */
+	enum irq_type	irq_type;
+};
+
+#define PCI_CAP(pci_hdr, pos) ((void *)(pci_hdr) + (pos))
+
+#define pci_for_each_cap(pos, cap, hdr)				\
+	for ((pos) = (hdr)->capabilities & ~3;			\
+	     (cap) = PCI_CAP(hdr, pos), (pos) != 0;		\
+	     (pos) = ((struct pci_cap_hdr *)(cap))->next & ~3)
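+
+/*
+ * Example (sketch): walking the capability list by hand to find a
+ * capability by type, which is essentially what pci_find_cap() below
+ * does:
+ *
+ *	u8 pos;
+ *	struct pci_cap_hdr *cap;
+ *
+ *	pci_for_each_cap(pos, cap, pci_hdr)
+ *		if (cap->type == PCI_CAP_ID_MSIX)
+ *			break;
+ */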
+
+int pci__init(struct kvm *kvm);
+int pci__exit(struct kvm *kvm);
+struct pci_device_header *pci__find_dev(u8 dev_num);
+u32 pci_get_mmio_block(u32 size);
+u16 pci_get_io_port_block(u32 size);
+int pci__assign_irq(struct pci_device_header *pci_hdr);
+void pci__config_wr(struct kvm *kvm, union pci_config_address addr, void *data, int size);
+void pci__config_rd(struct kvm *kvm, union pci_config_address addr, void *data, int size);
+
+void *pci_find_cap(struct pci_device_header *hdr, u8 cap_type);
+
+int pci__register_bar_regions(struct kvm *kvm, struct pci_device_header *pci_hdr,
+			      bar_activate_fn_t bar_activate_fn,
+			      bar_deactivate_fn_t bar_deactivate_fn, void *data);
+
+static inline bool __pci__memory_space_enabled(u16 command)
+{
+	return command & PCI_COMMAND_MEMORY;
+}
+
+static inline bool pci__memory_space_enabled(struct pci_device_header *pci_hdr)
+{
+	return __pci__memory_space_enabled(pci_hdr->command);
+}
+
+static inline bool __pci__io_space_enabled(u16 command)
+{
+	return command & PCI_COMMAND_IO;
+}
+
+static inline bool pci__io_space_enabled(struct pci_device_header *pci_hdr)
+{
+	return __pci__io_space_enabled(pci_hdr->command);
+}
+
+static inline bool __pci__bar_is_io(u32 bar)
+{
+	return bar & PCI_BASE_ADDRESS_SPACE_IO;
+}
+
+static inline bool pci__bar_is_io(struct pci_device_header *pci_hdr, int bar_num)
+{
+	return __pci__bar_is_io(pci_hdr->bar[bar_num]);
+}
+
+static inline bool pci__bar_is_memory(struct pci_device_header *pci_hdr, int bar_num)
+{
+	return !pci__bar_is_io(pci_hdr, bar_num);
+}
+
+static inline u32 __pci__bar_address(u32 bar)
+{
+	if (__pci__bar_is_io(bar))
+		return bar & PCI_BASE_ADDRESS_IO_MASK;
+	return bar & PCI_BASE_ADDRESS_MEM_MASK;
+}
+
+static inline u32 pci__bar_address(struct pci_device_header *pci_hdr, int bar_num)
+{
+	return __pci__bar_address(pci_hdr->bar[bar_num]);
+}
+
+static inline u32 pci__bar_size(struct pci_device_header *pci_hdr, int bar_num)
+{
+	return pci_hdr->bar_size[bar_num];
+}
+
+#endif /* KVM__PCI_H */
diff --git a/kvmtool/include/kvm/qcow.h b/kvmtool/include/kvm/qcow.h
new file mode 100644
index 0000000..f849246
--- /dev/null
+++ b/kvmtool/include/kvm/qcow.h
@@ -0,0 +1,133 @@
+#ifndef KVM__QCOW_H
+#define KVM__QCOW_H
+
+#include "kvm/mutex.h"
+
+#include <linux/types.h>
+#include <stdbool.h>
+#include <linux/rbtree.h>
+#include <linux/list.h>
+
+#define QCOW_MAGIC		(('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb)
+
+#define QCOW1_VERSION		1
+#define QCOW2_VERSION		2
+
+#define QCOW1_OFLAG_COMPRESSED	(1ULL << 63)
+
+#define QCOW2_OFLAG_COPIED	(1ULL << 63)
+#define QCOW2_OFLAG_COMPRESSED	(1ULL << 62)
+
+#define QCOW2_OFLAGS_MASK	(QCOW2_OFLAG_COPIED|QCOW2_OFLAG_COMPRESSED)
+
+#define QCOW2_OFFSET_MASK	(~QCOW2_OFLAGS_MASK)
+
+#define MAX_CACHE_NODES         32
+
+struct qcow_l2_table {
+	u64				offset;
+	struct rb_node			node;
+	struct list_head		list;
+	u8				dirty;
+	u64				table[];
+};
+
+struct qcow_l1_table {
+	u32				table_size;
+	u64				*l1_table;
+
+	/* Level2 caching data structures */
+	struct rb_root			root;
+	struct list_head		lru_list;
+	int				nr_cached;
+};
+
+#define QCOW_REFCOUNT_BLOCK_SHIFT	1
+
+struct qcow_refcount_block {
+	u64				offset;
+	struct rb_node			node;
+	struct list_head		list;
+	u64				size;
+	u8				dirty;
+	u16				entries[];
+};
+
+struct qcow_refcount_table {
+	u32				rf_size;
+	u64				*rf_table;
+
+	/* Refcount block caching data structures */
+	struct rb_root			root;
+	struct list_head		lru_list;
+	int				nr_cached;
+};
+
+struct qcow_header {
+	u64				size;	/* in bytes */
+	u64				l1_table_offset;
+	u32				l1_size;
+	u8				cluster_bits;
+	u8				l2_bits;
+	u64				refcount_table_offset;
+	u32				refcount_table_size;
+};
+
+struct qcow {
+	struct mutex			mutex;
+	struct qcow_header		*header;
+	struct qcow_l1_table		table;
+	struct qcow_refcount_table	refcount_table;
+	int				fd;
+	int				csize_shift;
+	int				csize_mask;
+	u32				version;
+	u64				cluster_size;
+	u64				cluster_offset_mask;
+	u64				free_clust_idx;
+	void				*cluster_cache;
+	void				*cluster_data;
+	void				*copy_buff;
+};
+
+struct qcow1_header_disk {
+	u32				magic;
+	u32				version;
+
+	u64				backing_file_offset;
+	u32 				backing_file_size;
+	u32				mtime;
+
+	u64				size;	/* in bytes */
+
+	u8				cluster_bits;
+	u8				l2_bits;
+	u32				crypt_method;
+
+	u64				l1_table_offset;
+};
+
+struct qcow2_header_disk {
+	u32				magic;
+	u32				version;
+
+	u64				backing_file_offset;
+	u32				backing_file_size;
+
+	u32				cluster_bits;
+	u64				size;	/* in bytes */
+	u32				crypt_method;
+
+	u32				l1_size;
+	u64				l1_table_offset;
+
+	u64				refcount_table_offset;
+	u32				refcount_table_clusters;
+
+	u32				nb_snapshots;
+	u64				snapshots_offset;
+};
+
+struct disk_image *qcow_probe(int fd, bool readonly);
+
+#endif /* KVM__QCOW_H */
diff --git a/kvmtool/include/kvm/rbtree-interval.h b/kvmtool/include/kvm/rbtree-interval.h
new file mode 100644
index 0000000..17cd3b5
--- /dev/null
+++ b/kvmtool/include/kvm/rbtree-interval.h
@@ -0,0 +1,32 @@
+#ifndef KVM__INTERVAL_RBTREE_H
+#define KVM__INTERVAL_RBTREE_H
+
+#include <linux/rbtree.h>
+#include <linux/types.h>
+
+#define RB_INT_INIT(l, h) \
+	(struct rb_int_node){.low = l, .high = h}
+#define rb_int(n)	rb_entry(n, struct rb_int_node, node)
+#define rb_int_start(n)	((n)->low)
+#define rb_int_end(n)	((n)->low + (n)->high - 1)
+
+struct rb_int_node {
+	struct rb_node	node;
+	u64		low;
+	u64		high;
+};
+
+/* Return the rb_int_node interval in which 'point' is located. */
+struct rb_int_node *rb_int_search_single(struct rb_root *root, u64 point);
+
+/* Return the rb_int_node containing the range [low, high]. */
+struct rb_int_node *rb_int_search_range(struct rb_root *root, u64 low, u64 high);
+
+int rb_int_insert(struct rb_root *root, struct rb_int_node *data);
+
+static inline void rb_int_erase(struct rb_root *root, struct rb_int_node *node)
+{
+	rb_erase(&node->node, root);
+}
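+
+/*
+ * Example (sketch): as implied by rb_int_end() above, 'high' is treated
+ * as the size of the interval, so a 4K range based at 0xd0000000 would
+ * be initialized as RB_INT_INIT(0xd0000000, 0x1000), added with
+ * rb_int_insert() and later found with
+ * rb_int_search_single(&root, 0xd0000800).
+ */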
+
+#endif
diff --git a/kvmtool/include/kvm/read-write.h b/kvmtool/include/kvm/read-write.h
new file mode 100644
index 0000000..8375d7c
--- /dev/null
+++ b/kvmtool/include/kvm/read-write.h
@@ -0,0 +1,34 @@
+#ifndef KVM_READ_WRITE_H
+#define KVM_READ_WRITE_H
+
+#include <sys/types.h>
+#include <sys/uio.h>
+#include <unistd.h>
+
+ssize_t xread(int fd, void *buf, size_t count);
+ssize_t xwrite(int fd, const void *buf, size_t count);
+
+ssize_t read_file(int fd, char *buf, size_t max_size);
+
+ssize_t read_in_full(int fd, void *buf, size_t count);
+ssize_t write_in_full(int fd, const void *buf, size_t count);
+
+ssize_t xpread(int fd, void *buf, size_t count, off_t offset);
+ssize_t xpwrite(int fd, const void *buf, size_t count, off_t offset);
+
+ssize_t pread_in_full(int fd, void *buf, size_t count, off_t offset);
+ssize_t pwrite_in_full(int fd, const void *buf, size_t count, off_t offset);
+
+ssize_t xreadv(int fd, const struct iovec *iov, int iovcnt);
+ssize_t xwritev(int fd, const struct iovec *iov, int iovcnt);
+
+ssize_t readv_in_full(int fd, const struct iovec *iov, int iovcnt);
+ssize_t writev_in_full(int fd, const struct iovec *iov, int iovcnt);
+
+ssize_t xpreadv(int fd, const struct iovec *iov, int iovcnt, off_t offset);
+ssize_t xpwritev(int fd, const struct iovec *iov, int iovcnt, off_t offset);
+
+ssize_t preadv_in_full(int fd, const struct iovec *iov, int iovcnt, off_t offset);
+ssize_t pwritev_in_full(int fd, const struct iovec *iov, int iovcnt, off_t offset);
+
+#endif /* KVM_READ_WRITE_H */
diff --git a/kvmtool/include/kvm/rtc.h b/kvmtool/include/kvm/rtc.h
new file mode 100644
index 0000000..6aa9299
--- /dev/null
+++ b/kvmtool/include/kvm/rtc.h
@@ -0,0 +1,9 @@
+#ifndef KVM__RTC_H
+#define KVM__RTC_H
+
+struct kvm;
+
+int rtc__init(struct kvm *kvm);
+int rtc__exit(struct kvm *kvm);
+
+#endif /* KVM__RTC_H */
diff --git a/kvmtool/include/kvm/rwsem.h b/kvmtool/include/kvm/rwsem.h
new file mode 100644
index 0000000..75a22f8
--- /dev/null
+++ b/kvmtool/include/kvm/rwsem.h
@@ -0,0 +1,39 @@
+#ifndef KVM__RWSEM_H
+#define KVM__RWSEM_H
+
+#include <pthread.h>
+
+#include "kvm/util.h"
+
+/*
+ * Kernel-alike rwsem API - to make it easier for kernel developers
+ * to write user-space code! :-)
+ */
+
+#define DECLARE_RWSEM(sem) pthread_rwlock_t sem = PTHREAD_RWLOCK_INITIALIZER
+
+static inline void down_read(pthread_rwlock_t *rwsem)
+{
+	if (pthread_rwlock_rdlock(rwsem) != 0)
+		die("unexpected pthread_rwlock_rdlock() failure!");
+}
+
+static inline void down_write(pthread_rwlock_t *rwsem)
+{
+	if (pthread_rwlock_wrlock(rwsem) != 0)
+		die("unexpected pthread_rwlock_wrlock() failure!");
+}
+
+static inline void up_read(pthread_rwlock_t *rwsem)
+{
+	if (pthread_rwlock_unlock(rwsem) != 0)
+		die("unexpected pthread_rwlock_unlock() failure!");
+}
+
+static inline void up_write(pthread_rwlock_t *rwsem)
+{
+	if (pthread_rwlock_unlock(rwsem) != 0)
+		die("unexpected pthread_rwlock_unlock() failure!");
+}
+
+#endif /* KVM__RWSEM_H */
diff --git a/kvmtool/include/kvm/sdl.h b/kvmtool/include/kvm/sdl.h
new file mode 100644
index 0000000..2f0c213
--- /dev/null
+++ b/kvmtool/include/kvm/sdl.h
@@ -0,0 +1,28 @@
+#ifndef KVM__SDL_H
+#define KVM__SDL_H
+
+#include "kvm/util.h"
+
+struct framebuffer;
+
+#ifdef CONFIG_HAS_SDL
+int sdl__init(struct kvm *kvm);
+int sdl__exit(struct kvm *kvm);
+#else
+static inline int sdl__init(struct kvm *kvm)
+{
+	if (kvm->cfg.sdl)
+		die("SDL support not compiled in. (install the SDL-dev[el] package)");
+
+	return 0;
+}
+
+static inline int sdl__exit(struct kvm *kvm)
+{
+	if (kvm->cfg.sdl)
+		die("SDL support not compiled in. (install the SDL-dev[el] package)");
+
+	return 0;
+}
+#endif
+
+#endif /* KVM__SDL_H */
diff --git a/kvmtool/include/kvm/strbuf.h b/kvmtool/include/kvm/strbuf.h
new file mode 100644
index 0000000..7657339
--- /dev/null
+++ b/kvmtool/include/kvm/strbuf.h
@@ -0,0 +1,22 @@
+#ifndef __STRBUF_H__
+#define __STRBUF_H__
+
+#include <sys/types.h>
+#include <string.h>
+
+int prefixcmp(const char *str, const char *prefix);
+
+#ifndef HAVE_STRLCPY
+extern size_t strlcat(char *dest, const char *src, size_t count);
+extern size_t strlcpy(char *dest, const char *src, size_t size);
+#endif
+
+/* some inline functions */
+
+static inline const char *skip_prefix(const char *str, const char *prefix)
+{
+	size_t len = strlen(prefix);
+	return strncmp(str, prefix, len) ? NULL : str + len;
+}
+
+#endif
diff --git a/kvmtool/include/kvm/symbol.h b/kvmtool/include/kvm/symbol.h
new file mode 100644
index 0000000..725bbaf
--- /dev/null
+++ b/kvmtool/include/kvm/symbol.h
@@ -0,0 +1,30 @@
+#ifndef KVM__SYMBOL_H
+#define KVM__SYMBOL_H
+
+#include <stddef.h>
+#include <string.h>
+
+struct kvm;
+
+#define SYMBOL_DEFAULT_UNKNOWN "<unknown>"
+
+#ifdef CONFIG_HAS_BFD
+
+int symbol_init(struct kvm *kvm);
+int symbol_exit(struct kvm *kvm);
+char *symbol_lookup(struct kvm *kvm, unsigned long addr, char *sym, size_t size);
+
+#else
+
+static inline int symbol_init(struct kvm *kvm) { return 0; }
+static inline char *symbol_lookup(struct kvm *kvm, unsigned long addr, char *sym, size_t size)
+{
+	char *s = strncpy(sym, SYMBOL_DEFAULT_UNKNOWN, size);
+	sym[size - 1] = '\0';
+	return s;
+}
+static inline int symbol_exit(struct kvm *kvm) { return 0; }
+
+#endif
+
+#endif /* KVM__SYMBOL_H */
diff --git a/kvmtool/include/kvm/term.h b/kvmtool/include/kvm/term.h
new file mode 100644
index 0000000..b9793f0
--- /dev/null
+++ b/kvmtool/include/kvm/term.h
@@ -0,0 +1,23 @@
+#ifndef KVM__TERM_H
+#define KVM__TERM_H
+
+#include "kvm/kvm.h"
+
+#include <sys/uio.h>
+#include <stdbool.h>
+
+#define CONSOLE_8250	1
+#define CONSOLE_VIRTIO	2
+#define CONSOLE_HV	3
+
+#define TERM_MAX_DEVS	4
+
+int term_putc_iov(struct iovec *iov, int iovcnt, int term);
+int term_getc_iov(struct kvm *kvm, struct iovec *iov, int iovcnt, int term);
+int term_putc(char *addr, int cnt, int term);
+int term_getc(struct kvm *kvm, int term);
+
+bool term_readable(int term);
+int tty_parser(const struct option *opt, const char *arg, int unset);
+
+#endif /* KVM__TERM_H */
diff --git a/kvmtool/include/kvm/threadpool.h b/kvmtool/include/kvm/threadpool.h
new file mode 100644
index 0000000..880487a
--- /dev/null
+++ b/kvmtool/include/kvm/threadpool.h
@@ -0,0 +1,40 @@
+#ifndef KVM__THREADPOOL_H
+#define KVM__THREADPOOL_H
+
+#include "kvm/mutex.h"
+
+#include <linux/list.h>
+
+struct kvm;
+
+typedef void (*kvm_thread_callback_fn_t)(struct kvm *kvm, void *data);
+
+struct thread_pool__job {
+	kvm_thread_callback_fn_t	callback;
+	struct kvm			*kvm;
+	void				*data;
+
+	int				signalcount;
+	struct mutex			mutex;
+
+	struct list_head		queue;
+};
+
+static inline void thread_pool__init_job(struct thread_pool__job *job, struct kvm *kvm, kvm_thread_callback_fn_t callback, void *data)
+{
+	*job = (struct thread_pool__job) {
+		.kvm		= kvm,
+		.callback	= callback,
+		.data		= data,
+		.mutex		= MUTEX_INITIALIZER,
+	};
+	INIT_LIST_HEAD(&job->queue);
+}
+
+int thread_pool__init(struct kvm *kvm);
+int thread_pool__exit(struct kvm *kvm);
+
+void thread_pool__do_job(struct thread_pool__job *job);
+void thread_pool__cancel_job(struct thread_pool__job *job);
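+
+/*
+ * Typical usage (sketch, hypothetical device): initialize the job once,
+ * then queue it each time work arrives, e.g. on a virtqueue kick:
+ *
+ *	static void my_dev_work(struct kvm *kvm, void *data)
+ *	{
+ *		... process pending requests ...
+ *	}
+ *
+ *	thread_pool__init_job(&dev->job, kvm, my_dev_work, dev);
+ *	...
+ *	thread_pool__do_job(&dev->job);
+ *
+ * thread_pool__do_job() queues the job to run on one of the pool
+ * threads.
+ */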
+
+#endif
diff --git a/kvmtool/include/kvm/uip.h b/kvmtool/include/kvm/uip.h
new file mode 100644
index 0000000..efa508a
--- /dev/null
+++ b/kvmtool/include/kvm/uip.h
@@ -0,0 +1,368 @@
+#ifndef KVM__UIP_H
+#define KVM__UIP_H
+
+#include "linux/types.h"
+#include "kvm/mutex.h"
+
+#include <netinet/in.h>
+#include <sys/uio.h>
+
+#define UIP_BUF_STATUS_FREE	0
+#define UIP_BUF_STATUS_INUSE	1
+#define UIP_BUF_STATUS_USED	2
+
+#define UIP_ETH_P_IP		0x0800
+#define UIP_ETH_P_ARP		0x0806
+
+#define UIP_IP_VER_4		0x40
+#define UIP_IP_HDR_LEN		0x05
+#define UIP_IP_TTL		0x40
+#define UIP_IP_P_UDP		0x11
+#define UIP_IP_P_TCP		0x06
+#define UIP_IP_P_ICMP		0x01
+
+#define UIP_TCP_HDR_LEN		0x50
+#define UIP_TCP_WIN_SIZE	14600
+#define UIP_TCP_FLAG_FIN	1
+#define UIP_TCP_FLAG_SYN	2
+#define UIP_TCP_FLAG_RST	4
+#define UIP_TCP_FLAG_PSH	8
+#define UIP_TCP_FLAG_ACK	16
+#define UIP_TCP_FLAG_URG	32
+
+#define UIP_BOOTP_VENDOR_SPECIFIC_LEN	64
+#define UIP_BOOTP_MAX_PAYLOAD_LEN	300
+#define UIP_DHCP_VENDOR_SPECIFIC_LEN	312
+#define UIP_DHCP_PORT_SERVER		67
+#define UIP_DHCP_PORT_CLIENT		68
+#define UIP_DHCP_MACPAD_LEN		10
+#define UIP_DHCP_HOSTNAME_LEN		64
+#define UIP_DHCP_FILENAME_LEN		128
+#define UIP_DHCP_MAGIC_COOKIE		0x63825363
+#define UIP_DHCP_MAGIC_COOKIE_LEN	4
+#define UIP_DHCP_LEASE_TIME		0x00003840
+#define UIP_DHCP_MAX_PAYLOAD_LEN	(UIP_BOOTP_MAX_PAYLOAD_LEN - UIP_BOOTP_VENDOR_SPECIFIC_LEN +  UIP_DHCP_VENDOR_SPECIFIC_LEN)
+#define UIP_DHCP_OPTION_LEN		(UIP_DHCP_VENDOR_SPECIFIC_LEN - UIP_DHCP_MAGIC_COOKIE_LEN)
+#define UIP_DHCP_DISCOVER		1
+#define UIP_DHCP_OFFER			2
+#define UIP_DHCP_REQUEST		3
+#define UIP_DHCP_ACK			5
+#define UIP_DHCP_MAX_DNS_SERVER_NR	3
+#define UIP_DHCP_MAX_DOMAIN_NAME_LEN	256
+#define UIP_DHCP_TAG_MSG_TYPE		53
+#define UIP_DHCP_TAG_MSG_TYPE_LEN	1
+#define UIP_DHCP_TAG_SERVER_ID		54
+#define UIP_DHCP_TAG_SERVER_ID_LEN	4
+#define UIP_DHCP_TAG_LEASE_TIME		51
+#define UIP_DHCP_TAG_LEASE_TIME_LEN	4
+#define UIP_DHCP_TAG_SUBMASK		1
+#define UIP_DHCP_TAG_SUBMASK_LEN	4
+#define UIP_DHCP_TAG_ROUTER		3
+#define UIP_DHCP_TAG_ROUTER_LEN		4
+#define UIP_DHCP_TAG_ROOT		17
+#define UIP_DHCP_TAG_ROOT_LEN		4
+#define UIP_DHCP_TAG_DNS_SERVER		6
+#define UIP_DHCP_TAG_DNS_SERVER_LEN	4
+#define UIP_DHCP_TAG_DOMAIN_NAME	15
+#define UIP_DHCP_TAG_END		255
+
+/*
+ * IP packet maximum len == 64 KBytes
+ * IP header == 20 Bytes
+ * TCP header == 20 Bytes
+ * UDP header == 8 Bytes
+ */
+#define UIP_MAX_TCP_PAYLOAD	(64*1024 - 20 - 20 - 1)
+#define UIP_MAX_UDP_PAYLOAD	(64*1024 - 20 -  8 - 1)
+
+struct uip_eth_addr {
+	u8 addr[6];
+};
+
+struct uip_eth {
+	struct uip_eth_addr dst;
+	struct uip_eth_addr src;
+	u16 type;
+} __attribute__((packed));
+
+struct uip_arp {
+	struct uip_eth eth;
+	u16 hwtype;
+	u16 proto;
+	u8 hwlen;
+	u8 protolen;
+	u16 op;
+	struct uip_eth_addr smac;
+	u32 sip;
+	struct uip_eth_addr dmac;
+	u32 dip;
+} __attribute__((packed));
+
+struct uip_ip {
+	struct uip_eth eth;
+	u8 vhl;
+	u8 tos;
+	/*
+	 * len = IP hdr +  IP payload
+	 */
+	u16 len;
+	u16 id;
+	u16 flgfrag;
+	u8 ttl;
+	u8 proto;
+	u16 csum;
+	u32 sip;
+	u32 dip;
+} __attribute__((packed));
+
+struct uip_icmp {
+	struct uip_ip ip;
+	u8 type;
+	u8 code;
+	u16 csum;
+	u16 id;
+	u16 seq;
+} __attribute__((packed));
+
+struct uip_udp {
+	/*
+	 * FIXME: IP Options (IP hdr len > 20 bytes) are not supported
+	 */
+	struct uip_ip ip;
+	u16 sport;
+	u16 dport;
+	/*
+	 * len = UDP hdr +  UDP payload
+	 */
+	u16 len;
+	u16 csum;
+	u8 payload[0];
+} __attribute__((packed));
+
+struct uip_tcp {
+	/*
+	 * FIXME: IP Options (IP hdr len > 20 bytes) are not supported
+	 */
+	struct uip_ip ip;
+	u16 sport;
+	u16 dport;
+	u32 seq;
+	u32 ack;
+	u8  off;
+	u8  flg;
+	u16 win;
+	u16 csum;
+	u16 urgent;
+} __attribute__((packed));
+
+struct uip_pseudo_hdr {
+	u32 sip;
+	u32 dip;
+	u8 zero;
+	u8 proto;
+	u16 len;
+} __attribute__((packed));
+
+struct uip_dhcp {
+	struct uip_udp udp;
+	u8 msg_type;
+	u8 hardware_type;
+	u8 hardware_len;
+	u8 hops;
+	u32 id;
+	u16 time;
+	u16 flg;
+	u32 client_ip;
+	u32 your_ip;
+	u32 server_ip;
+	u32 agent_ip;
+	struct uip_eth_addr client_mac;
+	u8 pad[UIP_DHCP_MACPAD_LEN];
+	u8 server_hostname[UIP_DHCP_HOSTNAME_LEN];
+	u8 boot_filename[UIP_DHCP_FILENAME_LEN];
+	u32 magic_cookie;
+	u8 option[UIP_DHCP_OPTION_LEN];
+} __attribute__((packed));
+
+struct uip_info {
+	struct list_head udp_socket_head;
+	struct list_head tcp_socket_head;
+	struct mutex udp_socket_lock;
+	struct mutex tcp_socket_lock;
+	struct uip_eth_addr guest_mac;
+	struct uip_eth_addr host_mac;
+	pthread_cond_t buf_free_cond;
+	pthread_cond_t buf_used_cond;
+	struct list_head buf_head;
+	struct mutex buf_lock;
+	pthread_t udp_thread;
+	u8 *udp_buf;
+	int udp_epollfd;
+	int buf_free_nr;
+	int buf_used_nr;
+	u32 guest_ip;
+	u32 guest_netmask;
+	u32 host_ip;
+	u32 dns_ip[UIP_DHCP_MAX_DNS_SERVER_NR];
+	char *domain_name;
+	u32 buf_nr;
+	u32 vnet_hdr_len;
+};
+
+struct uip_buf {
+	struct list_head list;
+	struct uip_info *info;
+	int vnet_len;
+	int eth_len;
+	int status;
+	unsigned char *vnet;
+	unsigned char *eth;
+	int id;
+};
+
+struct uip_udp_socket {
+	struct sockaddr_in addr;
+	struct list_head list;
+	struct mutex *lock;
+	u32 dport, sport;
+	u32 dip, sip;
+	int fd;
+};
+
+struct uip_tcp_socket {
+	struct sockaddr_in addr;
+	struct list_head list;
+	struct uip_info *info;
+	pthread_cond_t	cond;
+	struct mutex *lock;
+	pthread_t thread;
+	u32 dport, sport;
+	u32 guest_acked;
+	u16 window_size;
+	/*
+	 * Initial Sequence Number
+	 */
+	u32 isn_server;
+	u32 isn_guest;
+	u32 ack_server;
+	u32 seq_server;
+	int write_done;
+	int read_done;
+	u32 dip, sip;
+	u8 *payload;
+	u8 *buf;
+	int fd;
+};
+
+struct uip_tx_arg {
+	void *vnet;
+	struct uip_info *info;
+	struct uip_eth *eth;
+	int vnet_len;
+	int eth_len;
+};
+
+static inline u16 uip_ip_hdrlen(struct uip_ip *ip)
+{
+	return (ip->vhl & 0x0f) * 4;
+}
+
+static inline u16 uip_ip_len(struct uip_ip *ip)
+{
+	return ntohs(ip->len);
+}
+
+static inline u16 uip_udp_hdrlen(struct uip_udp *udp)
+{
+	return 8;
+}
+
+static inline u16 uip_udp_len(struct uip_udp *udp)
+{
+	return ntohs(udp->len);
+}
+
+static inline u16 uip_tcp_hdrlen(struct uip_tcp *tcp)
+{
+	return (tcp->off >> 4) * 4;
+}
+
+static inline u16 uip_tcp_len(struct uip_tcp *tcp)
+{
+	struct uip_ip *ip;
+
+	ip = &tcp->ip;
+
+	return uip_ip_len(ip) - uip_ip_hdrlen(ip);
+}
+
+static inline u16 uip_tcp_payloadlen(struct uip_tcp *tcp)
+{
+	return uip_tcp_len(tcp) - uip_tcp_hdrlen(tcp);
+}
+
+static inline u8 *uip_tcp_payload(struct uip_tcp *tcp)
+{
+	return (u8 *)&tcp->sport + uip_tcp_hdrlen(tcp);
+}
+
+static inline bool uip_tcp_is_syn(struct uip_tcp *tcp)
+{
+	return (tcp->flg & UIP_TCP_FLAG_SYN) != 0;
+}
+
+static inline bool uip_tcp_is_fin(struct uip_tcp *tcp)
+{
+	return (tcp->flg & UIP_TCP_FLAG_FIN) != 0;
+}
+
+static inline u32 uip_tcp_isn(struct uip_tcp *tcp)
+{
+	return ntohl(tcp->seq);
+}
+
+static inline u32 uip_tcp_isn_alloc(void)
+{
+	/*
+	 * FIXME: should increase every 4ms
+	 */
+	return 10000000;
+}
+
+static inline u16 uip_eth_hdrlen(struct uip_eth *eth)
+{
+	return sizeof(*eth);
+}
+
+int uip_tx(struct iovec *iov, u16 out, struct uip_info *info);
+int uip_rx(struct iovec *iov, u16 in, struct uip_info *info);
+void uip_static_init(struct uip_info *info);
+int uip_init(struct uip_info *info);
+void uip_exit(struct uip_info *info);
+void uip_tcp_exit(struct uip_info *info);
+void uip_udp_exit(struct uip_info *info);
+
+int uip_tx_do_ipv4_udp_dhcp(struct uip_tx_arg *arg);
+int uip_tx_do_ipv4_icmp(struct uip_tx_arg *arg);
+int uip_tx_do_ipv4_tcp(struct uip_tx_arg *arg);
+int uip_tx_do_ipv4_udp(struct uip_tx_arg *arg);
+int uip_tx_do_ipv4(struct uip_tx_arg *arg);
+int uip_tx_do_arp(struct uip_tx_arg *arg);
+
+u16 uip_csum_icmp(struct uip_icmp *icmp);
+u16 uip_csum_udp(struct uip_udp *udp);
+u16 uip_csum_tcp(struct uip_tcp *tcp);
+u16 uip_csum_ip(struct uip_ip *ip);
+
+struct uip_buf *uip_buf_set_used(struct uip_info *info, struct uip_buf *buf);
+struct uip_buf *uip_buf_set_free(struct uip_info *info, struct uip_buf *buf);
+struct uip_buf *uip_buf_get_used(struct uip_info *info);
+struct uip_buf *uip_buf_get_free(struct uip_info *info);
+struct uip_buf *uip_buf_clone(struct uip_tx_arg *arg);
+
+int uip_udp_make_pkg(struct uip_info *info, struct uip_udp_socket *sk, struct uip_buf *buf, u8 *payload, int payload_len);
+bool uip_udp_is_dhcp(struct uip_udp *udp);
+
+int uip_dhcp_get_dns(struct uip_info *info);
+void uip_dhcp_exit(struct uip_info *info);
+#endif /* KVM__UIP_H */
diff --git a/kvmtool/include/kvm/util-init.h b/kvmtool/include/kvm/util-init.h
new file mode 100644
index 0000000..13d4f04
--- /dev/null
+++ b/kvmtool/include/kvm/util-init.h
@@ -0,0 +1,51 @@
+#ifndef KVM__UTIL_INIT_H
+#define KVM__UTIL_INIT_H
+
+struct kvm;
+
+struct init_item {
+	struct hlist_node n;
+	const char *fn_name;
+	int (*init)(struct kvm *);
+};
+
+int init_list__init(struct kvm *kvm);
+int init_list__exit(struct kvm *kvm);
+
+int init_list_add(struct init_item *t, int (*init)(struct kvm *),
+			int priority, const char *name);
+int exit_list_add(struct init_item *t, int (*init)(struct kvm *),
+			int priority, const char *name);
+
+#define __init_list_add(cb, l)						\
+static void __attribute__ ((constructor)) __init__##cb(void)		\
+{									\
+	static char name[] = #cb;					\
+	static struct init_item t;					\
+	init_list_add(&t, cb, l, name);					\
+}
+
+#define __exit_list_add(cb, l)						\
+static void __attribute__ ((constructor)) __init__##cb(void)		\
+{									\
+	static char name[] = #cb;					\
+	static struct init_item t;					\
+	exit_list_add(&t, cb, l, name);					\
+}
+
+#define core_init(cb) __init_list_add(cb, 0)
+#define base_init(cb) __init_list_add(cb, 2)
+#define dev_base_init(cb)  __init_list_add(cb, 4)
+#define dev_init(cb) __init_list_add(cb, 5)
+#define virtio_dev_init(cb) __init_list_add(cb, 6)
+#define firmware_init(cb) __init_list_add(cb, 7)
+#define late_init(cb) __init_list_add(cb, 9)
+
+#define core_exit(cb) __exit_list_add(cb, 0)
+#define base_exit(cb) __exit_list_add(cb, 2)
+#define dev_base_exit(cb) __exit_list_add(cb, 4)
+#define dev_exit(cb) __exit_list_add(cb, 5)
+#define virtio_dev_exit(cb) __exit_list_add(cb, 6)
+#define firmware_exit(cb) __exit_list_add(cb, 7)
+#define late_exit(cb) __exit_list_add(cb, 9)
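+
+/*
+ * Example (sketch, hypothetical module): register setup/teardown at the
+ * "dev" stage; the constructor-based helpers above add the callbacks to
+ * the global init/exit lists before main() runs:
+ *
+ *	static int my_dev__init(struct kvm *kvm) { return 0; }
+ *	dev_init(my_dev__init);
+ *
+ *	static int my_dev__exit(struct kvm *kvm) { return 0; }
+ *	dev_exit(my_dev__exit);
+ */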
+#endif
diff --git a/kvmtool/include/kvm/util.h b/kvmtool/include/kvm/util.h
new file mode 100644
index 0000000..d76568a
--- /dev/null
+++ b/kvmtool/include/kvm/util.h
@@ -0,0 +1,136 @@
+#include <linux/stringify.h>
+
+#ifndef KVM__UTIL_H
+#define KVM__UTIL_H
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+
+/*
+ * Some bits are stolen from perf tool :)
+ */
+
+#include <unistd.h>
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <string.h>
+#include <stdbool.h>
+#include <signal.h>
+#include <errno.h>
+#include <limits.h>
+#include <sys/param.h>
+#include <sys/types.h>
+#include <linux/types.h>
+
+#ifdef __GNUC__
+#define NORETURN __attribute__((__noreturn__))
+#else
+#define NORETURN
+#ifndef __attribute__
+#define __attribute__(x)
+#endif
+#endif
+
+extern bool do_debug_print;
+
+#define PROT_RW (PROT_READ|PROT_WRITE)
+#define MAP_ANON_NORESERVE (MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE)
+
+extern void die(const char *err, ...) NORETURN __attribute__((format (printf, 1, 2)));
+extern void die_perror(const char *s) NORETURN;
+extern int pr_err(const char *err, ...) __attribute__((format (printf, 1, 2)));
+extern void pr_warning(const char *err, ...) __attribute__((format (printf, 1, 2)));
+extern void pr_info(const char *err, ...) __attribute__((format (printf, 1, 2)));
+extern void set_die_routine(void (*routine)(const char *err, va_list params) NORETURN);
+
+#define pr_debug(fmt, ...)						\
+	do {								\
+		if (do_debug_print)					\
+			pr_info("(%s) %s:%d: " fmt, __FILE__,		\
+				__func__, __LINE__, ##__VA_ARGS__);	\
+	} while (0)
+
+
+#define BUILD_BUG_ON(condition)	((void)sizeof(char[1 - 2*!!(condition)]))
+
+#ifndef BUG_ON_HANDLER
+# define BUG_ON_HANDLER(condition)					\
+	do {								\
+		if ((condition)) {					\
+			pr_err("BUG at %s:%d", __FILE__, __LINE__);	\
+			raise(SIGABRT);					\
+		}							\
+	} while (0)
+#endif
+
+#define BUG_ON(condition)	BUG_ON_HANDLER((condition))
+
+#define DIE_IF(cnd)						\
+do {								\
+	if (cnd)						\
+	die(" at (" __FILE__ ":" __stringify(__LINE__) "): "	\
+		__stringify(cnd) "\n");				\
+} while (0)
+
+#define WARN_ON(condition) ({					\
+	int __ret_warn_on = !!(condition);			\
+	if (__ret_warn_on)					\
+		pr_warning("(%s) %s:%d: failed condition: %s",	\
+				__FILE__, __func__, __LINE__,	\
+				__stringify(condition));	\
+	__ret_warn_on;						\
+})
+
+#define MSECS_TO_USECS(s) ((s) * 1000)
+
+/* Millisecond sleep */
+static inline void msleep(unsigned int msecs)
+{
+	usleep(MSECS_TO_USECS(msecs));
+}
+
+/*
+ * Find last (most significant) bit set. Same implementation as Linux:
+ * fls(0) = 0, fls(1) = 1, fls(1UL << 63) = 64
+ */
+static inline int fls_long(unsigned long x)
+{
+	return x ? sizeof(x) * 8 - __builtin_clzl(x) : 0;
+}
+
+static inline unsigned long roundup_pow_of_two(unsigned long x)
+{
+	return x ? 1UL << fls_long(x - 1) : 0;
+}
+
+#define is_power_of_two(x)	((x) > 0 ? ((x) & ((x) - 1)) == 0 : 0)
+
+/**
+ * pow2_size: return the number of bits needed to store values
+ * @x: number of distinct values to store (or number of bytes)
+ *
+ * Determines the number of bits needed to store @x different values.
+ * Could be used to determine the number of address bits needed to
+ * store @x bytes.
+ *
+ * Example:
+ * pow2_size(255) => 8
+ * pow2_size(256) => 8
+ * pow2_size(257) => 9
+ *
+ * Return: number of bits
+ */
+static inline int pow2_size(unsigned long x)
+{
+	if (x <= 1)
+		return x;
+
+	return sizeof(x) * 8 - __builtin_clzl(x - 1);
+}
+
+struct kvm;
+void *mmap_hugetlbfs(struct kvm *kvm, const char *htlbfs_path, u64 size);
+void *mmap_anon_or_hugetlbfs(struct kvm *kvm, const char *hugetlbfs_path, u64 size);
+
+#endif /* KVM__UTIL_H */
diff --git a/kvmtool/include/kvm/vesa.h b/kvmtool/include/kvm/vesa.h
new file mode 100644
index 0000000..e7d9713
--- /dev/null
+++ b/kvmtool/include/kvm/vesa.h
@@ -0,0 +1,20 @@
+#ifndef KVM__VESA_H
+#define KVM__VESA_H
+
+#define VESA_WIDTH	640
+#define VESA_HEIGHT	480
+
+#define VESA_MEM_ADDR	0xd0000000
+#define VESA_BPP	32
+/*
+ * We actually only need VESA_BPP/8*VESA_WIDTH*VESA_HEIGHT bytes. But the memory
+ * size must be a power of 2, so we round up.
+ */
+#define VESA_MEM_SIZE	(1 << 21)
+
+struct kvm;
+struct biosregs;
+
+struct framebuffer *vesa__init(struct kvm *self);
+
+#endif
diff --git a/kvmtool/include/kvm/vfio.h b/kvmtool/include/kvm/vfio.h
new file mode 100644
index 0000000..28223cf
--- /dev/null
+++ b/kvmtool/include/kvm/vfio.h
@@ -0,0 +1,128 @@
+#ifndef KVM__VFIO_H
+#define KVM__VFIO_H
+
+#include "kvm/mutex.h"
+#include "kvm/parse-options.h"
+#include "kvm/pci.h"
+
+#include <linux/vfio.h>
+
+#define vfio_dev_err(vdev, fmt, ...) \
+	pr_err("%s: " fmt, (vdev)->params->name, ##__VA_ARGS__)
+#define vfio_dev_warn(vdev, fmt, ...) \
+	pr_warning("%s: " fmt, (vdev)->params->name, ##__VA_ARGS__)
+#define vfio_dev_info(vdev, fmt, ...) \
+	pr_info("%s: " fmt, (vdev)->params->name, ##__VA_ARGS__)
+#define vfio_dev_dbg(vdev, fmt, ...) \
+	pr_debug("%s: " fmt, (vdev)->params->name, ##__VA_ARGS__)
+#define vfio_dev_die(vdev, fmt, ...) \
+	die("%s: " fmt, (vdev)->params->name, ##__VA_ARGS__)
+
+/* Currently limited by num_vfio_devices */
+#define MAX_VFIO_DEVICES		256
+
+enum vfio_device_type {
+	VFIO_DEVICE_PCI,
+};
+
+/* MSI/MSI-X capability enabled */
+#define VFIO_PCI_MSI_STATE_ENABLED	(1 << 0)
+/* MSI/MSI-X capability or individual vector masked */
+#define VFIO_PCI_MSI_STATE_MASKED	(1 << 1)
+/* MSI-X capability has no vector enabled yet */
+#define VFIO_PCI_MSI_STATE_EMPTY	(1 << 2)
+
+struct vfio_pci_msi_entry {
+	struct msix_table		config;
+	int				gsi;
+	int				eventfd;
+	u8				phys_state;
+	u8				virt_state;
+};
+
+struct vfio_pci_msix_table {
+	size_t				size;
+	unsigned int			bar;
+	u32				guest_phys_addr;
+};
+
+struct vfio_pci_msix_pba {
+	size_t				size;
+	off_t				offset; /* in VFIO device fd */
+	unsigned int			bar;
+	u32				guest_phys_addr;
+};
+
+/* Common data for MSI and MSI-X */
+struct vfio_pci_msi_common {
+	off_t				pos;
+	u8				virt_state;
+	u8				phys_state;
+	struct mutex			mutex;
+	struct vfio_irq_info		info;
+	struct vfio_irq_set		*irq_set;
+	size_t				nr_entries;
+	struct vfio_pci_msi_entry	*entries;
+};
+
+#define VFIO_PCI_IRQ_MODE_INTX		(1 << 0)
+#define VFIO_PCI_IRQ_MODE_MSI		(1 << 1)
+#define VFIO_PCI_IRQ_MODE_MSIX		(1 << 2)
+
+struct vfio_pci_device {
+	struct pci_device_header	hdr;
+
+	unsigned long			irq_modes;
+	int				intx_fd;
+	int				unmask_fd;
+	unsigned int			intx_gsi;
+	struct vfio_pci_msi_common	msi;
+	struct vfio_pci_msi_common	msix;
+	struct vfio_pci_msix_table	msix_table;
+	struct vfio_pci_msix_pba	msix_pba;
+};
+
+struct vfio_region {
+	struct vfio_region_info		info;
+	struct vfio_device		*vdev;
+	u64				guest_phys_addr;
+	void				*host_addr;
+	u32				port_base;
+	int				is_ioport	:1;
+};
+
+struct vfio_device {
+	struct device_header		dev_hdr;
+	struct vfio_device_params	*params;
+	struct vfio_group		*group;
+
+	int				fd;
+	struct vfio_device_info		info;
+	struct vfio_region		*regions;
+
+	char				*sysfs_path;
+
+	struct vfio_pci_device		pci;
+};
+
+struct vfio_device_params {
+	char				*name;
+	const char			*bus;
+	enum vfio_device_type		type;
+};
+
+struct vfio_group {
+	unsigned long			id; /* iommu_group number in sysfs */
+	int				fd;
+	int				refs;
+	struct list_head		list;
+};
+
+int vfio_device_parser(const struct option *opt, const char *arg, int unset);
+int vfio_map_region(struct kvm *kvm, struct vfio_device *vdev,
+		    struct vfio_region *region);
+void vfio_unmap_region(struct kvm *kvm, struct vfio_region *region);
+int vfio_pci_setup_device(struct kvm *kvm, struct vfio_device *device);
+void vfio_pci_teardown_device(struct kvm *kvm, struct vfio_device *vdev);
+
+#endif /* KVM__VFIO_H */
diff --git a/kvmtool/include/kvm/virtio-9p.h b/kvmtool/include/kvm/virtio-9p.h
new file mode 100644
index 0000000..3ea7698
--- /dev/null
+++ b/kvmtool/include/kvm/virtio-9p.h
@@ -0,0 +1,75 @@
+#ifndef KVM__VIRTIO_9P_H
+#define KVM__VIRTIO_9P_H
+#include "kvm/virtio.h"
+#include "kvm/pci.h"
+#include "kvm/threadpool.h"
+#include "kvm/parse-options.h"
+
+#include <dirent.h>
+#include <linux/list.h>
+#include <linux/rbtree.h>
+
+#define NUM_VIRT_QUEUES		1
+#define VIRTQUEUE_NUM		128
+#define	VIRTIO_9P_DEFAULT_TAG	"kvm_9p"
+#define VIRTIO_9P_HDR_LEN	(sizeof(u32)+sizeof(u8)+sizeof(u16))
+#define VIRTIO_9P_VERSION_DOTL	"9P2000.L"
+#define MAX_TAG_LEN		32
+
+struct p9_msg {
+	u32			size;
+	u8			cmd;
+	u16			tag;
+	u8			msg[0];
+} __attribute__((packed));
+
+struct p9_fid {
+	u32			fid;
+	u32			uid;
+	char			abs_path[PATH_MAX];
+	char			*path;
+	DIR			*dir;
+	int			fd;
+	struct rb_node		node;
+};
+
+struct p9_dev_job {
+	struct virt_queue	*vq;
+	struct p9_dev		*p9dev;
+	struct thread_pool__job job_id;
+};
+
+struct p9_dev {
+	struct list_head	list;
+	struct virtio_device	vdev;
+	struct rb_root		fids;
+
+	struct virtio_9p_config	*config;
+	u32			features;
+
+	/* virtio queue */
+	struct virt_queue	vqs[NUM_VIRT_QUEUES];
+	struct p9_dev_job	jobs[NUM_VIRT_QUEUES];
+	char			root_dir[PATH_MAX];
+};
+
+struct p9_pdu {
+	u32			queue_head;
+	size_t			read_offset;
+	size_t			write_offset;
+	u16			out_iov_cnt;
+	u16			in_iov_cnt;
+	struct iovec		in_iov[VIRTQUEUE_NUM];
+	struct iovec		out_iov[VIRTQUEUE_NUM];
+};
+
+struct kvm;
+
+int virtio_9p_rootdir_parser(const struct option *opt, const char *arg, int unset);
+int virtio_9p_img_name_parser(const struct option *opt, const char *arg, int unset);
+int virtio_9p__register(struct kvm *kvm, const char *root, const char *tag_name);
+int virtio_9p__init(struct kvm *kvm);
+int virtio_p9_pdu_readf(struct p9_pdu *pdu, const char *fmt, ...);
+int virtio_p9_pdu_writef(struct p9_pdu *pdu, const char *fmt, ...);
+
+#endif
diff --git a/kvmtool/include/kvm/virtio-balloon.h b/kvmtool/include/kvm/virtio-balloon.h
new file mode 100644
index 0000000..844a1ba
--- /dev/null
+++ b/kvmtool/include/kvm/virtio-balloon.h
@@ -0,0 +1,9 @@
+#ifndef KVM__BLN_VIRTIO_H
+#define KVM__BLN_VIRTIO_H
+
+struct kvm;
+
+int virtio_bln__init(struct kvm *kvm);
+int virtio_bln__exit(struct kvm *kvm);
+
+#endif /* KVM__BLN_VIRTIO_H */
diff --git a/kvmtool/include/kvm/virtio-blk.h b/kvmtool/include/kvm/virtio-blk.h
new file mode 100644
index 0000000..12e59b6
--- /dev/null
+++ b/kvmtool/include/kvm/virtio-blk.h
@@ -0,0 +1,12 @@
+#ifndef KVM__BLK_VIRTIO_H
+#define KVM__BLK_VIRTIO_H
+
+#include "kvm/disk-image.h"
+
+struct kvm;
+
+int virtio_blk__init(struct kvm *kvm);
+int virtio_blk__exit(struct kvm *kvm);
+void virtio_blk_complete(void *param, long len);
+
+#endif /* KVM__BLK_VIRTIO_H */
diff --git a/kvmtool/include/kvm/virtio-console.h b/kvmtool/include/kvm/virtio-console.h
new file mode 100644
index 0000000..8980920
--- /dev/null
+++ b/kvmtool/include/kvm/virtio-console.h
@@ -0,0 +1,10 @@
+#ifndef KVM__CONSOLE_VIRTIO_H
+#define KVM__CONSOLE_VIRTIO_H
+
+struct kvm;
+
+int virtio_console__init(struct kvm *kvm);
+void virtio_console__inject_interrupt(struct kvm *kvm);
+int virtio_console__exit(struct kvm *kvm);
+
+#endif /* KVM__CONSOLE_VIRTIO_H */
diff --git a/kvmtool/include/kvm/virtio-mmio.h b/kvmtool/include/kvm/virtio-mmio.h
new file mode 100644
index 0000000..6bc50bd
--- /dev/null
+++ b/kvmtool/include/kvm/virtio-mmio.h
@@ -0,0 +1,60 @@
+#ifndef KVM__VIRTIO_MMIO_H
+#define KVM__VIRTIO_MMIO_H
+
+#include <linux/types.h>
+#include <linux/virtio_mmio.h>
+
+#define VIRTIO_MMIO_MAX_VQ	32
+#define VIRTIO_MMIO_MAX_CONFIG	1
+#define VIRTIO_MMIO_IO_SIZE	0x200
+
+struct kvm;
+
+struct virtio_mmio_ioevent_param {
+	struct virtio_device	*vdev;
+	u32			vq;
+};
+
+struct virtio_mmio_hdr {
+	char	magic[4];
+	u32	version;
+	u32	device_id;
+	u32	vendor_id;
+	u32	host_features;
+	u32	host_features_sel;
+	u32	reserved_1[2];
+	u32	guest_features;
+	u32	guest_features_sel;
+	u32	guest_page_size;
+	u32	reserved_2;
+	u32	queue_sel;
+	u32	queue_num_max;
+	u32	queue_num;
+	u32	queue_align;
+	u32	queue_pfn;
+	u32	reserved_3[3];
+	u32	queue_notify;
+	u32	reserved_4[3];
+	u32	interrupt_state;
+	u32	interrupt_ack;
+	u32	reserved_5[2];
+	u32	status;
+} __attribute__((packed));
+
+struct virtio_mmio {
+	u32			addr;
+	void			*dev;
+	struct kvm		*kvm;
+	u8			irq;
+	struct virtio_mmio_hdr	hdr;
+	struct device_header	dev_hdr;
+	struct virtio_mmio_ioevent_param ioeventfds[VIRTIO_MMIO_MAX_VQ];
+};
+
+int virtio_mmio_signal_vq(struct kvm *kvm, struct virtio_device *vdev, u32 vq);
+int virtio_mmio_signal_config(struct kvm *kvm, struct virtio_device *vdev);
+int virtio_mmio_exit(struct kvm *kvm, struct virtio_device *vdev);
+int virtio_mmio_reset(struct kvm *kvm, struct virtio_device *vdev);
+int virtio_mmio_init(struct kvm *kvm, void *dev, struct virtio_device *vdev,
+		      int device_id, int subsys_id, int class);
+#endif
diff --git a/kvmtool/include/kvm/virtio-net.h b/kvmtool/include/kvm/virtio-net.h
new file mode 100644
index 0000000..d136a09
--- /dev/null
+++ b/kvmtool/include/kvm/virtio-net.h
@@ -0,0 +1,33 @@
+#ifndef KVM__VIRTIO_NET_H
+#define KVM__VIRTIO_NET_H
+
+#include "kvm/parse-options.h"
+
+struct kvm;
+
+struct virtio_net_params {
+	const char *guest_ip;
+	const char *host_ip;
+	const char *script;
+	const char *downscript;
+	const char *trans;
+	const char *tapif;
+	char guest_mac[6];
+	char host_mac[6];
+	struct kvm *kvm;
+	int mode;
+	int vhost;
+	int fd;
+	int mq;
+};
+
+int virtio_net__init(struct kvm *kvm);
+int virtio_net__exit(struct kvm *kvm);
+int netdev_parser(const struct option *opt, const char *arg, int unset);
+
+enum {
+	NET_MODE_USER,
+	NET_MODE_TAP
+};
+
+#endif /* KVM__VIRTIO_NET_H */
diff --git a/kvmtool/include/kvm/virtio-pci-dev.h b/kvmtool/include/kvm/virtio-pci-dev.h
new file mode 100644
index 0000000..48ae018
--- /dev/null
+++ b/kvmtool/include/kvm/virtio-pci-dev.h
@@ -0,0 +1,38 @@
+#ifndef VIRTIO_PCI_DEV_H_
+#define VIRTIO_PCI_DEV_H_
+
+#include <linux/virtio_ids.h>
+
+/*
+ * Virtio PCI device constants and resources
+ * they do use (such as irqs and pins).
+ */
+
+#define PCI_DEVICE_ID_VIRTIO_NET		0x1000
+#define PCI_DEVICE_ID_VIRTIO_BLK		0x1001
+#define PCI_DEVICE_ID_VIRTIO_CONSOLE		0x1003
+#define PCI_DEVICE_ID_VIRTIO_RNG		0x1004
+#define PCI_DEVICE_ID_VIRTIO_BLN		0x1005
+#define PCI_DEVICE_ID_VIRTIO_SCSI		0x1008
+#define PCI_DEVICE_ID_VIRTIO_9P			0x1009
+#define PCI_DEVICE_ID_VESA			0x2000
+#define PCI_DEVICE_ID_PCI_SHMEM			0x0001
+
+#define PCI_VENDOR_ID_REDHAT_QUMRANET		0x1af4
+#define PCI_VENDOR_ID_PCI_SHMEM			0x0001
+#define PCI_SUBSYSTEM_VENDOR_ID_REDHAT_QUMRANET	0x1af4
+
+#define PCI_SUBSYSTEM_ID_VESA			0x0004
+#define PCI_SUBSYSTEM_ID_PCI_SHMEM		0x0001
+
+#define PCI_CLASS_BLK				0x018000
+#define PCI_CLASS_NET				0x020000
+#define PCI_CLASS_CONSOLE			0x078000
+/*
+ * 0xFF Device does not fit in any defined classes
+ */
+#define PCI_CLASS_RNG				0xff0000
+#define PCI_CLASS_BLN				0xff0000
+#define PCI_CLASS_9P				0xff0000
+
+#endif /* VIRTIO_PCI_DEV_H_ */
diff --git a/kvmtool/include/kvm/virtio-pci.h b/kvmtool/include/kvm/virtio-pci.h
new file mode 100644
index 0000000..959b4b8
--- /dev/null
+++ b/kvmtool/include/kvm/virtio-pci.h
@@ -0,0 +1,59 @@
+#ifndef KVM__VIRTIO_PCI_H
+#define KVM__VIRTIO_PCI_H
+
+#include "kvm/devices.h"
+#include "kvm/pci.h"
+
+#include <linux/types.h>
+
+#define VIRTIO_PCI_MAX_VQ	32
+#define VIRTIO_PCI_MAX_CONFIG	1
+
+struct kvm;
+
+struct virtio_pci_ioevent_param {
+	struct virtio_device	*vdev;
+	u32			vq;
+};
+
+#define VIRTIO_PCI_F_SIGNAL_MSI (1 << 0)
+
+struct virtio_pci {
+	struct pci_device_header pci_hdr;
+	struct device_header	dev_hdr;
+	void			*dev;
+	struct kvm		*kvm;
+
+	u8			status;
+	u8			isr;
+	u32			features;
+
+	/*
+	 * We cannot rely on the INTERRUPT_LINE byte in the config space once
+	 * we have run guest code, as the OS is allowed to use that field
+	 * as a scratch pad to communicate between driver and PCI layer.
+	 * So store our legacy interrupt line number in here for internal use.
+	 */
+	u8			legacy_irq_line;
+
+	/* MSI-X */
+	u16			config_vector;
+	u32			config_gsi;
+	u32			vq_vector[VIRTIO_PCI_MAX_VQ];
+	u32			gsis[VIRTIO_PCI_MAX_VQ];
+	u64			msix_pba;
+	struct msix_table	msix_table[VIRTIO_PCI_MAX_VQ + VIRTIO_PCI_MAX_CONFIG];
+
+	/* virtio queue */
+	u16			queue_selector;
+	struct virtio_pci_ioevent_param ioeventfds[VIRTIO_PCI_MAX_VQ];
+};
+
+int virtio_pci__signal_vq(struct kvm *kvm, struct virtio_device *vdev, u32 vq);
+int virtio_pci__signal_config(struct kvm *kvm, struct virtio_device *vdev);
+int virtio_pci__exit(struct kvm *kvm, struct virtio_device *vdev);
+int virtio_pci__reset(struct kvm *kvm, struct virtio_device *vdev);
+int virtio_pci__init(struct kvm *kvm, void *dev, struct virtio_device *vdev,
+		     int device_id, int subsys_id, int class);
+
+#endif
diff --git a/kvmtool/include/kvm/virtio-rng.h b/kvmtool/include/kvm/virtio-rng.h
new file mode 100644
index 0000000..b585b37
--- /dev/null
+++ b/kvmtool/include/kvm/virtio-rng.h
@@ -0,0 +1,9 @@
+#ifndef KVM__RNG_VIRTIO_H
+#define KVM__RNG_VIRTIO_H
+
+struct kvm;
+
+int virtio_rng__init(struct kvm *kvm);
+int virtio_rng__exit(struct kvm *kvm);
+
+#endif /* KVM__RNG_VIRTIO_H */
diff --git a/kvmtool/include/kvm/virtio-scsi.h b/kvmtool/include/kvm/virtio-scsi.h
new file mode 100644
index 0000000..d64aa7e
--- /dev/null
+++ b/kvmtool/include/kvm/virtio-scsi.h
@@ -0,0 +1,11 @@
+#ifndef KVM__SCSI_VIRTIO_H
+#define KVM__SCSI_VIRTIO_H
+
+#include "kvm/disk-image.h"
+
+struct kvm;
+
+int virtio_scsi_init(struct kvm *kvm);
+int virtio_scsi_exit(struct kvm *kvm);
+
+#endif /* KVM__SCSI_VIRTIO_H */
diff --git a/kvmtool/include/kvm/virtio.h b/kvmtool/include/kvm/virtio.h
new file mode 100644
index 0000000..3a311f5
--- /dev/null
+++ b/kvmtool/include/kvm/virtio.h
@@ -0,0 +1,234 @@
+#ifndef KVM__VIRTIO_H
+#define KVM__VIRTIO_H
+
+#include <endian.h>
+
+#include <linux/virtio_ring.h>
+#include <linux/virtio_pci.h>
+
+#include <linux/types.h>
+#include <linux/compiler.h>
+#include <linux/virtio_config.h>
+#include <sys/uio.h>
+
+#include "kvm/barrier.h"
+#include "kvm/kvm.h"
+
+#define VIRTIO_IRQ_LOW		0
+#define VIRTIO_IRQ_HIGH		1
+
+#define VIRTIO_PCI_O_CONFIG	0
+#define VIRTIO_PCI_O_MSIX	1
+
+#define VIRTIO_ENDIAN_LE	(1 << 0)
+#define VIRTIO_ENDIAN_BE	(1 << 1)
+
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#define VIRTIO_ENDIAN_HOST VIRTIO_ENDIAN_LE
+#else
+#define VIRTIO_ENDIAN_HOST VIRTIO_ENDIAN_BE
+#endif
+
+/* Reserved status bits */
+#define VIRTIO_CONFIG_S_MASK \
+	(VIRTIO_CONFIG_S_ACKNOWLEDGE |	\
+	 VIRTIO_CONFIG_S_DRIVER |	\
+	 VIRTIO_CONFIG_S_DRIVER_OK |	\
+	 VIRTIO_CONFIG_S_FEATURES_OK |	\
+	 VIRTIO_CONFIG_S_FAILED)
+
+/* Kvmtool status bits */
+/* Start the device */
+#define VIRTIO__STATUS_START		(1 << 8)
+/* Stop the device */
+#define VIRTIO__STATUS_STOP		(1 << 9)
+
+struct virt_queue {
+	struct vring	vring;
+	u32		pfn;
+	/*
+	 * The last_avail_idx field is an index into ->ring of struct
+	 * vring_avail; it is where we assume the next request index is.
+	 */
+	u16		last_avail_idx;
+	u16		last_used_signalled;
+	u16		endian;
+	bool		use_event_idx;
+	bool		enabled;
+};
+
+/*
+ * The default policy is not to cope with guest endianness; this also
+ * avoids breaking archs that do not care about supporting such a
+ * configuration.
+ */
+#ifndef VIRTIO_RING_ENDIAN
+#define VIRTIO_RING_ENDIAN VIRTIO_ENDIAN_HOST
+#endif
+
+#if VIRTIO_RING_ENDIAN != VIRTIO_ENDIAN_HOST
+
+static inline __u16 __virtio_g2h_u16(u16 endian, __u16 val)
+{
+	return (endian == VIRTIO_ENDIAN_LE) ? le16toh(val) : be16toh(val);
+}
+
+static inline __u16 __virtio_h2g_u16(u16 endian, __u16 val)
+{
+	return (endian == VIRTIO_ENDIAN_LE) ? htole16(val) : htobe16(val);
+}
+
+static inline __u32 __virtio_g2h_u32(u16 endian, __u32 val)
+{
+	return (endian == VIRTIO_ENDIAN_LE) ? le32toh(val) : be32toh(val);
+}
+
+static inline __u32 __virtio_h2g_u32(u16 endian, __u32 val)
+{
+	return (endian == VIRTIO_ENDIAN_LE) ? htole32(val) : htobe32(val);
+}
+
+static inline __u64 __virtio_g2h_u64(u16 endian, __u64 val)
+{
+	return (endian == VIRTIO_ENDIAN_LE) ? le64toh(val) : be64toh(val);
+}
+
+static inline __u64 __virtio_h2g_u64(u16 endian, __u64 val)
+{
+	return (endian == VIRTIO_ENDIAN_LE) ? htole64(val) : htobe64(val);
+}
+
+#define virtio_guest_to_host_u16(x, v)	__virtio_g2h_u16((x)->endian, (v))
+#define virtio_host_to_guest_u16(x, v)	__virtio_h2g_u16((x)->endian, (v))
+#define virtio_guest_to_host_u32(x, v)	__virtio_g2h_u32((x)->endian, (v))
+#define virtio_host_to_guest_u32(x, v)	__virtio_h2g_u32((x)->endian, (v))
+#define virtio_guest_to_host_u64(x, v)	__virtio_g2h_u64((x)->endian, (v))
+#define virtio_host_to_guest_u64(x, v)	__virtio_h2g_u64((x)->endian, (v))
+
+#else
+
+#define virtio_guest_to_host_u16(x, v)	(v)
+#define virtio_host_to_guest_u16(x, v)	(v)
+#define virtio_guest_to_host_u32(x, v)	(v)
+#define virtio_host_to_guest_u32(x, v)	(v)
+#define virtio_guest_to_host_u64(x, v)	(v)
+#define virtio_host_to_guest_u64(x, v)	(v)
+
+#endif
+
+static inline u16 virt_queue__pop(struct virt_queue *queue)
+{
+	__u16 guest_idx;
+
+	/*
+	 * The guest updates the avail index after writing the ring entry.
+	 * Ensure that we read the updated entry once virt_queue__available()
+	 * observes the new index.
+	 */
+	rmb();
+
+	guest_idx = queue->vring.avail->ring[queue->last_avail_idx++ % queue->vring.num];
+	return virtio_guest_to_host_u16(queue, guest_idx);
+}
+
+static inline struct vring_desc *virt_queue__get_desc(struct virt_queue *queue, u16 desc_ndx)
+{
+	return &queue->vring.desc[desc_ndx];
+}
+
+static inline bool virt_queue__available(struct virt_queue *vq)
+{
+	u16 last_avail_idx = virtio_host_to_guest_u16(vq, vq->last_avail_idx);
+
+	if (!vq->vring.avail)
+		return 0;
+
+	if (vq->use_event_idx) {
+		vring_avail_event(&vq->vring) = last_avail_idx;
+		/*
+		 * After the driver writes a new avail index, it reads the event
+		 * index to see if we need any notification. Ensure that it
+		 * reads the updated index, or else we'll miss the notification.
+		 */
+		mb();
+	}
+
+	return vq->vring.avail->idx != last_avail_idx;
+}
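+
+/*
+ * A minimal consumption loop, as a sketch of how these helpers combine
+ * (the descriptor handling is device-specific and elided here):
+ *
+ *	while (virt_queue__available(vq)) {
+ *		u16 head = virt_queue__pop(vq);
+ *		struct vring_desc *desc = virt_queue__get_desc(vq, head);
+ *		// ... process the descriptor chain starting at 'desc' ...
+ *	}
+ */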
+
+void virt_queue__used_idx_advance(struct virt_queue *queue, u16 jump);
+struct vring_used_elem * virt_queue__set_used_elem_no_update(struct virt_queue *queue, u32 head, u32 len, u16 offset);
+struct vring_used_elem *virt_queue__set_used_elem(struct virt_queue *queue, u32 head, u32 len);
+
+bool virtio_queue__should_signal(struct virt_queue *vq);
+u16 virt_queue__get_iov(struct virt_queue *vq, struct iovec iov[],
+			u16 *out, u16 *in, struct kvm *kvm);
+u16 virt_queue__get_head_iov(struct virt_queue *vq, struct iovec iov[],
+			     u16 *out, u16 *in, u16 head, struct kvm *kvm);
+u16 virt_queue__get_inout_iov(struct kvm *kvm, struct virt_queue *queue,
+			      struct iovec in_iov[], struct iovec out_iov[],
+			      u16 *in, u16 *out);
+int virtio__get_dev_specific_field(int offset, bool msix, u32 *config_off);
+
+enum virtio_trans {
+	VIRTIO_PCI,
+	VIRTIO_MMIO,
+};
+
+struct virtio_device {
+	bool			use_vhost;
+	void			*virtio;
+	struct virtio_ops	*ops;
+	u16			endian;
+	u32			features;
+	u32			status;
+};
+
+struct virtio_ops {
+	u8 *(*get_config)(struct kvm *kvm, void *dev);
+	u32 (*get_host_features)(struct kvm *kvm, void *dev);
+	void (*set_guest_features)(struct kvm *kvm, void *dev, u32 features);
+	int (*get_vq_count)(struct kvm *kvm, void *dev);
+	int (*init_vq)(struct kvm *kvm, void *dev, u32 vq, u32 page_size,
+		       u32 align, u32 pfn);
+	void (*exit_vq)(struct kvm *kvm, void *dev, u32 vq);
+	int (*notify_vq)(struct kvm *kvm, void *dev, u32 vq);
+	struct virt_queue *(*get_vq)(struct kvm *kvm, void *dev, u32 vq);
+	int (*get_size_vq)(struct kvm *kvm, void *dev, u32 vq);
+	int (*set_size_vq)(struct kvm *kvm, void *dev, u32 vq, int size);
+	void (*notify_vq_gsi)(struct kvm *kvm, void *dev, u32 vq, u32 gsi);
+	void (*notify_vq_eventfd)(struct kvm *kvm, void *dev, u32 vq, u32 efd);
+	int (*signal_vq)(struct kvm *kvm, struct virtio_device *vdev, u32 queueid);
+	int (*signal_config)(struct kvm *kvm, struct virtio_device *vdev);
+	void (*notify_status)(struct kvm *kvm, void *dev, u32 status);
+	int (*init)(struct kvm *kvm, void *dev, struct virtio_device *vdev,
+		    int device_id, int subsys_id, int class);
+	int (*exit)(struct kvm *kvm, struct virtio_device *vdev);
+	int (*reset)(struct kvm *kvm, struct virtio_device *vdev);
+};
+
+int __must_check virtio_init(struct kvm *kvm, void *dev, struct virtio_device *vdev,
+			     struct virtio_ops *ops, enum virtio_trans trans,
+			     int device_id, int subsys_id, int class);
+int virtio_compat_add_message(const char *device, const char *config);
+const char* virtio_trans_name(enum virtio_trans trans);
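+
+/*
+ * Sketch of how a device might wire itself up (names hypothetical; a
+ * real device also implements the remaining callbacks above):
+ *
+ *	static struct virtio_ops my_dev_ops = {
+ *		.get_config		= my_dev_get_config,
+ *		.get_host_features	= my_dev_get_host_features,
+ *		.notify_vq		= my_dev_notify_vq,
+ *		...
+ *	};
+ *
+ *	r = virtio_init(kvm, &my_dev, &my_dev.vdev, &my_dev_ops,
+ *			VIRTIO_PCI, device_id, subsys_id, class);
+ */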
+
+static inline void *virtio_get_vq(struct kvm *kvm, u32 pfn, u32 page_size)
+{
+	return guest_flat_to_host(kvm, (u64)pfn * page_size);
+}
+
+static inline void virtio_init_device_vq(struct virtio_device *vdev,
+					 struct virt_queue *vq)
+{
+	vq->endian = vdev->endian;
+	vq->use_event_idx = (vdev->features & VIRTIO_RING_F_EVENT_IDX);
+	vq->enabled = true;
+}
+
+void virtio_exit_vq(struct kvm *kvm, struct virtio_device *vdev, void *dev,
+		    int num);
+void virtio_set_guest_features(struct kvm *kvm, struct virtio_device *vdev,
+			       void *dev, u32 features);
+void virtio_notify_status(struct kvm *kvm, struct virtio_device *vdev,
+			  void *dev, u8 status);
+
+#endif /* KVM__VIRTIO_H */
diff --git a/kvmtool/include/kvm/vnc.h b/kvmtool/include/kvm/vnc.h
new file mode 100644
index 0000000..c2934a4
--- /dev/null
+++ b/kvmtool/include/kvm/vnc.h
@@ -0,0 +1,22 @@
+#ifndef KVM__VNC_H
+#define KVM__VNC_H
+
+#include "kvm/kvm.h"
+
+struct framebuffer;
+
+#ifdef CONFIG_HAS_VNCSERVER
+int vnc__init(struct kvm *kvm);
+int vnc__exit(struct kvm *kvm);
+#else
+static inline int vnc__init(struct kvm *kvm)
+{
+	return 0;
+}
+static inline int vnc__exit(struct kvm *kvm)
+{
+	return 0;
+}
+#endif
+
+#endif /* KVM__VNC_H */
diff --git a/kvmtool/include/linux/9p.h b/kvmtool/include/linux/9p.h
new file mode 100644
index 0000000..c7cfd8e
--- /dev/null
+++ b/kvmtool/include/linux/9p.h
@@ -0,0 +1,600 @@
+/*
+ * include/net/9p/9p.h
+ *
+ * 9P protocol definitions.
+ *
+ *  Copyright (C) 2005 by Latchesar Ionkov <lucho@ionkov.net>
+ *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2
+ *  as published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+#ifndef NET_9P_H
+#define NET_9P_H
+
+/* These definitions are from Linux include/linux/uidgid.h */
+typedef struct {
+	uid_t val;
+} kuid_t;
+
+typedef struct {
+	gid_t val;
+} kgid_t;
+
+#define KUIDT_INIT(value) (kuid_t){ value }
+#define KGIDT_INIT(value) (kgid_t){ value }
+
+static inline uid_t __kuid_val(kuid_t uid)
+{
+	return uid.val;
+}
+
+static inline gid_t __kgid_val(kgid_t gid)
+{
+	return gid.val;
+}
+
+
+/**
+ * enum p9_debug_flags - bits for mount time debug parameter
+ * @P9_DEBUG_ERROR: more verbose error messages including original error string
+ * @P9_DEBUG_9P: 9P protocol tracing
+ * @P9_DEBUG_VFS: VFS API tracing
+ * @P9_DEBUG_CONV: protocol conversion tracing
+ * @P9_DEBUG_MUX: trace management of concurrent transactions
+ * @P9_DEBUG_TRANS: transport tracing
+ * @P9_DEBUG_SLABS: memory management tracing
+ * @P9_DEBUG_FCALL: verbose dump of protocol messages
+ * @P9_DEBUG_FID: fid allocation/deallocation tracking
+ * @P9_DEBUG_PKT: packet marshalling/unmarshalling
+ * @P9_DEBUG_FSC: FS-cache tracing
+ * @P9_DEBUG_VPKT: Verbose packet debugging (full packet dump)
+ *
+ * These flags are passed at mount time to turn on various levels of
+ * verbosity and tracing which will be output to the system logs.
+ */
+
+enum p9_debug_flags {
+	P9_DEBUG_ERROR = 	(1<<0),
+	P9_DEBUG_9P = 		(1<<2),
+	P9_DEBUG_VFS =		(1<<3),
+	P9_DEBUG_CONV =		(1<<4),
+	P9_DEBUG_MUX =		(1<<5),
+	P9_DEBUG_TRANS =	(1<<6),
+	P9_DEBUG_SLABS =      	(1<<7),
+	P9_DEBUG_FCALL =	(1<<8),
+	P9_DEBUG_FID =		(1<<9),
+	P9_DEBUG_PKT =		(1<<10),
+	P9_DEBUG_FSC =		(1<<11),
+	P9_DEBUG_VPKT =		(1<<12),
+};
+
+#ifdef CONFIG_NET_9P_DEBUG
+extern unsigned int p9_debug_level;
+__printf(3, 4)
+void _p9_debug(enum p9_debug_flags level, const char *func,
+	       const char *fmt, ...);
+#define p9_debug(level, fmt, ...)			\
+	_p9_debug(level, __func__, fmt, ##__VA_ARGS__)
+#else
+#define p9_debug(level, fmt, ...)			\
+	no_printk(fmt, ##__VA_ARGS__)
+#endif
+
+/**
+ * enum p9_msg_t - 9P message types
+ * @P9_TLERROR: not used
+ * @P9_RLERROR: response for any failed request for 9P2000.L
+ * @P9_TSTATFS: file system status request
+ * @P9_RSTATFS: file system status response
+ * @P9_TSYMLINK: make symlink request
+ * @P9_RSYMLINK: make symlink response
+ * @P9_TMKNOD: create a special file object request
+ * @P9_RMKNOD: create a special file object response
+ * @P9_TLCREATE: prepare a handle for I/O on a new file for 9P2000.L
+ * @P9_RLCREATE: response with file access information for 9P2000.L
+ * @P9_TRENAME: rename request
+ * @P9_RRENAME: rename response
+ * @P9_TMKDIR: create a directory request
+ * @P9_RMKDIR: create a directory response
+ * @P9_TVERSION: version handshake request
+ * @P9_RVERSION: version handshake response
+ * @P9_TAUTH: request to establish authentication channel
+ * @P9_RAUTH: response with authentication information
+ * @P9_TATTACH: establish user access to file service
+ * @P9_RATTACH: response with top level handle to file hierarchy
+ * @P9_TERROR: not used
+ * @P9_RERROR: response for any failed request
+ * @P9_TFLUSH: request to abort a previous request
+ * @P9_RFLUSH: response when previous request has been cancelled
+ * @P9_TWALK: descend a directory hierarchy
+ * @P9_RWALK: response with new handle for position within hierarchy
+ * @P9_TOPEN: prepare a handle for I/O on an existing file
+ * @P9_ROPEN: response with file access information
+ * @P9_TCREATE: prepare a handle for I/O on a new file
+ * @P9_RCREATE: response with file access information
+ * @P9_TREAD: request to transfer data from a file or directory
+ * @P9_RREAD: response with data requested
+ * @P9_TWRITE: request to transfer data to a file
+ * @P9_RWRITE: response with how much data was transferred to file
+ * @P9_TCLUNK: forget about a handle to an entity within the file system
+ * @P9_RCLUNK: response when server has forgotten about the handle
+ * @P9_TREMOVE: request to remove an entity from the hierarchy
+ * @P9_RREMOVE: response when server has removed the entity
+ * @P9_TSTAT: request file entity attributes
+ * @P9_RSTAT: response with file entity attributes
+ * @P9_TWSTAT: request to update file entity attributes
+ * @P9_RWSTAT: response when file entity attributes are updated
+ *
+ * There are 14 basic operations in 9P2000, paired as
+ * requests and responses.  The one special case is ERROR
+ * as there is no @P9_TERROR request for clients to transmit to
+ * the server, but the server may respond to any other request
+ * with an @P9_RERROR.
+ *
+ * See Also: http://plan9.bell-labs.com/sys/man/5/INDEX.html
+ */
+
+enum p9_msg_t {
+	P9_TLERROR = 6,
+	P9_RLERROR,
+	P9_TSTATFS = 8,
+	P9_RSTATFS,
+	P9_TLOPEN = 12,
+	P9_RLOPEN,
+	P9_TLCREATE = 14,
+	P9_RLCREATE,
+	P9_TSYMLINK = 16,
+	P9_RSYMLINK,
+	P9_TMKNOD = 18,
+	P9_RMKNOD,
+	P9_TRENAME = 20,
+	P9_RRENAME,
+	P9_TREADLINK = 22,
+	P9_RREADLINK,
+	P9_TGETATTR = 24,
+	P9_RGETATTR,
+	P9_TSETATTR = 26,
+	P9_RSETATTR,
+	P9_TXATTRWALK = 30,
+	P9_RXATTRWALK,
+	P9_TXATTRCREATE = 32,
+	P9_RXATTRCREATE,
+	P9_TREADDIR = 40,
+	P9_RREADDIR,
+	P9_TFSYNC = 50,
+	P9_RFSYNC,
+	P9_TLOCK = 52,
+	P9_RLOCK,
+	P9_TGETLOCK = 54,
+	P9_RGETLOCK,
+	P9_TLINK = 70,
+	P9_RLINK,
+	P9_TMKDIR = 72,
+	P9_RMKDIR,
+	P9_TRENAMEAT = 74,
+	P9_RRENAMEAT,
+	P9_TUNLINKAT = 76,
+	P9_RUNLINKAT,
+	P9_TVERSION = 100,
+	P9_RVERSION,
+	P9_TAUTH = 102,
+	P9_RAUTH,
+	P9_TATTACH = 104,
+	P9_RATTACH,
+	P9_TERROR = 106,
+	P9_RERROR,
+	P9_TFLUSH = 108,
+	P9_RFLUSH,
+	P9_TWALK = 110,
+	P9_RWALK,
+	P9_TOPEN = 112,
+	P9_ROPEN,
+	P9_TCREATE = 114,
+	P9_RCREATE,
+	P9_TREAD = 116,
+	P9_RREAD,
+	P9_TWRITE = 118,
+	P9_RWRITE,
+	P9_TCLUNK = 120,
+	P9_RCLUNK,
+	P9_TREMOVE = 122,
+	P9_RREMOVE,
+	P9_TSTAT = 124,
+	P9_RSTAT,
+	P9_TWSTAT = 126,
+	P9_RWSTAT,
+};
+
+/**
+ * enum p9_open_mode_t - 9P open modes
+ * @P9_OREAD: open file for reading only
+ * @P9_OWRITE: open file for writing only
+ * @P9_ORDWR: open file for reading or writing
+ * @P9_OEXEC: open file for execution
+ * @P9_OTRUNC: truncate file to zero-length before opening it
+ * @P9_OREXEC: close the file when an exec(2) system call is made
+ * @P9_ORCLOSE: remove the file when the file is closed
+ * @P9_OAPPEND: open the file and seek to the end
+ * @P9_OEXCL: only create a file, do not open it
+ *
+ * 9P open modes differ slightly from Posix standard modes.
+ * In particular, there are extra modes which specify different
+ * semantic behaviors than may be available on standard Posix
+ * systems.  For example, @P9_OREXEC and @P9_ORCLOSE are modes that
+ * most likely will not be issued from the Linux VFS client, but may
+ * be supported by servers.
+ *
+ * See Also: http://plan9.bell-labs.com/magic/man2html/2/open
+ */
+
+enum p9_open_mode_t {
+	P9_OREAD = 0x00,
+	P9_OWRITE = 0x01,
+	P9_ORDWR = 0x02,
+	P9_OEXEC = 0x03,
+	P9_OTRUNC = 0x10,
+	P9_OREXEC = 0x20,
+	P9_ORCLOSE = 0x40,
+	P9_OAPPEND = 0x80,
+	P9_OEXCL = 0x1000,
+};
+
+/**
+ * enum p9_perm_t - 9P permissions
+ * @P9_DMDIR: mode bit for directories
+ * @P9_DMAPPEND: mode bit for append-only files
+ * @P9_DMEXCL: mode bit for exclusive use (only one open handle allowed)
+ * @P9_DMMOUNT: mode bit for mount points
+ * @P9_DMAUTH: mode bit for authentication file
+ * @P9_DMTMP: mode bit for non-backed-up files
+ * @P9_DMSYMLINK: mode bit for symbolic links (9P2000.u)
+ * @P9_DMLINK: mode bit for hard-link (9P2000.u)
+ * @P9_DMDEVICE: mode bit for device files (9P2000.u)
+ * @P9_DMNAMEDPIPE: mode bit for named pipe (9P2000.u)
+ * @P9_DMSOCKET: mode bit for socket (9P2000.u)
+ * @P9_DMSETUID: mode bit for setuid (9P2000.u)
+ * @P9_DMSETGID: mode bit for setgid (9P2000.u)
+ * @P9_DMSETVTX: mode bit for sticky bit (9P2000.u)
+ *
+ * 9P permissions differ slightly from Posix standard modes.
+ *
+ * See Also: http://plan9.bell-labs.com/magic/man2html/2/stat
+ */
+enum p9_perm_t {
+	P9_DMDIR = 0x80000000,
+	P9_DMAPPEND = 0x40000000,
+	P9_DMEXCL = 0x20000000,
+	P9_DMMOUNT = 0x10000000,
+	P9_DMAUTH = 0x08000000,
+	P9_DMTMP = 0x04000000,
+/* 9P2000.u extensions */
+	P9_DMSYMLINK = 0x02000000,
+	P9_DMLINK = 0x01000000,
+	P9_DMDEVICE = 0x00800000,
+	P9_DMNAMEDPIPE = 0x00200000,
+	P9_DMSOCKET = 0x00100000,
+	P9_DMSETUID = 0x00080000,
+	P9_DMSETGID = 0x00040000,
+	P9_DMSETVTX = 0x00010000,
+};
+
+/* 9p2000.L open flags */
+#define P9_DOTL_RDONLY        00000000
+#define P9_DOTL_WRONLY        00000001
+#define P9_DOTL_RDWR          00000002
+#define P9_DOTL_NOACCESS      00000003
+#define P9_DOTL_CREATE        00000100
+#define P9_DOTL_EXCL          00000200
+#define P9_DOTL_NOCTTY        00000400
+#define P9_DOTL_TRUNC         00001000
+#define P9_DOTL_APPEND        00002000
+#define P9_DOTL_NONBLOCK      00004000
+#define P9_DOTL_DSYNC         00010000
+#define P9_DOTL_FASYNC        00020000
+#define P9_DOTL_DIRECT        00040000
+#define P9_DOTL_LARGEFILE     00100000
+#define P9_DOTL_DIRECTORY     00200000
+#define P9_DOTL_NOFOLLOW      00400000
+#define P9_DOTL_NOATIME       01000000
+#define P9_DOTL_CLOEXEC       02000000
+#define P9_DOTL_SYNC          04000000
+
+/* 9p2000.L at flags */
+#define P9_DOTL_AT_REMOVEDIR		0x200
+
+/* 9p2000.L lock type */
+#define P9_LOCK_TYPE_RDLCK 0
+#define P9_LOCK_TYPE_WRLCK 1
+#define P9_LOCK_TYPE_UNLCK 2
+
+/**
+ * enum p9_qid_t - QID types
+ * @P9_QTDIR: directory
+ * @P9_QTAPPEND: append-only
+ * @P9_QTEXCL: exclusive use (only one open handle allowed)
+ * @P9_QTMOUNT: mount points
+ * @P9_QTAUTH: authentication file
+ * @P9_QTTMP: non-backed-up files
+ * @P9_QTSYMLINK: symbolic links (9P2000.u)
+ * @P9_QTLINK: hard-link (9P2000.u)
+ * @P9_QTFILE: normal files
+ *
+ * QID types are a subset of permissions - they are primarily
+ * used to differentiate semantics for a file system entity via
+ * a jump-table.  Their value is also the most significant 16 bits
+ * of the permission_t.
+ *
+ * See Also: http://plan9.bell-labs.com/magic/man2html/2/stat
+ */
+enum p9_qid_t {
+	P9_QTDIR = 0x80,
+	P9_QTAPPEND = 0x40,
+	P9_QTEXCL = 0x20,
+	P9_QTMOUNT = 0x10,
+	P9_QTAUTH = 0x08,
+	P9_QTTMP = 0x04,
+	P9_QTSYMLINK = 0x02,
+	P9_QTLINK = 0x01,
+	P9_QTFILE = 0x00,
+};
+
+/* 9P Magic Numbers */
+#define P9_NOTAG	(u16)(~0)
+#define P9_NOFID	(u32)(~0)
+#define P9_MAXWELEM	16
+
+/* ample room for Twrite/Rread header */
+#define P9_IOHDRSZ	24
+
+/* Room for readdir header */
+#define P9_READDIRHDRSZ	24
+
+/* size of header for zero copy read/write */
+#define P9_ZC_HDR_SZ 4096
+
+/**
+ * struct p9_qid - file system entity information
+ * @type: 8-bit type &p9_qid_t
+ * @version: 16-bit monotonically incrementing version number
+ * @path: 64-bit per-server-unique ID for a file system element
+ *
+ * qids are identifiers used by 9P servers to track file system
+ * entities.  The type is used to differentiate semantics for operations
+ * on the entity (ie. read means something different on a directory than
+ * on a file).  The path provides a server unique index for an entity
+ * (roughly analogous to an inode number), while the version is updated
+ * every time a file is modified and can be used to maintain cache
+ * coherency between clients and servers.
+ * Servers will often differentiate purely synthetic entities by setting
+ * their version to 0, signaling that they should never be cached and
+ * should be accessed synchronously.
+ *
+ * See Also: http://plan9.bell-labs.com/magic/man2html/2/stat
+ */
+
+struct p9_qid {
+	u8 type;
+	u32 version;
+	u64 path;
+};
+
+/**
+ * struct p9_wstat - file system metadata information
+ * @size: length prefix for this stat structure instance
+ * @type: the type of the server (equivalent to a major number)
+ * @dev: the sub-type of the server (equivalent to a minor number)
+ * @qid: unique id from the server of type &p9_qid
+ * @mode: Plan 9 format permissions of type &p9_perm_t
+ * @atime: Last access/read time
+ * @mtime: Last modify/write time
+ * @length: file length
+ * @name: last element of path (aka filename)
+ * @uid: owner name
+ * @gid: group owner
+ * @muid: last modifier
+ * @extension: area used to encode extended UNIX support
+ * @n_uid: numeric user id of owner (part of 9p2000.u extension)
+ * @n_gid: numeric group id (part of 9p2000.u extension)
+ * @n_muid: numeric user id of last modifier (part of 9p2000.u extension)
+ *
+ * See Also: http://plan9.bell-labs.com/magic/man2html/2/stat
+ */
+
+struct p9_wstat {
+	u16 size;
+	u16 type;
+	u32 dev;
+	struct p9_qid qid;
+	u32 mode;
+	u32 atime;
+	u32 mtime;
+	u64 length;
+	char *name;
+	char *uid;
+	char *gid;
+	char *muid;
+	char *extension;	/* 9p2000.u extensions */
+	kuid_t n_uid;		/* 9p2000.u extensions */
+	kgid_t n_gid;		/* 9p2000.u extensions */
+	kuid_t n_muid;		/* 9p2000.u extensions */
+};
+
+struct p9_stat_dotl {
+	u64 st_result_mask;
+	struct p9_qid qid;
+	u32 st_mode;
+	kuid_t st_uid;
+	kgid_t st_gid;
+	u64 st_nlink;
+	u64 st_rdev;
+	u64 st_size;
+	u64 st_blksize;
+	u64 st_blocks;
+	u64 st_atime_sec;
+	u64 st_atime_nsec;
+	u64 st_mtime_sec;
+	u64 st_mtime_nsec;
+	u64 st_ctime_sec;
+	u64 st_ctime_nsec;
+	u64 st_btime_sec;
+	u64 st_btime_nsec;
+	u64 st_gen;
+	u64 st_data_version;
+};
+
+#define P9_STATS_MODE		0x00000001ULL
+#define P9_STATS_NLINK		0x00000002ULL
+#define P9_STATS_UID		0x00000004ULL
+#define P9_STATS_GID		0x00000008ULL
+#define P9_STATS_RDEV		0x00000010ULL
+#define P9_STATS_ATIME		0x00000020ULL
+#define P9_STATS_MTIME		0x00000040ULL
+#define P9_STATS_CTIME		0x00000080ULL
+#define P9_STATS_INO		0x00000100ULL
+#define P9_STATS_SIZE		0x00000200ULL
+#define P9_STATS_BLOCKS		0x00000400ULL
+
+#define P9_STATS_BTIME		0x00000800ULL
+#define P9_STATS_GEN		0x00001000ULL
+#define P9_STATS_DATA_VERSION	0x00002000ULL
+
+#define P9_STATS_BASIC		0x000007ffULL /* Mask for fields up to BLOCKS */
+#define P9_STATS_ALL		0x00003fffULL /* Mask for All fields above */
+
+/**
+ * struct p9_iattr_dotl - P9 inode attribute for setattr
+ * @valid: bitfield specifying which fields are valid
+ *         same as in struct iattr
+ * @mode: File permission bits
+ * @uid: user id of owner
+ * @gid: group id
+ * @size: File size
+ * @atime_sec: Last access time, seconds
+ * @atime_nsec: Last access time, nanoseconds
+ * @mtime_sec: Last modification time, seconds
+ * @mtime_nsec: Last modification time, nanoseconds
+ */
+
+struct p9_iattr_dotl {
+	u32 valid;
+	u32 mode;
+	kuid_t uid;
+	kgid_t gid;
+	u64 size;
+	u64 atime_sec;
+	u64 atime_nsec;
+	u64 mtime_sec;
+	u64 mtime_nsec;
+};
+
+#define P9_LOCK_SUCCESS 0
+#define P9_LOCK_BLOCKED 1
+#define P9_LOCK_ERROR 2
+#define P9_LOCK_GRACE 3
+
+#define P9_LOCK_FLAGS_BLOCK 1
+#define P9_LOCK_FLAGS_RECLAIM 2
+
+/* struct p9_flock: POSIX lock structure
+ * @type - type of lock
+ * @flags - lock flags
+ * @start - starting offset of the lock
+ * @length - number of bytes
+ * @proc_id - process id which wants to take lock
+ * @client_id - client id
+ */
+
+struct p9_flock {
+	u8 type;
+	u32 flags;
+	u64 start;
+	u64 length;
+	u32 proc_id;
+	char *client_id;
+};
+
+/* struct p9_getlock: getlock structure
+ * @type - type of lock
+ * @start - starting offset of the lock
+ * @length - number of bytes
+ * @proc_id - process id which wants to take lock
+ * @client_id - client id
+ */
+
+struct p9_getlock {
+	u8 type;
+	u64 start;
+	u64 length;
+	u32 proc_id;
+	char *client_id;
+};
+
+struct p9_rstatfs {
+	u32 type;
+	u32 bsize;
+	u64 blocks;
+	u64 bfree;
+	u64 bavail;
+	u64 files;
+	u64 ffree;
+	u64 fsid;
+	u32 namelen;
+};
+
+/**
+ * struct p9_fcall - primary packet structure
+ * @size: prefixed length of the structure
+ * @id: protocol operating identifier of type &p9_msg_t
+ * @tag: transaction id of the request
+ * @offset: used by marshalling routines to track current position in buffer
+ * @capacity: used by marshalling routines to track total malloc'd capacity
+ * @sdata: payload
+ *
+ * &p9_fcall represents the structure for all 9P RPC
+ * transactions.  Requests are packaged into fcalls, and responses
+ * must be extracted from them.
+ *
+ * See Also: http://plan9.bell-labs.com/magic/man2html/2/fcall
+ */
+
+struct p9_fcall {
+	u32 size;
+	u8 id;
+	u16 tag;
+
+	size_t offset;
+	size_t capacity;
+
+	u8 *sdata;
+};
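+
+/*
+ * Wire framing sketch: every 9P message is the little-endian sequence
+ *
+ *	size[4] id[1] tag[2] payload[size - 7]
+ *
+ * which maps onto the first three fields of struct p9_fcall above; the
+ * marshalling code tracks its position in @sdata via @offset/@capacity.
+ */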
+
+struct p9_idpool;
+
+int p9_errstr2errno(char *errstr, int len);
+
+struct p9_idpool *p9_idpool_create(void);
+void p9_idpool_destroy(struct p9_idpool *);
+int p9_idpool_get(struct p9_idpool *p);
+void p9_idpool_put(int id, struct p9_idpool *p);
+int p9_idpool_check(int id, struct p9_idpool *p);
+
+int p9_error_init(void);
+int p9_trans_fd_init(void);
+void p9_trans_fd_exit(void);
+#endif /* NET_9P_H */
diff --git a/kvmtool/include/linux/bitops.h b/kvmtool/include/linux/bitops.h
new file mode 100644
index 0000000..56448b7
--- /dev/null
+++ b/kvmtool/include/linux/bitops.h
@@ -0,0 +1,33 @@
+#ifndef _KVM_LINUX_BITOPS_H_
+#define _KVM_LINUX_BITOPS_H_
+
+#include <linux/kernel.h>
+#include <linux/compiler.h>
+#include <asm/hweight.h>
+
+#define BITS_PER_LONG __WORDSIZE
+#define BITS_PER_BYTE           8
+#define BITS_TO_LONGS(nr)       DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(long))
+
+static inline void set_bit(int nr, unsigned long *addr)
+{
+	addr[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG);
+}
+
+static inline void clear_bit(int nr, unsigned long *addr)
+{
+	addr[nr / BITS_PER_LONG] &= ~(1UL << (nr % BITS_PER_LONG));
+}
+
+static __always_inline int test_bit(unsigned int nr, const unsigned long *addr)
+{
+	return ((1UL << (nr % BITS_PER_LONG)) &
+		(((unsigned long *)addr)[nr / BITS_PER_LONG])) != 0;
+}
+
+static inline unsigned long hweight_long(unsigned long w)
+{
+	return sizeof(w) == 4 ? hweight32(w) : hweight64(w);
+}
+
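+/*
+ * Usage sketch: callers allocate the bitmap in longs, e.g.
+ *
+ *	unsigned long map[BITS_TO_LONGS(128)] = { 0 };
+ *
+ *	set_bit(5, map);
+ *	if (test_bit(5, map))
+ *		clear_bit(5, map);
+ *
+ * Note that these helpers are not atomic, unlike their kernel namesakes.
+ */
+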
+#endif
diff --git a/kvmtool/include/linux/byteorder.h b/kvmtool/include/linux/byteorder.h
new file mode 100644
index 0000000..8176361
--- /dev/null
+++ b/kvmtool/include/linux/byteorder.h
@@ -0,0 +1,34 @@
+#ifndef __BYTE_ORDER_H__
+#define __BYTE_ORDER_H__
+
+#include <asm/byteorder.h>
+
+/* taken from include/linux/byteorder/generic.h */
+#define cpu_to_le64 __cpu_to_le64
+#define le64_to_cpu __le64_to_cpu
+#define cpu_to_le32 __cpu_to_le32
+#define le32_to_cpu __le32_to_cpu
+#define cpu_to_le16 __cpu_to_le16
+#define le16_to_cpu __le16_to_cpu
+#define cpu_to_be64 __cpu_to_be64
+#define be64_to_cpu __be64_to_cpu
+#define cpu_to_be32 __cpu_to_be32
+#define be32_to_cpu __be32_to_cpu
+#define cpu_to_be16 __cpu_to_be16
+#define be16_to_cpu __be16_to_cpu
+
+/* change in situ versions */
+#define cpu_to_le64s __cpu_to_le64s
+#define le64_to_cpus __le64_to_cpus
+#define cpu_to_le32s __cpu_to_le32s
+#define le32_to_cpus __le32_to_cpus
+#define cpu_to_le16s __cpu_to_le16s
+#define le16_to_cpus __le16_to_cpus
+#define cpu_to_be64s __cpu_to_be64s
+#define be64_to_cpus __be64_to_cpus
+#define cpu_to_be32s __cpu_to_be32s
+#define be32_to_cpus __be32_to_cpus
+#define cpu_to_be16s __cpu_to_be16s
+#define be16_to_cpus __be16_to_cpus
+
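+/*
+ * Usage sketch: device code stores multi-byte fields in a fixed
+ * endianness regardless of the host, e.g.
+ *
+ *	hdr->len = cpu_to_le32(len);		// host -> little endian
+ *	u32 n    = le32_to_cpu(hdr->len);	// little endian -> host
+ */
+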
+#endif
diff --git a/kvmtool/include/linux/compiler.h b/kvmtool/include/linux/compiler.h
new file mode 100644
index 0000000..a662ba0
--- /dev/null
+++ b/kvmtool/include/linux/compiler.h
@@ -0,0 +1,20 @@
+#ifndef _PERF_LINUX_COMPILER_H_
+#define _PERF_LINUX_COMPILER_H_
+
+#ifndef __always_inline
+#define __always_inline	inline
+#endif
+#define __user
+
+#ifndef __attribute_const__
+#define __attribute_const__
+#endif
+
+#define __used		__attribute__((__unused__))
+#define __packed	__attribute__((packed))
+#define __iomem
+#define __force
+#define __must_check	__attribute__((warn_unused_result))
+#define unlikely
+
+#endif
diff --git a/kvmtool/include/linux/err.h b/kvmtool/include/linux/err.h
new file mode 100644
index 0000000..a729120
--- /dev/null
+++ b/kvmtool/include/linux/err.h
@@ -0,0 +1,69 @@
+#ifndef _LINUX_ERR_H
+#define _LINUX_ERR_H
+
+#include <linux/compiler.h>
+#include <linux/types.h>
+
+#include <asm/errno.h>
+
+/*
+ * Kernel pointers have redundant information, so we can use a
+ * scheme where we can return either an error code or a normal
+ * pointer with the same return value.
+ *
+ * This should be a per-architecture thing, to allow different
+ * error and pointer decisions.
+ */
+#define MAX_ERRNO	4095
+
+#ifndef __ASSEMBLY__
+
+#define IS_ERR_VALUE(x) unlikely((x) >= (unsigned long)-MAX_ERRNO)
+
+static inline void * __must_check ERR_PTR(long error)
+{
+	return (void *) error;
+}
+
+static inline long __must_check PTR_ERR(__force const void *ptr)
+{
+	return (long) ptr;
+}
+
+static inline bool __must_check IS_ERR(__force const void *ptr)
+{
+	return IS_ERR_VALUE((unsigned long)ptr);
+}
+
+static inline bool __must_check IS_ERR_OR_NULL(__force const void *ptr)
+{
+	return !ptr || IS_ERR_VALUE((unsigned long)ptr);
+}
+
+/**
+ * ERR_CAST - Explicitly cast an error-valued pointer to another pointer type
+ * @ptr: The pointer to cast.
+ *
+ * Explicitly cast an error-valued pointer to another pointer type in such a
+ * way as to make it clear that's what's going on.
+ */
+static inline void * __must_check ERR_CAST(__force const void *ptr)
+{
+	/* cast away the const */
+	return (void *) ptr;
+}
+
+static inline int __must_check PTR_ERR_OR_ZERO(__force const void *ptr)
+{
+	if (IS_ERR(ptr))
+		return PTR_ERR(ptr);
+	else
+		return 0;
+}
+
+/* Deprecated */
+#define PTR_RET(p) PTR_ERR_OR_ZERO(p)
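+
+/*
+ * Usage sketch: a single pointer return carries either a valid object
+ * or a negative errno ('foo_create' is a hypothetical example):
+ *
+ *	struct foo *f = foo_create();
+ *	if (IS_ERR(f))
+ *		return PTR_ERR(f);	// e.g. -ENOMEM
+ *
+ * This works because the top MAX_ERRNO addresses are never valid
+ * pointers.
+ */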
+
+#endif
+
+#endif /* _LINUX_ERR_H */
diff --git a/kvmtool/include/linux/kernel.h b/kvmtool/include/linux/kernel.h
new file mode 100644
index 0000000..f2bff5f
--- /dev/null
+++ b/kvmtool/include/linux/kernel.h
@@ -0,0 +1,51 @@
+
+#ifndef KVM__LINUX_KERNEL_H_
+#define KVM__LINUX_KERNEL_H_
+
+#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
+
+#define ALIGN(x,a)		__ALIGN_MASK(x,(typeof(x))(a)-1)
+#define __ALIGN_MASK(x,mask)	(((x)+(mask))&~(mask))
+
+#ifndef offsetof
+#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER)
+#endif
+
+#ifndef container_of
+/**
+ * container_of - cast a member of a structure out to the containing structure
+ * @ptr:	the pointer to the member.
+ * @type:	the type of the container struct this is embedded in.
+ * @member:	the name of the member within the struct.
+ *
+ */
+#define container_of(ptr, type, member) ({			\
+	const typeof(((type *)0)->member) * __mptr = (ptr);	\
+	(type *)((char *)__mptr - offsetof(type, member)); })
+#endif
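+
+/*
+ * Usage sketch: given a pointer to an embedded member, recover the
+ * enclosing object ('struct my_dev' is hypothetical):
+ *
+ *	struct my_dev {
+ *		int	id;
+ *		int	member;
+ *	};
+ *
+ *	int *p = &dev->member;
+ *	struct my_dev *d = container_of(p, struct my_dev, member);
+ */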
+
+#define min(x, y) ({				\
+	typeof(x) _min1 = (x);			\
+	typeof(y) _min2 = (y);			\
+	(void) (&_min1 == &_min2);		\
+	_min1 < _min2 ? _min1 : _min2; })
+
+#define max(x, y) ({				\
+	typeof(x) _max1 = (x);			\
+	typeof(y) _max2 = (y);			\
+	(void) (&_max1 == &_max2);		\
+	_max1 > _max2 ? _max1 : _max2; })
+
+#define min_t(type, x, y) ({                    \
+	type __min1 = (x);                      \
+	type __min2 = (y);                      \
+	__min1 < __min2 ? __min1: __min2; })
+
+#define max_t(type, x, y) ({                    \
+	type __max1 = (x);                      \
+	type __max2 = (y);                      \
+	__max1 > __max2 ? __max1: __max2; })
+
+#define true 1
+
+#endif
diff --git a/kvmtool/include/linux/kvm.h b/kvmtool/include/linux/kvm.h
new file mode 100644
index 0000000..5e3f12d
--- /dev/null
+++ b/kvmtool/include/linux/kvm.h
@@ -0,0 +1,1609 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef __LINUX_KVM_H
+#define __LINUX_KVM_H
+
+/*
+ * Userspace interface for /dev/kvm - kernel based virtual machine
+ *
+ * Note: you must update KVM_API_VERSION if you change this interface.
+ */
+
+#include <linux/types.h>
+#include <linux/compiler.h>
+#include <linux/ioctl.h>
+#include <asm/kvm.h>
+
+#define KVM_API_VERSION 12
+
+/* *** Deprecated interfaces *** */
+
+#define KVM_TRC_SHIFT           16
+
+#define KVM_TRC_ENTRYEXIT       (1 << KVM_TRC_SHIFT)
+#define KVM_TRC_HANDLER         (1 << (KVM_TRC_SHIFT + 1))
+
+#define KVM_TRC_VMENTRY         (KVM_TRC_ENTRYEXIT + 0x01)
+#define KVM_TRC_VMEXIT          (KVM_TRC_ENTRYEXIT + 0x02)
+#define KVM_TRC_PAGE_FAULT      (KVM_TRC_HANDLER + 0x01)
+
+#define KVM_TRC_HEAD_SIZE       12
+#define KVM_TRC_CYCLE_SIZE      8
+#define KVM_TRC_EXTRA_MAX       7
+
+#define KVM_TRC_INJ_VIRQ         (KVM_TRC_HANDLER + 0x02)
+#define KVM_TRC_REDELIVER_EVT    (KVM_TRC_HANDLER + 0x03)
+#define KVM_TRC_PEND_INTR        (KVM_TRC_HANDLER + 0x04)
+#define KVM_TRC_IO_READ          (KVM_TRC_HANDLER + 0x05)
+#define KVM_TRC_IO_WRITE         (KVM_TRC_HANDLER + 0x06)
+#define KVM_TRC_CR_READ          (KVM_TRC_HANDLER + 0x07)
+#define KVM_TRC_CR_WRITE         (KVM_TRC_HANDLER + 0x08)
+#define KVM_TRC_DR_READ          (KVM_TRC_HANDLER + 0x09)
+#define KVM_TRC_DR_WRITE         (KVM_TRC_HANDLER + 0x0A)
+#define KVM_TRC_MSR_READ         (KVM_TRC_HANDLER + 0x0B)
+#define KVM_TRC_MSR_WRITE        (KVM_TRC_HANDLER + 0x0C)
+#define KVM_TRC_CPUID            (KVM_TRC_HANDLER + 0x0D)
+#define KVM_TRC_INTR             (KVM_TRC_HANDLER + 0x0E)
+#define KVM_TRC_NMI              (KVM_TRC_HANDLER + 0x0F)
+#define KVM_TRC_VMMCALL          (KVM_TRC_HANDLER + 0x10)
+#define KVM_TRC_HLT              (KVM_TRC_HANDLER + 0x11)
+#define KVM_TRC_CLTS             (KVM_TRC_HANDLER + 0x12)
+#define KVM_TRC_LMSW             (KVM_TRC_HANDLER + 0x13)
+#define KVM_TRC_APIC_ACCESS      (KVM_TRC_HANDLER + 0x14)
+#define KVM_TRC_TDP_FAULT        (KVM_TRC_HANDLER + 0x15)
+#define KVM_TRC_GTLB_WRITE       (KVM_TRC_HANDLER + 0x16)
+#define KVM_TRC_STLB_WRITE       (KVM_TRC_HANDLER + 0x17)
+#define KVM_TRC_STLB_INVAL       (KVM_TRC_HANDLER + 0x18)
+#define KVM_TRC_PPC_INSTR        (KVM_TRC_HANDLER + 0x19)
+
+struct kvm_user_trace_setup {
+	__u32 buf_size;
+	__u32 buf_nr;
+};
+
+#define __KVM_DEPRECATED_MAIN_W_0x06 \
+	_IOW(KVMIO, 0x06, struct kvm_user_trace_setup)
+#define __KVM_DEPRECATED_MAIN_0x07 _IO(KVMIO, 0x07)
+#define __KVM_DEPRECATED_MAIN_0x08 _IO(KVMIO, 0x08)
+
+#define __KVM_DEPRECATED_VM_R_0x70 _IOR(KVMIO, 0x70, struct kvm_assigned_irq)
+
+struct kvm_breakpoint {
+	__u32 enabled;
+	__u32 padding;
+	__u64 address;
+};
+
+struct kvm_debug_guest {
+	__u32 enabled;
+	__u32 pad;
+	struct kvm_breakpoint breakpoints[4];
+	__u32 singlestep;
+};
+
+#define __KVM_DEPRECATED_VCPU_W_0x87 _IOW(KVMIO, 0x87, struct kvm_debug_guest)
+
+/* *** End of deprecated interfaces *** */
+
+
+/* for KVM_CREATE_MEMORY_REGION */
+struct kvm_memory_region {
+	__u32 slot;
+	__u32 flags;
+	__u64 guest_phys_addr;
+	__u64 memory_size; /* bytes */
+};
+
+/* for KVM_SET_USER_MEMORY_REGION */
+struct kvm_userspace_memory_region {
+	__u32 slot;
+	__u32 flags;
+	__u64 guest_phys_addr;
+	__u64 memory_size; /* bytes */
+	__u64 userspace_addr; /* start of the userspace allocated memory */
+};
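+
+/*
+ * Usage sketch: mapping host memory into the guest with the
+ * KVM_SET_USER_MEMORY_REGION ioctl (error handling elided):
+ *
+ *	struct kvm_userspace_memory_region mem = {
+ *		.slot            = 0,
+ *		.guest_phys_addr = 0,
+ *		.memory_size     = size,
+ *		.userspace_addr  = (unsigned long)host_mem,
+ *	};
+ *	ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &mem);
+ */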
+
+/*
+ * The bit 0 ~ bit 15 of kvm_memory_region::flags are visible for userspace,
+ * other bits are reserved for kvm internal use which are defined in
+ * include/linux/kvm_host.h.
+ */
+#define KVM_MEM_LOG_DIRTY_PAGES	(1UL << 0)
+#define KVM_MEM_READONLY	(1UL << 1)
+
+/* for KVM_IRQ_LINE */
+struct kvm_irq_level {
+	/*
+	 * ACPI gsi notion of irq.
+	 * For IA-64 (APIC model) IOAPIC0: irq 0-23; IOAPIC1: irq 24-47..
+	 * For X86 (standard AT mode) PIC0/1: irq 0-15. IOAPIC0: 0-23..
+	 * For ARM: See Documentation/virt/kvm/api.txt
+	 */
+	union {
+		__u32 irq;
+		__s32 status;
+	};
+	__u32 level;
+};
+
+
+struct kvm_irqchip {
+	__u32 chip_id;
+	__u32 pad;
+        union {
+		char dummy[512];  /* reserving space */
+#ifdef __KVM_HAVE_PIT
+		struct kvm_pic_state pic;
+#endif
+#ifdef __KVM_HAVE_IOAPIC
+		struct kvm_ioapic_state ioapic;
+#endif
+	} chip;
+};
+
+/* for KVM_CREATE_PIT2 */
+struct kvm_pit_config {
+	__u32 flags;
+	__u32 pad[15];
+};
+
+#define KVM_PIT_SPEAKER_DUMMY     1
+
+struct kvm_s390_skeys {
+	__u64 start_gfn;
+	__u64 count;
+	__u64 skeydata_addr;
+	__u32 flags;
+	__u32 reserved[9];
+};
+
+#define KVM_S390_CMMA_PEEK (1 << 0)
+
+/**
+ * kvm_s390_cmma_log - Used for CMMA migration.
+ *
+ * Used both for input and output.
+ *
+ * @start_gfn: Guest page number to start from.
+ * @count: Size of the result buffer.
+ * @flags: Control operation mode via KVM_S390_CMMA_* flags
+ * @remaining: Used with KVM_S390_GET_CMMA_BITS. Indicates how many dirty
+ *             pages are still remaining.
+ * @mask: Used with KVM_S390_SET_CMMA_BITS. Bitmap of bits to actually set
+ *        in the PGSTE.
+ * @values: Pointer to the values buffer.
+ *
+ * Used in KVM_S390_{G,S}ET_CMMA_BITS ioctls.
+ */
+struct kvm_s390_cmma_log {
+	__u64 start_gfn;
+	__u32 count;
+	__u32 flags;
+	union {
+		__u64 remaining;
+		__u64 mask;
+	};
+	__u64 values;
+};
+
+struct kvm_hyperv_exit {
+#define KVM_EXIT_HYPERV_SYNIC          1
+#define KVM_EXIT_HYPERV_HCALL          2
+	__u32 type;
+	union {
+		struct {
+			__u32 msr;
+			__u64 control;
+			__u64 evt_page;
+			__u64 msg_page;
+		} synic;
+		struct {
+			__u64 input;
+			__u64 result;
+			__u64 params[2];
+		} hcall;
+	} u;
+};
+
+#define KVM_S390_GET_SKEYS_NONE   1
+#define KVM_S390_SKEYS_MAX        1048576
+
+#define KVM_EXIT_UNKNOWN          0
+#define KVM_EXIT_EXCEPTION        1
+#define KVM_EXIT_IO               2
+#define KVM_EXIT_HYPERCALL        3
+#define KVM_EXIT_DEBUG            4
+#define KVM_EXIT_HLT              5
+#define KVM_EXIT_MMIO             6
+#define KVM_EXIT_IRQ_WINDOW_OPEN  7
+#define KVM_EXIT_SHUTDOWN         8
+#define KVM_EXIT_FAIL_ENTRY       9
+#define KVM_EXIT_INTR             10
+#define KVM_EXIT_SET_TPR          11
+#define KVM_EXIT_TPR_ACCESS       12
+#define KVM_EXIT_S390_SIEIC       13
+#define KVM_EXIT_S390_RESET       14
+#define KVM_EXIT_DCR              15 /* deprecated */
+#define KVM_EXIT_NMI              16
+#define KVM_EXIT_INTERNAL_ERROR   17
+#define KVM_EXIT_OSI              18
+#define KVM_EXIT_PAPR_HCALL	  19
+#define KVM_EXIT_S390_UCONTROL	  20
+#define KVM_EXIT_WATCHDOG         21
+#define KVM_EXIT_S390_TSCH        22
+#define KVM_EXIT_EPR              23
+#define KVM_EXIT_SYSTEM_EVENT     24
+#define KVM_EXIT_S390_STSI        25
+#define KVM_EXIT_IOAPIC_EOI       26
+#define KVM_EXIT_HYPERV           27
+
+/* For KVM_EXIT_INTERNAL_ERROR */
+/* Emulate instruction failed. */
+#define KVM_INTERNAL_ERROR_EMULATION	1
+/* Encounter unexpected simultaneous exceptions. */
+#define KVM_INTERNAL_ERROR_SIMUL_EX	2
+/* Encounter unexpected vm-exit due to delivery event. */
+#define KVM_INTERNAL_ERROR_DELIVERY_EV	3
+
+/* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */
+struct kvm_run {
+	/* in */
+	__u8 request_interrupt_window;
+	__u8 immediate_exit;
+	__u8 padding1[6];
+
+	/* out */
+	__u32 exit_reason;
+	__u8 ready_for_interrupt_injection;
+	__u8 if_flag;
+	__u16 flags;
+
+	/* in (pre_kvm_run), out (post_kvm_run) */
+	__u64 cr8;
+	__u64 apic_base;
+
+#ifdef __KVM_S390
+	/* the processor status word for s390 */
+	__u64 psw_mask; /* psw upper half */
+	__u64 psw_addr; /* psw lower half */
+#endif
+	union {
+		/* KVM_EXIT_UNKNOWN */
+		struct {
+			__u64 hardware_exit_reason;
+		} hw;
+		/* KVM_EXIT_FAIL_ENTRY */
+		struct {
+			__u64 hardware_entry_failure_reason;
+		} fail_entry;
+		/* KVM_EXIT_EXCEPTION */
+		struct {
+			__u32 exception;
+			__u32 error_code;
+		} ex;
+		/* KVM_EXIT_IO */
+		struct {
+#define KVM_EXIT_IO_IN  0
+#define KVM_EXIT_IO_OUT 1
+			__u8 direction;
+			__u8 size; /* bytes */
+			__u16 port;
+			__u32 count;
+			__u64 data_offset; /* relative to kvm_run start */
+		} io;
+		/* KVM_EXIT_DEBUG */
+		struct {
+			struct kvm_debug_exit_arch arch;
+		} debug;
+		/* KVM_EXIT_MMIO */
+		struct {
+			__u64 phys_addr;
+			__u8  data[8];
+			__u32 len;
+			__u8  is_write;
+		} mmio;
+		/* KVM_EXIT_HYPERCALL */
+		struct {
+			__u64 nr;
+			__u64 args[6];
+			__u64 ret;
+			__u32 longmode;
+			__u32 pad;
+		} hypercall;
+		/* KVM_EXIT_TPR_ACCESS */
+		struct {
+			__u64 rip;
+			__u32 is_write;
+			__u32 pad;
+		} tpr_access;
+		/* KVM_EXIT_S390_SIEIC */
+		struct {
+			__u8 icptcode;
+			__u16 ipa;
+			__u32 ipb;
+		} s390_sieic;
+		/* KVM_EXIT_S390_RESET */
+#define KVM_S390_RESET_POR       1
+#define KVM_S390_RESET_CLEAR     2
+#define KVM_S390_RESET_SUBSYSTEM 4
+#define KVM_S390_RESET_CPU_INIT  8
+#define KVM_S390_RESET_IPL       16
+		__u64 s390_reset_flags;
+		/* KVM_EXIT_S390_UCONTROL */
+		struct {
+			__u64 trans_exc_code;
+			__u32 pgm_code;
+		} s390_ucontrol;
+		/* KVM_EXIT_DCR (deprecated) */
+		struct {
+			__u32 dcrn;
+			__u32 data;
+			__u8  is_write;
+		} dcr;
+		/* KVM_EXIT_INTERNAL_ERROR */
+		struct {
+			__u32 suberror;
+			/* Available with KVM_CAP_INTERNAL_ERROR_DATA: */
+			__u32 ndata;
+			__u64 data[16];
+		} internal;
+		/* KVM_EXIT_OSI */
+		struct {
+			__u64 gprs[32];
+		} osi;
+		/* KVM_EXIT_PAPR_HCALL */
+		struct {
+			__u64 nr;
+			__u64 ret;
+			__u64 args[9];
+		} papr_hcall;
+		/* KVM_EXIT_S390_TSCH */
+		struct {
+			__u16 subchannel_id;
+			__u16 subchannel_nr;
+			__u32 io_int_parm;
+			__u32 io_int_word;
+			__u32 ipb;
+			__u8 dequeued;
+		} s390_tsch;
+		/* KVM_EXIT_EPR */
+		struct {
+			__u32 epr;
+		} epr;
+		/* KVM_EXIT_SYSTEM_EVENT */
+		struct {
+#define KVM_SYSTEM_EVENT_SHUTDOWN       1
+#define KVM_SYSTEM_EVENT_RESET          2
+#define KVM_SYSTEM_EVENT_CRASH          3
+			__u32 type;
+			__u64 flags;
+		} system_event;
+		/* KVM_EXIT_S390_STSI */
+		struct {
+			__u64 addr;
+			__u8 ar;
+			__u8 reserved;
+			__u8 fc;
+			__u8 sel1;
+			__u16 sel2;
+		} s390_stsi;
+		/* KVM_EXIT_IOAPIC_EOI */
+		struct {
+			__u8 vector;
+		} eoi;
+		/* KVM_EXIT_HYPERV */
+		struct kvm_hyperv_exit hyperv;
+		/* Fix the size of the union. */
+		char padding[256];
+	};
+
+	/* 2048 is the size of the char array used to bound/pad the size
+	 * of the union that holds sync regs.
+	 */
+	#define SYNC_REGS_SIZE_BYTES 2048
+	/*
+	 * Shared registers between kvm and userspace.
+	 * kvm_valid_regs specifies the register classes set by the host;
+	 * kvm_dirty_regs specifies the register classes dirtied by userspace.
+	 * struct kvm_sync_regs is architecture specific, as are the
+	 * bits for kvm_valid_regs and kvm_dirty_regs.
+	 */
+	__u64 kvm_valid_regs;
+	__u64 kvm_dirty_regs;
+	union {
+		struct kvm_sync_regs regs;
+		char padding[SYNC_REGS_SIZE_BYTES];
+	} s;
+};
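+
+/*
+ * Sketch of the exit-handling pattern: after KVM_RUN returns, userspace
+ * switches on exit_reason; for KVM_EXIT_IO the data lives inside the
+ * same mmap'ed page, at run + io.data_offset:
+ *
+ *	switch (run->exit_reason) {
+ *	case KVM_EXIT_IO: {
+ *		void *data = (__u8 *)run + run->io.data_offset;
+ *		// emulate run->io.count accesses of run->io.size bytes
+ *		// at port run->io.port, direction run->io.direction
+ *		break;
+ *	}
+ *	}
+ */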
+
+/* for KVM_REGISTER_COALESCED_MMIO / KVM_UNREGISTER_COALESCED_MMIO */
+
+struct kvm_coalesced_mmio_zone {
+	__u64 addr;
+	__u32 size;
+	union {
+		__u32 pad;
+		__u32 pio;
+	};
+};
+
+struct kvm_coalesced_mmio {
+	__u64 phys_addr;
+	__u32 len;
+	union {
+		__u32 pad;
+		__u32 pio;
+	};
+	__u8  data[8];
+};
+
+struct kvm_coalesced_mmio_ring {
+	__u32 first, last;
+	struct kvm_coalesced_mmio coalesced_mmio[0];
+};
+
+#define KVM_COALESCED_MMIO_MAX \
+	((PAGE_SIZE - sizeof(struct kvm_coalesced_mmio_ring)) / \
+	 sizeof(struct kvm_coalesced_mmio))
+
+/* for KVM_TRANSLATE */
+struct kvm_translation {
+	/* in */
+	__u64 linear_address;
+
+	/* out */
+	__u64 physical_address;
+	__u8  valid;
+	__u8  writeable;
+	__u8  usermode;
+	__u8  pad[5];
+};
+
+/* for KVM_S390_MEM_OP */
+struct kvm_s390_mem_op {
+	/* in */
+	__u64 gaddr;		/* the guest address */
+	__u64 flags;		/* flags */
+	__u32 size;		/* amount of bytes */
+	__u32 op;		/* type of operation */
+	__u64 buf;		/* buffer in userspace */
+	__u8 ar;		/* the access register number */
+	__u8 reserved[31];	/* should be set to 0 */
+};
+/* types for kvm_s390_mem_op->op */
+#define KVM_S390_MEMOP_LOGICAL_READ	0
+#define KVM_S390_MEMOP_LOGICAL_WRITE	1
+/* flags for kvm_s390_mem_op->flags */
+#define KVM_S390_MEMOP_F_CHECK_ONLY		(1ULL << 0)
+#define KVM_S390_MEMOP_F_INJECT_EXCEPTION	(1ULL << 1)
+
+/* for KVM_INTERRUPT */
+struct kvm_interrupt {
+	/* in */
+	__u32 irq;
+};
+
+/* for KVM_GET_DIRTY_LOG */
+struct kvm_dirty_log {
+	__u32 slot;
+	__u32 padding1;
+	union {
+		void __user *dirty_bitmap; /* one bit per page */
+		__u64 padding2;
+	};
+};
+
+/* for KVM_CLEAR_DIRTY_LOG */
+struct kvm_clear_dirty_log {
+	__u32 slot;
+	__u32 num_pages;
+	__u64 first_page;
+	union {
+		void __user *dirty_bitmap; /* one bit per page */
+		__u64 padding2;
+	};
+};
+
+/* for KVM_SET_SIGNAL_MASK */
+struct kvm_signal_mask {
+	__u32 len;
+	__u8  sigset[0];
+};
+
+/* for KVM_TPR_ACCESS_REPORTING */
+struct kvm_tpr_access_ctl {
+	__u32 enabled;
+	__u32 flags;
+	__u32 reserved[8];
+};
+
+/* for KVM_SET_VAPIC_ADDR */
+struct kvm_vapic_addr {
+	__u64 vapic_addr;
+};
+
+/* for KVM_SET_MP_STATE */
+
+/* not all states are valid on all architectures */
+#define KVM_MP_STATE_RUNNABLE          0
+#define KVM_MP_STATE_UNINITIALIZED     1
+#define KVM_MP_STATE_INIT_RECEIVED     2
+#define KVM_MP_STATE_HALTED            3
+#define KVM_MP_STATE_SIPI_RECEIVED     4
+#define KVM_MP_STATE_STOPPED           5
+#define KVM_MP_STATE_CHECK_STOP        6
+#define KVM_MP_STATE_OPERATING         7
+#define KVM_MP_STATE_LOAD              8
+
+struct kvm_mp_state {
+	__u32 mp_state;
+};
+
+struct kvm_s390_psw {
+	__u64 mask;
+	__u64 addr;
+};
+
+/* valid values for type in kvm_s390_interrupt */
+#define KVM_S390_SIGP_STOP		0xfffe0000u
+#define KVM_S390_PROGRAM_INT		0xfffe0001u
+#define KVM_S390_SIGP_SET_PREFIX	0xfffe0002u
+#define KVM_S390_RESTART		0xfffe0003u
+#define KVM_S390_INT_PFAULT_INIT	0xfffe0004u
+#define KVM_S390_INT_PFAULT_DONE	0xfffe0005u
+#define KVM_S390_MCHK			0xfffe1000u
+#define KVM_S390_INT_CLOCK_COMP		0xffff1004u
+#define KVM_S390_INT_CPU_TIMER		0xffff1005u
+#define KVM_S390_INT_VIRTIO		0xffff2603u
+#define KVM_S390_INT_SERVICE		0xffff2401u
+#define KVM_S390_INT_EMERGENCY		0xffff1201u
+#define KVM_S390_INT_EXTERNAL_CALL	0xffff1202u
+/* Anything below 0xfffe0000u is taken by INT_IO */
+#define KVM_S390_INT_IO(ai,cssid,ssid,schid)   \
+	(((schid)) |			       \
+	 ((ssid) << 16) |		       \
+	 ((cssid) << 18) |		       \
+	 ((ai) << 26))
+#define KVM_S390_INT_IO_MIN		0x00000000u
+#define KVM_S390_INT_IO_MAX		0xfffdffffu
+#define KVM_S390_INT_IO_AI_MASK		0x04000000u
+
+
+struct kvm_s390_interrupt {
+	__u32 type;
+	__u32 parm;
+	__u64 parm64;
+};
+
+struct kvm_s390_io_info {
+	__u16 subchannel_id;
+	__u16 subchannel_nr;
+	__u32 io_int_parm;
+	__u32 io_int_word;
+};
+
+struct kvm_s390_ext_info {
+	__u32 ext_params;
+	__u32 pad;
+	__u64 ext_params2;
+};
+
+struct kvm_s390_pgm_info {
+	__u64 trans_exc_code;
+	__u64 mon_code;
+	__u64 per_address;
+	__u32 data_exc_code;
+	__u16 code;
+	__u16 mon_class_nr;
+	__u8 per_code;
+	__u8 per_atmid;
+	__u8 exc_access_id;
+	__u8 per_access_id;
+	__u8 op_access_id;
+#define KVM_S390_PGM_FLAGS_ILC_VALID	0x01
+#define KVM_S390_PGM_FLAGS_ILC_0	0x02
+#define KVM_S390_PGM_FLAGS_ILC_1	0x04
+#define KVM_S390_PGM_FLAGS_ILC_MASK	0x06
+#define KVM_S390_PGM_FLAGS_NO_REWIND	0x08
+	__u8 flags;
+	__u8 pad[2];
+};
+
+struct kvm_s390_prefix_info {
+	__u32 address;
+};
+
+struct kvm_s390_extcall_info {
+	__u16 code;
+};
+
+struct kvm_s390_emerg_info {
+	__u16 code;
+};
+
+#define KVM_S390_STOP_FLAG_STORE_STATUS	0x01
+struct kvm_s390_stop_info {
+	__u32 flags;
+};
+
+struct kvm_s390_mchk_info {
+	__u64 cr14;
+	__u64 mcic;
+	__u64 failing_storage_address;
+	__u32 ext_damage_code;
+	__u32 pad;
+	__u8 fixed_logout[16];
+};
+
+struct kvm_s390_irq {
+	__u64 type;
+	union {
+		struct kvm_s390_io_info io;
+		struct kvm_s390_ext_info ext;
+		struct kvm_s390_pgm_info pgm;
+		struct kvm_s390_emerg_info emerg;
+		struct kvm_s390_extcall_info extcall;
+		struct kvm_s390_prefix_info prefix;
+		struct kvm_s390_stop_info stop;
+		struct kvm_s390_mchk_info mchk;
+		char reserved[64];
+	} u;
+};
+
+struct kvm_s390_irq_state {
+	__u64 buf;
+	__u32 flags;        /* will stay unused for compatibility reasons */
+	__u32 len;
+	__u32 reserved[4];  /* will stay unused for compatibility reasons */
+};
+
+/* for KVM_SET_GUEST_DEBUG */
+
+#define KVM_GUESTDBG_ENABLE		0x00000001
+#define KVM_GUESTDBG_SINGLESTEP		0x00000002
+
+struct kvm_guest_debug {
+	__u32 control;
+	__u32 pad;
+	struct kvm_guest_debug_arch arch;
+};
+
+enum {
+	kvm_ioeventfd_flag_nr_datamatch,
+	kvm_ioeventfd_flag_nr_pio,
+	kvm_ioeventfd_flag_nr_deassign,
+	kvm_ioeventfd_flag_nr_virtio_ccw_notify,
+	kvm_ioeventfd_flag_nr_fast_mmio,
+	kvm_ioeventfd_flag_nr_max,
+};
+
+#define KVM_IOEVENTFD_FLAG_DATAMATCH (1 << kvm_ioeventfd_flag_nr_datamatch)
+#define KVM_IOEVENTFD_FLAG_PIO       (1 << kvm_ioeventfd_flag_nr_pio)
+#define KVM_IOEVENTFD_FLAG_DEASSIGN  (1 << kvm_ioeventfd_flag_nr_deassign)
+#define KVM_IOEVENTFD_FLAG_VIRTIO_CCW_NOTIFY \
+	(1 << kvm_ioeventfd_flag_nr_virtio_ccw_notify)
+
+#define KVM_IOEVENTFD_VALID_FLAG_MASK  ((1 << kvm_ioeventfd_flag_nr_max) - 1)
+
+struct kvm_ioeventfd {
+	__u64 datamatch;
+	__u64 addr;        /* legal pio/mmio address */
+	__u32 len;         /* 1, 2, 4, or 8 bytes; or 0 to ignore length */
+	__s32 fd;
+	__u32 flags;
+	__u8  pad[36];
+};
+
+#define KVM_X86_DISABLE_EXITS_MWAIT          (1 << 0)
+#define KVM_X86_DISABLE_EXITS_HLT            (1 << 1)
+#define KVM_X86_DISABLE_EXITS_PAUSE          (1 << 2)
+#define KVM_X86_DISABLE_EXITS_CSTATE         (1 << 3)
+#define KVM_X86_DISABLE_VALID_EXITS          (KVM_X86_DISABLE_EXITS_MWAIT | \
+                                              KVM_X86_DISABLE_EXITS_HLT | \
+                                              KVM_X86_DISABLE_EXITS_PAUSE | \
+                                              KVM_X86_DISABLE_EXITS_CSTATE)
+
+/* for KVM_ENABLE_CAP */
+struct kvm_enable_cap {
+	/* in */
+	__u32 cap;
+	__u32 flags;
+	__u64 args[4];
+	__u8  pad[64];
+};
+
+/* for KVM_PPC_GET_PVINFO */
+
+#define KVM_PPC_PVINFO_FLAGS_EV_IDLE   (1<<0)
+
+struct kvm_ppc_pvinfo {
+	/* out */
+	__u32 flags;
+	__u32 hcall[4];
+	__u8  pad[108];
+};
+
+/* for KVM_PPC_GET_SMMU_INFO */
+#define KVM_PPC_PAGE_SIZES_MAX_SZ	8
+
+struct kvm_ppc_one_page_size {
+	__u32 page_shift;	/* Page shift (or 0) */
+	__u32 pte_enc;		/* Encoding in the HPTE (>>12) */
+};
+
+struct kvm_ppc_one_seg_page_size {
+	__u32 page_shift;	/* Base page shift of segment (or 0) */
+	__u32 slb_enc;		/* SLB encoding for BookS */
+	struct kvm_ppc_one_page_size enc[KVM_PPC_PAGE_SIZES_MAX_SZ];
+};
+
+#define KVM_PPC_PAGE_SIZES_REAL		0x00000001
+#define KVM_PPC_1T_SEGMENTS		0x00000002
+#define KVM_PPC_NO_HASH			0x00000004
+
+struct kvm_ppc_smmu_info {
+	__u64 flags;
+	__u32 slb_size;
+	__u16 data_keys;	/* # storage keys supported for data */
+	__u16 instr_keys;	/* # storage keys supported for instructions */
+	struct kvm_ppc_one_seg_page_size sps[KVM_PPC_PAGE_SIZES_MAX_SZ];
+};
+
+/* for KVM_PPC_RESIZE_HPT_{PREPARE,COMMIT} */
+struct kvm_ppc_resize_hpt {
+	__u64 flags;
+	__u32 shift;
+	__u32 pad;
+};
+
+#define KVMIO 0xAE
+
+/* machine type bits, to be used as argument to KVM_CREATE_VM */
+#define KVM_VM_S390_UCONTROL	1
+
+/* on ppc, 0 indicates the default, 1 forces HV and 2 forces PR */
+#define KVM_VM_PPC_HV 1
+#define KVM_VM_PPC_PR 2
+
+/* on MIPS, 0 forces trap & emulate, 1 forces VZ ASE */
+#define KVM_VM_MIPS_TE		0
+#define KVM_VM_MIPS_VZ		1
+
+#define KVM_S390_SIE_PAGE_OFFSET 1
+
+/*
+ * On arm64, machine type can be used to request the physical
+ * address size for the VM. Bits[7-0] are reserved for the guest
+ * PA size shift (i.e, log2(PA_Size)). For backward compatibility,
+ * value 0 implies the default IPA size, 40bits.
+ */
+#define KVM_VM_TYPE_ARM_IPA_SIZE_MASK	0xffULL
+#define KVM_VM_TYPE_ARM_IPA_SIZE(x)		\
+	((x) & KVM_VM_TYPE_ARM_IPA_SIZE_MASK)
+/*
+ * ioctls for /dev/kvm fds:
+ */
+#define KVM_GET_API_VERSION       _IO(KVMIO,   0x00)
+#define KVM_CREATE_VM             _IO(KVMIO,   0x01) /* returns a VM fd */
+#define KVM_GET_MSR_INDEX_LIST    _IOWR(KVMIO, 0x02, struct kvm_msr_list)
+
+#define KVM_S390_ENABLE_SIE       _IO(KVMIO,   0x06)
+/*
+ * Check if a kvm extension is available.  Argument is extension number,
+ * return is 1 (yes) or 0 (no, sorry).
+ */
+#define KVM_CHECK_EXTENSION       _IO(KVMIO,   0x03)
+/*
+ * Get size for mmap(vcpu_fd)
+ */
+#define KVM_GET_VCPU_MMAP_SIZE    _IO(KVMIO,   0x04) /* in bytes */
+#define KVM_GET_SUPPORTED_CPUID   _IOWR(KVMIO, 0x05, struct kvm_cpuid2)
+#define KVM_TRACE_ENABLE          __KVM_DEPRECATED_MAIN_W_0x06
+#define KVM_TRACE_PAUSE           __KVM_DEPRECATED_MAIN_0x07
+#define KVM_TRACE_DISABLE         __KVM_DEPRECATED_MAIN_0x08
+#define KVM_GET_EMULATED_CPUID	  _IOWR(KVMIO, 0x09, struct kvm_cpuid2)
+#define KVM_GET_MSR_FEATURE_INDEX_LIST    _IOWR(KVMIO, 0x0a, struct kvm_msr_list)
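+
+/*
+ * The canonical handshake with /dev/kvm, as a sketch (error handling
+ * elided):
+ *
+ *	int sys_fd = open("/dev/kvm", O_RDWR);
+ *	if (ioctl(sys_fd, KVM_GET_API_VERSION, 0) != KVM_API_VERSION)
+ *		exit(1);	// incompatible kernel
+ *	int vm_fd = ioctl(sys_fd, KVM_CREATE_VM, 0);	// machine type 0
+ */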
+
+/*
+ * Extension capability list.
+ */
+#define KVM_CAP_IRQCHIP	  0
+#define KVM_CAP_HLT	  1
+#define KVM_CAP_MMU_SHADOW_CACHE_CONTROL 2
+#define KVM_CAP_USER_MEMORY 3
+#define KVM_CAP_SET_TSS_ADDR 4
+#define KVM_CAP_VAPIC 6
+#define KVM_CAP_EXT_CPUID 7
+#define KVM_CAP_CLOCKSOURCE 8
+#define KVM_CAP_NR_VCPUS 9       /* returns recommended max vcpus per vm */
+#define KVM_CAP_NR_MEMSLOTS 10   /* returns max memory slots per vm */
+#define KVM_CAP_PIT 11
+#define KVM_CAP_NOP_IO_DELAY 12
+#define KVM_CAP_PV_MMU 13
+#define KVM_CAP_MP_STATE 14
+#define KVM_CAP_COALESCED_MMIO 15
+#define KVM_CAP_SYNC_MMU 16  /* Changes to host mmap are reflected in guest */
+#define KVM_CAP_IOMMU 18
+/* Bug in KVM_SET_USER_MEMORY_REGION fixed: */
+#define KVM_CAP_DESTROY_MEMORY_REGION_WORKS 21
+#define KVM_CAP_USER_NMI 22
+#ifdef __KVM_HAVE_GUEST_DEBUG
+#define KVM_CAP_SET_GUEST_DEBUG 23
+#endif
+#ifdef __KVM_HAVE_PIT
+#define KVM_CAP_REINJECT_CONTROL 24
+#endif
+#define KVM_CAP_IRQ_ROUTING 25
+#define KVM_CAP_IRQ_INJECT_STATUS 26
+#define KVM_CAP_ASSIGN_DEV_IRQ 29
+/* Another bug in KVM_SET_USER_MEMORY_REGION fixed: */
+#define KVM_CAP_JOIN_MEMORY_REGIONS_WORKS 30
+#ifdef __KVM_HAVE_MCE
+#define KVM_CAP_MCE 31
+#endif
+#define KVM_CAP_IRQFD 32
+#ifdef __KVM_HAVE_PIT
+#define KVM_CAP_PIT2 33
+#endif
+#define KVM_CAP_SET_BOOT_CPU_ID 34
+#ifdef __KVM_HAVE_PIT_STATE2
+#define KVM_CAP_PIT_STATE2 35
+#endif
+#define KVM_CAP_IOEVENTFD 36
+#define KVM_CAP_SET_IDENTITY_MAP_ADDR 37
+#ifdef __KVM_HAVE_XEN_HVM
+#define KVM_CAP_XEN_HVM 38
+#endif
+#define KVM_CAP_ADJUST_CLOCK 39
+#define KVM_CAP_INTERNAL_ERROR_DATA 40
+#ifdef __KVM_HAVE_VCPU_EVENTS
+#define KVM_CAP_VCPU_EVENTS 41
+#endif
+#define KVM_CAP_S390_PSW 42
+#define KVM_CAP_PPC_SEGSTATE 43
+#define KVM_CAP_HYPERV 44
+#define KVM_CAP_HYPERV_VAPIC 45
+#define KVM_CAP_HYPERV_SPIN 46
+#define KVM_CAP_PCI_SEGMENT 47
+#define KVM_CAP_PPC_PAIRED_SINGLES 48
+#define KVM_CAP_INTR_SHADOW 49
+#ifdef __KVM_HAVE_DEBUGREGS
+#define KVM_CAP_DEBUGREGS 50
+#endif
+#define KVM_CAP_X86_ROBUST_SINGLESTEP 51
+#define KVM_CAP_PPC_OSI 52
+#define KVM_CAP_PPC_UNSET_IRQ 53
+#define KVM_CAP_ENABLE_CAP 54
+#ifdef __KVM_HAVE_XSAVE
+#define KVM_CAP_XSAVE 55
+#endif
+#ifdef __KVM_HAVE_XCRS
+#define KVM_CAP_XCRS 56
+#endif
+#define KVM_CAP_PPC_GET_PVINFO 57
+#define KVM_CAP_PPC_IRQ_LEVEL 58
+#define KVM_CAP_ASYNC_PF 59
+#define KVM_CAP_TSC_CONTROL 60
+#define KVM_CAP_GET_TSC_KHZ 61
+#define KVM_CAP_PPC_BOOKE_SREGS 62
+#define KVM_CAP_SPAPR_TCE 63
+#define KVM_CAP_PPC_SMT 64
+#define KVM_CAP_PPC_RMA	65
+#define KVM_CAP_MAX_VCPUS 66       /* returns max vcpus per vm */
+#define KVM_CAP_PPC_HIOR 67
+#define KVM_CAP_PPC_PAPR 68
+#define KVM_CAP_SW_TLB 69
+#define KVM_CAP_ONE_REG 70
+#define KVM_CAP_S390_GMAP 71
+#define KVM_CAP_TSC_DEADLINE_TIMER 72
+#define KVM_CAP_S390_UCONTROL 73
+#define KVM_CAP_SYNC_REGS 74
+#define KVM_CAP_PCI_2_3 75
+#define KVM_CAP_KVMCLOCK_CTRL 76
+#define KVM_CAP_SIGNAL_MSI 77
+#define KVM_CAP_PPC_GET_SMMU_INFO 78
+#define KVM_CAP_S390_COW 79
+#define KVM_CAP_PPC_ALLOC_HTAB 80
+#define KVM_CAP_READONLY_MEM 81
+#define KVM_CAP_IRQFD_RESAMPLE 82
+#define KVM_CAP_PPC_BOOKE_WATCHDOG 83
+#define KVM_CAP_PPC_HTAB_FD 84
+#define KVM_CAP_S390_CSS_SUPPORT 85
+#define KVM_CAP_PPC_EPR 86
+#define KVM_CAP_ARM_PSCI 87
+#define KVM_CAP_ARM_SET_DEVICE_ADDR 88
+#define KVM_CAP_DEVICE_CTRL 89
+#define KVM_CAP_IRQ_MPIC 90
+#define KVM_CAP_PPC_RTAS 91
+#define KVM_CAP_IRQ_XICS 92
+#define KVM_CAP_ARM_EL1_32BIT 93
+#define KVM_CAP_SPAPR_MULTITCE 94
+#define KVM_CAP_EXT_EMUL_CPUID 95
+#define KVM_CAP_HYPERV_TIME 96
+#define KVM_CAP_IOAPIC_POLARITY_IGNORED 97
+#define KVM_CAP_ENABLE_CAP_VM 98
+#define KVM_CAP_S390_IRQCHIP 99
+#define KVM_CAP_IOEVENTFD_NO_LENGTH 100
+#define KVM_CAP_VM_ATTRIBUTES 101
+#define KVM_CAP_ARM_PSCI_0_2 102
+#define KVM_CAP_PPC_FIXUP_HCALL 103
+#define KVM_CAP_PPC_ENABLE_HCALL 104
+#define KVM_CAP_CHECK_EXTENSION_VM 105
+#define KVM_CAP_S390_USER_SIGP 106
+#define KVM_CAP_S390_VECTOR_REGISTERS 107
+#define KVM_CAP_S390_MEM_OP 108
+#define KVM_CAP_S390_USER_STSI 109
+#define KVM_CAP_S390_SKEYS 110
+#define KVM_CAP_MIPS_FPU 111
+#define KVM_CAP_MIPS_MSA 112
+#define KVM_CAP_S390_INJECT_IRQ 113
+#define KVM_CAP_S390_IRQ_STATE 114
+#define KVM_CAP_PPC_HWRNG 115
+#define KVM_CAP_DISABLE_QUIRKS 116
+#define KVM_CAP_X86_SMM 117
+#define KVM_CAP_MULTI_ADDRESS_SPACE 118
+#define KVM_CAP_GUEST_DEBUG_HW_BPS 119
+#define KVM_CAP_GUEST_DEBUG_HW_WPS 120
+#define KVM_CAP_SPLIT_IRQCHIP 121
+#define KVM_CAP_IOEVENTFD_ANY_LENGTH 122
+#define KVM_CAP_HYPERV_SYNIC 123
+#define KVM_CAP_S390_RI 124
+#define KVM_CAP_SPAPR_TCE_64 125
+#define KVM_CAP_ARM_PMU_V3 126
+#define KVM_CAP_VCPU_ATTRIBUTES 127
+#define KVM_CAP_MAX_VCPU_ID 128
+#define KVM_CAP_X2APIC_API 129
+#define KVM_CAP_S390_USER_INSTR0 130
+#define KVM_CAP_MSI_DEVID 131
+#define KVM_CAP_PPC_HTM 132
+#define KVM_CAP_SPAPR_RESIZE_HPT 133
+#define KVM_CAP_PPC_MMU_RADIX 134
+#define KVM_CAP_PPC_MMU_HASH_V3 135
+#define KVM_CAP_IMMEDIATE_EXIT 136
+#define KVM_CAP_MIPS_VZ 137
+#define KVM_CAP_MIPS_TE 138
+#define KVM_CAP_MIPS_64BIT 139
+#define KVM_CAP_S390_GS 140
+#define KVM_CAP_S390_AIS 141
+#define KVM_CAP_SPAPR_TCE_VFIO 142
+#define KVM_CAP_X86_DISABLE_EXITS 143
+#define KVM_CAP_ARM_USER_IRQ 144
+#define KVM_CAP_S390_CMMA_MIGRATION 145
+#define KVM_CAP_PPC_FWNMI 146
+#define KVM_CAP_PPC_SMT_POSSIBLE 147
+#define KVM_CAP_HYPERV_SYNIC2 148
+#define KVM_CAP_HYPERV_VP_INDEX 149
+#define KVM_CAP_S390_AIS_MIGRATION 150
+#define KVM_CAP_PPC_GET_CPU_CHAR 151
+#define KVM_CAP_S390_BPB 152
+#define KVM_CAP_GET_MSR_FEATURES 153
+#define KVM_CAP_HYPERV_EVENTFD 154
+#define KVM_CAP_HYPERV_TLBFLUSH 155
+#define KVM_CAP_S390_HPAGE_1M 156
+#define KVM_CAP_NESTED_STATE 157
+#define KVM_CAP_ARM_INJECT_SERROR_ESR 158
+#define KVM_CAP_MSR_PLATFORM_INFO 159
+#define KVM_CAP_PPC_NESTED_HV 160
+#define KVM_CAP_HYPERV_SEND_IPI 161
+#define KVM_CAP_COALESCED_PIO 162
+#define KVM_CAP_HYPERV_ENLIGHTENED_VMCS 163
+#define KVM_CAP_EXCEPTION_PAYLOAD 164
+#define KVM_CAP_ARM_VM_IPA_SIZE 165
+#define KVM_CAP_MANUAL_DIRTY_LOG_PROTECT 166 /* Obsolete */
+#define KVM_CAP_HYPERV_CPUID 167
+#define KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 168
+#define KVM_CAP_PPC_IRQ_XIVE 169
+#define KVM_CAP_ARM_SVE 170
+#define KVM_CAP_ARM_PTRAUTH_ADDRESS 171
+#define KVM_CAP_ARM_PTRAUTH_GENERIC 172
+#define KVM_CAP_PMU_EVENT_FILTER 173
+
+#ifdef KVM_CAP_IRQ_ROUTING
+
+struct kvm_irq_routing_irqchip {
+	__u32 irqchip;
+	__u32 pin;
+};
+
+struct kvm_irq_routing_msi {
+	__u32 address_lo;
+	__u32 address_hi;
+	__u32 data;
+	union {
+		__u32 pad;
+		__u32 devid;
+	};
+};
+
+struct kvm_irq_routing_s390_adapter {
+	__u64 ind_addr;
+	__u64 summary_addr;
+	__u64 ind_offset;
+	__u32 summary_offset;
+	__u32 adapter_id;
+};
+
+struct kvm_irq_routing_hv_sint {
+	__u32 vcpu;
+	__u32 sint;
+};
+
+/* gsi routing entry types */
+#define KVM_IRQ_ROUTING_IRQCHIP 1
+#define KVM_IRQ_ROUTING_MSI 2
+#define KVM_IRQ_ROUTING_S390_ADAPTER 3
+#define KVM_IRQ_ROUTING_HV_SINT 4
+
+struct kvm_irq_routing_entry {
+	__u32 gsi;
+	__u32 type;
+	__u32 flags;
+	__u32 pad;
+	union {
+		struct kvm_irq_routing_irqchip irqchip;
+		struct kvm_irq_routing_msi msi;
+		struct kvm_irq_routing_s390_adapter adapter;
+		struct kvm_irq_routing_hv_sint hv_sint;
+		__u32 pad[8];
+	} u;
+};
+
+struct kvm_irq_routing {
+	__u32 nr;
+	__u32 flags;
+	struct kvm_irq_routing_entry entries[0];
+};
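+
+/*
+ * Editor's note -- illustrative sketch, not part of the original header:
+ * userspace fills a routing table and hands it to the VM fd with
+ * KVM_SET_GSI_ROUTING (defined below).  vm_fd and the msi_* values are
+ * assumed to exist already; the GSI number is a made-up example.
+ *
+ *	struct kvm_irq_routing *table;
+ *
+ *	table = calloc(1, sizeof(*table) + sizeof(table->entries[0]));
+ *	table->nr = 1;
+ *	table->entries[0].gsi = 5;
+ *	table->entries[0].type = KVM_IRQ_ROUTING_MSI;
+ *	table->entries[0].u.msi.address_lo = msi_addr_lo;
+ *	table->entries[0].u.msi.address_hi = msi_addr_hi;
+ *	table->entries[0].u.msi.data = msi_data;
+ *	if (ioctl(vm_fd, KVM_SET_GSI_ROUTING, table) < 0)
+ *		perror("KVM_SET_GSI_ROUTING");
+ */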
+
+#endif
+
+#ifdef KVM_CAP_MCE
+/* x86 MCE */
+struct kvm_x86_mce {
+	__u64 status;
+	__u64 addr;
+	__u64 misc;
+	__u64 mcg_status;
+	__u8 bank;
+	__u8 pad1[7];
+	__u64 pad2[3];
+};
+#endif
+
+#ifdef KVM_CAP_XEN_HVM
+struct kvm_xen_hvm_config {
+	__u32 flags;
+	__u32 msr;
+	__u64 blob_addr_32;
+	__u64 blob_addr_64;
+	__u8 blob_size_32;
+	__u8 blob_size_64;
+	__u8 pad2[30];
+};
+#endif
+
+#define KVM_IRQFD_FLAG_DEASSIGN (1 << 0)
+/*
+ * Available with KVM_CAP_IRQFD_RESAMPLE
+ *
+ * KVM_IRQFD_FLAG_RESAMPLE indicates resamplefd is valid and specifies
+ * the irqfd to operate in resampling mode for level-triggered interrupt
+ * emulation.  See Documentation/virt/kvm/api.txt.
+ */
+#define KVM_IRQFD_FLAG_RESAMPLE (1 << 1)
+
+struct kvm_irqfd {
+	__u32 fd;
+	__u32 gsi;
+	__u32 flags;
+	__u32 resamplefd;
+	__u8  pad[16];
+};
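+
+/*
+ * Editor's note -- illustrative sketch, not part of the original header:
+ * binding an eventfd to a guest GSI through the VM fd.  vm_fd is assumed
+ * to exist and the GSI number is a made-up example.
+ *
+ *	struct kvm_irqfd irqfd = {
+ *		.fd  = eventfd(0, 0),
+ *		.gsi = 5,
+ *	};
+ *	if (ioctl(vm_fd, KVM_IRQFD, &irqfd) < 0)
+ *		perror("KVM_IRQFD");
+ */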
+
+/* For KVM_CAP_ADJUST_CLOCK */
+
+/* Do not use 1; KVM_CHECK_EXTENSION returned it before we had flags.  */
+#define KVM_CLOCK_TSC_STABLE		2
+
+struct kvm_clock_data {
+	__u64 clock;
+	__u32 flags;
+	__u32 pad[9];
+};
+
+/* For KVM_CAP_SW_TLB */
+
+#define KVM_MMU_FSL_BOOKE_NOHV		0
+#define KVM_MMU_FSL_BOOKE_HV		1
+
+struct kvm_config_tlb {
+	__u64 params;
+	__u64 array;
+	__u32 mmu_type;
+	__u32 array_len;
+};
+
+struct kvm_dirty_tlb {
+	__u64 bitmap;
+	__u32 num_dirty;
+};
+
+/* Available with KVM_CAP_ONE_REG */
+
+#define KVM_REG_ARCH_MASK	0xff00000000000000ULL
+#define KVM_REG_GENERIC		0x0000000000000000ULL
+
+/*
+ * Architecture specific registers are to be defined in arch headers and
+ * ORed with the arch identifier.
+ */
+#define KVM_REG_PPC		0x1000000000000000ULL
+#define KVM_REG_X86		0x2000000000000000ULL
+#define KVM_REG_IA64		0x3000000000000000ULL
+#define KVM_REG_ARM		0x4000000000000000ULL
+#define KVM_REG_S390		0x5000000000000000ULL
+#define KVM_REG_ARM64		0x6000000000000000ULL
+#define KVM_REG_MIPS		0x7000000000000000ULL
+
+#define KVM_REG_SIZE_SHIFT	52
+#define KVM_REG_SIZE_MASK	0x00f0000000000000ULL
+#define KVM_REG_SIZE_U8		0x0000000000000000ULL
+#define KVM_REG_SIZE_U16	0x0010000000000000ULL
+#define KVM_REG_SIZE_U32	0x0020000000000000ULL
+#define KVM_REG_SIZE_U64	0x0030000000000000ULL
+#define KVM_REG_SIZE_U128	0x0040000000000000ULL
+#define KVM_REG_SIZE_U256	0x0050000000000000ULL
+#define KVM_REG_SIZE_U512	0x0060000000000000ULL
+#define KVM_REG_SIZE_U1024	0x0070000000000000ULL
+#define KVM_REG_SIZE_U2048	0x0080000000000000ULL
+
+struct kvm_reg_list {
+	__u64 n; /* number of regs */
+	__u64 reg[0];
+};
+
+struct kvm_one_reg {
+	__u64 id;
+	__u64 addr;
+};
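+
+/*
+ * Editor's note -- illustrative sketch, not part of the original header:
+ * a register ID is the arch field ORed with a size field and an
+ * arch-defined index, and is accessed through the vcpu fd.  vcpu_fd and
+ * arch_index are assumptions here.
+ *
+ *	__u64 val;
+ *	struct kvm_one_reg reg = {
+ *		.id   = KVM_REG_ARM64 | KVM_REG_SIZE_U64 | arch_index,
+ *		.addr = (unsigned long)&val,
+ *	};
+ *	if (ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg) < 0)
+ *		perror("KVM_GET_ONE_REG");
+ */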
+
+#define KVM_MSI_VALID_DEVID	(1U << 0)
+struct kvm_msi {
+	__u32 address_lo;
+	__u32 address_hi;
+	__u32 data;
+	__u32 flags;
+	__u32 devid;
+	__u8  pad[12];
+};
+
+struct kvm_arm_device_addr {
+	__u64 id;
+	__u64 addr;
+};
+
+/*
+ * Device control API, available with KVM_CAP_DEVICE_CTRL
+ */
+#define KVM_CREATE_DEVICE_TEST		1
+
+struct kvm_create_device {
+	__u32	type;	/* in: KVM_DEV_TYPE_xxx */
+	__u32	fd;	/* out: device handle */
+	__u32	flags;	/* in: KVM_CREATE_DEVICE_xxx */
+};
+
+struct kvm_device_attr {
+	__u32	flags;		/* no flags currently defined */
+	__u32	group;		/* device-defined */
+	__u64	attr;		/* group-defined */
+	__u64	addr;		/* userspace address of attr data */
+};
+
+#define  KVM_DEV_VFIO_GROUP			1
+#define   KVM_DEV_VFIO_GROUP_ADD			1
+#define   KVM_DEV_VFIO_GROUP_DEL			2
+#define   KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE		3
+
+enum kvm_device_type {
+	KVM_DEV_TYPE_FSL_MPIC_20	= 1,
+#define KVM_DEV_TYPE_FSL_MPIC_20	KVM_DEV_TYPE_FSL_MPIC_20
+	KVM_DEV_TYPE_FSL_MPIC_42,
+#define KVM_DEV_TYPE_FSL_MPIC_42	KVM_DEV_TYPE_FSL_MPIC_42
+	KVM_DEV_TYPE_XICS,
+#define KVM_DEV_TYPE_XICS		KVM_DEV_TYPE_XICS
+	KVM_DEV_TYPE_VFIO,
+#define KVM_DEV_TYPE_VFIO		KVM_DEV_TYPE_VFIO
+	KVM_DEV_TYPE_ARM_VGIC_V2,
+#define KVM_DEV_TYPE_ARM_VGIC_V2	KVM_DEV_TYPE_ARM_VGIC_V2
+	KVM_DEV_TYPE_FLIC,
+#define KVM_DEV_TYPE_FLIC		KVM_DEV_TYPE_FLIC
+	KVM_DEV_TYPE_ARM_VGIC_V3,
+#define KVM_DEV_TYPE_ARM_VGIC_V3	KVM_DEV_TYPE_ARM_VGIC_V3
+	KVM_DEV_TYPE_ARM_VGIC_ITS,
+#define KVM_DEV_TYPE_ARM_VGIC_ITS	KVM_DEV_TYPE_ARM_VGIC_ITS
+	KVM_DEV_TYPE_XIVE,
+#define KVM_DEV_TYPE_XIVE		KVM_DEV_TYPE_XIVE
+	KVM_DEV_TYPE_MAX,
+};
+
+struct kvm_vfio_spapr_tce {
+	__s32	groupfd;
+	__s32	tablefd;
+};
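+
+/*
+ * Editor's note -- illustrative sketch, not part of the original header:
+ * create an in-kernel device on the VM fd, then configure it through the
+ * returned device fd (the ioctls are defined further down).  vm_fd and
+ * group_fd are assumed to exist already.
+ *
+ *	struct kvm_create_device cd = { .type = KVM_DEV_TYPE_VFIO };
+ *
+ *	if (ioctl(vm_fd, KVM_CREATE_DEVICE, &cd) == 0) {
+ *		struct kvm_device_attr attr = {
+ *			.group = KVM_DEV_VFIO_GROUP,
+ *			.attr  = KVM_DEV_VFIO_GROUP_ADD,
+ *			.addr  = (unsigned long)&group_fd,
+ *		};
+ *		ioctl(cd.fd, KVM_SET_DEVICE_ATTR, &attr);
+ *	}
+ */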
+
+/*
+ * ioctls for VM fds
+ */
+#define KVM_SET_MEMORY_REGION     _IOW(KVMIO,  0x40, struct kvm_memory_region)
+/*
+ * KVM_CREATE_VCPU receives as a parameter the vcpu slot, and returns
+ * a vcpu fd.
+ */
+#define KVM_CREATE_VCPU           _IO(KVMIO,   0x41)
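+/*
+ * Editor's note -- illustrative sketch for the KVM_CREATE_VCPU ioctl
+ * above: creating vcpu slot 0 on an existing VM fd.
+ *
+ *	int vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, 0);
+ *	if (vcpu_fd < 0)
+ *		perror("KVM_CREATE_VCPU");
+ */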
+#define KVM_GET_DIRTY_LOG         _IOW(KVMIO,  0x42, struct kvm_dirty_log)
+/* KVM_SET_MEMORY_ALIAS is obsolete: */
+#define KVM_SET_MEMORY_ALIAS      _IOW(KVMIO,  0x43, struct kvm_memory_alias)
+#define KVM_SET_NR_MMU_PAGES      _IO(KVMIO,   0x44)
+#define KVM_GET_NR_MMU_PAGES      _IO(KVMIO,   0x45)
+#define KVM_SET_USER_MEMORY_REGION _IOW(KVMIO, 0x46, \
+					struct kvm_userspace_memory_region)
+#define KVM_SET_TSS_ADDR          _IO(KVMIO,   0x47)
+#define KVM_SET_IDENTITY_MAP_ADDR _IOW(KVMIO,  0x48, __u64)
+
+/* enable ucontrol for s390 */
+struct kvm_s390_ucas_mapping {
+	__u64 user_addr;
+	__u64 vcpu_addr;
+	__u64 length;
+};
+#define KVM_S390_UCAS_MAP        _IOW(KVMIO, 0x50, struct kvm_s390_ucas_mapping)
+#define KVM_S390_UCAS_UNMAP      _IOW(KVMIO, 0x51, struct kvm_s390_ucas_mapping)
+#define KVM_S390_VCPU_FAULT	 _IOW(KVMIO, 0x52, unsigned long)
+
+/* Device model IOC */
+#define KVM_CREATE_IRQCHIP        _IO(KVMIO,   0x60)
+#define KVM_IRQ_LINE              _IOW(KVMIO,  0x61, struct kvm_irq_level)
+#define KVM_GET_IRQCHIP           _IOWR(KVMIO, 0x62, struct kvm_irqchip)
+#define KVM_SET_IRQCHIP           _IOR(KVMIO,  0x63, struct kvm_irqchip)
+#define KVM_CREATE_PIT            _IO(KVMIO,   0x64)
+#define KVM_GET_PIT               _IOWR(KVMIO, 0x65, struct kvm_pit_state)
+#define KVM_SET_PIT               _IOR(KVMIO,  0x66, struct kvm_pit_state)
+#define KVM_IRQ_LINE_STATUS       _IOWR(KVMIO, 0x67, struct kvm_irq_level)
+#define KVM_REGISTER_COALESCED_MMIO \
+			_IOW(KVMIO,  0x67, struct kvm_coalesced_mmio_zone)
+#define KVM_UNREGISTER_COALESCED_MMIO \
+			_IOW(KVMIO,  0x68, struct kvm_coalesced_mmio_zone)
+#define KVM_ASSIGN_PCI_DEVICE     _IOR(KVMIO,  0x69, \
+				       struct kvm_assigned_pci_dev)
+#define KVM_SET_GSI_ROUTING       _IOW(KVMIO,  0x6a, struct kvm_irq_routing)
+/* deprecated, replaced by KVM_ASSIGN_DEV_IRQ */
+#define KVM_ASSIGN_IRQ            __KVM_DEPRECATED_VM_R_0x70
+#define KVM_ASSIGN_DEV_IRQ        _IOW(KVMIO,  0x70, struct kvm_assigned_irq)
+#define KVM_REINJECT_CONTROL      _IO(KVMIO,   0x71)
+#define KVM_DEASSIGN_PCI_DEVICE   _IOW(KVMIO,  0x72, \
+				       struct kvm_assigned_pci_dev)
+#define KVM_ASSIGN_SET_MSIX_NR    _IOW(KVMIO,  0x73, \
+				       struct kvm_assigned_msix_nr)
+#define KVM_ASSIGN_SET_MSIX_ENTRY _IOW(KVMIO,  0x74, \
+				       struct kvm_assigned_msix_entry)
+#define KVM_DEASSIGN_DEV_IRQ      _IOW(KVMIO,  0x75, struct kvm_assigned_irq)
+#define KVM_IRQFD                 _IOW(KVMIO,  0x76, struct kvm_irqfd)
+#define KVM_CREATE_PIT2		  _IOW(KVMIO,  0x77, struct kvm_pit_config)
+#define KVM_SET_BOOT_CPU_ID       _IO(KVMIO,   0x78)
+#define KVM_IOEVENTFD             _IOW(KVMIO,  0x79, struct kvm_ioeventfd)
+#define KVM_XEN_HVM_CONFIG        _IOW(KVMIO,  0x7a, struct kvm_xen_hvm_config)
+#define KVM_SET_CLOCK             _IOW(KVMIO,  0x7b, struct kvm_clock_data)
+#define KVM_GET_CLOCK             _IOR(KVMIO,  0x7c, struct kvm_clock_data)
+/* Available with KVM_CAP_PIT_STATE2 */
+#define KVM_GET_PIT2              _IOR(KVMIO,  0x9f, struct kvm_pit_state2)
+#define KVM_SET_PIT2              _IOW(KVMIO,  0xa0, struct kvm_pit_state2)
+/* Available with KVM_CAP_PPC_GET_PVINFO */
+#define KVM_PPC_GET_PVINFO	  _IOW(KVMIO,  0xa1, struct kvm_ppc_pvinfo)
+/* Available with KVM_CAP_TSC_CONTROL */
+#define KVM_SET_TSC_KHZ           _IO(KVMIO,  0xa2)
+#define KVM_GET_TSC_KHZ           _IO(KVMIO,  0xa3)
+/* Available with KVM_CAP_PCI_2_3 */
+#define KVM_ASSIGN_SET_INTX_MASK  _IOW(KVMIO,  0xa4, \
+				       struct kvm_assigned_pci_dev)
+/* Available with KVM_CAP_SIGNAL_MSI */
+#define KVM_SIGNAL_MSI            _IOW(KVMIO,  0xa5, struct kvm_msi)
+/* Available with KVM_CAP_PPC_GET_SMMU_INFO */
+#define KVM_PPC_GET_SMMU_INFO	  _IOR(KVMIO,  0xa6, struct kvm_ppc_smmu_info)
+/* Available with KVM_CAP_PPC_ALLOC_HTAB */
+#define KVM_PPC_ALLOCATE_HTAB	  _IOWR(KVMIO, 0xa7, __u32)
+#define KVM_CREATE_SPAPR_TCE	  _IOW(KVMIO,  0xa8, struct kvm_create_spapr_tce)
+#define KVM_CREATE_SPAPR_TCE_64	  _IOW(KVMIO,  0xa8, \
+				       struct kvm_create_spapr_tce_64)
+/* Available with KVM_CAP_RMA */
+#define KVM_ALLOCATE_RMA	  _IOR(KVMIO,  0xa9, struct kvm_allocate_rma)
+/* Available with KVM_CAP_PPC_HTAB_FD */
+#define KVM_PPC_GET_HTAB_FD	  _IOW(KVMIO,  0xaa, struct kvm_get_htab_fd)
+/* Available with KVM_CAP_ARM_SET_DEVICE_ADDR */
+#define KVM_ARM_SET_DEVICE_ADDR	  _IOW(KVMIO,  0xab, struct kvm_arm_device_addr)
+/* Available with KVM_CAP_PPC_RTAS */
+#define KVM_PPC_RTAS_DEFINE_TOKEN _IOW(KVMIO,  0xac, struct kvm_rtas_token_args)
+/* Available with KVM_CAP_SPAPR_RESIZE_HPT */
+#define KVM_PPC_RESIZE_HPT_PREPARE _IOR(KVMIO, 0xad, struct kvm_ppc_resize_hpt)
+#define KVM_PPC_RESIZE_HPT_COMMIT  _IOR(KVMIO, 0xae, struct kvm_ppc_resize_hpt)
+/* Available with KVM_CAP_PPC_MMU_RADIX or KVM_CAP_PPC_MMU_HASH_V3 */
+#define KVM_PPC_CONFIGURE_V3_MMU  _IOW(KVMIO,  0xaf, struct kvm_ppc_mmuv3_cfg)
+/* Available with KVM_CAP_PPC_MMU_RADIX */
+#define KVM_PPC_GET_RMMU_INFO	  _IOW(KVMIO,  0xb0, struct kvm_ppc_rmmu_info)
+/* Available with KVM_CAP_PPC_GET_CPU_CHAR */
+#define KVM_PPC_GET_CPU_CHAR	  _IOR(KVMIO,  0xb1, struct kvm_ppc_cpu_char)
+/* Available with KVM_CAP_PMU_EVENT_FILTER */
+#define KVM_SET_PMU_EVENT_FILTER  _IOW(KVMIO,  0xb2, struct kvm_pmu_event_filter)
+
+/* ioctl for vm fd */
+#define KVM_CREATE_DEVICE	  _IOWR(KVMIO,  0xe0, struct kvm_create_device)
+
+/* ioctls for fds returned by KVM_CREATE_DEVICE */
+#define KVM_SET_DEVICE_ATTR	  _IOW(KVMIO,  0xe1, struct kvm_device_attr)
+#define KVM_GET_DEVICE_ATTR	  _IOW(KVMIO,  0xe2, struct kvm_device_attr)
+#define KVM_HAS_DEVICE_ATTR	  _IOW(KVMIO,  0xe3, struct kvm_device_attr)
+
+/*
+ * ioctls for vcpu fds
+ */
+#define KVM_RUN                   _IO(KVMIO,   0x80)
+#define KVM_GET_REGS              _IOR(KVMIO,  0x81, struct kvm_regs)
+#define KVM_SET_REGS              _IOW(KVMIO,  0x82, struct kvm_regs)
+#define KVM_GET_SREGS             _IOR(KVMIO,  0x83, struct kvm_sregs)
+#define KVM_SET_SREGS             _IOW(KVMIO,  0x84, struct kvm_sregs)
+#define KVM_TRANSLATE             _IOWR(KVMIO, 0x85, struct kvm_translation)
+#define KVM_INTERRUPT             _IOW(KVMIO,  0x86, struct kvm_interrupt)
+/* KVM_DEBUG_GUEST is no longer supported, use KVM_SET_GUEST_DEBUG instead */
+#define KVM_DEBUG_GUEST           __KVM_DEPRECATED_VCPU_W_0x87
+#define KVM_GET_MSRS              _IOWR(KVMIO, 0x88, struct kvm_msrs)
+#define KVM_SET_MSRS              _IOW(KVMIO,  0x89, struct kvm_msrs)
+#define KVM_SET_CPUID             _IOW(KVMIO,  0x8a, struct kvm_cpuid)
+#define KVM_SET_SIGNAL_MASK       _IOW(KVMIO,  0x8b, struct kvm_signal_mask)
+#define KVM_GET_FPU               _IOR(KVMIO,  0x8c, struct kvm_fpu)
+#define KVM_SET_FPU               _IOW(KVMIO,  0x8d, struct kvm_fpu)
+#define KVM_GET_LAPIC             _IOR(KVMIO,  0x8e, struct kvm_lapic_state)
+#define KVM_SET_LAPIC             _IOW(KVMIO,  0x8f, struct kvm_lapic_state)
+#define KVM_SET_CPUID2            _IOW(KVMIO,  0x90, struct kvm_cpuid2)
+#define KVM_GET_CPUID2            _IOWR(KVMIO, 0x91, struct kvm_cpuid2)
+/* Available with KVM_CAP_VAPIC */
+#define KVM_TPR_ACCESS_REPORTING  _IOWR(KVMIO, 0x92, struct kvm_tpr_access_ctl)
+/* Available with KVM_CAP_VAPIC */
+#define KVM_SET_VAPIC_ADDR        _IOW(KVMIO,  0x93, struct kvm_vapic_addr)
+/* valid for virtual machine (for floating interrupt) _and_ vcpu */
+#define KVM_S390_INTERRUPT        _IOW(KVMIO,  0x94, struct kvm_s390_interrupt)
+/* store status for s390 */
+#define KVM_S390_STORE_STATUS_NOADDR    (-1ul)
+#define KVM_S390_STORE_STATUS_PREFIXED  (-2ul)
+#define KVM_S390_STORE_STATUS	  _IOW(KVMIO,  0x95, unsigned long)
+/* initial ipl psw for s390 */
+#define KVM_S390_SET_INITIAL_PSW  _IOW(KVMIO,  0x96, struct kvm_s390_psw)
+/* initial reset for s390 */
+#define KVM_S390_INITIAL_RESET    _IO(KVMIO,   0x97)
+#define KVM_GET_MP_STATE          _IOR(KVMIO,  0x98, struct kvm_mp_state)
+#define KVM_SET_MP_STATE          _IOW(KVMIO,  0x99, struct kvm_mp_state)
+/* Available with KVM_CAP_USER_NMI */
+#define KVM_NMI                   _IO(KVMIO,   0x9a)
+/* Available with KVM_CAP_SET_GUEST_DEBUG */
+#define KVM_SET_GUEST_DEBUG       _IOW(KVMIO,  0x9b, struct kvm_guest_debug)
+/* MCE for x86 */
+#define KVM_X86_SETUP_MCE         _IOW(KVMIO,  0x9c, __u64)
+#define KVM_X86_GET_MCE_CAP_SUPPORTED _IOR(KVMIO,  0x9d, __u64)
+#define KVM_X86_SET_MCE           _IOW(KVMIO,  0x9e, struct kvm_x86_mce)
+/* Available with KVM_CAP_VCPU_EVENTS */
+#define KVM_GET_VCPU_EVENTS       _IOR(KVMIO,  0x9f, struct kvm_vcpu_events)
+#define KVM_SET_VCPU_EVENTS       _IOW(KVMIO,  0xa0, struct kvm_vcpu_events)
+/* Available with KVM_CAP_DEBUGREGS */
+#define KVM_GET_DEBUGREGS         _IOR(KVMIO,  0xa1, struct kvm_debugregs)
+#define KVM_SET_DEBUGREGS         _IOW(KVMIO,  0xa2, struct kvm_debugregs)
+/*
+ * vcpu version available with KVM_ENABLE_CAP
+ * vm version available with KVM_CAP_ENABLE_CAP_VM
+ */
+#define KVM_ENABLE_CAP            _IOW(KVMIO,  0xa3, struct kvm_enable_cap)
+/* Available with KVM_CAP_XSAVE */
+#define KVM_GET_XSAVE		  _IOR(KVMIO,  0xa4, struct kvm_xsave)
+#define KVM_SET_XSAVE		  _IOW(KVMIO,  0xa5, struct kvm_xsave)
+/* Available with KVM_CAP_XCRS */
+#define KVM_GET_XCRS		  _IOR(KVMIO,  0xa6, struct kvm_xcrs)
+#define KVM_SET_XCRS		  _IOW(KVMIO,  0xa7, struct kvm_xcrs)
+/* Available with KVM_CAP_SW_TLB */
+#define KVM_DIRTY_TLB		  _IOW(KVMIO,  0xaa, struct kvm_dirty_tlb)
+/* Available with KVM_CAP_ONE_REG */
+#define KVM_GET_ONE_REG		  _IOW(KVMIO,  0xab, struct kvm_one_reg)
+#define KVM_SET_ONE_REG		  _IOW(KVMIO,  0xac, struct kvm_one_reg)
+/* VM is being stopped by host */
+#define KVM_KVMCLOCK_CTRL	  _IO(KVMIO,   0xad)
+#define KVM_ARM_VCPU_INIT	  _IOW(KVMIO,  0xae, struct kvm_vcpu_init)
+#define KVM_ARM_PREFERRED_TARGET  _IOR(KVMIO,  0xaf, struct kvm_vcpu_init)
+#define KVM_GET_REG_LIST	  _IOWR(KVMIO, 0xb0, struct kvm_reg_list)
+/* Available with KVM_CAP_S390_MEM_OP */
+#define KVM_S390_MEM_OP		  _IOW(KVMIO,  0xb1, struct kvm_s390_mem_op)
+/* Available with KVM_CAP_S390_SKEYS */
+#define KVM_S390_GET_SKEYS      _IOW(KVMIO, 0xb2, struct kvm_s390_skeys)
+#define KVM_S390_SET_SKEYS      _IOW(KVMIO, 0xb3, struct kvm_s390_skeys)
+/* Available with KVM_CAP_S390_INJECT_IRQ */
+#define KVM_S390_IRQ              _IOW(KVMIO,  0xb4, struct kvm_s390_irq)
+/* Available with KVM_CAP_S390_IRQ_STATE */
+#define KVM_S390_SET_IRQ_STATE	  _IOW(KVMIO, 0xb5, struct kvm_s390_irq_state)
+#define KVM_S390_GET_IRQ_STATE	  _IOW(KVMIO, 0xb6, struct kvm_s390_irq_state)
+/* Available with KVM_CAP_X86_SMM */
+#define KVM_SMI                   _IO(KVMIO,   0xb7)
+/* Available with KVM_CAP_S390_CMMA_MIGRATION */
+#define KVM_S390_GET_CMMA_BITS      _IOWR(KVMIO, 0xb8, struct kvm_s390_cmma_log)
+#define KVM_S390_SET_CMMA_BITS      _IOW(KVMIO, 0xb9, struct kvm_s390_cmma_log)
+/* Memory Encryption Commands */
+#define KVM_MEMORY_ENCRYPT_OP      _IOWR(KVMIO, 0xba, unsigned long)
+
+struct kvm_enc_region {
+	__u64 addr;
+	__u64 size;
+};
+
+#define KVM_MEMORY_ENCRYPT_REG_REGION    _IOR(KVMIO, 0xbb, struct kvm_enc_region)
+#define KVM_MEMORY_ENCRYPT_UNREG_REGION  _IOR(KVMIO, 0xbc, struct kvm_enc_region)
+
+/* Available with KVM_CAP_HYPERV_EVENTFD */
+#define KVM_HYPERV_EVENTFD        _IOW(KVMIO,  0xbd, struct kvm_hyperv_eventfd)
+
+/* Available with KVM_CAP_NESTED_STATE */
+#define KVM_GET_NESTED_STATE         _IOWR(KVMIO, 0xbe, struct kvm_nested_state)
+#define KVM_SET_NESTED_STATE         _IOW(KVMIO,  0xbf, struct kvm_nested_state)
+
+/* Available with KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 */
+#define KVM_CLEAR_DIRTY_LOG          _IOWR(KVMIO, 0xc0, struct kvm_clear_dirty_log)
+
+/* Available with KVM_CAP_HYPERV_CPUID */
+#define KVM_GET_SUPPORTED_HV_CPUID _IOWR(KVMIO, 0xc1, struct kvm_cpuid2)
+
+/* Available with KVM_CAP_ARM_SVE */
+#define KVM_ARM_VCPU_FINALIZE	  _IOW(KVMIO,  0xc2, int)
+
+/* Secure Encrypted Virtualization command */
+enum sev_cmd_id {
+	/* Guest initialization commands */
+	KVM_SEV_INIT = 0,
+	KVM_SEV_ES_INIT,
+	/* Guest launch commands */
+	KVM_SEV_LAUNCH_START,
+	KVM_SEV_LAUNCH_UPDATE_DATA,
+	KVM_SEV_LAUNCH_UPDATE_VMSA,
+	KVM_SEV_LAUNCH_SECRET,
+	KVM_SEV_LAUNCH_MEASURE,
+	KVM_SEV_LAUNCH_FINISH,
+	/* Guest migration commands (outgoing) */
+	KVM_SEV_SEND_START,
+	KVM_SEV_SEND_UPDATE_DATA,
+	KVM_SEV_SEND_UPDATE_VMSA,
+	KVM_SEV_SEND_FINISH,
+	/* Guest migration commands (incoming) */
+	KVM_SEV_RECEIVE_START,
+	KVM_SEV_RECEIVE_UPDATE_DATA,
+	KVM_SEV_RECEIVE_UPDATE_VMSA,
+	KVM_SEV_RECEIVE_FINISH,
+	/* Guest status and debug commands */
+	KVM_SEV_GUEST_STATUS,
+	KVM_SEV_DBG_DECRYPT,
+	KVM_SEV_DBG_ENCRYPT,
+	/* Guest certificates commands */
+	KVM_SEV_CERT_EXPORT,
+
+	KVM_SEV_NR_MAX,
+};
+
+struct kvm_sev_cmd {
+	__u32 id;
+	__u64 data;
+	__u32 error;
+	__u32 sev_fd;
+};
+
+struct kvm_sev_launch_start {
+	__u32 handle;
+	__u32 policy;
+	__u64 dh_uaddr;
+	__u32 dh_len;
+	__u64 session_uaddr;
+	__u32 session_len;
+};
+
+struct kvm_sev_launch_update_data {
+	__u64 uaddr;
+	__u32 len;
+};
+
+
+struct kvm_sev_launch_secret {
+	__u64 hdr_uaddr;
+	__u32 hdr_len;
+	__u64 guest_uaddr;
+	__u32 guest_len;
+	__u64 trans_uaddr;
+	__u32 trans_len;
+};
+
+struct kvm_sev_launch_measure {
+	__u64 uaddr;
+	__u32 len;
+};
+
+struct kvm_sev_guest_status {
+	__u32 handle;
+	__u32 policy;
+	__u32 state;
+};
+
+struct kvm_sev_dbg {
+	__u64 src_uaddr;
+	__u64 dst_uaddr;
+	__u32 len;
+};
+
+#define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
+#define KVM_DEV_ASSIGN_PCI_2_3		(1 << 1)
+#define KVM_DEV_ASSIGN_MASK_INTX	(1 << 2)
+
+struct kvm_assigned_pci_dev {
+	__u32 assigned_dev_id;
+	__u32 busnr;
+	__u32 devfn;
+	__u32 flags;
+	__u32 segnr;
+	union {
+		__u32 reserved[11];
+	};
+};
+
+#define KVM_DEV_IRQ_HOST_INTX    (1 << 0)
+#define KVM_DEV_IRQ_HOST_MSI     (1 << 1)
+#define KVM_DEV_IRQ_HOST_MSIX    (1 << 2)
+
+#define KVM_DEV_IRQ_GUEST_INTX   (1 << 8)
+#define KVM_DEV_IRQ_GUEST_MSI    (1 << 9)
+#define KVM_DEV_IRQ_GUEST_MSIX   (1 << 10)
+
+#define KVM_DEV_IRQ_HOST_MASK	 0x00ff
+#define KVM_DEV_IRQ_GUEST_MASK   0xff00
+
+struct kvm_assigned_irq {
+	__u32 assigned_dev_id;
+	__u32 host_irq; /* ignored (legacy field) */
+	__u32 guest_irq;
+	__u32 flags;
+	union {
+		__u32 reserved[12];
+	};
+};
+
+struct kvm_assigned_msix_nr {
+	__u32 assigned_dev_id;
+	__u16 entry_nr;
+	__u16 padding;
+};
+
+#define KVM_MAX_MSIX_PER_DEV		256
+struct kvm_assigned_msix_entry {
+	__u32 assigned_dev_id;
+	__u32 gsi;
+	__u16 entry; /* The index of the entry in the MSI-X table */
+	__u16 padding[3];
+};
+
+#define KVM_X2APIC_API_USE_32BIT_IDS            (1ULL << 0)
+#define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK  (1ULL << 1)
+
+/* Available with KVM_CAP_ARM_USER_IRQ */
+
+/* Bits for run->s.regs.device_irq_level */
+#define KVM_ARM_DEV_EL1_VTIMER		(1 << 0)
+#define KVM_ARM_DEV_EL1_PTIMER		(1 << 1)
+#define KVM_ARM_DEV_PMU			(1 << 2)
+
+struct kvm_hyperv_eventfd {
+	__u32 conn_id;
+	__s32 fd;
+	__u32 flags;
+	__u32 padding[3];
+};
+
+#define KVM_HYPERV_CONN_ID_MASK		0x00ffffff
+#define KVM_HYPERV_EVENTFD_DEASSIGN	(1 << 0)
+
+#endif /* __LINUX_KVM_H */
diff --git a/kvmtool/include/linux/list.h b/kvmtool/include/linux/list.h
new file mode 100644
index 0000000..2e92b3f
--- /dev/null
+++ b/kvmtool/include/linux/list.h
@@ -0,0 +1,743 @@
+#ifndef _LINUX_LIST_H
+#define _LINUX_LIST_H
+
+#include <linux/types.h>
+#include <linux/stddef.h>
+#include <linux/const.h>
+#include <linux/kernel.h>
+
+/*
+ * Simple doubly linked list implementation.
+ *
+ * Some of the internal functions ("__xxx") are useful when
+ * manipulating whole lists rather than single entries, as
+ * sometimes we already know the next/prev entries and we can
+ * generate better code by using them directly rather than
+ * using the generic single-entry routines.
+ */
+
+#define LIST_HEAD_INIT(name) { &(name), &(name) }
+
+#define LIST_HEAD(name) \
+	struct list_head name = LIST_HEAD_INIT(name)
+
+static inline void INIT_LIST_HEAD(struct list_head *list)
+{
+	list->next = list;
+	list->prev = list;
+}
+
+/*
+ * Insert a new entry between two known consecutive entries.
+ *
+ * This is only for internal list manipulation where we know
+ * the prev/next entries already!
+ */
+#ifndef CONFIG_DEBUG_LIST
+static inline void __list_add(struct list_head *new,
+			      struct list_head *prev,
+			      struct list_head *next)
+{
+	next->prev = new;
+	new->next = next;
+	new->prev = prev;
+	prev->next = new;
+}
+#else
+extern void __list_add(struct list_head *new,
+			      struct list_head *prev,
+			      struct list_head *next);
+#endif
+
+/**
+ * list_add - add a new entry
+ * @new: new entry to be added
+ * @head: list head to add it after
+ *
+ * Insert a new entry after the specified head.
+ * This is good for implementing stacks.
+ */
+static inline void list_add(struct list_head *new, struct list_head *head)
+{
+	__list_add(new, head, head->next);
+}
+
+
+/**
+ * list_add_tail - add a new entry
+ * @new: new entry to be added
+ * @head: list head to add it before
+ *
+ * Insert a new entry before the specified head.
+ * This is useful for implementing queues.
+ */
+static inline void list_add_tail(struct list_head *new, struct list_head *head)
+{
+	__list_add(new, head->prev, head);
+}
+
+/*
+ * Delete a list entry by making the prev/next entries
+ * point to each other.
+ *
+ * This is only for internal list manipulation where we know
+ * the prev/next entries already!
+ */
+static inline void __list_del(struct list_head * prev, struct list_head * next)
+{
+	next->prev = prev;
+	prev->next = next;
+}
+
+/**
+ * list_del - deletes entry from list.
+ * @entry: the element to delete from the list.
+ * Note: list_empty() on entry does not return true after this, the entry is
+ * in an undefined state.
+ */
+#ifndef CONFIG_DEBUG_LIST
+static inline void __list_del_entry(struct list_head *entry)
+{
+	__list_del(entry->prev, entry->next);
+}
+
+static inline void list_del(struct list_head *entry)
+{
+	__list_del(entry->prev, entry->next);
+	entry->next = NULL;
+	entry->prev = NULL;
+}
+#else
+extern void __list_del_entry(struct list_head *entry);
+extern void list_del(struct list_head *entry);
+#endif
+
+/**
+ * list_replace - replace old entry by new one
+ * @old : the element to be replaced
+ * @new : the new element to insert
+ *
+ * If @old was empty, it will be overwritten.
+ */
+static inline void list_replace(struct list_head *old,
+				struct list_head *new)
+{
+	new->next = old->next;
+	new->next->prev = new;
+	new->prev = old->prev;
+	new->prev->next = new;
+}
+
+static inline void list_replace_init(struct list_head *old,
+					struct list_head *new)
+{
+	list_replace(old, new);
+	INIT_LIST_HEAD(old);
+}
+
+/**
+ * list_del_init - deletes entry from list and reinitialize it.
+ * @entry: the element to delete from the list.
+ */
+static inline void list_del_init(struct list_head *entry)
+{
+	__list_del_entry(entry);
+	INIT_LIST_HEAD(entry);
+}
+
+/**
+ * list_move - delete from one list and add as another's head
+ * @list: the entry to move
+ * @head: the head that will precede our entry
+ */
+static inline void list_move(struct list_head *list, struct list_head *head)
+{
+	__list_del_entry(list);
+	list_add(list, head);
+}
+
+/**
+ * list_move_tail - delete from one list and add as another's tail
+ * @list: the entry to move
+ * @head: the head that will follow our entry
+ */
+static inline void list_move_tail(struct list_head *list,
+				  struct list_head *head)
+{
+	__list_del_entry(list);
+	list_add_tail(list, head);
+}
+
+/**
+ * list_is_last - tests whether @list is the last entry in list @head
+ * @list: the entry to test
+ * @head: the head of the list
+ */
+static inline int list_is_last(const struct list_head *list,
+				const struct list_head *head)
+{
+	return list->next == head;
+}
+
+/**
+ * list_empty - tests whether a list is empty
+ * @head: the list to test.
+ */
+static inline int list_empty(const struct list_head *head)
+{
+	return head->next == head;
+}
+
+/**
+ * list_empty_careful - tests whether a list is empty and not being modified
+ * @head: the list to test
+ *
+ * Description:
+ * tests whether a list is empty _and_ checks that no other CPU might be
+ * in the process of modifying either member (next or prev)
+ *
+ * NOTE: using list_empty_careful() without synchronization
+ * can only be safe if the only activity that can happen
+ * to the list entry is list_del_init(). E.g. it cannot be used
+ * if another CPU could re-list_add() it.
+ */
+static inline int list_empty_careful(const struct list_head *head)
+{
+	struct list_head *next = head->next;
+	return (next == head) && (next == head->prev);
+}
+
+/**
+ * list_rotate_left - rotate the list to the left
+ * @head: the head of the list
+ */
+static inline void list_rotate_left(struct list_head *head)
+{
+	struct list_head *first;
+
+	if (!list_empty(head)) {
+		first = head->next;
+		list_move_tail(first, head);
+	}
+}
+
+/**
+ * list_is_singular - tests whether a list has just one entry.
+ * @head: the list to test.
+ */
+static inline int list_is_singular(const struct list_head *head)
+{
+	return !list_empty(head) && (head->next == head->prev);
+}
+
+static inline void __list_cut_position(struct list_head *list,
+		struct list_head *head, struct list_head *entry)
+{
+	struct list_head *new_first = entry->next;
+	list->next = head->next;
+	list->next->prev = list;
+	list->prev = entry;
+	entry->next = list;
+	head->next = new_first;
+	new_first->prev = head;
+}
+
+/**
+ * list_cut_position - cut a list into two
+ * @list: a new list to add all removed entries
+ * @head: a list with entries
+ * @entry: an entry within head, could be the head itself
+ *	and if so we won't cut the list
+ *
+ * This helper moves the initial part of @head, up to and
+ * including @entry, from @head to @list. @entry should be an
+ * element you know is on @head. @list should be an empty list,
+ * or a list whose data you do not mind losing.
+ *
+ */
+static inline void list_cut_position(struct list_head *list,
+		struct list_head *head, struct list_head *entry)
+{
+	if (list_empty(head))
+		return;
+	if (list_is_singular(head) &&
+		(head->next != entry && head != entry))
+		return;
+	if (entry == head)
+		INIT_LIST_HEAD(list);
+	else
+		__list_cut_position(list, head, entry);
+}
+
+static inline void __list_splice(const struct list_head *list,
+				 struct list_head *prev,
+				 struct list_head *next)
+{
+	struct list_head *first = list->next;
+	struct list_head *last = list->prev;
+
+	first->prev = prev;
+	prev->next = first;
+
+	last->next = next;
+	next->prev = last;
+}
+
+/**
+ * list_splice - join two lists; this is designed for stacks
+ * @list: the new list to add.
+ * @head: the place to add it in the first list.
+ */
+static inline void list_splice(const struct list_head *list,
+				struct list_head *head)
+{
+	if (!list_empty(list))
+		__list_splice(list, head, head->next);
+}
+
+/**
+ * list_splice_tail - join two lists, each list being a queue
+ * @list: the new list to add.
+ * @head: the place to add it in the first list.
+ */
+static inline void list_splice_tail(struct list_head *list,
+				struct list_head *head)
+{
+	if (!list_empty(list))
+		__list_splice(list, head->prev, head);
+}
+
+/**
+ * list_splice_init - join two lists and reinitialise the emptied list.
+ * @list: the new list to add.
+ * @head: the place to add it in the first list.
+ *
+ * The list at @list is reinitialised.
+ */
+static inline void list_splice_init(struct list_head *list,
+				    struct list_head *head)
+{
+	if (!list_empty(list)) {
+		__list_splice(list, head, head->next);
+		INIT_LIST_HEAD(list);
+	}
+}
+
+/**
+ * list_splice_tail_init - join two lists and reinitialise the emptied list
+ * @list: the new list to add.
+ * @head: the place to add it in the first list.
+ *
+ * Each of the lists is a queue.
+ * The list at @list is reinitialised.
+ */
+static inline void list_splice_tail_init(struct list_head *list,
+					 struct list_head *head)
+{
+	if (!list_empty(list)) {
+		__list_splice(list, head->prev, head);
+		INIT_LIST_HEAD(list);
+	}
+}
+
+/**
+ * list_entry - get the struct for this entry
+ * @ptr:	the &struct list_head pointer.
+ * @type:	the type of the struct this is embedded in.
+ * @member:	the name of the list_head within the struct.
+ */
+#define list_entry(ptr, type, member) \
+	container_of(ptr, type, member)
+
+/**
+ * list_first_entry - get the first element from a list
+ * @ptr:	the list head to take the element from.
+ * @type:	the type of the struct this is embedded in.
+ * @member:	the name of the list_head within the struct.
+ *
+ * Note that the list is expected to be non-empty.
+ */
+#define list_first_entry(ptr, type, member) \
+	list_entry((ptr)->next, type, member)
+
+/**
+ * list_last_entry - get the last element from a list
+ * @ptr:	the list head to take the element from.
+ * @type:	the type of the struct this is embedded in.
+ * @member:	the name of the list_head within the struct.
+ *
+ * Note that the list is expected to be non-empty.
+ */
+#define list_last_entry(ptr, type, member) \
+	list_entry((ptr)->prev, type, member)
+
+/**
+ * list_first_entry_or_null - get the first element from a list
+ * @ptr:	the list head to take the element from.
+ * @type:	the type of the struct this is embedded in.
+ * @member:	the name of the list_head within the struct.
+ *
+ * Note that if the list is empty, it returns NULL.
+ */
+#define list_first_entry_or_null(ptr, type, member) \
+	(!list_empty(ptr) ? list_first_entry(ptr, type, member) : NULL)
+
+/**
+ * list_next_entry - get the next element in list
+ * @pos:	the type * to cursor
+ * @member:	the name of the list_head within the struct.
+ */
+#define list_next_entry(pos, member) \
+	list_entry((pos)->member.next, typeof(*(pos)), member)
+
+/**
+ * list_prev_entry - get the prev element in list
+ * @pos:	the type * to cursor
+ * @member:	the name of the list_head within the struct.
+ */
+#define list_prev_entry(pos, member) \
+	list_entry((pos)->member.prev, typeof(*(pos)), member)
+
+/**
+ * list_for_each	-	iterate over a list
+ * @pos:	the &struct list_head to use as a loop cursor.
+ * @head:	the head for your list.
+ */
+#define list_for_each(pos, head) \
+	for (pos = (head)->next; pos != (head); pos = pos->next)
+
+/**
+ * list_for_each_prev	-	iterate over a list backwards
+ * @pos:	the &struct list_head to use as a loop cursor.
+ * @head:	the head for your list.
+ */
+#define list_for_each_prev(pos, head) \
+	for (pos = (head)->prev; pos != (head); pos = pos->prev)
+
+/**
+ * list_for_each_safe - iterate over a list safe against removal of list entry
+ * @pos:	the &struct list_head to use as a loop cursor.
+ * @n:		another &struct list_head to use as temporary storage
+ * @head:	the head for your list.
+ */
+#define list_for_each_safe(pos, n, head) \
+	for (pos = (head)->next, n = pos->next; pos != (head); \
+		pos = n, n = pos->next)
+
+/**
+ * list_for_each_prev_safe - iterate over a list backwards safe against removal of list entry
+ * @pos:	the &struct list_head to use as a loop cursor.
+ * @n:		another &struct list_head to use as temporary storage
+ * @head:	the head for your list.
+ */
+#define list_for_each_prev_safe(pos, n, head) \
+	for (pos = (head)->prev, n = pos->prev; \
+	     pos != (head); \
+	     pos = n, n = pos->prev)
+
+/**
+ * list_for_each_entry	-	iterate over list of given type
+ * @pos:	the type * to use as a loop cursor.
+ * @head:	the head for your list.
+ * @member:	the name of the list_head within the struct.
+ */
+#define list_for_each_entry(pos, head, member)				\
+	for (pos = list_first_entry(head, typeof(*pos), member);	\
+	     &pos->member != (head);					\
+	     pos = list_next_entry(pos, member))
+
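+/*
+ * Editor's note -- illustrative sketch, not part of the original header:
+ * typical use of an embedded list_head.  "struct item" and the variable
+ * names are made up.
+ *
+ *	struct item {
+ *		int value;
+ *		struct list_head node;
+ *	};
+ *
+ *	LIST_HEAD(items);
+ *	struct item *it = malloc(sizeof(*it));
+ *
+ *	it->value = 42;
+ *	list_add_tail(&it->node, &items);
+ *	list_for_each_entry(it, &items, node)
+ *		printf("%d\n", it->value);
+ */
+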
+/**
+ * list_for_each_entry_reverse - iterate backwards over list of given type.
+ * @pos:	the type * to use as a loop cursor.
+ * @head:	the head for your list.
+ * @member:	the name of the list_head within the struct.
+ */
+#define list_for_each_entry_reverse(pos, head, member)			\
+	for (pos = list_last_entry(head, typeof(*pos), member);		\
+	     &pos->member != (head); 					\
+	     pos = list_prev_entry(pos, member))
+
+/**
+ * list_prepare_entry - prepare a pos entry for use in list_for_each_entry_continue()
+ * @pos:	the type * to use as a start point
+ * @head:	the head of the list
+ * @member:	the name of the list_head within the struct.
+ *
+ * Prepares a pos entry for use as a start point in list_for_each_entry_continue().
+ */
+#define list_prepare_entry(pos, head, member) \
+	((pos) ? : list_entry(head, typeof(*pos), member))
+
+/**
+ * list_for_each_entry_continue - continue iteration over list of given type
+ * @pos:	the type * to use as a loop cursor.
+ * @head:	the head for your list.
+ * @member:	the name of the list_head within the struct.
+ *
+ * Continue to iterate over list of given type, continuing after
+ * the current position.
+ */
+#define list_for_each_entry_continue(pos, head, member) 		\
+	for (pos = list_next_entry(pos, member);			\
+	     &pos->member != (head);					\
+	     pos = list_next_entry(pos, member))
+
+/**
+ * list_for_each_entry_continue_reverse - iterate backwards from the given point
+ * @pos:	the type * to use as a loop cursor.
+ * @head:	the head for your list.
+ * @member:	the name of the list_head within the struct.
+ *
+ * Start to iterate over list of given type backwards, continuing after
+ * the current position.
+ */
+#define list_for_each_entry_continue_reverse(pos, head, member)		\
+	for (pos = list_prev_entry(pos, member);			\
+	     &pos->member != (head);					\
+	     pos = list_prev_entry(pos, member))
+
+/**
+ * list_for_each_entry_from - iterate over list of given type from the current point
+ * @pos:	the type * to use as a loop cursor.
+ * @head:	the head for your list.
+ * @member:	the name of the list_head within the struct.
+ *
+ * Iterate over list of given type, continuing from current position.
+ */
+#define list_for_each_entry_from(pos, head, member) 			\
+	for (; &pos->member != (head);					\
+	     pos = list_next_entry(pos, member))
+
+/**
+ * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry
+ * @pos:	the type * to use as a loop cursor.
+ * @n:		another type * to use as temporary storage
+ * @head:	the head for your list.
+ * @member:	the name of the list_head within the struct.
+ */
+#define list_for_each_entry_safe(pos, n, head, member)			\
+	for (pos = list_first_entry(head, typeof(*pos), member),	\
+		n = list_next_entry(pos, member);			\
+	     &pos->member != (head); 					\
+	     pos = n, n = list_next_entry(n, member))
+
+/**
+ * list_for_each_entry_safe_continue - continue list iteration safe against removal
+ * @pos:	the type * to use as a loop cursor.
+ * @n:		another type * to use as temporary storage
+ * @head:	the head for your list.
+ * @member:	the name of the list_head within the struct.
+ *
+ * Iterate over list of given type, continuing after current point,
+ * safe against removal of list entry.
+ */
+#define list_for_each_entry_safe_continue(pos, n, head, member) 		\
+	for (pos = list_next_entry(pos, member), 				\
+		n = list_next_entry(pos, member);				\
+	     &pos->member != (head);						\
+	     pos = n, n = list_next_entry(n, member))
+
+/**
+ * list_for_each_entry_safe_from - iterate over list from current point safe against removal
+ * @pos:	the type * to use as a loop cursor.
+ * @n:		another type * to use as temporary storage
+ * @head:	the head for your list.
+ * @member:	the name of the list_head within the struct.
+ *
+ * Iterate over list of given type from current point, safe against
+ * removal of list entry.
+ */
+#define list_for_each_entry_safe_from(pos, n, head, member) 			\
+	for (n = list_next_entry(pos, member);					\
+	     &pos->member != (head);						\
+	     pos = n, n = list_next_entry(n, member))
+
+/**
+ * list_for_each_entry_safe_reverse - iterate backwards over list safe against removal
+ * @pos:	the type * to use as a loop cursor.
+ * @n:		another type * to use as temporary storage
+ * @head:	the head for your list.
+ * @member:	the name of the list_head within the struct.
+ *
+ * Iterate backwards over list of given type, safe against removal
+ * of list entry.
+ */
+#define list_for_each_entry_safe_reverse(pos, n, head, member)		\
+	for (pos = list_last_entry(head, typeof(*pos), member),		\
+		n = list_prev_entry(pos, member);			\
+	     &pos->member != (head); 					\
+	     pos = n, n = list_prev_entry(n, member))
+
+/**
+ * list_safe_reset_next - reset a stale list_for_each_entry_safe loop
+ * @pos:	the loop cursor used in the list_for_each_entry_safe loop
+ * @n:		temporary storage used in list_for_each_entry_safe
+ * @member:	the name of the list_head within the struct.
+ *
+ * list_safe_reset_next is not safe to use in general if the list may be
+ * modified concurrently (e.g. the lock is dropped in the loop body). An
+ * exception to this is if the cursor element (pos) is pinned in the list,
+ * and list_safe_reset_next is called after re-taking the lock and before
+ * completing the current iteration of the loop body.
+ */
+#define list_safe_reset_next(pos, n, member)				\
+	n = list_next_entry(pos, member)
+
+/*
+ * Double linked lists with a single pointer list head.
+ * Mostly useful for hash tables where the two pointer list head is
+ * too wasteful.
+ * You lose the ability to access the tail in O(1).
+ */
+
+#define HLIST_HEAD_INIT { .first = NULL }
+#define HLIST_HEAD(name) struct hlist_head name = {  .first = NULL }
+#define INIT_HLIST_HEAD(ptr) ((ptr)->first = NULL)
+static inline void INIT_HLIST_NODE(struct hlist_node *h)
+{
+	h->next = NULL;
+	h->pprev = NULL;
+}
+
+static inline int hlist_unhashed(const struct hlist_node *h)
+{
+	return !h->pprev;
+}
+
+static inline int hlist_empty(const struct hlist_head *h)
+{
+	return !h->first;
+}
+
+static inline void __hlist_del(struct hlist_node *n)
+{
+	struct hlist_node *next = n->next;
+	struct hlist_node **pprev = n->pprev;
+	*pprev = next;
+	if (next)
+		next->pprev = pprev;
+}
+
+static inline void hlist_del(struct hlist_node *n)
+{
+	__hlist_del(n);
+	n->next = NULL;
+	n->pprev = NULL;
+}
+
+static inline void hlist_del_init(struct hlist_node *n)
+{
+	if (!hlist_unhashed(n)) {
+		__hlist_del(n);
+		INIT_HLIST_NODE(n);
+	}
+}
+
+static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h)
+{
+	struct hlist_node *first = h->first;
+	n->next = first;
+	if (first)
+		first->pprev = &n->next;
+	h->first = n;
+	n->pprev = &h->first;
+}
+
+/* next must be != NULL */
+static inline void hlist_add_before(struct hlist_node *n,
+					struct hlist_node *next)
+{
+	n->pprev = next->pprev;
+	n->next = next;
+	next->pprev = &n->next;
+	*(n->pprev) = n;
+}
+
+static inline void hlist_add_behind(struct hlist_node *n,
+				    struct hlist_node *prev)
+{
+	n->next = prev->next;
+	prev->next = n;
+	n->pprev = &prev->next;
+
+	if (n->next)
+		n->next->pprev  = &n->next;
+}
+
+/* after this, the node appears to be on some hlist and hlist_del() will work */
+static inline void hlist_add_fake(struct hlist_node *n)
+{
+	n->pprev = &n->next;
+}
+
+/*
+ * Move a list from one list head to another. Fix up the pprev
+ * reference of the first entry if it exists.
+ */
+static inline void hlist_move_list(struct hlist_head *old,
+				   struct hlist_head *new)
+{
+	new->first = old->first;
+	if (new->first)
+		new->first->pprev = &new->first;
+	old->first = NULL;
+}
+
+#define hlist_entry(ptr, type, member) container_of(ptr,type,member)
+
+#define hlist_for_each(pos, head) \
+	for (pos = (head)->first; pos ; pos = pos->next)
+
+#define hlist_for_each_safe(pos, n, head) \
+	for (pos = (head)->first; pos && ({ n = pos->next; 1; }); \
+	     pos = n)
+
+#define hlist_entry_safe(ptr, type, member) \
+	({ typeof(ptr) ____ptr = (ptr); \
+	   ____ptr ? hlist_entry(____ptr, type, member) : NULL; \
+	})
+
+/**
+ * hlist_for_each_entry	- iterate over list of given type
+ * @pos:	the type * to use as a loop cursor.
+ * @head:	the head for your list.
+ * @member:	the name of the hlist_node within the struct.
+ */
+#define hlist_for_each_entry(pos, head, member)				\
+	for (pos = hlist_entry_safe((head)->first, typeof(*(pos)), member);\
+	     pos;							\
+	     pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member))
+
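+/*
+ * Editor's note -- illustrative sketch, not part of the original header:
+ * hlist heads typically serve as hash buckets.  "struct obj", the bucket
+ * count and the lookup key are made up.
+ *
+ *	struct hlist_head buckets[64];
+ *	struct obj {
+ *		int key;
+ *		struct hlist_node hash;
+ *	} *o;
+ *
+ *	hlist_add_head(&o->hash, &buckets[o->key % 64]);
+ *
+ *	hlist_for_each_entry(o, &buckets[key % 64], hash)
+ *		if (o->key == key)
+ *			break;
+ */
+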
+/**
+ * hlist_for_each_entry_continue - iterate over a hlist continuing after current point
+ * @pos:	the type * to use as a loop cursor.
+ * @member:	the name of the hlist_node within the struct.
+ */
+#define hlist_for_each_entry_continue(pos, member)			\
+	for (pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member);\
+	     pos;							\
+	     pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member))
+
+/**
+ * hlist_for_each_entry_from - iterate over a hlist continuing from current point
+ * @pos:	the type * to use as a loop cursor.
+ * @member:	the name of the hlist_node within the struct.
+ */
+#define hlist_for_each_entry_from(pos, member)				\
+	for (; pos;							\
+	     pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member))
+
+/**
+ * hlist_for_each_entry_safe - iterate over list of given type safe against removal of list entry
+ * @pos:	the type * to use as a loop cursor.
+ * @n:		another &struct hlist_node to use as temporary storage
+ * @head:	the head for your list.
+ * @member:	the name of the hlist_node within the struct.
+ */
+#define hlist_for_each_entry_safe(pos, n, head, member) 		\
+	for (pos = hlist_entry_safe((head)->first, typeof(*pos), member);\
+	     pos && ({ n = pos->member.next; 1; });			\
+	     pos = hlist_entry_safe(n, typeof(*pos), member))
+
+#endif
diff --git a/kvmtool/include/linux/prefetch.h b/kvmtool/include/linux/prefetch.h
new file mode 100644
index 0000000..62f6788
--- /dev/null
+++ b/kvmtool/include/linux/prefetch.h
@@ -0,0 +1,6 @@
+#ifndef KVM__LINUX_PREFETCH_H
+#define KVM__LINUX_PREFETCH_H
+
+static inline void prefetch(void *a __attribute__((unused))) { }
+
+#endif
diff --git a/kvmtool/include/linux/psci.h b/kvmtool/include/linux/psci.h
new file mode 100644
index 0000000..310d83e
--- /dev/null
+++ b/kvmtool/include/linux/psci.h
@@ -0,0 +1,90 @@
+/*
+ * ARM Power State and Coordination Interface (PSCI) header
+ *
+ * This header holds common PSCI defines and macros shared
+ * by: ARM kernel, ARM64 kernel, KVM ARM/ARM64 and user space.
+ *
+ * Copyright (C) 2014 Linaro Ltd.
+ * Author: Anup Patel <anup.patel@linaro.org>
+ */
+
+#ifndef _UAPI_LINUX_PSCI_H
+#define _UAPI_LINUX_PSCI_H
+
+/*
+ * PSCI v0.1 interface
+ *
+ * The PSCI v0.1 function numbers are implementation-defined.
+ *
+ * Only PSCI return values such as: SUCCESS, NOT_SUPPORTED,
+ * INVALID_PARAMS, and DENIED defined below are applicable
+ * to PSCI v0.1.
+ */
+
+/* PSCI v0.2 interface */
+#define PSCI_0_2_FN_BASE			0x84000000
+#define PSCI_0_2_FN(n)				(PSCI_0_2_FN_BASE + (n))
+#define PSCI_0_2_64BIT				0x40000000
+#define PSCI_0_2_FN64_BASE			\
+					(PSCI_0_2_FN_BASE + PSCI_0_2_64BIT)
+#define PSCI_0_2_FN64(n)			(PSCI_0_2_FN64_BASE + (n))
+
+#define PSCI_0_2_FN_PSCI_VERSION		PSCI_0_2_FN(0)
+#define PSCI_0_2_FN_CPU_SUSPEND			PSCI_0_2_FN(1)
+#define PSCI_0_2_FN_CPU_OFF			PSCI_0_2_FN(2)
+#define PSCI_0_2_FN_CPU_ON			PSCI_0_2_FN(3)
+#define PSCI_0_2_FN_AFFINITY_INFO		PSCI_0_2_FN(4)
+#define PSCI_0_2_FN_MIGRATE			PSCI_0_2_FN(5)
+#define PSCI_0_2_FN_MIGRATE_INFO_TYPE		PSCI_0_2_FN(6)
+#define PSCI_0_2_FN_MIGRATE_INFO_UP_CPU		PSCI_0_2_FN(7)
+#define PSCI_0_2_FN_SYSTEM_OFF			PSCI_0_2_FN(8)
+#define PSCI_0_2_FN_SYSTEM_RESET		PSCI_0_2_FN(9)
+
+#define PSCI_0_2_FN64_CPU_SUSPEND		PSCI_0_2_FN64(1)
+#define PSCI_0_2_FN64_CPU_ON			PSCI_0_2_FN64(3)
+#define PSCI_0_2_FN64_AFFINITY_INFO		PSCI_0_2_FN64(4)
+#define PSCI_0_2_FN64_MIGRATE			PSCI_0_2_FN64(5)
+#define PSCI_0_2_FN64_MIGRATE_INFO_UP_CPU	PSCI_0_2_FN64(7)
+
+/* PSCI v0.2 power state encoding for CPU_SUSPEND function */
+#define PSCI_0_2_POWER_STATE_ID_MASK		0xffff
+#define PSCI_0_2_POWER_STATE_ID_SHIFT		0
+#define PSCI_0_2_POWER_STATE_TYPE_SHIFT		16
+#define PSCI_0_2_POWER_STATE_TYPE_MASK		\
+				(0x1 << PSCI_0_2_POWER_STATE_TYPE_SHIFT)
+#define PSCI_0_2_POWER_STATE_AFFL_SHIFT		24
+#define PSCI_0_2_POWER_STATE_AFFL_MASK		\
+				(0x3 << PSCI_0_2_POWER_STATE_AFFL_SHIFT)
+
+/* PSCI v0.2 affinity level state returned by AFFINITY_INFO */
+#define PSCI_0_2_AFFINITY_LEVEL_ON		0
+#define PSCI_0_2_AFFINITY_LEVEL_OFF		1
+#define PSCI_0_2_AFFINITY_LEVEL_ON_PENDING	2
+
+/* PSCI v0.2 multicore support in Trusted OS returned by MIGRATE_INFO_TYPE */
+#define PSCI_0_2_TOS_UP_MIGRATE			0
+#define PSCI_0_2_TOS_UP_NO_MIGRATE		1
+#define PSCI_0_2_TOS_MP				2
+
+/* PSCI version decoding (independent of PSCI version) */
+#define PSCI_VERSION_MAJOR_SHIFT		16
+#define PSCI_VERSION_MINOR_MASK			\
+		((1U << PSCI_VERSION_MAJOR_SHIFT) - 1)
+#define PSCI_VERSION_MAJOR_MASK			~PSCI_VERSION_MINOR_MASK
+#define PSCI_VERSION_MAJOR(ver)			\
+		(((ver) & PSCI_VERSION_MAJOR_MASK) >> PSCI_VERSION_MAJOR_SHIFT)
+#define PSCI_VERSION_MINOR(ver)			\
+		((ver) & PSCI_VERSION_MINOR_MASK)
+
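+/*
+ * Editor's note, worked example: a version word of 0x00010000 decodes to
+ * major 1 (bits 31:16) and minor 0 (bits 15:0), i.e.
+ * PSCI_VERSION_MAJOR(0x00010000) == 1 and PSCI_VERSION_MINOR(0x00010000) == 0.
+ */
+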
+/* PSCI return values (inclusive of all PSCI versions) */
+#define PSCI_RET_SUCCESS			0
+#define PSCI_RET_NOT_SUPPORTED			-1
+#define PSCI_RET_INVALID_PARAMS			-2
+#define PSCI_RET_DENIED				-3
+#define PSCI_RET_ALREADY_ON			-4
+#define PSCI_RET_ON_PENDING			-5
+#define PSCI_RET_INTERNAL_FAILURE		-6
+#define PSCI_RET_NOT_PRESENT			-7
+#define PSCI_RET_DISABLED			-8
+
+#endif /* _UAPI_LINUX_PSCI_H */
diff --git a/kvmtool/include/linux/rbtree.h b/kvmtool/include/linux/rbtree.h
new file mode 100644
index 0000000..33adf78
--- /dev/null
+++ b/kvmtool/include/linux/rbtree.h
@@ -0,0 +1,108 @@
+/*
+  Red Black Trees
+  (C) 1999  Andrea Arcangeli <andrea@suse.de>
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; either version 2 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+  linux/include/linux/rbtree.h
+
+  To use rbtrees you'll have to implement your own insert and search cores.
+  This avoids callbacks, which would hurt performance dramatically.
+  It is not the cleanest way, but it is how C (unlike C++) gets both
+  performance and genericity...
+
+  See Documentation/rbtree.txt for documentation and samples.
+*/
+
+#ifndef	_LINUX_RBTREE_H
+#define	_LINUX_RBTREE_H
+
+#include <linux/kernel.h>
+#include <linux/stddef.h>
+
+struct rb_node {
+	unsigned long  __rb_parent_color;
+	struct rb_node *rb_right;
+	struct rb_node *rb_left;
+} __attribute__((aligned(sizeof(long))));
+    /* The alignment might seem pointless, but allegedly CRIS needs it */
+
+struct rb_root {
+	struct rb_node *rb_node;
+};
+
+
+#define rb_parent(r)   ((struct rb_node *)((r)->__rb_parent_color & ~3))
+
+#define RB_ROOT	{ NULL, }
+#define	rb_entry(ptr, type, member) container_of(ptr, type, member)
+
+#define RB_EMPTY_ROOT(root)  ((root)->rb_node == NULL)
+
+/* 'empty' nodes are nodes that are known not to be inserted in an rbtree */
+#define RB_EMPTY_NODE(node)  \
+	((node)->__rb_parent_color == (unsigned long)(node))
+#define RB_CLEAR_NODE(node)  \
+	((node)->__rb_parent_color = (unsigned long)(node))
+
+
+extern void rb_insert_color(struct rb_node *, struct rb_root *);
+extern void rb_erase(struct rb_node *, struct rb_root *);
+
+
+/* Find logical next and previous nodes in a tree */
+extern struct rb_node *rb_next(const struct rb_node *);
+extern struct rb_node *rb_prev(const struct rb_node *);
+extern struct rb_node *rb_first(const struct rb_root *);
+extern struct rb_node *rb_last(const struct rb_root *);
+
+/* Postorder iteration - always visit the parent after its children */
+extern struct rb_node *rb_first_postorder(const struct rb_root *);
+extern struct rb_node *rb_next_postorder(const struct rb_node *);
+
+/* Fast replacement of a single node without remove/rebalance/add/rebalance */
+extern void rb_replace_node(struct rb_node *victim, struct rb_node *new, 
+			    struct rb_root *root);
+
+static inline void rb_link_node(struct rb_node * node, struct rb_node * parent,
+				struct rb_node ** rb_link)
+{
+	node->__rb_parent_color = (unsigned long)parent;
+	node->rb_left = node->rb_right = NULL;
+
+	*rb_link = node;
+}
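+
+/*
+ * Editor's note -- illustrative sketch of the insert core that callers
+ * must provide (see the comment at the top of this file).  "struct thing"
+ * with an integer key is a made-up example type.
+ *
+ *	struct rb_node **link = &root->rb_node, *parent = NULL;
+ *
+ *	while (*link) {
+ *		struct thing *t = rb_entry(*link, struct thing, rb);
+ *
+ *		parent = *link;
+ *		if (new->key < t->key)
+ *			link = &(*link)->rb_left;
+ *		else
+ *			link = &(*link)->rb_right;
+ *	}
+ *	rb_link_node(&new->rb, parent, link);
+ *	rb_insert_color(&new->rb, root);
+ */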
+
+#define rb_entry_safe(ptr, type, member) \
+	({ typeof(ptr) ____ptr = (ptr); \
+	   ____ptr ? rb_entry(____ptr, type, member) : NULL; \
+	})
+
+/**
+ * rbtree_postorder_for_each_entry_safe - iterate over rb_root in post order of
+ * given type safe against removal of rb_node entry
+ *
+ * @pos:	the 'type *' to use as a loop cursor.
+ * @n:		another 'type *' to use as temporary storage
+ * @root:	'rb_root *' of the rbtree.
+ * @field:	the name of the rb_node field within 'type'.
+ */
+#define rbtree_postorder_for_each_entry_safe(pos, n, root, field) \
+	for (pos = rb_entry_safe(rb_first_postorder(root), typeof(*pos), field); \
+	     pos && ({ n = rb_entry_safe(rb_next_postorder(&pos->field), \
+			typeof(*pos), field); 1; }); \
+	     pos = n)
+
+#endif	/* _LINUX_RBTREE_H */
diff --git a/kvmtool/include/linux/rbtree_augmented.h b/kvmtool/include/linux/rbtree_augmented.h
new file mode 100644
index 0000000..378c5ee
--- /dev/null
+++ b/kvmtool/include/linux/rbtree_augmented.h
@@ -0,0 +1,242 @@
+/*
+  Red Black Trees
+  (C) 1999  Andrea Arcangeli <andrea@suse.de>
+  (C) 2002  David Woodhouse <dwmw2@infradead.org>
+  (C) 2012  Michel Lespinasse <walken@google.com>
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; either version 2 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+  linux/include/linux/rbtree_augmented.h
+*/
+
+#ifndef _LINUX_RBTREE_AUGMENTED_H
+#define _LINUX_RBTREE_AUGMENTED_H
+
+#include <linux/compiler.h>
+#include <linux/rbtree.h>
+
+/*
+ * Please note - only struct rb_augment_callbacks and the prototypes for
+ * rb_insert_augmented() and rb_erase_augmented() are intended to be public.
+ * The rest are implementation details you are not expected to depend on.
+ *
+ * See Documentation/rbtree.txt for documentation and samples.
+ */
+
+struct rb_augment_callbacks {
+	void (*propagate)(struct rb_node *node, struct rb_node *stop);
+	void (*copy)(struct rb_node *old, struct rb_node *new);
+	void (*rotate)(struct rb_node *old, struct rb_node *new);
+};
+
+extern void __rb_insert_augmented(struct rb_node *node, struct rb_root *root,
+	void (*augment_rotate)(struct rb_node *old, struct rb_node *new));
+/*
+ * Fixup the rbtree and update the augmented information when rebalancing.
+ *
+ * On insertion, the user must update the augmented information on the path
+ * leading to the inserted node, then call rb_link_node() as usual and
+ * rb_augment_inserted() instead of the usual rb_insert_color() call.
+ * If rb_augment_inserted() rebalances the rbtree, it will callback into
+ * a user provided function to update the augmented information on the
+ * affected subtrees.
+ */
+static inline void
+rb_insert_augmented(struct rb_node *node, struct rb_root *root,
+		    const struct rb_augment_callbacks *augment)
+{
+	__rb_insert_augmented(node, root, augment->rotate);
+}
+
+#define RB_DECLARE_CALLBACKS(rbstatic, rbname, rbstruct, rbfield,	\
+			     rbtype, rbaugmented, rbcompute)		\
+static inline void							\
+rbname ## _propagate(struct rb_node *rb, struct rb_node *stop)		\
+{									\
+	while (rb != stop) {						\
+		rbstruct *node = rb_entry(rb, rbstruct, rbfield);	\
+		rbtype augmented = rbcompute(node);			\
+		if (node->rbaugmented == augmented)			\
+			break;						\
+		node->rbaugmented = augmented;				\
+		rb = rb_parent(&node->rbfield);				\
+	}								\
+}									\
+static inline void							\
+rbname ## _copy(struct rb_node *rb_old, struct rb_node *rb_new)		\
+{									\
+	rbstruct *old = rb_entry(rb_old, rbstruct, rbfield);		\
+	rbstruct *new = rb_entry(rb_new, rbstruct, rbfield);		\
+	new->rbaugmented = old->rbaugmented;				\
+}									\
+static void								\
+rbname ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new)	\
+{									\
+	rbstruct *old = rb_entry(rb_old, rbstruct, rbfield);		\
+	rbstruct *new = rb_entry(rb_new, rbstruct, rbfield);		\
+	new->rbaugmented = old->rbaugmented;				\
+	old->rbaugmented = rbcompute(old);				\
+}									\
+rbstatic const struct rb_augment_callbacks rbname = {			\
+	rbname ## _propagate, rbname ## _copy, rbname ## _rotate	\
+};
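+
+/*
+ * Editor's note -- illustrative sketch, not part of the original header:
+ * declaring callbacks for a node that caches the maximum "last" value of
+ * its subtree, as an interval tree would.  "struct itnode" and
+ * compute_subtree_last() are hypothetical.
+ *
+ *	struct itnode {
+ *		struct rb_node rb;
+ *		unsigned long last, subtree_last;
+ *	};
+ *
+ *	// returns the max of n->last and the children's cached subtree_last
+ *	static unsigned long compute_subtree_last(struct itnode *n);
+ *
+ *	RB_DECLARE_CALLBACKS(static, itnode_cb, struct itnode, rb,
+ *			     unsigned long, subtree_last,
+ *			     compute_subtree_last)
+ */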
+
+
+#define	RB_RED		0
+#define	RB_BLACK	1
+
+#define __rb_parent(pc)    ((struct rb_node *)(pc & ~3))
+
+#define __rb_color(pc)     ((pc) & 1)
+#define __rb_is_black(pc)  __rb_color(pc)
+#define __rb_is_red(pc)    (!__rb_color(pc))
+#define rb_color(rb)       __rb_color((rb)->__rb_parent_color)
+#define rb_is_red(rb)      __rb_is_red((rb)->__rb_parent_color)
+#define rb_is_black(rb)    __rb_is_black((rb)->__rb_parent_color)
+
+static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p)
+{
+	rb->__rb_parent_color = rb_color(rb) | (unsigned long)p;
+}
+
+static inline void rb_set_parent_color(struct rb_node *rb,
+				       struct rb_node *p, int color)
+{
+	rb->__rb_parent_color = (unsigned long)p | color;
+}
+
+static inline void
+__rb_change_child(struct rb_node *old, struct rb_node *new,
+		  struct rb_node *parent, struct rb_root *root)
+{
+	if (parent) {
+		if (parent->rb_left == old)
+			parent->rb_left = new;
+		else
+			parent->rb_right = new;
+	} else
+		root->rb_node = new;
+}
+
+extern void __rb_erase_color(struct rb_node *parent, struct rb_root *root,
+	void (*augment_rotate)(struct rb_node *old, struct rb_node *new));
+
+static __always_inline struct rb_node *
+__rb_erase_augmented(struct rb_node *node, struct rb_root *root,
+		     const struct rb_augment_callbacks *augment)
+{
+	struct rb_node *child = node->rb_right, *tmp = node->rb_left;
+	struct rb_node *parent, *rebalance;
+	unsigned long pc;
+
+	if (!tmp) {
+		/*
+		 * Case 1: node to erase has no more than 1 child (easy!)
+		 *
+		 * Note that if there is one child it must be red due to 5)
+		 * and node must be black due to 4). We adjust colors locally
+		 * so as to bypass __rb_erase_color() later on.
+		 */
+		pc = node->__rb_parent_color;
+		parent = __rb_parent(pc);
+		__rb_change_child(node, child, parent, root);
+		if (child) {
+			child->__rb_parent_color = pc;
+			rebalance = NULL;
+		} else
+			rebalance = __rb_is_black(pc) ? parent : NULL;
+		tmp = parent;
+	} else if (!child) {
+		/* Still case 1, but this time the child is node->rb_left */
+		tmp->__rb_parent_color = pc = node->__rb_parent_color;
+		parent = __rb_parent(pc);
+		__rb_change_child(node, tmp, parent, root);
+		rebalance = NULL;
+		tmp = parent;
+	} else {
+		struct rb_node *successor = child, *child2;
+		tmp = child->rb_left;
+		if (!tmp) {
+			/*
+			 * Case 2: node's successor is its right child
+			 *
+			 *    (n)          (s)
+			 *    / \          / \
+			 *  (x) (s)  ->  (x) (c)
+			 *        \
+			 *        (c)
+			 */
+			parent = successor;
+			child2 = successor->rb_right;
+			augment->copy(node, successor);
+		} else {
+			/*
+			 * Case 3: node's successor is leftmost under
+			 * node's right child subtree
+			 *
+			 *    (n)          (s)
+			 *    / \          / \
+			 *  (x) (y)  ->  (x) (y)
+			 *      /            /
+			 *    (p)          (p)
+			 *    /            /
+			 *  (s)          (c)
+			 *    \
+			 *    (c)
+			 */
+			do {
+				parent = successor;
+				successor = tmp;
+				tmp = tmp->rb_left;
+			} while (tmp);
+			parent->rb_left = child2 = successor->rb_right;
+			successor->rb_right = child;
+			rb_set_parent(child, successor);
+			augment->copy(node, successor);
+			augment->propagate(parent, successor);
+		}
+
+		successor->rb_left = tmp = node->rb_left;
+		rb_set_parent(tmp, successor);
+
+		pc = node->__rb_parent_color;
+		tmp = __rb_parent(pc);
+		__rb_change_child(node, successor, tmp, root);
+		if (child2) {
+			successor->__rb_parent_color = pc;
+			rb_set_parent_color(child2, parent, RB_BLACK);
+			rebalance = NULL;
+		} else {
+			unsigned long pc2 = successor->__rb_parent_color;
+			successor->__rb_parent_color = pc;
+			rebalance = __rb_is_black(pc2) ? parent : NULL;
+		}
+		tmp = successor;
+	}
+
+	augment->propagate(tmp, NULL);
+	return rebalance;
+}
+
+static __always_inline void
+rb_erase_augmented(struct rb_node *node, struct rb_root *root,
+		   const struct rb_augment_callbacks *augment)
+{
+	struct rb_node *rebalance = __rb_erase_augmented(node, root, augment);
+	if (rebalance)
+		__rb_erase_color(rebalance, root, augment->rotate);
+}
+
+#endif	/* _LINUX_RBTREE_AUGMENTED_H */
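
For reference, the RB_DECLARE_CALLBACKS macro whose expansion ends above is typically paired with rb_erase_augmented() along these lines. This is a minimal sketch, assuming the macro takes its parameters in the order (rbstatic, rbname, rbstruct, rbfield, rbtype, rbaugmented, rbcompute); mynode, its fields and the compute helper are illustrative names, not part of this header:

struct mynode {
	struct rb_node rb;
	unsigned long start, last;
	unsigned long subtree_last;	/* augmented: max 'last' in this subtree */
};

static inline unsigned long mynode_compute_subtree_last(struct mynode *n)
{
	unsigned long max = n->last, tmp;

	if (n->rb.rb_left) {
		tmp = rb_entry(n->rb.rb_left, struct mynode, rb)->subtree_last;
		if (tmp > max)
			max = tmp;
	}
	if (n->rb.rb_right) {
		tmp = rb_entry(n->rb.rb_right, struct mynode, rb)->subtree_last;
		if (tmp > max)
			max = tmp;
	}
	return max;
}

RB_DECLARE_CALLBACKS(static, mynode_augment, struct mynode, rb,
		     unsigned long, subtree_last, mynode_compute_subtree_last)

/* On removal, augmented values are fixed up along the rebalance path: */
static void mynode_erase(struct mynode *node, struct rb_root *root)
{
	rb_erase_augmented(&node->rb, root, &mynode_augment);
}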
diff --git a/kvmtool/include/linux/sizes.h b/kvmtool/include/linux/sizes.h
new file mode 100644
index 0000000..ce3e815
--- /dev/null
+++ b/kvmtool/include/linux/sizes.h
@@ -0,0 +1,47 @@
+/*
+ * include/linux/sizes.h
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef __LINUX_SIZES_H__
+#define __LINUX_SIZES_H__
+
+#define SZ_1				0x00000001
+#define SZ_2				0x00000002
+#define SZ_4				0x00000004
+#define SZ_8				0x00000008
+#define SZ_16				0x00000010
+#define SZ_32				0x00000020
+#define SZ_64				0x00000040
+#define SZ_128				0x00000080
+#define SZ_256				0x00000100
+#define SZ_512				0x00000200
+
+#define SZ_1K				0x00000400
+#define SZ_2K				0x00000800
+#define SZ_4K				0x00001000
+#define SZ_8K				0x00002000
+#define SZ_16K				0x00004000
+#define SZ_32K				0x00008000
+#define SZ_64K				0x00010000
+#define SZ_128K				0x00020000
+#define SZ_256K				0x00040000
+#define SZ_512K				0x00080000
+
+#define SZ_1M				0x00100000
+#define SZ_2M				0x00200000
+#define SZ_4M				0x00400000
+#define SZ_8M				0x00800000
+#define SZ_16M				0x01000000
+#define SZ_32M				0x02000000
+#define SZ_64M				0x04000000
+#define SZ_128M				0x08000000
+#define SZ_256M				0x10000000
+#define SZ_512M				0x20000000
+
+#define SZ_1G				0x40000000
+#define SZ_2G				0x80000000
+
+#endif /* __LINUX_SIZES_H__ */
diff --git a/kvmtool/include/linux/stddef.h b/kvmtool/include/linux/stddef.h
new file mode 100644
index 0000000..39da808
--- /dev/null
+++ b/kvmtool/include/linux/stddef.h
@@ -0,0 +1,10 @@
+#ifndef _LINUX_STDDEF_H
+#define _LINUX_STDDEF_H
+
+#undef NULL
+#define NULL ((void *)0)
+
+#undef offsetof
+#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER)
+
+#endif
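
As a quick illustration (a userspace sketch; struct packet is a made-up example), offsetof() yields the byte distance from the start of a structure to one of its members, the primitive that rb_entry()/container_of-style macros subtract to get from a member pointer back to the enclosing object:

#include <stdio.h>
#include <stddef.h>	/* the standard offsetof uses the same technique */

struct packet {
	int hdr;
	char payload[16];
};

int main(void)
{
	/* Typically prints 4 on ABIs where int is 4 bytes. */
	printf("payload offset = %zu\n", offsetof(struct packet, payload));
	return 0;
}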
diff --git a/kvmtool/include/linux/stringify.h b/kvmtool/include/linux/stringify.h
new file mode 100644
index 0000000..841cec8
--- /dev/null
+++ b/kvmtool/include/linux/stringify.h
@@ -0,0 +1,12 @@
+#ifndef __LINUX_STRINGIFY_H
+#define __LINUX_STRINGIFY_H
+
+/* Indirect stringification.  Doing two levels allows the parameter to be a
+ * macro itself.  For example, compile with -DFOO=bar, __stringify(FOO)
+ * converts to "bar".
+ */
+
+#define __stringify_1(x...)	#x
+#define __stringify(x...)	__stringify_1(x)
+
+#endif	/* !__LINUX_STRINGIFY_H */
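
A short demonstration of why the extra level matters (sketch): a single # stringifies the macro's name, while the two-level form stringifies its expansion.

#define BAR baz
#define direct(x)	#x

const char *a = direct(BAR);		/* "BAR" - no expansion */
const char *b = __stringify(BAR);	/* "baz" - expanded first */
const char *c = __stringify(1 + 2);	/* "1 + 2" */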
diff --git a/kvmtool/include/linux/types.h b/kvmtool/include/linux/types.h
new file mode 100644
index 0000000..5e20f10
--- /dev/null
+++ b/kvmtool/include/linux/types.h
@@ -0,0 +1,51 @@
+#ifndef LINUX_TYPES_H
+#define LINUX_TYPES_H
+
+#include <kvm/compiler.h>
+#define __SANE_USERSPACE_TYPES__	/* For PPC64, to get LL64 types */
+#include <asm/types.h>
+
+typedef __u64 u64;
+typedef __s64 s64;
+
+typedef __u32 u32;
+typedef __s32 s32;
+
+typedef __u16 u16;
+typedef __s16 s16;
+
+typedef __u8  u8;
+typedef __s8  s8;
+
+#ifdef __CHECKER__
+#define __bitwise__ __attribute__((bitwise))
+#else
+#define __bitwise__
+#endif
+#ifdef __CHECK_ENDIAN__
+#define __bitwise __bitwise__
+#else
+#define __bitwise
+#endif
+
+
+typedef __u16 __bitwise __le16;
+typedef __u16 __bitwise __be16;
+typedef __u32 __bitwise __le32;
+typedef __u32 __bitwise __be32;
+typedef __u64 __bitwise __le64;
+typedef __u64 __bitwise __be64;
+
+struct list_head {
+	struct list_head *next, *prev;
+};
+
+struct hlist_head {
+	struct hlist_node *first;
+};
+
+struct hlist_node {
+	struct hlist_node *next, **pprev;
+};
+
+#endif /* LINUX_TYPES_H */
diff --git a/kvmtool/include/linux/vfio.h b/kvmtool/include/linux/vfio.h
new file mode 100644
index 0000000..4e7ab4c
--- /dev/null
+++ b/kvmtool/include/linux/vfio.h
@@ -0,0 +1,719 @@
+/*
+ * VFIO API definition
+ *
+ * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
+ *     Author: Alex Williamson <alex.williamson@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef VFIO_H
+#define VFIO_H
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+
+#define VFIO_API_VERSION	0
+
+
+/* Kernel & User level defines for VFIO IOCTLs. */
+
+/* Extensions */
+
+#define VFIO_TYPE1_IOMMU		1
+#define VFIO_SPAPR_TCE_IOMMU		2
+#define VFIO_TYPE1v2_IOMMU		3
+/*
+ * IOMMU enforces DMA cache coherence (ex. PCIe NoSnoop stripping).  This
+ * capability is subject to change as groups are added or removed.
+ */
+#define VFIO_DMA_CC_IOMMU		4
+
+/* Check if EEH is supported */
+#define VFIO_EEH			5
+
+/* Two-stage IOMMU */
+#define VFIO_TYPE1_NESTING_IOMMU	6	/* Implies v2 */
+
+#define VFIO_SPAPR_TCE_v2_IOMMU		7
+
+/*
+ * The No-IOMMU IOMMU offers no translation or isolation for devices and
+ * supports no ioctls outside of VFIO_CHECK_EXTENSION.  Use of VFIO's No-IOMMU
+ * code will taint the host kernel and should be used with extreme caution.
+ */
+#define VFIO_NOIOMMU_IOMMU		8
+
+/*
+ * The IOCTL interface is designed for extensibility by embedding the
+ * structure length (argsz) and flags into structures passed between
+ * kernel and userspace.  We therefore use the _IO() macro for these
+ * defines to avoid implicitly embedding a size into the ioctl request.
+ * As structure fields are added, argsz will increase to match and flag
+ * bits will be defined to indicate additional fields with valid data.
+ * It's *always* the caller's responsibility to indicate the size of
+ * the structure passed by setting argsz appropriately.
+ */
+
+#define VFIO_TYPE	(';')
+#define VFIO_BASE	100
+
+/*
+ * For extension of INFO ioctls, VFIO makes use of a capability chain
+ * designed after PCI/e capabilities.  A flag bit indicates whether
+ * this capability chain is supported and a field defined in the fixed
+ * structure defines the offset of the first capability in the chain.
+ * This field is only valid when the corresponding bit in the flags
+ * bitmap is set.  This offset field is relative to the start of the
+ * INFO buffer, as is the next field within each capability header.
+ * The id within the header is a shared address space per INFO ioctl,
+ * while the version field is specific to the capability id.  The
+ * contents following the header are specific to the capability id.
+ */
+struct vfio_info_cap_header {
+	__u16	id;		/* Identifies capability */
+	__u16	version;	/* Version specific to the capability ID */
+	__u32	next;		/* Offset of next capability */
+};
+
+/*
+ * Callers of INFO ioctls passing insufficiently sized buffers will see
+ * the capability chain flag bit set, a zero value for the first capability
+ * offset (if available within the provided argsz), and argsz will be
+ * updated to report the necessary buffer size.  For compatibility, the
+ * INFO ioctl will not report error in this case, but the capability chain
+ * will not be available.
+ */
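
A minimal sketch of walking this chain in the buffer returned by an INFO ioctl; the helper name and bounds handling are illustrative, not part of the ABI:

static struct vfio_info_cap_header *
vfio_find_cap(void *buf, __u32 argsz, __u32 cap_offset, __u16 id)
{
	while (cap_offset &&
	       cap_offset + sizeof(struct vfio_info_cap_header) <= argsz) {
		struct vfio_info_cap_header *hdr =
			(void *)((char *)buf + cap_offset);

		if (hdr->id == id)
			return hdr;
		cap_offset = hdr->next;	/* 0 terminates the chain */
	}
	return NULL;
}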
+
+/* -------- IOCTLs for VFIO file descriptor (/dev/vfio/vfio) -------- */
+
+/**
+ * VFIO_GET_API_VERSION - _IO(VFIO_TYPE, VFIO_BASE + 0)
+ *
+ * Report the version of the VFIO API.  This allows us to bump the entire
+ * API version should we later need to add or change features in incompatible
+ * ways.
+ * Return: VFIO_API_VERSION
+ * Availability: Always
+ */
+#define VFIO_GET_API_VERSION		_IO(VFIO_TYPE, VFIO_BASE + 0)
+
+/**
+ * VFIO_CHECK_EXTENSION - _IOW(VFIO_TYPE, VFIO_BASE + 1, __u32)
+ *
+ * Check whether an extension is supported.
+ * Return: 0 if not supported, 1 (or some other positive integer) if supported.
+ * Availability: Always
+ */
+#define VFIO_CHECK_EXTENSION		_IO(VFIO_TYPE, VFIO_BASE + 1)
+
+/**
+ * VFIO_SET_IOMMU - _IOW(VFIO_TYPE, VFIO_BASE + 2, __s32)
+ *
+ * Set the iommu to the given type.  The type must be supported by an
+ * iommu driver as verified by calling CHECK_EXTENSION using the same
+ * type.  A group must be set to this file descriptor before this
+ * ioctl is available.  The IOMMU interfaces enabled by this call are
+ * specific to the value set.
+ * Return: 0 on success, -errno on failure
+ * Availability: When VFIO group attached
+ */
+#define VFIO_SET_IOMMU			_IO(VFIO_TYPE, VFIO_BASE + 2)
+
+/* -------- IOCTLs for GROUP file descriptors (/dev/vfio/$GROUP) -------- */
+
+/**
+ * VFIO_GROUP_GET_STATUS - _IOR(VFIO_TYPE, VFIO_BASE + 3,
+ *						struct vfio_group_status)
+ *
+ * Retrieve information about the group.  Fills in provided
+ * struct vfio_group_status.  Caller sets argsz.
+ * Return: 0 on success, -errno on failure.
+ * Availability: Always
+ */
+struct vfio_group_status {
+	__u32	argsz;
+	__u32	flags;
+#define VFIO_GROUP_FLAGS_VIABLE		(1 << 0)
+#define VFIO_GROUP_FLAGS_CONTAINER_SET	(1 << 1)
+};
+#define VFIO_GROUP_GET_STATUS		_IO(VFIO_TYPE, VFIO_BASE + 3)
+
+/**
+ * VFIO_GROUP_SET_CONTAINER - _IOW(VFIO_TYPE, VFIO_BASE + 4, __s32)
+ *
+ * Set the container for the VFIO group to the open VFIO file
+ * descriptor provided.  Groups may only belong to a single
+ * container.  Containers may, at their discretion, support multiple
+ * groups.  Only when a container is set are all of the interfaces
+ * of the VFIO file descriptor and the VFIO group file descriptor
+ * available to the user.
+ * Return: 0 on success, -errno on failure.
+ * Availability: Always
+ */
+#define VFIO_GROUP_SET_CONTAINER	_IO(VFIO_TYPE, VFIO_BASE + 4)
+
+/**
+ * VFIO_GROUP_UNSET_CONTAINER - _IO(VFIO_TYPE, VFIO_BASE + 5)
+ *
+ * Remove the group from the attached container.  This is the
+ * opposite of the SET_CONTAINER call and returns the group to
+ * an initial state.  All device file descriptors must be released
+ * prior to calling this interface.  When removing the last group
+ * from a container, the IOMMU will be disabled and all state lost,
+ * effectively also returning the VFIO file descriptor to an initial
+ * state.
+ * Return: 0 on success, -errno on failure.
+ * Availability: When attached to container
+ */
+#define VFIO_GROUP_UNSET_CONTAINER	_IO(VFIO_TYPE, VFIO_BASE + 5)
+
+/**
+ * VFIO_GROUP_GET_DEVICE_FD - _IOW(VFIO_TYPE, VFIO_BASE + 6, char)
+ *
+ * Return a new file descriptor for the device object described by
+ * the provided string.  The string should match a device listed in
+ * the devices subdirectory of the IOMMU group sysfs entry.  The
+ * group containing the device must already be added to this context.
+ * Return: new file descriptor on success, -errno on failure.
+ * Availability: When attached to container
+ */
+#define VFIO_GROUP_GET_DEVICE_FD	_IO(VFIO_TYPE, VFIO_BASE + 6)
+
+/* --------------- IOCTLs for DEVICE file descriptors --------------- */
+
+/**
+ * VFIO_DEVICE_GET_INFO - _IOR(VFIO_TYPE, VFIO_BASE + 7,
+ *						struct vfio_device_info)
+ *
+ * Retrieve information about the device.  Fills in provided
+ * struct vfio_device_info.  Caller sets argsz.
+ * Return: 0 on success, -errno on failure.
+ */
+struct vfio_device_info {
+	__u32	argsz;
+	__u32	flags;
+#define VFIO_DEVICE_FLAGS_RESET	(1 << 0)	/* Device supports reset */
+#define VFIO_DEVICE_FLAGS_PCI	(1 << 1)	/* vfio-pci device */
+#define VFIO_DEVICE_FLAGS_PLATFORM (1 << 2)	/* vfio-platform device */
+#define VFIO_DEVICE_FLAGS_AMBA  (1 << 3)	/* vfio-amba device */
+#define VFIO_DEVICE_FLAGS_CCW	(1 << 4)	/* vfio-ccw device */
+	__u32	num_regions;	/* Max region index + 1 */
+	__u32	num_irqs;	/* Max IRQ index + 1 */
+};
+#define VFIO_DEVICE_GET_INFO		_IO(VFIO_TYPE, VFIO_BASE + 7)
+
+/*
+ * Vendor driver using Mediated device framework should provide device_api
+ * attribute in supported type attribute groups. Device API string should be one
+ * of the following corresponding to device flags in vfio_device_info structure.
+ */
+
+#define VFIO_DEVICE_API_PCI_STRING		"vfio-pci"
+#define VFIO_DEVICE_API_PLATFORM_STRING		"vfio-platform"
+#define VFIO_DEVICE_API_AMBA_STRING		"vfio-amba"
+#define VFIO_DEVICE_API_CCW_STRING		"vfio-ccw"
+
+/**
+ * VFIO_DEVICE_GET_REGION_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 8,
+ *				       struct vfio_region_info)
+ *
+ * Retrieve information about a device region.  Caller provides
+ * struct vfio_region_info with index value set.  Caller sets argsz.
+ * Implementation of region mapping is bus driver specific.  This is
+ * intended to describe MMIO, I/O port, as well as bus specific
+ * regions (ex. PCI config space).  Zero sized regions may be used
+ * to describe unimplemented regions (ex. unimplemented PCI BARs).
+ * Return: 0 on success, -errno on failure.
+ */
+struct vfio_region_info {
+	__u32	argsz;
+	__u32	flags;
+#define VFIO_REGION_INFO_FLAG_READ	(1 << 0) /* Region supports read */
+#define VFIO_REGION_INFO_FLAG_WRITE	(1 << 1) /* Region supports write */
+#define VFIO_REGION_INFO_FLAG_MMAP	(1 << 2) /* Region supports mmap */
+#define VFIO_REGION_INFO_FLAG_CAPS	(1 << 3) /* Info supports caps */
+	__u32	index;		/* Region index */
+	__u32	cap_offset;	/* Offset within info struct of first cap */
+	__u64	size;		/* Region size (bytes) */
+	__u64	offset;		/* Region offset from start of device fd */
+};
+#define VFIO_DEVICE_GET_REGION_INFO	_IO(VFIO_TYPE, VFIO_BASE + 8)
+
+/*
+ * The sparse mmap capability allows finer granularity of specifying areas
+ * within a region with mmap support.  When specified, the user should only
+ * mmap the offset ranges specified by the areas array.  mmaps outside of the
+ * areas specified may fail (such as the range covering a PCI MSI-X table) or
+ * may result in improper device behavior.
+ *
+ * The structures below define version 1 of this capability.
+ */
+#define VFIO_REGION_INFO_CAP_SPARSE_MMAP	1
+
+struct vfio_region_sparse_mmap_area {
+	__u64	offset;	/* Offset of mmap'able area within region */
+	__u64	size;	/* Size of mmap'able area */
+};
+
+struct vfio_region_info_cap_sparse_mmap {
+	struct vfio_info_cap_header header;
+	__u32	nr_areas;
+	__u32	reserved;
+	struct vfio_region_sparse_mmap_area areas[];
+};
+
+/*
+ * The device specific type capability allows regions unique to a specific
+ * device or class of devices to be exposed.  This helps solve the problem for
+ * vfio bus drivers of defining which region indexes correspond to which region
+ * on the device, without needing to resort to static indexes, as done by
+ * vfio-pci.  For instance, if we were to go back in time, we might remove
+ * VFIO_PCI_VGA_REGION_INDEX and let vfio-pci simply define that all indexes
+ * greater than or equal to VFIO_PCI_NUM_REGIONS are device specific and we'd
+ * make a "VGA" device specific type to describe the VGA access space.  This
+ * means that non-VGA devices wouldn't need to waste this index, and thus the
+ * address space associated with it due to implementation of device file
+ * descriptor offsets in vfio-pci.
+ *
+ * The current implementation is now part of the user ABI, so we can't use this
+ * for VGA, but there are other upcoming use cases, such as opregions for Intel
+ * IGD devices and framebuffers for vGPU devices.  We missed VGA, but we'll
+ * use this for future additions.
+ *
+ * The structure below defines version 1 of this capability.
+ */
+#define VFIO_REGION_INFO_CAP_TYPE	2
+
+struct vfio_region_info_cap_type {
+	struct vfio_info_cap_header header;
+	__u32 type;	/* global per bus driver */
+	__u32 subtype;	/* type specific */
+};
+
+#define VFIO_REGION_TYPE_PCI_VENDOR_TYPE	(1 << 31)
+#define VFIO_REGION_TYPE_PCI_VENDOR_MASK	(0xffff)
+
+/* 8086 Vendor sub-types */
+#define VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION	(1)
+#define VFIO_REGION_SUBTYPE_INTEL_IGD_HOST_CFG	(2)
+#define VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG	(3)
+
+/**
+ * VFIO_DEVICE_GET_IRQ_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 9,
+ *				    struct vfio_irq_info)
+ *
+ * Retrieve information about a device IRQ.  Caller provides
+ * struct vfio_irq_info with index value set.  Caller sets argsz.
+ * Implementation of IRQ mapping is bus driver specific.  Indexes
+ * using multiple IRQs are primarily intended to support MSI-like
+ * interrupt blocks.  Zero count irq blocks may be used to describe
+ * unimplemented interrupt types.
+ *
+ * The EVENTFD flag indicates the interrupt index supports eventfd based
+ * signaling.
+ *
+ * The MASKABLE flags indicates the index supports MASK and UNMASK
+ * actions described below.
+ *
+ * AUTOMASKED indicates that after signaling, the interrupt line is
+ * automatically masked by VFIO and the user needs to unmask the line
+ * to receive new interrupts.  This is primarily intended to distinguish
+ * level triggered interrupts.
+ *
+ * The NORESIZE flag indicates that the interrupt lines within the index
+ * are set up as a set and new subindexes cannot be enabled without first
+ * disabling the entire index.  This is used for interrupts like PCI MSI
+ * and MSI-X where the driver may only use a subset of the available
+ * indexes, but VFIO needs to enable a specific number of vectors
+ * upfront.  In the case of MSI-X, where the user can enable MSI-X and
+ * then add and unmask vectors, it's up to userspace to make the decision
+ * whether to allocate the maximum supported number of vectors or tear
+ * down the setup and incrementally increase the vectors as each is enabled.
+ */
+struct vfio_irq_info {
+	__u32	argsz;
+	__u32	flags;
+#define VFIO_IRQ_INFO_EVENTFD		(1 << 0)
+#define VFIO_IRQ_INFO_MASKABLE		(1 << 1)
+#define VFIO_IRQ_INFO_AUTOMASKED	(1 << 2)
+#define VFIO_IRQ_INFO_NORESIZE		(1 << 3)
+	__u32	index;		/* IRQ index */
+	__u32	count;		/* Number of IRQs within this index */
+};
+#define VFIO_DEVICE_GET_IRQ_INFO	_IO(VFIO_TYPE, VFIO_BASE + 9)
+
+/**
+ * VFIO_DEVICE_SET_IRQS - _IOW(VFIO_TYPE, VFIO_BASE + 10, struct vfio_irq_set)
+ *
+ * Set signaling, masking, and unmasking of interrupts.  Caller provides
+ * struct vfio_irq_set with all fields set.  'start' and 'count' indicate
+ * the range of subindexes being specified.
+ *
+ * The DATA flags specify the type of data provided.  If DATA_NONE, the
+ * operation performs the specified action immediately on the specified
+ * interrupt(s).  For example, to unmask AUTOMASKED interrupt [0,0]:
+ * flags = (DATA_NONE|ACTION_UNMASK), index = 0, start = 0, count = 1.
+ *
+ * DATA_BOOL allows sparse support for the same on arrays of interrupts.
+ * For example, to mask interrupts [0,1] and [0,3] (but not [0,2]):
+ * flags = (DATA_BOOL|ACTION_MASK), index = 0, start = 1, count = 3,
+ * data = {1,0,1}
+ *
+ * DATA_EVENTFD binds the specified ACTION to the provided __s32 eventfd.
+ * A value of -1 can be used to either de-assign interrupts if already
+ * assigned or skip un-assigned interrupts.  For example, to set an eventfd
+ * to be triggered for interrupts [0,0] and [0,2]:
+ * flags = (DATA_EVENTFD|ACTION_TRIGGER), index = 0, start = 0, count = 3,
+ * data = {fd1, -1, fd2}
+ * If index [0,1] is previously set, two count = 1 ioctl calls would be
+ * required to set [0,0] and [0,2] without changing [0,1].
+ *
+ * Once a signaling mechanism is set, DATA_BOOL or DATA_NONE can be used
+ * with ACTION_TRIGGER to perform kernel level interrupt loopback testing
+ * from userspace (ie. simulate hardware triggering).
+ *
+ * Setting of an event triggering mechanism to userspace for ACTION_TRIGGER
+ * enables the interrupt index for the device.  Individual subindex interrupts
+ * can be disabled using the -1 value for DATA_EVENTFD or the index can be
+ * disabled as a whole with: flags = (DATA_NONE|ACTION_TRIGGER), count = 0.
+ *
+ * Note that ACTION_[UN]MASK specify user->kernel signaling (irqfds) while
+ * ACTION_TRIGGER specifies kernel->user signaling.
+ */
+struct vfio_irq_set {
+	__u32	argsz;
+	__u32	flags;
+#define VFIO_IRQ_SET_DATA_NONE		(1 << 0) /* Data not present */
+#define VFIO_IRQ_SET_DATA_BOOL		(1 << 1) /* Data is bool (u8) */
+#define VFIO_IRQ_SET_DATA_EVENTFD	(1 << 2) /* Data is eventfd (s32) */
+#define VFIO_IRQ_SET_ACTION_MASK	(1 << 3) /* Mask interrupt */
+#define VFIO_IRQ_SET_ACTION_UNMASK	(1 << 4) /* Unmask interrupt */
+#define VFIO_IRQ_SET_ACTION_TRIGGER	(1 << 5) /* Trigger interrupt */
+	__u32	index;
+	__u32	start;
+	__u32	count;
+	__u8	data[];
+};
+#define VFIO_DEVICE_SET_IRQS		_IO(VFIO_TYPE, VFIO_BASE + 10)
+
+#define VFIO_IRQ_SET_DATA_TYPE_MASK	(VFIO_IRQ_SET_DATA_NONE | \
+					 VFIO_IRQ_SET_DATA_BOOL | \
+					 VFIO_IRQ_SET_DATA_EVENTFD)
+#define VFIO_IRQ_SET_ACTION_TYPE_MASK	(VFIO_IRQ_SET_ACTION_MASK | \
+					 VFIO_IRQ_SET_ACTION_UNMASK | \
+					 VFIO_IRQ_SET_ACTION_TRIGGER)
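
As a worked example of the DATA_EVENTFD layout (a hedged userspace sketch: device_fd and event_fd are assumed to already exist, and VFIO_PCI_MSI_IRQ_INDEX is defined further down this header), note that the allocation and argsz must cover the trailing data[] payload:

#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>

static int vfio_msi_set_trigger(int device_fd, int event_fd)
{
	size_t sz = sizeof(struct vfio_irq_set) + sizeof(__s32);
	struct vfio_irq_set *set = calloc(1, sz);
	int ret;

	if (!set)
		return -1;
	set->argsz = sz;
	set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
	set->index = VFIO_PCI_MSI_IRQ_INDEX;
	set->start = 0;
	set->count = 1;
	memcpy(set->data, &event_fd, sizeof(__s32));	/* eventfd rides in data[] */

	ret = ioctl(device_fd, VFIO_DEVICE_SET_IRQS, set);
	free(set);
	return ret;
}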
+/**
+ * VFIO_DEVICE_RESET - _IO(VFIO_TYPE, VFIO_BASE + 11)
+ *
+ * Reset a device.
+ */
+#define VFIO_DEVICE_RESET		_IO(VFIO_TYPE, VFIO_BASE + 11)
+
+/*
+ * The VFIO-PCI bus driver makes use of the following fixed region and
+ * IRQ index mapping.  Unimplemented regions return a size of zero.
+ * Unimplemented IRQ types return a count of zero.
+ */
+
+enum {
+	VFIO_PCI_BAR0_REGION_INDEX,
+	VFIO_PCI_BAR1_REGION_INDEX,
+	VFIO_PCI_BAR2_REGION_INDEX,
+	VFIO_PCI_BAR3_REGION_INDEX,
+	VFIO_PCI_BAR4_REGION_INDEX,
+	VFIO_PCI_BAR5_REGION_INDEX,
+	VFIO_PCI_ROM_REGION_INDEX,
+	VFIO_PCI_CONFIG_REGION_INDEX,
+	/*
+	 * Expose VGA regions defined for PCI base class 03, subclass 00.
+	 * This includes I/O port ranges 0x3b0 to 0x3bb and 0x3c0 to 0x3df
+	 * as well as the MMIO range 0xa0000 to 0xbffff.  Each implemented
+	 * range is found at its identity-mapped offset from the region
+	 * offset, for example 0x3b0 is region_info.offset + 0x3b0.  Areas
+	 * between described ranges are unimplemented.
+	 */
+	VFIO_PCI_VGA_REGION_INDEX,
+	VFIO_PCI_NUM_REGIONS = 9 /* Fixed user ABI, region indexes >=9 use */
+				 /* device specific cap to define content. */
+};
+
+enum {
+	VFIO_PCI_INTX_IRQ_INDEX,
+	VFIO_PCI_MSI_IRQ_INDEX,
+	VFIO_PCI_MSIX_IRQ_INDEX,
+	VFIO_PCI_ERR_IRQ_INDEX,
+	VFIO_PCI_REQ_IRQ_INDEX,
+	VFIO_PCI_NUM_IRQS
+};
+
+/*
+ * The vfio-ccw bus driver makes use of the following fixed region and
+ * IRQ index mapping. Unimplemented regions return a size of zero.
+ * Unimplemented IRQ types return a count of zero.
+ */
+
+enum {
+	VFIO_CCW_CONFIG_REGION_INDEX,
+	VFIO_CCW_NUM_REGIONS
+};
+
+enum {
+	VFIO_CCW_IO_IRQ_INDEX,
+	VFIO_CCW_NUM_IRQS
+};
+
+/**
+ * VFIO_DEVICE_GET_PCI_HOT_RESET_INFO - _IORW(VFIO_TYPE, VFIO_BASE + 12,
+ *					      struct vfio_pci_hot_reset_info)
+ *
+ * Return: 0 on success, -errno on failure:
+ *	-ENOSPC = insufficient buffer, -ENODEV = unsupported for device.
+ */
+struct vfio_pci_dependent_device {
+	__u32	group_id;
+	__u16	segment;
+	__u8	bus;
+	__u8	devfn; /* Use PCI_SLOT/PCI_FUNC */
+};
+
+struct vfio_pci_hot_reset_info {
+	__u32	argsz;
+	__u32	flags;
+	__u32	count;
+	struct vfio_pci_dependent_device	devices[];
+};
+
+#define VFIO_DEVICE_GET_PCI_HOT_RESET_INFO	_IO(VFIO_TYPE, VFIO_BASE + 12)
+
+/**
+ * VFIO_DEVICE_PCI_HOT_RESET - _IOW(VFIO_TYPE, VFIO_BASE + 13,
+ *				    struct vfio_pci_hot_reset)
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+struct vfio_pci_hot_reset {
+	__u32	argsz;
+	__u32	flags;
+	__u32	count;
+	__s32	group_fds[];
+};
+
+#define VFIO_DEVICE_PCI_HOT_RESET	_IO(VFIO_TYPE, VFIO_BASE + 13)
+
+/* -------- API for Type1 VFIO IOMMU -------- */
+
+/**
+ * VFIO_IOMMU_GET_INFO - _IOR(VFIO_TYPE, VFIO_BASE + 12, struct vfio_iommu_type1_info)
+ *
+ * Retrieve information about the IOMMU object. Fills in provided
+ * struct vfio_iommu_type1_info. Caller sets argsz.
+ *
+ * XXX Should we do these by CHECK_EXTENSION too?
+ */
+struct vfio_iommu_type1_info {
+	__u32	argsz;
+	__u32	flags;
+#define VFIO_IOMMU_INFO_PGSIZES (1 << 0)	/* supported page sizes info */
+	__u64	iova_pgsizes;		/* Bitmap of supported page sizes */
+};
+
+#define VFIO_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
+
+/**
+ * VFIO_IOMMU_MAP_DMA - _IOW(VFIO_TYPE, VFIO_BASE + 13, struct vfio_dma_map)
+ *
+ * Map process virtual addresses to IO virtual addresses using the
+ * provided struct vfio_dma_map. Caller sets argsz. READ &/ WRITE required.
+ */
+struct vfio_iommu_type1_dma_map {
+	__u32	argsz;
+	__u32	flags;
+#define VFIO_DMA_MAP_FLAG_READ (1 << 0)		/* readable from device */
+#define VFIO_DMA_MAP_FLAG_WRITE (1 << 1)	/* writable from device */
+	__u64	vaddr;				/* Process virtual address */
+	__u64	iova;				/* IO virtual address */
+	__u64	size;				/* Size of mapping (bytes) */
+};
+
+#define VFIO_IOMMU_MAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 13)
+
+/**
+ * VFIO_IOMMU_UNMAP_DMA - _IOWR(VFIO_TYPE, VFIO_BASE + 14,
+ *							struct vfio_dma_unmap)
+ *
+ * Unmap IO virtual addresses using the provided struct vfio_dma_unmap.
+ * Caller sets argsz.  The actual unmapped size is returned in the size
+ * field.  No guarantee is made to the user that arbitrary unmaps of iova
+ * or size different from those used in the original mapping call will
+ * succeed.
+ */
+struct vfio_iommu_type1_dma_unmap {
+	__u32	argsz;
+	__u32	flags;
+	__u64	iova;				/* IO virtual address */
+	__u64	size;				/* Size of mapping (bytes) */
+};
+
+#define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)
+
+/*
+ * IOCTLs to enable/disable IOMMU container usage.
+ * No parameters are supported.
+ */
+#define VFIO_IOMMU_ENABLE	_IO(VFIO_TYPE, VFIO_BASE + 15)
+#define VFIO_IOMMU_DISABLE	_IO(VFIO_TYPE, VFIO_BASE + 16)
+
+/* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
+
+/*
+ * The SPAPR TCE DDW info struct provides the information about
+ * the details of Dynamic DMA window capability.
+ *
+ * @pgsizes contains a page size bitmask, 4K/64K/16M are supported.
+ * @max_dynamic_windows_supported tells the maximum number of windows
+ * which the platform can create.
+ * @levels tells the maximum number of levels in multi-level IOMMU tables;
+ * this allows splitting a table into smaller chunks which reduces
+ * the amount of physically contiguous memory required for the table.
+ */
+struct vfio_iommu_spapr_tce_ddw_info {
+	__u64 pgsizes;			/* Bitmap of supported page sizes */
+	__u32 max_dynamic_windows_supported;
+	__u32 levels;
+};
+
+/*
+ * The SPAPR TCE info struct provides the information about the PCI bus
+ * address ranges available for DMA, these values are programmed into
+ * the hardware so the guest has to know that information.
+ *
+ * The DMA 32 bit window start is an absolute PCI bus address.
+ * The IOVA address passed via map/unmap ioctls are absolute PCI bus
+ * addresses too so the window works as a filter rather than an offset
+ * for IOVA addresses.
+ *
+ * Flags supported:
+ * - VFIO_IOMMU_SPAPR_INFO_DDW: informs the userspace that dynamic DMA windows
+ *   (DDW) support is present. @ddw is only supported when DDW is present.
+ */
+struct vfio_iommu_spapr_tce_info {
+	__u32 argsz;
+	__u32 flags;
+#define VFIO_IOMMU_SPAPR_INFO_DDW	(1 << 0)	/* DDW supported */
+	__u32 dma32_window_start;	/* 32 bit window start (bytes) */
+	__u32 dma32_window_size;	/* 32 bit window size (bytes) */
+	struct vfio_iommu_spapr_tce_ddw_info ddw;
+};
+
+#define VFIO_IOMMU_SPAPR_TCE_GET_INFO	_IO(VFIO_TYPE, VFIO_BASE + 12)
+
+/*
+ * EEH PE operation struct provides ways to:
+ * - enable/disable EEH functionality;
+ * - unfreeze IO/DMA for frozen PE;
+ * - read PE state;
+ * - reset PE;
+ * - configure PE;
+ * - inject EEH error.
+ */
+struct vfio_eeh_pe_err {
+	__u32 type;
+	__u32 func;
+	__u64 addr;
+	__u64 mask;
+};
+
+struct vfio_eeh_pe_op {
+	__u32 argsz;
+	__u32 flags;
+	__u32 op;
+	union {
+		struct vfio_eeh_pe_err err;
+	};
+};
+
+#define VFIO_EEH_PE_DISABLE		0	/* Disable EEH functionality */
+#define VFIO_EEH_PE_ENABLE		1	/* Enable EEH functionality  */
+#define VFIO_EEH_PE_UNFREEZE_IO		2	/* Enable IO for frozen PE   */
+#define VFIO_EEH_PE_UNFREEZE_DMA	3	/* Enable DMA for frozen PE  */
+#define VFIO_EEH_PE_GET_STATE		4	/* PE state retrieval        */
+#define  VFIO_EEH_PE_STATE_NORMAL	0	/* PE in functional state    */
+#define  VFIO_EEH_PE_STATE_RESET	1	/* PE reset in progress      */
+#define  VFIO_EEH_PE_STATE_STOPPED	2	/* Stopped DMA and IO        */
+#define  VFIO_EEH_PE_STATE_STOPPED_DMA	4	/* Stopped DMA only          */
+#define  VFIO_EEH_PE_STATE_UNAVAIL	5	/* State unavailable         */
+#define VFIO_EEH_PE_RESET_DEACTIVATE	5	/* Deassert PE reset         */
+#define VFIO_EEH_PE_RESET_HOT		6	/* Assert hot reset          */
+#define VFIO_EEH_PE_RESET_FUNDAMENTAL	7	/* Assert fundamental reset  */
+#define VFIO_EEH_PE_CONFIGURE		8	/* PE configuration          */
+#define VFIO_EEH_PE_INJECT_ERR		9	/* Inject EEH error          */
+
+#define VFIO_EEH_PE_OP			_IO(VFIO_TYPE, VFIO_BASE + 21)
+
+/**
+ * VFIO_IOMMU_SPAPR_REGISTER_MEMORY - _IOW(VFIO_TYPE, VFIO_BASE + 17, struct vfio_iommu_spapr_register_memory)
+ *
+ * Registers user space memory where DMA is allowed. It pins
+ * user pages and does the locked memory accounting so
+ * subsequent VFIO_IOMMU_MAP_DMA/VFIO_IOMMU_UNMAP_DMA calls
+ * get faster.
+ */
+struct vfio_iommu_spapr_register_memory {
+	__u32	argsz;
+	__u32	flags;
+	__u64	vaddr;				/* Process virtual address */
+	__u64	size;				/* Size of mapping (bytes) */
+};
+#define VFIO_IOMMU_SPAPR_REGISTER_MEMORY	_IO(VFIO_TYPE, VFIO_BASE + 17)
+
+/**
+ * VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY - _IOW(VFIO_TYPE, VFIO_BASE + 18, struct vfio_iommu_spapr_register_memory)
+ *
+ * Unregisters user space memory registered with
+ * VFIO_IOMMU_SPAPR_REGISTER_MEMORY.
+ * Uses vfio_iommu_spapr_register_memory for parameters.
+ */
+#define VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY	_IO(VFIO_TYPE, VFIO_BASE + 18)
+
+/**
+ * VFIO_IOMMU_SPAPR_TCE_CREATE - _IOWR(VFIO_TYPE, VFIO_BASE + 19, struct vfio_iommu_spapr_tce_create)
+ *
+ * Creates an additional TCE table and programs it (sets a new DMA window)
+ * to every IOMMU group in the container. It receives page shift, window
+ * size and number of levels in the TCE table being created.
+ *
+ * It allocates and returns an offset on a PCI bus of the new DMA window.
+ */
+struct vfio_iommu_spapr_tce_create {
+	__u32 argsz;
+	__u32 flags;
+	/* in */
+	__u32 page_shift;
+	__u32 __resv1;
+	__u64 window_size;
+	__u32 levels;
+	__u32 __resv2;
+	/* out */
+	__u64 start_addr;
+};
+#define VFIO_IOMMU_SPAPR_TCE_CREATE	_IO(VFIO_TYPE, VFIO_BASE + 19)
+
+/**
+ * VFIO_IOMMU_SPAPR_TCE_REMOVE - _IOW(VFIO_TYPE, VFIO_BASE + 20, struct vfio_iommu_spapr_tce_remove)
+ *
+ * Unprograms a TCE table from all groups in the container and destroys it.
+ * It receives a PCI bus offset as a window id.
+ */
+struct vfio_iommu_spapr_tce_remove {
+	__u32 argsz;
+	__u32 flags;
+	/* in */
+	__u64 start_addr;
+};
+#define VFIO_IOMMU_SPAPR_TCE_REMOVE	_IO(VFIO_TYPE, VFIO_BASE + 20)
+
+/* ***************************************************************** */
+
+#endif /* VFIO_H */
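
Putting the container, group and device ioctls together, the canonical setup sequence looks roughly as follows. This is a hedged sketch: the group number and device address are placeholders, error handling is mostly elided, and the DMA map fields would be filled in by the caller:

#include <fcntl.h>
#include <sys/ioctl.h>

static int vfio_device_open(void)
{
	int container = open("/dev/vfio/vfio", O_RDWR);
	int group, device;
	struct vfio_group_status status = { .argsz = sizeof(status) };
	struct vfio_iommu_type1_dma_map map = {
		.argsz = sizeof(map),
		.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
		/* .vaddr, .iova and .size supplied by the caller in real code */
	};

	if (ioctl(container, VFIO_GET_API_VERSION) != VFIO_API_VERSION ||
	    !ioctl(container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU))
		return -1;

	group = open("/dev/vfio/26", O_RDWR);	/* group number is an example */
	ioctl(group, VFIO_GROUP_GET_STATUS, &status);
	if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE))
		return -1;	/* not all devices in the group are bound to vfio */

	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
	ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);	/* only valid now */
	ioctl(container, VFIO_IOMMU_MAP_DMA, &map);

	device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0");
	return device;
}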
diff --git a/kvmtool/include/linux/vhost.h b/kvmtool/include/linux/vhost.h
new file mode 100644
index 0000000..bb6a5b4
--- /dev/null
+++ b/kvmtool/include/linux/vhost.h
@@ -0,0 +1,158 @@
+#ifndef _LINUX_VHOST_H
+#define _LINUX_VHOST_H
+/* Userspace interface for in-kernel virtio accelerators. */
+
+/* vhost is used to reduce the number of system calls involved in virtio.
+ *
+ * Existing virtio net code is used in the guest without modification.
+ *
+ * This header includes the interface used by the userspace hypervisor
+ * for device configuration.
+ */
+
+#include <linux/types.h>
+#include <linux/compiler.h>
+#include <linux/ioctl.h>
+#include <linux/virtio_config.h>
+#include <linux/virtio_ring.h>
+
+struct vhost_vring_state {
+	unsigned int index;
+	unsigned int num;
+};
+
+struct vhost_vring_file {
+	unsigned int index;
+	int fd; /* Pass -1 to unbind from file. */
+};
+
+struct vhost_vring_addr {
+	unsigned int index;
+	/* Option flags. */
+	unsigned int flags;
+	/* Flag values: */
+	/* Whether log address is valid. If set enables logging. */
+#define VHOST_VRING_F_LOG 0
+
+	/* Start of array of descriptors (virtually contiguous) */
+	__u64 desc_user_addr;
+	/* Used structure address. Must be 32 bit aligned */
+	__u64 used_user_addr;
+	/* Available structure address. Must be 16 bit aligned */
+	__u64 avail_user_addr;
+	/* Logging support. */
+	/* Log writes to used structure, at offset calculated from specified
+	 * address. Address must be 32 bit aligned. */
+	__u64 log_guest_addr;
+};
+
+struct vhost_memory_region {
+	__u64 guest_phys_addr;
+	__u64 memory_size; /* bytes */
+	__u64 userspace_addr;
+	__u64 flags_padding; /* No flags are currently specified. */
+};
+
+/* All region addresses and sizes must be 4K aligned. */
+#define VHOST_PAGE_SIZE 0x1000
+
+struct vhost_memory {
+	__u32 nregions;
+	__u32 padding;
+	struct vhost_memory_region regions[0];
+};
+
+/* ioctls */
+
+#define VHOST_VIRTIO 0xAF
+
+/* Features bitmask for forward compatibility.  Transport bits are used for
+ * vhost specific features. */
+#define VHOST_GET_FEATURES	_IOR(VHOST_VIRTIO, 0x00, __u64)
+#define VHOST_SET_FEATURES	_IOW(VHOST_VIRTIO, 0x00, __u64)
+
+/* Set current process as the (exclusive) owner of this file descriptor.  This
+ * must be called before any other vhost command.  Further calls to
+ * VHOST_SET_OWNER fail until VHOST_RESET_OWNER is called. */
+#define VHOST_SET_OWNER _IO(VHOST_VIRTIO, 0x01)
+/* Give up ownership, and reset the device to default values.
+ * Allows a subsequent call to VHOST_SET_OWNER to succeed. */
+#define VHOST_RESET_OWNER _IO(VHOST_VIRTIO, 0x02)
+
+/* Set up/modify memory layout */
+#define VHOST_SET_MEM_TABLE	_IOW(VHOST_VIRTIO, 0x03, struct vhost_memory)
+
+/* Write logging setup. */
+/* Memory writes can optionally be logged by setting bit at an offset
+ * (calculated from the physical address) from specified log base.
+ * The bit is set using an atomic 32 bit operation. */
+/* Set base address for logging. */
+#define VHOST_SET_LOG_BASE _IOW(VHOST_VIRTIO, 0x04, __u64)
+/* Specify an eventfd file descriptor to signal on log write. */
+#define VHOST_SET_LOG_FD _IOW(VHOST_VIRTIO, 0x07, int)
+
+/* Ring setup. */
+/* Set number of descriptors in ring.  This parameter cannot
+ * be modified while the ring is running (bound to a device). */
+#define VHOST_SET_VRING_NUM _IOW(VHOST_VIRTIO, 0x10, struct vhost_vring_state)
+/* Set addresses for the ring. */
+#define VHOST_SET_VRING_ADDR _IOW(VHOST_VIRTIO, 0x11, struct vhost_vring_addr)
+/* Base value where queue looks for available descriptors */
+#define VHOST_SET_VRING_BASE _IOW(VHOST_VIRTIO, 0x12, struct vhost_vring_state)
+/* Get accessor: reads index, writes value in num */
+#define VHOST_GET_VRING_BASE _IOWR(VHOST_VIRTIO, 0x12, struct vhost_vring_state)
+
+/* The following ioctls use eventfd file descriptors to signal and poll
+ * for events. */
+
+/* Set eventfd to poll for added buffers */
+#define VHOST_SET_VRING_KICK _IOW(VHOST_VIRTIO, 0x20, struct vhost_vring_file)
+/* Set eventfd to signal when buffers have been used */
+#define VHOST_SET_VRING_CALL _IOW(VHOST_VIRTIO, 0x21, struct vhost_vring_file)
+/* Set eventfd to signal an error */
+#define VHOST_SET_VRING_ERR _IOW(VHOST_VIRTIO, 0x22, struct vhost_vring_file)
+
+/* VHOST_NET specific defines */
+
+/* Attach virtio net ring to a raw socket, or tap device.
+ * The socket must already be bound to an ethernet device; this device will be
+ * used for transmit.  Pass fd -1 to unbind from the socket and the transmit
+ * device.  This can be used to stop the ring (e.g. for migration). */
+#define VHOST_NET_SET_BACKEND _IOW(VHOST_VIRTIO, 0x30, struct vhost_vring_file)
+
+/* Feature bits */
+/* Log all write descriptors. Can be changed while device is active. */
+#define VHOST_F_LOG_ALL 26
+/* vhost-net should add virtio_net_hdr for RX, and strip for TX packets. */
+#define VHOST_NET_F_VIRTIO_NET_HDR 27
+
+/* VHOST_SCSI specific definitions */
+
+/*
+ * Used by QEMU userspace to ensure a consistent vhost-scsi ABI.
+ *
+ * ABI Rev 0: July 2012 version starting point for v3.6-rc merge candidate +
+ *            RFC-v2 vhost-scsi userspace.  Add GET_ABI_VERSION ioctl usage
+ * ABI Rev 1: January 2013. Ignore vhost_tpgt field in struct vhost_scsi_target.
+ *            All the targets under vhost_wwpn can be seen and used by guest.
+ */
+
+#define VHOST_SCSI_ABI_VERSION	1
+
+struct vhost_scsi_target {
+	int abi_version;
+	char vhost_wwpn[224]; /* TRANSPORT_IQN_LEN */
+	unsigned short vhost_tpgt;
+	unsigned short reserved;
+};
+
+#define VHOST_SCSI_SET_ENDPOINT _IOW(VHOST_VIRTIO, 0x40, struct vhost_scsi_target)
+#define VHOST_SCSI_CLEAR_ENDPOINT _IOW(VHOST_VIRTIO, 0x41, struct vhost_scsi_target)
+/* Changing this breaks userspace. */
+#define VHOST_SCSI_GET_ABI_VERSION _IOW(VHOST_VIRTIO, 0x42, int)
+/* Set and get the events missed flag */
+#define VHOST_SCSI_SET_EVENTS_MISSED _IOW(VHOST_VIRTIO, 0x43, __u32)
+#define VHOST_SCSI_GET_EVENTS_MISSED _IOW(VHOST_VIRTIO, 0x44, __u32)
+
+#endif
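
The ordering constraints spelled out in the comments above translate into a setup sequence along these lines for a single-queue vhost-net instance. This is a hedged sketch: kick_fd and call_fd are pre-created eventfds, tap_fd an already-configured tap device, mem a populated memory table, and error handling is elided:

#include <fcntl.h>
#include <sys/ioctl.h>

static int vhost_net_setup(struct vhost_memory *mem,
			   int kick_fd, int call_fd, int tap_fd)
{
	int vhost = open("/dev/vhost-net", O_RDWR);
	__u64 features;
	struct vhost_vring_state state = { .index = 0, .num = 256 };
	struct vhost_vring_addr addr = {
		.index = 0,
		/* .desc_user_addr, .avail_user_addr and .used_user_addr
		 * point at the guest-visible ring in process memory */
	};
	struct vhost_vring_file file = { .index = 0 };

	ioctl(vhost, VHOST_SET_OWNER);			/* must come first */
	ioctl(vhost, VHOST_GET_FEATURES, &features);
	ioctl(vhost, VHOST_SET_FEATURES, &features);	/* a subset in practice */
	ioctl(vhost, VHOST_SET_MEM_TABLE, mem);

	ioctl(vhost, VHOST_SET_VRING_NUM, &state);	/* before ring goes live */
	ioctl(vhost, VHOST_SET_VRING_ADDR, &addr);
	file.fd = kick_fd;
	ioctl(vhost, VHOST_SET_VRING_KICK, &file);
	file.fd = call_fd;
	ioctl(vhost, VHOST_SET_VRING_CALL, &file);
	file.fd = tap_fd;				/* ring goes live here */
	ioctl(vhost, VHOST_NET_SET_BACKEND, &file);
	return vhost;
}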
diff --git a/kvmtool/include/linux/virtio_ids.h b/kvmtool/include/linux/virtio_ids.h
new file mode 100644
index 0000000..5f60aa4
--- /dev/null
+++ b/kvmtool/include/linux/virtio_ids.h
@@ -0,0 +1,44 @@
+#ifndef _LINUX_VIRTIO_IDS_H
+#define _LINUX_VIRTIO_IDS_H
+/*
+ * Virtio IDs
+ *
+ * This header is BSD licensed so anyone can use the definitions to implement
+ * compatible drivers/servers.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of IBM nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE. */
+
+#define VIRTIO_ID_NET		1 /* virtio net */
+#define VIRTIO_ID_BLOCK		2 /* virtio block */
+#define VIRTIO_ID_CONSOLE	3 /* virtio console */
+#define VIRTIO_ID_RNG		4 /* virtio rng */
+#define VIRTIO_ID_BALLOON	5 /* virtio balloon */
+#define VIRTIO_ID_RPMSG		7 /* virtio remote processor messaging */
+#define VIRTIO_ID_SCSI		8 /* virtio scsi */
+#define VIRTIO_ID_9P		9 /* 9p transport */
+#define VIRTIO_ID_RPROC_SERIAL 11 /* virtio remoteproc serial link */
+#define VIRTIO_ID_CAIF	       12 /* Virtio caif */
+#define VIRTIO_ID_INPUT        18 /* virtio input */
+
+#endif /* _LINUX_VIRTIO_IDS_H */
diff --git a/kvmtool/include/linux/virtio_mmio.h b/kvmtool/include/linux/virtio_mmio.h
new file mode 100644
index 0000000..5c7b6f0
--- /dev/null
+++ b/kvmtool/include/linux/virtio_mmio.h
@@ -0,0 +1,111 @@
+/*
+ * Virtio platform device driver
+ *
+ * Copyright 2011, ARM Ltd.
+ *
+ * Based on Virtio PCI driver by Anthony Liguori, copyright IBM Corp. 2007
+ *
+ * This header is BSD licensed so anyone can use the definitions to implement
+ * compatible drivers/servers.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of IBM nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _LINUX_VIRTIO_MMIO_H
+#define _LINUX_VIRTIO_MMIO_H
+
+/*
+ * Control registers
+ */
+
+/* Magic value ("virt" string) - Read Only */
+#define VIRTIO_MMIO_MAGIC_VALUE		0x000
+
+/* Virtio device version - Read Only */
+#define VIRTIO_MMIO_VERSION		0x004
+
+/* Virtio device ID - Read Only */
+#define VIRTIO_MMIO_DEVICE_ID		0x008
+
+/* Virtio vendor ID - Read Only */
+#define VIRTIO_MMIO_VENDOR_ID		0x00c
+
+/* Bitmask of the features supported by the host
+ * (32 bits per set) - Read Only */
+#define VIRTIO_MMIO_HOST_FEATURES	0x010
+
+/* Host features set selector - Write Only */
+#define VIRTIO_MMIO_HOST_FEATURES_SEL	0x014
+
+/* Bitmask of features activated by the guest
+ * (32 bits per set) - Write Only */
+#define VIRTIO_MMIO_GUEST_FEATURES	0x020
+
+/* Activated features set selector - Write Only */
+#define VIRTIO_MMIO_GUEST_FEATURES_SEL	0x024
+
+/* Guest's memory page size in bytes - Write Only */
+#define VIRTIO_MMIO_GUEST_PAGE_SIZE	0x028
+
+/* Queue selector - Write Only */
+#define VIRTIO_MMIO_QUEUE_SEL		0x030
+
+/* Maximum size of the currently selected queue - Read Only */
+#define VIRTIO_MMIO_QUEUE_NUM_MAX	0x034
+
+/* Queue size for the currently selected queue - Write Only */
+#define VIRTIO_MMIO_QUEUE_NUM		0x038
+
+/* Used Ring alignment for the currently selected queue - Write Only */
+#define VIRTIO_MMIO_QUEUE_ALIGN		0x03c
+
+/* Guest's PFN for the currently selected queue - Read Write */
+#define VIRTIO_MMIO_QUEUE_PFN		0x040
+
+/* Queue notifier - Write Only */
+#define VIRTIO_MMIO_QUEUE_NOTIFY	0x050
+
+/* Interrupt status - Read Only */
+#define VIRTIO_MMIO_INTERRUPT_STATUS	0x060
+
+/* Interrupt acknowledge - Write Only */
+#define VIRTIO_MMIO_INTERRUPT_ACK	0x064
+
+/* Device status register - Read Write */
+#define VIRTIO_MMIO_STATUS		0x070
+
+/* The config space is defined by each driver as
+ * the per-driver configuration space - Read Write */
+#define VIRTIO_MMIO_CONFIG		0x100
+
+
+
+/*
+ * Interrupt flags (re: interrupt status & acknowledge registers)
+ */
+
+#define VIRTIO_MMIO_INT_VRING		(1 << 0)
+#define VIRTIO_MMIO_INT_CONFIG		(1 << 1)
+
+#endif
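
For orientation, the first steps of a guest-side probe against this register layout might look like the following sketch; base is assumed to be a pointer to the already-mapped MMIO window, and the accessor stands in for whatever volatile access primitive the environment provides:

#include <stdint.h>

static inline uint32_t mmio_read32(volatile void *base, unsigned long off)
{
	return *(volatile uint32_t *)((volatile char *)base + off);
}

static int virtio_mmio_probe(volatile void *base)
{
	/* "virt" in little-endian byte order */
	if (mmio_read32(base, VIRTIO_MMIO_MAGIC_VALUE) != 0x74726976)
		return -1;
	/* The legacy layout defined above is device version 1. */
	if (mmio_read32(base, VIRTIO_MMIO_VERSION) != 1)
		return -1;
	/* Device ID 0 means no device behind this transport. */
	return mmio_read32(base, VIRTIO_MMIO_DEVICE_ID);
}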
diff --git a/kvmtool/include/linux/virtio_net.h b/kvmtool/include/linux/virtio_net.h
new file mode 100644
index 0000000..172a7f0
--- /dev/null
+++ b/kvmtool/include/linux/virtio_net.h
@@ -0,0 +1,204 @@
+#ifndef _LINUX_VIRTIO_NET_H
+#define _LINUX_VIRTIO_NET_H
+/* This header is BSD licensed so anyone can use the definitions to implement
+ * compatible drivers/servers.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of IBM nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE. */
+#include <linux/types.h>
+#include <linux/virtio_ids.h>
+#include <linux/virtio_config.h>
+#include <linux/if_ether.h>
+
+/* The feature bitmap for virtio net */
+#define VIRTIO_NET_F_CSUM	0	/* Host handles pkts w/ partial csum */
+#define VIRTIO_NET_F_GUEST_CSUM	1	/* Guest handles pkts w/ partial csum */
+#define VIRTIO_NET_F_MAC	5	/* Host has given MAC address. */
+#define VIRTIO_NET_F_GSO	6	/* Host handles pkts w/ any GSO type */
+#define VIRTIO_NET_F_GUEST_TSO4	7	/* Guest can handle TSOv4 in. */
+#define VIRTIO_NET_F_GUEST_TSO6	8	/* Guest can handle TSOv6 in. */
+#define VIRTIO_NET_F_GUEST_ECN	9	/* Guest can handle TSO[6] w/ ECN in. */
+#define VIRTIO_NET_F_GUEST_UFO	10	/* Guest can handle UFO in. */
+#define VIRTIO_NET_F_HOST_TSO4	11	/* Host can handle TSOv4 in. */
+#define VIRTIO_NET_F_HOST_TSO6	12	/* Host can handle TSOv6 in. */
+#define VIRTIO_NET_F_HOST_ECN	13	/* Host can handle TSO[6] w/ ECN in. */
+#define VIRTIO_NET_F_HOST_UFO	14	/* Host can handle UFO in. */
+#define VIRTIO_NET_F_MRG_RXBUF	15	/* Host can merge receive buffers. */
+#define VIRTIO_NET_F_STATUS	16	/* virtio_net_config.status available */
+#define VIRTIO_NET_F_CTRL_VQ	17	/* Control channel available */
+#define VIRTIO_NET_F_CTRL_RX	18	/* Control channel RX mode support */
+#define VIRTIO_NET_F_CTRL_VLAN	19	/* Control channel VLAN filtering */
+#define VIRTIO_NET_F_CTRL_RX_EXTRA 20	/* Extra RX mode control support */
+#define VIRTIO_NET_F_GUEST_ANNOUNCE 21	/* Guest can announce device on the
+					 * network */
+#define VIRTIO_NET_F_MQ	22	/* Device supports Receive Flow
+					 * Steering */
+#define VIRTIO_NET_F_CTRL_MAC_ADDR 23	/* Set MAC address */
+
+#define VIRTIO_NET_S_LINK_UP	1	/* Link is up */
+#define VIRTIO_NET_S_ANNOUNCE	2	/* Announcement is needed */
+
+struct virtio_net_config {
+	/* The config defining mac address (if VIRTIO_NET_F_MAC) */
+	__u8 mac[ETH_ALEN];
+	/* See VIRTIO_NET_F_STATUS and VIRTIO_NET_S_* above */
+	__u16 status;
+	/* Maximum number of each of transmit and receive queues;
+	 * see VIRTIO_NET_F_MQ and VIRTIO_NET_CTRL_MQ.
+	 * Legal values are between 1 and 0x8000
+	 */
+	__u16 max_virtqueue_pairs;
+} __attribute__((packed));
+
+/* This header comes first in the scatter-gather list.
+ * If VIRTIO_F_ANY_LAYOUT is not negotiated, it must
+ * be the first element of the scatter-gather list.  If you don't
+ * specify GSO or CSUM features, you can simply ignore the header. */
+struct virtio_net_hdr {
+#define VIRTIO_NET_HDR_F_NEEDS_CSUM	1	// Use csum_start, csum_offset
+#define VIRTIO_NET_HDR_F_DATA_VALID	2	// Csum is valid
+	__u8 flags;
+#define VIRTIO_NET_HDR_GSO_NONE		0	// Not a GSO frame
+#define VIRTIO_NET_HDR_GSO_TCPV4	1	// GSO frame, IPv4 TCP (TSO)
+#define VIRTIO_NET_HDR_GSO_UDP		3	// GSO frame, IPv4 UDP (UFO)
+#define VIRTIO_NET_HDR_GSO_TCPV6	4	// GSO frame, IPv6 TCP
+#define VIRTIO_NET_HDR_GSO_ECN		0x80	// TCP has ECN set
+	__u8 gso_type;
+	__u16 hdr_len;		/* Ethernet + IP + tcp/udp hdrs */
+	__u16 gso_size;		/* Bytes to append to hdr_len per frame */
+	__u16 csum_start;	/* Position to start checksumming from */
+	__u16 csum_offset;	/* Offset after that to place checksum */
+};
+
+/* This is the version of the header to use when the MRG_RXBUF
+ * feature has been negotiated. */
+struct virtio_net_hdr_mrg_rxbuf {
+	struct virtio_net_hdr hdr;
+	__u16 num_buffers;	/* Number of merged rx buffers */
+};
+
+/*
+ * Control virtqueue data structures
+ *
+ * The control virtqueue expects a header in the first sg entry
+ * and an ack/status response in the last entry.  Data for the
+ * command goes in between.
+ */
+struct virtio_net_ctrl_hdr {
+	__u8 class;
+	__u8 cmd;
+} __attribute__((packed));
+
+typedef __u8 virtio_net_ctrl_ack;
+
+#define VIRTIO_NET_OK     0
+#define VIRTIO_NET_ERR    1
+
+/*
+ * Control the RX mode, i.e. promiscuous, allmulti, etc.
+ * All commands require an "out" sg entry containing a 1 byte
+ * state value, zero = disable, non-zero = enable.  Commands
+ * 0 and 1 are supported with the VIRTIO_NET_F_CTRL_RX feature.
+ * Commands 2-5 are added with VIRTIO_NET_F_CTRL_RX_EXTRA.
+ */
+#define VIRTIO_NET_CTRL_RX    0
+ #define VIRTIO_NET_CTRL_RX_PROMISC      0
+ #define VIRTIO_NET_CTRL_RX_ALLMULTI     1
+ #define VIRTIO_NET_CTRL_RX_ALLUNI       2
+ #define VIRTIO_NET_CTRL_RX_NOMULTI      3
+ #define VIRTIO_NET_CTRL_RX_NOUNI        4
+ #define VIRTIO_NET_CTRL_RX_NOBCAST      5
+
+/*
+ * Control the MAC
+ *
+ * The MAC filter table is managed by the hypervisor; the guest should
+ * assume the size is infinite.  Filtering should be considered
+ * non-perfect, i.e. depending on hypervisor resources, the guest may
+ * receive packets from sources not specified in the filter list.
+ *
+ * In addition to the class/cmd header, the TABLE_SET command requires
+ * two out scatterlists.  Each contains a 4 byte count of entries followed
+ * by a concatenated byte stream of the ETH_ALEN MAC addresses.  The
+ * first sg list contains unicast addresses, the second is for multicast.
+ * This functionality is present if the VIRTIO_NET_F_CTRL_RX feature
+ * is available.
+ *
+ * The ADDR_SET command requires one out scatterlist containing a
+ * 6-byte MAC address.  This functionality is present if the
+ * VIRTIO_NET_F_CTRL_MAC_ADDR feature is available.
+ */
+struct virtio_net_ctrl_mac {
+	__u32 entries;
+	__u8 macs[][ETH_ALEN];
+} __attribute__((packed));
+
+#define VIRTIO_NET_CTRL_MAC    1
+ #define VIRTIO_NET_CTRL_MAC_TABLE_SET        0
+ #define VIRTIO_NET_CTRL_MAC_ADDR_SET         1
+
+/*
+ * Control VLAN filtering
+ *
+ * The VLAN filter table is controlled via a simple ADD/DEL interface.
+ * VLAN IDs not added may be filtered by the hypervisor.  Del is the
+ * opposite of add.  Both commands expect an out entry containing a 2
+ * byte VLAN ID.  VLAN filtering is available with the
+ * VIRTIO_NET_F_CTRL_VLAN feature bit.
+ */
+#define VIRTIO_NET_CTRL_VLAN       2
+ #define VIRTIO_NET_CTRL_VLAN_ADD             0
+ #define VIRTIO_NET_CTRL_VLAN_DEL             1
+
+/*
+ * Control link announce acknowledgement
+ *
+ * The command VIRTIO_NET_CTRL_ANNOUNCE_ACK is used to indicate that
+ * the driver has received the notification; the device will clear the
+ * VIRTIO_NET_S_ANNOUNCE bit in the status field after it receives
+ * this command.
+ */
+#define VIRTIO_NET_CTRL_ANNOUNCE       3
+ #define VIRTIO_NET_CTRL_ANNOUNCE_ACK         0
+
+/*
+ * Control Receive Flow Steering
+ *
+ * The command VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET
+ * enables Receive Flow Steering, specifying the number of the transmit and
+ * receive queues that will be used. After the command is consumed and acked by
+ * the device, the device will not steer new packets on receive virtqueues
+ * other than specified nor read from transmit virtqueues other than specified.
+ * Accordingly, the driver should not transmit new packets on virtqueues
+ * other than those specified.
+ */
+struct virtio_net_ctrl_mq {
+	__u16 virtqueue_pairs;
+};
+
+#define VIRTIO_NET_CTRL_MQ   4
+ #define VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET        0
+ #define VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN        1
+ #define VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX        0x8000
+
+#endif /* _LINUX_VIRTIO_NET_H */
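
To make the "header first, ack last" control-virtqueue layout concrete, here is a sketch of the buffer a driver could post for VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET; the wrapper struct is illustrative and the descriptor-chain plumbing itself is omitted:

struct virtio_net_mq_cmd {
	struct virtio_net_ctrl_hdr hdr;	/* sg entry 0: device-readable */
	struct virtio_net_ctrl_mq mq;	/* sg entry 1: device-readable */
	virtio_net_ctrl_ack ack;	/* sg entry 2: device-writable */
};

static void build_mq_cmd(struct virtio_net_mq_cmd *cmd, __u16 pairs)
{
	cmd->hdr.class = VIRTIO_NET_CTRL_MQ;
	cmd->hdr.cmd = VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET;
	cmd->mq.virtqueue_pairs = pairs;	/* MIN <= pairs <= MAX */
	cmd->ack = VIRTIO_NET_ERR;	/* device overwrites with VIRTIO_NET_OK */
}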
diff --git a/kvmtool/include/linux/virtio_scsi.h b/kvmtool/include/linux/virtio_scsi.h
new file mode 100644
index 0000000..de429d1
--- /dev/null
+++ b/kvmtool/include/linux/virtio_scsi.h
@@ -0,0 +1,162 @@
+/*
+ * This header is BSD licensed so anyone can use the definitions to implement
+ * compatible drivers/servers.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _LINUX_VIRTIO_SCSI_H
+#define _LINUX_VIRTIO_SCSI_H
+
+#define VIRTIO_SCSI_CDB_SIZE   32
+#define VIRTIO_SCSI_SENSE_SIZE 96
+
+/* SCSI command request, followed by data-out */
+struct virtio_scsi_cmd_req {
+	u8 lun[8];		/* Logical Unit Number */
+	u64 tag;		/* Command identifier */
+	u8 task_attr;		/* Task attribute */
+	u8 prio;		/* SAM command priority field */
+	u8 crn;
+	u8 cdb[VIRTIO_SCSI_CDB_SIZE];
+} __packed;
+
+/* SCSI command request, followed by protection information */
+struct virtio_scsi_cmd_req_pi {
+	u8 lun[8];		/* Logical Unit Number */
+	u64 tag;		/* Command identifier */
+	u8 task_attr;		/* Task attribute */
+	u8 prio;		/* SAM command priority field */
+	u8 crn;
+	u32 pi_bytesout;	/* DataOUT PI Number of bytes */
+	u32 pi_bytesin;		/* DataIN PI Number of bytes */
+	u8 cdb[VIRTIO_SCSI_CDB_SIZE];
+} __packed;
+
+/* Response, followed by sense data and data-in */
+struct virtio_scsi_cmd_resp {
+	u32 sense_len;		/* Sense data length */
+	u32 resid;		/* Residual bytes in data buffer */
+	u16 status_qualifier;	/* Status qualifier */
+	u8 status;		/* Command completion status */
+	u8 response;		/* Response values */
+	u8 sense[VIRTIO_SCSI_SENSE_SIZE];
+} __packed;
+
+/* Task Management Request */
+struct virtio_scsi_ctrl_tmf_req {
+	u32 type;
+	u32 subtype;
+	u8 lun[8];
+	u64 tag;
+} __packed;
+
+struct virtio_scsi_ctrl_tmf_resp {
+	u8 response;
+} __packed;
+
+/* Asynchronous notification query/subscription */
+struct virtio_scsi_ctrl_an_req {
+	u32 type;
+	u8 lun[8];
+	u32 event_requested;
+} __packed;
+
+struct virtio_scsi_ctrl_an_resp {
+	u32 event_actual;
+	u8 response;
+} __packed;
+
+struct virtio_scsi_event {
+	u32 event;
+	u8 lun[8];
+	u32 reason;
+} __packed;
+
+struct virtio_scsi_config {
+	u32 num_queues;
+	u32 seg_max;
+	u32 max_sectors;
+	u32 cmd_per_lun;
+	u32 event_info_size;
+	u32 sense_size;
+	u32 cdb_size;
+	u16 max_channel;
+	u16 max_target;
+	u32 max_lun;
+} __packed;
+
+/* Feature Bits */
+#define VIRTIO_SCSI_F_INOUT                    0
+#define VIRTIO_SCSI_F_HOTPLUG                  1
+#define VIRTIO_SCSI_F_CHANGE                   2
+#define VIRTIO_SCSI_F_T10_PI                   3
+
+/* Response codes */
+#define VIRTIO_SCSI_S_OK                       0
+#define VIRTIO_SCSI_S_OVERRUN                  1
+#define VIRTIO_SCSI_S_ABORTED                  2
+#define VIRTIO_SCSI_S_BAD_TARGET               3
+#define VIRTIO_SCSI_S_RESET                    4
+#define VIRTIO_SCSI_S_BUSY                     5
+#define VIRTIO_SCSI_S_TRANSPORT_FAILURE        6
+#define VIRTIO_SCSI_S_TARGET_FAILURE           7
+#define VIRTIO_SCSI_S_NEXUS_FAILURE            8
+#define VIRTIO_SCSI_S_FAILURE                  9
+#define VIRTIO_SCSI_S_FUNCTION_SUCCEEDED       10
+#define VIRTIO_SCSI_S_FUNCTION_REJECTED        11
+#define VIRTIO_SCSI_S_INCORRECT_LUN            12
+
+/* Controlq type codes.  */
+#define VIRTIO_SCSI_T_TMF                      0
+#define VIRTIO_SCSI_T_AN_QUERY                 1
+#define VIRTIO_SCSI_T_AN_SUBSCRIBE             2
+
+/* Valid TMF subtypes.  */
+#define VIRTIO_SCSI_T_TMF_ABORT_TASK           0
+#define VIRTIO_SCSI_T_TMF_ABORT_TASK_SET       1
+#define VIRTIO_SCSI_T_TMF_CLEAR_ACA            2
+#define VIRTIO_SCSI_T_TMF_CLEAR_TASK_SET       3
+#define VIRTIO_SCSI_T_TMF_I_T_NEXUS_RESET      4
+#define VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET   5
+#define VIRTIO_SCSI_T_TMF_QUERY_TASK           6
+#define VIRTIO_SCSI_T_TMF_QUERY_TASK_SET       7
+
+/* Events.  */
+#define VIRTIO_SCSI_T_EVENTS_MISSED            0x80000000
+#define VIRTIO_SCSI_T_NO_EVENT                 0
+#define VIRTIO_SCSI_T_TRANSPORT_RESET          1
+#define VIRTIO_SCSI_T_ASYNC_NOTIFY             2
+#define VIRTIO_SCSI_T_PARAM_CHANGE             3
+
+/* Reasons of transport reset event */
+#define VIRTIO_SCSI_EVT_RESET_HARD             0
+#define VIRTIO_SCSI_EVT_RESET_RESCAN           1
+#define VIRTIO_SCSI_EVT_RESET_REMOVED          2
+
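+/* Task attributes, as used in virtio_scsi_cmd_req.task_attr */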
+#define VIRTIO_SCSI_S_SIMPLE                   0
+#define VIRTIO_SCSI_S_ORDERED                  1
+#define VIRTIO_SCSI_S_HEAD                     2
+#define VIRTIO_SCSI_S_ACA                      3
+
+#endif /* _LINUX_VIRTIO_SCSI_H */
diff --git a/kvmtool/ioeventfd.c b/kvmtool/ioeventfd.c
new file mode 100644
index 0000000..3ae8267
--- /dev/null
+++ b/kvmtool/ioeventfd.c
@@ -0,0 +1,224 @@
+#include <sys/epoll.h>
+#include <sys/ioctl.h>
+#include <pthread.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <signal.h>
+
+#include <linux/kernel.h>
+#include <linux/kvm.h>
+#include <linux/types.h>
+
+#include "kvm/ioeventfd.h"
+#include "kvm/kvm.h"
+#include "kvm/util.h"
+
+#define IOEVENTFD_MAX_EVENTS	20
+
+static struct	epoll_event events[IOEVENTFD_MAX_EVENTS];
+static int	epoll_fd, epoll_stop_fd;
+static LIST_HEAD(used_ioevents);
+static bool	ioeventfd_avail;
+
+static void *ioeventfd__thread(void *param)
+{
+	u64 tmp = 1;
+
+	kvm__set_thread_name("ioeventfd-worker");
+
+	for (;;) {
+		int nfds, i;
+
+		nfds = epoll_wait(epoll_fd, events, IOEVENTFD_MAX_EVENTS, -1);
+		for (i = 0; i < nfds; i++) {
+			struct ioevent *ioevent;
+
+			if (events[i].data.fd == epoll_stop_fd)
+				goto done;
+
+			ioevent = events[i].data.ptr;
+
+			if (read(ioevent->fd, &tmp, sizeof(tmp)) < 0)
+				die("Failed reading event");
+
+			ioevent->fn(ioevent->fn_kvm, ioevent->fn_ptr);
+		}
+	}
+
+done:
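+	/*
+	 * Write the token back so the read() in ioeventfd__exit() has
+	 * something to consume once this worker has seen the stop request.
+	 */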
+	tmp = write(epoll_stop_fd, &tmp, sizeof(tmp));
+
+	return NULL;
+}
+
+static int ioeventfd__start(void)
+{
+	pthread_t thread;
+
+	if (!ioeventfd_avail)
+		return -ENOSYS;
+
+	return pthread_create(&thread, NULL, ioeventfd__thread, NULL);
+}
+
+int ioeventfd__init(struct kvm *kvm)
+{
+	struct epoll_event epoll_event = {.events = EPOLLIN};
+	int r;
+
+	ioeventfd_avail = kvm__supports_extension(kvm, KVM_CAP_IOEVENTFD);
+	if (!ioeventfd_avail)
+		return 1; /* Not fatal; let the caller decide whether to continue. */
+
+	epoll_fd = epoll_create(IOEVENTFD_MAX_EVENTS);
+	if (epoll_fd < 0)
+		return -errno;
+
+	epoll_stop_fd = eventfd(0, 0);
+	if (epoll_stop_fd < 0) {
+		r = -errno;
+		close(epoll_fd);
+		return r;
+	}
+	epoll_event.data.fd = epoll_stop_fd;
+
+	r = epoll_ctl(epoll_fd, EPOLL_CTL_ADD, epoll_stop_fd, &epoll_event);
+	if (r < 0)
+		goto cleanup;
+
+	r = ioeventfd__start();
+	if (r < 0)
+		goto cleanup;
+
+	r = 0;
+
+	return r;
+
+cleanup:
+	close(epoll_stop_fd);
+	close(epoll_fd);
+
+	return r;
+}
+base_init(ioeventfd__init);
+
+int ioeventfd__exit(struct kvm *kvm)
+{
+	u64 tmp = 1;
+	int r;
+
+	if (!ioeventfd_avail)
+		return 0;
+
+	r = write(epoll_stop_fd, &tmp, sizeof(tmp));
+	if (r < 0)
+		return r;
+
+	r = read(epoll_stop_fd, &tmp, sizeof(tmp));
+	if (r < 0)
+		return r;
+
+	close(epoll_fd);
+	close(epoll_stop_fd);
+
+	return 0;
+}
+base_exit(ioeventfd__exit);
+
+int ioeventfd__add_event(struct ioevent *ioevent, int flags)
+{
+	struct kvm_ioeventfd kvm_ioevent;
+	struct epoll_event epoll_event;
+	struct ioevent *new_ioevent;
+	int event, r;
+
+	if (!ioeventfd_avail)
+		return -ENOSYS;
+
+	new_ioevent = malloc(sizeof(*new_ioevent));
+	if (new_ioevent == NULL)
+		return -ENOMEM;
+
+	*new_ioevent = *ioevent;
+	event = new_ioevent->fd;
+
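+	/*
+	 * Register the eventfd with KVM: on a matching guest write, KVM
+	 * signals the fd directly instead of taking a full exit to
+	 * userspace. DATAMATCH limits this to writes of ->datamatch.
+	 */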
+	kvm_ioevent = (struct kvm_ioeventfd) {
+		.addr		= ioevent->io_addr,
+		.len		= ioevent->io_len,
+		.datamatch	= ioevent->datamatch,
+		.fd		= event,
+		.flags		= KVM_IOEVENTFD_FLAG_DATAMATCH,
+	};
+
+	/*
+	 * For architectures that don't recognize PIO accesses, always register
+	 * on the MMIO bus. Otherwise PIO accesses will cause returns to
+	 * userspace.
+	 */
+	if (KVM_IOEVENTFD_HAS_PIO && flags & IOEVENTFD_FLAG_PIO)
+		kvm_ioevent.flags |= KVM_IOEVENTFD_FLAG_PIO;
+
+	r = ioctl(ioevent->fn_kvm->vm_fd, KVM_IOEVENTFD, &kvm_ioevent);
+	if (r) {
+		r = -errno;
+		goto cleanup;
+	}
+
+	if (flags & IOEVENTFD_FLAG_USER_POLL) {
+		epoll_event = (struct epoll_event) {
+			.events		= EPOLLIN,
+			.data.ptr	= new_ioevent,
+		};
+
+		r = epoll_ctl(epoll_fd, EPOLL_CTL_ADD, event, &epoll_event);
+		if (r) {
+			r = -errno;
+			goto cleanup;
+		}
+	}
+
+	new_ioevent->flags = kvm_ioevent.flags;
+	list_add_tail(&new_ioevent->list, &used_ioevents);
+
+	return 0;
+
+cleanup:
+	free(new_ioevent);
+	return r;
+}
+
+int ioeventfd__del_event(u64 addr, u64 datamatch)
+{
+	struct kvm_ioeventfd kvm_ioevent;
+	struct ioevent *ioevent;
+	u8 found = 0;
+
+	if (!ioeventfd_avail)
+		return -ENOSYS;
+
+	list_for_each_entry(ioevent, &used_ioevents, list) {
+		if (ioevent->io_addr == addr &&
+		    ioevent->datamatch == datamatch) {
+			found = 1;
+			break;
+		}
+	}
+
+	if (!found)
+		return -ENOENT;
+
+	kvm_ioevent = (struct kvm_ioeventfd) {
+		.fd			= ioevent->fd,
+		.addr			= ioevent->io_addr,
+		.len			= ioevent->io_len,
+		.datamatch		= ioevent->datamatch,
+		.flags			= ioevent->flags
+					| KVM_IOEVENTFD_FLAG_DEASSIGN,
+	};
+
+	ioctl(ioevent->fn_kvm->vm_fd, KVM_IOEVENTFD, &kvm_ioevent);
+
+	epoll_ctl(epoll_fd, EPOLL_CTL_DEL, ioevent->fd, NULL);
+
+	list_del(&ioevent->list);
+
+	close(ioevent->fd);
+	free(ioevent);
+
+	return 0;
+}
diff --git a/kvmtool/ioport.c b/kvmtool/ioport.c
new file mode 100644
index 0000000..844a832
--- /dev/null
+++ b/kvmtool/ioport.c
@@ -0,0 +1,235 @@
+#include "kvm/ioport.h"
+
+#include "kvm/kvm.h"
+#include "kvm/util.h"
+#include "kvm/rbtree-interval.h"
+#include "kvm/mutex.h"
+
+#include <linux/kvm.h>	/* for KVM_EXIT_* */
+#include <linux/types.h>
+
+#include <stdbool.h>
+#include <limits.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+#define ioport_node(n) rb_entry(n, struct ioport, node)
+
+static DEFINE_MUTEX(ioport_lock);
+
+static struct rb_root		ioport_tree = RB_ROOT;
+
+static struct ioport *ioport_search(struct rb_root *root, u64 addr)
+{
+	struct rb_int_node *node;
+
+	node = rb_int_search_single(root, addr);
+	if (node == NULL)
+		return NULL;
+
+	return ioport_node(node);
+}
+
+static int ioport_insert(struct rb_root *root, struct ioport *data)
+{
+	return rb_int_insert(root, &data->node);
+}
+
+static void ioport_remove(struct rb_root *root, struct ioport *data)
+{
+	rb_int_erase(root, &data->node);
+}
+
+static struct ioport *ioport_get(struct rb_root *root, u64 addr)
+{
+	struct ioport *ioport;
+
+	mutex_lock(&ioport_lock);
+	ioport = ioport_search(root, addr);
+	if (ioport)
+		ioport->refcount++;
+	mutex_unlock(&ioport_lock);
+
+	return ioport;
+}
+
+/* Called with ioport_lock held. */
+static void ioport_unregister(struct rb_root *root, struct ioport *data)
+{
+	device__unregister(&data->dev_hdr);
+	ioport_remove(root, data);
+	free(data);
+}
+
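+/*
+ * Drop a reference taken via ioport_get(). If ioport__unregister() ran
+ * while I/O was still in flight, the final put performs the deferred
+ * removal.
+ */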
+static void ioport_put(struct rb_root *root, struct ioport *data)
+{
+	mutex_lock(&ioport_lock);
+	data->refcount--;
+	if (data->remove && data->refcount == 0)
+		ioport_unregister(root, data);
+	mutex_unlock(&ioport_lock);
+}
+
+#ifdef CONFIG_HAS_LIBFDT
+static void generate_ioport_fdt_node(void *fdt,
+				     struct device_header *dev_hdr,
+				     void (*generate_irq_prop)(void *fdt,
+							       u8 irq,
+							       enum irq_type))
+{
+	struct ioport *ioport = container_of(dev_hdr, struct ioport, dev_hdr);
+	struct ioport_operations *ops = ioport->ops;
+
+	if (ops->generate_fdt_node)
+		ops->generate_fdt_node(ioport, fdt, generate_irq_prop);
+}
+#else
+static void generate_ioport_fdt_node(void *fdt,
+				     struct device_header *dev_hdr,
+				     void (*generate_irq_prop)(void *fdt,
+							       u8 irq,
+							       enum irq_type))
+{
+	die("Unable to generate device tree nodes without libfdt\n");
+}
+#endif
+
+int ioport__register(struct kvm *kvm, u16 port, struct ioport_operations *ops, int count, void *param)
+{
+	struct ioport *entry;
+	int r;
+
+	entry = malloc(sizeof(*entry));
+	if (entry == NULL)
+		return -ENOMEM;
+
+	*entry = (struct ioport) {
+		.node		= RB_INT_INIT(port, port + count),
+		.ops		= ops,
+		.priv		= param,
+		.dev_hdr	= (struct device_header) {
+			.bus_type	= DEVICE_BUS_IOPORT,
+			.data		= generate_ioport_fdt_node,
+		},
+		/*
+		 * Start from 0 because ioport__unregister() doesn't decrement
+		 * the reference count.
+		 */
+		.refcount	= 0,
+		.remove		= false,
+	};
+
+	mutex_lock(&ioport_lock);
+	r = ioport_insert(&ioport_tree, entry);
+	if (r < 0)
+		goto out_free;
+	r = device__register(&entry->dev_hdr);
+	if (r < 0)
+		goto out_remove;
+	mutex_unlock(&ioport_lock);
+
+	return port;
+
+out_remove:
+	ioport_remove(&ioport_tree, entry);
+out_free:
+	free(entry);
+	mutex_unlock(&ioport_lock);
+	return r;
+}
+
+int ioport__unregister(struct kvm *kvm, u16 port)
+{
+	struct ioport *entry;
+
+	mutex_lock(&ioport_lock);
+	entry = ioport_search(&ioport_tree, port);
+	if (!entry) {
+		mutex_unlock(&ioport_lock);
+		return -ENOENT;
+	}
+	/* The same reasoning from kvm__deregister_mmio() applies. */
+	if (entry->refcount == 0)
+		ioport_unregister(&ioport_tree, entry);
+	else
+		entry->remove = true;
+	mutex_unlock(&ioport_lock);
+
+	return 0;
+}
+
+static void ioport__unregister_all(void)
+{
+	struct ioport *entry;
+	struct rb_node *rb;
+	struct rb_int_node *rb_node;
+
+	rb = rb_first(&ioport_tree);
+	while (rb) {
+		rb_node = rb_int(rb);
+		entry = ioport_node(rb_node);
+		ioport_unregister(&ioport_tree, entry);
+		rb = rb_first(&ioport_tree);
+	}
+}
+
+static const char *to_direction(int direction)
+{
+	if (direction == KVM_EXIT_IO_IN)
+		return "IN";
+	else
+		return "OUT";
+}
+
+static void ioport_error(u16 port, void *data, int direction, int size, u32 count)
+{
+	fprintf(stderr, "IO error: %s port=%x, size=%d, count=%u\n", to_direction(direction), port, size, count);
+}
+
+bool kvm__emulate_io(struct kvm_cpu *vcpu, u16 port, void *data, int direction, int size, u32 count)
+{
+	struct ioport_operations *ops;
+	bool ret = false;
+	struct ioport *entry;
+	void *ptr = data;
+	struct kvm *kvm = vcpu->kvm;
+
+	entry = ioport_get(&ioport_tree, port);
+	if (!entry)
+		goto out;
+
+	ops	= entry->ops;
+
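+	/*
+	 * KVM_EXIT_IO can describe a string instruction (e.g. x86 INS/OUTS),
+	 * so invoke the handler once per element, advancing the data pointer
+	 * by the access size each time.
+	 */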
+	while (count--) {
+		if (direction == KVM_EXIT_IO_IN && ops->io_in)
+				ret = ops->io_in(entry, vcpu, port, ptr, size);
+		else if (direction == KVM_EXIT_IO_OUT && ops->io_out)
+				ret = ops->io_out(entry, vcpu, port, ptr, size);
+
+		ptr += size;
+	}
+
+	ioport_put(&ioport_tree, entry);
+
+out:
+	if (ret)
+		return true;
+
+	if (kvm->cfg.ioport_debug)
+		ioport_error(port, data, direction, size, count);
+
+	return !kvm->cfg.ioport_debug;
+}
+
+int ioport__init(struct kvm *kvm)
+{
+	return ioport__setup_arch(kvm);
+}
+dev_base_init(ioport__init);
+
+int ioport__exit(struct kvm *kvm)
+{
+	ioport__unregister_all();
+	return 0;
+}
+dev_base_exit(ioport__exit);
diff --git a/kvmtool/irq.c b/kvmtool/irq.c
new file mode 100644
index 0000000..cdcf992
--- /dev/null
+++ b/kvmtool/irq.c
@@ -0,0 +1,209 @@
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <linux/types.h>
+#include <linux/kvm.h>
+#include <errno.h>
+
+#include "kvm/kvm.h"
+#include "kvm/irq.h"
+#include "kvm/kvm-arch.h"
+
+static u8 next_line = KVM_IRQ_OFFSET;
+static int allocated_gsis = 0;
+
+int next_gsi;
+
+struct msi_routing_ops irq__default_routing_ops;
+struct msi_routing_ops *msi_routing_ops = &irq__default_routing_ops;
+
+struct kvm_irq_routing *irq_routing = NULL;
+
+int irq__alloc_line(void)
+{
+	return next_line++;
+}
+
+int irq__get_nr_allocated_lines(void)
+{
+	return next_line - KVM_IRQ_OFFSET;
+}
+
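+/*
+ * Ensure irq_routing has room for at least one more entry, growing the
+ * table in chunks of 32 GSIs to amortise realloc() and re-zeroing.
+ */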
+int irq__allocate_routing_entry(void)
+{
+	size_t table_size = sizeof(struct kvm_irq_routing);
+	size_t old_size = table_size;
+	struct kvm_irq_routing *new_routing;
+	int nr_entries = 0;
+	int new_gsis;
+
+	if (irq_routing)
+		nr_entries = irq_routing->nr;
+
+	if (nr_entries < allocated_gsis)
+		return 0;
+
+	old_size += sizeof(struct kvm_irq_routing_entry) * allocated_gsis;
+	new_gsis = ALIGN(nr_entries + 1, 32);
+	table_size += sizeof(struct kvm_irq_routing_entry) * new_gsis;
+
+	/* Use a temporary so the old table isn't leaked if realloc() fails. */
+	new_routing = realloc(irq_routing, table_size);
+	if (new_routing == NULL)
+		return -ENOMEM;
+	irq_routing = new_routing;
+	allocated_gsis = new_gsis;
+	memset((void *)irq_routing + old_size, 0, table_size - old_size);
+
+	irq_routing->nr = nr_entries;
+	irq_routing->flags = 0;
+
+	return 0;
+}
+
+static bool check_for_irq_routing(struct kvm *kvm)
+{
+	static int has_irq_routing = 0;
+
+	if (has_irq_routing == 0) {
+		if (kvm__supports_extension(kvm, KVM_CAP_IRQ_ROUTING))
+			has_irq_routing = 1;
+		else
+			has_irq_routing = -1;
+	}
+
+	return has_irq_routing > 0;
+}
+
+static int irq__update_msix_routes(struct kvm *kvm,
+				   struct kvm_irq_routing_entry *entry)
+{
+	return ioctl(kvm->vm_fd, KVM_SET_GSI_ROUTING, irq_routing);
+}
+
+static bool irq__default_can_signal_msi(struct kvm *kvm)
+{
+	return kvm__supports_extension(kvm, KVM_CAP_SIGNAL_MSI);
+}
+
+static int irq__default_signal_msi(struct kvm *kvm, struct kvm_msi *msi)
+{
+	return ioctl(kvm->vm_fd, KVM_SIGNAL_MSI, msi);
+}
+
+struct msi_routing_ops irq__default_routing_ops = {
+	.update_route	= irq__update_msix_routes,
+	.signal_msi	= irq__default_signal_msi,
+	.can_signal_msi	= irq__default_can_signal_msi,
+};
+
+bool irq__can_signal_msi(struct kvm *kvm)
+{
+	return msi_routing_ops->can_signal_msi(kvm);
+}
+
+int irq__signal_msi(struct kvm *kvm, struct kvm_msi *msi)
+{
+	return msi_routing_ops->signal_msi(kvm, msi);
+}
+
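+/*
+ * Append an MSI route for @msg, push the updated table to KVM and
+ * return the newly allocated GSI (or a negative error code).
+ */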
+int irq__add_msix_route(struct kvm *kvm, struct msi_msg *msg, u32 device_id)
+{
+	int r;
+	struct kvm_irq_routing_entry *entry;
+
+	if (!check_for_irq_routing(kvm))
+		return -ENXIO;
+
+	r = irq__allocate_routing_entry();
+	if (r)
+		return r;
+
+	entry = &irq_routing->entries[irq_routing->nr];
+	*entry = (struct kvm_irq_routing_entry) {
+		.gsi = next_gsi,
+		.type = KVM_IRQ_ROUTING_MSI,
+		.u.msi.address_hi = msg->address_hi,
+		.u.msi.address_lo = msg->address_lo,
+		.u.msi.data = msg->data,
+	};
+
+	if (kvm->msix_needs_devid) {
+		entry->flags = KVM_MSI_VALID_DEVID;
+		entry->u.msi.devid = device_id;
+	}
+
+	irq_routing->nr++;
+
+	r = msi_routing_ops->update_route(kvm, entry);
+	if (r)
+		return r;
+
+	return next_gsi++;
+}
+
+static bool update_data(u32 *ptr, u32 newdata)
+{
+	if (*ptr == newdata)
+		return false;
+
+	*ptr = newdata;
+	return true;
+}
+
+void irq__update_msix_route(struct kvm *kvm, u32 gsi, struct msi_msg *msg)
+{
+	struct kvm_irq_routing_msi *entry;
+	unsigned int i;
+	bool changed;
+
+	for (i = 0; i < irq_routing->nr; i++)
+		if (gsi == irq_routing->entries[i].gsi)
+			break;
+	if (i == irq_routing->nr)
+		return;
+
+	entry = &irq_routing->entries[i].u.msi;
+
+	changed  = update_data(&entry->address_hi, msg->address_hi);
+	changed |= update_data(&entry->address_lo, msg->address_lo);
+	changed |= update_data(&entry->data, msg->data);
+
+	if (!changed)
+		return;
+
+	if (msi_routing_ops->update_route(kvm, &irq_routing->entries[i]))
+		die_perror("KVM_SET_GSI_ROUTING");
+}
+
+int irq__common_add_irqfd(struct kvm *kvm, unsigned int gsi, int trigger_fd,
+			   int resample_fd)
+{
+	struct kvm_irqfd irqfd = {
+		.fd		= trigger_fd,
+		.gsi		= gsi,
+		.flags		= resample_fd > 0 ? KVM_IRQFD_FLAG_RESAMPLE : 0,
+		.resamplefd	= resample_fd,
+	};
+
+	/* If we emulate MSI routing, translate the MSI to the corresponding IRQ */
+	if (msi_routing_ops->translate_gsi)
+		irqfd.gsi = msi_routing_ops->translate_gsi(kvm, gsi);
+
+	return ioctl(kvm->vm_fd, KVM_IRQFD, &irqfd);
+}
+
+void irq__common_del_irqfd(struct kvm *kvm, unsigned int gsi, int trigger_fd)
+{
+	struct kvm_irqfd irqfd = {
+		.fd		= trigger_fd,
+		.gsi		= gsi,
+		.flags		= KVM_IRQFD_FLAG_DEASSIGN,
+	};
+
+	if (msi_routing_ops->translate_gsi)
+		irqfd.gsi = msi_routing_ops->translate_gsi(kvm, gsi);
+
+	ioctl(kvm->vm_fd, KVM_IRQFD, &irqfd);
+}
+
+int __attribute__((weak)) irq__exit(struct kvm *kvm)
+{
+	free(irq_routing);
+	return 0;
+}
+dev_base_exit(irq__exit);
diff --git a/kvmtool/kvm-cmd.c b/kvmtool/kvm-cmd.c
new file mode 100644
index 0000000..2520b08
--- /dev/null
+++ b/kvmtool/kvm-cmd.c
@@ -0,0 +1,91 @@
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+
+/* user defined header files */
+#include "kvm/builtin-debug.h"
+#include "kvm/builtin-pause.h"
+#include "kvm/builtin-resume.h"
+#include "kvm/builtin-balloon.h"
+#include "kvm/builtin-list.h"
+#include "kvm/builtin-version.h"
+#include "kvm/builtin-setup.h"
+#include "kvm/builtin-stop.h"
+#include "kvm/builtin-stat.h"
+#include "kvm/builtin-help.h"
+#include "kvm/builtin-sandbox.h"
+#include "kvm/kvm-cmd.h"
+#include "kvm/builtin-run.h"
+#include "kvm/util.h"
+
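+/*
+ * Example invocations dispatched through this table (names and paths are
+ * illustrative, assuming the binary is installed as "lkvm"):
+ *
+ *   lkvm run   --name guest-1 --kernel ./bzImage
+ *   lkvm pause --name guest-1
+ *   lkvm list
+ */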
+struct cmd_struct kvm_commands[] = {
+	{ "pause",	kvm_cmd_pause,		kvm_pause_help,		0 },
+	{ "resume",	kvm_cmd_resume,		kvm_resume_help,	0 },
+	{ "debug",	kvm_cmd_debug,		kvm_debug_help,		0 },
+	{ "balloon",	kvm_cmd_balloon,	kvm_balloon_help,	0 },
+	{ "list",	kvm_cmd_list,		kvm_list_help,		0 },
+	{ "version",	kvm_cmd_version,	NULL,			0 },
+	{ "--version",	kvm_cmd_version,	NULL,			0 },
+	{ "stop",	kvm_cmd_stop,		kvm_stop_help,		0 },
+	{ "stat",	kvm_cmd_stat,		kvm_stat_help,		0 },
+	{ "help",	kvm_cmd_help,		NULL,			0 },
+	{ "setup",	kvm_cmd_setup,		kvm_setup_help,		0 },
+	{ "run",	kvm_cmd_run,		kvm_run_help,		0 },
+	{ "sandbox",	kvm_cmd_sandbox,	kvm_run_help,		0 },
+	{ NULL,		NULL,			NULL,			0 },
+};
+
+/*
+ * kvm_get_command: Searches the command in an array of the commands and
+ * returns a pointer to cmd_struct if a match is found.
+ *
+ * Input parameters:
+ * command: Array of possible commands. The last entry in the array must be
+ *          NULL.
+ * cmd: A string command to search in the array
+ *
+ * Return Value:
+ * NULL: If the cmd is not matched with any of the command in the command array
+ * p: Pointer to cmd_struct of the matching command
+ */
+struct cmd_struct *kvm_get_command(struct cmd_struct *command,
+		const char *cmd)
+{
+	struct cmd_struct *p = command;
+
+	while (p->cmd) {
+		if (!strcmp(p->cmd, cmd))
+			return p;
+		p++;
+	}
+	return NULL;
+}
+
+int handle_command(struct cmd_struct *command, int argc, const char **argv)
+{
+	struct cmd_struct *p;
+	const char *prefix = NULL;
+	int ret = 0;
+
+	if (!argv || !*argv) {
+		p = kvm_get_command(command, "help");
+		BUG_ON(!p);
+		return p->fn(argc, argv, prefix);
+	}
+
+	p = kvm_get_command(command, argv[0]);
+	if (!p) {
+		p = kvm_get_command(command, "help");
+		BUG_ON(!p);
+		p->fn(0, NULL, prefix);
+		return EINVAL;
+	}
+
+	ret = p->fn(argc - 1, &argv[1], prefix);
+	if (ret < 0) {
+		if (errno == EPERM)
+			die("Permission error - are you root?");
+	}
+
+	return ret;
+}
diff --git a/kvmtool/kvm-cpu.c b/kvmtool/kvm-cpu.c
new file mode 100644
index 0000000..7dec088
--- /dev/null
+++ b/kvmtool/kvm-cpu.c
@@ -0,0 +1,335 @@
+#include "kvm/kvm-cpu.h"
+
+#include "kvm/symbol.h"
+#include "kvm/util.h"
+#include "kvm/kvm.h"
+#include "kvm/virtio.h"
+#include "kvm/mutex.h"
+#include "kvm/barrier.h"
+
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/eventfd.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <stdio.h>
+
+extern __thread struct kvm_cpu *current_kvm_cpu;
+
+int __attribute__((weak)) kvm_cpu__get_endianness(struct kvm_cpu *vcpu)
+{
+	return VIRTIO_ENDIAN_HOST;
+}
+
+void kvm_cpu__enable_singlestep(struct kvm_cpu *vcpu)
+{
+	struct kvm_guest_debug debug = {
+		.control	= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP,
+	};
+
+	if (ioctl(vcpu->vcpu_fd, KVM_SET_GUEST_DEBUG, &debug) < 0)
+		pr_warning("KVM_SET_GUEST_DEBUG failed");
+}
+
+void kvm_cpu__run(struct kvm_cpu *vcpu)
+{
+	int err;
+
+	if (!vcpu->is_running)
+		return;
+
+	err = ioctl(vcpu->vcpu_fd, KVM_RUN, 0);
+	if (err < 0 && (errno != EINTR && errno != EAGAIN))
+		die_perror("KVM_RUN failed");
+}
+
+static void kvm_cpu_signal_handler(int signum)
+{
+	if (signum == SIGKVMEXIT) {
+		if (current_kvm_cpu && current_kvm_cpu->is_running)
+			current_kvm_cpu->is_running = false;
+	} else if (signum == SIGKVMPAUSE) {
+		if (current_kvm_cpu->paused)
+			die("Pause signaled for already paused CPU\n");
+
+		/* pause_lock is held by kvm__pause() */
+		current_kvm_cpu->paused = 1;
+
+		/*
+		 * This is a blocking function and uses locks. It is safe
+		 * to call it for this signal as a second pause event should
+		 * not be sent to this thread until it acquires and releases
+		 * the pause_lock.
+		 */
+		kvm__notify_paused();
+	}
+
+	/* For SIGKVMTASK cpu->task is already set */
+}
+
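+/*
+ * Drain the coalesced MMIO ring shared with the kernel: KVM batches
+ * writes to registered regions there instead of exiting once per access,
+ * so replay them through the normal MMIO emulation path.
+ */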
+static void kvm_cpu__handle_coalesced_mmio(struct kvm_cpu *cpu)
+{
+	if (cpu->ring) {
+		while (cpu->ring->first != cpu->ring->last) {
+			struct kvm_coalesced_mmio *m;
+			m = &cpu->ring->coalesced_mmio[cpu->ring->first];
+			kvm_cpu__emulate_mmio(cpu,
+					      m->phys_addr,
+					      m->data,
+					      m->len,
+					      1);
+			cpu->ring->first = (cpu->ring->first + 1) % KVM_COALESCED_MMIO_MAX;
+		}
+	}
+}
+
+static DEFINE_MUTEX(task_lock);
+static int task_eventfd;
+
+static void kvm_cpu__run_task(struct kvm_cpu *cpu)
+{
+	u64 inc = 1;
+
+	pr_debug("Running task %p on cpu %lu", cpu->task, cpu->cpu_id);
+
+	/* Make sure we see the store to cpu->task */
+	rmb();
+	cpu->task->func(cpu, cpu->task->data);
+
+	/* Clear task before we signal completion */
+	cpu->task = NULL;
+	wmb();
+
+	if (write(task_eventfd, &inc, sizeof(inc)) < 0)
+		die("Failed notifying of completed task.");
+}
+
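+/*
+ * Post @task to every vCPU and wait until all of them have run it.
+ * Remote threads are kicked with SIGKVMTASK and report completion via
+ * task_eventfd; the calling vCPU (if any) runs the task inline so it
+ * does not deadlock waiting on itself.
+ */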
+void kvm_cpu__run_on_all_cpus(struct kvm *kvm, struct kvm_cpu_task *task)
+{
+	int i, done = 0;
+
+	pr_debug("Running task %p on all cpus", task);
+
+	mutex_lock(&task_lock);
+
+	for (i = 0; i < kvm->nrcpus; i++) {
+		if (kvm->cpus[i]->task) {
+			/* Should never happen */
+			die("CPU %d already has a task pending!", i);
+		}
+
+		kvm->cpus[i]->task = task;
+		wmb();
+
+		if (kvm->cpus[i] == current_kvm_cpu)
+			kvm_cpu__run_task(current_kvm_cpu);
+		else
+			pthread_kill(kvm->cpus[i]->thread, SIGKVMTASK);
+	}
+
+	while (done < kvm->nrcpus) {
+		u64 count;
+
+		if (read(task_eventfd, &count, sizeof(count)) < 0)
+			die("Failed reading task eventfd");
+
+		done += count;
+	}
+
+	mutex_unlock(&task_lock);
+}
+
+int kvm_cpu__start(struct kvm_cpu *cpu)
+{
+	sigset_t sigset;
+
+	sigemptyset(&sigset);
+	sigaddset(&sigset, SIGALRM);
+
+	pthread_sigmask(SIG_BLOCK, &sigset, NULL);
+
+	signal(SIGKVMEXIT, kvm_cpu_signal_handler);
+	signal(SIGKVMPAUSE, kvm_cpu_signal_handler);
+	signal(SIGKVMTASK, kvm_cpu_signal_handler);
+
+	kvm_cpu__reset_vcpu(cpu);
+
+	if (cpu->kvm->cfg.single_step)
+		kvm_cpu__enable_singlestep(cpu);
+
+	while (cpu->is_running) {
+		if (cpu->needs_nmi) {
+			kvm_cpu__arch_nmi(cpu);
+			cpu->needs_nmi = 0;
+		}
+
+		if (cpu->task)
+			kvm_cpu__run_task(cpu);
+
+		kvm_cpu__run(cpu);
+
+		switch (cpu->kvm_run->exit_reason) {
+		case KVM_EXIT_UNKNOWN:
+			break;
+		case KVM_EXIT_DEBUG:
+			kvm_cpu__show_registers(cpu);
+			kvm_cpu__show_code(cpu);
+			break;
+		case KVM_EXIT_IO: {
+			bool ret;
+
+			ret = kvm_cpu__emulate_io(cpu,
+						  cpu->kvm_run->io.port,
+						  (u8 *)cpu->kvm_run +
+						  cpu->kvm_run->io.data_offset,
+						  cpu->kvm_run->io.direction,
+						  cpu->kvm_run->io.size,
+						  cpu->kvm_run->io.count);
+
+			if (!ret)
+				goto panic_kvm;
+			break;
+		}
+		case KVM_EXIT_MMIO: {
+			bool ret;
+
+			/*
+			 * If we had MMIO exit, coalesced ring should be processed
+			 * *before* processing the exit itself
+			 */
+			kvm_cpu__handle_coalesced_mmio(cpu);
+
+			ret = kvm_cpu__emulate_mmio(cpu,
+						    cpu->kvm_run->mmio.phys_addr,
+						    cpu->kvm_run->mmio.data,
+						    cpu->kvm_run->mmio.len,
+						    cpu->kvm_run->mmio.is_write);
+
+			if (!ret)
+				goto panic_kvm;
+			break;
+		}
+		case KVM_EXIT_INTR:
+			if (cpu->is_running)
+				break;
+			goto exit_kvm;
+		case KVM_EXIT_SHUTDOWN:
+			goto exit_kvm;
+		case KVM_EXIT_SYSTEM_EVENT:
+			/*
+			 * Print the type of system event and
+			 * treat all system events as shutdown request.
+			 */
+			switch (cpu->kvm_run->system_event.type) {
+			default:
+				pr_warning("unknown system event type %d",
+					   cpu->kvm_run->system_event.type);
+				/* fall through for now */
+			case KVM_SYSTEM_EVENT_RESET:
+				/* Fall through for now */
+			case KVM_SYSTEM_EVENT_SHUTDOWN:
+				/*
+				 * Ensure that all VCPUs are torn down,
+				 * regardless of which CPU generated the event.
+				 */
+				kvm__reboot(cpu->kvm);
+				goto exit_kvm;
+			}
+			break;
+		default: {
+			bool ret;
+
+			ret = kvm_cpu__handle_exit(cpu);
+			if (!ret)
+				goto panic_kvm;
+			break;
+		}
+		}
+		kvm_cpu__handle_coalesced_mmio(cpu);
+	}
+
+exit_kvm:
+	return 0;
+
+panic_kvm:
+	return 1;
+}
+
+int kvm_cpu__init(struct kvm *kvm)
+{
+	int max_cpus, recommended_cpus, i;
+
+	max_cpus = kvm__max_cpus(kvm);
+	recommended_cpus = kvm__recommended_cpus(kvm);
+
+	if (kvm->cfg.nrcpus > max_cpus) {
+		printf("  # Limit the number of CPUs to %d\n", max_cpus);
+		kvm->cfg.nrcpus = max_cpus;
+	} else if (kvm->cfg.nrcpus > recommended_cpus) {
+		printf("  # Warning: The maximum recommended amount of VCPUs"
+			" is %d\n", recommended_cpus);
+	}
+
+	kvm->nrcpus = kvm->cfg.nrcpus;
+
+	task_eventfd = eventfd(0, 0);
+	if (task_eventfd < 0) {
+		pr_warning("Couldn't create task_eventfd");
+		return task_eventfd;
+	}
+
+	/* Alloc one pointer too many, so array ends up 0-terminated */
+	kvm->cpus = calloc(kvm->nrcpus + 1, sizeof(void *));
+	if (!kvm->cpus) {
+		pr_warning("Couldn't allocate array for %d CPUs", kvm->nrcpus);
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < kvm->nrcpus; i++) {
+		kvm->cpus[i] = kvm_cpu__arch_init(kvm, i);
+		if (!kvm->cpus[i]) {
+			pr_warning("unable to initialize KVM VCPU");
+			goto fail_alloc;
+		}
+	}
+
+	return 0;
+
+fail_alloc:
+	for (i = 0; i < kvm->nrcpus; i++)
+		free(kvm->cpus[i]);
+	return -ENOMEM;
+}
+base_init(kvm_cpu__init);
+
+int kvm_cpu__exit(struct kvm *kvm)
+{
+	int i, r = 0;
+	void *ret = NULL;
+
+	kvm_cpu__delete(kvm->cpus[0]);
+	kvm->cpus[0] = NULL;
+
+	kvm__pause(kvm);
+	for (i = 1; i < kvm->nrcpus; i++) {
+		if (kvm->cpus[i]->is_running) {
+			pthread_kill(kvm->cpus[i]->thread, SIGKVMEXIT);
+			if (pthread_join(kvm->cpus[i]->thread, &ret) != 0)
+				die("pthread_join");
+			kvm_cpu__delete(kvm->cpus[i]);
+		}
+		if (ret == NULL)
+			r = 0;
+	}
+	kvm__continue(kvm);
+
+	free(kvm->cpus);
+
+	kvm->nrcpus = 0;
+
+	close(task_eventfd);
+
+	return r;
+}
diff --git a/kvmtool/kvm-ipc.c b/kvmtool/kvm-ipc.c
new file mode 100644
index 0000000..23f7b12
--- /dev/null
+++ b/kvmtool/kvm-ipc.c
@@ -0,0 +1,556 @@
+#include <sys/epoll.h>
+#include <sys/un.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/eventfd.h>
+#include <dirent.h>
+
+#include "kvm/kvm-ipc.h"
+#include "kvm/rwsem.h"
+#include "kvm/read-write.h"
+#include "kvm/util.h"
+#include "kvm/kvm.h"
+#include "kvm/builtin-debug.h"
+#include "kvm/strbuf.h"
+#include "kvm/kvm-cpu.h"
+#include "kvm/8250-serial.h"
+
+struct kvm_ipc_head {
+	u32 type;
+	u32 len;
+};
+
+#define KVM_IPC_MAX_MSGS 16
+
+#define KVM_SOCK_SUFFIX		".sock"
+#define KVM_SOCK_SUFFIX_LEN	((ssize_t)sizeof(KVM_SOCK_SUFFIX) - 1)
+
+extern __thread struct kvm_cpu *current_kvm_cpu;
+static void (*msgs[KVM_IPC_MAX_MSGS])(struct kvm *kvm, int fd, u32 type, u32 len, u8 *msg);
+static DECLARE_RWSEM(msgs_rwlock);
+static int epoll_fd, server_fd, stop_fd;
+static pthread_t thread;
+
+static int kvm__create_socket(struct kvm *kvm)
+{
+	char full_name[PATH_MAX];
+	int s;
+	struct sockaddr_un local;
+	int len, r;
+
+	/* sun_path is usually 108 bytes long. */
+	BUILD_BUG_ON(sizeof(local.sun_path) < 32);
+
+	snprintf(full_name, sizeof(full_name), "%s/%s%s",
+		 kvm__get_dir(), kvm->cfg.guest_name, KVM_SOCK_SUFFIX);
+
+	s = socket(AF_UNIX, SOCK_STREAM, 0);
+	if (s < 0) {
+		perror("socket");
+		return s;
+	}
+
+	local.sun_family = AF_UNIX;
+	strlcpy(local.sun_path, full_name, sizeof(local.sun_path));
+	len = strlen(local.sun_path) + sizeof(local.sun_family);
+	r = bind(s, (struct sockaddr *)&local, len);
+	/* Check for an existing socket file */
+	if (r < 0 && errno == EADDRINUSE) {
+		r = connect(s, (struct sockaddr *)&local, len);
+		if (r == 0) {
+			/*
+			 * If we could connect, there is already a guest
+			 * using this same name. This should not happen
+			 * for PID derived names, but could happen for user
+			 * provided guest names.
+			 */
+			pr_err("Guest socket file %s already exists.",
+			       full_name);
+			r = -EEXIST;
+			goto fail;
+		}
+		if (errno == ECONNREFUSED) {
+			/*
+			 * This is a ghost socket file, with no-one listening
+			 * on the other end. Since kvmtool will only bind
+			 * above when creating a new guest, there is no
+			 * danger in just removing the file and re-trying.
+			 */
+			unlink(full_name);
+			pr_info("Removed ghost socket file \"%s\".", full_name);
+			r = bind(s, (struct sockaddr *)&local, len);
+		}
+	}
+	if (r < 0) {
+		perror("bind");
+		goto fail;
+	}
+
+	r = listen(s, 5);
+	if (r < 0) {
+		perror("listen");
+		goto fail;
+	}
+
+	return s;
+
+fail:
+	close(s);
+	return r;
+}
+
+void kvm__remove_socket(const char *name)
+{
+	char full_name[PATH_MAX];
+
+	snprintf(full_name, sizeof(full_name), "%s/%s%s",
+		 kvm__get_dir(), name, KVM_SOCK_SUFFIX);
+	unlink(full_name);
+}
+
+int kvm__get_sock_by_instance(const char *name)
+{
+	int s, len, r;
+	char sock_file[PATH_MAX];
+	struct sockaddr_un local;
+
+	snprintf(sock_file, sizeof(sock_file), "%s/%s%s",
+		 kvm__get_dir(), name, KVM_SOCK_SUFFIX);
+	s = socket(AF_UNIX, SOCK_STREAM, 0);
+	if (s < 0)
+		return s;
+
+	local.sun_family = AF_UNIX;
+	strlcpy(local.sun_path, sock_file, sizeof(local.sun_path));
+	len = strlen(local.sun_path) + sizeof(local.sun_family);
+
+	r = connect(s, (struct sockaddr *)&local, len);
+	if (r < 0) {
+		if (errno == ECONNREFUSED) {
+			/* Clean up the ghost socket file */
+			unlink(local.sun_path);
+			pr_info("Removed ghost socket file \"%s\".", sock_file);
+		}
+		/* Don't leak the socket fd on the failure paths. */
+		close(s);
+		return r;
+	}
+
+	return s;
+}
+
+static bool is_socket(const char *base_path, const struct dirent *dent)
+{
+	switch (dent->d_type) {
+	case DT_SOCK:
+		return true;
+
+	case DT_UNKNOWN: {
+		char path[PATH_MAX];
+		struct stat st;
+
+		snprintf(path, sizeof(path), "%s/%s", base_path, dent->d_name);
+		if (stat(path, &st))
+			return false;
+
+		return S_ISSOCK(st.st_mode);
+	}
+	default:
+		return false;
+	}
+}
+
+int kvm__enumerate_instances(int (*callback)(const char *name, int fd))
+{
+	int sock;
+	DIR *dir;
+	struct dirent *entry;
+	int ret = 0;
+	const char *path;
+
+	path = kvm__get_dir();
+
+	dir = opendir(path);
+	if (!dir)
+		return -errno;
+
+	for (;;) {
+		entry = readdir(dir);
+		if (!entry)
+			break;
+		if (is_socket(path, entry)) {
+			ssize_t name_len = strlen(entry->d_name);
+			char *p;
+
+			if (name_len <= KVM_SOCK_SUFFIX_LEN)
+				continue;
+
+			p = &entry->d_name[name_len - KVM_SOCK_SUFFIX_LEN];
+			if (memcmp(KVM_SOCK_SUFFIX, p, KVM_SOCK_SUFFIX_LEN))
+				continue;
+
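+			/* Strip ".sock" in place so the callback sees the bare guest name. */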
+			*p = 0;
+			sock = kvm__get_sock_by_instance(entry->d_name);
+			if (sock < 0)
+				continue;
+			ret = callback(entry->d_name, sock);
+			close(sock);
+			if (ret < 0)
+				break;
+		}
+	}
+
+	closedir(dir);
+
+	return ret;
+}
+
+int kvm_ipc__register_handler(u32 type, void (*cb)(struct kvm *kvm, int fd, u32 type, u32 len, u8 *msg))
+{
+	if (type >= KVM_IPC_MAX_MSGS)
+		return -ENOSPC;
+
+	down_write(&msgs_rwlock);
+	msgs[type] = cb;
+	up_write(&msgs_rwlock);
+
+	return 0;
+}
+
+int kvm_ipc__send(int fd, u32 type)
+{
+	struct kvm_ipc_head head = {.type = type, .len = 0,};
+
+	if (write_in_full(fd, &head, sizeof(head)) < 0)
+		return -1;
+
+	return 0;
+}
+
+int kvm_ipc__send_msg(int fd, u32 type, u32 len, u8 *msg)
+{
+	struct kvm_ipc_head head = {.type = type, .len = len,};
+
+	if (write_in_full(fd, &head, sizeof(head)) < 0)
+		return -1;
+
+	if (write_in_full(fd, msg, len) < 0)
+		return -1;
+
+	return 0;
+}
+
+static int kvm_ipc__handle(struct kvm *kvm, int fd, u32 type, u32 len, u8 *data)
+{
+	void (*cb)(struct kvm *kvm, int fd, u32 type, u32 len, u8 *msg);
+
+	if (type >= KVM_IPC_MAX_MSGS)
+		return -ENOSPC;
+
+	down_read(&msgs_rwlock);
+	cb = msgs[type];
+	up_read(&msgs_rwlock);
+
+	if (cb == NULL) {
+		pr_warning("No device handles type %u\n", type);
+		return -ENODEV;
+	}
+
+	cb(kvm, fd, type, len, data);
+
+	return 0;
+}
+
+static int kvm_ipc__new_conn(int fd)
+{
+	int client;
+	struct epoll_event ev;
+
+	client = accept(fd, NULL, NULL);
+	if (client < 0)
+		return -1;
+
+	ev.events = EPOLLIN | EPOLLRDHUP;
+	ev.data.fd = client;
+	if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, client, &ev) < 0) {
+		close(client);
+		return -1;
+	}
+
+	return client;
+}
+
+static void kvm_ipc__close_conn(int fd)
+{
+	epoll_ctl(epoll_fd, EPOLL_CTL_DEL, fd, NULL);
+	close(fd);
+}
+
+static int kvm_ipc__receive(struct kvm *kvm, int fd)
+{
+	struct kvm_ipc_head head;
+	u8 *msg = NULL;
+	u32 n;
+
+	n = read(fd, &head, sizeof(head));
+	if (n != sizeof(head))
+		goto done;
+
+	msg = malloc(head.len);
+	if (msg == NULL)
+		goto done;
+
+	n = read_in_full(fd, msg, head.len);
+	if (n != head.len)
+		goto done;
+
+	kvm_ipc__handle(kvm, fd, head.type, head.len, msg);
+
+	return 0;
+
+done:
+	free(msg);
+	return -1;
+}
+
+static void *kvm_ipc__thread(void *param)
+{
+	struct epoll_event event;
+	struct kvm *kvm = param;
+
+	kvm__set_thread_name("kvm-ipc");
+
+	for (;;) {
+		int nfds;
+
+		nfds = epoll_wait(epoll_fd, &event, 1, -1);
+		if (nfds > 0) {
+			int fd = event.data.fd;
+
+			if (fd == stop_fd && event.events & EPOLLIN) {
+				break;
+			} else if (fd == server_fd) {
+				int client, r;
+
+				client = kvm_ipc__new_conn(fd);
+				/*
+				 * Handle multiple IPC commands per connection
+				 */
+				do {
+					r = kvm_ipc__receive(kvm, client);
+				} while	(r == 0);
+
+			} else if (event.events & (EPOLLERR | EPOLLRDHUP | EPOLLHUP)) {
+				kvm_ipc__close_conn(fd);
+			} else {
+				kvm_ipc__receive(kvm, fd);
+			}
+		}
+	}
+
+	return NULL;
+}
+
+static void kvm__pid(struct kvm *kvm, int fd, u32 type, u32 len, u8 *msg)
+{
+	pid_t pid = getpid();
+	int r = 0;
+
+	if (type == KVM_IPC_PID)
+		r = write(fd, &pid, sizeof(pid));
+
+	if (r < 0)
+		pr_warning("Failed sending PID");
+}
+
+static void handle_stop(struct kvm *kvm, int fd, u32 type, u32 len, u8 *msg)
+{
+	if (WARN_ON(type != KVM_IPC_STOP || len))
+		return;
+
+	kvm__reboot(kvm);
+}
+
+/* Pause/resume the guest using SIGUSR2 */
+static int is_paused;
+
+static void handle_pause(struct kvm *kvm, int fd, u32 type, u32 len, u8 *msg)
+{
+	if (WARN_ON(len))
+		return;
+
+	if (type == KVM_IPC_RESUME && is_paused) {
+		kvm->vm_state = KVM_VMSTATE_RUNNING;
+		kvm__continue(kvm);
+	} else if (type == KVM_IPC_PAUSE && !is_paused) {
+		kvm->vm_state = KVM_VMSTATE_PAUSED;
+		ioctl(kvm->vm_fd, KVM_KVMCLOCK_CTRL);
+		kvm__pause(kvm);
+	} else {
+		return;
+	}
+
+	is_paused = !is_paused;
+}
+
+static void handle_vmstate(struct kvm *kvm, int fd, u32 type, u32 len, u8 *msg)
+{
+	int r = 0;
+
+	if (type == KVM_IPC_VMSTATE)
+		r = write(fd, &kvm->vm_state, sizeof(kvm->vm_state));
+
+	if (r < 0)
+		pr_warning("Failed sending VMSTATE");
+}
+
+/*
+ * Serialize debug printout so that the output of multiple vcpus does not
+ * get mixed up:
+ */
+static int printout_done;
+
+static void handle_sigusr1(int sig)
+{
+	struct kvm_cpu *cpu = current_kvm_cpu;
+	int fd = kvm_cpu__get_debug_fd();
+
+	if (!cpu || cpu->needs_nmi)
+		return;
+
+	dprintf(fd, "\n #\n # vCPU #%ld's dump:\n #\n", cpu->cpu_id);
+	kvm_cpu__show_registers(cpu);
+	kvm_cpu__show_code(cpu);
+	kvm_cpu__show_page_tables(cpu);
+	fflush(stdout);
+	printout_done = 1;
+}
+
+static void handle_debug(struct kvm *kvm, int fd, u32 type, u32 len, u8 *msg)
+{
+	int i;
+	struct debug_cmd_params *params;
+	u32 dbg_type;
+	u32 vcpu;
+
+	if (WARN_ON(type != KVM_IPC_DEBUG || len != sizeof(*params)))
+		return;
+
+	params = (void *)msg;
+	dbg_type = params->dbg_type;
+	vcpu = params->cpu;
+
+	if (dbg_type & KVM_DEBUG_CMD_TYPE_SYSRQ)
+		serial8250__inject_sysrq(kvm, params->sysrq);
+
+	if (dbg_type & KVM_DEBUG_CMD_TYPE_NMI) {
+		if ((int)vcpu >= kvm->nrcpus)
+			return;
+
+		kvm->cpus[vcpu]->needs_nmi = 1;
+		pthread_kill(kvm->cpus[vcpu]->thread, SIGUSR1);
+	}
+
+	if (!(dbg_type & KVM_DEBUG_CMD_TYPE_DUMP))
+		return;
+
+	for (i = 0; i < kvm->nrcpus; i++) {
+		struct kvm_cpu *cpu = kvm->cpus[i];
+
+		if (!cpu)
+			continue;
+
+		printout_done = 0;
+
+		kvm_cpu__set_debug_fd(fd);
+		pthread_kill(cpu->thread, SIGUSR1);
+		/*
+		 * Wait for the vCPU to dump state before signalling
+		 * the next thread. Since this is debug code it does
+		 * not matter that we are burning CPU time a bit:
+		 */
+		while (!printout_done)
+			sleep(0);
+	}
+
+	close(fd);
+
+	serial8250__inject_sysrq(kvm, 'p');
+}
+
+int kvm_ipc__init(struct kvm *kvm)
+{
+	int ret;
+	int sock = kvm__create_socket(kvm);
+	struct epoll_event ev = {0};
+
+	if (sock < 0) {
+		ret = sock;
+		goto err;
+	}
+	server_fd = sock;
+
+	epoll_fd = epoll_create(KVM_IPC_MAX_MSGS);
+	if (epoll_fd < 0) {
+		perror("epoll_create");
+		ret = epoll_fd;
+		goto err;
+	}
+
+	ev.events = EPOLLIN | EPOLLET;
+	ev.data.fd = sock;
+	if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, sock, &ev) < 0) {
+		pr_err("Failed adding socket to epoll");
+		ret = -EFAULT;
+		goto err_epoll;
+	}
+
+	stop_fd = eventfd(0, 0);
+	if (stop_fd < 0) {
+		perror("eventfd");
+		ret = stop_fd;
+		goto err_epoll;
+	}
+
+	ev.events = EPOLLIN | EPOLLET;
+	ev.data.fd = stop_fd;
+	if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, stop_fd, &ev) < 0) {
+		pr_err("Failed adding stop event to epoll");
+		ret = -EFAULT;
+		goto err_stop;
+	}
+
+	if (pthread_create(&thread, NULL, kvm_ipc__thread, kvm) != 0) {
+		pr_err("Failed starting IPC thread");
+		ret = -EFAULT;
+		goto err_stop;
+	}
+
+	kvm_ipc__register_handler(KVM_IPC_PID, kvm__pid);
+	kvm_ipc__register_handler(KVM_IPC_DEBUG, handle_debug);
+	kvm_ipc__register_handler(KVM_IPC_PAUSE, handle_pause);
+	kvm_ipc__register_handler(KVM_IPC_RESUME, handle_pause);
+	kvm_ipc__register_handler(KVM_IPC_STOP, handle_stop);
+	kvm_ipc__register_handler(KVM_IPC_VMSTATE, handle_vmstate);
+	signal(SIGUSR1, handle_sigusr1);
+
+	return 0;
+
+err_stop:
+	close(stop_fd);
+err_epoll:
+	close(epoll_fd);
+err:
+	return ret;
+}
+base_init(kvm_ipc__init);
+
+int kvm_ipc__exit(struct kvm *kvm)
+{
+	u64 val = 1;
+	int ret;
+
+	ret = write(stop_fd, &val, sizeof(val));
+	if (ret < 0)
+		return ret;
+
+	close(server_fd);
+	close(epoll_fd);
+
+	kvm__remove_socket(kvm->cfg.guest_name);
+
+	return ret;
+}
+base_exit(kvm_ipc__exit);
diff --git a/kvmtool/kvm.c b/kvmtool/kvm.c
new file mode 100644
index 0000000..e327541
--- /dev/null
+++ b/kvmtool/kvm.c
@@ -0,0 +1,636 @@
+#include "kvm/kvm.h"
+#include "kvm/read-write.h"
+#include "kvm/util.h"
+#include "kvm/strbuf.h"
+#include "kvm/mutex.h"
+#include "kvm/kvm-cpu.h"
+#include "kvm/kvm-ipc.h"
+
+#include <linux/kernel.h>
+#include <linux/kvm.h>
+#include <linux/list.h>
+#include <linux/err.h>
+
+#include <sys/un.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <stdbool.h>
+#include <limits.h>
+#include <signal.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <time.h>
+#include <sys/eventfd.h>
+#include <asm/unistd.h>
+#include <dirent.h>
+
+#define DEFINE_KVM_EXIT_REASON(reason) [reason] = #reason
+
+const char *kvm_exit_reasons[] = {
+	DEFINE_KVM_EXIT_REASON(KVM_EXIT_UNKNOWN),
+	DEFINE_KVM_EXIT_REASON(KVM_EXIT_EXCEPTION),
+	DEFINE_KVM_EXIT_REASON(KVM_EXIT_IO),
+	DEFINE_KVM_EXIT_REASON(KVM_EXIT_HYPERCALL),
+	DEFINE_KVM_EXIT_REASON(KVM_EXIT_DEBUG),
+	DEFINE_KVM_EXIT_REASON(KVM_EXIT_HLT),
+	DEFINE_KVM_EXIT_REASON(KVM_EXIT_MMIO),
+	DEFINE_KVM_EXIT_REASON(KVM_EXIT_IRQ_WINDOW_OPEN),
+	DEFINE_KVM_EXIT_REASON(KVM_EXIT_SHUTDOWN),
+	DEFINE_KVM_EXIT_REASON(KVM_EXIT_FAIL_ENTRY),
+	DEFINE_KVM_EXIT_REASON(KVM_EXIT_INTR),
+	DEFINE_KVM_EXIT_REASON(KVM_EXIT_SET_TPR),
+	DEFINE_KVM_EXIT_REASON(KVM_EXIT_TPR_ACCESS),
+	DEFINE_KVM_EXIT_REASON(KVM_EXIT_S390_SIEIC),
+	DEFINE_KVM_EXIT_REASON(KVM_EXIT_S390_RESET),
+	DEFINE_KVM_EXIT_REASON(KVM_EXIT_DCR),
+	DEFINE_KVM_EXIT_REASON(KVM_EXIT_NMI),
+	DEFINE_KVM_EXIT_REASON(KVM_EXIT_INTERNAL_ERROR),
+#ifdef CONFIG_PPC64
+	DEFINE_KVM_EXIT_REASON(KVM_EXIT_PAPR_HCALL),
+#endif
+};
+
+static int pause_event;
+static DEFINE_MUTEX(pause_lock);
+extern struct kvm_ext kvm_req_ext[];
+
+static char kvm_dir[PATH_MAX];
+
+extern __thread struct kvm_cpu *current_kvm_cpu;
+
+static int set_dir(const char *fmt, va_list args)
+{
+	char tmp[PATH_MAX];
+
+	vsnprintf(tmp, sizeof(tmp), fmt, args);
+
+	mkdir(tmp, 0777);
+
+	if (!realpath(tmp, kvm_dir))
+		return -errno;
+
+	strcat(kvm_dir, "/");
+
+	return 0;
+}
+
+void kvm__set_dir(const char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+	set_dir(fmt, args);
+	va_end(args);
+}
+
+const char *kvm__get_dir(void)
+{
+	return kvm_dir;
+}
+
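+/*
+ * Like kvm__supports_extension(), but checks the capability on the VM fd.
+ * Whether KVM_CHECK_EXTENSION is valid on a VM fd at all is probed once
+ * and cached in supports_vm_ext_check (0 = unknown, 1 = yes, -1 = no).
+ */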
+bool kvm__supports_vm_extension(struct kvm *kvm, unsigned int extension)
+{
+	static int supports_vm_ext_check = 0;
+	int ret;
+
+	switch (supports_vm_ext_check) {
+	case 0:
+		ret = ioctl(kvm->sys_fd, KVM_CHECK_EXTENSION,
+			    KVM_CAP_CHECK_EXTENSION_VM);
+		if (ret <= 0) {
+			supports_vm_ext_check = -1;
+			return false;
+		}
+		supports_vm_ext_check = 1;
+		/* fall through */
+	case 1:
+		break;
+	case -1:
+		return false;
+	}
+
+	ret = ioctl(kvm->vm_fd, KVM_CHECK_EXTENSION, extension);
+	if (ret < 0)
+		return false;
+
+	return ret;
+}
+
+bool kvm__supports_extension(struct kvm *kvm, unsigned int extension)
+{
+	int ret;
+
+	ret = ioctl(kvm->sys_fd, KVM_CHECK_EXTENSION, extension);
+	if (ret < 0)
+		return false;
+
+	return ret;
+}
+
+static int kvm__check_extensions(struct kvm *kvm)
+{
+	int i;
+
+	for (i = 0; ; i++) {
+		if (!kvm_req_ext[i].name)
+			break;
+		if (!kvm__supports_extension(kvm, kvm_req_ext[i].code)) {
+			pr_err("Unsupported KVM extension detected: %s",
+				kvm_req_ext[i].name);
+			return -i;
+		}
+	}
+
+	return 0;
+}
+
+struct kvm *kvm__new(void)
+{
+	struct kvm *kvm = calloc(1, sizeof(*kvm));
+	if (!kvm)
+		return ERR_PTR(-ENOMEM);
+
+	mutex_init(&kvm->mem_banks_lock);
+	kvm->sys_fd = -1;
+	kvm->vm_fd = -1;
+
+#ifdef KVM_BRLOCK_DEBUG
+	kvm->brlock_sem = (pthread_rwlock_t) PTHREAD_RWLOCK_INITIALIZER;
+#endif
+
+	return kvm;
+}
+
+int kvm__exit(struct kvm *kvm)
+{
+	struct kvm_mem_bank *bank, *tmp;
+
+	kvm__arch_delete_ram(kvm);
+
+	list_for_each_entry_safe(bank, tmp, &kvm->mem_banks, list) {
+		list_del(&bank->list);
+		free(bank);
+	}
+
+	free(kvm);
+	return 0;
+}
+core_exit(kvm__exit);
+
+int kvm__destroy_mem(struct kvm *kvm, u64 guest_phys, u64 size,
+		     void *userspace_addr)
+{
+	struct kvm_userspace_memory_region mem;
+	struct kvm_mem_bank *bank;
+	int ret;
+
+	mutex_lock(&kvm->mem_banks_lock);
+	list_for_each_entry(bank, &kvm->mem_banks, list)
+		if (bank->guest_phys_addr == guest_phys &&
+		    bank->size == size && bank->host_addr == userspace_addr)
+			break;
+
+	if (&bank->list == &kvm->mem_banks) {
+		pr_err("Region [%llx-%llx] not found", guest_phys,
+		       guest_phys + size - 1);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (bank->type == KVM_MEM_TYPE_RESERVED) {
+		pr_err("Cannot delete reserved region [%llx-%llx]",
+		       guest_phys, guest_phys + size - 1);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	mem = (struct kvm_userspace_memory_region) {
+		.slot			= bank->slot,
+		.guest_phys_addr	= guest_phys,
+		.memory_size		= 0,
+		.userspace_addr		= (unsigned long)userspace_addr,
+	};
+
+	ret = ioctl(kvm->vm_fd, KVM_SET_USER_MEMORY_REGION, &mem);
+	if (ret < 0) {
+		ret = -errno;
+		goto out;
+	}
+
+	list_del(&bank->list);
+	free(bank);
+	kvm->mem_slots--;
+	ret = 0;
+
+out:
+	mutex_unlock(&kvm->mem_banks_lock);
+	return ret;
+}
+
+int kvm__register_mem(struct kvm *kvm, u64 guest_phys, u64 size,
+		      void *userspace_addr, enum kvm_mem_type type)
+{
+	struct kvm_userspace_memory_region mem;
+	struct kvm_mem_bank *merged = NULL;
+	struct kvm_mem_bank *bank;
+	struct list_head *prev_entry;
+	u32 slot;
+	u32 flags = 0;
+	int ret;
+
+	mutex_lock(&kvm->mem_banks_lock);
+	/* Check for overlap and find first empty slot. */
+	slot = 0;
+	prev_entry = &kvm->mem_banks;
+	list_for_each_entry(bank, &kvm->mem_banks, list) {
+		u64 bank_end = bank->guest_phys_addr + bank->size - 1;
+		u64 end = guest_phys + size - 1;
+		if (guest_phys > bank_end || end < bank->guest_phys_addr) {
+			/*
+			 * Keep the banks sorted ascending by slot, so it's
+			 * easier for us to find a free slot.
+			 */
+			if (bank->slot == slot) {
+				slot++;
+				prev_entry = &bank->list;
+			}
+			continue;
+		}
+
+		/* Merge overlapping reserved regions */
+		if (bank->type == KVM_MEM_TYPE_RESERVED &&
+		    type == KVM_MEM_TYPE_RESERVED) {
+			bank->guest_phys_addr = min(bank->guest_phys_addr, guest_phys);
+			bank->size = max(bank_end, end) - bank->guest_phys_addr + 1;
+
+			if (merged) {
+				/*
+				 * This is at least the second merge, remove
+				 * previous result.
+				 */
+				list_del(&merged->list);
+				free(merged);
+			}
+
+			guest_phys = bank->guest_phys_addr;
+			size = bank->size;
+			merged = bank;
+
+			/* Keep checking that we don't overlap another region */
+			continue;
+		}
+
+		pr_err("%s region [%llx-%llx] would overlap %s region [%llx-%llx]",
+		       kvm_mem_type_to_string(type), guest_phys, guest_phys + size - 1,
+		       kvm_mem_type_to_string(bank->type), bank->guest_phys_addr,
+		       bank->guest_phys_addr + bank->size - 1);
+
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (merged) {
+		ret = 0;
+		goto out;
+	}
+
+	bank = malloc(sizeof(*bank));
+	if (!bank) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	INIT_LIST_HEAD(&bank->list);
+	bank->guest_phys_addr		= guest_phys;
+	bank->host_addr			= userspace_addr;
+	bank->size			= size;
+	bank->type			= type;
+	bank->slot			= slot;
+
+	if (type & KVM_MEM_TYPE_READONLY)
+		flags |= KVM_MEM_READONLY;
+
+	if (type != KVM_MEM_TYPE_RESERVED) {
+		mem = (struct kvm_userspace_memory_region) {
+			.slot			= slot,
+			.flags			= flags,
+			.guest_phys_addr	= guest_phys,
+			.memory_size		= size,
+			.userspace_addr		= (unsigned long)userspace_addr,
+		};
+
+		ret = ioctl(kvm->vm_fd, KVM_SET_USER_MEMORY_REGION, &mem);
+		if (ret < 0) {
+			ret = -errno;
+			goto out;
+		}
+	}
+
+	list_add(&bank->list, prev_entry);
+	kvm->mem_slots++;
+	ret = 0;
+
+out:
+	mutex_unlock(&kvm->mem_banks_lock);
+	return ret;
+}
+
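+/*
+ * Translate a flat guest physical address to a host virtual pointer by
+ * walking the registered memory banks; returns NULL (with a warning) for
+ * addresses outside any bank.
+ */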
+void *guest_flat_to_host(struct kvm *kvm, u64 offset)
+{
+	struct kvm_mem_bank *bank;
+
+	list_for_each_entry(bank, &kvm->mem_banks, list) {
+		u64 bank_start = bank->guest_phys_addr;
+		u64 bank_end = bank_start + bank->size;
+
+		if (offset >= bank_start && offset < bank_end)
+			return bank->host_addr + (offset - bank_start);
+	}
+
+	pr_warning("unable to translate guest address 0x%llx to host",
+			(unsigned long long)offset);
+	return NULL;
+}
+
+u64 host_to_guest_flat(struct kvm *kvm, void *ptr)
+{
+	struct kvm_mem_bank *bank;
+
+	list_for_each_entry(bank, &kvm->mem_banks, list) {
+		void *bank_start = bank->host_addr;
+		void *bank_end = bank_start + bank->size;
+
+		if (ptr >= bank_start && ptr < bank_end)
+			return bank->guest_phys_addr + (ptr - bank_start);
+	}
+
+	pr_warning("unable to translate host address %p to guest", ptr);
+	return 0;
+}
+
+/*
+ * Iterate over each registered memory bank. Call @fun for each bank with @data
+ * as argument. @type is a bitmask that allows to filter banks according to
+ * their type.
+ *
+ * If one call to @fun returns a non-zero value, stop iterating and return the
+ * value. Otherwise, return zero.
+ */
+int kvm__for_each_mem_bank(struct kvm *kvm, enum kvm_mem_type type,
+			   int (*fun)(struct kvm *kvm, struct kvm_mem_bank *bank, void *data),
+			   void *data)
+{
+	int ret = 0;
+	struct kvm_mem_bank *bank;
+
+	list_for_each_entry(bank, &kvm->mem_banks, list) {
+		if (type != KVM_MEM_TYPE_ALL && !(bank->type & type))
+			continue;
+
+		ret = fun(kvm, bank, data);
+		if (ret)
+			break;
+	}
+
+	return ret;
+}
+
+int kvm__recommended_cpus(struct kvm *kvm)
+{
+	int ret;
+
+	ret = ioctl(kvm->sys_fd, KVM_CHECK_EXTENSION, KVM_CAP_NR_VCPUS);
+	if (ret <= 0)
+		/*
+		 * api.txt states that if KVM_CAP_NR_VCPUS does not exist,
+		 * assume 4.
+		 */
+		return 4;
+
+	return ret;
+}
+
+int kvm__max_cpus(struct kvm *kvm)
+{
+	int ret;
+
+	ret = ioctl(kvm->sys_fd, KVM_CHECK_EXTENSION, KVM_CAP_MAX_VCPUS);
+	if (ret <= 0)
+		ret = kvm__recommended_cpus(kvm);
+
+	return ret;
+}
+
+int kvm__init(struct kvm *kvm)
+{
+	int ret;
+
+	if (!kvm__arch_cpu_supports_vm()) {
+		pr_err("Your CPU does not support hardware virtualization");
+		ret = -ENOSYS;
+		goto err;
+	}
+
+	kvm->sys_fd = open(kvm->cfg.dev, O_RDWR);
+	if (kvm->sys_fd < 0) {
+		if (errno == ENOENT)
+			pr_err("'%s' not found. Please make sure your kernel has CONFIG_KVM "
+			       "enabled and that the KVM modules are loaded.", kvm->cfg.dev);
+		else if (errno == ENODEV)
+			pr_err("'%s' KVM driver not available.\n  # (If the KVM "
+			       "module is loaded then 'dmesg' may offer further clues "
+			       "about the failure.)", kvm->cfg.dev);
+		else
+			pr_err("Could not open %s: ", kvm->cfg.dev);
+
+		ret = -errno;
+		goto err_free;
+	}
+
+	ret = ioctl(kvm->sys_fd, KVM_GET_API_VERSION, 0);
+	if (ret != KVM_API_VERSION) {
+		pr_err("KVM_GET_API_VERSION ioctl");
+		ret = ret < 0 ? -errno : -ENOSYS;
+		goto err_sys_fd;
+	}
+
+	kvm->vm_fd = ioctl(kvm->sys_fd, KVM_CREATE_VM, KVM_VM_TYPE);
+	if (kvm->vm_fd < 0) {
+		pr_err("KVM_CREATE_VM ioctl");
+		ret = -errno;
+		goto err_sys_fd;
+	}
+
+	if (kvm__check_extensions(kvm)) {
+		pr_err("A required KVM extension is not supported by OS");
+		ret = -ENOSYS;
+		goto err_vm_fd;
+	}
+
+	kvm__arch_init(kvm, kvm->cfg.hugetlbfs_path, kvm->cfg.ram_size);
+
+	INIT_LIST_HEAD(&kvm->mem_banks);
+	kvm__init_ram(kvm);
+
+	if (!kvm->cfg.firmware_filename) {
+		if (!kvm__load_kernel(kvm, kvm->cfg.kernel_filename,
+				kvm->cfg.initrd_filename, kvm->cfg.real_cmdline))
+			die("unable to load kernel %s", kvm->cfg.kernel_filename);
+	}
+
+	if (kvm->cfg.firmware_filename) {
+		if (!kvm__load_firmware(kvm, kvm->cfg.firmware_filename))
+			die("unable to load firmware image %s: %s", kvm->cfg.firmware_filename, strerror(errno));
+	} else {
+		ret = kvm__arch_setup_firmware(kvm);
+		if (ret < 0)
+			die("kvm__arch_setup_firmware() failed with error %d\n", ret);
+	}
+
+	return 0;
+
+err_vm_fd:
+	close(kvm->vm_fd);
+err_sys_fd:
+	close(kvm->sys_fd);
+err_free:
+	free(kvm);
+err:
+	return ret;
+}
+core_init(kvm__init);
+
+/* RFC 1952 */
+#define GZIP_ID1		0x1f
+#define GZIP_ID2		0x8b
+#define CPIO_MAGIC		"0707"
+/* initrd may be gzipped, or a plain cpio */
+static bool initrd_check(int fd)
+{
+	unsigned char id[4];
+
+	if (read_in_full(fd, id, ARRAY_SIZE(id)) < 0)
+		return false;
+
+	if (lseek(fd, 0, SEEK_SET) < 0)
+		die_perror("lseek");
+
+	return (id[0] == GZIP_ID1 && id[1] == GZIP_ID2) ||
+		!memcmp(id, CPIO_MAGIC, 4);
+}
+
+bool kvm__load_kernel(struct kvm *kvm, const char *kernel_filename,
+		const char *initrd_filename, const char *kernel_cmdline)
+{
+	bool ret;
+	int fd_kernel = -1, fd_initrd = -1;
+
+	fd_kernel = open(kernel_filename, O_RDONLY);
+	if (fd_kernel < 0)
+		die("Unable to open kernel %s", kernel_filename);
+
+	if (initrd_filename) {
+		fd_initrd = open(initrd_filename, O_RDONLY);
+		if (fd_initrd < 0)
+			die("Unable to open initrd %s", initrd_filename);
+
+		if (!initrd_check(fd_initrd))
+			die("%s is not an initrd", initrd_filename);
+	}
+
+	ret = kvm__arch_load_kernel_image(kvm, fd_kernel, fd_initrd,
+					  kernel_cmdline);
+
+	if (initrd_filename)
+		close(fd_initrd);
+	close(fd_kernel);
+
+	if (!ret)
+		die("%s is not a valid kernel image", kernel_filename);
+	return ret;
+}
+
+void kvm__dump_mem(struct kvm *kvm, unsigned long addr, unsigned long size, int debug_fd)
+{
+	unsigned char *p;
+	unsigned long n;
+
+	size &= ~7; /* round down to a multiple of 8 */
+	if (!size)
+		return;
+
+	p = guest_flat_to_host(kvm, addr);
+
+	for (n = 0; n < size; n += 8) {
+		if (!host_ptr_in_ram(kvm, p + n)) {
+			dprintf(debug_fd, " 0x%08lx: <unknown>\n", addr + n);
+			continue;
+		}
+		dprintf(debug_fd, " 0x%08lx: %02x %02x %02x %02x  %02x %02x %02x %02x\n",
+			addr + n, p[n + 0], p[n + 1], p[n + 2], p[n + 3],
+				  p[n + 4], p[n + 5], p[n + 6], p[n + 7]);
+	}
+}
+
+void kvm__reboot(struct kvm *kvm)
+{
+	/* Check if the guest is running */
+	if (!kvm->cpus[0] || kvm->cpus[0]->thread == 0)
+		return;
+
+	pthread_kill(kvm->cpus[0]->thread, SIGKVMEXIT);
+}
+
+void kvm__continue(struct kvm *kvm)
+{
+	mutex_unlock(&pause_lock);
+}
+
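+/*
+ * Park every vCPU thread: running vCPUs get SIGKVMPAUSE and report in
+ * through pause_event from kvm__notify_paused(), where they then block
+ * on pause_lock (held here) until kvm__continue() releases it.
+ */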
+void kvm__pause(struct kvm *kvm)
+{
+	int i, paused_vcpus = 0;
+
+	mutex_lock(&pause_lock);
+
+	/* Check if the guest is running */
+	if (!kvm->cpus || !kvm->cpus[0] || kvm->cpus[0]->thread == 0)
+		return;
+
+	pause_event = eventfd(0, 0);
+	if (pause_event < 0)
+		die("Failed creating pause notification event");
+	for (i = 0; i < kvm->nrcpus; i++) {
+		if (kvm->cpus[i]->is_running && kvm->cpus[i]->paused == 0)
+			pthread_kill(kvm->cpus[i]->thread, SIGKVMPAUSE);
+		else
+			paused_vcpus++;
+	}
+
+	while (paused_vcpus < kvm->nrcpus) {
+		u64 cur_read;
+
+		if (read(pause_event, &cur_read, sizeof(cur_read)) < 0)
+			die("Failed reading pause event");
+		paused_vcpus += cur_read;
+	}
+	close(pause_event);
+}
+
+void kvm__notify_paused(void)
+{
+	u64 p = 1;
+
+	if (write(pause_event, &p, sizeof(p)) < 0)
+		die("Failed notifying of paused VCPU.");
+
+	mutex_lock(&pause_lock);
+	current_kvm_cpu->paused = 0;
+	mutex_unlock(&pause_lock);
+}
diff --git a/kvmtool/main.c b/kvmtool/main.c
new file mode 100644
index 0000000..05bc82c
--- /dev/null
+++ b/kvmtool/main.c
@@ -0,0 +1,19 @@
+#include "kvm/kvm.h"
+
+#include <stdlib.h>
+#include <stdio.h>
+
+/* user defined header files */
+#include <kvm/kvm-cmd.h>
+
+static int handle_kvm_command(int argc, char **argv)
+{
+	return handle_command(kvm_commands, argc, (const char **) &argv[0]);
+}
+
+int main(int argc, char *argv[])
+{
+	kvm__set_dir("%s/%s", HOME_DIR, KVM_PID_FILE_PATH);
+
+	return handle_kvm_command(argc - 1, &argv[1]);
+}
diff --git a/kvmtool/mips/include/asm/kvm.h b/kvmtool/mips/include/asm/kvm.h
new file mode 100644
index 0000000..edcf717
--- /dev/null
+++ b/kvmtool/mips/include/asm/kvm.h
@@ -0,0 +1,227 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (C) 2012  MIPS Technologies, Inc.  All rights reserved.
+ * Copyright (C) 2013 Cavium, Inc.
+ * Authors: Sanjay Lal <sanjayl@kymasys.com>
+ */
+
+#ifndef __LINUX_KVM_MIPS_H
+#define __LINUX_KVM_MIPS_H
+
+#include <linux/types.h>
+
+/*
+ * KVM MIPS specific structures and definitions.
+ *
+ * Some parts derived from the x86 version of this file.
+ */
+
+#define __KVM_HAVE_READONLY_MEM
+
+#define KVM_COALESCED_MMIO_PAGE_OFFSET 1
+
+/*
+ * for KVM_GET_REGS and KVM_SET_REGS
+ *
+ * If Config[AT] is zero (32-bit CPU), the register contents are
+ * stored in the lower 32-bits of the struct kvm_regs fields and sign
+ * extended to 64-bits.
+ */
+struct kvm_regs {
+	/* out (KVM_GET_REGS) / in (KVM_SET_REGS) */
+	__u64 gpr[32];
+	__u64 hi;
+	__u64 lo;
+	__u64 pc;
+};
+
+/*
+ * for KVM_GET_FPU and KVM_SET_FPU
+ */
+struct kvm_fpu {
+};
+
+
+/*
+ * For MIPS, we use KVM_SET_ONE_REG and KVM_GET_ONE_REG to access various
+ * registers.  The id field is broken down as follows:
+ *
+ *  bits[63..52] - As per linux/kvm.h
+ *  bits[51..32] - Must be zero.
+ *  bits[31..16] - Register set.
+ *
+ * Register set = 0: GP registers from kvm_regs (see definitions below).
+ *
+ * Register set = 1: CP0 registers.
+ *  bits[15..8]  - COP0 register set.
+ *
+ *  COP0 register set = 0: Main CP0 registers.
+ *   bits[7..3]   - Register 'rd'  index.
+ *   bits[2..0]   - Register 'sel' index.
+ *
+ *  COP0 register set = 1: MAARs.
+ *   bits[7..0]   - MAAR index.
+ *
+ * Register set = 2: KVM specific registers (see definitions below).
+ *
+ * Register set = 3: FPU / MSA registers (see definitions below).
+ *
+ * Other register sets may be added in the future.  Each set would
+ * have its own identifier in bits[31..16].
+ */
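+
+/*
+ * Example, assuming the layout above: the 32-bit CP0 Status register
+ * (register set 1, main CP0 set, rd = 12, sel = 0) has the id
+ *
+ *   KVM_REG_MIPS | KVM_REG_SIZE_U32 | (1 << 16) | (12 << 3) | 0
+ *
+ * which is exactly what mips/kvm-cpu.c builds when it programs CP0_Status.
+ */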
+
+#define KVM_REG_MIPS_GP		(KVM_REG_MIPS | 0x0000000000000000ULL)
+#define KVM_REG_MIPS_CP0	(KVM_REG_MIPS | 0x0000000000010000ULL)
+#define KVM_REG_MIPS_KVM	(KVM_REG_MIPS | 0x0000000000020000ULL)
+#define KVM_REG_MIPS_FPU	(KVM_REG_MIPS | 0x0000000000030000ULL)
+
+
+/*
+ * KVM_REG_MIPS_GP - General purpose registers from kvm_regs.
+ */
+
+#define KVM_REG_MIPS_R0		(KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 |  0)
+#define KVM_REG_MIPS_R1		(KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 |  1)
+#define KVM_REG_MIPS_R2		(KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 |  2)
+#define KVM_REG_MIPS_R3		(KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 |  3)
+#define KVM_REG_MIPS_R4		(KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 |  4)
+#define KVM_REG_MIPS_R5		(KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 |  5)
+#define KVM_REG_MIPS_R6		(KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 |  6)
+#define KVM_REG_MIPS_R7		(KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 |  7)
+#define KVM_REG_MIPS_R8		(KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 |  8)
+#define KVM_REG_MIPS_R9		(KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 |  9)
+#define KVM_REG_MIPS_R10	(KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 | 10)
+#define KVM_REG_MIPS_R11	(KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 | 11)
+#define KVM_REG_MIPS_R12	(KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 | 12)
+#define KVM_REG_MIPS_R13	(KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 | 13)
+#define KVM_REG_MIPS_R14	(KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 | 14)
+#define KVM_REG_MIPS_R15	(KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 | 15)
+#define KVM_REG_MIPS_R16	(KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 | 16)
+#define KVM_REG_MIPS_R17	(KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 | 17)
+#define KVM_REG_MIPS_R18	(KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 | 18)
+#define KVM_REG_MIPS_R19	(KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 | 19)
+#define KVM_REG_MIPS_R20	(KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 | 20)
+#define KVM_REG_MIPS_R21	(KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 | 21)
+#define KVM_REG_MIPS_R22	(KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 | 22)
+#define KVM_REG_MIPS_R23	(KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 | 23)
+#define KVM_REG_MIPS_R24	(KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 | 24)
+#define KVM_REG_MIPS_R25	(KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 | 25)
+#define KVM_REG_MIPS_R26	(KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 | 26)
+#define KVM_REG_MIPS_R27	(KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 | 27)
+#define KVM_REG_MIPS_R28	(KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 | 28)
+#define KVM_REG_MIPS_R29	(KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 | 29)
+#define KVM_REG_MIPS_R30	(KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 | 30)
+#define KVM_REG_MIPS_R31	(KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 | 31)
+
+#define KVM_REG_MIPS_HI		(KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 | 32)
+#define KVM_REG_MIPS_LO		(KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 | 33)
+#define KVM_REG_MIPS_PC		(KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 | 34)
+
+
+/*
+ * KVM_REG_MIPS_CP0 - Coprocessor 0 registers.
+ */
+
+#define KVM_REG_MIPS_MAAR	(KVM_REG_MIPS_CP0 | (1 << 8))
+#define KVM_REG_MIPS_CP0_MAAR(n)	(KVM_REG_MIPS_MAAR | \
+					 KVM_REG_SIZE_U64 | (n))
+
+
+/*
+ * KVM_REG_MIPS_KVM - KVM specific control registers.
+ */
+
+/*
+ * CP0_Count control
+ * DC:    Set 0: Master disable CP0_Count and set COUNT_RESUME to now
+ *        Set 1: Master re-enable CP0_Count with unchanged bias, handling timer
+ *               interrupts since COUNT_RESUME
+ *        This can be used to freeze the timer to get a consistent snapshot of
+ *        the CP0_Count and timer interrupt pending state, while also resuming
+ *        safely without losing time or guest timer interrupts.
+ * Other: Reserved, do not change.
+ */
+#define KVM_REG_MIPS_COUNT_CTL	    (KVM_REG_MIPS_KVM | KVM_REG_SIZE_U64 | 0)
+#define KVM_REG_MIPS_COUNT_CTL_DC	0x00000001
+
+/*
+ * CP0_Count resume monotonic nanoseconds
+ * The monotonic nanosecond time of the last set of COUNT_CTL.DC (master
+ * disable). Any reads and writes of Count related registers while
+ * COUNT_CTL.DC=1 will appear to occur at this time. When COUNT_CTL.DC is
+ * cleared again (master enable) any timer interrupts since this time will be
+ * emulated.
+ * Modifications to times in the future are rejected.
+ */
+#define KVM_REG_MIPS_COUNT_RESUME   (KVM_REG_MIPS_KVM | KVM_REG_SIZE_U64 | 1)
+/*
+ * CP0_Count rate in Hz
+ * Specifies the rate of the CP0_Count timer in Hz. Modifications occur without
+ * discontinuities in CP0_Count.
+ */
+#define KVM_REG_MIPS_COUNT_HZ	    (KVM_REG_MIPS_KVM | KVM_REG_SIZE_U64 | 2)
+
+
+/*
+ * KVM_REG_MIPS_FPU - Floating Point and MIPS SIMD Architecture (MSA) registers.
+ *
+ *  bits[15..8]  - Register subset (see definitions below).
+ *  bits[7..5]   - Must be zero.
+ *  bits[4..0]   - Register number within register subset.
+ */
+
+#define KVM_REG_MIPS_FPR	(KVM_REG_MIPS_FPU | 0x0000000000000000ULL)
+#define KVM_REG_MIPS_FCR	(KVM_REG_MIPS_FPU | 0x0000000000000100ULL)
+#define KVM_REG_MIPS_MSACR	(KVM_REG_MIPS_FPU | 0x0000000000000200ULL)
+
+/*
+ * KVM_REG_MIPS_FPR - Floating point / Vector registers.
+ */
+#define KVM_REG_MIPS_FPR_32(n)	(KVM_REG_MIPS_FPR | KVM_REG_SIZE_U32  | (n))
+#define KVM_REG_MIPS_FPR_64(n)	(KVM_REG_MIPS_FPR | KVM_REG_SIZE_U64  | (n))
+#define KVM_REG_MIPS_VEC_128(n)	(KVM_REG_MIPS_FPR | KVM_REG_SIZE_U128 | (n))
+
+/*
+ * KVM_REG_MIPS_FCR - Floating point control registers.
+ */
+#define KVM_REG_MIPS_FCR_IR	(KVM_REG_MIPS_FCR | KVM_REG_SIZE_U32 |  0)
+#define KVM_REG_MIPS_FCR_CSR	(KVM_REG_MIPS_FCR | KVM_REG_SIZE_U32 | 31)
+
+/*
+ * KVM_REG_MIPS_MSACR - MIPS SIMD Architecture (MSA) control registers.
+ */
+#define KVM_REG_MIPS_MSA_IR	 (KVM_REG_MIPS_MSACR | KVM_REG_SIZE_U32 |  0)
+#define KVM_REG_MIPS_MSA_CSR	 (KVM_REG_MIPS_MSACR | KVM_REG_SIZE_U32 |  1)
+
+
+/*
+ * KVM MIPS specific structures and definitions
+ *
+ */
+struct kvm_debug_exit_arch {
+	__u64 epc;
+};
+
+/* for KVM_SET_GUEST_DEBUG */
+struct kvm_guest_debug_arch {
+};
+
+/* definition of registers in kvm_run */
+struct kvm_sync_regs {
+};
+
+/* dummy definition */
+struct kvm_sregs {
+};
+
+struct kvm_mips_interrupt {
+	/* in */
+	__u32 cpu;
+	__u32 irq;
+};
+
+#endif /* __LINUX_KVM_MIPS_H */
diff --git a/kvmtool/mips/include/kvm/barrier.h b/kvmtool/mips/include/kvm/barrier.h
new file mode 100644
index 0000000..45bfa72
--- /dev/null
+++ b/kvmtool/mips/include/kvm/barrier.h
@@ -0,0 +1,20 @@
+#ifndef _KVM_BARRIER_H_
+#define _KVM_BARRIER_H_
+
+#define barrier() asm volatile("": : :"memory")
+
+#define mb()	asm volatile (".set push\n\t.set mips2\n\tsync\n\t.set pop": : :"memory")
+#define rmb() mb()
+#define wmb() mb()
+
+#ifdef CONFIG_SMP
+#define smp_mb()	mb()
+#define smp_rmb()	rmb()
+#define smp_wmb()	wmb()
+#else
+#define smp_mb()	barrier()
+#define smp_rmb()	barrier()
+#define smp_wmb()	barrier()
+#endif
+
+#endif /* _KVM_BARRIER_H_ */
diff --git a/kvmtool/mips/include/kvm/fdt-arch.h b/kvmtool/mips/include/kvm/fdt-arch.h
new file mode 100644
index 0000000..b030245
--- /dev/null
+++ b/kvmtool/mips/include/kvm/fdt-arch.h
@@ -0,0 +1,6 @@
+#ifndef KVM__KVM_FDT_H
+#define KVM__KVM_FDT_H
+
+enum phandles {PHANDLE_RESERVED = 0, PHANDLES_MAX};
+
+#endif /* KVM__KVM_FDT_H */
diff --git a/kvmtool/mips/include/kvm/kvm-arch.h b/kvmtool/mips/include/kvm/kvm-arch.h
new file mode 100644
index 0000000..fdc09d8
--- /dev/null
+++ b/kvmtool/mips/include/kvm/kvm-arch.h
@@ -0,0 +1,50 @@
+#ifndef KVM__KVM_ARCH_H
+#define KVM__KVM_ARCH_H
+
+
+/*
+ * Guest memory map is:
+ *   0x00000000-0x0fffffff : System RAM
+ *   0x10000000-0x1fffffff : I/O (defined by KVM_MMIO_START and KVM_MMIO_SIZE)
+ *   0x20000000-    ...    : System RAM
+ * See also kvm__init_ram().
+ */
+
+#define KVM_MMIO_START		0x10000000
+#define KVM_PCI_CFG_AREA	KVM_MMIO_START
+#define KVM_PCI_MMIO_AREA	(KVM_MMIO_START + 0x1000000)
+#define KVM_VIRTIO_MMIO_AREA	(KVM_MMIO_START + 0x2000000)
+#define KVM_MMIO_SIZE		0x10000000
+
+/*
+ * Just for reference. This and the above correspond to what's used
+ * in mipsvz_page_fault() in kvm_mipsvz.c of the host kernel.
+ */
+#define KVM_MIPS_IOPORT_AREA	0x1e000000
+#define KVM_MIPS_IOPORT_SIZE	0x00010000
+#define KVM_MIPS_IRQCHIP_AREA	0x1e010000
+#define KVM_MIPS_IRQCHIP_SIZE	0x00010000
+
+#define KVM_IRQ_OFFSET		1
+
+/*
+ * MIPS-VZ (trap and emulate is 0)
+ */
+#define KVM_VM_TYPE		1
+
+#define KVM_IOEVENTFD_HAS_PIO	0
+
+#define VIRTIO_DEFAULT_TRANS(kvm)	VIRTIO_PCI
+
+#include <stdbool.h>
+
+#include "linux/types.h"
+
+struct kvm_arch {
+	u64 entry_point;
+	u64 argc;
+	u64 argv;
+	bool is64bit;
+};
+
+#endif /* KVM__KVM_ARCH_H */
diff --git a/kvmtool/mips/include/kvm/kvm-config-arch.h b/kvmtool/mips/include/kvm/kvm-config-arch.h
new file mode 100644
index 0000000..8a28f9d
--- /dev/null
+++ b/kvmtool/mips/include/kvm/kvm-config-arch.h
@@ -0,0 +1,7 @@
+#ifndef KVM__KVM_CONFIG_ARCH_H
+#define KVM__KVM_CONFIG_ARCH_H
+
+struct kvm_config_arch {
+};
+
+#endif /* KVM__KVM_CONFIG_ARCH_H */
diff --git a/kvmtool/mips/include/kvm/kvm-cpu-arch.h b/kvmtool/mips/include/kvm/kvm-cpu-arch.h
new file mode 100644
index 0000000..45e69f6
--- /dev/null
+++ b/kvmtool/mips/include/kvm/kvm-cpu-arch.h
@@ -0,0 +1,43 @@
+#ifndef KVM__KVM_CPU_ARCH_H
+#define KVM__KVM_CPU_ARCH_H
+
+#include <linux/kvm.h>	/* for struct kvm_regs */
+#include "kvm/kvm.h"	/* for kvm__emulate_{mm}io() */
+#include <pthread.h>
+
+struct kvm;
+
+struct kvm_cpu {
+	pthread_t	thread;		/* VCPU thread */
+
+	unsigned long	cpu_id;
+
+	struct kvm	*kvm;		/* parent KVM */
+	int		vcpu_fd;	/* For VCPU ioctls() */
+	struct kvm_run	*kvm_run;
+	struct kvm_cpu_task	*task;
+
+	struct kvm_regs	regs;
+
+	u8		is_running;
+	u8		paused;
+	u8		needs_nmi;
+
+	struct kvm_coalesced_mmio_ring *ring;
+};
+
+/*
+ * As these are such simple wrappers, let's have them in the header so they'll
+ * be cheaper to call:
+ */
+static inline bool kvm_cpu__emulate_io(struct kvm_cpu *vcpu, u16 port, void *data, int direction, int size, u32 count)
+{
+	return kvm__emulate_io(vcpu, port, data, direction, size, count);
+}
+
+static inline bool kvm_cpu__emulate_mmio(struct kvm_cpu *vcpu, u64 phys_addr, u8 *data, u32 len, u8 is_write)
+{
+	return kvm__emulate_mmio(vcpu, phys_addr, data, len, is_write);
+}
+
+#endif /* KVM__KVM_CPU_ARCH_H */
diff --git a/kvmtool/mips/kvm-cpu.c b/kvmtool/mips/kvm-cpu.c
new file mode 100644
index 0000000..30a3de1
--- /dev/null
+++ b/kvmtool/mips/kvm-cpu.c
@@ -0,0 +1,229 @@
+#include "kvm/kvm-cpu.h"
+#include "kvm/term.h"
+
+#include <stdlib.h>
+
+static int debug_fd;
+
+void kvm_cpu__set_debug_fd(int fd)
+{
+	debug_fd = fd;
+}
+
+int kvm_cpu__get_debug_fd(void)
+{
+	return debug_fd;
+}
+
+void kvm_cpu__delete(struct kvm_cpu *vcpu)
+{
+	free(vcpu);
+}
+
+static struct kvm_cpu *kvm_cpu__new(struct kvm *kvm)
+{
+	struct kvm_cpu *vcpu;
+
+	vcpu = calloc(1, sizeof(*vcpu));
+	if (!vcpu)
+		return NULL;
+
+	vcpu->kvm = kvm;
+
+	return vcpu;
+}
+
+struct kvm_cpu *kvm_cpu__arch_init(struct kvm *kvm, unsigned long cpu_id)
+{
+	struct kvm_cpu *vcpu;
+	int mmap_size;
+	int coalesced_offset;
+
+	vcpu = kvm_cpu__new(kvm);
+	if (!vcpu)
+		return NULL;
+
+	vcpu->cpu_id = cpu_id;
+
+	vcpu->vcpu_fd = ioctl(vcpu->kvm->vm_fd, KVM_CREATE_VCPU, cpu_id);
+	if (vcpu->vcpu_fd < 0)
+		die_perror("KVM_CREATE_VCPU ioctl");
+
+	mmap_size = ioctl(vcpu->kvm->sys_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
+	if (mmap_size < 0)
+		die_perror("KVM_GET_VCPU_MMAP_SIZE ioctl");
+
+	vcpu->kvm_run = mmap(NULL, mmap_size, PROT_RW, MAP_SHARED, vcpu->vcpu_fd, 0);
+	if (vcpu->kvm_run == MAP_FAILED)
+		die("unable to mmap vcpu fd");
+
+	vcpu->is_running = true;
+
+	coalesced_offset = ioctl(kvm->sys_fd, KVM_CHECK_EXTENSION, KVM_CAP_COALESCED_MMIO);
+	if (coalesced_offset)
+		vcpu->ring = (void *)vcpu->kvm_run + (coalesced_offset * PAGE_SIZE);
+
+	return vcpu;
+}
+
+static void kvm_cpu__setup_regs(struct kvm_cpu *vcpu)
+{
+	uint32_t v;
+	struct kvm_one_reg one_reg;
+
+	memset(&vcpu->regs, 0, sizeof(vcpu->regs));
+	vcpu->regs.pc = vcpu->kvm->arch.entry_point;
+	vcpu->regs.gpr[4] = vcpu->kvm->arch.argc;
+	vcpu->regs.gpr[5] = vcpu->kvm->arch.argv;
+
+	if (ioctl(vcpu->vcpu_fd, KVM_SET_REGS, &vcpu->regs) < 0)
+		die_perror("KVM_SET_REGS failed");
+
+
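+	/*
+	 * Program CP0_Status (register set 1, main CP0 set, rd = 12, sel = 0)
+	 * to 0x6, i.e. Status.EXL | Status.ERL, roughly mirroring the reset
+	 * state of a real MIPS core, where ERL comes up set.
+	 */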
+	one_reg.id = KVM_REG_MIPS | KVM_REG_SIZE_U32 | (0x10000 + 8 * 12 + 0); /* Status */
+	one_reg.addr = (unsigned long)(uint32_t *)&v;
+	v = 6;
+
+	if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &one_reg) < 0)
+		die_perror("KVM_SET_ONE_REG failed");
+}
+
+/**
+ * kvm_cpu__reset_vcpu - reset virtual CPU to a known state
+ */
+void kvm_cpu__reset_vcpu(struct kvm_cpu *vcpu)
+{
+	kvm_cpu__setup_regs(vcpu);
+}
+
+static bool kvm_cpu__hypercall_write_cons(struct kvm_cpu *vcpu)
+{
+	int term = (int)vcpu->kvm_run->hypercall.args[0];
+	u64 addr = vcpu->kvm_run->hypercall.args[1];
+	int len = (int)vcpu->kvm_run->hypercall.args[2];
+	char *host_addr;
+
+	if (term < 0 || term >= TERM_MAX_DEVS) {
+		pr_warning("hypercall_write_cons term out of range <%d>", term);
+		return false;
+	}
+
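+	/*
+	 * The guest hands us a virtual address; strip the MIPS segment bits
+	 * (KSEG0/KSEG1 for 32-bit kernels, XKPHYS for 64-bit ones) so that
+	 * guest_flat_to_host() sees a physical address.
+	 */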
+	if ((addr & 0xffffffffc0000000ull) == 0xffffffff80000000ull)
+		addr &= 0x1ffffffful; /* Convert KSEG{0,1} to physical. */
+	if ((addr & 0xc000000000000000ull) == 0x8000000000000000ull)
+		addr &= 0x07ffffffffffffffull; /* Convert XKPHYS to physical. */
+
+	host_addr = guest_flat_to_host(vcpu->kvm, addr);
+	if (!host_addr) {
+		pr_warning("hypercall_write_cons unmapped physaddr %llx", (unsigned long long)addr);
+		return false;
+	}
+
+	if ((len <= 0) || !host_ptr_in_ram(vcpu->kvm, host_addr + len)) {
+		pr_warning("hypercall_write_cons len out of range <%d>", len);
+		return false;
+	}
+
+	term_putc(host_addr, len, term);
+
+	return true;
+}
+
+#define KVM_HC_MIPS_CONSOLE_OUTPUT 8
+bool kvm_cpu__handle_exit(struct kvm_cpu *vcpu)
+{
+	switch(vcpu->kvm_run->exit_reason) {
+	case KVM_EXIT_HYPERCALL:
+		if (vcpu->kvm_run->hypercall.nr == KVM_HC_MIPS_CONSOLE_OUTPUT) {
+			return kvm_cpu__hypercall_write_cons(vcpu);
+		} else {
+			pr_warning("KVM_EXIT_HYPERCALL unrecognized call %llu",
+				   (unsigned long long)vcpu->kvm_run->hypercall.nr);
+			return false;
+		}
+	case KVM_EXIT_EXCEPTION:
+	case KVM_EXIT_INTERNAL_ERROR:
+		return false;
+	default:
+		break;
+	}
+	return false;
+}
+
+void kvm_cpu__arch_nmi(struct kvm_cpu *cpu)
+{
+}
+
+void kvm_cpu__show_registers(struct kvm_cpu *vcpu)
+{
+	struct kvm_regs regs;
+
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_REGS, &regs) < 0)
+		die("KVM_GET_REGS failed");
+	dprintf(debug_fd, "\n Registers:\n");
+	dprintf(debug_fd,   " ----------\n");
+	dprintf(debug_fd, "$0   : %016llx %016llx %016llx %016llx\n",
+		(unsigned long long)regs.gpr[0],
+		(unsigned long long)regs.gpr[1],
+		(unsigned long long)regs.gpr[2],
+		(unsigned long long)regs.gpr[3]);
+	dprintf(debug_fd, "$4   : %016llx %016llx %016llx %016llx\n",
+		(unsigned long long)regs.gpr[4],
+		(unsigned long long)regs.gpr[5],
+		(unsigned long long)regs.gpr[6],
+		(unsigned long long)regs.gpr[7]);
+	dprintf(debug_fd, "$8   : %016llx %016llx %016llx %016llx\n",
+		(unsigned long long)regs.gpr[8],
+		(unsigned long long)regs.gpr[9],
+		(unsigned long long)regs.gpr[10],
+		(unsigned long long)regs.gpr[11]);
+	dprintf(debug_fd, "$12  : %016llx %016llx %016llx %016llx\n",
+		(unsigned long long)regs.gpr[12],
+		(unsigned long long)regs.gpr[13],
+		(unsigned long long)regs.gpr[14],
+		(unsigned long long)regs.gpr[15]);
+	dprintf(debug_fd, "$16  : %016llx %016llx %016llx %016llx\n",
+		(unsigned long long)regs.gpr[16],
+		(unsigned long long)regs.gpr[17],
+		(unsigned long long)regs.gpr[18],
+		(unsigned long long)regs.gpr[19]);
+	dprintf(debug_fd, "$20  : %016llx %016llx %016llx %016llx\n",
+		(unsigned long long)regs.gpr[20],
+		(unsigned long long)regs.gpr[21],
+		(unsigned long long)regs.gpr[22],
+		(unsigned long long)regs.gpr[23]);
+	dprintf(debug_fd, "$24  : %016llx %016llx %016llx %016llx\n",
+		(unsigned long long)regs.gpr[24],
+		(unsigned long long)regs.gpr[25],
+		(unsigned long long)regs.gpr[26],
+		(unsigned long long)regs.gpr[27]);
+	dprintf(debug_fd, "$28  : %016llx %016llx %016llx %016llx\n",
+		(unsigned long long)regs.gpr[28],
+		(unsigned long long)regs.gpr[29],
+		(unsigned long long)regs.gpr[30],
+		(unsigned long long)regs.gpr[31]);
+
+	dprintf(debug_fd, "hi   : %016llx\n", (unsigned long long)regs.hi);
+	dprintf(debug_fd, "lo   : %016llx\n", (unsigned long long)regs.lo);
+	dprintf(debug_fd, "epc  : %016llx\n", (unsigned long long)regs.pc);
+
+	dprintf(debug_fd, "\n");
+}
+
+void kvm_cpu__show_code(struct kvm_cpu *vcpu)
+{
+}
+
+void kvm_cpu__show_page_tables(struct kvm_cpu *vcpu)
+{
+}
diff --git a/kvmtool/mips/kvm.c b/kvmtool/mips/kvm.c
new file mode 100644
index 0000000..2635593
--- /dev/null
+++ b/kvmtool/mips/kvm.c
@@ -0,0 +1,372 @@
+#include "kvm/kvm.h"
+#include "kvm/ioport.h"
+#include "kvm/virtio-console.h"
+
+#include <linux/kvm.h>
+
+#include <ctype.h>
+#include <unistd.h>
+#include <elf.h>
+
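+/* MIPS needs no KVM capabilities beyond the core set, hence only the terminator. */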
+struct kvm_ext kvm_req_ext[] = {
+	{ 0, 0 }
+};
+
+void kvm__arch_read_term(struct kvm *kvm)
+{
+	virtio_console__inject_interrupt(kvm);
+}
+
+void kvm__init_ram(struct kvm *kvm)
+{
+	u64	phys_start, phys_size;
+	void	*host_mem;
+
+	if (kvm->ram_size <= KVM_MMIO_START) {
+		/* one region for all memory */
+		phys_start = 0;
+		phys_size  = kvm->ram_size;
+		host_mem   = kvm->ram_start;
+
+		kvm__register_ram(kvm, phys_start, phys_size, host_mem);
+	} else {
+		/* one region for memory that fits below MMIO range */
+		phys_start = 0;
+		phys_size  = KVM_MMIO_START;
+		host_mem   = kvm->ram_start;
+
+		kvm__register_ram(kvm, phys_start, phys_size, host_mem);
+
+		/* one region for rest of memory */
+		phys_start = KVM_MMIO_START + KVM_MMIO_SIZE;
+		phys_size  = kvm->ram_size - KVM_MMIO_START;
+		host_mem   = kvm->ram_start + KVM_MMIO_START;
+
+		kvm__register_ram(kvm, phys_start, phys_size, host_mem);
+	}
+}
+
+void kvm__arch_delete_ram(struct kvm *kvm)
+{
+	munmap(kvm->ram_start, kvm->ram_size);
+}
+
+void kvm__arch_set_cmdline(char *cmdline, bool video)
+{
+
+}
+
+/* Architecture-specific KVM init */
+void kvm__arch_init(struct kvm *kvm, const char *hugetlbfs_path, u64 ram_size)
+{
+	int ret;
+
+	kvm->ram_start = mmap_anon_or_hugetlbfs(kvm, hugetlbfs_path, ram_size);
+	kvm->ram_size = ram_size;
+
+	if (kvm->ram_start == MAP_FAILED)
+		die("out of memory");
+
+	madvise(kvm->ram_start, kvm->ram_size, MADV_MERGEABLE);
+
+	ret = ioctl(kvm->vm_fd, KVM_CREATE_IRQCHIP);
+	if (ret < 0)
+		die_perror("KVM_CREATE_IRQCHIP ioctl");
+}
+
+void kvm__irq_line(struct kvm *kvm, int irq, int level)
+{
+	struct kvm_irq_level irq_level;
+	int ret;
+
+	irq_level.irq = irq;
+	irq_level.level = level ? 1 : 0;
+
+	ret = ioctl(kvm->vm_fd, KVM_IRQ_LINE, &irq_level);
+	if (ret < 0)
+		die_perror("KVM_IRQ_LINE ioctl");
+}
+
+void kvm__irq_trigger(struct kvm *kvm, int irq)
+{
+	struct kvm_irq_level irq_level;
+	int ret;
+
+	irq_level.irq = irq;
+	irq_level.level = 1;
+
+	ret = ioctl(kvm->vm_fd, KVM_IRQ_LINE, &irq_level);
+	if (ret < 0)
+		die_perror("KVM_IRQ_LINE ioctl");
+}
+
+int ioport__setup_arch(struct kvm *kvm)
+{
+	return 0;
+}
+
+bool kvm__arch_cpu_supports_vm(void)
+{
+	return true;
+}
+bool kvm__load_firmware(struct kvm *kvm, const char *firmware_filename)
+{
+	return false;
+}
+int kvm__arch_setup_firmware(struct kvm *kvm)
+{
+	return 0;
+}
+
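+/*
+ * Build the kernel command line in guest RAM: the flat string lives at
+ * offset 0x2000 and an argv[] array of KSEG0 pointers to its
+ * whitespace-separated words at offset 0x3000. kvm_cpu__setup_regs()
+ * later hands argc/argv to the kernel in $a0/$a1.
+ */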
+static void kvm__mips_install_cmdline(struct kvm *kvm)
+{
+	char *p = kvm->ram_start;
+	u64 cmdline_offset = 0x2000;
+	u64 argv_start = 0x3000;
+	u64 argv_offset = argv_start;
+	u64 argc = 0;
+
+
+	if ((u64) kvm->ram_size <= KVM_MMIO_START)
+		sprintf(p + cmdline_offset, "mem=0x%llx@0 ",
+			(unsigned long long)kvm->ram_size);
+	else
+		sprintf(p + cmdline_offset, "mem=0x%llx@0 mem=0x%llx@0x%llx ",
+			(unsigned long long)KVM_MMIO_START,
+			(unsigned long long)kvm->ram_size - KVM_MMIO_START,
+			(unsigned long long)(KVM_MMIO_START + KVM_MMIO_SIZE));
+
+	strcat(p + cmdline_offset, kvm->cfg.real_cmdline); /* maximum size is 2K */
+
+	while (p[cmdline_offset]) {
+		if (!isspace(p[cmdline_offset])) {
+			if (kvm->arch.is64bit) {
+				*(u64 *)(p + argv_offset) = 0xffffffff80000000ull + cmdline_offset;
+				argv_offset += sizeof(u64);
+			} else {
+				*(u32 *)(p + argv_offset) = 0x80000000u + cmdline_offset;
+				argv_offset += sizeof(u32);
+			}
+			argc++;
+			while(p[cmdline_offset] && !isspace(p[cmdline_offset]))
+				cmdline_offset++;
+			continue;
+		}
+		/* Must be a space character; skip over these. */
+		while(p[cmdline_offset] && isspace(p[cmdline_offset])) {
+			p[cmdline_offset] = 0;
+			cmdline_offset++;
+		}
+	}
+	kvm->arch.argc = argc;
+	kvm->arch.argv = 0xffffffff80000000ull + argv_start;
+}
+
+/* Load at the 16M point. */
+#define KERNEL_LOAD_ADDR 0x1000000
+
+static bool load_flat_binary(struct kvm *kvm, int fd_kernel)
+{
+	void *p;
+	void *k_start;
+	ssize_t kernel_size;
+
+	if (lseek(fd_kernel, 0, SEEK_SET) < 0)
+		die_perror("lseek");
+
+	p = k_start = guest_flat_to_host(kvm, KERNEL_LOAD_ADDR);
+
+	kernel_size = read_file(fd_kernel, p,
+				kvm->cfg.ram_size - KERNEL_LOAD_ADDR);
+	if (kernel_size == -1) {
+		if (errno == ENOMEM)
+			die("kernel too big for guest memory");
+		else
+			die_perror("kernel read");
+	}
+
+	kvm->arch.is64bit = true;
+	kvm->arch.entry_point = 0xffffffff81000000ull;
+
+	pr_info("Loaded kernel to 0x%x (%zd bytes)", KERNEL_LOAD_ADDR,
+		kernel_size);
+
+	return true;
+}
+
+struct kvm__arch_elf_info {
+	u64 load_addr;
+	u64 entry_point;
+	size_t len;
+	size_t offset;
+};
+
+static bool kvm__arch_get_elf_64_info(Elf64_Ehdr *ehdr, int fd_kernel,
+				      struct kvm__arch_elf_info *ei)
+{
+	int i;
+	Elf64_Phdr phdr;
+
+	if (ehdr->e_phentsize != sizeof(phdr)) {
+		pr_info("Incompatible ELF PHENTSIZE %d", ehdr->e_phentsize);
+		return false;
+	}
+
+	ei->entry_point = ehdr->e_entry;
+
+	if (lseek(fd_kernel, ehdr->e_phoff, SEEK_SET) < 0)
+		die_perror("lseek");
+
+	phdr.p_type = PT_NULL;
+	for (i = 0; i < ehdr->e_phnum; i++) {
+		if (read_in_full(fd_kernel, &phdr, sizeof(phdr)) != sizeof(phdr)) {
+			pr_info("Couldn't read %d bytes for ELF PHDR.", (int)sizeof(phdr));
+			return false;
+		}
+		if (phdr.p_type == PT_LOAD)
+			break;
+	}
+	if (phdr.p_type != PT_LOAD) {
+		pr_info("No PT_LOAD Program Header found.");
+		return false;
+	}
+
+	ei->load_addr = phdr.p_paddr;
+
+	if ((ei->load_addr & 0xffffffffc0000000ull) == 0xffffffff80000000ull)
+		ei->load_addr &= 0x1ffffffful; /* Convert KSEG{0,1} to physical. */
+	if ((ei->load_addr & 0xc000000000000000ull) == 0x8000000000000000ull)
+		ei->load_addr &= 0x07ffffffffffffffull; /* Convert XKPHYS to physical. */
+
+
+	ei->len = phdr.p_filesz;
+	ei->offset = phdr.p_offset;
+
+	return true;
+}
+
+static bool kvm__arch_get_elf_32_info(Elf32_Ehdr *ehdr, int fd_kernel,
+				      struct kvm__arch_elf_info *ei)
+{
+	int i;
+	Elf32_Phdr phdr;
+
+	if (ehdr->e_phentsize != sizeof(phdr)) {
+		pr_info("Incompatible ELF PHENTSIZE %d", ehdr->e_phentsize);
+		return false;
+	}
+
+	ei->entry_point = (s64)((s32)ehdr->e_entry);
+
+	if (lseek(fd_kernel, ehdr->e_phoff, SEEK_SET) < 0)
+		die_perror("lseek");
+
+	phdr.p_type = PT_NULL;
+	for (i = 0; i < ehdr->e_phnum; i++) {
+		if (read_in_full(fd_kernel, &phdr, sizeof(phdr)) != sizeof(phdr)) {
+			pr_info("Couldn't read %d bytes for ELF PHDR.", (int)sizeof(phdr));
+			return false;
+		}
+		if (phdr.p_type == PT_LOAD)
+			break;
+	}
+	if (phdr.p_type != PT_LOAD) {
+		pr_info("No PT_LOAD Program Header found.");
+		return false;
+	}
+
+	ei->load_addr = (s64)((s32)phdr.p_paddr);
+
+	if ((ei->load_addr & 0xffffffffc0000000ull) == 0xffffffff80000000ull)
+		ei->load_addr &= 0x1fffffffull; /* Convert KSEG{0,1} to physical. */
+
+	ei->len = phdr.p_filesz;
+	ei->offset = phdr.p_offset;
+
+	return true;
+}
+
+static bool load_elf_binary(struct kvm *kvm, int fd_kernel)
+{
+	union {
+		Elf64_Ehdr ehdr;
+		Elf32_Ehdr ehdr32;
+	} eh;
+
+	size_t nr;
+	char *p;
+	struct kvm__arch_elf_info ei;
+
+	nr = read(fd_kernel, &eh, sizeof(eh));
+	if (nr != sizeof(eh)) {
+		pr_info("Couldn't read %d bytes for ELF header.", (int)sizeof(eh));
+		return false;
+	}
+
+	if (eh.ehdr.e_ident[EI_MAG0] != ELFMAG0 ||
+	    eh.ehdr.e_ident[EI_MAG1] != ELFMAG1 ||
+	    eh.ehdr.e_ident[EI_MAG2] != ELFMAG2 ||
+	    eh.ehdr.e_ident[EI_MAG3] != ELFMAG3 ||
+	    (eh.ehdr.e_ident[EI_CLASS] != ELFCLASS64 && eh.ehdr.e_ident[EI_CLASS] != ELFCLASS32) ||
+	    eh.ehdr.e_ident[EI_VERSION] != EV_CURRENT) {
+		pr_info("Incompatible ELF header.");
+		return false;
+	}
+	if (eh.ehdr.e_type != ET_EXEC || eh.ehdr.e_machine != EM_MIPS) {
+		pr_info("Incompatible ELF not MIPS EXEC.");
+		return false;
+	}
+
+	if (eh.ehdr.e_ident[EI_CLASS] == ELFCLASS64) {
+		if (!kvm__arch_get_elf_64_info(&eh.ehdr, fd_kernel, &ei))
+			return false;
+		kvm->arch.is64bit = true;
+	} else {
+		if (!kvm__arch_get_elf_32_info(&eh.ehdr32, fd_kernel, &ei))
+			return false;
+		kvm->arch.is64bit = false;
+	}
+
+	kvm->arch.entry_point = ei.entry_point;
+
+	if (lseek(fd_kernel, ei.offset, SEEK_SET) < 0)
+		die_perror("lseek");
+
+	p = guest_flat_to_host(kvm, ei.load_addr);
+
+	pr_info("ELF Loading 0x%lx bytes from 0x%llx to 0x%llx",
+		(unsigned long)ei.len, (unsigned long long)ei.offset,
+		(unsigned long long)ei.load_addr);
+
+	if (read_in_full(fd_kernel, p, ei.len) != (ssize_t)ei.len)
+		die_perror("read");
+
+	return true;
+}
+
+bool kvm__arch_load_kernel_image(struct kvm *kvm, int fd_kernel, int fd_initrd,
+				 const char *kernel_cmdline)
+{
+	if (fd_initrd != -1) {
+		pr_err("Initrd not supported on MIPS.");
+		return false;
+	}
+
+	if (load_elf_binary(kvm, fd_kernel)) {
+		kvm__mips_install_cmdline(kvm);
+		return true;
+	}
+
+	return load_flat_binary(kvm, fd_kernel);
+}
+
+void ioport__map_irq(u8 *irq)
+{
+}
diff --git a/kvmtool/mmio.c b/kvmtool/mmio.c
new file mode 100644
index 0000000..cd141cd
--- /dev/null
+++ b/kvmtool/mmio.c
@@ -0,0 +1,206 @@
+#include "kvm/kvm.h"
+#include "kvm/kvm-cpu.h"
+#include "kvm/rbtree-interval.h"
+#include "kvm/mutex.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <sys/ioctl.h>
+#include <linux/kvm.h>
+#include <linux/types.h>
+#include <linux/rbtree.h>
+#include <linux/err.h>
+#include <errno.h>
+
+#define mmio_node(n) rb_entry(n, struct mmio_mapping, node)
+
+static DEFINE_MUTEX(mmio_lock);
+
+struct mmio_mapping {
+	struct rb_int_node	node;
+	void			(*mmio_fn)(struct kvm_cpu *vcpu, u64 addr, u8 *data, u32 len, u8 is_write, void *ptr);
+	void			*ptr;
+	u32			refcount;
+	bool			remove;
+};
+
+static struct rb_root mmio_tree = RB_ROOT;
+
+static struct mmio_mapping *mmio_search(struct rb_root *root, u64 addr, u64 len)
+{
+	struct rb_int_node *node;
+
+	node = rb_int_search_range(root, addr, addr + len);
+	if (node == NULL)
+		return NULL;
+
+	return mmio_node(node);
+}
+
+/* Find the lowest match and check for overlap. */
+static struct mmio_mapping *mmio_search_single(struct rb_root *root, u64 addr)
+{
+	struct rb_int_node *node;
+
+	node = rb_int_search_single(root, addr);
+	if (node == NULL)
+		return NULL;
+
+	return mmio_node(node);
+}
+
+static int mmio_insert(struct rb_root *root, struct mmio_mapping *data)
+{
+	return rb_int_insert(root, &data->node);
+}
+
+static void mmio_remove(struct rb_root *root, struct mmio_mapping *data)
+{
+	rb_int_erase(root, &data->node);
+}
+
+static const char *to_direction(u8 is_write)
+{
+	if (is_write)
+		return "write";
+
+	return "read";
+}
+
+static struct mmio_mapping *mmio_get(struct rb_root *root, u64 phys_addr, u32 len)
+{
+	struct mmio_mapping *mmio;
+
+	mutex_lock(&mmio_lock);
+	mmio = mmio_search(root, phys_addr, len);
+	if (mmio)
+		mmio->refcount++;
+	mutex_unlock(&mmio_lock);
+
+	return mmio;
+}
+
+/* Called with mmio_lock held. */
+static void mmio_deregister(struct kvm *kvm, struct rb_root *root, struct mmio_mapping *mmio)
+{
+	struct kvm_coalesced_mmio_zone zone = (struct kvm_coalesced_mmio_zone) {
+		.addr	= rb_int_start(&mmio->node),
+		.size	= 1,
+	};
+	ioctl(kvm->vm_fd, KVM_UNREGISTER_COALESCED_MMIO, &zone);
+
+	mmio_remove(root, mmio);
+	free(mmio);
+}
+
+static void mmio_put(struct kvm *kvm, struct rb_root *root, struct mmio_mapping *mmio)
+{
+	mutex_lock(&mmio_lock);
+	mmio->refcount--;
+	if (mmio->remove && mmio->refcount == 0)
+		mmio_deregister(kvm, root, mmio);
+	mutex_unlock(&mmio_lock);
+}
+
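+/*
+ * Register an MMIO handler for [phys_addr, phys_addr + phys_addr_len).
+ * A device model would typically call this from its init path, e.g.
+ * (with a hypothetical handler my_mmio_fn and device cookie dev):
+ *
+ *   kvm__register_mmio(kvm, base, size, false, my_mmio_fn, dev);
+ *
+ * and tear the range down again with kvm__deregister_mmio(kvm, base).
+ */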
+int kvm__register_mmio(struct kvm *kvm, u64 phys_addr, u64 phys_addr_len, bool coalesce,
+		       void (*mmio_fn)(struct kvm_cpu *vcpu, u64 addr, u8 *data, u32 len, u8 is_write, void *ptr),
+			void *ptr)
+{
+	struct mmio_mapping *mmio;
+	struct kvm_coalesced_mmio_zone zone;
+	int ret;
+
+	mmio = malloc(sizeof(*mmio));
+	if (mmio == NULL)
+		return -ENOMEM;
+
+	*mmio = (struct mmio_mapping) {
+		.node		= RB_INT_INIT(phys_addr, phys_addr + phys_addr_len),
+		.mmio_fn	= mmio_fn,
+		.ptr		= ptr,
+		/*
+		 * Start from 0 because kvm__deregister_mmio() doesn't decrement
+		 * the reference count.
+		 */
+		.refcount	= 0,
+		.remove		= false,
+	};
+
+	if (coalesce) {
+		zone = (struct kvm_coalesced_mmio_zone) {
+			.addr	= phys_addr,
+			.size	= phys_addr_len,
+		};
+		ret = ioctl(kvm->vm_fd, KVM_REGISTER_COALESCED_MMIO, &zone);
+		if (ret < 0) {
+			free(mmio);
+			return -errno;
+		}
+	}
+	mutex_lock(&mmio_lock);
+	ret = mmio_insert(&mmio_tree, mmio);
+	mutex_unlock(&mmio_lock);
+
+	return ret;
+}
+
+bool kvm__deregister_mmio(struct kvm *kvm, u64 phys_addr)
+{
+	struct mmio_mapping *mmio;
+
+	mutex_lock(&mmio_lock);
+	mmio = mmio_search_single(&mmio_tree, phys_addr);
+	if (mmio == NULL) {
+		mutex_unlock(&mmio_lock);
+		return false;
+	}
+	/*
+	 * The PCI emulation code calls this function when memory access is
+	 * disabled for a device, or when a BAR has a new address assigned. PCI
+	 * emulation doesn't use any locks and as a result we can end up in a
+	 * situation where we have called mmio_get() to do emulation on one VCPU
+	 * thread (let's call it VCPU0), and several other VCPU threads have
+	 * called kvm__deregister_mmio(). In this case, if we decrement refcount
+	 * kvm__deregister_mmio() (either directly, or by calling mmio_put()),
+	 * refcount will reach 0 and we will free the mmio node before VCPU0 has
+	 * called mmio_put(). This will trigger use-after-free errors on VCPU0.
+	 */
+	if (mmio->refcount == 0)
+		mmio_deregister(kvm, &mmio_tree, mmio);
+	else
+		mmio->remove = true;
+	mutex_unlock(&mmio_lock);
+
+	return true;
+}
+
+bool kvm__emulate_mmio(struct kvm_cpu *vcpu, u64 phys_addr, u8 *data, u32 len, u8 is_write)
+{
+	struct mmio_mapping *mmio;
+
+	mmio = mmio_get(&mmio_tree, phys_addr, len);
+	if (!mmio) {
+		if (vcpu->kvm->cfg.mmio_debug)
+			fprintf(stderr,	"Warning: Ignoring MMIO %s at %016llx (length %u)\n",
+				to_direction(is_write),
+				(unsigned long long)phys_addr, len);
+		goto out;
+	}
+
+	mmio->mmio_fn(vcpu, phys_addr, data, len, is_write, mmio->ptr);
+	mmio_put(vcpu->kvm, &mmio_tree, mmio);
+
+out:
+	return true;
+}
diff --git a/kvmtool/net/uip/arp.c b/kvmtool/net/uip/arp.c
new file mode 100644
index 0000000..98423da
--- /dev/null
+++ b/kvmtool/net/uip/arp.c
@@ -0,0 +1,30 @@
+#include "kvm/uip.h"
+
+int uip_tx_do_arp(struct uip_tx_arg *arg)
+{
+	struct uip_arp *arp, *arp2;
+	struct uip_info *info;
+	struct uip_buf *buf;
+
+	info = arg->info;
+	buf = uip_buf_clone(arg);
+
+	arp	 = (struct uip_arp *)(arg->eth);
+	arp2	 = (struct uip_arp *)(buf->eth);
+
+	/*
+	 * ARP reply opcode is 2.
+	 */
+	arp2->op   = htons(0x2);
+	arp2->dmac = arp->smac;
+	arp2->dip  = arp->sip;
+
+	if (arp->dip == htonl(info->host_ip)) {
+		arp2->smac = info->host_mac;
+		arp2->sip = htonl(info->host_ip);
+
+		uip_buf_set_used(info, buf);
+	}
+
+	return 0;
+}
diff --git a/kvmtool/net/uip/buf.c b/kvmtool/net/uip/buf.c
new file mode 100644
index 0000000..f29ad41
--- /dev/null
+++ b/kvmtool/net/uip/buf.c
@@ -0,0 +1,114 @@
+#include "kvm/uip.h"
+
+#include <linux/kernel.h>
+#include <linux/list.h>
+
+struct uip_buf *uip_buf_get_used(struct uip_info *info)
+{
+	struct uip_buf *buf;
+	bool found = false;
+
+	mutex_lock(&info->buf_lock);
+
+	while (!(info->buf_used_nr > 0))
+		pthread_cond_wait(&info->buf_used_cond, &info->buf_lock.mutex);
+
+	list_for_each_entry(buf, &info->buf_head, list) {
+		if (buf->status == UIP_BUF_STATUS_USED) {
+			/*
+			 * Set status to INUSE immediately to prevent
+			 * someone from using this buf until we free it
+			 */
+			buf->status = UIP_BUF_STATUS_INUSE;
+			info->buf_used_nr--;
+			found = true;
+			break;
+		}
+	}
+
+	mutex_unlock(&info->buf_lock);
+
+	return found ? buf : NULL;
+}
+
+struct uip_buf *uip_buf_get_free(struct uip_info *info)
+{
+	struct uip_buf *buf;
+	bool found = false;
+
+	mutex_lock(&info->buf_lock);
+
+	while (!(info->buf_free_nr > 0))
+		pthread_cond_wait(&info->buf_free_cond, &info->buf_lock.mutex);
+
+	list_for_each_entry(buf, &info->buf_head, list) {
+		if (buf->status == UIP_BUF_STATUS_FREE) {
+			/*
+			 * Set status to INUSE immediately to prevent
+			 * someone from using this buf until we free it
+			 */
+			buf->status = UIP_BUF_STATUS_INUSE;
+			info->buf_free_nr--;
+			found = true;
+			break;
+		}
+	}
+
+	mutex_unlock(&info->buf_lock);
+
+	return found ? buf : NULL;
+}
+
+struct uip_buf *uip_buf_set_used(struct uip_info *info, struct uip_buf *buf)
+{
+	mutex_lock(&info->buf_lock);
+
+	buf->status = UIP_BUF_STATUS_USED;
+	info->buf_used_nr++;
+	pthread_cond_signal(&info->buf_used_cond);
+
+	mutex_unlock(&info->buf_lock);
+
+	return buf;
+}
+
+struct uip_buf *uip_buf_set_free(struct uip_info *info, struct uip_buf *buf)
+{
+	mutex_lock(&info->buf_lock);
+
+	buf->status = UIP_BUF_STATUS_FREE;
+	info->buf_free_nr++;
+	pthread_cond_signal(&info->buf_free_cond);
+
+	mutex_unlock(&info->buf_lock);
+
+	return buf;
+}
+
+struct uip_buf *uip_buf_clone(struct uip_tx_arg *arg)
+{
+	struct uip_buf *buf;
+	struct uip_eth *eth2;
+	struct uip_info *info;
+
+	info = arg->info;
+
+	/*
+	 * Get buffer from device to guest
+	 */
+	buf = uip_buf_get_free(info);
+
+	/*
+	 * Clone buffer
+	 */
+	memcpy(buf->vnet, arg->vnet, arg->vnet_len);
+	memcpy(buf->eth, arg->eth, arg->eth_len);
+	buf->vnet_len	= arg->vnet_len;
+	buf->eth_len	= arg->eth_len;
+
+	eth2		= (struct uip_eth *)buf->eth;
+	eth2->src	= info->host_mac;
+	eth2->dst	= arg->eth->src;
+
+	return buf;
+}
diff --git a/kvmtool/net/uip/core.c b/kvmtool/net/uip/core.c
new file mode 100644
index 0000000..977b9b0
--- /dev/null
+++ b/kvmtool/net/uip/core.c
@@ -0,0 +1,175 @@
+#include "kvm/mutex.h"
+#include "kvm/uip.h"
+
+#include <linux/virtio_net.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <kvm/iovec.h>
+
+int uip_tx(struct iovec *iov, u16 out, struct uip_info *info)
+{
+	void *vnet;
+	struct uip_tx_arg arg;
+	int eth_len, vnet_len;
+	struct uip_eth *eth;
+	u8 *buf = NULL;
+	u16 proto;
+	int i;
+
+	/*
+	 * Buffer from guest to device
+	 */
+	vnet_len = iov[0].iov_len;
+	vnet	 = iov[0].iov_base;
+
+	eth_len	 = iov[1].iov_len;
+	eth	 = iov[1].iov_base;
+
+	/*
+	 * If the Ethernet frame spans more than one iov entry, copy the
+	 * iov buffers into one linear buffer.
+	 */
+	if (out > 2) {
+		eth_len = 0;
+		for (i = 1; i < out; i++)
+			eth_len += iov[i].iov_len;
+
+		buf = malloc(eth_len);
+		if (!buf)
+			return -ENOMEM;
+
+		eth = (struct uip_eth *)buf;
+		for (i = 1; i < out; i++) {
+			memcpy(buf, iov[i].iov_base, iov[i].iov_len);
+			buf += iov[i].iov_len;
+		}
+	}
+
+	memset(&arg, 0, sizeof(arg));
+
+	arg.vnet_len = vnet_len;
+	arg.eth_len = eth_len;
+	arg.info = info;
+	arg.vnet = vnet;
+	arg.eth = eth;
+
+	/*
+	 * Check packet type
+	 */
+	proto = ntohs(eth->type);
+
+	switch (proto) {
+	case UIP_ETH_P_ARP:
+		uip_tx_do_arp(&arg);
+		break;
+	case UIP_ETH_P_IP:
+		uip_tx_do_ipv4(&arg);
+		break;
+	default:
+		break;
+	}
+
+	if (out > 2 && buf)
+		free(eth);
+
+	return vnet_len + eth_len;
+}
+
+int uip_rx(struct iovec *iov, u16 in, struct uip_info *info)
+{
+	struct uip_buf *buf;
+	int len;
+
+	/*
+	 * Sleep until there is a buffer for guest
+	 */
+	buf = uip_buf_get_used(info);
+
+	memcpy_toiovecend(iov, buf->vnet, 0, buf->vnet_len);
+	memcpy_toiovecend(iov, buf->eth, buf->vnet_len, buf->eth_len);
+
+	len = buf->vnet_len + buf->eth_len;
+
+	uip_buf_set_free(info, buf);
+	return len;
+}
+
+void uip_static_init(struct uip_info *info)
+{
+	struct list_head *udp_socket_head;
+	struct list_head *tcp_socket_head;
+	struct list_head *buf_head;
+
+	udp_socket_head	= &info->udp_socket_head;
+	tcp_socket_head	= &info->tcp_socket_head;
+	buf_head	= &info->buf_head;
+
+	INIT_LIST_HEAD(udp_socket_head);
+	INIT_LIST_HEAD(tcp_socket_head);
+	INIT_LIST_HEAD(buf_head);
+
+	mutex_init(&info->udp_socket_lock);
+	mutex_init(&info->tcp_socket_lock);
+	mutex_init(&info->buf_lock);
+
+	pthread_cond_init(&info->buf_used_cond, NULL);
+	pthread_cond_init(&info->buf_free_cond, NULL);
+
+	info->buf_used_nr = 0;
+}
+
+int uip_init(struct uip_info *info)
+{
+	struct list_head *buf_head;
+	struct uip_buf *buf;
+	int buf_nr;
+	int i;
+
+	buf_head	= &info->buf_head;
+	buf_nr		= info->buf_nr;
+
+	for (i = 0; i < buf_nr; i++) {
+		buf = malloc(sizeof(*buf));
+		if (!buf)
+			return -ENOMEM;
+		memset(buf, 0, sizeof(*buf));
+
+		buf->status	= UIP_BUF_STATUS_FREE;
+		buf->info	= info;
+		buf->id		= i;
+		list_add_tail(&buf->list, buf_head);
+	}
+
+	list_for_each_entry(buf, buf_head, list) {
+		buf->vnet_len   = info->vnet_hdr_len;
+		buf->vnet	= malloc(buf->vnet_len);
+		buf->eth_len    = 1024*64 + sizeof(struct uip_pseudo_hdr);
+		buf->eth	= malloc(buf->eth_len);
+
+		memset(buf->vnet, 0, buf->vnet_len);
+		memset(buf->eth, 0, buf->eth_len);
+	}
+
+	info->buf_free_nr = buf_nr;
+
+	uip_dhcp_get_dns(info);
+
+	return 0;
+}
+
+void uip_exit(struct uip_info *info)
+{
+	struct uip_buf *buf, *next;
+
+	uip_udp_exit(info);
+	uip_tcp_exit(info);
+	uip_dhcp_exit(info);
+
+	list_for_each_entry_safe(buf, next, &info->buf_head, list) {
+		free(buf->vnet);
+		free(buf->eth);
+		list_del(&buf->list);
+		free(buf);
+	}
+	uip_static_init(info);
+}
diff --git a/kvmtool/net/uip/csum.c b/kvmtool/net/uip/csum.c
new file mode 100644
index 0000000..6f5c4b3
--- /dev/null
+++ b/kvmtool/net/uip/csum.c
@@ -0,0 +1,94 @@
+#include "kvm/uip.h"
+
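+/*
+ * Standard Internet checksum (RFC 1071): add up 16-bit words, fold the
+ * carries back into the low 16 bits and return the one's complement.
+ */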
+static u16 uip_csum(u16 csum, u8 *addr, u16 count)
+{
+	long sum = csum;
+
+	while (count > 1) {
+		sum	+= *(u16 *)addr;
+		addr	+= 2;
+		count	-= 2;
+	}
+
+	if (count > 0)
+		sum += *(unsigned char *)addr;
+
+	while (sum>>16)
+		sum = (sum & 0xffff) + (sum >> 16);
+
+	return ~sum;
+}
+
+u16 uip_csum_ip(struct uip_ip *ip)
+{
+	return uip_csum(0, &ip->vhl, uip_ip_hdrlen(ip));
+}
+
+u16 uip_csum_icmp(struct uip_icmp *icmp)
+{
+	struct uip_ip *ip;
+
+	ip = &icmp->ip;
+	return icmp->csum = uip_csum(0, &icmp->type, htons(ip->len) - uip_ip_hdrlen(ip) - 8); /* icmp header len = 8 */
+}
+
+u16 uip_csum_udp(struct uip_udp *udp)
+{
+	struct uip_pseudo_hdr hdr;
+	struct uip_ip *ip;
+	int udp_len;
+	u8 *udp_hdr = (u8 *)udp + offsetof(struct uip_udp, sport);
+
+	ip	  = &udp->ip;
+
+	hdr.sip   = ip->sip;
+	hdr.dip	  = ip->dip;
+	hdr.zero  = 0;
+	hdr.proto = ip->proto;
+	hdr.len   = udp->len;
+
+	udp_len	  = uip_udp_len(udp);
+
+	if (udp_len % 2) {
+		udp_hdr[udp_len] = 0;		/* zero padding */
+		memcpy(udp_hdr + udp_len + 1, &hdr, sizeof(hdr));
+		return uip_csum(0, udp_hdr, udp_len + 1 + sizeof(hdr));
+	} else {
+		memcpy(udp_hdr + udp_len, &hdr, sizeof(hdr));
+		return uip_csum(0, udp_hdr, udp_len + sizeof(hdr));
+	}
+
+}
+
+u16 uip_csum_tcp(struct uip_tcp *tcp)
+{
+	struct uip_pseudo_hdr hdr;
+	struct uip_ip *ip;
+	u16 tcp_len;
+	u8 *tcp_hdr = (u8 *)tcp + offsetof(struct uip_tcp, sport);
+
+	ip	  = &tcp->ip;
+	tcp_len   = ntohs(ip->len) - uip_ip_hdrlen(ip);
+
+	hdr.sip   = ip->sip;
+	hdr.dip	  = ip->dip;
+	hdr.zero  = 0;
+	hdr.proto = ip->proto;
+	hdr.len   = htons(tcp_len);
+
+	if (tcp_len > UIP_MAX_TCP_PAYLOAD + 20)
+		pr_warning("tcp_len(%d) is too large", tcp_len);
+
+	if (tcp_len % 2) {
+		tcp_hdr[tcp_len] = 0;		/* zero padding */
+		memcpy(tcp_hdr + tcp_len + 1, &hdr, sizeof(hdr));
+		return uip_csum(0, tcp_hdr, tcp_len + 1 + sizeof(hdr));
+	} else {
+		memcpy(tcp_hdr + tcp_len, &hdr, sizeof(hdr));
+		return uip_csum(0, tcp_hdr, tcp_len + sizeof(hdr));
+	}
+}
diff --git a/kvmtool/net/uip/dhcp.c b/kvmtool/net/uip/dhcp.c
new file mode 100644
index 0000000..9de5588
--- /dev/null
+++ b/kvmtool/net/uip/dhcp.c
@@ -0,0 +1,209 @@
+#include "kvm/uip.h"
+
+#include <arpa/inet.h>
+
+#define EMPTY_ADDR "0.0.0.0"
+
+static inline bool uip_dhcp_is_discovery(struct uip_dhcp *dhcp)
+{
+	return (dhcp->option[2] == UIP_DHCP_DISCOVER &&
+		dhcp->option[1] == UIP_DHCP_TAG_MSG_TYPE_LEN &&
+		dhcp->option[0] == UIP_DHCP_TAG_MSG_TYPE);
+}
+
+static inline bool uip_dhcp_is_request(struct uip_dhcp *dhcp)
+{
+	return (dhcp->option[2] == UIP_DHCP_REQUEST &&
+		dhcp->option[1] == UIP_DHCP_TAG_MSG_TYPE_LEN &&
+		dhcp->option[0] == UIP_DHCP_TAG_MSG_TYPE);
+}
+
+bool uip_udp_is_dhcp(struct uip_udp *udp)
+{
+	struct uip_dhcp *dhcp;
+
+	if (ntohs(udp->sport) != UIP_DHCP_PORT_CLIENT ||
+	    ntohs(udp->dport) != UIP_DHCP_PORT_SERVER)
+		return false;
+
+	dhcp = (struct uip_dhcp *)udp;
+
+	if (ntohl(dhcp->magic_cookie) != UIP_DHCP_MAGIC_COOKIE)
+		return false;
+
+	return true;
+}
+
+int uip_dhcp_get_dns(struct uip_info *info)
+{
+	char key[256], val[256];
+	struct in_addr addr;
+	int ret = -1;
+	int n = 0;
+	FILE *fp;
+	u32 ip;
+
+	fp = fopen("/etc/resolv.conf", "r");
+	if (!fp)
+		return ret;
+
+	while (!feof(fp)) {
+		if (fscanf(fp, "%s %s\n", key, val) != 2)
+			continue;
+		if (strncmp("domain", key, 6) == 0)
+			info->domain_name = strndup(val, UIP_DHCP_MAX_DOMAIN_NAME_LEN);
+		else if (strncmp("nameserver", key, 10) == 0) {
+			if (!inet_aton(val, &addr))
+				continue;
+			ip = ntohl(addr.s_addr);
+			if (n < UIP_DHCP_MAX_DNS_SERVER_NR)
+				info->dns_ip[n++] = ip;
+			ret = 0;
+		}
+	}
+
+	fclose(fp);
+	return ret;
+}
+
+static int uip_dhcp_fill_option_name_and_server(struct uip_info *info, u8 *opt, int i)
+{
+	u8 domain_name_len;
+	u32 *addr;
+	int n;
+
+	if (info->domain_name) {
+		domain_name_len	= strlen(info->domain_name);
+		opt[i++]	= UIP_DHCP_TAG_DOMAIN_NAME;
+		opt[i++]	= domain_name_len;
+		memcpy(&opt[i], info->domain_name, domain_name_len);
+		i		+= domain_name_len;
+	}
+
+	for (n = 0; n < UIP_DHCP_MAX_DNS_SERVER_NR; n++) {
+		if (info->dns_ip[n] == 0)
+			continue;
+		opt[i++]	= UIP_DHCP_TAG_DNS_SERVER;
+		opt[i++]	= UIP_DHCP_TAG_DNS_SERVER_LEN;
+		addr		= (u32 *)&opt[i];
+		*addr		= htonl(info->dns_ip[n]);
+		i		+= UIP_DHCP_TAG_DNS_SERVER_LEN;
+	}
+
+	return i;
+}
+
+static int uip_dhcp_fill_option(struct uip_info *info, struct uip_dhcp *dhcp, int reply_msg_type)
+{
+	int i = 0;
+	u32 *addr;
+	u8 *opt;
+
+	opt		= dhcp->option;
+
+	opt[i++]	= UIP_DHCP_TAG_MSG_TYPE;
+	opt[i++]	= UIP_DHCP_TAG_MSG_TYPE_LEN;
+	opt[i++]	= reply_msg_type;
+
+	opt[i++]	= UIP_DHCP_TAG_SERVER_ID;
+	opt[i++]	= UIP_DHCP_TAG_SERVER_ID_LEN;
+	addr		= (u32 *)&opt[i];
+	*addr		= htonl(info->host_ip);
+	i		+= UIP_DHCP_TAG_SERVER_ID_LEN;
+
+	opt[i++]	= UIP_DHCP_TAG_LEASE_TIME;
+	opt[i++]	= UIP_DHCP_TAG_LEASE_TIME_LEN;
+	addr		= (u32 *)&opt[i];
+	*addr		= htonl(UIP_DHCP_LEASE_TIME);
+	i		+= UIP_DHCP_TAG_LEASE_TIME_LEN;
+
+	opt[i++]	= UIP_DHCP_TAG_SUBMASK;
+	opt[i++]	= UIP_DHCP_TAG_SUBMASK_LEN;
+	addr		= (u32 *)&opt[i];
+	*addr		= htonl(info->guest_netmask);
+	i		+= UIP_DHCP_TAG_SUBMASK_LEN;
+
+	opt[i++]	= UIP_DHCP_TAG_ROUTER;
+	opt[i++]	= UIP_DHCP_TAG_ROUTER_LEN;
+	addr		= (u32 *)&opt[i];
+	*addr		= htonl(info->host_ip);
+	i		+= UIP_DHCP_TAG_ROUTER_LEN;
+
+	opt[i++]	= UIP_DHCP_TAG_ROOT;
+	opt[i++]	= strlen(EMPTY_ADDR);
+	addr		= (u32 *)&opt[i];
+	strcpy((void *) addr, EMPTY_ADDR);
+	i		+= strlen(EMPTY_ADDR);
+
+	i 		= uip_dhcp_fill_option_name_and_server(info, opt, i);
+
+	opt[i++]	= UIP_DHCP_TAG_END;
+
+	return 0;
+}
+
+static int uip_dhcp_make_pkg(struct uip_info *info, struct uip_udp_socket *sk, struct uip_buf *buf, u8 reply_msg_type)
+{
+	struct uip_dhcp *dhcp;
+
+	dhcp		= (struct uip_dhcp *)buf->eth;
+
+	dhcp->msg_type	= 2;
+	dhcp->client_ip	= 0;
+	dhcp->your_ip	= htonl(info->guest_ip);
+	dhcp->server_ip	= htonl(info->host_ip);
+	dhcp->agent_ip	= 0;
+
+	uip_dhcp_fill_option(info, dhcp, reply_msg_type);
+
+	sk->sip		= htonl(info->guest_ip);
+	sk->dip		= htonl(info->host_ip);
+	sk->sport	= htons(UIP_DHCP_PORT_CLIENT);
+	sk->dport	= htons(UIP_DHCP_PORT_SERVER);
+
+	return 0;
+}
+
+int uip_tx_do_ipv4_udp_dhcp(struct uip_tx_arg *arg)
+{
+	struct uip_udp_socket sk;
+	struct uip_dhcp *dhcp;
+	struct uip_info *info;
+	struct uip_buf *buf;
+	u8 reply_msg_type;
+
+	dhcp = (struct uip_dhcp *)arg->eth;
+
+	if (uip_dhcp_is_discovery(dhcp))
+		reply_msg_type = UIP_DHCP_OFFER;
+	else if (uip_dhcp_is_request(dhcp))
+		reply_msg_type = UIP_DHCP_ACK;
+	else
+		return -1;
+
+	buf = uip_buf_clone(arg);
+	info = arg->info;
+
+	/*
+	 * Cook the DHCP packet
+	 */
+	uip_dhcp_make_pkg(info, &sk, buf, reply_msg_type);
+
+	/*
+	 * Cook the UDP packet
+	 */
+	uip_udp_make_pkg(info, &sk, buf, NULL, UIP_DHCP_MAX_PAYLOAD_LEN);
+
+	/*
+	 * Send the cooked DHCP reply to the guest
+	 */
+	uip_buf_set_used(info, buf);
+
+	return 0;
+}
+
+void uip_dhcp_exit(struct uip_info *info)
+{
+	free(info->domain_name);
+	info->domain_name = NULL;
+}
diff --git a/kvmtool/net/uip/icmp.c b/kvmtool/net/uip/icmp.c
new file mode 100644
index 0000000..233297c
--- /dev/null
+++ b/kvmtool/net/uip/icmp.c
@@ -0,0 +1,29 @@
+#include "kvm/uip.h"
+
+int uip_tx_do_ipv4_icmp(struct uip_tx_arg *arg)
+{
+	struct uip_ip *ip, *ip2;
+	struct uip_icmp *icmp2;
+	struct uip_buf *buf;
+
+	buf		= uip_buf_clone(arg);
+
+	icmp2		= (struct uip_icmp *)(buf->eth);
+	ip2		= (struct uip_ip *)(buf->eth);
+	ip		= (struct uip_ip *)(arg->eth);
+
+	ip2->sip	= ip->dip;
+	ip2->dip	= ip->sip;
+	ip2->csum	= 0;
+	/*
+	 * ICMP echo reply type is 0.
+	 */
+	icmp2->type	= 0;
+	icmp2->csum	= 0;
+	ip2->csum	= uip_csum_ip(ip2);
+	icmp2->csum	= uip_csum_icmp(icmp2);
+
+	uip_buf_set_used(arg->info, buf);
+
+	return 0;
+}
diff --git a/kvmtool/net/uip/ipv4.c b/kvmtool/net/uip/ipv4.c
new file mode 100644
index 0000000..58373fd
--- /dev/null
+++ b/kvmtool/net/uip/ipv4.c
@@ -0,0 +1,29 @@
+#include "kvm/uip.h"
+
+int uip_tx_do_ipv4(struct uip_tx_arg *arg)
+{
+	struct uip_ip *ip;
+
+	ip = (struct uip_ip *)(arg->eth);
+
+	if (uip_ip_hdrlen(ip) != 20) {
+		pr_warning("IP header length is not 20 bytes");
+		return -1;
+	}
+
+	switch (ip->proto) {
+	case UIP_IP_P_ICMP:
+		uip_tx_do_ipv4_icmp(arg);
+		break;
+	case UIP_IP_P_TCP:
+		uip_tx_do_ipv4_tcp(arg);
+		break;
+	case UIP_IP_P_UDP:
+		uip_tx_do_ipv4_udp(arg);
+		break;
+	default:
+		break;
+	}
+
+	return 0;
+}
diff --git a/kvmtool/net/uip/tcp.c b/kvmtool/net/uip/tcp.c
new file mode 100644
index 0000000..8e0ad52
--- /dev/null
+++ b/kvmtool/net/uip/tcp.c
@@ -0,0 +1,389 @@
+#include "kvm/uip.h"
+
+#include <kvm/kvm.h>
+#include <linux/virtio_net.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <arpa/inet.h>
+
+static int uip_tcp_socket_close(struct uip_tcp_socket *sk, int how)
+{
+	shutdown(sk->fd, how);
+
+	if (sk->write_done && sk->read_done) {
+		shutdown(sk->fd, SHUT_RDWR);
+		close(sk->fd);
+
+		mutex_lock(sk->lock);
+		list_del(&sk->list);
+		mutex_unlock(sk->lock);
+
+		free(sk->buf);
+		free(sk);
+	}
+
+	return 0;
+}
+
+static struct uip_tcp_socket *uip_tcp_socket_find(struct uip_tx_arg *arg, u32 sip, u32 dip, u16 sport, u16 dport)
+{
+	struct list_head *sk_head;
+	struct mutex *sk_lock;
+	struct uip_tcp_socket *sk;
+
+	sk_head = &arg->info->tcp_socket_head;
+	sk_lock = &arg->info->tcp_socket_lock;
+
+	mutex_lock(sk_lock);
+	list_for_each_entry(sk, sk_head, list) {
+		if (sk->sip == sip && sk->dip == dip && sk->sport == sport && sk->dport == dport) {
+			mutex_unlock(sk_lock);
+			return sk;
+		}
+	}
+	mutex_unlock(sk_lock);
+
+	return NULL;
+}
+
+static struct uip_tcp_socket *uip_tcp_socket_alloc(struct uip_tx_arg *arg, u32 sip, u32 dip, u16 sport, u16 dport)
+{
+	struct list_head *sk_head;
+	struct uip_tcp_socket *sk;
+	struct mutex *sk_lock;
+	struct uip_tcp *tcp;
+	struct uip_ip *ip;
+	int ret;
+
+	tcp = (struct uip_tcp *)arg->eth;
+	ip = (struct uip_ip *)arg->eth;
+
+	sk_head = &arg->info->tcp_socket_head;
+	sk_lock = &arg->info->tcp_socket_lock;
+
+	sk = malloc(sizeof(*sk));
+	if (!sk)
+		return NULL;
+	memset(sk, 0, sizeof(*sk));
+
+	sk->lock			= sk_lock;
+	sk->info			= arg->info;
+
+	sk->fd				= socket(AF_INET, SOCK_STREAM, 0);
+	sk->addr.sin_family		= AF_INET;
+	sk->addr.sin_port		= dport;
+	sk->addr.sin_addr.s_addr	= dip;
+
+	pthread_cond_init(&sk->cond, NULL);
+
+	if (ntohl(dip) == arg->info->host_ip)
+		sk->addr.sin_addr.s_addr = inet_addr("127.0.0.1");
+
+	ret = connect(sk->fd, (struct sockaddr *)&sk->addr, sizeof(sk->addr));
+	if (ret) {
+		free(sk);
+		return NULL;
+	}
+
+	sk->sip		= ip->sip;
+	sk->dip		= ip->dip;
+	sk->sport	= tcp->sport;
+	sk->dport	= tcp->dport;
+
+	mutex_lock(sk_lock);
+	list_add_tail(&sk->list, sk_head);
+	mutex_unlock(sk_lock);
+
+	return sk;
+}
+
+/* Caller holds the sk lock */
+static void uip_tcp_socket_free(struct uip_tcp_socket *sk)
+{
+	/*
+	 * Here we assume that the virtqueues are already inactive so we don't
+	 * race with uip_tx_do_ipv4_tcp. We are racing with
+	 * uip_tcp_socket_thread though, but holding the sk lock ensures that it
+	 * cannot free data concurrently.
+	 */
+	if (sk->thread) {
+		pthread_cancel(sk->thread);
+		pthread_join(sk->thread, NULL);
+	}
+
+	sk->write_done = sk->read_done = 1;
+	uip_tcp_socket_close(sk, SHUT_RDWR);
+}
+
+static int uip_tcp_payload_send(struct uip_tcp_socket *sk, u8 flag, u16 payload_len)
+{
+	struct uip_info *info;
+	struct uip_eth *eth2;
+	struct uip_tcp *tcp2;
+	struct uip_buf *buf;
+	struct uip_ip *ip2;
+
+	info		= sk->info;
+
+	/*
+	 * Get free buffer to send data to guest
+	 */
+	buf		= uip_buf_get_free(info);
+
+	/*
+	 * Cook an Ethernet frame
+	 */
+	tcp2		= (struct uip_tcp *)buf->eth;
+	eth2		= (struct uip_eth *)buf->eth;
+	ip2		= (struct uip_ip *)buf->eth;
+
+	eth2->src	= info->host_mac;
+	eth2->dst	= info->guest_mac;
+	eth2->type	= htons(UIP_ETH_P_IP);
+
+	ip2->vhl	= UIP_IP_VER_4 | UIP_IP_HDR_LEN;
+	ip2->tos	= 0;
+	ip2->id		= 0;
+	ip2->flgfrag	= 0;
+	ip2->ttl	= UIP_IP_TTL;
+	ip2->proto	= UIP_IP_P_TCP;
+	ip2->csum	= 0;
+	ip2->sip	= sk->dip;
+	ip2->dip	= sk->sip;
+
+	tcp2->sport	= sk->dport;
+	tcp2->dport	= sk->sport;
+	tcp2->seq	= htonl(sk->seq_server);
+	tcp2->ack	= htonl(sk->ack_server);
+	/*
+	 * Disable TCP options; the TCP header length is then 20 bytes.
+	 */
+	tcp2->off	= UIP_TCP_HDR_LEN;
+	tcp2->flg	= flag;
+	tcp2->win	= htons(UIP_TCP_WIN_SIZE);
+	tcp2->csum	= 0;
+	tcp2->urgent	= 0;
+
+	if (payload_len > 0)
+		memcpy(uip_tcp_payload(tcp2), sk->payload, payload_len);
+
+	ip2->len	= htons(uip_tcp_hdrlen(tcp2) + payload_len + uip_ip_hdrlen(ip2));
+	ip2->csum	= uip_csum_ip(ip2);
+	tcp2->csum	= uip_csum_tcp(tcp2);
+
+	/*
+	 * virtio_net_hdr
+	 */
+	buf->vnet_len	= info->vnet_hdr_len;
+	memset(buf->vnet, 0, buf->vnet_len);
+
+	buf->eth_len	= ntohs(ip2->len) + uip_eth_hdrlen(&ip2->eth);
+
+	/*
+	 * Increase server seq
+	 */
+	sk->seq_server  += payload_len;
+
+	/*
+	 * Send data received from socket to guest
+	 */
+	uip_buf_set_used(info, buf);
+
+	return 0;
+}
+
+static void *uip_tcp_socket_thread(void *p)
+{
+	struct uip_tcp_socket *sk;
+	int len, left, ret;
+	u8 *pos;
+
+	kvm__set_thread_name("uip-tcp");
+
+	sk = p;
+
+	while (1) {
+		pos = sk->buf;
+
+		ret = read(sk->fd, sk->buf, UIP_MAX_TCP_PAYLOAD);
+
+		if (ret <= 0 || ret > UIP_MAX_TCP_PAYLOAD)
+			goto out;
+
+		left = ret;
+
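+		/*
+		 * Segment the data: block until the guest's advertised
+		 * receive window has room, then send at most
+		 * UIP_MAX_TCP_PAYLOAD bytes per iteration.
+		 */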
+		while (left > 0) {
+			mutex_lock(sk->lock);
+			while ((len = sk->guest_acked + sk->window_size - sk->seq_server) <= 0)
+				pthread_cond_wait(&sk->cond, &sk->lock->mutex);
+			mutex_unlock(sk->lock);
+
+			sk->payload = pos;
+			if (len > left)
+				len = left;
+			if (len > UIP_MAX_TCP_PAYLOAD)
+				len = UIP_MAX_TCP_PAYLOAD;
+			left -= len;
+			pos += len;
+
+			uip_tcp_payload_send(sk, UIP_TCP_FLAG_ACK, len);
+		}
+	}
+
+out:
+	/*
+	 * Close server to guest TCP connection
+	 */
+	uip_tcp_socket_close(sk, SHUT_RD);
+
+	uip_tcp_payload_send(sk, UIP_TCP_FLAG_FIN | UIP_TCP_FLAG_ACK, 0);
+	sk->seq_server += 1;
+
+	sk->read_done = 1;
+
+	pthread_exit(NULL);
+
+	return NULL;
+}
+
+static int uip_tcp_socket_receive(struct uip_tcp_socket *sk)
+{
+	int ret;
+
+	if (sk->thread == 0) {
+		sk->buf = malloc(UIP_MAX_TCP_PAYLOAD);
+		if (!sk->buf)
+			return -ENOMEM;
+		ret = pthread_create(&sk->thread, NULL, uip_tcp_socket_thread,
+				     (void *)sk);
+		if (ret)
+			free(sk->buf);
+		return ret;
+	}
+
+	return 0;
+}
+
+static int uip_tcp_socket_send(struct uip_tcp_socket *sk, struct uip_tcp *tcp)
+{
+	int len;
+	int ret;
+	u8 *payload;
+
+	if (sk->write_done)
+		return 0;
+
+	payload = uip_tcp_payload(tcp);
+	len = uip_tcp_payloadlen(tcp);
+
+	ret = write(sk->fd, payload, len);
+	if (ret != len)
+		pr_warning("tcp send error");
+
+	return ret;
+}
+
+int uip_tx_do_ipv4_tcp(struct uip_tx_arg *arg)
+{
+	struct uip_tcp_socket *sk;
+	struct uip_tcp *tcp;
+	struct uip_ip *ip;
+	int ret;
+
+	tcp = (struct uip_tcp *)arg->eth;
+	ip = (struct uip_ip *)arg->eth;
+
+	/*
+	 * Guest is trying to start a TCP session, let's fake SYN-ACK to guest
+	 */
+	if (uip_tcp_is_syn(tcp)) {
+		sk = uip_tcp_socket_alloc(arg, ip->sip, ip->dip, tcp->sport, tcp->dport);
+		if (!sk)
+			return -1;
+
+		sk->window_size = ntohs(tcp->win);
+
+		/*
+		 * Setup ISN number
+		 */
+		sk->isn_guest  = uip_tcp_isn(tcp);
+		sk->isn_server = uip_tcp_isn_alloc();
+
+		sk->seq_server = sk->isn_server;
+		sk->ack_server = sk->isn_guest + 1;
+		uip_tcp_payload_send(sk, UIP_TCP_FLAG_SYN | UIP_TCP_FLAG_ACK, 0);
+		sk->seq_server += 1;
+
+		/*
+		 * Start receive thread for data from remote to guest
+		 */
+		uip_tcp_socket_receive(sk);
+
+		goto out;
+	}
+
+	/*
+	 * Find socket we have allocated
+	 */
+	sk = uip_tcp_socket_find(arg, ip->sip, ip->dip, tcp->sport, tcp->dport);
+	if (!sk)
+		return -1;
+
+	mutex_lock(sk->lock);
+	sk->window_size = ntohs(tcp->win);
+	sk->guest_acked = ntohl(tcp->ack);
+	pthread_cond_signal(&sk->cond);
+	mutex_unlock(sk->lock);
+
+	if (uip_tcp_is_fin(tcp)) {
+		if (sk->write_done)
+			goto out;
+
+		sk->write_done = 1;
+		sk->ack_server += 1;
+		uip_tcp_payload_send(sk, UIP_TCP_FLAG_ACK, 0);
+
+		/*
+		 * Close guest to server TCP connection
+		 */
+		uip_tcp_socket_close(sk, SHUT_WR);
+
+		goto out;
+	}
+
+	/*
+	 * Ignore guest to server frames with zero tcp payload
+	 */
+	if (uip_tcp_payloadlen(tcp) == 0)
+		goto out;
+
+	/*
+	 * Send out TCP data to the remote host
+	 */
+	ret = uip_tcp_socket_send(sk, tcp);
+	if (ret < 0)
+		return -1;
+	/*
+	 * Send ACK to the guest immediately
+	 */
+	sk->ack_server += ret;
+	uip_tcp_payload_send(sk, UIP_TCP_FLAG_ACK, 0);
+
+out:
+	return 0;
+}
+
+void uip_tcp_exit(struct uip_info *info)
+{
+	struct uip_tcp_socket *sk, *next;
+
+	mutex_lock(&info->tcp_socket_lock);
+	list_for_each_entry_safe(sk, next, &info->tcp_socket_head, list)
+		uip_tcp_socket_free(sk);
+	mutex_unlock(&info->tcp_socket_lock);
+}
diff --git a/kvmtool/net/uip/udp.c b/kvmtool/net/uip/udp.c
new file mode 100644
index 0000000..d2580d0
--- /dev/null
+++ b/kvmtool/net/uip/udp.c
@@ -0,0 +1,274 @@
+#include "kvm/uip.h"
+
+#include <kvm/kvm.h>
+#include <linux/virtio_net.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <sys/socket.h>
+#include <sys/epoll.h>
+#include <fcntl.h>
+
+#define UIP_UDP_MAX_EVENTS 1000
+
+static struct uip_udp_socket *uip_udp_socket_find(struct uip_tx_arg *arg, u32 sip, u32 dip, u16 sport, u16 dport)
+{
+	struct list_head *sk_head;
+	struct uip_udp_socket *sk;
+	struct mutex *sk_lock;
+	struct epoll_event ev;
+	int flags;
+	int ret;
+
+	sk_head = &arg->info->udp_socket_head;
+	sk_lock = &arg->info->udp_socket_lock;
+
+	/*
+	 * Find existing sk
+	 */
+	mutex_lock(sk_lock);
+	list_for_each_entry(sk, sk_head, list) {
+		if (sk->sip == sip && sk->dip == dip && sk->sport == sport && sk->dport == dport) {
+			mutex_unlock(sk_lock);
+			return sk;
+		}
+	}
+	mutex_unlock(sk_lock);
+
+	/*
+	 * Allocate new one
+	 */
+	sk = malloc(sizeof(*sk));
+	if (!sk)
+		return NULL;
+	memset(sk, 0, sizeof(*sk));
+
+	sk->lock = sk_lock;
+
+	sk->fd = socket(AF_INET, SOCK_DGRAM, 0);
+	if (sk->fd < 0)
+		goto out;
+
+	/*
+	 * Set non-blocking
+	 */
+	flags = fcntl(sk->fd, F_GETFL, 0);
+	flags |= O_NONBLOCK;
+	fcntl(sk->fd, F_SETFL, flags);
+
+	/*
+	 * Add sk->fd to epoll_wait
+	 */
+	ev.events	= EPOLLIN;
+	ev.data.fd	= sk->fd;
+	ev.data.ptr	= sk;
+	if (arg->info->udp_epollfd <= 0)
+		arg->info->udp_epollfd = epoll_create(UIP_UDP_MAX_EVENTS);
+	ret = epoll_ctl(arg->info->udp_epollfd, EPOLL_CTL_ADD, sk->fd, &ev);
+	if (ret == -1)
+		pr_warning("epoll_ctl error");
+
+	sk->addr.sin_family	 = AF_INET;
+	sk->addr.sin_addr.s_addr = dip;
+	sk->addr.sin_port	 = dport;
+
+	sk->sip			 = sip;
+	sk->dip			 = dip;
+	sk->sport		 = sport;
+	sk->dport		 = dport;
+
+	mutex_lock(sk_lock);
+	list_add_tail(&sk->list, sk_head);
+	mutex_unlock(sk_lock);
+
+	return sk;
+
+out:
+	free(sk);
+	return NULL;
+}
+
+static int uip_udp_socket_send(struct uip_udp_socket *sk, struct uip_udp *udp)
+{
+	int len;
+	int ret;
+
+	len = ntohs(udp->len) - uip_udp_hdrlen(udp);
+
+	ret = sendto(sk->fd, udp->payload, len, 0, (struct sockaddr *)&sk->addr, sizeof(sk->addr));
+	if (ret != len)
+		return -1;
+
+	return 0;
+}
+
+int uip_udp_make_pkg(struct uip_info *info, struct uip_udp_socket *sk, struct uip_buf *buf, u8* payload, int payload_len)
+{
+	struct uip_eth *eth2;
+	struct uip_udp *udp2;
+	struct uip_ip *ip2;
+
+	/*
+	 * Cook an Ethernet frame
+	 */
+	udp2		= (struct uip_udp *)(buf->eth);
+	eth2		= (struct uip_eth *)buf->eth;
+	ip2		= (struct uip_ip *)(buf->eth);
+
+	eth2->src	= info->host_mac;
+	eth2->dst	= info->guest_mac;
+	eth2->type	= htons(UIP_ETH_P_IP);
+
+	ip2->vhl	= UIP_IP_VER_4 | UIP_IP_HDR_LEN;
+	ip2->tos	= 0;
+	ip2->id		= 0;
+	ip2->flgfrag	= 0;
+	ip2->ttl	= UIP_IP_TTL;
+	ip2->proto	= UIP_IP_P_UDP;
+	ip2->csum	= 0;
+
+	ip2->sip	= sk->dip;
+	ip2->dip	= sk->sip;
+	udp2->sport	= sk->dport;
+	udp2->dport	= sk->sport;
+
+	udp2->len	= htons(payload_len + uip_udp_hdrlen(udp2));
+	udp2->csum	= 0;
+
+	if (payload)
+		memcpy(udp2->payload, payload, payload_len);
+
+	ip2->len	= htons(ntohs(udp2->len) + uip_ip_hdrlen(ip2));
+	ip2->csum	= uip_csum_ip(ip2);
+	udp2->csum	= uip_csum_udp(udp2);
+
+	/*
+	 * virtio_net_hdr
+	 */
+	buf->vnet_len	= info->vnet_hdr_len;
+	memset(buf->vnet, 0, buf->vnet_len);
+
+	buf->eth_len	= ntohs(ip2->len) + uip_eth_hdrlen(&ip2->eth);
+
+	return 0;
+}
+
+static void *uip_udp_socket_thread(void *p)
+{
+	struct epoll_event events[UIP_UDP_MAX_EVENTS];
+	struct uip_udp_socket *sk;
+	struct uip_info *info;
+	struct uip_buf *buf;
+	int payload_len;
+	u8 *payload;
+	int nfds;
+	int i;
+
+	kvm__set_thread_name("uip-udp");
+
+	info = p;
+	payload = info->udp_buf;
+
+	while (1) {
+		nfds = epoll_wait(info->udp_epollfd, events, UIP_UDP_MAX_EVENTS, -1);
+
+		if (nfds == -1)
+			continue;
+
+		for (i = 0; i < nfds; i++) {
+
+			sk = events[i].data.ptr;
+			payload_len = recvfrom(sk->fd, payload, UIP_MAX_UDP_PAYLOAD, 0, NULL, NULL);
+			if (payload_len < 0)
+				continue;
+
+			/*
+			 * Get free buffer to send data to guest
+			 */
+			buf = uip_buf_get_free(info);
+
+			uip_udp_make_pkg(info, sk, buf, payload, payload_len);
+
+			/*
+			 * Send data received from socket to guest
+			 */
+			uip_buf_set_used(info, buf);
+		}
+	}
+
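+	/* Not reached: the thread is cancelled from uip_udp_exit() */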
+	mutex_lock(&info->udp_socket_lock);
+	free(info->udp_buf);
+	info->udp_buf = NULL;
+	mutex_unlock(&info->udp_socket_lock);
+
+	pthread_exit(NULL);
+	return NULL;
+}
+
+int uip_tx_do_ipv4_udp(struct uip_tx_arg *arg)
+{
+	struct uip_udp_socket *sk;
+	struct uip_info *info;
+	struct uip_udp *udp;
+	struct uip_ip *ip;
+	int ret;
+
+	udp	= (struct uip_udp *)(arg->eth);
+	ip	= (struct uip_ip *)(arg->eth);
+	info	= arg->info;
+
+	if (uip_udp_is_dhcp(udp)) {
+		uip_tx_do_ipv4_udp_dhcp(arg);
+		return 0;
+	}
+
+	/*
+	 * Find socket we have allocated before, otherwise allocate one
+	 */
+	sk = uip_udp_socket_find(arg, ip->sip, ip->dip, udp->sport, udp->dport);
+	if (!sk)
+		return -1;
+
+	/*
+	 * Send out UDP data to remote host
+	 */
+	ret = uip_udp_socket_send(sk, udp);
+	if (ret)
+		return -1;
+
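+	/*
+	 * Start the receive thread lazily, once the first UDP frame has
+	 * been forwarded; it pushes replies from every UDP socket back
+	 * to the guest.
+	 */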
+	if (!info->udp_thread) {
+		info->udp_buf = malloc(UIP_MAX_UDP_PAYLOAD);
+		if (!info->udp_buf)
+			return -1;
+
+		pthread_create(&info->udp_thread, NULL, uip_udp_socket_thread, (void *)info);
+	}
+
+	return 0;
+}
+
+void uip_udp_exit(struct uip_info *info)
+{
+	struct uip_udp_socket *sk, *next;
+
+	mutex_lock(&info->udp_socket_lock);
+	if (info->udp_thread) {
+		pthread_cancel(info->udp_thread);
+		pthread_join(info->udp_thread, NULL);
+		info->udp_thread = 0;
+		free(info->udp_buf);
+	}
+	if (info->udp_epollfd > 0) {
+		close(info->udp_epollfd);
+		info->udp_epollfd = 0;
+	}
+
+	list_for_each_entry_safe(sk, next, &info->udp_socket_head, list) {
+		close(sk->fd);
+		free(sk);
+	}
+	mutex_unlock(&info->udp_socket_lock);
+}
diff --git a/kvmtool/pci.c b/kvmtool/pci.c
new file mode 100644
index 0000000..2e2c027
--- /dev/null
+++ b/kvmtool/pci.c
@@ -0,0 +1,545 @@
+#include "kvm/devices.h"
+#include "kvm/pci.h"
+#include "kvm/ioport.h"
+#include "kvm/irq.h"
+#include "kvm/util.h"
+#include "kvm/kvm.h"
+
+#include <linux/err.h>
+#include <assert.h>
+
+static u32 pci_config_address_bits;
+
+/*
+ * This is within our PCI gap - in an unused area.
+ * Note this is a PCI *bus address*, used to assign BARs etc.!
+ * (That's why it can still be 32bit even with 64bit guests -- 64bit
+ * PCI isn't currently supported.)
+ */
+static u32 mmio_blocks			= KVM_PCI_MMIO_AREA;
+static u16 io_port_blocks		= PCI_IOPORT_START;
+
+u16 pci_get_io_port_block(u32 size)
+{
+	u16 port = ALIGN(io_port_blocks, PCI_IO_SIZE);
+
+	io_port_blocks = port + size;
+	return port;
+}
+
+/*
+ * BARs must be naturally aligned, so enforce this in the allocator.
+ */
+u32 pci_get_mmio_block(u32 size)
+{
+	u32 block = ALIGN(mmio_blocks, size);
+	mmio_blocks = block + size;
+	return block;
+}
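+
+/*
+ * For example (addresses purely illustrative): with mmio_blocks at
+ * 0x2000100, pci_get_mmio_block(0x1000) aligns up and returns
+ * 0x2001000, leaving the next free block at 0x2002000.
+ */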
+
+void *pci_find_cap(struct pci_device_header *hdr, u8 cap_type)
+{
+	u8 pos;
+	struct pci_cap_hdr *cap;
+
+	pci_for_each_cap(pos, cap, hdr) {
+		if (cap->type == cap_type)
+			return cap;
+	}
+
+	return NULL;
+}
+
+int pci__assign_irq(struct pci_device_header *pci_hdr)
+{
+	/*
+	 * PCI supports only INTA#,B#,C#,D# per device.
+	 *
+	 * All four pins are only meaningful for multifunction devices, so
+	 * stick with INTA# for our single-function devices.
+	 */
+	pci_hdr->irq_pin	= 1;
+	pci_hdr->irq_line	= irq__alloc_line();
+
+	if (!pci_hdr->irq_type)
+		pci_hdr->irq_type = IRQ_TYPE_EDGE_RISING;
+
+	return pci_hdr->irq_line;
+}
+
+static bool pci_bar_is_implemented(struct pci_device_header *pci_hdr, int bar_num)
+{
+	return pci__bar_size(pci_hdr, bar_num);
+}
+
+static bool pci_bar_is_active(struct pci_device_header *pci_hdr, int bar_num)
+{
+	return pci_hdr->bar_active[bar_num];
+}
+
+static void *pci_config_address_ptr(u16 port)
+{
+	unsigned long offset;
+	void *base;
+
+	offset	= port - PCI_CONFIG_ADDRESS;
+	base	= &pci_config_address_bits;
+
+	return base + offset;
+}
+
+static bool pci_config_address_out(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port, void *data, int size)
+{
+	void *p = pci_config_address_ptr(port);
+
+	memcpy(p, data, size);
+
+	return true;
+}
+
+static bool pci_config_address_in(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port, void *data, int size)
+{
+	void *p = pci_config_address_ptr(port);
+
+	memcpy(data, p, size);
+
+	return true;
+}
+
+static struct ioport_operations pci_config_address_ops = {
+	.io_in	= pci_config_address_in,
+	.io_out	= pci_config_address_out,
+};
+
+static bool pci_device_exists(u8 bus_number, u8 device_number, u8 function_number)
+{
+	union pci_config_address pci_config_address;
+
+	pci_config_address.w = ioport__read32(&pci_config_address_bits);
+
+	if (pci_config_address.bus_number != bus_number)
+		return false;
+
+	if (pci_config_address.function_number != function_number)
+		return false;
+
+	return !IS_ERR_OR_NULL(device__find_dev(DEVICE_BUS_PCI, device_number));
+}
+
+static bool pci_config_data_out(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port, void *data, int size)
+{
+	union pci_config_address pci_config_address;
+
+	if (size > 4)
+		size = 4;
+
+	pci_config_address.w = ioport__read32(&pci_config_address_bits);
+	/*
+	 * Guest accesses to configuration space offsets that are not
+	 * aligned to 4 bytes land on different ioports; fold the port
+	 * offset back into the register offset.
+	 */
+	pci_config_address.reg_offset = port - PCI_CONFIG_DATA;
+
+	pci__config_wr(vcpu->kvm, pci_config_address, data, size);
+
+	return true;
+}
+
+static bool pci_config_data_in(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port, void *data, int size)
+{
+	union pci_config_address pci_config_address;
+
+	if (size > 4)
+		size = 4;
+
+	pci_config_address.w = ioport__read32(&pci_config_address_bits);
+	/*
+	 * Guest accesses to configuration space offsets that are not
+	 * aligned to 4 bytes land on different ioports; fold the port
+	 * offset back into the register offset.
+	 */
+	pci_config_address.reg_offset = port - PCI_CONFIG_DATA;
+
+	pci__config_rd(vcpu->kvm, pci_config_address, data, size);
+
+	return true;
+}
+
+static struct ioport_operations pci_config_data_ops = {
+	.io_in	= pci_config_data_in,
+	.io_out	= pci_config_data_out,
+};
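+
+/*
+ * For example, a single-byte read of config space offset 0x3e reaches us
+ * as an access to PCI_CONFIG_DATA + 2, with the dword-aligned offset 0x3c
+ * in PCI_CONFIG_ADDRESS; folding the port offset back in recovers 0x3e.
+ */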
+
+static int pci_activate_bar(struct kvm *kvm, struct pci_device_header *pci_hdr,
+			    int bar_num)
+{
+	int r = 0;
+
+	if (pci_bar_is_active(pci_hdr, bar_num))
+		goto out;
+
+	r = pci_hdr->bar_activate_fn(kvm, pci_hdr, bar_num, pci_hdr->data);
+	if (r < 0) {
+		pci_dev_warn(pci_hdr, "Error activating emulation for BAR %d",
+			     bar_num);
+		goto out;
+	}
+	pci_hdr->bar_active[bar_num] = true;
+
+out:
+	return r;
+}
+
+static int pci_deactivate_bar(struct kvm *kvm, struct pci_device_header *pci_hdr,
+			      int bar_num)
+{
+	int r = 0;
+
+	if (!pci_bar_is_active(pci_hdr, bar_num))
+		goto out;
+
+	r = pci_hdr->bar_deactivate_fn(kvm, pci_hdr, bar_num, pci_hdr->data);
+	if (r < 0) {
+		pci_dev_warn(pci_hdr, "Error deactivating emulation for BAR %d",
+			     bar_num);
+		goto out;
+	}
+	pci_hdr->bar_active[bar_num] = false;
+
+out:
+	return r;
+}
+
+static void pci_config_command_wr(struct kvm *kvm,
+				  struct pci_device_header *pci_hdr,
+				  u16 new_command)
+{
+	int i;
+	bool toggle_io, toggle_mem;
+
+	toggle_io = (pci_hdr->command ^ new_command) & PCI_COMMAND_IO;
+	toggle_mem = (pci_hdr->command ^ new_command) & PCI_COMMAND_MEMORY;
+
+	for (i = 0; i < 6; i++) {
+		if (!pci_bar_is_implemented(pci_hdr, i))
+			continue;
+
+		if (toggle_io && pci__bar_is_io(pci_hdr, i)) {
+			if (__pci__io_space_enabled(new_command))
+				pci_activate_bar(kvm, pci_hdr, i);
+			else
+				pci_deactivate_bar(kvm, pci_hdr, i);
+		}
+
+		if (toggle_mem && pci__bar_is_memory(pci_hdr, i)) {
+			if (__pci__memory_space_enabled(new_command))
+				pci_activate_bar(kvm, pci_hdr, i);
+			else
+				pci_deactivate_bar(kvm, pci_hdr, i);
+		}
+	}
+
+	pci_hdr->command = new_command;
+}
+
+static int pci_toggle_bar_regions(bool activate, struct kvm *kvm, u32 start, u32 size)
+{
+	struct device_header *dev_hdr;
+	struct pci_device_header *tmp_hdr;
+	u32 tmp_start, tmp_size;
+	int i, r;
+
+	dev_hdr = device__first_dev(DEVICE_BUS_PCI);
+	while (dev_hdr) {
+		tmp_hdr = dev_hdr->data;
+		for (i = 0; i < 6; i++) {
+			if (!pci_bar_is_implemented(tmp_hdr, i))
+				continue;
+
+			tmp_start = pci__bar_address(tmp_hdr, i);
+			tmp_size = pci__bar_size(tmp_hdr, i);
+			if (tmp_start + tmp_size <= start ||
+			    tmp_start >= start + size)
+				continue;
+
+			if (activate)
+				r = pci_activate_bar(kvm, tmp_hdr, i);
+			else
+				r = pci_deactivate_bar(kvm, tmp_hdr, i);
+			if (r < 0)
+				return r;
+		}
+		dev_hdr = device__next_dev(dev_hdr);
+	}
+
+	return 0;
+}
+
+static inline int pci_activate_bar_regions(struct kvm *kvm, u32 start, u32 size)
+{
+	return pci_toggle_bar_regions(true, kvm, start, size);
+}
+
+static inline int pci_deactivate_bar_regions(struct kvm *kvm, u32 start, u32 size)
+{
+	return pci_toggle_bar_regions(false, kvm, start, size);
+}
+
+static void pci_config_bar_wr(struct kvm *kvm,
+			      struct pci_device_header *pci_hdr, int bar_num,
+			      u32 value)
+{
+	u32 old_addr, new_addr, bar_size;
+	u32 mask;
+	int r;
+
+	if (pci__bar_is_io(pci_hdr, bar_num))
+		mask = (u32)PCI_BASE_ADDRESS_IO_MASK;
+	else
+		mask = (u32)PCI_BASE_ADDRESS_MEM_MASK;
+
+	/*
+	 * If the kernel masks the BAR, it will expect to find the size of the
+	 * BAR there next time it reads from it. After the kernel reads the
+	 * size, it will write the address back.
+	 *
+	 * According to the PCI local bus specification REV 3.0: The number of
+	 * upper bits that a device actually implements depends on how much of
+	 * the address space the device will respond to. A device that wants a 1
+	 * MB memory address space (using a 32-bit base address register) would
+	 * build the top 12 bits of the address register, hardwiring the other
+	 * bits to 0.
+	 *
+	 * Furthermore, software can determine how much address space the device
+	 * requires by writing a value of all 1's to the register and then
+	 * reading the value back. The device will return 0's in all don't-care
+	 * address bits, effectively specifying the address space required.
+	 *
+	 * Software computes the size of the address space with the formula
+	 * S =  ~B + 1, where S is the memory size and B is the value read from
+	 * the BAR. This means that the BAR value that kvmtool should return is
+	 * B = ~(S - 1).
+	 */
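+	/*
+	 * Worked example: a 1 MiB BAR has S = 0x00100000, so the value
+	 * returned (before merging the special bits back in) is
+	 * B = ~(S - 1) = 0xfff00000.
+	 */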
+	if (value == 0xffffffff) {
+		value = ~(pci__bar_size(pci_hdr, bar_num) - 1);
+		/* Preserve the special bits. */
+		value = (value & mask) | (pci_hdr->bar[bar_num] & ~mask);
+		pci_hdr->bar[bar_num] = value;
+		return;
+	}
+
+	value = (value & mask) | (pci_hdr->bar[bar_num] & ~mask);
+
+	/* Don't toggle emulation when access to the region type is disabled. */
+	if (pci__bar_is_io(pci_hdr, bar_num) &&
+	    !pci__io_space_enabled(pci_hdr)) {
+		pci_hdr->bar[bar_num] = value;
+		return;
+	}
+
+	if (pci__bar_is_memory(pci_hdr, bar_num) &&
+	    !pci__memory_space_enabled(pci_hdr)) {
+		pci_hdr->bar[bar_num] = value;
+		return;
+	}
+
+	/*
+	 * BAR reassignment can be done while device access is enabled and
+	 * memory regions for different devices can overlap as long as no access
+	 * is made to the overlapping memory regions. To implement BAR
+	 * reassignment, we deactivate emulation for the region described by the
+	 * BAR value that the guest is changing, we disable emulation for the
+	 * regions that overlap with the new one (by scanning through all PCI
+	 * devices), we enable emulation for the new BAR value and finally we
+	 * enable emulation for all device regions that were overlapping with
+	 * the old value.
+	 */
+	old_addr = pci__bar_address(pci_hdr, bar_num);
+	new_addr = __pci__bar_address(value);
+	bar_size = pci__bar_size(pci_hdr, bar_num);
+
+	r = pci_deactivate_bar(kvm, pci_hdr, bar_num);
+	if (r < 0)
+		return;
+
+	r = pci_deactivate_bar_regions(kvm, new_addr, bar_size);
+	if (r < 0) {
+		/*
+		 * We cannot update the BAR because of an overlapping region
+		 * that failed to deactivate emulation, so keep the old BAR
+		 * value and re-activate emulation for it.
+		 */
+		pci_activate_bar(kvm, pci_hdr, bar_num);
+		return;
+	}
+
+	pci_hdr->bar[bar_num] = value;
+	r = pci_activate_bar(kvm, pci_hdr, bar_num);
+	if (r < 0) {
+		/*
+		 * New region cannot be emulated, re-enable the regions that
+		 * were overlapping.
+		 */
+		pci_activate_bar_regions(kvm, new_addr, bar_size);
+		return;
+	}
+
+	pci_activate_bar_regions(kvm, old_addr, bar_size);
+}
+
+void pci__config_wr(struct kvm *kvm, union pci_config_address addr, void *data, int size)
+{
+	void *base;
+	u8 bar, offset;
+	struct pci_device_header *pci_hdr;
+	u8 dev_num = addr.device_number;
+	u32 value = 0;
+
+	if (!pci_device_exists(addr.bus_number, dev_num, 0))
+		return;
+
+	offset = addr.w & PCI_DEV_CFG_MASK;
+	base = pci_hdr = device__find_dev(DEVICE_BUS_PCI, dev_num)->data;
+
+	if (pci_hdr->cfg_ops.write)
+		pci_hdr->cfg_ops.write(kvm, pci_hdr, offset, data, size);
+
+	/*
+	 * legacy hack: ignore writes to uninitialized regions (e.g. ROM BAR).
+	 * Not very nice but has been working so far.
+	 */
+	if (*(u32 *)(base + offset) == 0)
+		return;
+
+	if (offset == PCI_COMMAND) {
+		memcpy(&value, data, size);
+		pci_config_command_wr(kvm, pci_hdr, (u16)value);
+		return;
+	}
+
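+	/* The six BARs live at config offsets 0x10-0x27, one u32 each */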
+	bar = (offset - PCI_BAR_OFFSET(0)) / sizeof(u32);
+	if (bar < 6) {
+		memcpy(&value, data, size);
+		pci_config_bar_wr(kvm, pci_hdr, bar, value);
+		return;
+	}
+
+	memcpy(base + offset, data, size);
+}
+
+void pci__config_rd(struct kvm *kvm, union pci_config_address addr, void *data, int size)
+{
+	u8 offset;
+	struct pci_device_header *pci_hdr;
+	u8 dev_num = addr.device_number;
+
+	if (pci_device_exists(addr.bus_number, dev_num, 0)) {
+		pci_hdr = device__find_dev(DEVICE_BUS_PCI, dev_num)->data;
+		offset = addr.w & PCI_DEV_CFG_MASK;
+
+		if (pci_hdr->cfg_ops.read)
+			pci_hdr->cfg_ops.read(kvm, pci_hdr, offset, data, size);
+
+		memcpy(data, (void *)pci_hdr + offset, size);
+	} else {
+		memset(data, 0xff, size);
+	}
+}
+
+static void pci_config_mmio_access(struct kvm_cpu *vcpu, u64 addr, u8 *data,
+				   u32 len, u8 is_write, void *kvm)
+{
+	union pci_config_address cfg_addr;
+
+	addr			-= KVM_PCI_CFG_AREA;
+	cfg_addr.w		= (u32)addr;
+	cfg_addr.enable_bit	= 1;
+
+	if (len > 4)
+		len = 4;
+
+	if (is_write)
+		pci__config_wr(kvm, cfg_addr, data, len);
+	else
+		pci__config_rd(kvm, cfg_addr, data, len);
+}
+
+struct pci_device_header *pci__find_dev(u8 dev_num)
+{
+	struct device_header *hdr = device__find_dev(DEVICE_BUS_PCI, dev_num);
+
+	if (IS_ERR_OR_NULL(hdr))
+		return NULL;
+
+	return hdr->data;
+}
+
+int pci__register_bar_regions(struct kvm *kvm, struct pci_device_header *pci_hdr,
+			      bar_activate_fn_t bar_activate_fn,
+			      bar_deactivate_fn_t bar_deactivate_fn, void *data)
+{
+	int i, r;
+
+	assert(bar_activate_fn && bar_deactivate_fn);
+
+	pci_hdr->bar_activate_fn = bar_activate_fn;
+	pci_hdr->bar_deactivate_fn = bar_deactivate_fn;
+	pci_hdr->data = data;
+
+	for (i = 0; i < 6; i++) {
+		if (!pci_bar_is_implemented(pci_hdr, i))
+			continue;
+
+		assert(!pci_bar_is_active(pci_hdr, i));
+
+		if (pci__bar_is_io(pci_hdr, i) &&
+		    pci__io_space_enabled(pci_hdr)) {
+			r = pci_activate_bar(kvm, pci_hdr, i);
+			if (r < 0)
+				return r;
+		}
+
+		if (pci__bar_is_memory(pci_hdr, i) &&
+		    pci__memory_space_enabled(pci_hdr)) {
+			r = pci_activate_bar(kvm, pci_hdr, i);
+			if (r < 0)
+				return r;
+		}
+	}
+
+	return 0;
+}
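+
+/*
+ * A minimal usage sketch (callback names hypothetical): a device typically
+ * registers its BAR callbacks once, after filling in its BARs:
+ *
+ *	pci__register_bar_regions(kvm, &dev->pci_hdr,
+ *				  my_bar_activate, my_bar_deactivate, dev);
+ *
+ * where the callbacks install/remove the MMIO or ioport emulation for the
+ * BAR's current address.
+ */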
+
+int pci__init(struct kvm *kvm)
+{
+	int r;
+
+	r = ioport__register(kvm, PCI_CONFIG_DATA + 0, &pci_config_data_ops, 4, NULL);
+	if (r < 0)
+		return r;
+
+	r = ioport__register(kvm, PCI_CONFIG_ADDRESS + 0, &pci_config_address_ops, 4, NULL);
+	if (r < 0)
+		goto err_unregister_data;
+
+	r = kvm__register_mmio(kvm, KVM_PCI_CFG_AREA, PCI_CFG_SIZE, false,
+			       pci_config_mmio_access, kvm);
+	if (r < 0)
+		goto err_unregister_addr;
+
+	return 0;
+
+err_unregister_addr:
+	ioport__unregister(kvm, PCI_CONFIG_ADDRESS);
+err_unregister_data:
+	ioport__unregister(kvm, PCI_CONFIG_DATA);
+	return r;
+}
+dev_base_init(pci__init);
+
+int pci__exit(struct kvm *kvm)
+{
+	ioport__unregister(kvm, PCI_CONFIG_DATA);
+	ioport__unregister(kvm, PCI_CONFIG_ADDRESS);
+
+	return 0;
+}
+dev_base_exit(pci__exit);
diff --git a/kvmtool/powerpc/boot.c b/kvmtool/powerpc/boot.c
new file mode 100644
index 0000000..2557fc0
--- /dev/null
+++ b/kvmtool/powerpc/boot.c
@@ -0,0 +1,8 @@
+#include "kvm/kvm.h"
+
+#include <stdbool.h>
+
+bool kvm__load_firmware(struct kvm *kvm, const char *firmware_filename)
+{
+	return false;
+}
diff --git a/kvmtool/powerpc/cpu_info.c b/kvmtool/powerpc/cpu_info.c
new file mode 100644
index 0000000..a9dfe39
--- /dev/null
+++ b/kvmtool/powerpc/cpu_info.c
@@ -0,0 +1,210 @@
+/*
+ * PPC CPU identification
+ *
+ * This is a very simple "host CPU info" struct to get us going.
+ * For the little host information we need, I don't want to grub about
+ * parsing stuff in /proc/device-tree so just match host PVR to differentiate
+ * PPC970, POWER7 and POWER8 (which is all that's currently supported).
+ *
+ * Qemu does something similar but this is MUCH simpler!
+ *
+ * Copyright 2012 Matt Evans <matt@ozlabs.org>, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include <kvm/kvm.h>
+#include <sys/ioctl.h>
+
+#include "cpu_info.h"
+#include "kvm/util.h"
+
+/* POWER7 */
+
+static struct cpu_info cpu_power7_info = {
+	.name = "POWER7",
+	.tb_freq = 512000000,
+	.d_bsize = 128,
+	.i_bsize = 128,
+	.flags = CPUINFO_FLAG_DFP | CPUINFO_FLAG_VSX | CPUINFO_FLAG_VMX,
+	.mmu_info = {
+		.flags = KVM_PPC_PAGE_SIZES_REAL | KVM_PPC_1T_SEGMENTS,
+		.slb_size = 32,
+	},
+};
+
+/* POWER8 */
+
+static struct cpu_info cpu_power8_info = {
+	.name = "POWER8",
+	.tb_freq = 512000000,
+	.d_bsize = 128,
+	.i_bsize = 128,
+	.flags = CPUINFO_FLAG_DFP | CPUINFO_FLAG_VSX | CPUINFO_FLAG_VMX,
+	.mmu_info = {
+		.flags = KVM_PPC_PAGE_SIZES_REAL | KVM_PPC_1T_SEGMENTS,
+		.slb_size = 32,
+	},
+};
+
+/* PPC970/G5 */
+
+static struct cpu_info cpu_970_info = {
+	.name = "G5",
+	.tb_freq = 33333333,
+	.d_bsize = 128,
+	.i_bsize = 128,
+	.flags = CPUINFO_FLAG_VMX,
+};
+
+/* This is a default catchall for 'no match' on PVR: */
+static struct cpu_info cpu_dummy_info = { .name = "unknown" };
+
+static struct pvr_info host_pvr_info[] = {
+	{ 0xffffffff, 0x0f000003, &cpu_power7_info },
+	{ 0xffff0000, 0x003f0000, &cpu_power7_info },
+	{ 0xffff0000, 0x004a0000, &cpu_power7_info },
+	{ 0xffff0000, 0x004b0000, &cpu_power8_info },
+	{ 0xffff0000, 0x00390000, &cpu_970_info },
+	{ 0xffff0000, 0x003c0000, &cpu_970_info },
+	{ 0xffff0000, 0x00440000, &cpu_970_info },
+	{ 0xffff0000, 0x00450000, &cpu_970_info },
+};
+
+/* If we can't query the kernel for supported page sizes assume 4K and 16M */
+static struct kvm_ppc_one_seg_page_size fallback_sps[] = {
+	[0] = {
+		.page_shift = 12,
+		.slb_enc    = 0,
+		.enc =  {
+			[0] = {
+				.page_shift = 12,
+				.pte_enc    = 0,
+			},
+		},
+	},
+	[1] = {
+		.page_shift = 24,
+		.slb_enc    = 0x100,
+		.enc =  {
+			[0] = {
+				.page_shift = 24,
+				.pte_enc    = 0,
+			},
+		},
+	},
+};
+
+
+static void setup_mmu_info(struct kvm *kvm, struct cpu_info *cpu_info)
+{
+	static struct kvm_ppc_smmu_info *mmu_info;
+	struct kvm_ppc_one_seg_page_size *sps;
+	int i, j, k, valid;
+
+	if (!kvm__supports_extension(kvm, KVM_CAP_PPC_GET_SMMU_INFO)) {
+		memcpy(&cpu_info->mmu_info.sps, fallback_sps, sizeof(fallback_sps));
+	} else if (ioctl(kvm->vm_fd, KVM_PPC_GET_SMMU_INFO, &cpu_info->mmu_info) < 0) {
+		die_perror("KVM_PPC_GET_SMMU_INFO failed");
+	}
+
+	mmu_info = &cpu_info->mmu_info;
+
+	if (!(mmu_info->flags & KVM_PPC_PAGE_SIZES_REAL))
+		/* Guest pages are not restricted by the backing page size */
+		return;
+
+	/* Filter based on backing page size */
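+	/*
+	 * e.g. with 4 KiB backing pages, any 16 MiB (page_shift == 24)
+	 * encodings below are invalidated, since a guest page cannot be
+	 * larger than the host page backing it.
+	 */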
+
+	for (i = 0; i < KVM_PPC_PAGE_SIZES_MAX_SZ; i++) {
+		sps = &mmu_info->sps[i];
+
+		if (!sps->page_shift)
+			break;
+
+		if (kvm->ram_pagesize < (1ul << sps->page_shift)) {
+			/* Mark the whole segment size invalid */
+			sps->page_shift = 0;
+			continue;
+		}
+
+		/* Check each page size for the segment */
+		for (j = 0, valid = 0; j < KVM_PPC_PAGE_SIZES_MAX_SZ; j++) {
+			if (!sps->enc[j].page_shift)
+				break;
+
+			if (kvm->ram_pagesize < (1ul << sps->enc[j].page_shift))
+				sps->enc[j].page_shift = 0;
+			else
+				valid++;
+		}
+
+		if (!valid) {
+			/* Mark the whole segment size invalid */
+			sps->page_shift = 0;
+			continue;
+		}
+
+		/* Mark any trailing entries invalid if we broke out early */
+		for (k = j; k < KVM_PPC_PAGE_SIZES_MAX_SZ; k++)
+			sps->enc[k].page_shift = 0;
+
+		/* Collapse holes */
+		for (j = 0; j < KVM_PPC_PAGE_SIZES_MAX_SZ; j++) {
+			if (sps->enc[j].page_shift)
+				continue;
+
+			for (k = j + 1; k < KVM_PPC_PAGE_SIZES_MAX_SZ; k++) {
+				if (sps->enc[k].page_shift) {
+					sps->enc[j] = sps->enc[k];
+					sps->enc[k].page_shift = 0;
+					break;
+				}
+			}
+		}
+	}
+
+	/* Mark any trailing entries invalid if we broke out early */
+	for (j = i; j < KVM_PPC_PAGE_SIZES_MAX_SZ; j++)
+		mmu_info->sps[j].page_shift = 0;
+
+	/* Collapse holes */
+	for (i = 0; i < KVM_PPC_PAGE_SIZES_MAX_SZ; i++) {
+		if (mmu_info->sps[i].page_shift)
+			continue;
+
+		for (j = i + 1; j < KVM_PPC_PAGE_SIZES_MAX_SZ; j++) {
+			if (mmu_info->sps[j].page_shift) {
+				mmu_info->sps[i] = mmu_info->sps[j];
+				mmu_info->sps[j].page_shift = 0;
+				break;
+			}
+		}
+	}
+}
+
+struct cpu_info *find_cpu_info(struct kvm *kvm)
+{
+	struct cpu_info *info;
+	unsigned int i;
+	u32 pvr = kvm->arch.pvr;
+
+	for (info = NULL, i = 0; i < ARRAY_SIZE(host_pvr_info); i++) {
+		if ((pvr & host_pvr_info[i].pvr_mask) == host_pvr_info[i].pvr) {
+			info = host_pvr_info[i].cpu_info;
+			break;
+		}
+	}
+
+	/* Didn't find anything? Rut-ro. */
+	if (!info) {
+		pr_warning("Host CPU unsupported by kvmtool\n");
+		info = &cpu_dummy_info;
+	}
+
+	setup_mmu_info(kvm, info);
+
+	return info;
+}
diff --git a/kvmtool/powerpc/cpu_info.h b/kvmtool/powerpc/cpu_info.h
new file mode 100644
index 0000000..f61707a
--- /dev/null
+++ b/kvmtool/powerpc/cpu_info.h
@@ -0,0 +1,42 @@
+/*
+ * PPC CPU identification
+ *
+ * Copyright 2012 Matt Evans <matt@ozlabs.org>, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#ifndef CPU_INFO_H
+#define CPU_INFO_H
+
+#include <kvm/kvm.h>
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/kvm.h>
+
+struct cpu_info {
+	const char	*name;
+	u32		tb_freq; /* timebase frequency */
+	u32		d_bsize; /* d-cache block size */
+	u32		i_bsize; /* i-cache block size */
+	u32		flags;
+	struct kvm_ppc_smmu_info mmu_info;
+};
+
+struct pvr_info {
+	u32		pvr_mask;
+	u32		pvr;
+	struct cpu_info *cpu_info;
+};
+
+/* Misc capabilities/CPU properties */
+#define CPUINFO_FLAG_DFP	0x00000001
+#define CPUINFO_FLAG_VMX	0x00000002
+#define CPUINFO_FLAG_VSX	0x00000004
+
+struct cpu_info *find_cpu_info(struct kvm *kvm);
+
+#endif
diff --git a/kvmtool/powerpc/include/asm/kvm.h b/kvmtool/powerpc/include/asm/kvm.h
new file mode 100644
index 0000000..b0f72de
--- /dev/null
+++ b/kvmtool/powerpc/include/asm/kvm.h
@@ -0,0 +1,726 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright IBM Corp. 2007
+ *
+ * Authors: Hollis Blanchard <hollisb@us.ibm.com>
+ */
+
+#ifndef __LINUX_KVM_POWERPC_H
+#define __LINUX_KVM_POWERPC_H
+
+#include <linux/types.h>
+
+/* Select powerpc specific features in <linux/kvm.h> */
+#define __KVM_HAVE_SPAPR_TCE
+#define __KVM_HAVE_PPC_SMT
+#define __KVM_HAVE_IRQCHIP
+#define __KVM_HAVE_IRQ_LINE
+#define __KVM_HAVE_GUEST_DEBUG
+
+/* Not always available, but if it is, this is the correct offset.  */
+#define KVM_COALESCED_MMIO_PAGE_OFFSET 1
+
+struct kvm_regs {
+	__u64 pc;
+	__u64 cr;
+	__u64 ctr;
+	__u64 lr;
+	__u64 xer;
+	__u64 msr;
+	__u64 srr0;
+	__u64 srr1;
+	__u64 pid;
+
+	__u64 sprg0;
+	__u64 sprg1;
+	__u64 sprg2;
+	__u64 sprg3;
+	__u64 sprg4;
+	__u64 sprg5;
+	__u64 sprg6;
+	__u64 sprg7;
+
+	__u64 gpr[32];
+};
+
+#define KVM_SREGS_E_IMPL_NONE	0
+#define KVM_SREGS_E_IMPL_FSL	1
+
+#define KVM_SREGS_E_FSL_PIDn	(1 << 0) /* PID1/PID2 */
+
+/* flags for kvm_run.flags */
+#define KVM_RUN_PPC_NMI_DISP_MASK		(3 << 0)
+#define   KVM_RUN_PPC_NMI_DISP_FULLY_RECOV	(1 << 0)
+#define   KVM_RUN_PPC_NMI_DISP_LIMITED_RECOV	(2 << 0)
+#define   KVM_RUN_PPC_NMI_DISP_NOT_RECOV	(3 << 0)
+
+/*
+ * Feature bits indicate which sections of the sregs struct are valid,
+ * both in KVM_GET_SREGS and KVM_SET_SREGS.  On KVM_SET_SREGS, registers
+ * corresponding to unset feature bits will not be modified.  This allows
+ * restoring a checkpoint made without that feature, while keeping the
+ * default values of the new registers.
+ *
+ * KVM_SREGS_E_BASE contains:
+ * CSRR0/1 (refers to SRR2/3 on 40x)
+ * ESR
+ * DEAR
+ * MCSR
+ * TSR
+ * TCR
+ * DEC
+ * TB
+ * VRSAVE (USPRG0)
+ */
+#define KVM_SREGS_E_BASE		(1 << 0)
+
+/*
+ * KVM_SREGS_E_ARCH206 contains:
+ *
+ * PIR
+ * MCSRR0/1
+ * DECAR
+ * IVPR
+ */
+#define KVM_SREGS_E_ARCH206		(1 << 1)
+
+/*
+ * Contains EPCR, plus the upper half of 64-bit registers
+ * that are 32-bit on 32-bit implementations.
+ */
+#define KVM_SREGS_E_64			(1 << 2)
+
+#define KVM_SREGS_E_SPRG8		(1 << 3)
+#define KVM_SREGS_E_MCIVPR		(1 << 4)
+
+/*
+ * IVORs are used -- contains IVOR0-15, plus additional IVORs
+ * in combination with an appropriate feature bit.
+ */
+#define KVM_SREGS_E_IVOR		(1 << 5)
+
+/*
+ * Contains MAS0-4, MAS6-7, TLBnCFG, MMUCFG.
+ * Also TLBnPS if MMUCFG[MAVN] = 1.
+ */
+#define KVM_SREGS_E_ARCH206_MMU		(1 << 6)
+
+/* DBSR, DBCR, IAC, DAC, DVC */
+#define KVM_SREGS_E_DEBUG		(1 << 7)
+
+/* Enhanced debug -- DSRR0/1, SPRG9 */
+#define KVM_SREGS_E_ED			(1 << 8)
+
+/* Embedded Floating Point (SPE) -- IVOR32-34 if KVM_SREGS_E_IVOR */
+#define KVM_SREGS_E_SPE			(1 << 9)
+
+/*
+ * DEPRECATED! USE ONE_REG FOR THIS ONE!
+ * External Proxy (EXP) -- EPR
+ */
+#define KVM_SREGS_EXP			(1 << 10)
+
+/* External PID (E.PD) -- EPSC/EPLC */
+#define KVM_SREGS_E_PD			(1 << 11)
+
+/* Processor Control (E.PC) -- IVOR36-37 if KVM_SREGS_E_IVOR */
+#define KVM_SREGS_E_PC			(1 << 12)
+
+/* Page table (E.PT) -- EPTCFG */
+#define KVM_SREGS_E_PT			(1 << 13)
+
+/* Embedded Performance Monitor (E.PM) -- IVOR35 if KVM_SREGS_E_IVOR */
+#define KVM_SREGS_E_PM			(1 << 14)
+
+/*
+ * Special updates:
+ *
+ * Some registers may change even while a vcpu is not running.
+ * To avoid losing these changes, by default these registers are
+ * not updated by KVM_SET_SREGS.  To force an update, set the bit
+ * in u.e.update_special corresponding to the register to be updated.
+ *
+ * The update_special field is zero on return from KVM_GET_SREGS.
+ *
+ * When restoring a checkpoint, the caller can set update_special
+ * to 0xffffffff to ensure that everything is restored, even new features
+ * that the caller doesn't know about.
+ */
+#define KVM_SREGS_E_UPDATE_MCSR		(1 << 0)
+#define KVM_SREGS_E_UPDATE_TSR		(1 << 1)
+#define KVM_SREGS_E_UPDATE_DEC		(1 << 2)
+#define KVM_SREGS_E_UPDATE_DBSR		(1 << 3)
+
+/*
+ * In KVM_SET_SREGS, reserved/pad fields must be left untouched from a
+ * previous KVM_GET_REGS.
+ *
+ * Unless otherwise indicated, setting any register with KVM_SET_SREGS
+ * directly sets its value.  It does not trigger any special semantics such
+ * as write-one-to-clear.  Calling KVM_SET_SREGS on an unmodified struct
+ * just received from KVM_GET_SREGS is always a no-op.
+ */
+struct kvm_sregs {
+	__u32 pvr;
+	union {
+		struct {
+			__u64 sdr1;
+			struct {
+				struct {
+					__u64 slbe;
+					__u64 slbv;
+				} slb[64];
+			} ppc64;
+			struct {
+				__u32 sr[16];
+				__u64 ibat[8];
+				__u64 dbat[8];
+			} ppc32;
+		} s;
+		struct {
+			union {
+				struct { /* KVM_SREGS_E_IMPL_FSL */
+					__u32 features; /* KVM_SREGS_E_FSL_ */
+					__u32 svr;
+					__u64 mcar;
+					__u32 hid0;
+
+					/* KVM_SREGS_E_FSL_PIDn */
+					__u32 pid1, pid2;
+				} fsl;
+				__u8 pad[256];
+			} impl;
+
+			__u32 features; /* KVM_SREGS_E_ */
+			__u32 impl_id;	/* KVM_SREGS_E_IMPL_ */
+			__u32 update_special; /* KVM_SREGS_E_UPDATE_ */
+			__u32 pir;	/* read-only */
+			__u64 sprg8;
+			__u64 sprg9;	/* E.ED */
+			__u64 csrr0;
+			__u64 dsrr0;	/* E.ED */
+			__u64 mcsrr0;
+			__u32 csrr1;
+			__u32 dsrr1;	/* E.ED */
+			__u32 mcsrr1;
+			__u32 esr;
+			__u64 dear;
+			__u64 ivpr;
+			__u64 mcivpr;
+			__u64 mcsr;	/* KVM_SREGS_E_UPDATE_MCSR */
+
+			__u32 tsr;	/* KVM_SREGS_E_UPDATE_TSR */
+			__u32 tcr;
+			__u32 decar;
+			__u32 dec;	/* KVM_SREGS_E_UPDATE_DEC */
+
+			/*
+			 * Userspace can read TB directly, but the
+			 * value reported here is consistent with "dec".
+			 *
+			 * Read-only.
+			 */
+			__u64 tb;
+
+			__u32 dbsr;	/* KVM_SREGS_E_UPDATE_DBSR */
+			__u32 dbcr[3];
+			/*
+			 * iac/dac registers are 64bit wide, but this
+			 * interface provides only the lower 32 bits on
+			 * 64bit processors. The ONE_REG interface was
+			 * added for the full 64bit iac/dac registers.
+			 */
+			__u32 iac[4];
+			__u32 dac[2];
+			__u32 dvc[2];
+			__u8 num_iac;	/* read-only */
+			__u8 num_dac;	/* read-only */
+			__u8 num_dvc;	/* read-only */
+			__u8 pad;
+
+			__u32 epr;	/* EXP */
+			__u32 vrsave;	/* a.k.a. USPRG0 */
+			__u32 epcr;	/* KVM_SREGS_E_64 */
+
+			__u32 mas0;
+			__u32 mas1;
+			__u64 mas2;
+			__u64 mas7_3;
+			__u32 mas4;
+			__u32 mas6;
+
+			__u32 ivor_low[16]; /* IVOR0-15 */
+			__u32 ivor_high[18]; /* IVOR32+, plus room to expand */
+
+			__u32 mmucfg;	/* read-only */
+			__u32 eptcfg;	/* E.PT, read-only */
+			__u32 tlbcfg[4];/* read-only */
+			__u32 tlbps[4]; /* read-only */
+
+			__u32 eplc, epsc; /* E.PD */
+		} e;
+		__u8 pad[1020];
+	} u;
+};
+
+struct kvm_fpu {
+	__u64 fpr[32];
+};
+
+/*
+ * Defines for h/w breakpoint, watchpoint (read, write or both) and
+ * software breakpoint.
+ * These are used as "type" in KVM_SET_GUEST_DEBUG ioctl and "status"
+ * for KVM_DEBUG_EXIT.
+ */
+#define KVMPPC_DEBUG_NONE		0x0
+#define KVMPPC_DEBUG_BREAKPOINT		(1UL << 1)
+#define KVMPPC_DEBUG_WATCH_WRITE	(1UL << 2)
+#define KVMPPC_DEBUG_WATCH_READ		(1UL << 3)
+struct kvm_debug_exit_arch {
+	__u64 address;
+	/*
+	 * exiting to userspace because of h/w breakpoint, watchpoint
+	 * (read, write or both) and software breakpoint.
+	 */
+	__u32 status;
+	__u32 reserved;
+};
+
+/* for KVM_SET_GUEST_DEBUG */
+struct kvm_guest_debug_arch {
+	struct {
+		/* H/W breakpoint/watchpoint address */
+		__u64 addr;
+		/*
+		 * Type denotes h/w breakpoint, read watchpoint, write
+		 * watchpoint or watchpoint (both read and write).
+		 */
+		__u32 type;
+		__u32 reserved;
+	} bp[16];
+};
+
+/* Debug related defines */
+/*
+ * kvm_guest_debug->control is a 32 bit field. The lower 16 bits are generic
+ * and the upper 16 bits are architecture specific. The architecture-specific
+ * bits select whether the ioctl sets a hardware or a software breakpoint.
+ */
+#define KVM_GUESTDBG_USE_SW_BP		0x00010000
+#define KVM_GUESTDBG_USE_HW_BP		0x00020000
+
+/* definition of registers in kvm_run */
+struct kvm_sync_regs {
+};
+
+#define KVM_INTERRUPT_SET	-1U
+#define KVM_INTERRUPT_UNSET	-2U
+#define KVM_INTERRUPT_SET_LEVEL	-3U
+
+#define KVM_CPU_440		1
+#define KVM_CPU_E500V2		2
+#define KVM_CPU_3S_32		3
+#define KVM_CPU_3S_64		4
+#define KVM_CPU_E500MC		5
+
+/* for KVM_CAP_SPAPR_TCE */
+struct kvm_create_spapr_tce {
+	__u64 liobn;
+	__u32 window_size;
+};
+
+/* for KVM_CAP_SPAPR_TCE_64 */
+struct kvm_create_spapr_tce_64 {
+	__u64 liobn;
+	__u32 page_shift;
+	__u32 flags;
+	__u64 offset;	/* in pages */
+	__u64 size;	/* in pages */
+};
+
+/* for KVM_ALLOCATE_RMA */
+struct kvm_allocate_rma {
+	__u64 rma_size;
+};
+
+/* for KVM_CAP_PPC_RTAS */
+struct kvm_rtas_token_args {
+	char name[120];
+	__u64 token;	/* Use a token of 0 to undefine a mapping */
+};
+
+struct kvm_book3e_206_tlb_entry {
+	__u32 mas8;
+	__u32 mas1;
+	__u64 mas2;
+	__u64 mas7_3;
+};
+
+struct kvm_book3e_206_tlb_params {
+	/*
+	 * For mmu types KVM_MMU_FSL_BOOKE_NOHV and KVM_MMU_FSL_BOOKE_HV:
+	 *
+	 * - The number of ways of TLB0 must be a power of two between 2 and
+	 *   16.
+	 * - TLB1 must be fully associative.
+	 * - The size of TLB0 must be a multiple of the number of ways, and
+	 *   the number of sets must be a power of two.
+	 * - The size of TLB1 may not exceed 64 entries.
+	 * - TLB0 supports 4 KiB pages.
+	 * - The page sizes supported by TLB1 are as indicated by
+	 *   TLB1CFG (if MMUCFG[MAVN] = 0) or TLB1PS (if MMUCFG[MAVN] = 1)
+	 *   as returned by KVM_GET_SREGS.
+	 * - TLB2 and TLB3 are reserved, and their entries in tlb_sizes[]
+	 *   and tlb_ways[] must be zero.
+	 *
+	 * tlb_ways[n] = tlb_sizes[n] means the array is fully associative.
+	 *
+	 * KVM will adjust TLBnCFG based on the sizes configured here,
+	 * though arrays greater than 2048 entries will have TLBnCFG[NENTRY]
+	 * set to zero.
+	 */
+	__u32 tlb_sizes[4];
+	__u32 tlb_ways[4];
+	__u32 reserved[8];
+};
+
+/* For KVM_PPC_GET_HTAB_FD */
+struct kvm_get_htab_fd {
+	__u64	flags;
+	__u64	start_index;
+	__u64	reserved[2];
+};
+
+/* Values for kvm_get_htab_fd.flags */
+#define KVM_GET_HTAB_BOLTED_ONLY	((__u64)0x1)
+#define KVM_GET_HTAB_WRITE		((__u64)0x2)
+
+/*
+ * Data read on the file descriptor is formatted as a series of
+ * records, each consisting of a header followed by a series of
+ * `n_valid' HPTEs (16 bytes each), which are all valid.  Following
+ * those valid HPTEs there are `n_invalid' invalid HPTEs, which
+ * are not represented explicitly in the stream.  The same format
+ * is used for writing.
+ */
+struct kvm_get_htab_header {
+	__u32	index;
+	__u16	n_valid;
+	__u16	n_invalid;
+};
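+
+/*
+ * Illustrative record, following the layout above: a header with index = 0,
+ * n_valid = 2, n_invalid = 6 is followed by 2 * 16 = 32 bytes of HPTE data
+ * and describes 8 HPT slots in total, the last 6 of them empty.
+ */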
+
+/* For KVM_PPC_CONFIGURE_V3_MMU */
+struct kvm_ppc_mmuv3_cfg {
+	__u64	flags;
+	__u64	process_table;	/* second doubleword of partition table entry */
+};
+
+/* Flag values for KVM_PPC_CONFIGURE_V3_MMU */
+#define KVM_PPC_MMUV3_RADIX	1	/* 1 = radix mode, 0 = HPT */
+#define KVM_PPC_MMUV3_GTSE	2	/* global translation shootdown enb. */
+
+/* For KVM_PPC_GET_RMMU_INFO */
+struct kvm_ppc_rmmu_info {
+	struct kvm_ppc_radix_geom {
+		__u8	page_shift;
+		__u8	level_bits[4];
+		__u8	pad[3];
+	}	geometries[8];
+	__u32	ap_encodings[8];
+};
+
+/* For KVM_PPC_GET_CPU_CHAR */
+struct kvm_ppc_cpu_char {
+	__u64	character;		/* characteristics of the CPU */
+	__u64	behaviour;		/* recommended software behaviour */
+	__u64	character_mask;		/* valid bits in character */
+	__u64	behaviour_mask;		/* valid bits in behaviour */
+};
+
+/*
+ * Values for character and character_mask.
+ * These are identical to the values used by H_GET_CPU_CHARACTERISTICS.
+ */
+#define KVM_PPC_CPU_CHAR_SPEC_BAR_ORI31		(1ULL << 63)
+#define KVM_PPC_CPU_CHAR_BCCTRL_SERIALISED	(1ULL << 62)
+#define KVM_PPC_CPU_CHAR_L1D_FLUSH_ORI30	(1ULL << 61)
+#define KVM_PPC_CPU_CHAR_L1D_FLUSH_TRIG2	(1ULL << 60)
+#define KVM_PPC_CPU_CHAR_L1D_THREAD_PRIV	(1ULL << 59)
+#define KVM_PPC_CPU_CHAR_BR_HINT_HONOURED	(1ULL << 58)
+#define KVM_PPC_CPU_CHAR_MTTRIG_THR_RECONF	(1ULL << 57)
+#define KVM_PPC_CPU_CHAR_COUNT_CACHE_DIS	(1ULL << 56)
+#define KVM_PPC_CPU_CHAR_BCCTR_FLUSH_ASSIST	(1ull << 54)
+
+#define KVM_PPC_CPU_BEHAV_FAVOUR_SECURITY	(1ULL << 63)
+#define KVM_PPC_CPU_BEHAV_L1D_FLUSH_PR		(1ULL << 62)
+#define KVM_PPC_CPU_BEHAV_BNDS_CHK_SPEC_BAR	(1ULL << 61)
+#define KVM_PPC_CPU_BEHAV_FLUSH_COUNT_CACHE	(1ull << 58)
+
+/* Per-vcpu XICS interrupt controller state */
+#define KVM_REG_PPC_ICP_STATE	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x8c)
+
+#define  KVM_REG_PPC_ICP_CPPR_SHIFT	56	/* current proc priority */
+#define  KVM_REG_PPC_ICP_CPPR_MASK	0xff
+#define  KVM_REG_PPC_ICP_XISR_SHIFT	32	/* interrupt status field */
+#define  KVM_REG_PPC_ICP_XISR_MASK	0xffffff
+#define  KVM_REG_PPC_ICP_MFRR_SHIFT	24	/* pending IPI priority */
+#define  KVM_REG_PPC_ICP_MFRR_MASK	0xff
+#define  KVM_REG_PPC_ICP_PPRI_SHIFT	16	/* pending irq priority */
+#define  KVM_REG_PPC_ICP_PPRI_MASK	0xff
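+
+/*
+ * Sketch of how an ICP state value packs together, derived from the
+ * shifts above:
+ *
+ *   state = ((__u64)cppr << KVM_REG_PPC_ICP_CPPR_SHIFT) |
+ *	     ((__u64)xisr << KVM_REG_PPC_ICP_XISR_SHIFT) |
+ *	     ((__u64)mfrr << KVM_REG_PPC_ICP_MFRR_SHIFT) |
+ *	     ((__u64)ppri << KVM_REG_PPC_ICP_PPRI_SHIFT);
+ */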
+
+#define KVM_REG_PPC_VP_STATE	(KVM_REG_PPC | KVM_REG_SIZE_U128 | 0x8d)
+
+/* Device control API: PPC-specific devices */
+#define KVM_DEV_MPIC_GRP_MISC		1
+#define   KVM_DEV_MPIC_BASE_ADDR	0	/* 64-bit */
+
+#define KVM_DEV_MPIC_GRP_REGISTER	2	/* 32-bit */
+#define KVM_DEV_MPIC_GRP_IRQ_ACTIVE	3	/* 32-bit */
+
+/* One-Reg API: PPC-specific registers */
+#define KVM_REG_PPC_HIOR	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x1)
+#define KVM_REG_PPC_IAC1	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x2)
+#define KVM_REG_PPC_IAC2	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x3)
+#define KVM_REG_PPC_IAC3	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x4)
+#define KVM_REG_PPC_IAC4	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x5)
+#define KVM_REG_PPC_DAC1	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x6)
+#define KVM_REG_PPC_DAC2	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x7)
+#define KVM_REG_PPC_DABR	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x8)
+#define KVM_REG_PPC_DSCR	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x9)
+#define KVM_REG_PPC_PURR	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xa)
+#define KVM_REG_PPC_SPURR	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xb)
+#define KVM_REG_PPC_DAR		(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc)
+#define KVM_REG_PPC_DSISR	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0xd)
+#define KVM_REG_PPC_AMR		(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xe)
+#define KVM_REG_PPC_UAMOR	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xf)
+
+#define KVM_REG_PPC_MMCR0	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x10)
+#define KVM_REG_PPC_MMCR1	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x11)
+#define KVM_REG_PPC_MMCRA	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x12)
+#define KVM_REG_PPC_MMCR2	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x13)
+#define KVM_REG_PPC_MMCRS	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x14)
+#define KVM_REG_PPC_SIAR	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x15)
+#define KVM_REG_PPC_SDAR	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x16)
+#define KVM_REG_PPC_SIER	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x17)
+
+#define KVM_REG_PPC_PMC1	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x18)
+#define KVM_REG_PPC_PMC2	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x19)
+#define KVM_REG_PPC_PMC3	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x1a)
+#define KVM_REG_PPC_PMC4	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x1b)
+#define KVM_REG_PPC_PMC5	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x1c)
+#define KVM_REG_PPC_PMC6	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x1d)
+#define KVM_REG_PPC_PMC7	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x1e)
+#define KVM_REG_PPC_PMC8	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x1f)
+
+/* 32 floating-point registers */
+#define KVM_REG_PPC_FPR0	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x20)
+#define KVM_REG_PPC_FPR(n)	(KVM_REG_PPC_FPR0 + (n))
+#define KVM_REG_PPC_FPR31	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x3f)
+
+/* 32 VMX/Altivec vector registers */
+#define KVM_REG_PPC_VR0		(KVM_REG_PPC | KVM_REG_SIZE_U128 | 0x40)
+#define KVM_REG_PPC_VR(n)	(KVM_REG_PPC_VR0 + (n))
+#define KVM_REG_PPC_VR31	(KVM_REG_PPC | KVM_REG_SIZE_U128 | 0x5f)
+
+/* 32 double-width FP registers for VSX */
+/* High-order halves overlap with FP regs */
+#define KVM_REG_PPC_VSR0	(KVM_REG_PPC | KVM_REG_SIZE_U128 | 0x60)
+#define KVM_REG_PPC_VSR(n)	(KVM_REG_PPC_VSR0 + (n))
+#define KVM_REG_PPC_VSR31	(KVM_REG_PPC | KVM_REG_SIZE_U128 | 0x7f)
+
+/* FP and vector status/control registers */
+#define KVM_REG_PPC_FPSCR	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x80)
+/*
+ * VSCR register is documented as a 32-bit register in the ISA, but it can
+ * only be accessed via a vector register. Expose VSCR as a 32-bit register
+ * even though the kernel represents it as a 128-bit vector.
+ */
+#define KVM_REG_PPC_VSCR	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x81)
+
+/* Virtual processor areas */
+/* For SLB & DTL, address in high (first) half, length in low half */
+#define KVM_REG_PPC_VPA_ADDR	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x82)
+#define KVM_REG_PPC_VPA_SLB	(KVM_REG_PPC | KVM_REG_SIZE_U128 | 0x83)
+#define KVM_REG_PPC_VPA_DTL	(KVM_REG_PPC | KVM_REG_SIZE_U128 | 0x84)
+
+#define KVM_REG_PPC_EPCR	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x85)
+#define KVM_REG_PPC_EPR		(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x86)
+
+/* Timer Status Register OR/CLEAR interface */
+#define KVM_REG_PPC_OR_TSR	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x87)
+#define KVM_REG_PPC_CLEAR_TSR	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x88)
+#define KVM_REG_PPC_TCR		(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x89)
+#define KVM_REG_PPC_TSR		(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x8a)
+
+/* Debugging: Special instruction for software breakpoint */
+#define KVM_REG_PPC_DEBUG_INST	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x8b)
+
+/* MMU registers */
+#define KVM_REG_PPC_MAS0	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x8c)
+#define KVM_REG_PPC_MAS1	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x8d)
+#define KVM_REG_PPC_MAS2	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x8e)
+#define KVM_REG_PPC_MAS7_3	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x8f)
+#define KVM_REG_PPC_MAS4	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x90)
+#define KVM_REG_PPC_MAS6	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x91)
+#define KVM_REG_PPC_MMUCFG	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x92)
+/*
+ * TLBnCFG fields TLBnCFG_N_ENTRY and TLBnCFG_ASSOC can be changed only using
+ * KVM_CAP_SW_TLB ioctl
+ */
+#define KVM_REG_PPC_TLB0CFG	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x93)
+#define KVM_REG_PPC_TLB1CFG	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x94)
+#define KVM_REG_PPC_TLB2CFG	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x95)
+#define KVM_REG_PPC_TLB3CFG	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x96)
+#define KVM_REG_PPC_TLB0PS	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x97)
+#define KVM_REG_PPC_TLB1PS	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x98)
+#define KVM_REG_PPC_TLB2PS	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x99)
+#define KVM_REG_PPC_TLB3PS	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x9a)
+#define KVM_REG_PPC_EPTCFG	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x9b)
+
+/* Timebase offset */
+#define KVM_REG_PPC_TB_OFFSET	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x9c)
+
+/* POWER8 registers */
+#define KVM_REG_PPC_SPMC1	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x9d)
+#define KVM_REG_PPC_SPMC2	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x9e)
+#define KVM_REG_PPC_IAMR	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x9f)
+#define KVM_REG_PPC_TFHAR	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xa0)
+#define KVM_REG_PPC_TFIAR	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xa1)
+#define KVM_REG_PPC_TEXASR	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xa2)
+#define KVM_REG_PPC_FSCR	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xa3)
+#define KVM_REG_PPC_PSPB	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0xa4)
+#define KVM_REG_PPC_EBBHR	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xa5)
+#define KVM_REG_PPC_EBBRR	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xa6)
+#define KVM_REG_PPC_BESCR	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xa7)
+#define KVM_REG_PPC_TAR		(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xa8)
+#define KVM_REG_PPC_DPDES	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xa9)
+#define KVM_REG_PPC_DAWR	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xaa)
+#define KVM_REG_PPC_DAWRX	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xab)
+#define KVM_REG_PPC_CIABR	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xac)
+#define KVM_REG_PPC_IC		(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xad)
+#define KVM_REG_PPC_VTB		(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xae)
+#define KVM_REG_PPC_CSIGR	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xaf)
+#define KVM_REG_PPC_TACR	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xb0)
+#define KVM_REG_PPC_TCSCR	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xb1)
+#define KVM_REG_PPC_PID		(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xb2)
+#define KVM_REG_PPC_ACOP	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xb3)
+
+#define KVM_REG_PPC_VRSAVE	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0xb4)
+#define KVM_REG_PPC_LPCR	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0xb5)
+#define KVM_REG_PPC_LPCR_64	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xb5)
+#define KVM_REG_PPC_PPR		(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xb6)
+
+/* Architecture compatibility level */
+#define KVM_REG_PPC_ARCH_COMPAT	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0xb7)
+
+#define KVM_REG_PPC_DABRX	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0xb8)
+#define KVM_REG_PPC_WORT	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xb9)
+#define KVM_REG_PPC_SPRG9	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xba)
+#define KVM_REG_PPC_DBSR	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0xbb)
+
+/* POWER9 registers */
+#define KVM_REG_PPC_TIDR	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xbc)
+#define KVM_REG_PPC_PSSCR	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xbd)
+
+#define KVM_REG_PPC_DEC_EXPIRY	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xbe)
+#define KVM_REG_PPC_ONLINE	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0xbf)
+#define KVM_REG_PPC_PTCR	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc0)
+
+/* Transactional Memory checkpointed state:
+ * This is all GPRs, all VSX regs and a subset of SPRs
+ */
+#define KVM_REG_PPC_TM		(KVM_REG_PPC | 0x80000000)
+/* TM GPRs */
+#define KVM_REG_PPC_TM_GPR0	(KVM_REG_PPC_TM | KVM_REG_SIZE_U64 | 0)
+#define KVM_REG_PPC_TM_GPR(n)	(KVM_REG_PPC_TM_GPR0 + (n))
+#define KVM_REG_PPC_TM_GPR31	(KVM_REG_PPC_TM | KVM_REG_SIZE_U64 | 0x1f)
+/* TM VSX */
+#define KVM_REG_PPC_TM_VSR0	(KVM_REG_PPC_TM | KVM_REG_SIZE_U128 | 0x20)
+#define KVM_REG_PPC_TM_VSR(n)	(KVM_REG_PPC_TM_VSR0 + (n))
+#define KVM_REG_PPC_TM_VSR63	(KVM_REG_PPC_TM | KVM_REG_SIZE_U128 | 0x5f)
+/* TM SPRS */
+#define KVM_REG_PPC_TM_CR	(KVM_REG_PPC_TM | KVM_REG_SIZE_U64 | 0x60)
+#define KVM_REG_PPC_TM_LR	(KVM_REG_PPC_TM | KVM_REG_SIZE_U64 | 0x61)
+#define KVM_REG_PPC_TM_CTR	(KVM_REG_PPC_TM | KVM_REG_SIZE_U64 | 0x62)
+#define KVM_REG_PPC_TM_FPSCR	(KVM_REG_PPC_TM | KVM_REG_SIZE_U64 | 0x63)
+#define KVM_REG_PPC_TM_AMR	(KVM_REG_PPC_TM | KVM_REG_SIZE_U64 | 0x64)
+#define KVM_REG_PPC_TM_PPR	(KVM_REG_PPC_TM | KVM_REG_SIZE_U64 | 0x65)
+#define KVM_REG_PPC_TM_VRSAVE	(KVM_REG_PPC_TM | KVM_REG_SIZE_U64 | 0x66)
+#define KVM_REG_PPC_TM_VSCR	(KVM_REG_PPC_TM | KVM_REG_SIZE_U32 | 0x67)
+#define KVM_REG_PPC_TM_DSCR	(KVM_REG_PPC_TM | KVM_REG_SIZE_U64 | 0x68)
+#define KVM_REG_PPC_TM_TAR	(KVM_REG_PPC_TM | KVM_REG_SIZE_U64 | 0x69)
+#define KVM_REG_PPC_TM_XER	(KVM_REG_PPC_TM | KVM_REG_SIZE_U64 | 0x6a)
+
+/* PPC64 eXternal Interrupt Controller Specification */
+#define KVM_DEV_XICS_GRP_SOURCES	1	/* 64-bit source attributes */
+
+/* Layout of 64-bit source attribute values */
+#define  KVM_XICS_DESTINATION_SHIFT	0
+#define  KVM_XICS_DESTINATION_MASK	0xffffffffULL
+#define  KVM_XICS_PRIORITY_SHIFT	32
+#define  KVM_XICS_PRIORITY_MASK		0xff
+#define  KVM_XICS_LEVEL_SENSITIVE	(1ULL << 40)
+#define  KVM_XICS_MASKED		(1ULL << 41)
+#define  KVM_XICS_PENDING		(1ULL << 42)
+#define  KVM_XICS_PRESENTED		(1ULL << 43)
+#define  KVM_XICS_QUEUED		(1ULL << 44)
+
+/* POWER9 XIVE Native Interrupt Controller */
+#define KVM_DEV_XIVE_GRP_CTRL		1
+#define   KVM_DEV_XIVE_RESET		1
+#define   KVM_DEV_XIVE_EQ_SYNC		2
+#define KVM_DEV_XIVE_GRP_SOURCE		2	/* 64-bit source identifier */
+#define KVM_DEV_XIVE_GRP_SOURCE_CONFIG	3	/* 64-bit source identifier */
+#define KVM_DEV_XIVE_GRP_EQ_CONFIG	4	/* 64-bit EQ identifier */
+#define KVM_DEV_XIVE_GRP_SOURCE_SYNC	5       /* 64-bit source identifier */
+
+/* Layout of 64-bit XIVE source attribute values */
+#define KVM_XIVE_LEVEL_SENSITIVE	(1ULL << 0)
+#define KVM_XIVE_LEVEL_ASSERTED		(1ULL << 1)
+
+/* Layout of 64-bit XIVE source configuration attribute values */
+#define KVM_XIVE_SOURCE_PRIORITY_SHIFT	0
+#define KVM_XIVE_SOURCE_PRIORITY_MASK	0x7
+#define KVM_XIVE_SOURCE_SERVER_SHIFT	3
+#define KVM_XIVE_SOURCE_SERVER_MASK	0xfffffff8ULL
+#define KVM_XIVE_SOURCE_MASKED_SHIFT	32
+#define KVM_XIVE_SOURCE_MASKED_MASK	0x100000000ULL
+#define KVM_XIVE_SOURCE_EISN_SHIFT	33
+#define KVM_XIVE_SOURCE_EISN_MASK	0xfffffffe00000000ULL
+
+/* Layout of 64-bit EQ identifier */
+#define KVM_XIVE_EQ_PRIORITY_SHIFT	0
+#define KVM_XIVE_EQ_PRIORITY_MASK	0x7
+#define KVM_XIVE_EQ_SERVER_SHIFT	3
+#define KVM_XIVE_EQ_SERVER_MASK		0xfffffff8ULL
+
+/* Layout of EQ configuration values (64 bytes) */
+struct kvm_ppc_xive_eq {
+	__u32 flags;
+	__u32 qshift;
+	__u64 qaddr;
+	__u32 qtoggle;
+	__u32 qindex;
+	__u8  pad[40];
+};
+
+#define KVM_XIVE_EQ_ALWAYS_NOTIFY	0x00000001
+
+#define KVM_XIVE_TIMA_PAGE_OFFSET	0
+#define KVM_XIVE_ESB_PAGE_OFFSET	4
+
+#endif /* __LINUX_KVM_POWERPC_H */
diff --git a/kvmtool/powerpc/include/kvm/barrier.h b/kvmtool/powerpc/include/kvm/barrier.h
new file mode 100644
index 0000000..4b708ae
--- /dev/null
+++ b/kvmtool/powerpc/include/kvm/barrier.h
@@ -0,0 +1,8 @@
+#ifndef _KVM_BARRIER_H_
+#define _KVM_BARRIER_H_
+
+#define mb()   asm volatile ("sync" : : : "memory")
+#define rmb()  asm volatile ("sync" : : : "memory")
+#define wmb()  asm volatile ("sync" : : : "memory")
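+
+/*
+ * "sync" is a full barrier on PPC, so it conservatively serves all three
+ * flavours; lighter variants (e.g. lwsync) could suffice for rmb()/wmb()
+ * on cacheable memory, at the cost of simplicity.
+ */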
+
+#endif /* _KVM_BARRIER_H_ */
diff --git a/kvmtool/powerpc/include/kvm/fdt-arch.h b/kvmtool/powerpc/include/kvm/fdt-arch.h
new file mode 100644
index 0000000..d48c055
--- /dev/null
+++ b/kvmtool/powerpc/include/kvm/fdt-arch.h
@@ -0,0 +1,6 @@
+#ifndef KVM__KVM_FDT_H
+#define KVM__KVM_FDT_H
+
+enum phandles {PHANDLE_RESERVED = 0, PHANDLE_XICP, PHANDLES_MAX};
+
+#endif /* KVM__KVM_FDT_H */
diff --git a/kvmtool/powerpc/include/kvm/kvm-arch.h b/kvmtool/powerpc/include/kvm/kvm-arch.h
new file mode 100644
index 0000000..26d440b
--- /dev/null
+++ b/kvmtool/powerpc/include/kvm/kvm-arch.h
@@ -0,0 +1,67 @@
+/*
+ * PPC64 architecture-specific definitions
+ *
+ * Copyright 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#ifndef KVM__KVM_ARCH_H
+#define KVM__KVM_ARCH_H
+
+#include <stdbool.h>
+#include <linux/types.h>
+#include <time.h>
+
+/*
+ * MMIO lives after RAM, but it'd be nice if it didn't constantly move.
+ * Choose a suitably high address, e.g. 63T...  This limits RAM size.
+ */
+#define PPC_MMIO_START			0x3F0000000000UL
+#define PPC_MMIO_SIZE			0x010000000000UL
+
+#define KERNEL_LOAD_ADDR        	0x0000000000000000
+#define KERNEL_START_ADDR       	0x0000000000000000
+#define KERNEL_SECONDARY_START_ADDR     0x0000000000000060
+#define INITRD_LOAD_ADDR        	0x0000000002800000
+
+#define RTAS_MAX_SIZE           	0x10000
+
+#define TIMEBASE_FREQ           	512000000ULL
+
+#define KVM_MMIO_START			PPC_MMIO_START
+
+/*
+ * This is the address that pci_get_io_port_block() starts allocating
+ * from.  Note that this is a PCI bus address.
+ */
+#define KVM_IOPORT_AREA			0x0
+#define KVM_PCI_CFG_AREA		0x1000000
+#define KVM_PCI_MMIO_AREA		0x2000000
+#define KVM_VIRTIO_MMIO_AREA		0x3000000
+
+#define KVM_IRQ_OFFSET			16
+
+#define KVM_VM_TYPE			0
+
+#define KVM_IOEVENTFD_HAS_PIO		0
+
+#define VIRTIO_DEFAULT_TRANS(kvm)	VIRTIO_PCI
+
+struct spapr_phb;
+
+struct kvm_arch {
+	u64			sdr1;
+	u32			pvr;
+	unsigned long		rtas_gra;
+	unsigned long		rtas_size;
+	unsigned long		fdt_gra;
+	unsigned long		initrd_gra;
+	unsigned long		initrd_size;
+	struct icp_state	*icp;
+	struct spapr_phb	*phb;
+};
+
+#endif /* KVM__KVM_ARCH_H */
diff --git a/kvmtool/powerpc/include/kvm/kvm-config-arch.h b/kvmtool/powerpc/include/kvm/kvm-config-arch.h
new file mode 100644
index 0000000..60f61de
--- /dev/null
+++ b/kvmtool/powerpc/include/kvm/kvm-config-arch.h
@@ -0,0 +1,7 @@
+#ifndef KVM__KVM_CONFIG_ARCH_H
+#define KVM__KVM_CONFIG_ARCH_H
+
+struct kvm_config_arch {
+};
+
+#endif /* KVM__KVM_CONFIG_ARCH_H */
diff --git a/kvmtool/powerpc/include/kvm/kvm-cpu-arch.h b/kvmtool/powerpc/include/kvm/kvm-cpu-arch.h
new file mode 100644
index 0000000..a69e0cc
--- /dev/null
+++ b/kvmtool/powerpc/include/kvm/kvm-cpu-arch.h
@@ -0,0 +1,79 @@
+/*
+ * PPC64 cpu-specific definitions
+ *
+ * Copyright 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#ifndef KVM__KVM_CPU_ARCH_H
+#define KVM__KVM_CPU_ARCH_H
+
+/* Architecture-specific kvm_cpu definitions. */
+
+#include <linux/kvm.h>	/* for struct kvm_regs */
+#include <stdbool.h>
+#include <pthread.h>
+
+#define MSR_SF		(1ULL<<63)
+#define MSR_HV		(1ULL<<60)
+#define MSR_VEC		(1ULL<<25)
+#define MSR_VSX		(1ULL<<23)
+#define MSR_POW		(1ULL<<18)
+#define MSR_EE		(1ULL<<15)
+#define MSR_PR		(1ULL<<14)
+#define MSR_FP		(1ULL<<13)
+#define MSR_ME		(1ULL<<12)
+#define MSR_FE0		(1ULL<<11)
+#define MSR_SE		(1ULL<<10)
+#define MSR_BE		(1ULL<<9)
+#define MSR_FE1		(1ULL<<8)
+#define MSR_IR		(1ULL<<5)
+#define MSR_DR		(1ULL<<4)
+#define MSR_PMM		(1ULL<<2)
+#define MSR_RI		(1ULL<<1)
+#define MSR_LE		(1ULL<<0)
+
+#define POWER7_EXT_IRQ	0
+
+#define LPCR_ILE	(1 << (63-38))
+
+struct kvm;
+
+struct kvm_cpu {
+	pthread_t		thread;		/* VCPU thread */
+
+	unsigned long		cpu_id;
+
+	struct kvm		*kvm;		/* parent KVM */
+	int			vcpu_fd;	/* For VCPU ioctls() */
+	struct kvm_run		*kvm_run;
+	struct kvm_cpu_task	*task;
+
+	struct kvm_regs		regs;
+	struct kvm_sregs	sregs;
+	struct kvm_fpu		fpu;
+
+	u8			is_running;
+	u8			paused;
+	u8			needs_nmi;
+	/*
+	 * Although PPC KVM doesn't yet support coalesced MMIO, generic code
+	 * needs this in our kvm_cpu:
+	 */
+	struct kvm_coalesced_mmio_ring  *ring;
+};
+
+void kvm_cpu__irq(struct kvm_cpu *vcpu, int pin, int level);
+
+/* This is never actually called on PPC. */
+static inline bool kvm_cpu__emulate_io(struct kvm_cpu *vcpu, u16 port, void *data, int direction, int size, u32 count)
+{
+	return false;
+}
+
+bool kvm_cpu__emulate_mmio(struct kvm_cpu *vcpu, u64 phys_addr, u8 *data, u32 len, u8 is_write);
+
+#endif /* KVM__KVM_CPU_ARCH_H */
diff --git a/kvmtool/powerpc/ioport.c b/kvmtool/powerpc/ioport.c
new file mode 100644
index 0000000..0c188b6
--- /dev/null
+++ b/kvmtool/powerpc/ioport.c
@@ -0,0 +1,23 @@
+/*
+ * PPC64 ioport platform setup.  There isn't any! :-)
+ *
+ * Copyright 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include "kvm/ioport.h"
+
+#include <stdlib.h>
+
+int ioport__setup_arch(struct kvm *kvm)
+{
+	/* PPC has no legacy ioports to set up */
+	return 0;
+}
+
+void ioport__map_irq(u8 *irq)
+{
+}
diff --git a/kvmtool/powerpc/kvm-cpu.c b/kvmtool/powerpc/kvm-cpu.c
new file mode 100644
index 0000000..461e0a9
--- /dev/null
+++ b/kvmtool/powerpc/kvm-cpu.c
@@ -0,0 +1,290 @@
+/*
+ * PPC64 processor support
+ *
+ * Copyright 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include "kvm/kvm-cpu.h"
+
+#include "kvm/symbol.h"
+#include "kvm/util.h"
+#include "kvm/kvm.h"
+
+#include "spapr.h"
+#include "spapr_pci.h"
+#include "xics.h"
+
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <stdio.h>
+#include <assert.h>
+
+static int debug_fd;
+
+void kvm_cpu__set_debug_fd(int fd)
+{
+	debug_fd = fd;
+}
+
+int kvm_cpu__get_debug_fd(void)
+{
+	return debug_fd;
+}
+
+static struct kvm_cpu *kvm_cpu__new(struct kvm *kvm)
+{
+	struct kvm_cpu *vcpu;
+
+	vcpu		= calloc(1, sizeof *vcpu);
+	if (!vcpu)
+		return NULL;
+
+	vcpu->kvm	= kvm;
+
+	return vcpu;
+}
+
+void kvm_cpu__delete(struct kvm_cpu *vcpu)
+{
+	free(vcpu);
+}
+
+struct kvm_cpu *kvm_cpu__arch_init(struct kvm *kvm, unsigned long cpu_id)
+{
+	struct kvm_cpu *vcpu;
+	int mmap_size;
+	struct kvm_enable_cap papr_cap = { .cap = KVM_CAP_PPC_PAPR };
+
+	vcpu		= kvm_cpu__new(kvm);
+	if (!vcpu)
+		return NULL;
+
+	vcpu->cpu_id	= cpu_id;
+
+	vcpu->vcpu_fd = ioctl(vcpu->kvm->vm_fd, KVM_CREATE_VCPU, cpu_id);
+	if (vcpu->vcpu_fd < 0)
+		die_perror("KVM_CREATE_VCPU ioctl");
+
+	mmap_size = ioctl(vcpu->kvm->sys_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
+	if (mmap_size < 0)
+		die_perror("KVM_GET_VCPU_MMAP_SIZE ioctl");
+
+	vcpu->kvm_run = mmap(NULL, mmap_size, PROT_RW, MAP_SHARED, vcpu->vcpu_fd, 0);
+	if (vcpu->kvm_run == MAP_FAILED)
+		die("unable to mmap vcpu fd");
+
+	if (ioctl(vcpu->vcpu_fd, KVM_ENABLE_CAP, &papr_cap) < 0)
+		die("unable to enable PAPR capability");
+
+	/*
+	 * We start all CPUs, directing non-primary threads into the kernel's
+	 * secondary start point.  When we come to support SLOF, we will start
+	 * only one CPU, and SLOF will make an RTAS call asking us to start
+	 * the others.  (FIXME: make this more generic and interface with
+	 * whichever firmware a platform may be using.)
+	 */
+	vcpu->is_running = true;
+
+	return vcpu;
+}
+
+static void kvm_cpu__setup_fpu(struct kvm_cpu *vcpu)
+{
+	/* Don't have to do anything, there's no expected FPU state. */
+}
+
+static void kvm_cpu__setup_regs(struct kvm_cpu *vcpu)
+{
+	/*
+	 * FIXME: This assumes PPC64 and Linux guest.  It doesn't use the
+	 * OpenFirmware entry method, but instead the "embedded" entry which
+	 * passes the FDT address directly.
+	 */
+	struct kvm_regs *r = &vcpu->regs;
+
+	if (vcpu->cpu_id == 0) {
+		r->pc = KERNEL_START_ADDR;
+		r->gpr[3] = vcpu->kvm->arch.fdt_gra;
+		r->gpr[5] = 0;
+	} else {
+		r->pc = KERNEL_SECONDARY_START_ADDR;
+		r->gpr[3] = vcpu->cpu_id;
+	}
+	r->msr = MSR_SF | MSR_ME;	/* 64bit, non-HV, machine check enabled */
+
+	if (ioctl(vcpu->vcpu_fd, KVM_SET_REGS, &vcpu->regs) < 0)
+		die_perror("KVM_SET_REGS failed");
+}
+
+static void kvm_cpu__setup_sregs(struct kvm_cpu *vcpu)
+{
+	/*
+	 * Set up sregs to initialise SDR1/PVR/HIOR on PPC64 SPAPR platforms
+	 * using PR KVM.  (Technically, this is all ignored on SPAPR HV KVM.)
+	 * Different setup is required for non-PR non-SPAPR platforms!
+	 * (FIXME.)
+	 */
+	struct kvm_sregs sregs;
+	struct kvm_one_reg reg = {};
+	u64 value;
+
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_SREGS, &sregs) < 0)
+		die("KVM_GET_SREGS failed");
+
+	sregs.u.s.sdr1 = vcpu->kvm->arch.sdr1;
+	sregs.pvr = vcpu->kvm->arch.pvr;
+
+	if (ioctl(vcpu->vcpu_fd, KVM_SET_SREGS, &sregs) < 0)
+		die("KVM_SET_SREGS failed");
+
+	reg.id = KVM_REG_PPC_HIOR;
+	value = 0;
+	reg.addr = (u64)(unsigned long)&value;
+	if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0)
+		die("KVM_SET_ONE_REG failed");
+}
+
+/**
+ * kvm_cpu__reset_vcpu - reset virtual CPU to a known state
+ */
+void kvm_cpu__reset_vcpu(struct kvm_cpu *vcpu)
+{
+	kvm_cpu__setup_regs(vcpu);
+	kvm_cpu__setup_sregs(vcpu);
+	kvm_cpu__setup_fpu(vcpu);
+}
+
+/* kvm_cpu__irq - set KVM's IRQ flag on this vcpu */
+void kvm_cpu__irq(struct kvm_cpu *vcpu, int pin, int level)
+{
+	unsigned int virq = level ? KVM_INTERRUPT_SET_LEVEL : KVM_INTERRUPT_UNSET;
+
+	/* FIXME: POWER-specific */
+	if (pin != POWER7_EXT_IRQ)
+		return;
+	if (ioctl(vcpu->vcpu_fd, KVM_INTERRUPT, &virq) < 0)
+		pr_warning("Could not KVM_INTERRUPT.");
+}
+
+void kvm_cpu__arch_nmi(struct kvm_cpu *cpu)
+{
+}
+
+bool kvm_cpu__handle_exit(struct kvm_cpu *vcpu)
+{
+	bool ret = true;
+	struct kvm_run *run = vcpu->kvm_run;
+
+	switch (run->exit_reason) {
+	case KVM_EXIT_PAPR_HCALL:
+		run->papr_hcall.ret = spapr_hypercall(vcpu, run->papr_hcall.nr,
+						      (target_ulong*)run->papr_hcall.args);
+		break;
+	default:
+		ret = false;
+	}
+	return ret;
+}
+
+bool kvm_cpu__emulate_mmio(struct kvm_cpu *vcpu, u64 phys_addr, u8 *data, u32 len, u8 is_write)
+{
+	/*
+	 * FIXME: This function will need to be split in order to support
+	 * various PowerPC platforms/PHB types, etc.  It currently assumes SPAPR
+	 * PPC64 guest.
+	 */
+	bool ret = false;
+
+	if ((phys_addr >= SPAPR_PCI_WIN_START) &&
+	    (phys_addr < SPAPR_PCI_WIN_END)) {
+		ret = spapr_phb_mmio(vcpu, phys_addr, data, len, is_write);
+	} else {
+		pr_warning("MMIO %s unknown address %llx (size %d)!\n",
+			   is_write ? "write to" : "read from",
+			   phys_addr, len);
+	}
+	return ret;
+}
+
+#define CONDSTR_BIT(m, b) (((m) & MSR_##b) ? #b" " : "")
+
+void kvm_cpu__show_registers(struct kvm_cpu *vcpu)
+{
+	struct kvm_regs regs;
+	struct kvm_sregs sregs;
+	int r;
+
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_REGS, &regs) < 0)
+		die("KVM_GET_REGS failed");
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_SREGS, &sregs) < 0)
+		die("KVM_GET_SREGS failed");
+
+	dprintf(debug_fd, "\n Registers:\n");
+	dprintf(debug_fd, " NIP:   %016llx  MSR:   %016llx "
+		"( %s%s%s%s%s%s%s%s%s%s%s%s)\n",
+		regs.pc, regs.msr,
+		CONDSTR_BIT(regs.msr, SF),
+		CONDSTR_BIT(regs.msr, HV), /* ! */
+		CONDSTR_BIT(regs.msr, VEC),
+		CONDSTR_BIT(regs.msr, VSX),
+		CONDSTR_BIT(regs.msr, EE),
+		CONDSTR_BIT(regs.msr, PR),
+		CONDSTR_BIT(regs.msr, FP),
+		CONDSTR_BIT(regs.msr, ME),
+		CONDSTR_BIT(regs.msr, IR),
+		CONDSTR_BIT(regs.msr, DR),
+		CONDSTR_BIT(regs.msr, RI),
+		CONDSTR_BIT(regs.msr, LE));
+	dprintf(debug_fd, " CTR:   %016llx  LR:    %016llx  CR:   %08llx\n",
+		regs.ctr, regs.lr, regs.cr);
+	dprintf(debug_fd, " SRR0:  %016llx  SRR1:  %016llx  XER:  %016llx\n",
+		regs.srr0, regs.srr1, regs.xer);
+	dprintf(debug_fd, " SPRG0: %016llx  SPRG1: %016llx\n",
+		regs.sprg0, regs.sprg1);
+	dprintf(debug_fd, " SPRG2: %016llx  SPRG3: %016llx\n",
+		regs.sprg2, regs.sprg3);
+	dprintf(debug_fd, " SPRG4: %016llx  SPRG5: %016llx\n",
+		regs.sprg4, regs.sprg5);
+	dprintf(debug_fd, " SPRG6: %016llx  SPRG7: %016llx\n",
+		regs.sprg6, regs.sprg7);
+	dprintf(debug_fd, " GPRs:\n ");
+	for (r = 0; r < 32; r++) {
+		dprintf(debug_fd, "%016llx  ", regs.gpr[r]);
+		if ((r & 3) == 3)
+			dprintf(debug_fd, "\n ");
+	}
+	dprintf(debug_fd, "\n");
+
+	/* FIXME: Assumes SLB-based (book3s) guest */
+	for (r = 0; r < 32; r++) {
+		dprintf(debug_fd, " SLB%02d  %016llx %016llx\n", r,
+			sregs.u.s.ppc64.slb[r].slbe,
+			sregs.u.s.ppc64.slb[r].slbv);
+	}
+	dprintf(debug_fd, "----------\n");
+}
+
+void kvm_cpu__show_code(struct kvm_cpu *vcpu)
+{
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_REGS, &vcpu->regs) < 0)
+		die("KVM_GET_REGS failed");
+
+	/* FIXME: Dump/disassemble some code...! */
+
+	dprintf(debug_fd, "\n Stack:\n");
+	dprintf(debug_fd,   " ------\n");
+	/* Only works in real mode: */
+	kvm__dump_mem(vcpu->kvm, vcpu->regs.gpr[1], 32, debug_fd);
+}
+
+void kvm_cpu__show_page_tables(struct kvm_cpu *vcpu)
+{
+	/* Does nothing yet */
+}
diff --git a/kvmtool/powerpc/kvm.c b/kvmtool/powerpc/kvm.c
new file mode 100644
index 0000000..702d67d
--- /dev/null
+++ b/kvmtool/powerpc/kvm.c
@@ -0,0 +1,521 @@
+/*
+ * PPC64 (SPAPR) platform support
+ *
+ * Copyright 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation.
+ *
+ * Portions of FDT setup borrowed from QEMU, copyright 2010 David Gibson, IBM
+ * Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include "kvm/fdt.h"
+#include "kvm/kvm.h"
+#include "kvm/util.h"
+#include "cpu_info.h"
+
+#include "spapr.h"
+#include "spapr_hvcons.h"
+#include "spapr_pci.h"
+
+#include <linux/kvm.h>
+
+#include <sys/types.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <asm/unistd.h>
+#include <errno.h>
+
+#include <linux/byteorder.h>
+
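+/* Guest hashed page table size: 2^24 bytes = 16MB */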
+#define HPT_ORDER 24
+
+#define HUGETLBFS_PATH "/var/lib/hugetlbfs/global/pagesize-16MB/"
+
+static char kern_cmdline[2048];
+
+struct kvm_ext kvm_req_ext[] = {
+	{ DEFINE_KVM_EXT(KVM_CAP_PPC_UNSET_IRQ) },
+	{ DEFINE_KVM_EXT(KVM_CAP_PPC_IRQ_LEVEL) },
+	{ 0, 0 }
+};
+
+static uint32_t mfpvr(void)
+{
+	uint32_t r;
+	asm volatile ("mfpvr %0" : "=r"(r));
+	return r;
+}
+
+bool kvm__arch_cpu_supports_vm(void)
+{
+	return true;
+}
+
+void kvm__init_ram(struct kvm *kvm)
+{
+	u64	phys_start, phys_size;
+	void	*host_mem;
+
+	phys_start = 0;
+	phys_size  = kvm->ram_size;
+	host_mem   = kvm->ram_start;
+
+	/*
+	 * We put MMIO at PPC_MMIO_START, high up.  Make sure that this doesn't
+	 * crash into the end of RAM -- on PPC64 at least, this is so high
+	 * (63TB!) that this is unlikely.
+	 */
+	if (phys_size >= PPC_MMIO_START)
+		die("Too much memory (%lld, what a nice problem): "
+		    "overlaps MMIO!\n",
+		    phys_size);
+
+	kvm__register_ram(kvm, phys_start, phys_size, host_mem);
+}
+
+void kvm__arch_set_cmdline(char *cmdline, bool video)
+{
+	/* We don't need anything unusual in here. */
+}
+
+/* Architecture-specific KVM init */
+void kvm__arch_init(struct kvm *kvm, const char *hugetlbfs_path, u64 ram_size)
+{
+	int cap_ppc_rma;
+	unsigned long hpt;
+
+	kvm->ram_size		= ram_size;
+
+	/* Map "default" hugetblfs path to the standard 16M mount point */
+	if (hugetlbfs_path && !strcmp(hugetlbfs_path, "default"))
+		hugetlbfs_path = HUGETLBFS_PATH;
+
+	kvm->ram_start = mmap_anon_or_hugetlbfs(kvm, hugetlbfs_path, kvm->ram_size);
+
+	if (kvm->ram_start == MAP_FAILED)
+		die("Couldn't map %lld bytes for RAM (%d)\n",
+		    kvm->ram_size, errno);
+
+	/* FDT goes at top of memory, RTAS just below */
+	kvm->arch.fdt_gra = kvm->ram_size - FDT_MAX_SIZE;
+	/* FIXME: Not all PPC systems have RTAS */
+	kvm->arch.rtas_gra = kvm->arch.fdt_gra - RTAS_MAX_SIZE;
+	madvise(kvm->ram_start, kvm->ram_size, MADV_MERGEABLE);
+
+	/* FIXME:  SPAPR-PR specific; allocate a guest HPT. */
+	if (posix_memalign((void **)&hpt, (1<<HPT_ORDER), (1<<HPT_ORDER)))
+		die("Can't allocate %d bytes for HPT\n", (1<<HPT_ORDER));
+
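+	/*
+	 * SDR1: the upper bits hold the 256KB-aligned HPT base (HTABORG);
+	 * the low bits hold HTABSIZE = log2(HPT size in bytes) - 18.
+	 */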
+	kvm->arch.sdr1 = ((hpt + 0x3ffffULL) & ~0x3ffffULL) | (HPT_ORDER-18);
+
+	kvm->arch.pvr = mfpvr();
+
+	/* FIXME: This is book3s-specific */
+	cap_ppc_rma = ioctl(kvm->sys_fd, KVM_CHECK_EXTENSION, KVM_CAP_PPC_RMA);
+	if (cap_ppc_rma == 2)
+		die("Need contiguous RMA allocation on this hardware, "
+		    "which is not yet supported.");
+
+	/* Do these before FDT setup, IRQ setup, etc. */
+	/* FIXME: SPAPR-specific */
+	hypercall_init();
+	register_core_rtas();
+	/* Now that hypercalls are initialised, register a couple for the console: */
+	spapr_hvcons_init();
+	spapr_create_phb(kvm, "pci", SPAPR_PCI_BUID,
+			 SPAPR_PCI_MEM_WIN_ADDR,
+			 SPAPR_PCI_MEM_WIN_SIZE,
+			 SPAPR_PCI_IO_WIN_ADDR,
+			 SPAPR_PCI_IO_WIN_SIZE);
+}
+
+void kvm__arch_delete_ram(struct kvm *kvm)
+{
+	munmap(kvm->ram_start, kvm->ram_size);
+}
+
+void kvm__irq_trigger(struct kvm *kvm, int irq)
+{
+	kvm__irq_line(kvm, irq, 1);
+	kvm__irq_line(kvm, irq, 0);
+}
+
+void kvm__arch_read_term(struct kvm *kvm)
+{
+	/* FIXME: Should register callbacks to platform-specific polls */
+	spapr_hvcons_poll(kvm);
+}
+
+bool kvm__arch_load_kernel_image(struct kvm *kvm, int fd_kernel, int fd_initrd,
+				 const char *kernel_cmdline)
+{
+	void *p;
+	void *k_start;
+	ssize_t filesize;
+
+	p = k_start = guest_flat_to_host(kvm, KERNEL_LOAD_ADDR);
+
+	filesize = read_file(fd_kernel, p, INITRD_LOAD_ADDR - KERNEL_LOAD_ADDR);
+	if (filesize < 0) {
+		if (errno == ENOMEM)
+			die("Kernel overlaps initrd!");
+
+		die_perror("kernel read");
+	}
+	pr_info("Loaded kernel to 0x%x (%ld bytes)", KERNEL_LOAD_ADDR,
+		filesize);
+	if (fd_initrd != -1) {
+		if (p-k_start > INITRD_LOAD_ADDR)
+			die("Kernel overlaps initrd!");
+
+		/* The initrd is loaded at the fixed INITRD_LOAD_ADDR, above the kernel. */
+		p = guest_flat_to_host(kvm, INITRD_LOAD_ADDR);
+
+		filesize = read_file(fd_initrd, p,
+			       (kvm->ram_start + kvm->ram_size) - p);
+		if (filesize < 0) {
+			if (errno == ENOMEM)
+				die("initrd too big to contain in guest RAM.\n");
+			die_perror("initrd read");
+		}
+
+		pr_info("Loaded initrd to 0x%x (%ld bytes)",
+			INITRD_LOAD_ADDR, filesize);
+		kvm->arch.initrd_gra = INITRD_LOAD_ADDR;
+		kvm->arch.initrd_size = filesize;
+	} else {
+		kvm->arch.initrd_size = 0;
+	}
+	strncpy(kern_cmdline, kernel_cmdline, 2048);
+	kern_cmdline[2047] = '\0';
+
+	return true;
+}
+
+struct fdt_prop {
+	void *value;
+	int size;
+};
+
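+/*
+ * Build the "ibm,segment-page-sizes" property payload from KVM's MMU info.
+ * The property is a flat list of be32 cells, one record per base page size:
+ *   { page_shift, slb_enc, nr_encodings, { page_shift, pte_enc } x nr }
+ * e.g. a 4K base page with one 4K PTE encoding contributes
+ *   { 0x0c, slb_enc, 1, 0x0c, pte_enc }.
+ */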
+static void generate_segment_page_sizes(struct kvm_ppc_smmu_info *info, struct fdt_prop *prop)
+{
+	struct kvm_ppc_one_seg_page_size *sps;
+	int i, j, size;
+	u32 *p;
+
+	for (size = 0, i = 0; i < KVM_PPC_PAGE_SIZES_MAX_SZ; i++) {
+		sps = &info->sps[i];
+
+		if (sps->page_shift == 0)
+			break;
+
+		/* page shift, slb enc & count */
+		size += 3;
+
+		for (j = 0; j < KVM_PPC_PAGE_SIZES_MAX_SZ; j++) {
+			if (info->sps[i].enc[j].page_shift == 0)
+				break;
+
+			/* page shift & pte enc */
+			size += 2;
+		}
+	}
+
+	if (!size) {
+		prop->value = NULL;
+		prop->size = 0;
+		return;
+	}
+
+	/* Convert size to bytes */
+	prop->size = size * sizeof(u32);
+
+	prop->value = malloc(prop->size);
+	if (!prop->value)
+		die_perror("malloc failed");
+
+	p = (u32 *)prop->value;
+	for (i = 0; i < KVM_PPC_PAGE_SIZES_MAX_SZ; i++) {
+		sps = &info->sps[i];
+
+		if (sps->page_shift == 0)
+			break;
+
+		*p++ = cpu_to_be32(sps->page_shift);
+		*p++ = cpu_to_be32(sps->slb_enc);
+
+		for (j = 0; j < KVM_PPC_PAGE_SIZES_MAX_SZ; j++)
+			if (!info->sps[i].enc[j].page_shift)
+				break;
+
+		*p++ = cpu_to_be32(j);	/* count of enc */
+
+		for (j = 0; j < KVM_PPC_PAGE_SIZES_MAX_SZ; j++) {
+			if (!info->sps[i].enc[j].page_shift)
+				break;
+
+			*p++ = cpu_to_be32(info->sps[i].enc[j].page_shift);
+			*p++ = cpu_to_be32(info->sps[i].enc[j].pte_enc);
+		}
+	}
+}
+
+#define SMT_THREADS 4
+
+/*
+ * Set up the FDT for the kernel: This function is currently fairly SPAPR-heavy,
+ * and whilst most PPC targets will require CPU/memory nodes, others like RTAS
+ * should eventually be added separately.
+ */
+static int setup_fdt(struct kvm *kvm)
+{
+	uint64_t 	mem_reg_property[] = { 0, cpu_to_be64(kvm->ram_size) };
+	int 		smp_cpus = kvm->nrcpus;
+	uint32_t	int_server_ranges_prop[] = {0, cpu_to_be32(smp_cpus)};
+	char 		hypertas_prop_kvm[] = "hcall-pft\0hcall-term\0"
+		"hcall-dabr\0hcall-interrupt\0hcall-tce\0hcall-vio\0"
+		"hcall-splpar\0hcall-bulk\0hcall-set-mode";
+	int 		i, j;
+	char 		cpu_name[30];
+	u8		staging_fdt[FDT_MAX_SIZE];
+	struct cpu_info *cpu_info = find_cpu_info(kvm);
+	struct fdt_prop segment_page_sizes;
+	u32 segment_sizes_1T[] = {cpu_to_be32(0x1c), cpu_to_be32(0x28), 0xffffffff, 0xffffffff};
+
+	/* Generate an appropriate DT at kvm->arch.fdt_gra */
+	void *fdt_dest = guest_flat_to_host(kvm, kvm->arch.fdt_gra);
+	void *fdt = staging_fdt;
+
+	_FDT(fdt_create(fdt, FDT_MAX_SIZE));
+	_FDT(fdt_finish_reservemap(fdt));
+
+	_FDT(fdt_begin_node(fdt, ""));
+
+	_FDT(fdt_property_string(fdt, "device_type", "chrp"));
+	_FDT(fdt_property_string(fdt, "model", "IBM pSeries (kvmtool)"));
+	_FDT(fdt_property_cell(fdt, "#address-cells", 0x2));
+	_FDT(fdt_property_cell(fdt, "#size-cells", 0x2));
+
+	/* RTAS */
+	_FDT(fdt_begin_node(fdt, "rtas"));
+	/* This property is what makes the kernel decide 'We're an LPAR'! */
+	_FDT(fdt_property(fdt, "ibm,hypertas-functions", hypertas_prop_kvm,
+			  sizeof(hypertas_prop_kvm)));
+	_FDT(fdt_property_cell(fdt, "linux,rtas-base", kvm->arch.rtas_gra));
+	_FDT(fdt_property_cell(fdt, "linux,rtas-entry", kvm->arch.rtas_gra));
+	_FDT(fdt_property_cell(fdt, "rtas-size", kvm->arch.rtas_size));
+	/* Now add properties for all RTAS tokens: */
+	if (spapr_rtas_fdt_setup(kvm, fdt))
+		die("Couldn't create RTAS FDT properties\n");
+
+	_FDT(fdt_end_node(fdt));
+
+	/* /chosen */
+	_FDT(fdt_begin_node(fdt, "chosen"));
+	/* cmdline */
+	_FDT(fdt_property_string(fdt, "bootargs", kern_cmdline));
+	/* Initrd */
+	if (kvm->arch.initrd_size != 0) {
+		uint32_t ird_st_prop = cpu_to_be32(kvm->arch.initrd_gra);
+		uint32_t ird_end_prop = cpu_to_be32(kvm->arch.initrd_gra +
+						    kvm->arch.initrd_size);
+		_FDT(fdt_property(fdt, "linux,initrd-start",
+				   &ird_st_prop, sizeof(ird_st_prop)));
+		_FDT(fdt_property(fdt, "linux,initrd-end",
+				   &ird_end_prop, sizeof(ird_end_prop)));
+	}
+
+	/*
+	 * stdout-path: This is assuming we're using the HV console.  Also, the
+	 * address is hardwired until we do a VIO bus.
+	 */
+	_FDT(fdt_property_string(fdt, "linux,stdout-path",
+				 "/vdevice/vty@30000000"));
+	_FDT(fdt_end_node(fdt));
+
+	/*
+	 * Memory: We don't allocate a separate RMA yet.  If we ever need to
+	 * (CAP_PPC_RMA == 2) then have one memory node for 0->RMAsize, and
+	 * another RMAsize->endOfMem.
+	 */
+	_FDT(fdt_begin_node(fdt, "memory@0"));
+	_FDT(fdt_property_string(fdt, "device_type", "memory"));
+	_FDT(fdt_property(fdt, "reg", mem_reg_property,
+			  sizeof(mem_reg_property)));
+	_FDT(fdt_end_node(fdt));
+
+	generate_segment_page_sizes(&cpu_info->mmu_info, &segment_page_sizes);
+
+	/* CPUs */
+	_FDT(fdt_begin_node(fdt, "cpus"));
+	_FDT(fdt_property_cell(fdt, "#address-cells", 0x1));
+	_FDT(fdt_property_cell(fdt, "#size-cells", 0x0));
+
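+	/*
+	 * One /cpus node per core: each node covers SMT_THREADS vcpus, with
+	 * one interrupt server entry per thread.
+	 */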
+	for (i = 0; i < smp_cpus; i += SMT_THREADS) {
+		int32_t pft_size_prop[] = { 0, cpu_to_be32(HPT_ORDER) };
+		uint32_t servers_prop[SMT_THREADS];
+		uint32_t gservers_prop[SMT_THREADS * 2];
+		int threads = (smp_cpus - i) >= SMT_THREADS ? SMT_THREADS :
+			smp_cpus - i;
+
+		sprintf(cpu_name, "PowerPC,%s@%d", cpu_info->name, i);
+		_FDT(fdt_begin_node(fdt, cpu_name));
+		sprintf(cpu_name, "PowerPC,%s", cpu_info->name);
+		_FDT(fdt_property_string(fdt, "name", cpu_name));
+		_FDT(fdt_property_string(fdt, "device_type", "cpu"));
+
+		_FDT(fdt_property_cell(fdt, "reg", i));
+		_FDT(fdt_property_cell(fdt, "cpu-version", kvm->arch.pvr));
+
+		_FDT(fdt_property_cell(fdt, "dcache-block-size", cpu_info->d_bsize));
+		_FDT(fdt_property_cell(fdt, "icache-block-size", cpu_info->i_bsize));
+
+		if (cpu_info->tb_freq)
+			_FDT(fdt_property_cell(fdt, "timebase-frequency", cpu_info->tb_freq));
+
+		/* Lies, but safeish lies! */
+		_FDT(fdt_property_cell(fdt, "clock-frequency", 0xddbab200));
+
+		if (cpu_info->mmu_info.slb_size)
+			_FDT(fdt_property_cell(fdt, "ibm,slb-size", cpu_info->mmu_info.slb_size));
+
+		/*
+		 * HPT size is hardwired; KVM currently fixes it at 16MB but the
+		 * moment that changes we'll need to read it out of the kernel.
+		 */
+		_FDT(fdt_property(fdt, "ibm,pft-size", pft_size_prop,
+				  sizeof(pft_size_prop)));
+
+		_FDT(fdt_property_string(fdt, "status", "okay"));
+		_FDT(fdt_property(fdt, "64-bit", NULL, 0));
+		/* A server for each thread in this core */
+		for (j = 0; j < SMT_THREADS; j++) {
+			servers_prop[j] = cpu_to_be32(i+j);
+			/*
+			 * Hack borrowed from QEMU, direct the group queues back
+			 * to cpu 0:
+			 */
+			gservers_prop[j*2] = cpu_to_be32(i+j);
+			gservers_prop[j*2 + 1] = 0;
+		}
+		_FDT(fdt_property(fdt, "ibm,ppc-interrupt-server#s",
+				   servers_prop, threads * sizeof(uint32_t)));
+		_FDT(fdt_property(fdt, "ibm,ppc-interrupt-gserver#s",
+				  gservers_prop,
+				  threads * 2 * sizeof(uint32_t)));
+
+		if (segment_page_sizes.value)
+			_FDT(fdt_property(fdt, "ibm,segment-page-sizes",
+					  segment_page_sizes.value,
+					  segment_page_sizes.size));
+
+		if (cpu_info->mmu_info.flags & KVM_PPC_1T_SEGMENTS)
+			_FDT(fdt_property(fdt, "ibm,processor-segment-sizes",
+					  segment_sizes_1T, sizeof(segment_sizes_1T)));
+
+		/* VSX / DFP options: */
+		if (cpu_info->flags & CPUINFO_FLAG_VMX)
+			_FDT(fdt_property_cell(fdt, "ibm,vmx",
+					       (cpu_info->flags &
+						CPUINFO_FLAG_VSX) ? 2 : 1));
+		if (cpu_info->flags & CPUINFO_FLAG_DFP)
+			_FDT(fdt_property_cell(fdt, "ibm,dfp", 0x1));
+		_FDT(fdt_end_node(fdt));
+	}
+	_FDT(fdt_end_node(fdt));
+
+	/* IRQ controller */
+	_FDT(fdt_begin_node(fdt, "interrupt-controller@0"));
+
+	_FDT(fdt_property_string(fdt, "device_type",
+				 "PowerPC-External-Interrupt-Presentation"));
+	_FDT(fdt_property_string(fdt, "compatible", "IBM,ppc-xicp"));
+	_FDT(fdt_property_cell(fdt, "reg", 0));
+	_FDT(fdt_property(fdt, "interrupt-controller", NULL, 0));
+	_FDT(fdt_property(fdt, "ibm,interrupt-server-ranges",
+			   int_server_ranges_prop,
+			   sizeof(int_server_ranges_prop)));
+	_FDT(fdt_property_cell(fdt, "#interrupt-cells", 2));
+	_FDT(fdt_property_cell(fdt, "linux,phandle", PHANDLE_XICP));
+	_FDT(fdt_property_cell(fdt, "phandle", PHANDLE_XICP));
+	_FDT(fdt_end_node(fdt));
+
+	/*
+	 * VIO: See comment in linux,stdout-path; we don't yet represent a VIO
+	 * bus/address allocation so addresses are hardwired here.
+	 */
+	_FDT(fdt_begin_node(fdt, "vdevice"));
+	_FDT(fdt_property_cell(fdt, "#address-cells", 0x1));
+	_FDT(fdt_property_cell(fdt, "#size-cells", 0x0));
+	_FDT(fdt_property_string(fdt, "device_type", "vdevice"));
+	_FDT(fdt_property_string(fdt, "compatible", "IBM,vdevice"));
+	_FDT(fdt_begin_node(fdt, "vty@30000000"));
+	_FDT(fdt_property_string(fdt, "name", "vty"));
+	_FDT(fdt_property_string(fdt, "device_type", "serial"));
+	_FDT(fdt_property_string(fdt, "compatible", "hvterm1"));
+	_FDT(fdt_property_cell(fdt, "reg", 0x30000000));
+	_FDT(fdt_end_node(fdt));
+	_FDT(fdt_end_node(fdt));
+
+	/* Finalise: */
+	_FDT(fdt_end_node(fdt)); /* Root node */
+	_FDT(fdt_finish(fdt));
+
+	_FDT(fdt_open_into(fdt, fdt_dest, FDT_MAX_SIZE));
+
+	/* PCI */
+	if (spapr_populate_pci_devices(kvm, PHANDLE_XICP, fdt_dest))
+		die("Fail populating PCI device nodes");
+
+	_FDT(fdt_add_mem_rsv(fdt_dest, kvm->arch.rtas_gra, kvm->arch.rtas_size));
+	_FDT(fdt_pack(fdt_dest));
+
+	free(segment_page_sizes.value);
+
+	return 0;
+}
+firmware_init(setup_fdt);
+
+/**
+ * kvm__arch_setup_firmware
+ */
+int kvm__arch_setup_firmware(struct kvm *kvm)
+{
+	/*
+	 * Set up the RTAS stub.  It is just a single hypercall:
+	 *  0:   7c 64 1b 78     mr      r4,r3
+	 *  4:   3c 60 00 00     lis     r3,0
+	 *  8:   60 63 f0 00     ori     r3,r3,61440
+	 *  c:   44 00 00 22     sc      1
+	 * 10:   4e 80 00 20     blr
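+	 *
+	 * That is: move the RTAS argument buffer pointer from r3 into r4,
+	 * load r3 with 0xf000 (KVMPPC_H_RTAS) and trap to the hypervisor
+	 * via "sc 1".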
+	 */
+	uint32_t *rtas = guest_flat_to_host(kvm, kvm->arch.rtas_gra);
+
+	rtas[0] = cpu_to_be32(0x7c641b78);
+	rtas[1] = cpu_to_be32(0x3c600000);
+	rtas[2] = cpu_to_be32(0x6063f000);
+	rtas[3] = cpu_to_be32(0x44000022);
+	rtas[4] = cpu_to_be32(0x4e800020);
+	kvm->arch.rtas_size = 20;
+
+	pr_info("Set up %ld bytes of RTAS at 0x%lx\n",
+		kvm->arch.rtas_size, kvm->arch.rtas_gra);
+
+	/* Load SLOF */
+
+	return 0;
+}
+
+int kvm__arch_free_firmware(struct kvm *kvm)
+{
+	return 0;
+}
diff --git a/kvmtool/powerpc/spapr.h b/kvmtool/powerpc/spapr.h
new file mode 100644
index 0000000..f851f4a
--- /dev/null
+++ b/kvmtool/powerpc/spapr.h
@@ -0,0 +1,120 @@
+/*
+ * SPAPR definitions and declarations
+ *
+ * Borrowed heavily from QEMU's spapr.h,
+ * Copyright (c) 2010 David Gibson, IBM Corporation.
+ *
+ * Modifications by Matt Evans <matt@ozlabs.org>, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#if !defined(__HW_SPAPR_H__)
+#define __HW_SPAPR_H__
+
+#include <inttypes.h>
+#include <linux/byteorder.h>
+
+#include "kvm/kvm.h"
+#include "kvm/kvm-cpu.h"
+
+typedef unsigned long target_ulong;
+typedef uintptr_t target_phys_addr_t;
+
+#define H_SUCCESS	0
+#define H_HARDWARE	-1	/* Hardware error */
+#define H_FUNCTION	-2	/* Function not supported */
+#define H_PARAMETER	-4	/* Parameter invalid, out-of-range or conflicting */
+#define H_P2		-55
+#define H_SET_DABR		0x28
+#define H_LOGICAL_CI_LOAD	0x3c
+#define H_LOGICAL_CI_STORE	0x40
+#define H_LOGICAL_CACHE_LOAD	0x44
+#define H_LOGICAL_CACHE_STORE	0x48
+#define H_LOGICAL_ICBI		0x4c
+#define H_LOGICAL_DCBF		0x50
+#define H_GET_TERM_CHAR		0x54
+#define H_PUT_TERM_CHAR		0x58
+#define H_CPPR			0x68
+#define H_EOI			0x64
+#define H_IPI			0x6c
+#define H_XIRR			0x74
+#define H_SET_MODE		0x31C
+#define MAX_HCALL_OPCODE	H_SET_MODE
+
+/* Values for 2nd argument to H_SET_MODE */
+#define H_SET_MODE_RESOURCE_SET_CIABR		1
+#define H_SET_MODE_RESOURCE_SET_DAWR		2
+#define H_SET_MODE_RESOURCE_ADDR_TRANS_MODE	3
+#define H_SET_MODE_RESOURCE_LE			4
+
+/* Flags for H_SET_MODE_RESOURCE_LE */
+#define H_SET_MODE_ENDIAN_BIG		0
+#define H_SET_MODE_ENDIAN_LITTLE	1
+
+/*
+ * The hcalls above are standardized in PAPR and implemented by pHyp
+ * as well.
+ *
+ * We also need some hcalls which are specific to qemu / KVM-on-POWER.
+ * So far we just need one for H_RTAS, but in future we'll need more
+ * for extensions like virtio.  We put those into the 0xf000-0xfffc
+ * range which is reserved by PAPR for "platform-specific" hcalls.
+ */
+#define KVMPPC_HCALL_BASE       0xf000
+#define KVMPPC_H_RTAS           (KVMPPC_HCALL_BASE + 0x0)
+#define KVMPPC_HCALL_MAX        KVMPPC_H_RTAS
+
+#define DEBUG_SPAPR_HCALLS
+
+#ifdef DEBUG_SPAPR_HCALLS
+#define hcall_dprintf(fmt, ...) \
+    do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
+#else
+#define hcall_dprintf(fmt, ...) \
+    do { } while (0)
+#endif
+
+typedef target_ulong (*spapr_hcall_fn)(struct kvm_cpu *vcpu,
+				       target_ulong opcode,
+                                       target_ulong *args);
+
+void hypercall_init(void);
+void register_core_rtas(void);
+
+void spapr_register_hypercall(target_ulong opcode, spapr_hcall_fn fn);
+target_ulong spapr_hypercall(struct kvm_cpu *vcpu, target_ulong opcode,
+                             target_ulong *args);
+
+int spapr_rtas_fdt_setup(struct kvm *kvm, void *fdt);
+
+static inline uint32_t rtas_ld(struct kvm *kvm, target_ulong phys, int n)
+{
+	return cpu_to_be32(*((uint32_t *)guest_flat_to_host(kvm, phys + 4*n)));
+}
+
+static inline void rtas_st(struct kvm *kvm, target_ulong phys, int n, uint32_t val)
+{
+	*((uint32_t *)guest_flat_to_host(kvm, phys + 4*n)) = cpu_to_be32(val);
+}
+
+typedef void (*spapr_rtas_fn)(struct kvm_cpu *vcpu, uint32_t token,
+                              uint32_t nargs, target_ulong args,
+                              uint32_t nret, target_ulong rets);
+void spapr_rtas_register(const char *name, spapr_rtas_fn fn);
+target_ulong spapr_rtas_call(struct kvm_cpu *vcpu,
+                             uint32_t token, uint32_t nargs, target_ulong args,
+                             uint32_t nret, target_ulong rets);
+
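+/*
+ * PHB windows: a 512MB memory window high in the guest physical map, with
+ * a 32MB I/O window placed immediately above it.
+ */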
+#define SPAPR_PCI_BUID          0x800000020000001ULL
+#define SPAPR_PCI_MEM_WIN_ADDR  (KVM_MMIO_START + 0xA0000000)
+#define SPAPR_PCI_MEM_WIN_SIZE  0x20000000
+#define SPAPR_PCI_IO_WIN_ADDR   (SPAPR_PCI_MEM_WIN_ADDR + SPAPR_PCI_MEM_WIN_SIZE)
+#define SPAPR_PCI_IO_WIN_SIZE	0x2000000
+
+#define SPAPR_PCI_WIN_START	SPAPR_PCI_MEM_WIN_ADDR
+#define SPAPR_PCI_WIN_END	(SPAPR_PCI_IO_WIN_ADDR + SPAPR_PCI_IO_WIN_SIZE)
+
+#endif /* !defined (__HW_SPAPR_H__) */
diff --git a/kvmtool/powerpc/spapr_hcall.c b/kvmtool/powerpc/spapr_hcall.c
new file mode 100644
index 0000000..25bec82
--- /dev/null
+++ b/kvmtool/powerpc/spapr_hcall.c
@@ -0,0 +1,200 @@
+/*
+ * SPAPR hypercalls
+ *
+ * Borrowed heavily from QEMU's spapr_hcall.c,
+ * Copyright (c) 2010 David Gibson, IBM Corporation.
+ *
+ * Copyright (c) 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include "spapr.h"
+#include "kvm/util.h"
+#include "kvm/kvm.h"
+#include "kvm/kvm-cpu.h"
+
+#include <stdio.h>
+#include <assert.h>
+#include <sys/eventfd.h>
+
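+/*
+ * PAPR hcall opcodes are multiples of 4, so the standard calls are kept
+ * in a dense table indexed by opcode / 4; KVM-specific hcalls live in a
+ * second table indexed from KVMPPC_HCALL_BASE.
+ */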
+static spapr_hcall_fn papr_hypercall_table[(MAX_HCALL_OPCODE / 4) + 1];
+static spapr_hcall_fn kvmppc_hypercall_table[KVMPPC_HCALL_MAX -
+					     KVMPPC_HCALL_BASE + 1];
+
+static target_ulong h_set_dabr(struct kvm_cpu *vcpu, target_ulong opcode, target_ulong *args)
+{
+	/* FIXME:  Implement this for -PR.  (-HV does this in kernel.) */
+	return H_HARDWARE;
+}
+
+static target_ulong h_rtas(struct kvm_cpu *vcpu, target_ulong opcode, target_ulong *args)
+{
+	target_ulong rtas_r3 = args[0];
+	/*
+	 * Pointer read from phys mem; these ptrs cannot be MMIO (!) so just
+	 * reference guest RAM directly.
+	 */
+	uint32_t token, nargs, nret;
+
+	token = rtas_ld(vcpu->kvm, rtas_r3, 0);
+	nargs = rtas_ld(vcpu->kvm, rtas_r3, 1);
+	nret  = rtas_ld(vcpu->kvm, rtas_r3, 2);
+
+	return spapr_rtas_call(vcpu, token, nargs, rtas_r3 + 12,
+			       nret, rtas_r3 + 12 + 4*nargs);
+}
+
+static target_ulong h_logical_load(struct kvm_cpu *vcpu, target_ulong opcode, target_ulong *args)
+{
+	/* SLOF will require these, though kernel doesn't. */
+	die(__PRETTY_FUNCTION__);
+	return H_PARAMETER;
+}
+
+static target_ulong h_logical_store(struct kvm_cpu *vcpu, target_ulong opcode, target_ulong *args)
+{
+	/* SLOF will require these, though kernel doesn't. */
+	die(__PRETTY_FUNCTION__);
+	return H_PARAMETER;
+}
+
+static target_ulong h_logical_icbi(struct kvm_cpu *vcpu, target_ulong opcode, target_ulong *args)
+{
+	/* KVM will trap this in the kernel.  Die if it misses. */
+	die(__PRETTY_FUNCTION__);
+	return H_SUCCESS;
+}
+
+static target_ulong h_logical_dcbf(struct kvm_cpu *vcpu, target_ulong opcode, target_ulong *args)
+{
+	/* KVM will trap this in the kernel.  Die if it misses. */
+	die(__PRETTY_FUNCTION__);
+	return H_SUCCESS;
+}
+
+struct lpcr_data {
+	struct kvm_cpu	*cpu;
+	int		mode;
+};
+
+static void get_cpu_lpcr(struct kvm_cpu *vcpu, target_ulong *lpcr)
+{
+	struct kvm_one_reg reg = {
+		.id = KVM_REG_PPC_LPCR_64,
+		.addr = (__u64)lpcr
+	};
+
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg))
+		die("Couldn't read vcpu reg?!");
+}
+
+static void set_cpu_lpcr(struct kvm_cpu *vcpu, target_ulong *lpcr)
+{
+	struct kvm_one_reg reg = {
+		.id = KVM_REG_PPC_LPCR_64,
+		.addr = (__u64)lpcr
+	};
+
+	if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg))
+		die("Couldn't write vcpu reg?!");
+}
+
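+/*
+ * H_SET_MODE(RESOURCE_LE) must flip the interrupt-little-endian bit in
+ * every vcpu's LPCR so that interrupts are taken in the guest's new
+ * endianness; set_endian_task() is run on each vcpu to do that.
+ */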
+static void set_endian_task(struct kvm_cpu *vcpu, void *data)
+{
+	target_ulong mflags = (target_ulong)data;
+	target_ulong lpcr;
+
+	get_cpu_lpcr(vcpu, &lpcr);
+
+	if (mflags == H_SET_MODE_ENDIAN_BIG)
+		lpcr &= ~LPCR_ILE;
+	else
+		lpcr |= LPCR_ILE;
+
+	set_cpu_lpcr(vcpu, &lpcr);
+}
+
+static target_ulong h_set_mode(struct kvm_cpu *vcpu, target_ulong opcode, target_ulong *args)
+{
+	int ret;
+
+	switch (args[1]) {
+	case H_SET_MODE_RESOURCE_LE: {
+		struct kvm_cpu_task task;
+		task.func = set_endian_task;
+		task.data = (void *)args[0];
+		kvm_cpu__run_on_all_cpus(vcpu->kvm, &task);
+		ret = H_SUCCESS;
+		break;
+	}
+	default:
+		ret = H_FUNCTION;
+		break;
+	}
+
+	return ret;
+}
+
+
+void spapr_register_hypercall(target_ulong opcode, spapr_hcall_fn fn)
+{
+	spapr_hcall_fn *slot;
+
+	if (opcode <= MAX_HCALL_OPCODE) {
+		assert((opcode & 0x3) == 0);
+
+		slot = &papr_hypercall_table[opcode / 4];
+	} else {
+		assert((opcode >= KVMPPC_HCALL_BASE) &&
+		       (opcode <= KVMPPC_HCALL_MAX));
+
+		slot = &kvmppc_hypercall_table[opcode - KVMPPC_HCALL_BASE];
+	}
+
+	assert(!(*slot) || (fn == *slot));
+	*slot = fn;
+}
+
+target_ulong spapr_hypercall(struct kvm_cpu *vcpu, target_ulong opcode,
+			     target_ulong *args)
+{
+	if ((opcode <= MAX_HCALL_OPCODE)
+	    && ((opcode & 0x3) == 0)) {
+		spapr_hcall_fn fn = papr_hypercall_table[opcode / 4];
+
+		if (fn) {
+			return fn(vcpu, opcode, args);
+		}
+	} else if ((opcode >= KVMPPC_HCALL_BASE) &&
+		   (opcode <= KVMPPC_HCALL_MAX)) {
+		spapr_hcall_fn fn = kvmppc_hypercall_table[opcode -
+							   KVMPPC_HCALL_BASE];
+
+		if (fn) {
+			return fn(vcpu, opcode, args);
+		}
+	}
+
+	hcall_dprintf("Unimplemented hcall 0x%lx\n", opcode);
+	return H_FUNCTION;
+}
+
+void hypercall_init(void)
+{
+	/* hcall-dabr */
+	spapr_register_hypercall(H_SET_DABR, h_set_dabr);
+
+	spapr_register_hypercall(H_LOGICAL_CI_LOAD, h_logical_load);
+	spapr_register_hypercall(H_LOGICAL_CI_STORE, h_logical_store);
+	spapr_register_hypercall(H_LOGICAL_CACHE_LOAD, h_logical_load);
+	spapr_register_hypercall(H_LOGICAL_CACHE_STORE, h_logical_store);
+	spapr_register_hypercall(H_LOGICAL_ICBI, h_logical_icbi);
+	spapr_register_hypercall(H_LOGICAL_DCBF, h_logical_dcbf);
+	spapr_register_hypercall(H_SET_MODE, h_set_mode);
+
+	/* KVM-PPC specific hcalls */
+	spapr_register_hypercall(KVMPPC_H_RTAS, h_rtas);
+}
diff --git a/kvmtool/powerpc/spapr_hvcons.c b/kvmtool/powerpc/spapr_hvcons.c
new file mode 100644
index 0000000..605367b
--- /dev/null
+++ b/kvmtool/powerpc/spapr_hvcons.c
@@ -0,0 +1,105 @@
+/*
+ * SPAPR HV console
+ *
+ * Borrowed lightly from QEMU's spapr_vty.c, Copyright (c) 2010 David Gibson,
+ * IBM Corporation.
+ *
+ * Copyright (c) 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include "kvm/term.h"
+#include "kvm/kvm.h"
+#include "kvm/kvm-cpu.h"
+#include "kvm/util.h"
+#include "spapr.h"
+#include "spapr_hvcons.h"
+
+#include <stdio.h>
+#include <sys/uio.h>
+#include <errno.h>
+
+#include <linux/byteorder.h>
+
+union hv_chario {
+	struct {
+		uint64_t char0_7;
+		uint64_t char8_15;
+	} a;
+	uint8_t buf[16];
+};
+
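+/*
+ * H_PUT_TERM_CHAR: args are (termno, len, char0_7, char8_15); up to 16
+ * bytes of output arrive packed big-endian into two 64-bit arguments.
+ */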
+static unsigned long h_put_term_char(struct kvm_cpu *vcpu, unsigned long opcode, unsigned long *args)
+{
+	/* To do: Read register from args[0], and check it. */
+	unsigned long len = args[1];
+	union hv_chario data;
+	struct iovec iov;
+
+	if (len > 16) {
+		return H_PARAMETER;
+	}
+	data.a.char0_7 = cpu_to_be64(args[2]);
+	data.a.char8_15 = cpu_to_be64(args[3]);
+
+	iov.iov_base = data.buf;
+	iov.iov_len = len;
+	do {
+		int ret;
+
+		ret = term_putc_iov(&iov, 1, 0);
+		if (ret < 0) {
+			die("term_putc_iov error %d!\n", errno);
+		}
+		iov.iov_base += ret;
+		iov.iov_len -= ret;
+	} while (iov.iov_len > 0);
+
+	return H_SUCCESS;
+}
+
+
+static unsigned long h_get_term_char(struct kvm_cpu *vcpu, unsigned long opcode, unsigned long *args)
+{
+	/* To do: Read register from args[0], and check it. */
+	unsigned long *len = args + 0;
+	unsigned long *char0_7 = args + 1;
+	unsigned long *char8_15 = args + 2;
+	union hv_chario data;
+	struct iovec iov;
+
+	if (vcpu->kvm->cfg.active_console != CONSOLE_HV)
+		return H_SUCCESS;
+
+	if (term_readable(0)) {
+		iov.iov_base = data.buf;
+		iov.iov_len = 16;
+
+		*len = term_getc_iov(vcpu->kvm, &iov, 1, 0);
+		*char0_7 = be64_to_cpu(data.a.char0_7);
+		*char8_15 = be64_to_cpu(data.a.char8_15);
+	} else {
+		*len = 0;
+	}
+
+	return H_SUCCESS;
+}
+
+void spapr_hvcons_poll(struct kvm *kvm)
+{
+	if (term_readable(0)) {
+		/*
+		 * We could inject an IRQ into the guest here if we wanted.
+		 * The guest happily polls, though, so it isn't required.
+		 */
+	}
+}
+
+void spapr_hvcons_init(void)
+{
+	spapr_register_hypercall(H_PUT_TERM_CHAR, h_put_term_char);
+	spapr_register_hypercall(H_GET_TERM_CHAR, h_get_term_char);
+}
diff --git a/kvmtool/powerpc/spapr_hvcons.h b/kvmtool/powerpc/spapr_hvcons.h
new file mode 100644
index 0000000..d3e4414
--- /dev/null
+++ b/kvmtool/powerpc/spapr_hvcons.h
@@ -0,0 +1,19 @@
+/*
+ * SPAPR HV console
+ *
+ * Copyright (c) 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#ifndef SPAPR_HVCONS_H
+#define SPAPR_HVCONS_H
+
+#include "kvm/kvm.h"
+
+void spapr_hvcons_init(void);
+void spapr_hvcons_poll(struct kvm *kvm);
+
+#endif
diff --git a/kvmtool/powerpc/spapr_pci.c b/kvmtool/powerpc/spapr_pci.c
new file mode 100644
index 0000000..7be44d9
--- /dev/null
+++ b/kvmtool/powerpc/spapr_pci.c
@@ -0,0 +1,412 @@
+/*
+ * SPAPR PHB emulation, RTAS interface to PCI config space, device tree nodes
+ * for enumerated devices.
+ *
+ * Borrowed heavily from QEMU's spapr_pci.c,
+ * Copyright (c) 2011 Alexey Kardashevskiy, IBM Corporation.
+ * Copyright (c) 2011 David Gibson, IBM Corporation.
+ *
+ * Modifications copyright 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include "spapr.h"
+#include "spapr_pci.h"
+#include "kvm/devices.h"
+#include "kvm/fdt.h"
+#include "kvm/util.h"
+#include "kvm/of_pci.h"
+#include "kvm/pci.h"
+
+#include <linux/pci_regs.h>
+#include <linux/byteorder.h>
+
+
+/* #define DEBUG_PHB yes */
+#ifdef DEBUG_PHB
+#define phb_dprintf(fmt, ...)					\
+	do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
+#else
+#define phb_dprintf(fmt, ...)			\
+	do { } while (0)
+#endif
+
+static const uint32_t bars[] = {
+	PCI_BASE_ADDRESS_0, PCI_BASE_ADDRESS_1,
+	PCI_BASE_ADDRESS_2, PCI_BASE_ADDRESS_3,
+	PCI_BASE_ADDRESS_4, PCI_BASE_ADDRESS_5
+	/*, PCI_ROM_ADDRESS*/
+};
+
+#define PCI_NUM_REGIONS		7
+
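+/*
+ * kvmtool models a single PHB; spapr_create_phb() fills this in and hangs
+ * it off kvm->arch.phb.
+ */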
+static struct spapr_phb phb;
+
+static void rtas_ibm_read_pci_config(struct kvm_cpu *vcpu,
+				     uint32_t token, uint32_t nargs,
+				     target_ulong args,
+				     uint32_t nret, target_ulong rets)
+{
+	uint32_t val = 0;
+	uint64_t buid = ((uint64_t)rtas_ld(vcpu->kvm, args, 1) << 32) | rtas_ld(vcpu->kvm, args, 2);
+	union pci_config_address addr = { .w = rtas_ld(vcpu->kvm, args, 0) };
+	struct pci_device_header *dev = pci__find_dev(addr.device_number);
+	uint32_t size = rtas_ld(vcpu->kvm, args, 3);
+
+	if (buid != phb.buid || !dev || (size > 4)) {
+		phb_dprintf("- cfgRd buid 0x%lx cfg addr 0x%x size %d not found\n",
+			    buid, addr.w, size);
+
+		rtas_st(vcpu->kvm, rets, 0, -1);
+		return;
+	}
+	pci__config_rd(vcpu->kvm, addr, &val, size);
+	/* It appears this wants a byteswapped result... */
+	switch (size) {
+	case 4:
+		val = le32_to_cpu(val);
+		break;
+	case 2:
+		val = le16_to_cpu(val>>16);
+		break;
+	case 1:
+		val = val >> 24;
+		break;
+	}
+	phb_dprintf("- cfgRd buid 0x%lx addr 0x%x (/%d): b%d,d%d,f%d,r0x%x, val 0x%x\n",
+		    buid, addr.w, size, addr.bus_number, addr.device_number, addr.function_number,
+		    addr.register_number, val);
+
+	rtas_st(vcpu->kvm, rets, 0, 0);
+	rtas_st(vcpu->kvm, rets, 1, val);
+}
+
+static void rtas_read_pci_config(struct kvm_cpu *vcpu,
+				 uint32_t token, uint32_t nargs,
+				 target_ulong args,
+				 uint32_t nret, target_ulong rets)
+{
+	uint32_t val;
+	union pci_config_address addr = { .w = rtas_ld(vcpu->kvm, args, 0) };
+	struct pci_device_header *dev = pci__find_dev(addr.device_number);
+	uint32_t size = rtas_ld(vcpu->kvm, args, 1);
+
+	if (!dev || (size > 4)) {
+		rtas_st(vcpu->kvm, rets, 0, -1);
+		return;
+	}
+	pci__config_rd(vcpu->kvm, addr, &val, size);
+	switch (size) {
+	case 4:
+		val = le32_to_cpu(val);
+		break;
+	case 2:
+		val = le16_to_cpu(val>>16); /* We're yuck-endian. */
+		break;
+	case 1:
+		val = val >> 24;
+		break;
+	}
+	phb_dprintf("- cfgRd addr 0x%x size %d, val 0x%x\n", addr.w, size, val);
+	rtas_st(vcpu->kvm, rets, 0, 0);
+	rtas_st(vcpu->kvm, rets, 1, val);
+}
+
+static void rtas_ibm_write_pci_config(struct kvm_cpu *vcpu,
+				      uint32_t token, uint32_t nargs,
+				      target_ulong args,
+				      uint32_t nret, target_ulong rets)
+{
+	uint64_t buid = ((uint64_t)rtas_ld(vcpu->kvm, args, 1) << 32) | rtas_ld(vcpu->kvm, args, 2);
+	union pci_config_address addr = { .w = rtas_ld(vcpu->kvm, args, 0) };
+	struct pci_device_header *dev = pci__find_dev(addr.device_number);
+	uint32_t size = rtas_ld(vcpu->kvm, args, 3);
+	uint32_t val = rtas_ld(vcpu->kvm, args, 4);
+
+	if (buid != phb.buid || !dev || (size > 4)) {
+		phb_dprintf("- cfgWr buid 0x%lx cfg addr 0x%x/%d error (val 0x%x)\n",
+			    buid, addr.w, size, val);
+
+		rtas_st(vcpu->kvm, rets, 0, -1);
+		return;
+	}
+	phb_dprintf("- cfgWr buid 0x%lx addr 0x%x (/%d): b%d,d%d,f%d,r0x%x, val 0x%x\n",
+		    buid, addr.w, size, addr.bus_number, addr.device_number, addr.function_number,
+		    addr.register_number, val);
+	switch (size) {
+	case 4:
+		val = le32_to_cpu(val);
+		break;
+	case 2:
+		val = le16_to_cpu(val) << 16;
+		break;
+	case 1:
+		val = val >> 24;
+		break;
+	}
+	pci__config_wr(vcpu->kvm, addr, &val, size);
+	rtas_st(vcpu->kvm, rets, 0, 0);
+}
+
+static void rtas_write_pci_config(struct kvm_cpu *vcpu,
+				  uint32_t token, uint32_t nargs,
+				  target_ulong args,
+				  uint32_t nret, target_ulong rets)
+{
+	union pci_config_address addr = { .w = rtas_ld(vcpu->kvm, args, 0) };
+	struct pci_device_header *dev = pci__find_dev(addr.device_number);
+	uint32_t size = rtas_ld(vcpu->kvm, args, 1);
+	uint32_t val = rtas_ld(vcpu->kvm, args, 2);
+
+	if (!dev || (size > 4)) {
+		rtas_st(vcpu->kvm, rets, 0, -1);
+		return;
+	}
+
+	phb_dprintf("- cfgWr addr 0x%x (/%d): b%d,d%d,f%d,r0x%x, val 0x%x\n",
+		    addr.w, size, addr.bus_number, addr.device_number, addr.function_number,
+		    addr.register_number, val);
+	switch (size) {
+	case 4:
+		val = le32_to_cpu(val);
+		break;
+	case 2:
+		val = le16_to_cpu(val) << 16;
+		break;
+	case 1:
+		val = val >> 24;
+		break;
+	}
+	pci__config_wr(vcpu->kvm, addr, &val, size);
+	rtas_st(vcpu->kvm, rets, 0, 0);
+}
+
+void spapr_create_phb(struct kvm *kvm,
+		      const char *busname, uint64_t buid,
+		      uint64_t mem_win_addr, uint64_t mem_win_size,
+		      uint64_t io_win_addr, uint64_t io_win_size)
+{
+	/*
+	 * Since kvmtool doesn't really have any concept of buses etc.,
+	 * there's nothing to register here.  Just register RTAS.
+	 */
+	spapr_rtas_register("read-pci-config", rtas_read_pci_config);
+	spapr_rtas_register("write-pci-config", rtas_write_pci_config);
+	spapr_rtas_register("ibm,read-pci-config", rtas_ibm_read_pci_config);
+	spapr_rtas_register("ibm,write-pci-config", rtas_ibm_write_pci_config);
+
+	phb.buid = buid;
+	phb.mem_addr = mem_win_addr;
+	phb.mem_size = mem_win_size;
+	phb.io_addr  = io_win_addr;
+	phb.io_size  = io_win_size;
+
+	kvm->arch.phb = &phb;
+}
+
+static uint32_t bar_to_ss(unsigned long bar)
+{
+	if ((bar & PCI_BASE_ADDRESS_SPACE) ==
+	    PCI_BASE_ADDRESS_SPACE_IO)
+		return OF_PCI_SS_IO;
+	else if (bar & PCI_BASE_ADDRESS_MEM_TYPE_64)
+		return OF_PCI_SS_M64;
+	else
+		return OF_PCI_SS_M32;
+}
+
+static unsigned long bar_to_addr(unsigned long bar)
+{
+	if ((bar & PCI_BASE_ADDRESS_SPACE) ==
+	    PCI_BASE_ADDRESS_SPACE_IO)
+		return bar & PCI_BASE_ADDRESS_IO_MASK;
+	else
+		return bar & PCI_BASE_ADDRESS_MEM_MASK;
+}
+
+int spapr_populate_pci_devices(struct kvm *kvm,
+			       uint32_t xics_phandle,
+			       void *fdt)
+{
+	int bus_off, node_off = 0, devid, fn, i, n, devices;
+	struct device_header *dev_hdr;
+	char nodename[256];
+	struct of_pci_unit64_address {
+		u32 phys_hi;
+		u64 addr;
+		u64 size;
+	} __attribute((packed)) reg[PCI_NUM_REGIONS + 1], assigned_addresses[PCI_NUM_REGIONS];
+	uint32_t bus_range[] = { cpu_to_be32(0), cpu_to_be32(0xff) };
+	struct of_pci_ranges_entry ranges[] = {
+		{
+			{
+				cpu_to_be32(of_pci_b_ss(1)),
+				cpu_to_be32(0),
+				cpu_to_be32(0),
+			},
+			cpu_to_be64(phb.io_addr),
+			cpu_to_be64(phb.io_size),
+		},
+		{
+			{
+				cpu_to_be32(of_pci_b_ss(2)),
+				cpu_to_be32(0),
+				cpu_to_be32(0),
+			},
+			cpu_to_be64(phb.mem_addr),
+			cpu_to_be64(phb.mem_size),
+		},
+	};
+	uint64_t bus_reg[] = { cpu_to_be64(phb.buid), 0 };
+	uint32_t interrupt_map_mask[] = {
+		cpu_to_be32(of_pci_b_ddddd(-1)|of_pci_b_fff(-1)), 0x0, 0x0, 0x0};
+	uint32_t interrupt_map[SPAPR_PCI_NUM_LSI][7];
+
+	/* Start populating the FDT */
+	sprintf(nodename, "pci@%" PRIx64, phb.buid);
+	bus_off = fdt_add_subnode(fdt, 0, nodename);
+	if (bus_off < 0) {
+		die("error making bus subnode, %s\n", fdt_strerror(bus_off));
+		return bus_off;
+	}
+
+	/* Write PHB properties */
+	_FDT(fdt_setprop_string(fdt, bus_off, "device_type", "pci"));
+	_FDT(fdt_setprop_string(fdt, bus_off, "compatible", "IBM,Logical_PHB"));
+	_FDT(fdt_setprop_cell(fdt, bus_off, "#address-cells", 0x3));
+	_FDT(fdt_setprop_cell(fdt, bus_off, "#size-cells", 0x2));
+	_FDT(fdt_setprop_cell(fdt, bus_off, "#interrupt-cells", 0x1));
+	_FDT(fdt_setprop(fdt, bus_off, "used-by-rtas", NULL, 0));
+	_FDT(fdt_setprop(fdt, bus_off, "bus-range", &bus_range, sizeof(bus_range)));
+	_FDT(fdt_setprop(fdt, bus_off, "ranges", &ranges, sizeof(ranges)));
+	_FDT(fdt_setprop(fdt, bus_off, "reg", &bus_reg, sizeof(bus_reg)));
+	_FDT(fdt_setprop(fdt, bus_off, "interrupt-map-mask",
+			 &interrupt_map_mask, sizeof(interrupt_map_mask)));
+
+	/* Populate PCI devices and allocate IRQs */
+	devices = 0;
+	dev_hdr = device__first_dev(DEVICE_BUS_PCI);
+	while (dev_hdr) {
+		uint32_t *irqmap = interrupt_map[devices];
+		struct pci_device_header *hdr = dev_hdr->data;
+
+		if (!hdr) {
+			dev_hdr = device__next_dev(dev_hdr);
+			continue;
+		}
+
+		devid = dev_hdr->dev_num;
+		fn = 0; /* kvmtool doesn't yet do multifunction devices */
+
+		sprintf(nodename, "pci@%u,%u", devid, fn);
+
+		/* Allocate interrupt from the map */
+		if (devid > SPAPR_PCI_NUM_LSI) {
+			die("Unexpected behaviour in spapr_populate_pci_devices, "
+			    "wrong devid %u\n", devid);
+		}
+		}
+		irqmap[0] = cpu_to_be32(of_pci_b_ddddd(devid)|of_pci_b_fff(fn));
+		irqmap[1] = 0;
+		irqmap[2] = 0;
+		irqmap[3] = 0;
+		irqmap[4] = cpu_to_be32(xics_phandle);
+		/*
+		 * This is nasty; the PCI devs are set up such that their own
+		 * header's irq_line indicates the direct XICS IRQ number to
+		 * use.  There REALLY needs to be a hierarchical system in place
+		 * to 'raise' an IRQ on the bridge which indexes/looks up which
+		 * XICS IRQ to fire.
+		 */
+		irqmap[5] = cpu_to_be32(hdr->irq_line);
+		irqmap[6] = cpu_to_be32(0x8);
+
+		/* Add node to FDT */
+		node_off = fdt_add_subnode(fdt, bus_off, nodename);
+		if (node_off < 0) {
+			die("error making node subnode, %s\n", fdt_strerror(bus_off));
+			return node_off;
+		}
+
+		_FDT(fdt_setprop_cell(fdt, node_off, "vendor-id",
+				      le16_to_cpu(hdr->vendor_id)));
+		_FDT(fdt_setprop_cell(fdt, node_off, "device-id",
+				      le16_to_cpu(hdr->device_id)));
+		_FDT(fdt_setprop_cell(fdt, node_off, "revision-id",
+				      hdr->revision_id));
+		_FDT(fdt_setprop_cell(fdt, node_off, "class-code",
+				      hdr->class[0] | (hdr->class[1] << 8) | (hdr->class[2] << 16)));
+		_FDT(fdt_setprop_cell(fdt, node_off, "subsystem-id",
+				      le16_to_cpu(hdr->subsys_id)));
+		_FDT(fdt_setprop_cell(fdt, node_off, "subsystem-vendor-id",
+				      le16_to_cpu(hdr->subsys_vendor_id)));
+
+		/* Config space region comes first */
+		reg[0].phys_hi = cpu_to_be32(
+			of_pci_b_n(0) |
+			of_pci_b_p(0) |
+			of_pci_b_t(0) |
+			of_pci_b_ss(OF_PCI_SS_CONFIG) |
+			of_pci_b_bbbbbbbb(0) |
+			of_pci_b_ddddd(devid) |
+			of_pci_b_fff(fn));
+		reg[0].addr = 0;
+		reg[0].size = 0;
+
+		n = 0;
+		/* Six BARs, no ROM supported, addresses are 32bit */
+		for (i = 0; i < 6; ++i) {
+			if (0 == hdr->bar[i]) {
+				continue;
+			}
+
+			reg[n+1].phys_hi = cpu_to_be32(
+				of_pci_b_n(0) |
+				of_pci_b_p(0) |
+				of_pci_b_t(0) |
+				of_pci_b_ss(bar_to_ss(le32_to_cpu(hdr->bar[i]))) |
+				of_pci_b_bbbbbbbb(0) |
+				of_pci_b_ddddd(devid) |
+				of_pci_b_fff(fn) |
+				of_pci_b_rrrrrrrr(bars[i]));
+			reg[n+1].size = cpu_to_be64(pci__bar_size(hdr, i));
+			reg[n+1].addr = 0;
+
+			assigned_addresses[n].phys_hi = cpu_to_be32(
+				of_pci_b_n(1) |
+				of_pci_b_p(0) |
+				of_pci_b_t(0) |
+				of_pci_b_ss(bar_to_ss(le32_to_cpu(hdr->bar[i]))) |
+				of_pci_b_bbbbbbbb(0) |
+				of_pci_b_ddddd(devid) |
+				of_pci_b_fff(fn) |
+				of_pci_b_rrrrrrrr(bars[i]));
+
+			/*
+			 * Writing zeroes to assigned_addresses causes the guest kernel to
+			 * reassign BARs
+			 */
+			assigned_addresses[n].addr = cpu_to_be64(bar_to_addr(le32_to_cpu(hdr->bar[i])));
+			assigned_addresses[n].size = reg[n+1].size;
+
+			++n;
+		}
+		_FDT(fdt_setprop(fdt, node_off, "reg", reg, sizeof(reg[0])*(n+1)));
+		_FDT(fdt_setprop(fdt, node_off, "assigned-addresses",
+				 assigned_addresses,
+				 sizeof(assigned_addresses[0])*(n)));
+		_FDT(fdt_setprop_cell(fdt, node_off, "interrupts",
+				      hdr->irq_pin));
+
+		/* We don't set ibm,dma-window property as we don't have an IOMMU. */
+
+		++devices;
+		dev_hdr = device__next_dev(dev_hdr);
+	}
+
+	/* Write interrupt map */
+	_FDT(fdt_setprop(fdt, bus_off, "interrupt-map", &interrupt_map,
+			 devices * sizeof(interrupt_map[0])));
+
+	return 0;
+}
diff --git a/kvmtool/powerpc/spapr_pci.h b/kvmtool/powerpc/spapr_pci.h
new file mode 100644
index 0000000..f659eda
--- /dev/null
+++ b/kvmtool/powerpc/spapr_pci.h
@@ -0,0 +1,57 @@
+/*
+ * SPAPR PHB definitions
+ *
+ * Modifications by Matt Evans <matt@ozlabs.org>, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#ifndef SPAPR_PCI_H
+#define SPAPR_PCI_H
+
+#include "kvm/kvm.h"
+#include "spapr.h"
+#include <inttypes.h>
+
+/* With XICS, we can easily accommodate one IRQ per PCI device. */
+
+#define SPAPR_PCI_NUM_LSI 256
+
+struct spapr_phb {
+	uint64_t buid;
+	uint64_t mem_addr;
+	uint64_t mem_size;
+	uint64_t io_addr;
+	uint64_t io_size;
+};
+
+void spapr_create_phb(struct kvm *kvm,
+                      const char *busname, uint64_t buid,
+                      uint64_t mem_win_addr, uint64_t mem_win_size,
+                      uint64_t io_win_addr, uint64_t io_win_size);
+
+int spapr_populate_pci_devices(struct kvm *kvm,
+                               uint32_t xics_phandle,
+                               void *fdt);
+
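+/*
+ * Route an access within the PHB windows to kvmtool's generic emulation:
+ * the I/O window is replayed as port I/O and the memory window as MMIO,
+ * both rebased to window-relative offsets.
+ */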
+static inline bool spapr_phb_mmio(struct kvm_cpu *vcpu, u64 phys_addr, u8 *data, u32 len, u8 is_write)
+{
+	if ((phys_addr >= SPAPR_PCI_IO_WIN_ADDR) &&
+	    (phys_addr < SPAPR_PCI_IO_WIN_ADDR +
+	     SPAPR_PCI_IO_WIN_SIZE)) {
+		return kvm__emulate_io(vcpu, phys_addr - SPAPR_PCI_IO_WIN_ADDR,
+				       data, is_write ? KVM_EXIT_IO_OUT :
+				       KVM_EXIT_IO_IN,
+				       len, 1);
+	} else if ((phys_addr >= SPAPR_PCI_MEM_WIN_ADDR) &&
+		   (phys_addr < SPAPR_PCI_MEM_WIN_ADDR +
+		    SPAPR_PCI_MEM_WIN_SIZE)) {
+		return kvm__emulate_mmio(vcpu, phys_addr - SPAPR_PCI_MEM_WIN_ADDR,
+					 data, len, is_write);
+	}
+	return false;
+}
+
+#endif
diff --git a/kvmtool/powerpc/spapr_rtas.c b/kvmtool/powerpc/spapr_rtas.c
new file mode 100644
index 0000000..b898ff2
--- /dev/null
+++ b/kvmtool/powerpc/spapr_rtas.c
@@ -0,0 +1,246 @@
+/*
+ * SPAPR base RTAS calls
+ *
+ * Borrowed heavily from QEMU's spapr_rtas.c
+ * Copyright (c) 2010-2011 David Gibson, IBM Corporation.
+ *
+ * Modifications copyright 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include "kvm/kvm.h"
+#include "kvm/kvm-cpu.h"
+#include "kvm/util.h"
+#include "kvm/term.h"
+
+#include "spapr.h"
+
+#include <libfdt.h>
+#include <stdio.h>
+#include <assert.h>
+
+#define TOKEN_BASE      0x2000
+#define TOKEN_MAX       0x100
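+
+/*
+ * RTAS tokens are handed out linearly from TOKEN_BASE as calls are
+ * registered; spapr_rtas_fdt_setup() then advertises each token to the
+ * guest as an /rtas property named after the call.
+ */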
+
+#define RTAS_CONSOLE
+
+static struct rtas_call {
+	const char *name;
+	spapr_rtas_fn fn;
+} rtas_table[TOKEN_MAX];
+
+static struct rtas_call *rtas_next = rtas_table;
+
+
+static void rtas_display_character(struct kvm_cpu *vcpu,
+                                   uint32_t token, uint32_t nargs,
+                                   target_ulong args,
+                                   uint32_t nret, target_ulong rets)
+{
+	char c = rtas_ld(vcpu->kvm, args, 0);
+	term_putc(&c, 1, 0);
+	rtas_st(vcpu->kvm, rets, 0, 0);
+}
+
+#ifdef RTAS_CONSOLE
+static void rtas_put_term_char(struct kvm_cpu *vcpu,
+			       uint32_t token, uint32_t nargs,
+			       target_ulong args,
+			       uint32_t nret, target_ulong rets)
+{
+	char c = rtas_ld(vcpu->kvm, args, 0);
+
+	term_putc(&c, 1, 0);
+
+	rtas_st(vcpu->kvm, rets, 0, 0);
+}
+
+static void rtas_get_term_char(struct kvm_cpu *vcpu,
+			       uint32_t token, uint32_t nargs,
+			       target_ulong args,
+			       uint32_t nret, target_ulong rets)
+{
+	int c;
+
+	if (vcpu->kvm->cfg.active_console == CONSOLE_HV && term_readable(0) &&
+	    (c = term_getc(vcpu->kvm, 0)) >= 0) {
+		rtas_st(vcpu->kvm, rets, 0, 0);
+		rtas_st(vcpu->kvm, rets, 1, c);
+	} else {
+		rtas_st(vcpu->kvm, rets, 0, -2);
+	}
+}
+#endif
+
+static void rtas_get_time_of_day(struct kvm_cpu *vcpu,
+                                 uint32_t token, uint32_t nargs,
+                                 target_ulong args,
+                                 uint32_t nret, target_ulong rets)
+{
+	struct tm tm;
+	time_t tnow;
+
+	if (nret != 8) {
+		rtas_st(vcpu->kvm, rets, 0, -3);
+		return;
+	}
+
+	tnow = time(NULL);
+	/* Guest time is currently not offset in any way. */
+	gmtime_r(&tnow, &tm);
+
+	rtas_st(vcpu->kvm, rets, 0, 0); /* Success */
+	rtas_st(vcpu->kvm, rets, 1, tm.tm_year + 1900);
+	rtas_st(vcpu->kvm, rets, 2, tm.tm_mon + 1);
+	rtas_st(vcpu->kvm, rets, 3, tm.tm_mday);
+	rtas_st(vcpu->kvm, rets, 4, tm.tm_hour);
+	rtas_st(vcpu->kvm, rets, 5, tm.tm_min);
+	rtas_st(vcpu->kvm, rets, 6, tm.tm_sec);
+	rtas_st(vcpu->kvm, rets, 7, 0);
+}
+
+static void rtas_set_time_of_day(struct kvm_cpu *vcpu,
+                                 uint32_t token, uint32_t nargs,
+                                 target_ulong args,
+                                 uint32_t nret, target_ulong rets)
+{
+	pr_warning("%s called; TOD set ignored.\n", __FUNCTION__);
+}
+
+static void rtas_power_off(struct kvm_cpu *vcpu,
+                           uint32_t token, uint32_t nargs, target_ulong args,
+                           uint32_t nret, target_ulong rets)
+{
+	if (nargs != 2 || nret != 1) {
+		rtas_st(vcpu->kvm, rets, 0, -3);
+		return;
+	}
+	kvm__reboot(vcpu->kvm);
+}
+
+static void rtas_system_reboot(struct kvm_cpu *vcpu,
+                           uint32_t token, uint32_t nargs, target_ulong args,
+                           uint32_t nret, target_ulong rets)
+{
+	if (nargs != 0 || nret != 1) {
+		rtas_st(vcpu->kvm, rets, 0, -3);
+		return;
+	}
+
+	/* NB this actually halts the VM */
+	kvm__reboot(vcpu->kvm);
+}
+
+static void rtas_query_cpu_stopped_state(struct kvm_cpu *vcpu,
+                                         uint32_t token, uint32_t nargs,
+                                         target_ulong args,
+                                         uint32_t nret, target_ulong rets)
+{
+	if (nargs != 1 || nret != 2) {
+		rtas_st(vcpu->kvm, rets, 0, -3);
+		return;
+	}
+
+	/*
+	 * We could read the CPU id via rtas_ld(vcpu->kvm, args, 0), but we
+	 * currently start all CPUs, so simply report every CPU as started (2).
+	 */
+	rtas_st(vcpu->kvm, rets, 0, 0);
+	rtas_st(vcpu->kvm, rets, 1, 2);
+}
+
+static void rtas_start_cpu(struct kvm_cpu *vcpu,
+                           uint32_t token, uint32_t nargs,
+                           target_ulong args,
+                           uint32_t nret, target_ulong rets)
+{
+	die(__FUNCTION__);
+}
+
+target_ulong spapr_rtas_call(struct kvm_cpu *vcpu,
+                             uint32_t token, uint32_t nargs, target_ulong args,
+                             uint32_t nret, target_ulong rets)
+{
+	if ((token >= TOKEN_BASE)
+	    && ((token - TOKEN_BASE) < TOKEN_MAX)) {
+		struct rtas_call *call = rtas_table + (token - TOKEN_BASE);
+
+		if (call->fn) {
+			call->fn(vcpu, token, nargs, args, nret, rets);
+			return H_SUCCESS;
+		}
+	}
+
+	/*
+	 * HACK: Some Linux early debug code uses RTAS display-character,
+	 * but assumes the token value is 0xa (which it is on some real
+	 * machines) without looking it up in the device tree.  This
+	 * special case makes this work.
+	 */
+	if (token == 0xa) {
+		rtas_display_character(vcpu, 0xa, nargs, args, nret, rets);
+		return H_SUCCESS;
+	}
+
+	hcall_dprintf("Unknown RTAS token 0x%x\n", token);
+	rtas_st(vcpu->kvm, rets, 0, -3);
+	return H_PARAMETER;
+}
+
+void spapr_rtas_register(const char *name, spapr_rtas_fn fn)
+{
+	assert(rtas_next < (rtas_table + TOKEN_MAX));
+
+	rtas_next->name = name;
+	rtas_next->fn = fn;
+
+	rtas_next++;
+}
+
+/*
+ * This is called from the context of an open /rtas node, in order to add
+ * properties for the rtas call tokens.
+ */
+int spapr_rtas_fdt_setup(struct kvm *kvm, void *fdt)
+{
+	int ret;
+	int i;
+
+	for (i = 0; i < TOKEN_MAX; i++) {
+		struct rtas_call *call = &rtas_table[i];
+
+		if (!call->fn) {
+			continue;
+		}
+
+		ret = fdt_property_cell(fdt, call->name, i + TOKEN_BASE);
+
+		if (ret < 0) {
+			pr_warning("Couldn't add rtas token for %s: %s\n",
+				   call->name, fdt_strerror(ret));
+			return ret;
+		}
+
+	}
+	return 0;
+}
+
+void register_core_rtas(void)
+{
+	spapr_rtas_register("display-character", rtas_display_character);
+	spapr_rtas_register("get-time-of-day", rtas_get_time_of_day);
+	spapr_rtas_register("set-time-of-day", rtas_set_time_of_day);
+	spapr_rtas_register("power-off", rtas_power_off);
+	spapr_rtas_register("system-reboot", rtas_system_reboot);
+	spapr_rtas_register("query-cpu-stopped-state",
+			    rtas_query_cpu_stopped_state);
+	spapr_rtas_register("start-cpu", rtas_start_cpu);
+#ifdef RTAS_CONSOLE
+	/* These are unused: We do console I/O via hcalls, not rtas. */
+	spapr_rtas_register("put-term-char", rtas_put_term_char);
+	spapr_rtas_register("get-term-char", rtas_get_term_char);
+#endif
+}
diff --git a/kvmtool/powerpc/xics.c b/kvmtool/powerpc/xics.c
new file mode 100644
index 0000000..071b928
--- /dev/null
+++ b/kvmtool/powerpc/xics.c
@@ -0,0 +1,495 @@
+/*
+ * PAPR Virtualized Interrupt System, aka ICS/ICP aka xics
+ *
+ * Borrowed heavily from QEMU's xics.c,
+ * Copyright (c) 2010,2011 David Gibson, IBM Corporation.
+ *
+ * Modifications copyright 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include "spapr.h"
+#include "xics.h"
+#include "kvm/util.h"
+#include "kvm/kvm.h"
+
+#include <stdio.h>
+#include <malloc.h>
+
+#define XICS_NUM_IRQS	1024
+
+
+/* #define DEBUG_XICS yes */
+#ifdef DEBUG_XICS
+#define xics_dprintf(fmt, ...)					\
+	do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
+#else
+#define xics_dprintf(fmt, ...)			\
+	do { } while (0)
+#endif
+
+/*
+ * ICP: Presentation layer
+ */
+
+struct icp_server_state {
+	uint32_t xirr;
+	uint8_t pending_priority;
+	uint8_t mfrr;
+	struct kvm_cpu *cpu;
+};
+
+#define XICS_IRQ_OFFSET KVM_IRQ_OFFSET
+#define XISR_MASK	0x00ffffff
+#define CPPR_MASK	0xff000000
+
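+/*
+ * The 32-bit XIRR packs the CPPR (current processor priority) into the top
+ * byte and the XISR (pending interrupt source) into the low 24 bits.
+ */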
+#define XISR(ss)   (((ss)->xirr) & XISR_MASK)
+#define CPPR(ss)   (((ss)->xirr) >> 24)
+
+struct ics_state;
+
+struct icp_state {
+	unsigned long nr_servers;
+	struct icp_server_state *ss;
+	struct ics_state *ics;
+};
+
+static void ics_reject(struct ics_state *ics, int nr);
+static void ics_resend(struct ics_state *ics);
+static void ics_eoi(struct ics_state *ics, int nr);
+
+static inline void cpu_irq_raise(struct kvm_cpu *vcpu)
+{
+	xics_dprintf("INT1[%p]\n", vcpu);
+	kvm_cpu__irq(vcpu, POWER7_EXT_IRQ, 1);
+}
+
+static inline void cpu_irq_lower(struct kvm_cpu *vcpu)
+{
+	xics_dprintf("INT0[%p]\n", vcpu);
+	kvm_cpu__irq(vcpu, POWER7_EXT_IRQ, 0);
+}
+
+static void icp_check_ipi(struct icp_state *icp, int server)
+{
+	struct icp_server_state *ss = icp->ss + server;
+
+	if (XISR(ss) && (ss->pending_priority <= ss->mfrr)) {
+		return;
+	}
+
+	if (XISR(ss)) {
+		ics_reject(icp->ics, XISR(ss));
+	}
+
+	ss->xirr = (ss->xirr & ~XISR_MASK) | XICS_IPI;
+	ss->pending_priority = ss->mfrr;
+	cpu_irq_raise(ss->cpu);
+}
+
+static void icp_resend(struct icp_state *icp, int server)
+{
+	struct icp_server_state *ss = icp->ss + server;
+
+	if (ss->mfrr < CPPR(ss)) {
+		icp_check_ipi(icp, server);
+	}
+	ics_resend(icp->ics);
+}
+
+static void icp_set_cppr(struct icp_state *icp, int server, uint8_t cppr)
+{
+	struct icp_server_state *ss = icp->ss + server;
+	uint8_t old_cppr;
+	uint32_t old_xisr;
+
+	old_cppr = CPPR(ss);
+	ss->xirr = (ss->xirr & ~CPPR_MASK) | (cppr << 24);
+
+	if (cppr < old_cppr) {
+		if (XISR(ss) && (cppr <= ss->pending_priority)) {
+			old_xisr = XISR(ss);
+			ss->xirr &= ~XISR_MASK; /* Clear XISR */
+			cpu_irq_lower(ss->cpu);
+			ics_reject(icp->ics, old_xisr);
+		}
+	} else {
+		if (!XISR(ss)) {
+			icp_resend(icp, server);
+		}
+	}
+}
+
+static void icp_set_mfrr(struct icp_state *icp, int nr, uint8_t mfrr)
+{
+	struct icp_server_state *ss = icp->ss + nr;
+
+	ss->mfrr = mfrr;
+	if (mfrr < CPPR(ss)) {
+		icp_check_ipi(icp, nr);
+	}
+}
+
+static uint32_t icp_accept(struct icp_server_state *ss)
+{
+	uint32_t xirr;
+
+	cpu_irq_lower(ss->cpu);
+	xirr = ss->xirr;
+	ss->xirr = ss->pending_priority << 24;
+	return xirr;
+}
+
+static void icp_eoi(struct icp_state *icp, int server, uint32_t xirr)
+{
+	struct icp_server_state *ss = icp->ss + server;
+
+	ics_eoi(icp->ics, xirr & XISR_MASK);
+	/* Send EOI -> ICS */
+	ss->xirr = (ss->xirr & ~CPPR_MASK) | (xirr & CPPR_MASK);
+	if (!XISR(ss)) {
+		icp_resend(icp, server);
+	}
+}
+
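+/*
+ * Offer interrupt "nr" at "priority" to "server".  Numerically lower values
+ * are more favoured; the interrupt is bounced back to the ICS if it doesn't
+ * beat the CPPR or an already-pending interrupt.
+ */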
+static void icp_irq(struct icp_state *icp, int server, int nr, uint8_t priority)
+{
+	struct icp_server_state *ss = icp->ss + server;
+	xics_dprintf("icp_irq(nr %d, server %d, prio 0x%x)\n", nr, server, priority);
+	if ((priority >= CPPR(ss))
+	    || (XISR(ss) && (ss->pending_priority <= priority))) {
+		xics_dprintf("reject %d, CPPR 0x%x, XISR 0x%x, pprio 0x%x, prio 0x%x\n",
+			     nr, CPPR(ss), XISR(ss), ss->pending_priority, priority);
+		ics_reject(icp->ics, nr);
+	} else {
+		if (XISR(ss)) {
+			xics_dprintf("reject %d, CPPR 0x%x, XISR 0x%x, pprio 0x%x, prio 0x%x\n",
+				     nr, CPPR(ss), XISR(ss), ss->pending_priority, priority);
+			ics_reject(icp->ics, XISR(ss));
+		}
+		ss->xirr = (ss->xirr & ~XISR_MASK) | (nr & XISR_MASK);
+		ss->pending_priority = priority;
+		cpu_irq_raise(ss->cpu);
+	}
+}
+
+/*
+ * ICS: Source layer
+ */
+
+struct ics_irq_state {
+	int server;
+	uint8_t priority;
+	uint8_t saved_priority;
+	unsigned int rejected:1;
+	unsigned int masked_pending:1;
+};
+
+struct ics_state {
+	unsigned int nr_irqs;
+	unsigned int offset;
+	struct ics_irq_state *irqs;
+	struct icp_state *icp;
+};
+
+static int ics_valid_irq(struct ics_state *ics, uint32_t nr)
+{
+	return (nr >= ics->offset)
+		&& (nr < (ics->offset + ics->nr_irqs));
+}
+
+static void ics_set_irq_msi(struct ics_state *ics, int srcno, int val)
+{
+	struct ics_irq_state *irq = ics->irqs + srcno;
+
+	if (val) {
+		if (irq->priority == 0xff) {
+			xics_dprintf(" irq pri ff, masked pending\n");
+			irq->masked_pending = 1;
+		} else	{
+			icp_irq(ics->icp, irq->server, srcno + ics->offset, irq->priority);
+		}
+	}
+}
+
+static void ics_reject_msi(struct ics_state *ics, int nr)
+{
+	struct ics_irq_state *irq = ics->irqs + nr - ics->offset;
+
+	irq->rejected = 1;
+}
+
+static void ics_resend_msi(struct ics_state *ics)
+{
+	unsigned int i;
+
+	for (i = 0; i < ics->nr_irqs; i++) {
+		struct ics_irq_state *irq = ics->irqs + i;
+
+		/* FIXME: filter by server#? */
+		if (irq->rejected) {
+			irq->rejected = 0;
+			if (irq->priority != 0xff) {
+				icp_irq(ics->icp, irq->server, i + ics->offset, irq->priority);
+			}
+		}
+	}
+}
+
+static void ics_write_xive_msi(struct ics_state *ics, int nr, int server,
+			       uint8_t priority)
+{
+	struct ics_irq_state *irq = ics->irqs + nr - ics->offset;
+
+	irq->server = server;
+	irq->priority = priority;
+	xics_dprintf("ics_write_xive_msi(nr %d, server %d, pri 0x%x)\n", nr, server, priority);
+
+	if (!irq->masked_pending || (priority == 0xff)) {
+		return;
+	}
+
+	irq->masked_pending = 0;
+	icp_irq(ics->icp, server, nr, priority);
+}
+
+static void ics_reject(struct ics_state *ics, int nr)
+{
+	ics_reject_msi(ics, nr);
+}
+
+static void ics_resend(struct ics_state *ics)
+{
+	ics_resend_msi(ics);
+}
+
+static void ics_eoi(struct ics_state *ics, int nr)
+{
+}
+
+/*
+ * Exported functions
+ */
+
+static target_ulong h_cppr(struct kvm_cpu *vcpu,
+			   target_ulong opcode, target_ulong *args)
+{
+	target_ulong cppr = args[0];
+
+	xics_dprintf("h_cppr(%lx)\n", cppr);
+	icp_set_cppr(vcpu->kvm->arch.icp, vcpu->cpu_id, cppr);
+	return H_SUCCESS;
+}
+
+static target_ulong h_ipi(struct kvm_cpu *vcpu,
+			  target_ulong opcode, target_ulong *args)
+{
+	target_ulong server = args[0];
+	target_ulong mfrr = args[1];
+
+	xics_dprintf("h_ipi(%lx, %lx)\n", server, mfrr);
+	if (server >= vcpu->kvm->arch.icp->nr_servers) {
+		return H_PARAMETER;
+	}
+
+	icp_set_mfrr(vcpu->kvm->arch.icp, server, mfrr);
+	return H_SUCCESS;
+}
+
+static target_ulong h_xirr(struct kvm_cpu *vcpu,
+			   target_ulong opcode, target_ulong *args)
+{
+	uint32_t xirr = icp_accept(vcpu->kvm->arch.icp->ss + vcpu->cpu_id);
+
+	xics_dprintf("h_xirr() = %x\n", xirr);
+	args[0] = xirr;
+	return H_SUCCESS;
+}
+
+static target_ulong h_eoi(struct kvm_cpu *vcpu,
+			  target_ulong opcode, target_ulong *args)
+{
+	target_ulong xirr = args[0];
+
+	xics_dprintf("h_eoi(%lx)\n", xirr);
+	icp_eoi(vcpu->kvm->arch.icp, vcpu->cpu_id, xirr);
+	return H_SUCCESS;
+}
+
+static void rtas_set_xive(struct kvm_cpu *vcpu, uint32_t token,
+			  uint32_t nargs, target_ulong args,
+			  uint32_t nret, target_ulong rets)
+{
+	struct ics_state *ics = vcpu->kvm->arch.icp->ics;
+	uint32_t nr, server, priority;
+
+	if ((nargs != 3) || (nret != 1)) {
+		rtas_st(vcpu->kvm, rets, 0, -3);
+		return;
+	}
+
+	nr = rtas_ld(vcpu->kvm, args, 0);
+	server = rtas_ld(vcpu->kvm, args, 1);
+	priority = rtas_ld(vcpu->kvm, args, 2);
+
+	xics_dprintf("rtas_set_xive(%x,%x,%x)\n", nr, server, priority);
+	if (!ics_valid_irq(ics, nr) || (server >= ics->icp->nr_servers)
+	    || (priority > 0xff)) {
+		rtas_st(vcpu->kvm, rets, 0, -3);
+		return;
+	}
+
+	ics_write_xive_msi(ics, nr, server, priority);
+
+	rtas_st(vcpu->kvm, rets, 0, 0); /* Success */
+}
+
+static void rtas_get_xive(struct kvm_cpu *vcpu, uint32_t token,
+			  uint32_t nargs, target_ulong args,
+			  uint32_t nret, target_ulong rets)
+{
+	struct ics_state *ics = vcpu->kvm->arch.icp->ics;
+	uint32_t nr;
+
+	if ((nargs != 1) || (nret != 3)) {
+		rtas_st(vcpu->kvm, rets, 0, -3);
+		return;
+	}
+
+	nr = rtas_ld(vcpu->kvm, args, 0);
+
+	if (!ics_valid_irq(ics, nr)) {
+		rtas_st(vcpu->kvm, rets, 0, -3);
+		return;
+	}
+
+	rtas_st(vcpu->kvm, rets, 0, 0); /* Success */
+	rtas_st(vcpu->kvm, rets, 1, ics->irqs[nr - ics->offset].server);
+	rtas_st(vcpu->kvm, rets, 2, ics->irqs[nr - ics->offset].priority);
+}
+
+static void rtas_int_off(struct kvm_cpu *vcpu, uint32_t token,
+			 uint32_t nargs, target_ulong args,
+			 uint32_t nret, target_ulong rets)
+{
+	struct ics_state *ics = vcpu->kvm->arch.icp->ics;
+	uint32_t nr;
+
+	if ((nargs != 1) || (nret != 1)) {
+		rtas_st(vcpu->kvm, rets, 0, -3);
+		return;
+	}
+
+	nr = rtas_ld(vcpu->kvm, args, 0);
+
+	if (!ics_valid_irq(ics, nr)) {
+		rtas_st(vcpu->kvm, rets, 0, -3);
+		return;
+	}
+
+	/* ME: QEMU wrote xive_msi here, in #if 0.  Deleted. */
+
+	rtas_st(vcpu->kvm, rets, 0, 0); /* Success */
+}
+
+static void rtas_int_on(struct kvm_cpu *vcpu, uint32_t token,
+			uint32_t nargs, target_ulong args,
+			uint32_t nret, target_ulong rets)
+{
+	struct ics_state *ics = vcpu->kvm->arch.icp->ics;
+	uint32_t nr;
+
+	if ((nargs != 1) || (nret != 1)) {
+		rtas_st(vcpu->kvm, rets, 0, -3);
+		return;
+	}
+
+	nr = rtas_ld(vcpu->kvm, args, 0);
+
+	if (!ics_valid_irq(ics, nr)) {
+		rtas_st(vcpu->kvm, rets, 0, -3);
+		return;
+	}
+
+	/* ME: QEMU wrote xive_msi here, in #if 0.  Deleted. */
+
+	rtas_st(vcpu->kvm, rets, 0, 0); /* Success */
+}
+
+static int xics_init(struct kvm *kvm)
+{
+	unsigned int i;
+	struct icp_state *icp;
+	struct ics_state *ics;
+	int j;
+
+	icp = malloc(sizeof(*icp));
+	icp->nr_servers = kvm->nrcpus;
+	icp->ss = malloc(icp->nr_servers * sizeof(struct icp_server_state));
+
+	for (i = 0; i < icp->nr_servers; i++) {
+		icp->ss[i].xirr = 0;
+		icp->ss[i].pending_priority = 0;
+		icp->ss[i].cpu = 0;
+		icp->ss[i].mfrr = 0xff;
+	}
+
+	/*
+	 * icp->ss[env->cpu_index].cpu is set by CPUs calling in to
+	 * xics_cpu_register().
+	 */
+
+	ics = malloc(sizeof(*ics));
+	ics->nr_irqs = XICS_NUM_IRQS;
+	ics->offset = XICS_IRQ_OFFSET;
+	ics->irqs = malloc(ics->nr_irqs * sizeof(struct ics_irq_state));
+
+	icp->ics = ics;
+	ics->icp = icp;
+
+	for (i = 0; i < ics->nr_irqs; i++) {
+		ics->irqs[i].server = 0;
+		ics->irqs[i].priority = 0xff;
+		ics->irqs[i].saved_priority = 0xff;
+		ics->irqs[i].rejected = 0;
+		ics->irqs[i].masked_pending = 0;
+	}
+
+	spapr_register_hypercall(H_CPPR, h_cppr);
+	spapr_register_hypercall(H_IPI, h_ipi);
+	spapr_register_hypercall(H_XIRR, h_xirr);
+	spapr_register_hypercall(H_EOI, h_eoi);
+
+	spapr_rtas_register("ibm,set-xive", rtas_set_xive);
+	spapr_rtas_register("ibm,get-xive", rtas_get_xive);
+	spapr_rtas_register("ibm,int-off", rtas_int_off);
+	spapr_rtas_register("ibm,int-on", rtas_int_on);
+
+	for (j = 0; j < kvm->nrcpus; j++) {
+		struct kvm_cpu *vcpu = kvm->cpus[j];
+
+		if (vcpu->cpu_id >= icp->nr_servers)
+			die("Invalid server number for cpuid %ld\n", vcpu->cpu_id);
+
+		icp->ss[vcpu->cpu_id].cpu = vcpu;
+	}
+
+	kvm->arch.icp = icp;
+
+	return 0;
+}
+dev_base_init(xics_init);
+
+
+void kvm__irq_line(struct kvm *kvm, int irq, int level)
+{
+	/*
+	 * Route event to ICS, which routes to ICP, which eventually does a
+	 * kvm_cpu__irq(vcpu, POWER7_EXT_IRQ, 1)
+	 */
+	xics_dprintf("Raising IRQ %d -> %d\n", irq, level);
+	ics_set_irq_msi(kvm->arch.icp->ics, irq - kvm->arch.icp->ics->offset, level);
+}
diff --git a/kvmtool/powerpc/xics.h b/kvmtool/powerpc/xics.h
new file mode 100644
index 0000000..d5bc6f9
--- /dev/null
+++ b/kvmtool/powerpc/xics.h
@@ -0,0 +1,18 @@
+/*
+ * PAPR Virtualized Interrupt System, aka ICS/ICP aka xics
+ *
+ * Copyright 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#ifndef XICS_H
+#define XICS_H
+
+#define XICS_IPI        0x2
+
+int xics_alloc_irqnum(void);
+
+#endif
diff --git a/kvmtool/symbol.c b/kvmtool/symbol.c
new file mode 100644
index 0000000..07dd9d5
--- /dev/null
+++ b/kvmtool/symbol.c
@@ -0,0 +1,133 @@
+#include "kvm/symbol.h"
+
+#include "kvm/kvm.h"
+
+#include <linux/err.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <bfd.h>
+
+static bfd *abfd;
+
+int symbol_init(struct kvm *kvm)
+{
+	int ret = 0;
+
+	if (!kvm->vmlinux)
+		return 0;
+
+	bfd_init();
+
+	abfd = bfd_openr(kvm->vmlinux, NULL);
+	if (abfd == NULL) {
+		bfd_error_type err = bfd_get_error();
+
+		switch (err) {
+		case bfd_error_no_memory:
+			ret = -ENOMEM;
+			break;
+		case bfd_error_invalid_target:
+			ret = -EINVAL;
+			break;
+		default:
+			ret = -EFAULT;
+			break;
+		}
+	}
+
+	return ret;
+}
+late_init(symbol_init);
+
+static asymbol *lookup(asymbol **symbols, int nr_symbols, const char *symbol_name)
+{
+	int i, ret;
+
+	ret = -ENOENT;
+
+	for (i = 0; i < nr_symbols; i++) {
+		asymbol *symbol = symbols[i];
+
+		if (!strcmp(bfd_asymbol_name(symbol), symbol_name))
+			return symbol;
+	}
+
+	return ERR_PTR(ret);
+}
+
+char *symbol_lookup(struct kvm *kvm, unsigned long addr, char *sym, size_t size)
+{
+	const char *filename;
+	bfd_vma sym_offset;
+	bfd_vma sym_start;
+	asection *section;
+	unsigned int line;
+	const char *func;
+	long symtab_size;
+	asymbol *symbol;
+	asymbol **syms;
+	int nr_syms, ret;
+
+	ret = -ENOENT;
+	if (!abfd)
+		goto not_found;
+
+	if (!bfd_check_format(abfd, bfd_object))
+		goto not_found;
+
+	symtab_size = bfd_get_symtab_upper_bound(abfd);
+	if (symtab_size <= 0)
+		goto not_found;
+
+	ret = -ENOMEM;
+	syms = malloc(symtab_size);
+	if (!syms)
+		goto not_found;
+
+	nr_syms = bfd_canonicalize_symtab(abfd, syms);
+
+	ret = -ENOENT;
+	section = bfd_get_section_by_name(abfd, ".debug_aranges");
+	if (!section)
+		goto out_free;
+
+	if (!bfd_find_nearest_line(abfd, section, NULL, addr, &filename, &func, &line))
+		goto out_free;
+
+	if (!func)
+		goto out_free;
+
+	symbol = lookup(syms, nr_syms, func);
+	if (IS_ERR(symbol))
+		goto out_free;
+
+	sym_start = bfd_asymbol_value(symbol);
+
+	sym_offset = addr - sym_start;
+
+	snprintf(sym, size, "%s+%llx (%s:%i)", func, (long long) sym_offset, filename, line);
+
+	sym[size - 1] = '\0';
+
+	free(syms);
+
+	return sym;
+
+out_free:
+	/* Don't leak the symbol table on the error paths above */
+	free(syms);
+not_found:
+	return ERR_PTR(ret);
+}
+
+int symbol_exit(struct kvm *kvm)
+{
+	bfd_boolean ret = TRUE;
+
+	if (abfd)
+		ret = bfd_close(abfd);
+
+	if (ret == TRUE)
+		return 0;
+
+	return -EFAULT;
+}
+late_exit(symbol_exit);
diff --git a/kvmtool/term.c b/kvmtool/term.c
new file mode 100644
index 0000000..b8a70fe
--- /dev/null
+++ b/kvmtool/term.c
@@ -0,0 +1,213 @@
+#include <poll.h>
+#include <stdbool.h>
+#include <termios.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/uio.h>
+#include <signal.h>
+#include <pty.h>
+#include <utmp.h>
+
+#include "kvm/read-write.h"
+#include "kvm/term.h"
+#include "kvm/util.h"
+#include "kvm/kvm.h"
+#include "kvm/kvm-cpu.h"
+
+#define TERM_FD_IN      0
+#define TERM_FD_OUT     1
+
+static struct termios	orig_term;
+
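+/* One (input, output) fd pair per terminal; term_init() points unset pairs at stdin/stdout */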
+static int term_fds[TERM_MAX_DEVS][2];
+
+static pthread_t term_poll_thread;
+
+/*
+ * Ctrl-A is the terminal escape character: Ctrl-A x reboots the guest,
+ * and Ctrl-A Ctrl-A passes a literal Ctrl-A through to the guest.
+ */
+#define term_escape_char	0x01
+
+int term_getc(struct kvm *kvm, int term)
+{
+	static bool term_got_escape = false;
+	unsigned char c;
+
+	if (read_in_full(term_fds[term][TERM_FD_IN], &c, 1) < 0)
+		return -1;
+
+	if (term_got_escape) {
+		term_got_escape = false;
+		if (c == 'x')
+			kvm__reboot(kvm);
+		if (c == term_escape_char)
+			return c;
+	}
+
+	if (c == term_escape_char) {
+		term_got_escape = true;
+		return -1;
+	}
+
+	return c;
+}
+
+int term_putc(char *addr, int cnt, int term)
+{
+	int ret;
+	int num_remaining = cnt;
+
+	while (num_remaining) {
+		ret = write(term_fds[term][TERM_FD_OUT], addr, num_remaining);
+		if (ret < 0)
+			return cnt - num_remaining;
+		num_remaining -= ret;
+		addr += ret;
+	}
+
+	return cnt;
+}
+
+int term_getc_iov(struct kvm *kvm, struct iovec *iov, int iovcnt, int term)
+{
+	int c;
+
+	c = term_getc(kvm, term);
+
+	if (c < 0)
+		return 0;
+
+	*((char *)iov[0].iov_base) = (char)c;	/* write into the first iovec */
+
+	return sizeof(char);
+}
+
+int term_putc_iov(struct iovec *iov, int iovcnt, int term)
+{
+	return writev(term_fds[term][TERM_FD_OUT], iov, iovcnt);
+}
+
+bool term_readable(int term)
+{
+	struct pollfd pollfd = (struct pollfd) {
+		.fd	= term_fds[term][TERM_FD_IN],
+		.events	= POLLIN,
+		.revents = 0,
+	};
+	int err;
+
+	err = poll(&pollfd, 1, 0);
+	return (err > 0 && (pollfd.revents & POLLIN));
+}
+
+static void *term_poll_thread_loop(void *param)
+{
+	struct pollfd fds[TERM_MAX_DEVS];
+	struct kvm *kvm = (struct kvm *) param;
+	int i;
+
+	kvm__set_thread_name("term-poll");
+
+	for (i = 0; i < TERM_MAX_DEVS; i++) {
+		fds[i].fd = term_fds[i][TERM_FD_IN];
+		fds[i].events = POLLIN;
+		fds[i].revents = 0;
+	}
+
+	while (1) {
+		/* Poll with infinite timeout */
+		if (poll(fds, TERM_MAX_DEVS, -1) < 1)
+			break;
+		kvm__arch_read_term(kvm);
+	}
+
+	die("term_poll_thread_loop: error polling device fds %d\n", errno);
+	return NULL;
+}
+
+static void term_cleanup(void)
+{
+	int i;
+
+	for (i = 0; i < TERM_MAX_DEVS; i++)
+		tcsetattr(term_fds[i][TERM_FD_IN], TCSANOW, &orig_term);
+}
+
+static void term_sig_cleanup(int sig)
+{
+	term_cleanup();
+	signal(sig, SIG_DFL);
+	raise(sig);
+}
+
+static void term_set_tty(int term)
+{
+	struct termios pty_term;	/* don't shadow the file-scope orig_term */
+	int master, slave;
+	char new_pty[PATH_MAX];
+
+	if (tcgetattr(STDIN_FILENO, &pty_term) < 0)
+		die("unable to save initial standard input settings");
+
+	pty_term.c_lflag &= ~(ICANON | ECHO | ISIG);
+
+	if (openpty(&master, &slave, new_pty, &pty_term, NULL) < 0)
+		return;
+
+	close(slave);
+
+	pr_info("Assigned terminal %d to pty %s\n", term, new_pty);
+
+	term_fds[term][TERM_FD_IN] = term_fds[term][TERM_FD_OUT] = master;
+}
+
+int tty_parser(const struct option *opt, const char *arg, int unset)
+{
+	int tty = atoi(arg);
+
+	term_set_tty(tty);
+
+	return 0;
+}
+
+static int term_init(struct kvm *kvm)
+{
+	struct termios term;
+	int i, r;
+
+	for (i = 0; i < TERM_MAX_DEVS; i++)
+		if (term_fds[i][TERM_FD_IN] == 0) {
+			term_fds[i][TERM_FD_IN] = STDIN_FILENO;
+			term_fds[i][TERM_FD_OUT] = STDOUT_FILENO;
+		}
+
+	if (!isatty(STDIN_FILENO) || !isatty(STDOUT_FILENO))
+		return 0;
+
+	r = tcgetattr(STDIN_FILENO, &orig_term);
+	if (r < 0) {
+		pr_warning("unable to save initial standard input settings");
+		return r;
+	}
+
+	term = orig_term;
+	term.c_iflag &= ~(ICRNL);
+	term.c_lflag &= ~(ICANON | ECHO | ISIG);
+	tcsetattr(STDIN_FILENO, TCSANOW, &term);
+
+	/* Use our own blocking thread to read stdin, don't require a tick */
+	if (pthread_create(&term_poll_thread, NULL, term_poll_thread_loop, kvm))
+		die("Unable to create console input poll thread\n");
+
+	signal(SIGTERM, term_sig_cleanup);
+	atexit(term_cleanup);
+
+	return 0;
+}
+dev_init(term_init);
+
+static int term_exit(struct kvm *kvm)
+{
+	return 0;
+}
+dev_exit(term_exit);
diff --git a/kvmtool/tests/Makefile b/kvmtool/tests/Makefile
new file mode 100644
index 0000000..cad14ec
--- /dev/null
+++ b/kvmtool/tests/Makefile
@@ -0,0 +1,19 @@
+all: kernel pit boot
+
+kernel:
+	$(MAKE) -C kernel
+.PHONY: kernel
+
+pit:
+	$(MAKE) -C pit
+.PHONY: pit
+
+boot:
+	$(MAKE) -C boot
+.PHONY: boot
+
+clean:
+	$(MAKE) -C kernel clean
+	$(MAKE) -C pit clean
+	$(MAKE) -C boot clean
+.PHONY: clean
diff --git a/kvmtool/tests/boot/Makefile b/kvmtool/tests/boot/Makefile
new file mode 100644
index 0000000..2b950d2
--- /dev/null
+++ b/kvmtool/tests/boot/Makefile
@@ -0,0 +1,21 @@
+NAME	:= init
+
+OBJ	:= $(NAME).o
+MKISOFS := $(shell which mkisofs)
+ifndef MKISOFS
+MKISOFS := $(shell which xorrisofs)
+endif
+
+all:
+	rm -rf rootfs
+	mkdir rootfs
+	gcc -static init.c -o rootfs/init
+ifdef MKISOFS
+	$(MKISOFS) rootfs -o boot_test.iso
+else
+	$(error "mkisofs or xorriso needed to build boot_test.iso")
+endif
+
+clean:
+	rm -rf rootfs boot_test.iso
+.PHONY: clean
diff --git a/kvmtool/tests/boot/init.c b/kvmtool/tests/boot/init.c
new file mode 100644
index 0000000..094f8ba
--- /dev/null
+++ b/kvmtool/tests/boot/init.c
@@ -0,0 +1,13 @@
+#include <linux/reboot.h>	/* LINUX_REBOOT_CMD_RESTART */
+#include <stdio.h>		/* puts() */
+#include <sys/reboot.h>		/* reboot() prototype */
+#include <unistd.h>
+
+int main(int argc, char *argv[])
+{
+	puts("hello, KVM guest!\r");
+
+	reboot(LINUX_REBOOT_CMD_RESTART);
+
+	return 0;
+}
diff --git a/kvmtool/tests/kernel/.gitignore b/kvmtool/tests/kernel/.gitignore
new file mode 100644
index 0000000..d0cd209
--- /dev/null
+++ b/kvmtool/tests/kernel/.gitignore
@@ -0,0 +1,2 @@
+kernel.bin
+kernel.elf
diff --git a/kvmtool/tests/kernel/Makefile b/kvmtool/tests/kernel/Makefile
new file mode 100644
index 0000000..c7dd8da
--- /dev/null
+++ b/kvmtool/tests/kernel/Makefile
@@ -0,0 +1,20 @@
+NAME	:= kernel
+
+BIN	:= $(NAME).bin
+ELF	:= $(NAME).elf
+OBJ	:= $(NAME).o
+
+all: $(BIN)
+
+$(BIN): $(ELF)
+	objcopy -O binary $< $@
+
+$(ELF): $(OBJ)
+	ld -Ttext=0x00 -nostdlib -static $< -o $@
+
+%.o: %.S
+	gcc -nostdinc -c $< -o $@
+
+clean:
+	rm -f $(BIN) $(ELF) $(OBJ)
+.PHONY: clean
diff --git a/kvmtool/tests/kernel/README b/kvmtool/tests/kernel/README
new file mode 100644
index 0000000..2923777
--- /dev/null
+++ b/kvmtool/tests/kernel/README
@@ -0,0 +1,16 @@
+Compiling
+---------
+
+You can simply type:
+
+  $ make
+
+to build a 16-bit binary that uses the i8086 instruction set.
+
+Disassembling
+-------------
+
+Use the "-m i8086" command line option with objdump to make sure it knows we're
+dealing with i8086 instruction set:
+
+  $ objdump -d -m i8086 i8086.elf
diff --git a/kvmtool/tests/kernel/kernel.S b/kvmtool/tests/kernel/kernel.S
new file mode 100644
index 0000000..2824b64
--- /dev/null
+++ b/kvmtool/tests/kernel/kernel.S
@@ -0,0 +1,8 @@
+	.code16gcc
+	.text
+	.globl  _start
+	.type   _start, @function
+_start:
+	# "This is probably the largest possible kernel that is bug free." -- Avi Kivity
+	1:
+	jmp 1b
diff --git a/kvmtool/tests/pit/.gitignore b/kvmtool/tests/pit/.gitignore
new file mode 100644
index 0000000..43f0aa8
--- /dev/null
+++ b/kvmtool/tests/pit/.gitignore
@@ -0,0 +1,2 @@
+*.bin
+*.elf
diff --git a/kvmtool/tests/pit/Makefile b/kvmtool/tests/pit/Makefile
new file mode 100644
index 0000000..2fae9b2
--- /dev/null
+++ b/kvmtool/tests/pit/Makefile
@@ -0,0 +1,20 @@
+NAME	:= tick
+
+BIN	:= $(NAME).bin
+ELF	:= $(NAME).elf
+OBJ	:= $(NAME).o
+
+all: $(BIN)
+
+$(BIN): $(ELF)
+	objcopy -O binary $< $@
+
+$(ELF): $(OBJ)
+	ld -Ttext=0x00 -nostdlib -static $< -o $@
+
+%.o: %.S
+	gcc -nostdinc -c $< -o $@
+
+clean:
+	rm -f $(BIN) $(ELF) $(OBJ)
+.PHONY: clean
diff --git a/kvmtool/tests/pit/README b/kvmtool/tests/pit/README
new file mode 100644
index 0000000..2923777
--- /dev/null
+++ b/kvmtool/tests/pit/README
@@ -0,0 +1,16 @@
+Compiling
+---------
+
+You can simply type:
+
+  $ make
+
+to build a 16-bit binary that uses the i8086 instruction set.
+
+Disassembling
+-------------
+
+Use the "-m i8086" command line option with objdump to make sure it knows we're
+dealing with i8086 instruction set:
+
+  $ objdump -d -m i8086 i8086.elf
diff --git a/kvmtool/tests/pit/tick.S b/kvmtool/tests/pit/tick.S
new file mode 100644
index 0000000..635dc8d
--- /dev/null
+++ b/kvmtool/tests/pit/tick.S
@@ -0,0 +1,101 @@
+#define IO_PIC		0x20
+#define IRQ_OFFSET	32
+#define IO_PIT		0x40
+#define TIMER_FREQ	1193182
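+/* Round-to-nearest divisor for programming the PIT to x Hz */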
+#define TIMER_DIV(x)	((TIMER_FREQ+(x)/2)/(x))
+
+#define TEST_COUNT	0x0200
+
+	.code16gcc
+	.text
+	.globl	_start
+	.type	_start, @function
+_start:
+/*
+ * fill up noop handlers
+ */
+	xorw	%ax, %ax
+	xorw	%di, %di
+	movw	%ax, %es
+	movw	$256, %cx
+fill_noop_idt:
+	movw	$noop_handler, %es:(%di)
+	movw	%cs, %es:2(%di)
+	add	$4, %di
+	loop	fill_noop_idt
+
+set_idt:
+	movw	$timer_isr, %es:(IRQ_OFFSET*4)
+	movw	%cs, %es:(IRQ_OFFSET*4+2)
+
+set_pic:
+	# ICW1
+	mov	$0x11, %al
+	mov	$(IO_PIC), %dx
+	out	%al,%dx
+	# ICW2
+	mov	$(IRQ_OFFSET), %al
+	mov	$(IO_PIC+1), %dx
+	out	%al, %dx
+	# ICW3
+	mov	$0x00, %al
+	mov	$(IO_PIC+1), %dx
+	out	%al, %dx
+	# ICW4
+	mov	$0x3, %al
+	mov	$(IO_PIC+1), %dx
+	out	%al, %dx
+
+set_pit:
+	# set 8254 mode
+	mov	$(IO_PIT+3), %dx
+	mov	$0x34, %al
+	outb	%al, %dx
+	# set 8254 freq 1KHz
+	mov	$(IO_PIT), %dx
+	movb	$(TIMER_DIV(1000) % 256), %al
+	outb	%al, %dx
+	movb	$(TIMER_DIV(1000) / 256), %al
+	outb	%al, %dx
+
+enable_irq0:
+	mov	$0xfe, %al
+	mov	$(IO_PIC+1), %dx
+	out	%al, %dx
+	sti
+loop:
+	1:
+	jmp	1b
+
+test_ok:
+	mov	$0x3f8,%dx
+	cs lea	msg2, %si
+	mov	$(msg2_end-msg2), %cx
+	cs rep/outsb
+
+	/* Reboot by using the i8042 reboot line */
+	mov	$0xfe, %al
+	outb	%al, $0x64
+
+timer_isr:
+	cli
+	pushaw
+	pushfw
+	mov	$0x3f8,%dx
+	mov	$0x2e, %al	# .
+	out	%al,%dx
+	decw	count
+	jz	test_ok
+	popfw
+	popaw
+	iretw
+
+noop_handler:
+	iretw
+
+count:
+	.word	TEST_COUNT
+
+msg2:
+	.asciz "\nTest OK\n"
+msg2_end:
diff --git a/kvmtool/ui/gtk3.c b/kvmtool/ui/gtk3.c
new file mode 100644
index 0000000..1e08a8f
--- /dev/null
+++ b/kvmtool/ui/gtk3.c
@@ -0,0 +1,326 @@
+#include "kvm/gtk3.h"
+
+#include "kvm/framebuffer.h"
+#include "kvm/kvm-cpu.h"
+#include "kvm/i8042.h"
+#include "kvm/vesa.h"
+#include "kvm/kvm.h"
+
+#include <gtk/gtk.h>
+#include <pthread.h>
+#include <linux/err.h>
+
+#define FRAME_RATE			25
+
+#define SCANCODE_UNKNOWN		0
+#define SCANCODE_NORMAL			1
+#define SCANCODE_ESCAPED		2
+#define SCANCODE_KEY_PAUSE		3
+#define SCANCODE_KEY_PRNTSCRN		4
+
+struct set2_scancode {
+	u8 code;
+	u8 type;
+};
+
+#define DEFINE_SC(_code) {		\
+	.code = _code,			\
+	.type = SCANCODE_NORMAL,	\
+}
+
+/* escaped scancodes */
+#define DEFINE_ESC(_code) {		\
+	.code = _code,			\
+	.type = SCANCODE_ESCAPED,	\
+}
+
+static const struct set2_scancode keymap[256] = {
+	[9]	= DEFINE_SC(0x76),	/* <esc> */
+	[10]	= DEFINE_SC(0x16),	/* 1 */
+	[11]	= DEFINE_SC(0x1e),	/* 2 */
+	[12]	= DEFINE_SC(0x26),	/* 3 */
+	[13]	= DEFINE_SC(0x25),	/* 4 */
+	[14]	= DEFINE_SC(0x2e),	/* 5 */
+	[15]	= DEFINE_SC(0x36),	/* 6 */
+	[16]	= DEFINE_SC(0x3d),	/* 7 */
+	[17]	= DEFINE_SC(0x3e),	/* 8 */
+	[18]	= DEFINE_SC(0x46),	/* 9 */
+	[19]	= DEFINE_SC(0x45),	/* 0 */
+	[20]	= DEFINE_SC(0x4e),	/* - */
+	[21]	= DEFINE_SC(0x55),	/* + */
+	[22]	= DEFINE_SC(0x66),	/* <backspace> */
+	[23]	= DEFINE_SC(0x0d),	/* <tab> */
+	[24]	= DEFINE_SC(0x15),	/* q */
+	[25]	= DEFINE_SC(0x1d),	/* w */
+	[26]	= DEFINE_SC(0x24),	/* e */
+	[27]	= DEFINE_SC(0x2d),	/* r */
+	[28]	= DEFINE_SC(0x2c),	/* t */
+	[29]	= DEFINE_SC(0x35),	/* y */
+	[30]	= DEFINE_SC(0x3c),	/* u */
+	[31]	= DEFINE_SC(0x43),	/* i */
+	[32]	= DEFINE_SC(0x44),	/* o */
+	[33]	= DEFINE_SC(0x4d),	/* p */
+	[34]	= DEFINE_SC(0x54),	/* [ */
+	[35]	= DEFINE_SC(0x5b),	/* ] */
+	[36]	= DEFINE_SC(0x5a),	/* <enter> */
+	[37]	= DEFINE_SC(0x14),	/* <left ctrl> */
+	[38]	= DEFINE_SC(0x1c),	/* a */
+	[39]	= DEFINE_SC(0x1b),	/* s */
+	[40]	= DEFINE_SC(0x23),	/* d */
+	[41]	= DEFINE_SC(0x2b),	/* f */
+	[42]	= DEFINE_SC(0x34),	/* g */
+	[43]	= DEFINE_SC(0x33),	/* h */
+	[44]	= DEFINE_SC(0x3b),	/* j */
+	[45]	= DEFINE_SC(0x42),	/* k */
+	[46]	= DEFINE_SC(0x4b),	/* l */
+	[47]	= DEFINE_SC(0x4c),	/* ; */
+	[48]	= DEFINE_SC(0x52),	/* ' */
+	[49]	= DEFINE_SC(0x0e),	/* ` */
+	[50]	= DEFINE_SC(0x12),	/* <left shift> */
+	[51]	= DEFINE_SC(0x5d),	/* \ */
+	[52]	= DEFINE_SC(0x1a),	/* z */
+	[53]	= DEFINE_SC(0x22),	/* x */
+	[54]	= DEFINE_SC(0x21),	/* c */
+	[55]	= DEFINE_SC(0x2a),	/* v */
+	[56]	= DEFINE_SC(0x32),	/* b */
+	[57]	= DEFINE_SC(0x31),	/* n */
+	[58]	= DEFINE_SC(0x3a),	/* m */
+	[59]	= DEFINE_SC(0x41),	/* < */
+	[60]	= DEFINE_SC(0x49),	/* > */
+	[61]	= DEFINE_SC(0x4a),	/* / */
+	[62]	= DEFINE_SC(0x59),	/* <right shift> */
+	[63]	= DEFINE_SC(0x7c),	/* keypad * */
+	[64]	= DEFINE_SC(0x11),	/* <left alt> */
+	[65]	= DEFINE_SC(0x29),	/* <space> */
+
+	[67]	= DEFINE_SC(0x05),	/* <F1> */
+	[68]	= DEFINE_SC(0x06),	/* <F2> */
+	[69]	= DEFINE_SC(0x04),	/* <F3> */
+	[70]	= DEFINE_SC(0x0c),	/* <F4> */
+	[71]	= DEFINE_SC(0x03),	/* <F5> */
+	[72]	= DEFINE_SC(0x0b),	/* <F6> */
+	[73]	= DEFINE_SC(0x83),	/* <F7> */
+	[74]	= DEFINE_SC(0x0a),	/* <F8> */
+	[75]	= DEFINE_SC(0x01),	/* <F9> */
+	[76]	= DEFINE_SC(0x09),	/* <F10> */
+
+	[79]	= DEFINE_SC(0x6c),	/* keypad 7 */
+	[80]	= DEFINE_SC(0x75),	/* keypad 8 */
+	[81]	= DEFINE_SC(0x7d),	/* keypad 9 */
+	[82]	= DEFINE_SC(0x7b),	/* keypad - */
+	[83]	= DEFINE_SC(0x6b),	/* keypad 4 */
+	[84]	= DEFINE_SC(0x73),	/* keypad 5 */
+	[85]	= DEFINE_SC(0x74),	/* keypad 6 */
+	[86]	= DEFINE_SC(0x79),	/* keypad + */
+	[87]	= DEFINE_SC(0x69),	/* keypad 1 */
+	[88]	= DEFINE_SC(0x72),	/* keypad 2 */
+	[89]	= DEFINE_SC(0x7a),	/* keypad 3 */
+	[90]	= DEFINE_SC(0x70),	/* keypad 0 */
+	[91]	= DEFINE_SC(0x71),	/* keypad . */
+
+	[94]	= DEFINE_SC(0x61),	/* <INT 1> */
+	[95]	= DEFINE_SC(0x78),	/* <F11> */
+	[96]	= DEFINE_SC(0x07),	/* <F12> */
+
+	[104]	= DEFINE_ESC(0x5a),	/* keypad <enter> */
+	[105]	= DEFINE_ESC(0x14),	/* <right ctrl> */
+	[106]	= DEFINE_ESC(0x4a),	/* keypad / */
+	[108]	= DEFINE_ESC(0x11),	/* <right alt> */
+	[110]	= DEFINE_ESC(0x6c),	/* <home> */
+	[111]	= DEFINE_ESC(0x75),	/* <up> */
+	[112]	= DEFINE_ESC(0x7d),	/* <pag up> */
+	[113]	= DEFINE_ESC(0x6b),	/* <left> */
+	[114]	= DEFINE_ESC(0x74),	/* <right> */
+	[115]	= DEFINE_ESC(0x69),	/* <end> */
+	[116]	= DEFINE_ESC(0x72),	/* <down> */
+	[117]	= DEFINE_ESC(0x7a),	/* <pag down> */
+	[118]	= DEFINE_ESC(0x70),	/* <ins> */
+	[119]	= DEFINE_ESC(0x71),	/* <delete> */
+};
+
+static cairo_surface_t	*surface;
+static bool		done;
+
+static const struct set2_scancode *to_code(u8 scancode)
+{
+	return &keymap[scancode];
+}
+
+static gboolean
+kvm_gtk_configure_event(GtkWidget *widget, GdkEventConfigure *event,
+			gpointer data)
+{
+	struct framebuffer *fb = data;
+	int stride;
+
+	if (surface)
+		cairo_surface_destroy(surface);
+
+	stride = cairo_format_stride_for_width(CAIRO_FORMAT_RGB24, fb->width);
+
+	surface =
+	    cairo_image_surface_create_for_data((void *) fb->mem,
+						CAIRO_FORMAT_RGB24,
+						fb->width,
+						fb->height,
+						stride);
+
+	return TRUE;
+}
+
+static gboolean kvm_gtk_draw(GtkWidget *widget, cairo_t *cr, gpointer data)
+{
+	cairo_set_source_surface(cr, surface, 0, 0);
+
+	cairo_paint(cr);
+
+	return FALSE;
+}
+
+static void kvm_gtk_destroy(void)
+{
+	if (surface)
+		cairo_surface_destroy(surface);
+
+	gtk_main_quit();
+}
+
+static gboolean kvm_gtk_redraw(GtkWidget *window)
+{
+	gtk_widget_queue_draw(window);
+
+	return TRUE;
+}
+
+static gboolean
+kvm_gtk_key_press(GtkWidget *widget, GdkEventKey *event, gpointer user_data)
+{
+	const struct set2_scancode *sc = to_code(event->hardware_keycode);
+
+	switch (sc->type) {
+	case SCANCODE_ESCAPED:
+		kbd_queue(0xe0);
+		/* fallthrough */
+	case SCANCODE_NORMAL:
+		kbd_queue(sc->code);
+		break;
+	case SCANCODE_KEY_PAUSE:
+		kbd_queue(0xe1);
+		kbd_queue(0x14);
+		kbd_queue(0x77);
+		kbd_queue(0xe1);
+		kbd_queue(0xf0);
+		kbd_queue(0x14);
+		kbd_queue(0x77);
+		break;
+	case SCANCODE_KEY_PRNTSCRN:
+		kbd_queue(0xe0);
+		kbd_queue(0x12);
+		kbd_queue(0xe0);
+		kbd_queue(0x7c);
+		break;
+	}
+
+	return TRUE;
+}
+
+static void *kvm_gtk_thread(void *p)
+{
+	struct framebuffer *fb = p;
+	GtkWidget *window;
+	GtkWidget *frame;
+	GtkWidget *da;
+
+	gtk_init(NULL, NULL);
+
+	window = gtk_window_new(GTK_WINDOW_TOPLEVEL);
+
+	gtk_window_set_title(GTK_WINDOW(window), "VM");
+
+	g_signal_connect(window, "destroy", G_CALLBACK(kvm_gtk_destroy), NULL);
+
+	gtk_container_set_border_width(GTK_CONTAINER(window), 8);
+
+	frame = gtk_frame_new(NULL);
+
+	gtk_frame_set_shadow_type(GTK_FRAME(frame), GTK_SHADOW_IN);
+	gtk_container_add(GTK_CONTAINER(window), frame);
+
+	da = gtk_drawing_area_new();
+
+	gtk_widget_set_size_request(da, 100, 100);
+
+	gtk_container_add(GTK_CONTAINER(frame), da);
+
+	g_signal_connect(da, "draw", G_CALLBACK(kvm_gtk_draw), NULL);
+	g_signal_connect(da, "configure-event",
+			 G_CALLBACK(kvm_gtk_configure_event), fb);
+	g_signal_connect(G_OBJECT(window), "key_press_event",
+			 G_CALLBACK(kvm_gtk_key_press), NULL);
+
+	gtk_widget_set_events(da, gtk_widget_get_events(da)
+			      | GDK_BUTTON_PRESS_MASK
+			      | GDK_POINTER_MOTION_MASK
+			      | GDK_POINTER_MOTION_HINT_MASK);
+
+	gtk_widget_show_all(window);
+
+	g_timeout_add(1000 / FRAME_RATE, (GSourceFunc) kvm_gtk_redraw, window);
+
+	gtk_main();
+
+	done = true;
+
+	return NULL;
+}
+
+static int kvm_gtk_start(struct framebuffer *fb)
+{
+	pthread_t thread;
+
+	if (pthread_create(&thread, NULL, kvm_gtk_thread, fb) != 0)
+		return -1;
+
+	return 0;
+}
+
+static int kvm_gtk_stop(struct framebuffer *fb)
+{
+	gtk_main_quit();
+
+	while (!done)
+		sleep(0);
+
+	return 0;
+}
+
+static struct fb_target_operations kvm_gtk_ops = {
+	.start		= kvm_gtk_start,
+	.stop		= kvm_gtk_stop,
+};
+
+int kvm_gtk_init(struct kvm *kvm)
+{
+	struct framebuffer *fb;
+
+	if (!kvm->cfg.gtk)
+		return 0;
+
+	fb = vesa__init(kvm);
+	if (IS_ERR(fb)) {
+		pr_err("vesa__init() failed with error %ld\n", PTR_ERR(fb));
+		return PTR_ERR(fb);
+	}
+
+	return fb__attach(fb, &kvm_gtk_ops);
+}
+
+int kvm_gtk_exit(struct kvm *kvm)
+{
+	if (kvm->cfg.gtk)
+		return kvm_gtk_stop(NULL);
+
+	return 0;
+}
+
+dev_init(kvm_gtk_init);
+dev_exit(kvm_gtk_exit);
diff --git a/kvmtool/ui/sdl.c b/kvmtool/ui/sdl.c
new file mode 100644
index 0000000..5035405
--- /dev/null
+++ b/kvmtool/ui/sdl.c
@@ -0,0 +1,324 @@
+#include "kvm/sdl.h"
+
+#include "kvm/framebuffer.h"
+#include "kvm/i8042.h"
+#include "kvm/util.h"
+#include "kvm/kvm.h"
+#include "kvm/kvm-cpu.h"
+#include "kvm/vesa.h"
+
+#include <SDL/SDL.h>
+#include <pthread.h>
+#include <signal.h>
+#include <linux/err.h>
+
+#define FRAME_RATE		25
+
+#define SCANCODE_UNKNOWN      0
+#define SCANCODE_NORMAL       1
+#define SCANCODE_ESCAPED      2
+#define SCANCODE_KEY_PAUSE    3
+#define SCANCODE_KEY_PRNTSCRN 4
+
+struct set2_scancode {
+	u8 code;
+	u8 type;
+};
+
+#define DEFINE_SC(_code) {\
+	.code = _code,\
+	.type = SCANCODE_NORMAL,\
+}
+
+/* escaped scancodes */
+#define DEFINE_ESC(_code) {\
+	.code = _code,\
+	.type = SCANCODE_ESCAPED,\
+}
+
+static const struct set2_scancode keymap[256] = {
+	[9]	= DEFINE_SC(0x76),	/* <esc> */
+	[10]	= DEFINE_SC(0x16),	/* 1 */
+	[11]	= DEFINE_SC(0x1e),	/* 2 */
+	[12]	= DEFINE_SC(0x26),	/* 3 */
+	[13]	= DEFINE_SC(0x25),	/* 4 */
+	[14]	= DEFINE_SC(0x2e),	/* 5 */
+	[15]	= DEFINE_SC(0x36),	/* 6 */
+	[16]	= DEFINE_SC(0x3d),	/* 7 */
+	[17]	= DEFINE_SC(0x3e),	/* 8 */
+	[18]	= DEFINE_SC(0x46),	/* 9 */
+	[19]	= DEFINE_SC(0x45),	/* 0 */
+	[20]	= DEFINE_SC(0x4e),	/* - */
+	[21]	= DEFINE_SC(0x55),	/* + */
+	[22]	= DEFINE_SC(0x66),	/* <backspace> */
+	[23]	= DEFINE_SC(0x0d),	/* <tab> */
+	[24]	= DEFINE_SC(0x15),	/* q */
+	[25]	= DEFINE_SC(0x1d),	/* w */
+	[26]	= DEFINE_SC(0x24),	/* e */
+	[27]	= DEFINE_SC(0x2d),	/* r */
+	[28]	= DEFINE_SC(0x2c),	/* t */
+	[29]	= DEFINE_SC(0x35),	/* y */
+	[30]	= DEFINE_SC(0x3c),	/* u */
+	[31]	= DEFINE_SC(0x43),	/* i */
+	[32]	= DEFINE_SC(0x44),	/* o */
+	[33]	= DEFINE_SC(0x4d),	/* p */
+	[34]	= DEFINE_SC(0x54),	/* [ */
+	[35]	= DEFINE_SC(0x5b),	/* ] */
+	[36]	= DEFINE_SC(0x5a),	/* <enter> */
+	[37]	= DEFINE_SC(0x14),	/* <left ctrl> */
+	[38]	= DEFINE_SC(0x1c),	/* a */
+	[39]	= DEFINE_SC(0x1b),	/* s */
+	[40]	= DEFINE_SC(0x23),	/* d */
+	[41]	= DEFINE_SC(0x2b),	/* f */
+	[42]	= DEFINE_SC(0x34),	/* g */
+	[43]	= DEFINE_SC(0x33),	/* h */
+	[44]	= DEFINE_SC(0x3b),	/* j */
+	[45]	= DEFINE_SC(0x42),	/* k */
+	[46]	= DEFINE_SC(0x4b),	/* l */
+	[47]	= DEFINE_SC(0x4c),	/* ; */
+	[48]	= DEFINE_SC(0x52),	/* ' */
+	[49]	= DEFINE_SC(0x0e),	/* ` */
+	[50]	= DEFINE_SC(0x12),	/* <left shift> */
+	[51]	= DEFINE_SC(0x5d),	/* \ */
+	[52]	= DEFINE_SC(0x1a),	/* z */
+	[53]	= DEFINE_SC(0x22),	/* x */
+	[54]	= DEFINE_SC(0x21),	/* c */
+	[55]	= DEFINE_SC(0x2a),	/* v */
+	[56]	= DEFINE_SC(0x32),	/* b */
+	[57]	= DEFINE_SC(0x31),	/* n */
+	[58]	= DEFINE_SC(0x3a),	/* m */
+	[59]	= DEFINE_SC(0x41),	/* < */
+	[60]	= DEFINE_SC(0x49),	/* > */
+	[61]	= DEFINE_SC(0x4a),	/* / */
+	[62]	= DEFINE_SC(0x59),	/* <right shift> */
+	[63]	= DEFINE_SC(0x7c),	/* keypad * */
+	[64]	= DEFINE_SC(0x11),	/* <left alt> */
+	[65]	= DEFINE_SC(0x29),	/* <space> */
+
+	[67]	= DEFINE_SC(0x05),	/* <F1> */
+	[68]	= DEFINE_SC(0x06),	/* <F2> */
+	[69]	= DEFINE_SC(0x04),	/* <F3> */
+	[70]	= DEFINE_SC(0x0c),	/* <F4> */
+	[71]	= DEFINE_SC(0x03),	/* <F5> */
+	[72]	= DEFINE_SC(0x0b),	/* <F6> */
+	[73]	= DEFINE_SC(0x83),	/* <F7> */
+	[74]	= DEFINE_SC(0x0a),	/* <F8> */
+	[75]	= DEFINE_SC(0x01),	/* <F9> */
+	[76]	= DEFINE_SC(0x09),	/* <F10> */
+
+	[79]	= DEFINE_SC(0x6c),	/* keypad 7 */
+	[80]	= DEFINE_SC(0x75),	/* keypad 8 */
+	[81]	= DEFINE_SC(0x7d),	/* keypad 9 */
+	[82]	= DEFINE_SC(0x7b),	/* keypad - */
+	[83]	= DEFINE_SC(0x6b),	/* keypad 4 */
+	[84]	= DEFINE_SC(0x73),	/* keypad 5 */
+	[85]	= DEFINE_SC(0x74),	/* keypad 6 */
+	[86]	= DEFINE_SC(0x79),	/* keypad + */
+	[87]	= DEFINE_SC(0x69),	/* keypad 1 */
+	[88]	= DEFINE_SC(0x72),	/* keypad 2 */
+	[89]	= DEFINE_SC(0x7a),	/* keypad 3 */
+	[90]	= DEFINE_SC(0x70),	/* keypad 0 */
+	[91]	= DEFINE_SC(0x71),	/* keypad . */
+
+	[94]	= DEFINE_SC(0x61),	/* <INT 1> */
+	[95]	= DEFINE_SC(0x78),	/* <F11> */
+	[96]	= DEFINE_SC(0x07),	/* <F12> */
+
+	[104]	= DEFINE_ESC(0x5a),	/* keypad <enter> */
+	[105]	= DEFINE_ESC(0x14),	/* <right ctrl> */
+	[106]	= DEFINE_ESC(0x4a),	/* keypad / */
+	[108]	= DEFINE_ESC(0x11),	/* <right alt> */
+	[110]	= DEFINE_ESC(0x6c),	/* <home> */
+	[111]	= DEFINE_ESC(0x75),	/* <up> */
+	[112]	= DEFINE_ESC(0x7d),	/* <pag up> */
+	[113]	= DEFINE_ESC(0x6b),	/* <left> */
+	[114]	= DEFINE_ESC(0x74),	/* <right> */
+	[115]	= DEFINE_ESC(0x69),	/* <end> */
+	[116]	= DEFINE_ESC(0x72),	/* <down> */
+	[117]	= DEFINE_ESC(0x7a),	/* <pag down> */
+	[118]	= DEFINE_ESC(0x70),	/* <ins> */
+	[119]	= DEFINE_ESC(0x71),	/* <delete> */
+};
+static bool running, done;
+
+static const struct set2_scancode *to_code(u8 scancode)
+{
+	return &keymap[scancode];
+}
+
+static void key_press(const struct set2_scancode *sc)
+{
+	switch (sc->type) {
+	case SCANCODE_ESCAPED:
+		kbd_queue(0xe0);
+		/* fallthrough */
+	case SCANCODE_NORMAL:
+		kbd_queue(sc->code);
+		break;
+	case SCANCODE_KEY_PAUSE:
+		kbd_queue(0xe1);
+		kbd_queue(0x14);
+		kbd_queue(0x77);
+		kbd_queue(0xe1);
+		kbd_queue(0xf0);
+		kbd_queue(0x14);
+		kbd_queue(0x77);
+		break;
+	case SCANCODE_KEY_PRNTSCRN:
+		kbd_queue(0xe0);
+		kbd_queue(0x12);
+		kbd_queue(0xe0);
+		kbd_queue(0x7c);
+		break;
+	}
+}
+
+static void key_release(const struct set2_scancode *sc)
+{
+	switch (sc->type) {
+	case SCANCODE_ESCAPED:
+		kbd_queue(0xe0);
+		/* fallthrough */
+	case SCANCODE_NORMAL:
+		kbd_queue(0xf0);
+		kbd_queue(sc->code);
+		break;
+	case SCANCODE_KEY_PAUSE:
+		/* nothing to do */
+		break;
+	case SCANCODE_KEY_PRNTSCRN:
+		kbd_queue(0xe0);
+		kbd_queue(0xf0);
+		kbd_queue(0x7c);
+		kbd_queue(0xe0);
+		kbd_queue(0xf0);
+		kbd_queue(0x12);
+		break;
+	}
+}
+
+static void *sdl__thread(void *p)
+{
+	Uint32 rmask, gmask, bmask, amask;
+	struct framebuffer *fb = p;
+	SDL_Surface *guest_screen;
+	SDL_Surface *screen;
+	SDL_Event ev;
+	Uint32 flags;
+
+	kvm__set_thread_name("kvm-sdl-worker");
+
+	if (SDL_Init(SDL_INIT_VIDEO) != 0)
+		die("Unable to initialize SDL");
+
+	rmask = 0x000000ff;
+	gmask = 0x0000ff00;
+	bmask = 0x00ff0000;
+	amask = 0x00000000;
+
+	guest_screen = SDL_CreateRGBSurfaceFrom(fb->mem, fb->width, fb->height,
+						fb->depth, fb->width * fb->depth / 8,
+						rmask, gmask, bmask, amask);
+	if (!guest_screen)
+		die("Unable to create SDL RGB surface");
+
+	flags = SDL_HWSURFACE | SDL_ASYNCBLIT | SDL_HWACCEL | SDL_DOUBLEBUF;
+
+	SDL_WM_SetCaption("KVM tool", "KVM tool");
+
+	screen = SDL_SetVideoMode(fb->width, fb->height, fb->depth, flags);
+	if (!screen)
+		die("Unable to set SDL video mode");
+
+	SDL_EnableKeyRepeat(200, 50);
+
+	while (running) {
+		SDL_BlitSurface(guest_screen, NULL, screen, NULL);
+		SDL_Flip(screen);
+
+		while (SDL_PollEvent(&ev)) {
+			switch (ev.type) {
+			case SDL_KEYDOWN: {
+				const struct set2_scancode *sc = to_code(ev.key.keysym.scancode);
+				if (sc->type == SCANCODE_UNKNOWN) {
+					pr_warning("key '%d' not found in keymap", ev.key.keysym.scancode);
+					break;
+				}
+				key_press(sc);
+				break;
+			}
+			case SDL_KEYUP: {
+				const struct set2_scancode *sc = to_code(ev.key.keysym.scancode);
+				if (sc->type == SCANCODE_UNKNOWN)
+					break;
+				key_release(sc);
+				break;
+			}
+			case SDL_QUIT:
+				goto exit;
+			}
+		}
+
+		SDL_Delay(1000 / FRAME_RATE);
+	}
+
+	if (!running && !done) {
+		done = true;
+		return NULL;
+	}
+exit:
+	done = true;
+	kvm__reboot(fb->kvm);
+
+	return NULL;
+}
+
+static int sdl__start(struct framebuffer *fb)
+{
+	pthread_t thread;
+
+	running = true;
+
+	if (pthread_create(&thread, NULL, sdl__thread, fb) != 0)
+		return -1;
+
+	return 0;
+}
+
+static int sdl__stop(struct framebuffer *fb)
+{
+	running = false;
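+	/* Busy-wait until the SDL thread observes !running and sets "done" */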
+	while (!done)
+		sleep(0);
+
+	return 0;
+}
+
+static struct fb_target_operations sdl_ops = {
+	.start	= sdl__start,
+	.stop	= sdl__stop,
+};
+
+int sdl__init(struct kvm *kvm)
+{
+	struct framebuffer *fb;
+
+	if (!kvm->cfg.sdl)
+		return 0;
+
+	fb = vesa__init(kvm);
+	if (IS_ERR(fb)) {
+		pr_err("vesa__init() failed with error %ld\n", PTR_ERR(fb));
+		return PTR_ERR(fb);
+	}
+
+	return fb__attach(fb, &sdl_ops);
+}
+dev_init(sdl__init);
+
+int sdl__exit(struct kvm *kvm)
+{
+	if (kvm->cfg.sdl)
+		return sdl__stop(NULL);
+
+	return 0;
+}
+dev_exit(sdl__exit);
diff --git a/kvmtool/ui/vnc.c b/kvmtool/ui/vnc.c
new file mode 100644
index 0000000..12e4bd5
--- /dev/null
+++ b/kvmtool/ui/vnc.c
@@ -0,0 +1,250 @@
+#include "kvm/vnc.h"
+
+#include "kvm/framebuffer.h"
+#include "kvm/i8042.h"
+#include "kvm/vesa.h"
+
+#include <linux/types.h>
+#include <rfb/keysym.h>
+#include <rfb/rfb.h>
+#include <pthread.h>
+#include <linux/err.h>
+
+#define VESA_QUEUE_SIZE		128
+#define VESA_IRQ		14
+
+/*
+ * This "6000" value is pretty much the result of experimentation
+ * It seems that around this value, things update pretty smoothly
+ */
+#define VESA_UPDATE_TIME	6000
+
+/*
+ * We can map the letters and numbers without a fuss,
+ * but the other characters not so much.
+ */
+static char letters[26] = {
+	0x1c, 0x32, 0x21, 0x23, 0x24, /* a-e */
+	0x2b, 0x34, 0x33, 0x43, 0x3b, /* f-j */
+	0x42, 0x4b, 0x3a, 0x31, 0x44, /* k-o */
+	0x4d, 0x15, 0x2d, 0x1b, 0x2c, /* p-t */
+	0x3c, 0x2a, 0x1d, 0x22, 0x35, /* u-y */
+	0x1a,
+};
+
+static rfbScreenInfoPtr server;
+static char num[10] = {
+	0x45, 0x16, 0x1e, 0x26, 0x2e, 0x23, 0x36, 0x3d, 0x3e, 0x46,
+};
+
+/*
+ * This is called when the VNC server receives a key event
+ * The reason this function is such a beast is that we have
+ * to convert from ASCII characters (which is what VNC gets)
+ * to PC keyboard scancodes, which is what Linux expects to
+ * get from its keyboard. ASCII and the scancode set don't
+ * really seem to mesh in any good way beyond some basics with
+ * the letters and numbers.
+ */
+static void kbd_handle_key(rfbBool down, rfbKeySym key, rfbClientPtr cl)
+{
+	char tosend = 0;
+
+	if (key >= 0x41 && key <= 0x5a)
+		key += 0x20; /* convert to lowercase */
+
+	if (key >= 0x61 && key <= 0x7a) /* a-z */
+		tosend = letters[key - 0x61];
+
+	if (key >= 0x30 && key <= 0x39)
+		tosend = num[key - 0x30];
+
+	switch (key) {
+	case XK_Insert:		kbd_queue(0xe0);	tosend = 0x70;	break;
+	case XK_Delete:		kbd_queue(0xe0);	tosend = 0x71;	break;
+	case XK_Up:		kbd_queue(0xe0);	tosend = 0x75;	break;
+	case XK_Down:		kbd_queue(0xe0);	tosend = 0x72;	break;
+	case XK_Left:		kbd_queue(0xe0);	tosend = 0x6b;	break;
+	case XK_Right:		kbd_queue(0xe0);	tosend = 0x74;	break;
+	case XK_Page_Up:	kbd_queue(0xe0);	tosend = 0x7d;	break;
+	case XK_Page_Down:	kbd_queue(0xe0);	tosend = 0x7a;	break;
+	case XK_Home:		kbd_queue(0xe0);	tosend = 0x6c;	break;
+	case XK_BackSpace:	tosend = 0x66;		break;
+	case XK_Tab:		tosend = 0x0d;		break;
+	case XK_Return:		tosend = 0x5a;		break;
+	case XK_Escape:		tosend = 0x76;		break;
+	case XK_End:		tosend = 0x69;		break;
+	case XK_Shift_L:	tosend = 0x12;		break;
+	case XK_Shift_R:	tosend = 0x59;		break;
+	case XK_Control_R:	kbd_queue(0xe0);
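+				/* fallthrough */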
+	case XK_Control_L:	tosend = 0x14;		break;
+	case XK_Alt_R:		kbd_queue(0xe0);
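+				/* fallthrough */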
+	case XK_Alt_L:		tosend = 0x11;		break;
+	case XK_quoteleft:	tosend = 0x0e;		break;
+	case XK_minus:		tosend = 0x4e;		break;
+	case XK_equal:		tosend = 0x55;		break;
+	case XK_bracketleft:	tosend = 0x54;		break;
+	case XK_bracketright:	tosend = 0x5b;		break;
+	case XK_backslash:	tosend = 0x5d;		break;
+	case XK_Caps_Lock:	tosend = 0x58;		break;
+	case XK_semicolon:	tosend = 0x4c;		break;
+	case XK_quoteright:	tosend = 0x52;		break;
+	case XK_comma:		tosend = 0x41;		break;
+	case XK_period:		tosend = 0x49;		break;
+	case XK_slash:		tosend = 0x4a;		break;
+	case XK_space:		tosend = 0x29;		break;
+
+	/*
+	 * This is where I handle the shifted characters.
+	 * They don't really map nicely the way A-Z maps to a-z,
+	 * so I'm doing it manually
+	 */
+	case XK_exclam:		tosend = 0x16;		break;
+	case XK_quotedbl:	tosend = 0x52;		break;
+	case XK_numbersign:	tosend = 0x26;		break;
+	case XK_dollar:		tosend = 0x25;		break;
+	case XK_percent:	tosend = 0x2e;		break;
+	case XK_ampersand:	tosend = 0x3d;		break;
+	case XK_parenleft:	tosend = 0x46;		break;
+	case XK_parenright:	tosend = 0x45;		break;
+	case XK_asterisk:	tosend = 0x3e;		break;
+	case XK_plus:		tosend = 0x55;		break;
+	case XK_colon:		tosend = 0x4c;		break;
+	case XK_less:		tosend = 0x41;		break;
+	case XK_greater:	tosend = 0x49;		break;
+	case XK_question:	tosend = 0x4a;		break;
+	case XK_at:		tosend = 0x1e;		break;
+	case XK_asciicircum:	tosend = 0x36;		break;
+	case XK_underscore:	tosend = 0x4e;		break;
+	case XK_braceleft:	tosend = 0x54;		break;
+	case XK_braceright:	tosend = 0x5b;		break;
+	case XK_bar:		tosend = 0x5d;		break;
+	case XK_asciitilde:	tosend = 0x0e;		break;
+	default:		break;
+	}
+
+	/*
+	 * If this is a "key up" event (the user has released the key), we
+	 * need to send 0xf0 first.
+	 */
+	if (!down && tosend != 0x0)
+		kbd_queue(0xf0);
+
+	if (tosend)
+		kbd_queue(tosend);
+}
+
+/* The previous X and Y coordinates of the mouse */
+static int xlast = -1, ylast = -1;
+
+/*
+ * This function is called by the VNC server whenever a mouse event occurs.
+ */
+static void kbd_handle_ptr(int buttonMask, int x, int y, rfbClientPtr cl)
+{
+	int dx, dy;
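+	/* Bit 3 of the first PS/2 movement byte is always set */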
+	char b1 = 0x8;
+
+	/* The VNC mask and the PS/2 button encoding are the same */
+	b1 |= buttonMask;
+
+	if (xlast >= 0 && ylast >= 0) {
+		/* The PS/2 mouse sends deltas, not absolutes */
+		dx = x - xlast;
+		dy = ylast - y;
+
+		/* Set overflow bits if needed */
+		if (dy > 255)
+			b1 |= 0x80;
+		if (dx > 255)
+			b1 |= 0x40;
+
+		/* Set negative bits if needed */
+		if (dy < 0)
+			b1 |= 0x20;
+		if (dx < 0)
+			b1 |= 0x10;
+
+		mouse_queue(b1);
+		mouse_queue(dx);
+		mouse_queue(dy);
+	}
+
+	xlast = x;
+	ylast = y;
+	rfbDefaultPtrAddEvent(buttonMask, x, y, cl);
+}
+
+static void *vnc__thread(void *p)
+{
+	struct framebuffer *fb = p;
+	/*
+	 * Make a fake argc and argv because the getscreen function
+	 * seems to want it.
+	 */
+	char argv[1][1] = {{0}};
+	int argc = 1;
+
+	kvm__set_thread_name("kvm-vnc-worker");
+
+	server = rfbGetScreen(&argc, (char **) argv, fb->width, fb->height, 8, 3, 4);
+	server->frameBuffer		= fb->mem;
+	server->alwaysShared		= TRUE;
+	server->kbdAddEvent		= kbd_handle_key;
+	server->ptrAddEvent		= kbd_handle_ptr;
+	rfbInitServer(server);
+
+	while (rfbIsActive(server)) {
+		rfbMarkRectAsModified(server, 0, 0, fb->width, fb->height);
+		rfbProcessEvents(server, server->deferUpdateTime * VESA_UPDATE_TIME);
+	}
+	return NULL;
+}
+
+static int vnc__start(struct framebuffer *fb)
+{
+	pthread_t thread;
+
+	if (pthread_create(&thread, NULL, vnc__thread, fb) != 0)
+		return -1;
+
+	return 0;
+}
+
+static int vnc__stop(struct framebuffer *fb)
+{
+	rfbShutdownServer(server, TRUE);
+
+	return 0;
+}
+
+static struct fb_target_operations vnc_ops = {
+	.start	= vnc__start,
+	.stop	= vnc__stop,
+};
+
+int vnc__init(struct kvm *kvm)
+{
+	struct framebuffer *fb;
+
+	if (!kvm->cfg.vnc)
+		return 0;
+
+	fb = vesa__init(kvm);
+	if (IS_ERR(fb)) {
+		pr_err("vesa__init() failed with error %ld\n", PTR_ERR(fb));
+		return PTR_ERR(fb);
+	}
+
+	return fb__attach(fb, &vnc_ops);
+}
+dev_init(vnc__init);
+
+int vnc__exit(struct kvm *kvm)
+{
+	if (kvm->cfg.vnc)
+		return vnc__stop(NULL);
+
+	return 0;
+}
+dev_exit(vnc__exit);
diff --git a/kvmtool/util/KVMTOOLS-VERSION-GEN b/kvmtool/util/KVMTOOLS-VERSION-GEN
new file mode 100755
index 0000000..91ee2c2
--- /dev/null
+++ b/kvmtool/util/KVMTOOLS-VERSION-GEN
@@ -0,0 +1,39 @@
+#!/bin/sh
+
+if [ $# -eq 1 ]; then
+	OUTPUT=$1
+fi
+
+GVF=${OUTPUT}KVMTOOLS-VERSION-FILE
+
+LF='
+'
+
+# First check if there is a .git to get the version from git describe
+# otherwise try to get the version from the kernel makefile
+if test -d .git -o -f .git &&
+	VN=$(git describe --abbrev=4 HEAD 2>/dev/null) &&
+	case "$VN" in
+	*$LF*) (exit 1) ;;
+	v[0-9]*)
+		git update-index -q --refresh
+		test -z "$(git diff-index --name-only HEAD --)" ||
+		VN="$VN-dirty" ;;
+	esac
+then
+	VN=$(echo "$VN" | sed -e 's/-/./g');
+else
+	VN=3.18.0
+fi
+
+VN=$(expr "$VN" : v*'\(.*\)')
+
+if test -r $GVF
+then
+	VC=$(sed -e 's/^KVMTOOLS_VERSION = //' <$GVF)
+else
+	VC=unset
+fi
+test "$VN" = "$VC" || {
+	echo "KVMTOOLS_VERSION = $VN" >$GVF
+}
diff --git a/kvmtool/util/generate-cmdlist.sh b/kvmtool/util/generate-cmdlist.sh
new file mode 100755
index 0000000..c8be0bd
--- /dev/null
+++ b/kvmtool/util/generate-cmdlist.sh
@@ -0,0 +1,23 @@
+#!/bin/sh
+
+echo "/* Automatically generated by $0 */
+struct cmdname_help
+{
+    char name[16];
+    char help[80];
+};
+
+static struct cmdname_help common_cmds[] = {"
+
+sed -n 's/^lkvm-\([^ \t]*\).*common/\1/p' command-list.txt |
+while read cmd
+do
+	# TODO: the following sed command should be fixed
+	sed -n '/^NAME/,/^lkvm-'"$cmd"'/ {
+		/NAME/d
+		/--/d
+		s/.*kvm-'"$cmd"' - \(.*\)/  {"'"$cmd"'", "\1"},/
+		p
+	}' "Documentation/kvm-$cmd.txt"
+done
+echo "};"
diff --git a/kvmtool/util/init.c b/kvmtool/util/init.c
new file mode 100644
index 0000000..d4ce144
--- /dev/null
+++ b/kvmtool/util/init.c
@@ -0,0 +1,67 @@
+#include <linux/list.h>
+#include <linux/kernel.h>
+
+#include "kvm/kvm.h"
+#include "kvm/util-init.h"
+
+#define PRIORITY_LISTS 10
+
+static struct hlist_head init_lists[PRIORITY_LISTS];
+static struct hlist_head exit_lists[PRIORITY_LISTS];
+
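+/*
+ * Initcalls are bucketed by priority: init_list__init() runs the buckets
+ * from priority 0 upwards, init_list__exit() walks them in reverse order.
+ */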
+int init_list_add(struct init_item *t, int (*init)(struct kvm *),
+			int priority, const char *name)
+{
+	t->init = init;
+	t->fn_name = name;
+	hlist_add_head(&t->n, &init_lists[priority]);
+
+	return 0;
+}
+
+int exit_list_add(struct init_item *t, int (*init)(struct kvm *),
+			int priority, const char *name)
+{
+	t->init = init;
+	t->fn_name = name;
+	hlist_add_head(&t->n, &exit_lists[priority]);
+
+	return 0;
+}
+
+int init_list__init(struct kvm *kvm)
+{
+	unsigned int i;
+	int r = 0;
+	struct init_item *t;
+
+	for (i = 0; i < ARRAY_SIZE(init_lists); i++)
+		hlist_for_each_entry(t, &init_lists[i], n) {
+			r = t->init(kvm);
+			if (r < 0) {
+				pr_warning("Failed init: %s\n", t->fn_name);
+				goto fail;
+			}
+		}
+
+fail:
+	return r;
+}
+
+int init_list__exit(struct kvm *kvm)
+{
+	int i;
+	int r = 0;
+	struct init_item *t;
+
+	for (i = ARRAY_SIZE(exit_lists) - 1; i >= 0; i--)
+		hlist_for_each_entry(t, &exit_lists[i], n) {
+			r = t->init(kvm);
+			if (r < 0) {
+				pr_warning("%s failed.\n", t->fn_name);
+				goto fail;
+			}
+		}
+fail:
+	return r;
+}
diff --git a/kvmtool/util/iovec.c b/kvmtool/util/iovec.c
new file mode 100644
index 0000000..089f105
--- /dev/null
+++ b/kvmtool/util/iovec.c
@@ -0,0 +1,121 @@
+/*
+ *	iovec manipulation routines.
+ *
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ *	Fixes:
+ *		Andrew Lunn	:	Errors in iovec copying.
+ *		Pedro Roque	:	Added memcpy_fromiovecend and
+ *					csum_..._fromiovecend.
+ *		Andi Kleen	:	fixed error handling for 2.1
+ *		Alexey Kuznetsov:	2.1 optimisations
+ *		Andi Kleen	:	Fix csum*fromiovecend for IPv6.
+ */
+
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/compiler.h>
+#include <sys/uio.h>
+#include <kvm/iovec.h>
+#include <string.h>
+
+/*
+ *	Copy a kernel buffer to an iovec. Always returns 0 here: unlike the
+ *	kernel original, this userspace memcpy cannot fault.
+ *
+ *	Note: this modifies the original iovec.
+ */
+
+int memcpy_toiovec(struct iovec *iov, unsigned char *kdata, int len)
+{
+	while (len > 0) {
+		if (iov->iov_len) {
+			int copy = min_t(unsigned int, iov->iov_len, len);
+			memcpy(iov->iov_base, kdata, copy);
+			kdata += copy;
+			len -= copy;
+			iov->iov_len -= copy;
+			iov->iov_base += copy;
+		}
+		iov++;
+	}
+
+	return 0;
+}
+
+/*
+ *	Copy a kernel buffer to an iovec at an offset. Always returns 0.
+ */
+
+int memcpy_toiovecend(const struct iovec *iov, unsigned char *kdata,
+		      size_t offset, int len)
+{
+	int copy;
+	for (; len > 0; ++iov) {
+		/* Skip over the finished iovecs */
+		if (unlikely(offset >= iov->iov_len)) {
+			offset -= iov->iov_len;
+			continue;
+		}
+		copy = min_t(unsigned int, iov->iov_len - offset, len);
+		memcpy(iov->iov_base + offset, kdata, copy);
+		offset = 0;
+		kdata += copy;
+		len -= copy;
+	}
+
+	return 0;
+}
+
+/*
+ *	Copy an iovec to a kernel buffer. Always returns 0 here.
+ *
+ *	Note: this modifies the original iovec.
+ */
+
+int memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len)
+{
+	while (len > 0) {
+		if (iov->iov_len) {
+			int copy = min_t(unsigned int, len, iov->iov_len);
+			memcpy(kdata, iov->iov_base, copy);
+			len -= copy;
+			kdata += copy;
+			iov->iov_base += copy;
+			iov->iov_len -= copy;
+		}
+		iov++;
+	}
+
+	return 0;
+}
+
+/*
+ *	Copy an iovec to a kernel buffer, starting at an offset within the
+ *	iovec. Always returns 0.
+ */
+
+int memcpy_fromiovecend(unsigned char *kdata, const struct iovec *iov,
+			size_t offset, int len)
+{
+	/* Skip over the finished iovecs */
+	while (offset >= iov->iov_len) {
+		offset -= iov->iov_len;
+		iov++;
+	}
+
+	while (len > 0) {
+		char *base = iov->iov_base + offset;
+		int copy = min_t(unsigned int, len, iov->iov_len - offset);
+
+		offset = 0;
+		memcpy(kdata, base, copy);
+		len -= copy;
+		kdata += copy;
+		iov++;
+	}
+
+	return 0;
+}
diff --git a/kvmtool/util/kvm-ifup-vbr0 b/kvmtool/util/kvm-ifup-vbr0
new file mode 100755
index 0000000..a91c37f
--- /dev/null
+++ b/kvmtool/util/kvm-ifup-vbr0
@@ -0,0 +1,6 @@
+#!/bin/sh
+switch=vbr0
+/sbin/ifconfig $1 0.0.0.0 up
+/usr/sbin/brctl addif ${switch} $1
+/usr/sbin/brctl setfd ${switch} 0
+/usr/sbin/brctl stp ${switch} off
diff --git a/kvmtool/util/parse-options.c b/kvmtool/util/parse-options.c
new file mode 100644
index 0000000..9a1bbee
--- /dev/null
+++ b/kvmtool/util/parse-options.c
@@ -0,0 +1,577 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <stdbool.h>
+
+/* user defined includes */
+#include <linux/types.h>
+#include <kvm/util.h>
+#include <kvm/parse-options.h>
+#include <kvm/strbuf.h>
+
+#define OPT_SHORT 1
+#define OPT_UNSET 2
+
+static int opterror(const struct option *opt, const char *reason, int flags)
+{
+	if (flags & OPT_SHORT)
+		return pr_err("switch `%c' %s", opt->short_name, reason);
+	if (flags & OPT_UNSET)
+		return pr_err("option `no-%s' %s", opt->long_name, reason);
+	return pr_err("option `%s' %s", opt->long_name, reason);
+}
+
+static int get_arg(struct parse_opt_ctx_t *p, const struct option *opt,
+		int flags, const char **arg)
+{
+	if (p->opt) {
+		*arg = p->opt;
+		p->opt = NULL;
+	} else if ((opt->flags & PARSE_OPT_LASTARG_DEFAULT) && (p->argc == 1 ||
+				**(p->argv + 1) == '-')) {
+		*arg = (const char *)opt->defval;
+	} else if (p->argc > 1) {
+		p->argc--;
+		*arg = *++p->argv;
+	} else
+		return opterror(opt, "requires a value", flags);
+	return 0;
+}
+
+static int readnum(const struct option *opt, int flags,
+		   const char *str, char **end)
+{
+	switch (opt->type) {
+	case OPTION_INTEGER:
+		*(int *)opt->value = strtol(str, end, 0);
+		break;
+	case OPTION_UINTEGER:
+		*(unsigned int *)opt->value = strtol(str, end, 0);
+		break;
+	case OPTION_LONG:
+		*(long *)opt->value = strtol(str, end, 0);
+		break;
+	case OPTION_U64:
+		*(u64 *)opt->value = strtoull(str, end, 0);
+		break;
+	default:
+		return opterror(opt, "invalid numeric conversion", flags);
+	}
+
+	return 0;
+}
+
+static int get_value(struct parse_opt_ctx_t *p,
+		const struct option *opt, int flags)
+{
+	const char *s, *arg = NULL;
+	const int unset = flags & OPT_UNSET;
+
+	if (unset && p->opt)
+		return opterror(opt, "takes no value", flags);
+	if (unset && (opt->flags & PARSE_OPT_NONEG))
+		return opterror(opt, "isn't available", flags);
+
+	if (!(flags & OPT_SHORT) && p->opt) {
+		switch (opt->type) {
+		case OPTION_CALLBACK:
+			if (!(opt->flags & PARSE_OPT_NOARG))
+				break;
+		/* FALLTHROUGH */
+		case OPTION_BOOLEAN:
+		case OPTION_INCR:
+		case OPTION_BIT:
+		case OPTION_SET_UINT:
+		case OPTION_SET_PTR:
+			return opterror(opt, "takes no value", flags);
+		case OPTION_END:
+		case OPTION_ARGUMENT:
+		case OPTION_GROUP:
+		case OPTION_STRING:
+		case OPTION_INTEGER:
+		case OPTION_UINTEGER:
+		case OPTION_LONG:
+		case OPTION_U64:
+		default:
+			break;
+		}
+	}
+
+	switch (opt->type) {
+	case OPTION_BIT:
+		if (unset)
+			*(int *)opt->value &= ~opt->defval;
+		else
+			*(int *)opt->value |= opt->defval;
+		return 0;
+
+	case OPTION_BOOLEAN:
+		*(bool *)opt->value = unset ? false : true;
+		return 0;
+
+	case OPTION_INCR:
+		*(int *)opt->value = unset ? 0 : *(int *)opt->value + 1;
+		return 0;
+
+	case OPTION_SET_UINT:
+		*(unsigned int *)opt->value = unset ? 0 : opt->defval;
+		return 0;
+
+	case OPTION_SET_PTR:
+		*(void **)opt->value = unset ? NULL : (void *)opt->defval;
+		return 0;
+
+	case OPTION_STRING:
+		if (unset)
+			*(const char **)opt->value = NULL;
+		else if (opt->flags & PARSE_OPT_OPTARG && !p->opt)
+			*(const char **)opt->value = (const char *)opt->defval;
+		else
+			return get_arg(p, opt, flags,
+					(const char **)opt->value);
+		return 0;
+
+	case OPTION_CALLBACK:
+		if (unset)
+			return (*opt->callback)(opt, NULL, 1) ? (-1) : 0;
+		if (opt->flags & PARSE_OPT_NOARG)
+			return (*opt->callback)(opt, NULL, 0) ? (-1) : 0;
+		if (opt->flags & PARSE_OPT_OPTARG && !p->opt)
+			return (*opt->callback)(opt, NULL, 0) ? (-1) : 0;
+		if (get_arg(p, opt, flags, &arg))
+			return -1;
+		return (*opt->callback)(opt, arg, 0) ? (-1) : 0;
+
+	case OPTION_INTEGER:
+		if (unset) {
+			*(int *)opt->value = 0;
+			return 0;
+		}
+		if (opt->flags & PARSE_OPT_OPTARG && !p->opt) {
+			*(int *)opt->value = opt->defval;
+			return 0;
+		}
+		if (get_arg(p, opt, flags, &arg))
+			return -1;
+		return readnum(opt, flags, arg, (char **)&s);
+
+	case OPTION_UINTEGER:
+		if (unset) {
+			*(unsigned int *)opt->value = 0;
+			return 0;
+		}
+		if (opt->flags & PARSE_OPT_OPTARG && !p->opt) {
+			*(unsigned int *)opt->value = opt->defval;
+			return 0;
+		}
+		if (get_arg(p, opt, flags, &arg))
+			return -1;
+		return readnum(opt, flags, arg, (char **)&s);
+
+	case OPTION_LONG:
+		if (unset) {
+			*(long *)opt->value = 0;
+			return 0;
+		}
+		if (opt->flags & PARSE_OPT_OPTARG && !p->opt) {
+			*(long *)opt->value = opt->defval;
+			return 0;
+		}
+		if (get_arg(p, opt, flags, &arg))
+			return -1;
+		return readnum(opt, flags, arg, (char **)&s);
+
+	case OPTION_U64:
+		if (unset) {
+			*(u64 *)opt->value = 0;
+			return 0;
+		}
+		if (opt->flags & PARSE_OPT_OPTARG && !p->opt) {
+			*(u64 *)opt->value = opt->defval;
+			return 0;
+		}
+		if (get_arg(p, opt, flags, &arg))
+			return -1;
+		return readnum(opt, flags, arg, (char **)&s);
+
+	case OPTION_END:
+	case OPTION_ARGUMENT:
+	case OPTION_GROUP:
+	default:
+		die("should not happen, someone must be hit on the forehead");
+	}
+}
+
+#define USAGE_OPTS_WIDTH 24
+#define USAGE_GAP         2
+
+static int usage_with_options_internal(const char * const *usagestr,
+		const struct option *opts, int full)
+{
+	if (!usagestr)
+		return PARSE_OPT_HELP;
+
+	fprintf(stderr, "\n usage: %s\n", *usagestr++);
+	while (*usagestr && **usagestr)
+		fprintf(stderr, "    or: %s\n", *usagestr++);
+	while (*usagestr) {
+		fprintf(stderr, "%s%s\n",
+				**usagestr ? "    " : "",
+				*usagestr);
+		usagestr++;
+	}
+
+	if (opts->type != OPTION_GROUP)
+		fputc('\n', stderr);
+
+	for (; opts->type != OPTION_END; opts++) {
+		size_t pos;
+		int pad;
+
+		if (opts->type == OPTION_GROUP) {
+			fputc('\n', stderr);
+			if (*opts->help)
+				fprintf(stderr, "%s\n", opts->help);
+			continue;
+		}
+		if (!full && (opts->flags & PARSE_OPT_HIDDEN))
+			continue;
+
+		pos = fprintf(stderr, "    ");
+		if (opts->short_name)
+			pos += fprintf(stderr, "-%c", opts->short_name);
+		else
+			pos += fprintf(stderr, "    ");
+
+		if (opts->long_name && opts->short_name)
+			pos += fprintf(stderr, ", ");
+		if (opts->long_name)
+			pos += fprintf(stderr, "--%s", opts->long_name);
+
+		switch (opts->type) {
+		case OPTION_ARGUMENT:
+			break;
+		case OPTION_LONG:
+		case OPTION_U64:
+		case OPTION_INTEGER:
+		case OPTION_UINTEGER:
+			if (opts->flags & PARSE_OPT_OPTARG) {
+				if (opts->long_name)
+					pos += fprintf(stderr, "[=<n>]");
+				else
+					pos += fprintf(stderr, "[<n>]");
+			} else {
+				pos += fprintf(stderr, " <n>");
+			}
+			break;
+		case OPTION_CALLBACK:
+			if (opts->flags & PARSE_OPT_NOARG)
+				break;
+		/* FALLTHROUGH */
+		case OPTION_STRING:
+			if (opts->argh) {
+				if (opts->flags & PARSE_OPT_OPTARG) {
+					if (opts->long_name)
+						pos += fprintf(stderr, "[=<%s>]", opts->argh);
+					else
+						pos += fprintf(stderr, "[<%s>]", opts->argh);
+				} else {
+					pos += fprintf(stderr, " <%s>", opts->argh);
+				}
+			} else {
+				if (opts->flags & PARSE_OPT_OPTARG) {
+					if (opts->long_name)
+						pos += fprintf(stderr, "[=...]");
+					else
+						pos += fprintf(stderr, "[...]");
+				} else {
+					pos += fprintf(stderr, " ...");
+				}
+			}
+			break;
+		default: /* OPTION_{BIT,BOOLEAN,SET_UINT,SET_PTR} */
+		case OPTION_END:
+		case OPTION_GROUP:
+		case OPTION_BIT:
+		case OPTION_BOOLEAN:
+		case OPTION_INCR:
+		case OPTION_SET_UINT:
+		case OPTION_SET_PTR:
+			break;
+		}
+		if (pos <= USAGE_OPTS_WIDTH)
+			pad = USAGE_OPTS_WIDTH - pos;
+		else {
+			fputc('\n', stderr);
+			pad = USAGE_OPTS_WIDTH;
+		}
+		fprintf(stderr, "%*s%s\n", pad + USAGE_GAP, "", opts->help);
+	}
+	fputc('\n', stderr);
+
+	return PARSE_OPT_HELP;
+}
+
+void usage_with_options(const char * const *usagestr,
+		const struct option *opts)
+{
+	usage_with_options_internal(usagestr, opts, 0);
+	exit(129);
+}
+
+static void check_typos(const char *arg, const struct option *options)
+{
+	if (strlen(arg) < 3)
+		return;
+
+	if (!prefixcmp(arg, "no-")) {
+		pr_err("did you mean `--%s` (with two dashes?)", arg);
+		exit(129);
+	}
+
+	for (; options->type != OPTION_END; options++) {
+		if (!options->long_name)
+			continue;
+		if (!prefixcmp(options->long_name, arg)) {
+			pr_err("did you mean `--%s` (with two dashes?)", arg);
+			exit(129);
+		}
+	}
+}
+
+static int parse_options_usage(const char * const *usagestr,
+		const struct option *opts)
+{
+	return usage_with_options_internal(usagestr, opts, 0);
+}
+
+static int parse_short_opt(struct parse_opt_ctx_t *p,
+        const struct option *options)
+{
+	for (; options->type != OPTION_END; options++) {
+		if (options->short_name == *p->opt) {
+			p->opt = p->opt[1] ? p->opt + 1 : NULL;
+			return get_value(p, options, OPT_SHORT);
+		}
+	}
+	return -2;
+}
+
+static int parse_long_opt(struct parse_opt_ctx_t *p, const char *arg,
+		const struct option *options)
+{
+	const char *arg_end = strchr(arg, '=');
+	const struct option *abbrev_option = NULL, *ambiguous_option = NULL;
+	int abbrev_flags = 0, ambiguous_flags = 0;
+
+	if (!arg_end)
+		arg_end = arg + strlen(arg);
+
+	for (; options->type != OPTION_END; options++) {
+		const char *rest;
+		int flags = 0;
+
+		if (!options->long_name)
+			continue;
+
+		rest = skip_prefix(arg, options->long_name);
+		if (options->type == OPTION_ARGUMENT) {
+			if (!rest)
+				continue;
+			if (*rest == '=')
+				return opterror(options, "takes no value",
+						flags);
+			if (*rest)
+				continue;
+			p->out[p->cpidx++] = arg - 2;
+			return 0;
+		}
+		if (!rest) {
+			/* abbreviated? */
+			if (!strncmp(options->long_name, arg, arg_end - arg)) {
+is_abbreviated:
+				if (abbrev_option) {
+					/*
+					 * If this is abbreviated, it is
+					 * ambiguous. So when there is no
+					 * exact match later, we need to
+					 * error out.
+					 */
+					ambiguous_option = abbrev_option;
+					ambiguous_flags = abbrev_flags;
+				}
+				if (!(flags & OPT_UNSET) && *arg_end)
+					p->opt = arg_end + 1;
+				abbrev_option = options;
+				abbrev_flags = flags;
+				continue;
+			}
+			/* negated and abbreviated very much? */
+			if (!prefixcmp("no-", arg)) {
+				flags |= OPT_UNSET;
+				goto is_abbreviated;
+			}
+			/* negated? */
+			if (strncmp(arg, "no-", 3))
+				continue;
+			flags |= OPT_UNSET;
+			rest = skip_prefix(arg + 3, options->long_name);
+			/* abbreviated and negated? */
+			if (!rest && !prefixcmp(options->long_name, arg + 3))
+				goto is_abbreviated;
+			if (!rest)
+				continue;
+		}
+		if (*rest) {
+			if (*rest != '=')
+				continue;
+			p->opt = rest + 1;
+		}
+		return get_value(p, options, flags);
+	}
+
+	if (ambiguous_option)
+		return pr_err("Ambiguous option: %s "
+				"(could be --%s%s or --%s%s)",
+				arg,
+				(ambiguous_flags & OPT_UNSET) ?  "no-" : "",
+				ambiguous_option->long_name,
+				(abbrev_flags & OPT_UNSET) ?  "no-" : "",
+				abbrev_option->long_name);
+	if (abbrev_option)
+		return get_value(p, abbrev_option, abbrev_flags);
+	return -2;
+}
+
+
+static void parse_options_start(struct parse_opt_ctx_t *ctx, int argc,
+		const char **argv, int flags)
+{
+	memset(ctx, 0, sizeof(*ctx));
+	ctx->argc = argc;
+	ctx->argv = argv;
+	ctx->out  = argv;
+	ctx->cpidx = ((flags & PARSE_OPT_KEEP_ARGV0) != 0);
+	ctx->flags = flags;
+	if ((flags & PARSE_OPT_KEEP_UNKNOWN) &&
+			(flags & PARSE_OPT_STOP_AT_NON_OPTION))
+		die("STOP_AT_NON_OPTION and KEEP_UNKNOWN don't go together");
+}
+
+static int parse_options_end(struct parse_opt_ctx_t *ctx)
+{
+	memmove(ctx->out + ctx->cpidx, ctx->argv, ctx->argc * sizeof(*ctx->out));
+	ctx->out[ctx->cpidx + ctx->argc] = NULL;
+	return ctx->cpidx + ctx->argc;
+}
+
+
+static int parse_options_step(struct parse_opt_ctx_t *ctx,
+		const struct option *options, const char * const usagestr[])
+{
+	int internal_help = !(ctx->flags & PARSE_OPT_NO_INTERNAL_HELP);
+
+	/* we must reset ->opt, as an unknown short option would leave it dangling */
+	ctx->opt = NULL;
+
+	for (; ctx->argc; ctx->argc--, ctx->argv++) {
+		const char *arg = ctx->argv[0];
+
+		if (*arg != '-' || !arg[1]) {
+			if (ctx->flags & PARSE_OPT_STOP_AT_NON_OPTION)
+				break;
+			ctx->out[ctx->cpidx++] = ctx->argv[0];
+			continue;
+		}
+
+		if (arg[1] != '-') {
+			ctx->opt = arg + 1;
+			if (internal_help && *ctx->opt == 'h')
+				return parse_options_usage(usagestr, options);
+			switch (parse_short_opt(ctx, options)) {
+			case -1:
+				return parse_options_usage(usagestr, options);
+			case -2:
+				goto unknown;
+			default:
+				break;
+			}
+			if (ctx->opt)
+				check_typos(arg + 1, options);
+			while (ctx->opt) {
+				if (internal_help && *ctx->opt == 'h')
+					return parse_options_usage(usagestr,
+							options);
+				switch (parse_short_opt(ctx, options)) {
+				case -1:
+					return parse_options_usage(usagestr,
+							options);
+				case -2:
+					/*
+					 * Fake a short option to hide the
+					 * fact that we may have started
+					 * parsing an aggregated option
+					 * cluster.  The strdup() below
+					 * leaks; too bad.
+					 */
+					ctx->argv[0] = strdup(ctx->opt - 1);
+					*(char *)ctx->argv[0] = '-';
+					goto unknown;
+				default:
+					break;
+				}
+			}
+			continue;
+		}
+
+		if (!arg[2]) { /* "--" */
+			if (!(ctx->flags & PARSE_OPT_KEEP_DASHDASH)) {
+				ctx->argc--;
+				ctx->argv++;
+			}
+			break;
+		}
+
+		if (internal_help && !strcmp(arg + 2, "help-all"))
+			return usage_with_options_internal(usagestr, options,
+					1);
+		if (internal_help && !strcmp(arg + 2, "help"))
+			return parse_options_usage(usagestr, options);
+		switch (parse_long_opt(ctx, arg + 2, options)) {
+		case -1:
+			return parse_options_usage(usagestr, options);
+		case -2:
+			goto unknown;
+		default:
+			break;
+		}
+		continue;
+unknown:
+		if (!(ctx->flags & PARSE_OPT_KEEP_UNKNOWN))
+			return PARSE_OPT_UNKNOWN;
+		ctx->out[ctx->cpidx++] = ctx->argv[0];
+		ctx->opt = NULL;
+	}
+	return PARSE_OPT_DONE;
+}
+
+int parse_options(int argc, const char **argv, const struct option *options,
+		const char * const usagestr[], int flags)
+{
+	struct parse_opt_ctx_t ctx;
+
+	parse_options_start(&ctx, argc, argv, flags);
+	switch (parse_options_step(&ctx, options, usagestr)) {
+	case PARSE_OPT_HELP:
+		exit(129);
+	case PARSE_OPT_DONE:
+		break;
+	default: /* PARSE_OPT_UNKNOWN */
+		if (ctx.argv[0][1] == '-') {
+			pr_err("unknown option `%s'", ctx.argv[0] + 2);
+		} else {
+			pr_err("unknown switch `%c'", *ctx.opt);
+		}
+		usage_with_options(usagestr, options);
+	}
+
+	return parse_options_end(&ctx);
+}
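+
+/*
+ * Usage sketch (illustrative; assumes the perf-style OPT_* helper macros
+ * from <kvm/parse-options.h>, and the names below are hypothetical):
+ *
+ *	static bool verbose;
+ *	static const char *name;
+ *
+ *	static const char * const example_usage[] = {
+ *		"lkvm example [-v] [--name <name>]",
+ *		NULL
+ *	};
+ *
+ *	static const struct option example_options[] = {
+ *		OPT_BOOLEAN('v', "verbose", &verbose, "be verbose"),
+ *		OPT_STRING('n', "name", &name, "name", "guest name"),
+ *		OPT_END()
+ *	};
+ *
+ *	argc = parse_options(argc, argv, example_options, example_usage, 0);
+ *
+ * On return, any leftover non-option arguments sit at argv[0..argc-1].
+ */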
diff --git a/kvmtool/util/rbtree-interval.c b/kvmtool/util/rbtree-interval.c
new file mode 100644
index 0000000..3630a6d
--- /dev/null
+++ b/kvmtool/util/rbtree-interval.c
@@ -0,0 +1,58 @@
+#include <kvm/rbtree-interval.h>
+#include <stddef.h>
+#include <errno.h>
+
+struct rb_int_node *rb_int_search_single(struct rb_root *root, u64 point)
+{
+	struct rb_node *node = root->rb_node;
+
+	while (node) {
+		struct rb_int_node *cur = rb_int(node);
+
+		if (point < cur->low)
+			node = node->rb_left;
+		else if (cur->high <= point)
+			node = node->rb_right;
+		else
+			return cur;
+	}
+
+	return NULL;
+}
+
+struct rb_int_node *rb_int_search_range(struct rb_root *root, u64 low, u64 high)
+{
+	struct rb_int_node *range;
+
+	range = rb_int_search_single(root, low);
+	if (range == NULL)
+		return NULL;
+
+	/*
+	 * Verify that 'high' does not extend beyond the end of the range
+	 * that contains 'low'.
+	 */
+	if (range->high < high)
+		return NULL;
+
+	return range;
+}
+
+int rb_int_insert(struct rb_root *root, struct rb_int_node *i_node)
+{
+	struct rb_node **node = &root->rb_node, *parent = NULL;
+
+	while (*node) {
+		struct rb_int_node *cur = rb_int(*node);
+
+		parent = *node;
+		if (i_node->high <= cur->low)
+			node = &cur->node.rb_left;
+		else if (cur->high <= i_node->low)
+			node = &cur->node.rb_right;
+		else
+			return -EEXIST;
+	}
+
+	rb_link_node(&i_node->node, parent, node);
+	rb_insert_color(&i_node->node, root);
+
+	return 0;
+}
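+
+/*
+ * Usage sketch (illustrative; struct mmio_mapping is a hypothetical user
+ * of the tree).  Intervals are embedded rb_int_node entries keyed by
+ * [low, high), and rb_int() maps a hit back to the containing structure:
+ *
+ *	struct mmio_mapping {
+ *		struct rb_int_node node;
+ *		void *priv;
+ *	};
+ *
+ *	struct rb_root root = RB_ROOT;
+ *	struct mmio_mapping map = {
+ *		.node = { .low = 0x1000, .high = 0x2000 },
+ *	};
+ *
+ *	rb_int_insert(&root, &map.node);
+ *	struct rb_int_node *hit = rb_int_search_single(&root, 0x1800);
+ *
+ * rb_int_insert() returns -EEXIST when the new interval overlaps an
+ * existing one.
+ */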
diff --git a/kvmtool/util/rbtree.c b/kvmtool/util/rbtree.c
new file mode 100644
index 0000000..9c2dedb
--- /dev/null
+++ b/kvmtool/util/rbtree.c
@@ -0,0 +1,548 @@
+/*
+  Red Black Trees
+  (C) 1999  Andrea Arcangeli <andrea@suse.de>
+  (C) 2002  David Woodhouse <dwmw2@infradead.org>
+  (C) 2012  Michel Lespinasse <walken@google.com>
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; either version 2 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+  linux/lib/rbtree.c
+*/
+
+#include <linux/rbtree_augmented.h>
+
+/*
+ * red-black trees properties:  http://en.wikipedia.org/wiki/Rbtree
+ *
+ *  1) A node is either red or black
+ *  2) The root is black
+ *  3) All leaves (NULL) are black
+ *  4) Both children of every red node are black
+ *  5) Every simple path from root to leaves contains the same number
+ *     of black nodes.
+ *
+ *  4 and 5 give the O(log n) guarantee, since 4 implies you cannot have two
+ *  consecutive red nodes in a path and every red node is therefore followed by
+ *  a black. So if B is the number of black nodes on every simple path (as per
+ *  5), then the longest possible path due to 4 is 2B.
+ *
+ *  We shall indicate color with case, where black nodes are uppercase and red
+ *  nodes will be lowercase. Unknown color nodes shall be drawn as red within
+ *  parentheses and have some accompanying text comment.
+ */
+
+static inline void rb_set_black(struct rb_node *rb)
+{
+	rb->__rb_parent_color |= RB_BLACK;
+}
+
+static inline struct rb_node *rb_red_parent(struct rb_node *red)
+{
+	return (struct rb_node *)red->__rb_parent_color;
+}
+
+/*
+ * Helper function for rotations:
+ * - old's parent and color get assigned to new
+ * - old gets assigned new as a parent and 'color' as a color.
+ */
+static inline void
+__rb_rotate_set_parents(struct rb_node *old, struct rb_node *new,
+			struct rb_root *root, int color)
+{
+	struct rb_node *parent = rb_parent(old);
+	new->__rb_parent_color = old->__rb_parent_color;
+	rb_set_parent_color(old, new, color);
+	__rb_change_child(old, new, parent, root);
+}
+
+static __always_inline void
+__rb_insert(struct rb_node *node, struct rb_root *root,
+	    void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
+{
+	struct rb_node *parent = rb_red_parent(node), *gparent, *tmp;
+
+	while (true) {
+		/*
+		 * Loop invariant: node is red
+		 *
+		 * If there is a black parent, we are done.
+		 * Otherwise, take some corrective action as we don't
+		 * want a red root or two consecutive red nodes.
+		 */
+		if (!parent) {
+			rb_set_parent_color(node, NULL, RB_BLACK);
+			break;
+		} else if (rb_is_black(parent))
+			break;
+
+		gparent = rb_red_parent(parent);
+
+		tmp = gparent->rb_right;
+		if (parent != tmp) {	/* parent == gparent->rb_left */
+			if (tmp && rb_is_red(tmp)) {
+				/*
+				 * Case 1 - color flips
+				 *
+				 *       G            g
+				 *      / \          / \
+				 *     p   u  -->   P   U
+				 *    /            /
+				 *   n            n
+				 *
+				 * However, since g's parent might be red, and
+				 * 4) does not allow this, we need to recurse
+				 * at g.
+				 */
+				rb_set_parent_color(tmp, gparent, RB_BLACK);
+				rb_set_parent_color(parent, gparent, RB_BLACK);
+				node = gparent;
+				parent = rb_parent(node);
+				rb_set_parent_color(node, parent, RB_RED);
+				continue;
+			}
+
+			tmp = parent->rb_right;
+			if (node == tmp) {
+				/*
+				 * Case 2 - left rotate at parent
+				 *
+				 *      G             G
+				 *     / \           / \
+				 *    p   U  -->    n   U
+				 *     \           /
+				 *      n         p
+				 *
+				 * This still leaves us in violation of 4), the
+				 * continuation into Case 3 will fix that.
+				 */
+				parent->rb_right = tmp = node->rb_left;
+				node->rb_left = parent;
+				if (tmp)
+					rb_set_parent_color(tmp, parent,
+							    RB_BLACK);
+				rb_set_parent_color(parent, node, RB_RED);
+				augment_rotate(parent, node);
+				parent = node;
+				tmp = node->rb_right;
+			}
+
+			/*
+			 * Case 3 - right rotate at gparent
+			 *
+			 *        G           P
+			 *       / \         / \
+			 *      p   U  -->  n   g
+			 *     /                 \
+			 *    n                   U
+			 */
+			gparent->rb_left = tmp;  /* == parent->rb_right */
+			parent->rb_right = gparent;
+			if (tmp)
+				rb_set_parent_color(tmp, gparent, RB_BLACK);
+			__rb_rotate_set_parents(gparent, parent, root, RB_RED);
+			augment_rotate(gparent, parent);
+			break;
+		} else {
+			tmp = gparent->rb_left;
+			if (tmp && rb_is_red(tmp)) {
+				/* Case 1 - color flips */
+				rb_set_parent_color(tmp, gparent, RB_BLACK);
+				rb_set_parent_color(parent, gparent, RB_BLACK);
+				node = gparent;
+				parent = rb_parent(node);
+				rb_set_parent_color(node, parent, RB_RED);
+				continue;
+			}
+
+			tmp = parent->rb_left;
+			if (node == tmp) {
+				/* Case 2 - right rotate at parent */
+				parent->rb_left = tmp = node->rb_right;
+				node->rb_right = parent;
+				if (tmp)
+					rb_set_parent_color(tmp, parent,
+							    RB_BLACK);
+				rb_set_parent_color(parent, node, RB_RED);
+				augment_rotate(parent, node);
+				parent = node;
+				tmp = node->rb_left;
+			}
+
+			/* Case 3 - left rotate at gparent */
+			gparent->rb_right = tmp;  /* == parent->rb_left */
+			parent->rb_left = gparent;
+			if (tmp)
+				rb_set_parent_color(tmp, gparent, RB_BLACK);
+			__rb_rotate_set_parents(gparent, parent, root, RB_RED);
+			augment_rotate(gparent, parent);
+			break;
+		}
+	}
+}
+
+/*
+ * Inline version for rb_erase() use - we want to be able to inline
+ * and eliminate the dummy_rotate callback there
+ */
+static __always_inline void
+____rb_erase_color(struct rb_node *parent, struct rb_root *root,
+	void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
+{
+	struct rb_node *node = NULL, *sibling, *tmp1, *tmp2;
+
+	while (true) {
+		/*
+		 * Loop invariants:
+		 * - node is black (or NULL on first iteration)
+		 * - node is not the root (parent is not NULL)
+		 * - All leaf paths going through parent and node have a
+		 *   black node count that is 1 lower than other leaf paths.
+		 */
+		sibling = parent->rb_right;
+		if (node != sibling) {	/* node == parent->rb_left */
+			if (rb_is_red(sibling)) {
+				/*
+				 * Case 1 - left rotate at parent
+				 *
+				 *     P               S
+				 *    / \             / \
+				 *   N   s    -->    p   Sr
+				 *      / \         / \
+				 *     Sl  Sr      N   Sl
+				 */
+				parent->rb_right = tmp1 = sibling->rb_left;
+				sibling->rb_left = parent;
+				rb_set_parent_color(tmp1, parent, RB_BLACK);
+				__rb_rotate_set_parents(parent, sibling, root,
+							RB_RED);
+				augment_rotate(parent, sibling);
+				sibling = tmp1;
+			}
+			tmp1 = sibling->rb_right;
+			if (!tmp1 || rb_is_black(tmp1)) {
+				tmp2 = sibling->rb_left;
+				if (!tmp2 || rb_is_black(tmp2)) {
+					/*
+					 * Case 2 - sibling color flip
+					 * (p could be either color here)
+					 *
+					 *    (p)           (p)
+					 *    / \           / \
+					 *   N   S    -->  N   s
+					 *      / \           / \
+					 *     Sl  Sr        Sl  Sr
+					 *
+					 * This leaves us violating 5) which
+					 * can be fixed by flipping p to black
+					 * if it was red, or by recursing at p.
+					 * p is red when coming from Case 1.
+					 */
+					rb_set_parent_color(sibling, parent,
+							    RB_RED);
+					if (rb_is_red(parent))
+						rb_set_black(parent);
+					else {
+						node = parent;
+						parent = rb_parent(node);
+						if (parent)
+							continue;
+					}
+					break;
+				}
+				/*
+				 * Case 3 - right rotate at sibling
+				 * (p could be either color here)
+				 *
+				 *   (p)           (p)
+				 *   / \           / \
+				 *  N   S    -->  N   Sl
+				 *     / \             \
+				 *    sl  Sr            s
+				 *                       \
+				 *                        Sr
+				 */
+				sibling->rb_left = tmp1 = tmp2->rb_right;
+				tmp2->rb_right = sibling;
+				parent->rb_right = tmp2;
+				if (tmp1)
+					rb_set_parent_color(tmp1, sibling,
+							    RB_BLACK);
+				augment_rotate(sibling, tmp2);
+				tmp1 = sibling;
+				sibling = tmp2;
+			}
+			/*
+			 * Case 4 - left rotate at parent + color flips
+			 * (p and sl could be either color here.
+			 *  After rotation, p becomes black, s acquires
+			 *  p's color, and sl keeps its color)
+			 *
+			 *      (p)             (s)
+			 *      / \             / \
+			 *     N   S     -->   P   Sr
+			 *        / \         / \
+			 *      (sl) sr      N  (sl)
+			 */
+			parent->rb_right = tmp2 = sibling->rb_left;
+			sibling->rb_left = parent;
+			rb_set_parent_color(tmp1, sibling, RB_BLACK);
+			if (tmp2)
+				rb_set_parent(tmp2, parent);
+			__rb_rotate_set_parents(parent, sibling, root,
+						RB_BLACK);
+			augment_rotate(parent, sibling);
+			break;
+		} else {
+			sibling = parent->rb_left;
+			if (rb_is_red(sibling)) {
+				/* Case 1 - right rotate at parent */
+				parent->rb_left = tmp1 = sibling->rb_right;
+				sibling->rb_right = parent;
+				rb_set_parent_color(tmp1, parent, RB_BLACK);
+				__rb_rotate_set_parents(parent, sibling, root,
+							RB_RED);
+				augment_rotate(parent, sibling);
+				sibling = tmp1;
+			}
+			tmp1 = sibling->rb_left;
+			if (!tmp1 || rb_is_black(tmp1)) {
+				tmp2 = sibling->rb_right;
+				if (!tmp2 || rb_is_black(tmp2)) {
+					/* Case 2 - sibling color flip */
+					rb_set_parent_color(sibling, parent,
+							    RB_RED);
+					if (rb_is_red(parent))
+						rb_set_black(parent);
+					else {
+						node = parent;
+						parent = rb_parent(node);
+						if (parent)
+							continue;
+					}
+					break;
+				}
+				/* Case 3 - right rotate at sibling */
+				sibling->rb_right = tmp1 = tmp2->rb_left;
+				tmp2->rb_left = sibling;
+				parent->rb_left = tmp2;
+				if (tmp1)
+					rb_set_parent_color(tmp1, sibling,
+							    RB_BLACK);
+				augment_rotate(sibling, tmp2);
+				tmp1 = sibling;
+				sibling = tmp2;
+			}
+			/* Case 4 - left rotate at parent + color flips */
+			parent->rb_left = tmp2 = sibling->rb_right;
+			sibling->rb_right = parent;
+			rb_set_parent_color(tmp1, sibling, RB_BLACK);
+			if (tmp2)
+				rb_set_parent(tmp2, parent);
+			__rb_rotate_set_parents(parent, sibling, root,
+						RB_BLACK);
+			augment_rotate(parent, sibling);
+			break;
+		}
+	}
+}
+
+/* Non-inline version for rb_erase_augmented() use */
+void __rb_erase_color(struct rb_node *parent, struct rb_root *root,
+	void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
+{
+	____rb_erase_color(parent, root, augment_rotate);
+}
+
+/*
+ * Non-augmented rbtree manipulation functions.
+ *
+ * We use dummy augmented callbacks here, and have the compiler optimize them
+ * out of the rb_insert_color() and rb_erase() function definitions.
+ */
+
+static inline void dummy_propagate(struct rb_node *node, struct rb_node *stop) {}
+static inline void dummy_copy(struct rb_node *old, struct rb_node *new) {}
+static inline void dummy_rotate(struct rb_node *old, struct rb_node *new) {}
+
+static const struct rb_augment_callbacks dummy_callbacks = {
+	dummy_propagate, dummy_copy, dummy_rotate
+};
+
+void rb_insert_color(struct rb_node *node, struct rb_root *root)
+{
+	__rb_insert(node, root, dummy_rotate);
+}
+
+void rb_erase(struct rb_node *node, struct rb_root *root)
+{
+	struct rb_node *rebalance;
+	rebalance = __rb_erase_augmented(node, root, &dummy_callbacks);
+	if (rebalance)
+		____rb_erase_color(rebalance, root, dummy_rotate);
+}
+
+/*
+ * Augmented rbtree manipulation functions.
+ *
+ * This instantiates the same __always_inline functions as in the non-augmented
+ * case, but this time with user-defined callbacks.
+ */
+
+void __rb_insert_augmented(struct rb_node *node, struct rb_root *root,
+	void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
+{
+	__rb_insert(node, root, augment_rotate);
+}
+
+/*
+ * This function returns the first node (in sort order) of the tree.
+ */
+struct rb_node *rb_first(const struct rb_root *root)
+{
+	struct rb_node	*n;
+
+	n = root->rb_node;
+	if (!n)
+		return NULL;
+	while (n->rb_left)
+		n = n->rb_left;
+	return n;
+}
+
+struct rb_node *rb_last(const struct rb_root *root)
+{
+	struct rb_node	*n;
+
+	n = root->rb_node;
+	if (!n)
+		return NULL;
+	while (n->rb_right)
+		n = n->rb_right;
+	return n;
+}
+
+struct rb_node *rb_next(const struct rb_node *node)
+{
+	struct rb_node *parent;
+
+	if (RB_EMPTY_NODE(node))
+		return NULL;
+
+	/*
+	 * If we have a right-hand child, go down and then left as far
+	 * as we can.
+	 */
+	if (node->rb_right) {
+		node = node->rb_right;
+		while (node->rb_left)
+			node = node->rb_left;
+		return (struct rb_node *)node;
+	}
+
+	/*
+	 * No right-hand children. Everything down and left is smaller than us,
+	 * so any 'next' node must be in the general direction of our parent.
+	 * Go up the tree; any time the ancestor is a right-hand child of its
+	 * parent, keep going up. First time it's a left-hand child of its
+	 * parent, said parent is our 'next' node.
+	 */
+	while ((parent = rb_parent(node)) && node == parent->rb_right)
+		node = parent;
+
+	return parent;
+}
+
+struct rb_node *rb_prev(const struct rb_node *node)
+{
+	struct rb_node *parent;
+
+	if (RB_EMPTY_NODE(node))
+		return NULL;
+
+	/*
+	 * If we have a left-hand child, go down and then right as far
+	 * as we can.
+	 */
+	if (node->rb_left) {
+		node = node->rb_left;
+		while (node->rb_right)
+			node = node->rb_right;
+		return (struct rb_node *)node;
+	}
+
+	/*
+	 * No left-hand children. Go up till we find an ancestor which
+	 * is a right-hand child of its parent.
+	 */
+	while ((parent = rb_parent(node)) && node == parent->rb_left)
+		node = parent;
+
+	return parent;
+}
+
+void rb_replace_node(struct rb_node *victim, struct rb_node *new,
+		     struct rb_root *root)
+{
+	struct rb_node *parent = rb_parent(victim);
+
+	/* Set the surrounding nodes to point to the replacement */
+	__rb_change_child(victim, new, parent, root);
+	if (victim->rb_left)
+		rb_set_parent(victim->rb_left, new);
+	if (victim->rb_right)
+		rb_set_parent(victim->rb_right, new);
+
+	/* Copy the pointers/colour from the victim to the replacement */
+	*new = *victim;
+}
+
+static struct rb_node *rb_left_deepest_node(const struct rb_node *node)
+{
+	for (;;) {
+		if (node->rb_left)
+			node = node->rb_left;
+		else if (node->rb_right)
+			node = node->rb_right;
+		else
+			return (struct rb_node *)node;
+	}
+}
+
+struct rb_node *rb_next_postorder(const struct rb_node *node)
+{
+	const struct rb_node *parent;
+	if (!node)
+		return NULL;
+	parent = rb_parent(node);
+
+	/* If we're sitting on node, we've already seen our children */
+	if (parent && node == parent->rb_left && parent->rb_right) {
+		/* If we are the parent's left node, go to the parent's right
+		 * node then all the way down to the left */
+		return rb_left_deepest_node(parent->rb_right);
+	} else
+		/* Otherwise we are the parent's right node, and the parent
+		 * should be next */
+		return (struct rb_node *)parent;
+}
+
+struct rb_node *rb_first_postorder(const struct rb_root *root)
+{
+	if (!root->rb_node)
+		return NULL;
+
+	return rb_left_deepest_node(root->rb_node);
+}
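+
+/*
+ * Usage sketch (illustrative; struct mything and its 'node' member are
+ * hypothetical).  An in-order walk combines rb_first()/rb_next() with
+ * rb_entry() to recover the containing structure:
+ *
+ *	struct rb_node *n;
+ *
+ *	for (n = rb_first(&root); n; n = rb_next(n)) {
+ *		struct mything *t = rb_entry(n, struct mything, node);
+ *		...	visit t in ascending key order
+ *	}
+ */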
diff --git a/kvmtool/util/read-write.c b/kvmtool/util/read-write.c
new file mode 100644
index 0000000..06fc0df
--- /dev/null
+++ b/kvmtool/util/read-write.c
@@ -0,0 +1,339 @@
+#include "kvm/read-write.h"
+
+#include <sys/types.h>
+#include <sys/uio.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+
+/* Same as read(2) except that this function never returns EAGAIN or EINTR. */
+ssize_t xread(int fd, void *buf, size_t count)
+{
+	ssize_t nr;
+
+restart:
+	nr = read(fd, buf, count);
+	if ((nr < 0) && ((errno == EAGAIN) || (errno == EINTR)))
+		goto restart;
+
+	return nr;
+}
+
+/* Same as write(2) except that this function never returns EAGAIN or EINTR. */
+ssize_t xwrite(int fd, const void *buf, size_t count)
+{
+	ssize_t nr;
+
+restart:
+	nr = write(fd, buf, count);
+	if ((nr < 0) && ((errno == EAGAIN) || (errno == EINTR)))
+		goto restart;
+
+	return nr;
+}
+
+/*
+ * Read in the whole file while not exceeding max_size bytes of the buffer.
+ * Returns -1 (with errno set) in case of an error (ENOMEM if the buffer
+ * was too small) or the file size if the whole file could be read.
+ */
+ssize_t read_file(int fd, char *buf, size_t max_size)
+{
+	ssize_t ret;
+	char dummy;
+
+	errno = 0;
+	ret = read_in_full(fd, buf, max_size);
+
+	/* Probe whether we reached EOF. */
+	if (xread(fd, &dummy, 1) == 0)
+		return ret;
+
+	errno = ENOMEM;
+	return -1;
+}
+
+ssize_t read_in_full(int fd, void *buf, size_t count)
+{
+	ssize_t total = 0;
+	char *p = buf;
+
+	while (count > 0) {
+		ssize_t nr;
+
+		nr = xread(fd, p, count);
+		if (nr <= 0) {
+			if (total > 0)
+				return total;
+
+			return -1;
+		}
+
+		count -= nr;
+		total += nr;
+		p += nr;
+	}
+
+	return total;
+}
+
+ssize_t write_in_full(int fd, const void *buf, size_t count)
+{
+	const char *p = buf;
+	ssize_t total = 0;
+
+	while (count > 0) {
+		ssize_t nr;
+
+		nr = xwrite(fd, p, count);
+		if (nr < 0)
+			return -1;
+		if (nr == 0) {
+			errno = ENOSPC;
+			return -1;
+		}
+		count -= nr;
+		total += nr;
+		p += nr;
+	}
+
+	return total;
+}
+
+/* Same as pread(2) except that this function never returns EAGAIN or EINTR. */
+ssize_t xpread(int fd, void *buf, size_t count, off_t offset)
+{
+	ssize_t nr;
+
+restart:
+	nr = pread(fd, buf, count, offset);
+	if ((nr < 0) && ((errno == EAGAIN) || (errno == EINTR)))
+		goto restart;
+
+	return nr;
+}
+
+/* Same as pwrite(2) except that this function never returns EAGAIN or EINTR. */
+ssize_t xpwrite(int fd, const void *buf, size_t count, off_t offset)
+{
+	ssize_t nr;
+
+restart:
+	nr = pwrite(fd, buf, count, offset);
+	if ((nr < 0) && ((errno == EAGAIN) || (errno == EINTR)))
+		goto restart;
+
+	return nr;
+}
+
+ssize_t pread_in_full(int fd, void *buf, size_t count, off_t offset)
+{
+	ssize_t total = 0;
+	char *p = buf;
+
+	while (count > 0) {
+		ssize_t nr;
+
+		nr = xpread(fd, p, count, offset);
+		if (nr <= 0) {
+			if (total > 0)
+				return total;
+
+			return -1;
+		}
+
+		count -= nr;
+		total += nr;
+		p += nr;
+		offset += nr;
+	}
+
+	return total;
+}
+
+ssize_t pwrite_in_full(int fd, const void *buf, size_t count, off_t offset)
+{
+	const char *p = buf;
+	ssize_t total = 0;
+
+	while (count > 0) {
+		ssize_t nr;
+
+		nr = xpwrite(fd, p, count, offset);
+		if (nr < 0)
+			return -1;
+		if (nr == 0) {
+			errno = ENOSPC;
+			return -1;
+		}
+		count -= nr;
+		total += nr;
+		p += nr;
+		offset += nr;
+	}
+
+	return total;
+}
+
+/* Same as readv(2) except that this function never returns EAGAIN or EINTR. */
+ssize_t xreadv(int fd, const struct iovec *iov, int iovcnt)
+{
+	ssize_t nr;
+
+restart:
+	nr = readv(fd, iov, iovcnt);
+	if ((nr < 0) && ((errno == EAGAIN) || (errno == EINTR)))
+		goto restart;
+
+	return nr;
+}
+
+/* Same as writev(2) except that this function never returns EAGAIN or EINTR. */
+ssize_t xwritev(int fd, const struct iovec *iov, int iovcnt)
+{
+	ssize_t nr;
+
+restart:
+	nr = writev(fd, iov, iovcnt);
+	if ((nr < 0) && ((errno == EAGAIN) || (errno == EINTR)))
+		goto restart;
+
+	return nr;
+}
+
+static inline ssize_t get_iov_size(const struct iovec *iov, int iovcnt)
+{
+	size_t size = 0;
+	while (iovcnt--)
+		size += (iov++)->iov_len;
+
+	return size;
+}
+
+static inline void shift_iovec(const struct iovec **iov, int *iovcnt,
+				size_t nr, ssize_t *total, size_t *count, off_t *offset)
+{
+	while (nr >= (*iov)->iov_len) {
+		nr -= (*iov)->iov_len;
+		*total += (*iov)->iov_len;
+		*count -= (*iov)->iov_len;
+		if (offset)
+			*offset += (*iov)->iov_len;
+		(*iovcnt)--;
+		(*iov)++;
+	}
+}
+
+ssize_t readv_in_full(int fd, const struct iovec *iov, int iovcnt)
+{
+	ssize_t total = 0;
+	size_t count = get_iov_size(iov, iovcnt);
+
+	while (count > 0) {
+		ssize_t nr;
+
+		nr = xreadv(fd, iov, iovcnt);
+		if (nr <= 0) {
+			if (total > 0)
+				return total;
+
+			return -1;
+		}
+
+		shift_iovec(&iov, &iovcnt, nr, &total, &count, NULL);
+	}
+
+	return total;
+}
+
+ssize_t writev_in_full(int fd, const struct iovec *iov, int iovcnt)
+{
+	ssize_t total = 0;
+	size_t count = get_iov_size(iov, iovcnt);
+
+	while (count > 0) {
+		ssize_t nr;
+
+		nr = xwritev(fd, iov, iovcnt);
+		if (nr < 0)
+			return -1;
+		if (nr == 0) {
+			errno = ENOSPC;
+			return -1;
+		}
+
+		shift_iovec(&iov, &iovcnt, nr, &total, &count, NULL);
+	}
+
+	return total;
+}
+
+/* Same as preadv(2) except that this function never returns EAGAIN or EINTR. */
+ssize_t xpreadv(int fd, const struct iovec *iov, int iovcnt, off_t offset)
+{
+	ssize_t nr;
+
+restart:
+	nr = preadv(fd, iov, iovcnt, offset);
+	if ((nr < 0) && ((errno == EAGAIN) || (errno == EINTR)))
+		goto restart;
+
+	return nr;
+}
+
+/* Same as pwritev(2) except that this function never returns EAGAIN or EINTR. */
+ssize_t xpwritev(int fd, const struct iovec *iov, int iovcnt, off_t offset)
+{
+	ssize_t nr;
+
+restart:
+	nr = pwritev(fd, iov, iovcnt, offset);
+	if ((nr < 0) && ((errno == EAGAIN) || (errno == EINTR)))
+		goto restart;
+
+	return nr;
+}
+
+ssize_t preadv_in_full(int fd, const struct iovec *iov, int iovcnt, off_t offset)
+{
+	ssize_t total = 0;
+	size_t count = get_iov_size(iov, iovcnt);
+
+	while (count > 0) {
+		ssize_t nr;
+
+		nr = xpreadv(fd, iov, iovcnt, offset);
+		if (nr <= 0) {
+			if (total > 0)
+				return total;
+
+			return -1;
+		}
+
+		shift_iovec(&iov, &iovcnt, nr, &total, &count, &offset);
+	}
+
+	return total;
+}
+
+ssize_t pwritev_in_full(int fd, const struct iovec *iov, int iovcnt, off_t offset)
+{
+	ssize_t total = 0;
+	size_t count = get_iov_size(iov, iovcnt);
+
+	while (count > 0) {
+		ssize_t nr;
+
+		nr = xpwritev(fd, iov, iovcnt, offset);
+		if (nr < 0)
+			return -1;
+		if (nr == 0) {
+			errno = ENOSPC;
+			return -1;
+		}
+
+		shift_iovec(&iov, &iovcnt, nr, &total, &count, &offset);
+	}
+
+	return total;
+}
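+
+/*
+ * Usage sketch (illustrative; struct some_header is hypothetical).  The
+ * *_in_full helpers retry short transfers, so callers only need a single
+ * length check:
+ *
+ *	struct some_header hdr;
+ *
+ *	if (pread_in_full(fd, &hdr, sizeof(hdr), 0) != sizeof(hdr))
+ *		die_perror("pread_in_full");
+ */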
diff --git a/kvmtool/util/set_private_br.sh b/kvmtool/util/set_private_br.sh
new file mode 100755
index 0000000..49867dd
--- /dev/null
+++ b/kvmtool/util/set_private_br.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+#
+# Author: Amos Kong <kongjianjun@gmail.com>
+# Date: Apr 14, 2011
+# Description: this script is used to create/delete a private bridge,
+# launch a dhcp server on the bridge by dnsmasq.
+#
+# @ ./set_private_br.sh $bridge_name $subnet_prefix
+# @ ./set_private_br.sh vbr0 192.168.33
+
+brname='vbr0'
+subnet='192.168.33'
+
+add_br()
+{
+    echo "add new private bridge: $brname"
+    /usr/sbin/brctl addbr $brname
+    echo 1 > /proc/sys/net/ipv6/conf/$brname/disable_ipv6
+    echo 1 > /proc/sys/net/ipv4/ip_forward
+    /usr/sbin/brctl stp $brname on
+    /usr/sbin/brctl setfd $brname 0
+    ifconfig $brname $subnet.1
+    ifconfig $brname up
+    # Add forward rule, then guest can access public network
+    iptables -t nat -A POSTROUTING -s $subnet.254/24 ! -d $subnet.254/24 -j MASQUERADE
+    /etc/init.d/dnsmasq stop
+    /etc/init.d/tftpd-hpa stop 2>/dev/null
+    dnsmasq --strict-order --bind-interfaces --listen-address $subnet.1 --dhcp-range $subnet.1,$subnet.254 $tftp_cmd
+}
+
+del_br()
+{
+    echo "cleanup bridge setup"
+    kill -9 `pgrep dnsmasq|tail -1`
+    ifconfig $brname down
+    /usr/sbin/brctl delbr $brname
+    iptables -t nat -D POSTROUTING -s $subnet.254/24 ! -d $subnet.254/24 -j MASQUERADE
+}
+
+
+if [ $# = 0 ]; then
+    del_br 2>/dev/null
+    exit
+fi
+if [ $# -ge 1 ]; then
+    brname="$1"
+fi
+if [ $# = 2 ]; then
+    subnet="$2"
+fi
+add_br
diff --git a/kvmtool/util/strbuf.c b/kvmtool/util/strbuf.c
new file mode 100644
index 0000000..2c6e8ad
--- /dev/null
+++ b/kvmtool/util/strbuf.c
@@ -0,0 +1,64 @@
+
+/* user defined headers */
+#include <kvm/util.h>
+#include <kvm/strbuf.h>
+
+int prefixcmp(const char *str, const char *prefix)
+{
+	for (; ; str++, prefix++) {
+		if (!*prefix)
+			return 0;
+		else if (*str != *prefix)
+			return (unsigned char)*prefix - (unsigned char)*str;
+	}
+}
+
+#ifndef HAVE_STRLCPY
+/**
+ * strlcat - Append a length-limited, %NUL-terminated string to another
+ * @dest: The string to be appended to
+ * @src: The string to append to it
+ * @count: The size of the destination buffer.
+ */
+size_t strlcat(char *dest, const char *src, size_t count)
+{
+	size_t dsize = strlen(dest);
+	size_t len = strlen(src);
+	size_t res = dsize + len;
+
+	DIE_IF(dsize >= count);
+
+	dest += dsize;
+	count -= dsize;
+	if (len >= count)
+		len = count - 1;
+
+	memcpy(dest, src, len);
+	dest[len] = 0;
+
+	return res;
+}
+
+/**
+ * strlcpy - Copy a %NUL terminated string into a sized buffer
+ * @dest: Where to copy the string to
+ * @src: Where to copy the string from
+ * @size: size of destination buffer
+ *
+ * Compatible with *BSD: the result is always a valid
+ * NUL-terminated string that fits in the buffer (unless,
+ * of course, the buffer size is zero). It does not pad
+ * out the result like strncpy() does.
+ */
+size_t strlcpy(char *dest, const char *src, size_t size)
+{
+	size_t ret = strlen(src);
+
+	if (size) {
+		size_t len = (ret >= size) ? size - 1 : ret;
+		memcpy(dest, src, len);
+		dest[len] = '\0';
+	}
+	return ret;
+}
+#endif
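+
+/*
+ * Usage sketch (illustrative): like the BSD originals, strlcpy() returns
+ * strlen(src), so a return value >= the buffer size signals truncation:
+ *
+ *	char buf[16];
+ *
+ *	if (strlcpy(buf, path, sizeof(buf)) >= sizeof(buf))
+ *		pr_warning("path truncated: %s", path);
+ */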
diff --git a/kvmtool/util/threadpool.c b/kvmtool/util/threadpool.c
new file mode 100644
index 0000000..1dc3bf7
--- /dev/null
+++ b/kvmtool/util/threadpool.c
@@ -0,0 +1,198 @@
+#include "kvm/threadpool.h"
+#include "kvm/mutex.h"
+#include "kvm/kvm.h"
+
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <pthread.h>
+#include <stdbool.h>
+
+static DEFINE_MUTEX(job_mutex);
+static DEFINE_MUTEX(thread_mutex);
+static pthread_cond_t job_cond = PTHREAD_COND_INITIALIZER;
+
+static LIST_HEAD(head);
+
+static pthread_t	*threads;
+static long		threadcount;
+static bool		running;
+
+static struct thread_pool__job *thread_pool__job_pop_locked(void)
+{
+	struct thread_pool__job *job;
+
+	if (list_empty(&head))
+		return NULL;
+
+	job = list_first_entry(&head, struct thread_pool__job, queue);
+	list_del_init(&job->queue);
+
+	return job;
+}
+
+static void thread_pool__job_push_locked(struct thread_pool__job *job)
+{
+	list_add_tail(&job->queue, &head);
+}
+
+static struct thread_pool__job *thread_pool__job_pop(void)
+{
+	struct thread_pool__job *job;
+
+	mutex_lock(&job_mutex);
+	job = thread_pool__job_pop_locked();
+	mutex_unlock(&job_mutex);
+	return job;
+}
+
+static void thread_pool__job_push(struct thread_pool__job *job)
+{
+	mutex_lock(&job_mutex);
+	thread_pool__job_push_locked(job);
+	mutex_unlock(&job_mutex);
+}
+
+static void thread_pool__handle_job(struct thread_pool__job *job)
+{
+	while (job) {
+		job->callback(job->kvm, job->data);
+
+		mutex_lock(&job->mutex);
+
+		if (--job->signalcount > 0)
+			/* If the job was signaled again while we were working */
+			thread_pool__job_push(job);
+
+		mutex_unlock(&job->mutex);
+
+		job = thread_pool__job_pop();
+	}
+}
+
+static void thread_pool__threadfunc_cleanup(void *param)
+{
+	mutex_unlock(&job_mutex);
+}
+
+static void *thread_pool__threadfunc(void *param)
+{
+	pthread_cleanup_push(thread_pool__threadfunc_cleanup, NULL);
+
+	kvm__set_thread_name("threadpool-worker");
+
+	while (running) {
+		struct thread_pool__job *curjob = NULL;
+
+		mutex_lock(&job_mutex);
+		while (running && (curjob = thread_pool__job_pop_locked()) == NULL)
+			pthread_cond_wait(&job_cond, &job_mutex.mutex);
+		mutex_unlock(&job_mutex);
+
+		if (running)
+			thread_pool__handle_job(curjob);
+	}
+
+	pthread_cleanup_pop(0);
+
+	return NULL;
+}
+
+static int thread_pool__addthread(void)
+{
+	int res;
+	void *newthreads;
+
+	mutex_lock(&thread_mutex);
+	newthreads = realloc(threads, (threadcount + 1) * sizeof(pthread_t));
+	if (newthreads == NULL) {
+		mutex_unlock(&thread_mutex);
+		return -1;
+	}
+
+	threads = newthreads;
+
+	res = pthread_create(threads + threadcount, NULL,
+			     thread_pool__threadfunc, NULL);
+
+	if (res == 0)
+		threadcount++;
+	mutex_unlock(&thread_mutex);
+
+	return res;
+}
+
+int thread_pool__init(struct kvm *kvm)
+{
+	unsigned long i;
+	unsigned int thread_count = sysconf(_SC_NPROCESSORS_ONLN);
+
+	running = true;
+
+	for (i = 0; i < thread_count; i++)
+		if (thread_pool__addthread() < 0)
+			return i;
+
+	return i;
+}
+late_init(thread_pool__init);
+
+int thread_pool__exit(struct kvm *kvm)
+{
+	int i;
+
+	running = false;
+
+	for (i = 0; i < threadcount; i++) {
+		mutex_lock(&job_mutex);
+		pthread_cond_signal(&job_cond);
+		mutex_unlock(&job_mutex);
+	}
+
+	for (i = 0; i < threadcount; i++) {
+		pthread_join(threads[i], NULL);
+	}
+
+	return 0;
+}
+late_exit(thread_pool__exit);
+
+void thread_pool__do_job(struct thread_pool__job *job)
+{
+	struct thread_pool__job *jobinfo = job;
+
+	if (jobinfo == NULL || jobinfo->callback == NULL)
+		return;
+
+	mutex_lock(&jobinfo->mutex);
+	if (jobinfo->signalcount++ == 0)
+		thread_pool__job_push(job);
+	mutex_unlock(&jobinfo->mutex);
+
+	mutex_lock(&job_mutex);
+	pthread_cond_signal(&job_cond);
+	mutex_unlock(&job_mutex);
+}
+
+void thread_pool__cancel_job(struct thread_pool__job *job)
+{
+	bool running;
+
+	/*
+	 * If the job is queued but not running, remove it. Otherwise, wait for
+	 * the signalcount to drop to 0, indicating that it has finished
+	 * running. We assume that nobody is queueing this job -
+	 * thread_pool__do_job() isn't called - while this function is running.
+	 */
+	do {
+		mutex_lock(&job_mutex);
+		if (list_empty(&job->queue)) {
+			running = job->signalcount > 0;
+		} else {
+			list_del_init(&job->queue);
+			job->signalcount = 0;
+			running = false;
+		}
+		mutex_unlock(&job_mutex);
+	} while (running);
+}
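+
+/*
+ * Usage sketch (illustrative; assumes the thread_pool__init_job() helper
+ * from <kvm/threadpool.h>, and 'dev' is a hypothetical device).  A job is
+ * initialized once and then signalled whenever there is work; signalcount
+ * coalesces signals that arrive while the callback is still running:
+ *
+ *	static void example_callback(struct kvm *kvm, void *data)
+ *	{
+ *		...	runs on a threadpool-worker thread
+ *	}
+ *
+ *	thread_pool__init_job(&dev->job, kvm, example_callback, dev);
+ *	thread_pool__do_job(&dev->job);
+ */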
diff --git a/kvmtool/util/update_headers.sh b/kvmtool/util/update_headers.sh
new file mode 100755
index 0000000..bf87ef6
--- /dev/null
+++ b/kvmtool/util/update_headers.sh
@@ -0,0 +1,49 @@
+#!/bin/sh
+
+########################################################################
+# Updates the kvmtool tree with up-to-date public header files from
+# a Linux source tree.
+# If no directory is given on the command line, it will try to find one
+# using the lib/modules/`uname -r`/source link.
+########################################################################
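+#
+# Example:
+#	./util/update_headers.sh path/to/linux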
+
+set -ue
+
+if [ "$#" -ge 1 ]
+then
+	LINUX_ROOT="$1"
+else
+	LINUX_ROOT="/lib/modules/$(uname -r)/source"
+fi
+
+if [ ! -d "$LINUX_ROOT/include/uapi/linux" ]
+then
+	echo "$LINUX_ROOT does not seem to be a valid Linux source tree."
+	echo "usage: $0 [path-to-Linux-source-tree]"
+	exit 1
+fi
+
+cp -- "$LINUX_ROOT/include/uapi/linux/kvm.h" include/linux
+
+unset KVMTOOL_PATH
+
+copy_optional_arch () {
+	local src="$LINUX_ROOT/arch/$arch/include/uapi/$1"
+
+	if [ -r "$src" ]
+	then
+		cp -- "$src" "$KVMTOOL_PATH/include/asm/"
+	fi
+}
+
+for arch in arm arm64 mips powerpc x86
+do
+	case "$arch" in
+		arm) KVMTOOL_PATH=arm/aarch32 ;;
+		arm64)	KVMTOOL_PATH=arm/aarch64
+			copy_optional_arch asm/sve_context.h ;;
+		*) KVMTOOL_PATH=$arch ;;
+	esac
+	cp -- "$LINUX_ROOT/arch/$arch/include/uapi/asm/kvm.h" \
+		"$KVMTOOL_PATH/include/asm"
+done
diff --git a/kvmtool/util/util.c b/kvmtool/util/util.c
new file mode 100644
index 0000000..1877105
--- /dev/null
+++ b/kvmtool/util/util.c
@@ -0,0 +1,133 @@
+/*
+ * Taken from perf, which in turn took it from GIT.
+ */
+
+#include "kvm/util.h"
+
+#include <kvm/kvm.h>
+#include <linux/magic.h>	/* For HUGETLBFS_MAGIC */
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/statfs.h>
+
+static void report(const char *prefix, const char *err, va_list params)
+{
+	char msg[1024];
+	vsnprintf(msg, sizeof(msg), err, params);
+	fprintf(stderr, " %s%s\n", prefix, msg);
+}
+
+static NORETURN void die_builtin(const char *err, va_list params)
+{
+	report(" Fatal: ", err, params);
+	exit(128);
+}
+
+static void error_builtin(const char *err, va_list params)
+{
+	report(" Error: ", err, params);
+}
+
+static void warn_builtin(const char *warn, va_list params)
+{
+	report(" Warning: ", warn, params);
+}
+
+static void info_builtin(const char *info, va_list params)
+{
+	report(" Info: ", info, params);
+}
+
+void die(const char *err, ...)
+{
+	va_list params;
+
+	va_start(params, err);
+	die_builtin(err, params);
+	va_end(params);
+}
+
+int pr_err(const char *err, ...)
+{
+	va_list params;
+
+	va_start(params, err);
+	error_builtin(err, params);
+	va_end(params);
+	return -1;
+}
+
+void pr_warning(const char *warn, ...)
+{
+	va_list params;
+
+	va_start(params, warn);
+	warn_builtin(warn, params);
+	va_end(params);
+}
+
+void pr_info(const char *info, ...)
+{
+	va_list params;
+
+	va_start(params, info);
+	info_builtin(info, params);
+	va_end(params);
+}
+
+void die_perror(const char *s)
+{
+	perror(s);
+	exit(1);
+}
+
+void *mmap_hugetlbfs(struct kvm *kvm, const char *htlbfs_path, u64 size)
+{
+	char mpath[PATH_MAX];
+	int fd;
+	struct statfs sfs;
+	void *addr;
+	unsigned long blk_size;
+
+	if (statfs(htlbfs_path, &sfs) < 0)
+		die("Can't stat %s\n", htlbfs_path);
+
+	if ((unsigned int)sfs.f_type != HUGETLBFS_MAGIC)
+		die("%s is not hugetlbfs!\n", htlbfs_path);
+
+	blk_size = (unsigned long)sfs.f_bsize;
+	if (sfs.f_bsize == 0 || blk_size > size) {
+		die("Can't use hugetlbfs pagesize %lu for mem size %lld\n",
+			blk_size, (unsigned long long)size);
+	}
+
+	kvm->ram_pagesize = blk_size;
+
+	snprintf(mpath, PATH_MAX, "%s/kvmtoolXXXXXX", htlbfs_path);
+	fd = mkstemp(mpath);
+	if (fd < 0)
+		die("Can't open %s for hugetlbfs map\n", mpath);
+	unlink(mpath);
+	if (ftruncate(fd, size) < 0)
+		die("Can't ftruncate for mem mapping size %lld\n",
+			(unsigned long long)size);
+	addr = mmap(NULL, size, PROT_RW, MAP_PRIVATE, fd, 0);
+	close(fd);
+
+	return addr;
+}
+
+/*
+ * Wrap the decision between a hugetlbfs-backed mapping (if requested) and a
+ * normal anonymous mmap.
+ */
+void *mmap_anon_or_hugetlbfs(struct kvm *kvm, const char *hugetlbfs_path, u64 size)
+{
+	if (hugetlbfs_path)
+		/*
+		 * We don't /need/ to map guest RAM from hugetlbfs, but we do so
+		 * if the user specifies a hugetlbfs path.
+		 */
+		return mmap_hugetlbfs(kvm, hugetlbfs_path, size);
+	else {
+		kvm->ram_pagesize = getpagesize();
+		return mmap(NULL, size, PROT_RW, MAP_ANON_NORESERVE, -1, 0);
+	}
+}
diff --git a/kvmtool/vfio/core.c b/kvmtool/vfio/core.c
new file mode 100644
index 0000000..0b45e78
--- /dev/null
+++ b/kvmtool/vfio/core.c
@@ -0,0 +1,690 @@
+#include "kvm/kvm.h"
+#include "kvm/vfio.h"
+#include "kvm/ioport.h"
+
+#include <linux/list.h>
+
+#define VFIO_DEV_DIR		"/dev/vfio"
+#define VFIO_DEV_NODE		VFIO_DEV_DIR "/vfio"
+#define IOMMU_GROUP_DIR		"/sys/kernel/iommu_groups"
+
+static int vfio_container;
+static LIST_HEAD(vfio_groups);
+static struct vfio_device *vfio_devices;
+
+static int vfio_device_pci_parser(const struct option *opt, char *arg,
+				  struct vfio_device_params *dev)
+{
+	unsigned int domain, bus, devnr, fn;
+
+	int nr = sscanf(arg, "%4x:%2x:%2x.%1x", &domain, &bus, &devnr, &fn);
+	if (nr < 4) {
+		domain = 0;
+		nr = sscanf(arg, "%2x:%2x.%1x", &bus, &devnr, &fn);
+		if (nr < 3) {
+			pr_err("Invalid device identifier %s", arg);
+			return -EINVAL;
+		}
+	}
+
+	dev->type = VFIO_DEVICE_PCI;
+	dev->bus = "pci";
+	dev->name = malloc(13);
+	if (!dev->name)
+		return -ENOMEM;
+
+	snprintf(dev->name, 13, "%04x:%02x:%02x.%x", domain, bus, devnr, fn);
+
+	return 0;
+}
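+
+/*
+ * Usage sketch (illustrative): the parser accepts a full or abbreviated
+ * PCI address, matching the two sscanf() formats above:
+ *
+ *	lkvm run ... --vfio-pci 0000:01:00.0
+ *	lkvm run ... --vfio-pci 01:00.0		(domain defaults to 0)
+ */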
+
+int vfio_device_parser(const struct option *opt, const char *arg, int unset)
+{
+	int ret = -EINVAL;
+	static int idx = 0;
+	struct kvm *kvm = opt->ptr;
+	struct vfio_device_params *dev, *devs;
+	char *cur, *buf = strdup(arg);
+
+	if (!buf)
+		return -ENOMEM;
+
+	if (idx >= MAX_VFIO_DEVICES) {
+		pr_warning("Too many VFIO devices");
+		goto out_free_buf;
+	}
+
+	devs = realloc(kvm->cfg.vfio_devices, sizeof(*dev) * (idx + 1));
+	if (!devs) {
+		ret = -ENOMEM;
+		goto out_free_buf;
+	}
+
+	kvm->cfg.vfio_devices = devs;
+	dev = &devs[idx];
+
+	cur = strtok(buf, ",");
+	if (!cur)
+		goto out_free_buf;
+
+	if (!strcmp(opt->long_name, "vfio-pci"))
+		ret = vfio_device_pci_parser(opt, cur, dev);
+	else
+		ret = -EINVAL;
+
+	if (!ret)
+		kvm->cfg.num_vfio_devices = ++idx;
+
+out_free_buf:
+	free(buf);
+
+	return ret;
+}
+
+static bool vfio_ioport_in(struct ioport *ioport, struct kvm_cpu *vcpu,
+			   u16 port, void *data, int len)
+{
+	u32 val;
+	ssize_t nr;
+	struct vfio_region *region = ioport->priv;
+	struct vfio_device *vdev = region->vdev;
+
+	u32 offset = port - region->port_base;
+
+	if (!(region->info.flags & VFIO_REGION_INFO_FLAG_READ))
+		return false;
+
+	nr = pread(vdev->fd, &val, len, region->info.offset + offset);
+	if (nr != len) {
+		vfio_dev_err(vdev, "could not read %d bytes from I/O port 0x%x",
+			     len, port);
+		return false;
+	}
+
+	switch (len) {
+	case 1:
+		ioport__write8(data, val);
+		break;
+	case 2:
+		ioport__write16(data, val);
+		break;
+	case 4:
+		ioport__write32(data, val);
+		break;
+	default:
+		return false;
+	}
+
+	return true;
+}
+
+static bool vfio_ioport_out(struct ioport *ioport, struct kvm_cpu *vcpu,
+			    u16 port, void *data, int len)
+{
+	u32 val;
+	ssize_t nr;
+	struct vfio_region *region = ioport->priv;
+	struct vfio_device *vdev = region->vdev;
+
+	u32 offset = port - region->port_base;
+
+	if (!(region->info.flags & VFIO_REGION_INFO_FLAG_WRITE))
+		return false;
+
+	switch (len) {
+	case 1:
+		val = ioport__read8(data);
+		break;
+	case 2:
+		val = ioport__read16(data);
+		break;
+	case 4:
+		val = ioport__read32(data);
+		break;
+	default:
+		return false;
+	}
+
+	nr = pwrite(vdev->fd, &val, len, region->info.offset + offset);
+	if (nr != len)
+		vfio_dev_err(vdev, "could not write %d bytes to I/O port 0x%x",
+			     len, port);
+
+	return nr == len;
+}
+
+static struct ioport_operations vfio_ioport_ops = {
+	.io_in	= vfio_ioport_in,
+	.io_out	= vfio_ioport_out,
+};
+
+static void vfio_mmio_access(struct kvm_cpu *vcpu, u64 addr, u8 *data, u32 len,
+			     u8 is_write, void *ptr)
+{
+	u64 val;
+	ssize_t nr;
+	struct vfio_region *region = ptr;
+	struct vfio_device *vdev = region->vdev;
+
+	u32 offset = addr - region->guest_phys_addr;
+
+	if (len < 1 || len > 8)
+		goto err_report;
+
+	if (is_write) {
+		if (!(region->info.flags & VFIO_REGION_INFO_FLAG_WRITE))
+			goto err_report;
+
+		memcpy(&val, data, len);
+
+		nr = pwrite(vdev->fd, &val, len, region->info.offset + offset);
+		if ((u32)nr != len)
+			goto err_report;
+	} else {
+		if (!(region->info.flags & VFIO_REGION_INFO_FLAG_READ))
+			goto err_report;
+
+		nr = pread(vdev->fd, &val, len, region->info.offset + offset);
+		if ((u32)nr != len)
+			goto err_report;
+
+		memcpy(data, &val, len);
+	}
+
+	return;
+
+err_report:
+	vfio_dev_err(vdev, "could not %s %u bytes at 0x%x (0x%llx)", is_write ?
+		     "write" : "read", len, offset, addr);
+}
+
+static int vfio_setup_trap_region(struct kvm *kvm, struct vfio_device *vdev,
+				  struct vfio_region *region)
+{
+	if (region->is_ioport) {
+		int port = ioport__register(kvm, region->port_base,
+					   &vfio_ioport_ops, region->info.size,
+					   region);
+		if (port < 0)
+			return port;
+		return 0;
+	}
+
+	return kvm__register_mmio(kvm, region->guest_phys_addr,
+				  region->info.size, false, vfio_mmio_access,
+				  region);
+}
+
+int vfio_map_region(struct kvm *kvm, struct vfio_device *vdev,
+		    struct vfio_region *region)
+{
+	void *base;
+	int ret, prot = 0;
+	/* KVM needs page-aligned regions */
+	u64 map_size = ALIGN(region->info.size, PAGE_SIZE);
+
+	if (!(region->info.flags & VFIO_REGION_INFO_FLAG_MMAP))
+		return vfio_setup_trap_region(kvm, vdev, region);
+
+	/*
+	 * If the guest physical address isn't page aligned,
+	 * KVM_SET_USER_MEMORY_REGION will fail, so emulate the region
+	 * ourselves instead.
+	 */
+	if (region->guest_phys_addr & (PAGE_SIZE - 1))
+		return kvm__register_mmio(kvm, region->guest_phys_addr,
+					  region->info.size, false,
+					  vfio_mmio_access, region);
+
+	if (region->info.flags & VFIO_REGION_INFO_FLAG_READ)
+		prot |= PROT_READ;
+	if (region->info.flags & VFIO_REGION_INFO_FLAG_WRITE)
+		prot |= PROT_WRITE;
+
+	base = mmap(NULL, region->info.size, prot, MAP_SHARED, vdev->fd,
+		    region->info.offset);
+	if (base == MAP_FAILED) {
+		/* TODO: support sparse mmap */
+		vfio_dev_warn(vdev, "failed to mmap region %u (0x%llx bytes), falling back to trapping",
+			 region->info.index, region->info.size);
+		return vfio_setup_trap_region(kvm, vdev, region);
+	}
+	region->host_addr = base;
+
+	ret = kvm__register_dev_mem(kvm, region->guest_phys_addr, map_size,
+				    region->host_addr);
+	if (ret) {
+		vfio_dev_err(vdev, "failed to register region with KVM");
+		return ret;
+	}
+
+	return 0;
+}
+
+void vfio_unmap_region(struct kvm *kvm, struct vfio_region *region)
+{
+	u64 map_size;
+
+	if (region->host_addr) {
+		map_size = ALIGN(region->info.size, PAGE_SIZE);
+		kvm__destroy_mem(kvm, region->guest_phys_addr, map_size,
+				 region->host_addr);
+		munmap(region->host_addr, region->info.size);
+		region->host_addr = NULL;
+	} else if (region->is_ioport) {
+		ioport__unregister(kvm, region->port_base);
+	} else {
+		kvm__deregister_mmio(kvm, region->guest_phys_addr);
+	}
+}
+
+static int vfio_configure_device(struct kvm *kvm, struct vfio_device *vdev)
+{
+	int ret;
+	struct vfio_group *group = vdev->group;
+
+	vdev->fd = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD,
+			 vdev->params->name);
+	if (vdev->fd < 0) {
+		vfio_dev_warn(vdev, "failed to get fd");
+
+		/* The device might be a bridge without an fd */
+		return 0;
+	}
+
+	vdev->info.argsz = sizeof(vdev->info);
+	if (ioctl(vdev->fd, VFIO_DEVICE_GET_INFO, &vdev->info)) {
+		ret = -errno;
+		vfio_dev_err(vdev, "failed to get info");
+		goto err_close_device;
+	}
+
+	if (vdev->info.flags & VFIO_DEVICE_FLAGS_RESET &&
+	    ioctl(vdev->fd, VFIO_DEVICE_RESET) < 0)
+		vfio_dev_warn(vdev, "failed to reset device");
+
+	vdev->regions = calloc(vdev->info.num_regions, sizeof(*vdev->regions));
+	if (!vdev->regions) {
+		ret = -ENOMEM;
+		goto err_close_device;
+	}
+
+	/* Now for the bus-specific initialization... */
+	switch (vdev->params->type) {
+	case VFIO_DEVICE_PCI:
+		BUG_ON(!(vdev->info.flags & VFIO_DEVICE_FLAGS_PCI));
+		ret = vfio_pci_setup_device(kvm, vdev);
+		break;
+	default:
+		BUG_ON(1);
+		ret = -EINVAL;
+	}
+
+	if (ret)
+		goto err_free_regions;
+
+	vfio_dev_info(vdev, "assigned to device number 0x%x in group %lu",
+		      vdev->dev_hdr.dev_num, group->id);
+
+	return 0;
+
+err_free_regions:
+	free(vdev->regions);
+err_close_device:
+	close(vdev->fd);
+
+	return ret;
+}
+
+static int vfio_configure_devices(struct kvm *kvm)
+{
+	int i, ret;
+
+	for (i = 0; i < kvm->cfg.num_vfio_devices; ++i) {
+		ret = vfio_configure_device(kvm, &vfio_devices[i]);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+static int vfio_get_iommu_type(void)
+{
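+	/* Prefer the newer Type1 v2 IOMMU interface when available */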
+	if (ioctl(vfio_container, VFIO_CHECK_EXTENSION, VFIO_TYPE1v2_IOMMU))
+		return VFIO_TYPE1v2_IOMMU;
+
+	if (ioctl(vfio_container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU))
+		return VFIO_TYPE1_IOMMU;
+
+	return -ENODEV;
+}
+
+static int vfio_map_mem_bank(struct kvm *kvm, struct kvm_mem_bank *bank, void *data)
+{
+	int ret = 0;
+	struct vfio_iommu_type1_dma_map dma_map = {
+		.argsz	= sizeof(dma_map),
+		.flags	= VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
+		.vaddr	= (unsigned long)bank->host_addr,
+		.iova	= (u64)bank->guest_phys_addr,
+		.size	= bank->size,
+	};
+
+	/* Map the guest memory for DMA (i.e. provide isolation) */
+	if (ioctl(vfio_container, VFIO_IOMMU_MAP_DMA, &dma_map)) {
+		ret = -errno;
+		pr_err("Failed to map 0x%llx -> 0x%llx (%llu) for DMA",
+		       dma_map.iova, dma_map.vaddr, dma_map.size);
+	}
+
+	return ret;
+}
+
+static int vfio_unmap_mem_bank(struct kvm *kvm, struct kvm_mem_bank *bank, void *data)
+{
+	struct vfio_iommu_type1_dma_unmap dma_unmap = {
+		.argsz = sizeof(dma_unmap),
+		.size = bank->size,
+		.iova = bank->guest_phys_addr,
+	};
+
+	ioctl(vfio_container, VFIO_IOMMU_UNMAP_DMA, &dma_unmap);
+
+	return 0;
+}
+
+static int vfio_configure_reserved_regions(struct kvm *kvm,
+					   struct vfio_group *group)
+{
+	FILE *file;
+	int ret = 0;
+	char type[9];
+	char filename[PATH_MAX];
+	unsigned long long start, end;
+
+	snprintf(filename, PATH_MAX, IOMMU_GROUP_DIR "/%lu/reserved_regions",
+		 group->id);
+
+	/* reserved_regions might not be present on older systems */
+	if (access(filename, F_OK))
+		return 0;
+
+	file = fopen(filename, "r");
+	if (!file)
+		return -errno;
+
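+	/*
+	 * Each line describes one reserved range. Reserve these ranges so
+	 * that guest memory is never allocated inside them.
+	 */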
+	while (fscanf(file, "0x%llx 0x%llx %8s\n", &start, &end, type) == 3) {
+		ret = kvm__reserve_mem(kvm, start, end - start + 1);
+		if (ret)
+			break;
+	}
+
+	fclose(file);
+
+	return ret;
+}
+
+static int vfio_configure_groups(struct kvm *kvm)
+{
+	int ret;
+	struct vfio_group *group;
+
+	list_for_each_entry(group, &vfio_groups, list) {
+		ret = vfio_configure_reserved_regions(kvm, group);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+static struct vfio_group *vfio_group_create(struct kvm *kvm, unsigned long id)
+{
+	int ret;
+	struct vfio_group *group;
+	char group_node[PATH_MAX];
+	struct vfio_group_status group_status = {
+		.argsz = sizeof(group_status),
+	};
+
+	group = calloc(1, sizeof(*group));
+	if (!group)
+		return NULL;
+
+	group->id	= id;
+	group->refs	= 1;
+
+	ret = snprintf(group_node, PATH_MAX, VFIO_DEV_DIR "/%lu", id);
+	if (ret < 0 || ret >= PATH_MAX)
+		goto err_free_group;
+
+	group->fd = open(group_node, O_RDWR);
+	if (group->fd < 0) {
+		pr_err("Failed to open IOMMU group %s", group_node);
+		goto err_free_group;
+	}
+
+	if (ioctl(group->fd, VFIO_GROUP_GET_STATUS, &group_status)) {
+		pr_err("Failed to determine status of IOMMU group %lu", id);
+		goto err_close_group;
+	}
+
+	if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
+		pr_err("IOMMU group %lu is not viable", id);
+		goto err_close_group;
+	}
+
+	if (ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &vfio_container)) {
+		pr_err("Failed to add IOMMU group %lu to VFIO container", id);
+		goto err_close_group;
+	}
+
+	list_add(&group->list, &vfio_groups);
+
+	return group;
+
+err_close_group:
+	close(group->fd);
+err_free_group:
+	free(group);
+
+	return NULL;
+}
+
+static void vfio_group_exit(struct kvm *kvm, struct vfio_group *group)
+{
+	if (--group->refs != 0)
+		return;
+
+	ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER);
+
+	list_del(&group->list);
+	close(group->fd);
+	free(group);
+}
+
+static struct vfio_group *
+vfio_group_get_for_dev(struct kvm *kvm, struct vfio_device *vdev)
+{
+	int dirfd;
+	ssize_t ret;
+	char *group_name;
+	unsigned long group_id;
+	char group_path[PATH_MAX];
+	struct vfio_group *group = NULL;
+
+	/* Find IOMMU group for this device */
+	dirfd = open(vdev->sysfs_path, O_DIRECTORY | O_PATH | O_RDONLY);
+	if (dirfd < 0) {
+		vfio_dev_err(vdev, "failed to open '%s'", vdev->sysfs_path);
+		return NULL;
+	}
+
+	ret = readlinkat(dirfd, "iommu_group", group_path, PATH_MAX);
+	if (ret < 0) {
+		vfio_dev_err(vdev, "no iommu_group");
+		goto out_close;
+	}
+	if (ret == PATH_MAX)
+		goto out_close;
+
+	group_path[ret] = '\0';
+
+	group_name = basename(group_path);
+	errno = 0;
+	group_id = strtoul(group_name, NULL, 10);
+	if (errno)
+		goto out_close;
+
+	list_for_each_entry(group, &vfio_groups, list) {
+		if (group->id == group_id) {
+			group->refs++;
+			return group;
+		}
+	}
+
+	group = vfio_group_create(kvm, group_id);
+
+out_close:
+	close(dirfd);
+	return group;
+}
+
+static int vfio_device_init(struct kvm *kvm, struct vfio_device *vdev)
+{
+	int ret;
+	char dev_path[PATH_MAX];
+	struct vfio_group *group;
+
+	ret = snprintf(dev_path, PATH_MAX, "/sys/bus/%s/devices/%s",
+		       vdev->params->bus, vdev->params->name);
+	if (ret < 0 || ret >= PATH_MAX)
+		return -EINVAL;
+
+	vdev->sysfs_path = strndup(dev_path, PATH_MAX);
+	if (!vdev->sysfs_path)
+		return -errno;
+
+	group = vfio_group_get_for_dev(kvm, vdev);
+	if (!group) {
+		free(vdev->sysfs_path);
+		return -EINVAL;
+	}
+
+	vdev->group = group;
+
+	return 0;
+}
+
+static void vfio_device_exit(struct kvm *kvm, struct vfio_device *vdev)
+{
+	vfio_group_exit(kvm, vdev->group);
+
+	switch (vdev->params->type) {
+	case VFIO_DEVICE_PCI:
+		vfio_pci_teardown_device(kvm, vdev);
+		break;
+	default:
+		vfio_dev_warn(vdev, "no teardown function for device");
+	}
+
+	close(vdev->fd);
+
+	free(vdev->regions);
+	free(vdev->sysfs_path);
+}
+
+static int vfio_container_init(struct kvm *kvm)
+{
+	int api, i, ret, iommu_type;
+
+	/* Create a container for our IOMMU groups */
+	vfio_container = open(VFIO_DEV_NODE, O_RDWR);
+	if (vfio_container == -1) {
+		ret = -errno;
+		pr_err("Failed to open %s", VFIO_DEV_NODE);
+		return ret;
+	}
+
+	api = ioctl(vfio_container, VFIO_GET_API_VERSION);
+	if (api != VFIO_API_VERSION) {
+		pr_err("Unknown VFIO API version %d", api);
+		return -ENODEV;
+	}
+
+	iommu_type = vfio_get_iommu_type();
+	if (iommu_type < 0) {
+		pr_err("VFIO type-1 IOMMU not supported on this platform");
+		return iommu_type;
+	}
+
+	/* Create groups for our devices and add them to the container */
+	for (i = 0; i < kvm->cfg.num_vfio_devices; ++i) {
+		vfio_devices[i].params = &kvm->cfg.vfio_devices[i];
+
+		ret = vfio_device_init(kvm, &vfio_devices[i]);
+		if (ret)
+			return ret;
+	}
+
+	/* Finalise the container */
+	if (ioctl(vfio_container, VFIO_SET_IOMMU, iommu_type)) {
+		ret = -errno;
+		pr_err("Failed to set IOMMU type %d for VFIO container",
+		       iommu_type);
+		return ret;
+	} else {
+		pr_info("Using IOMMU type %d for VFIO container", iommu_type);
+	}
+
+	return kvm__for_each_mem_bank(kvm, KVM_MEM_TYPE_RAM, vfio_map_mem_bank,
+				      NULL);
+}
+
+static int vfio__init(struct kvm *kvm)
+{
+	int ret;
+
+	if (!kvm->cfg.num_vfio_devices)
+		return 0;
+
+	vfio_devices = calloc(kvm->cfg.num_vfio_devices, sizeof(*vfio_devices));
+	if (!vfio_devices)
+		return -ENOMEM;
+
+	ret = vfio_container_init(kvm);
+	if (ret)
+		return ret;
+
+	ret = vfio_configure_groups(kvm);
+	if (ret)
+		return ret;
+
+	ret = vfio_configure_devices(kvm);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+dev_base_init(vfio__init);
+
+static int vfio__exit(struct kvm *kvm)
+{
+	int i;
+
+	if (!kvm->cfg.num_vfio_devices)
+		return 0;
+
+	for (i = 0; i < kvm->cfg.num_vfio_devices; i++)
+		vfio_device_exit(kvm, &vfio_devices[i]);
+
+	free(vfio_devices);
+
+	kvm__for_each_mem_bank(kvm, KVM_MEM_TYPE_RAM, vfio_unmap_mem_bank, NULL);
+	close(vfio_container);
+
+	free(kvm->cfg.vfio_devices);
+
+	return 0;
+}
+dev_base_exit(vfio__exit);
diff --git a/kvmtool/vfio/pci.c b/kvmtool/vfio/pci.c
new file mode 100644
index 0000000..49ecd12
--- /dev/null
+++ b/kvmtool/vfio/pci.c
@@ -0,0 +1,1373 @@
+#include "kvm/irq.h"
+#include "kvm/kvm.h"
+#include "kvm/kvm-cpu.h"
+#include "kvm/vfio.h"
+
+#include <assert.h>
+
+#include <sys/ioctl.h>
+#include <sys/eventfd.h>
+#include <sys/resource.h>
+#include <sys/time.h>
+
+/* Wrapper around UAPI vfio_irq_set */
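+/* Sized so the variable-length payload can carry a single eventfd */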
+union vfio_irq_eventfd {
+	struct vfio_irq_set	irq;
+	u8 buffer[sizeof(struct vfio_irq_set) + sizeof(int)];
+};
+
+static void set_vfio_irq_eventd_payload(union vfio_irq_eventfd *evfd, int fd)
+{
+	memcpy(&evfd->irq.data, &fd, sizeof(fd));
+}
+
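+/*
+ * Bookkeeping for the MSI/MSI-X state: virt_state tracks what the guest has
+ * written to the capability, phys_state tracks what has actually been
+ * programmed through VFIO. The two are reconciled in vfio_pci_enable_msis()
+ * and vfio_pci_update_msi_entry().
+ */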
+#define msi_is_enabled(state)		((state) & VFIO_PCI_MSI_STATE_ENABLED)
+#define msi_is_masked(state)		((state) & VFIO_PCI_MSI_STATE_MASKED)
+#define msi_is_empty(state)		((state) & VFIO_PCI_MSI_STATE_EMPTY)
+
+#define msi_update_state(state, val, bit)				\
+	(state) = (val) ? (state) | bit : (state) & ~bit;
+#define msi_set_enabled(state, val)					\
+	msi_update_state(state, val, VFIO_PCI_MSI_STATE_ENABLED)
+#define msi_set_masked(state, val)					\
+	msi_update_state(state, val, VFIO_PCI_MSI_STATE_MASKED)
+#define msi_set_empty(state, val)					\
+	msi_update_state(state, val, VFIO_PCI_MSI_STATE_EMPTY)
+
+static void vfio_pci_disable_intx(struct kvm *kvm, struct vfio_device *vdev);
+static int vfio_pci_enable_intx(struct kvm *kvm, struct vfio_device *vdev);
+
+static int vfio_pci_enable_msis(struct kvm *kvm, struct vfio_device *vdev,
+				bool msix)
+{
+	size_t i;
+	int ret = 0;
+	int *eventfds;
+	struct vfio_pci_device *pdev = &vdev->pci;
+	struct vfio_pci_msi_common *msis = msix ? &pdev->msix : &pdev->msi;
+	union vfio_irq_eventfd single = {
+		.irq = {
+			.argsz	= sizeof(single),
+			.flags	= VFIO_IRQ_SET_DATA_EVENTFD |
+				  VFIO_IRQ_SET_ACTION_TRIGGER,
+			.index	= msis->info.index,
+			.count	= 1,
+		},
+	};
+
+	if (!msi_is_enabled(msis->virt_state))
+		return 0;
+
+	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX)
+		/*
+		 * PCI (and VFIO) forbids enabling INTx, MSI or MSIX at the same
+		 * time. Since INTx has to be enabled from the start (we don't
+		 * have a reliable way to know when the guest starts using it),
+		 * disable it now.
+		 */
+		vfio_pci_disable_intx(kvm, vdev);
+
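+	/* The eventfd array lives right after the vfio_irq_set header */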
+	eventfds = (void *)msis->irq_set + sizeof(struct vfio_irq_set);
+
+	/*
+	 * Initial registration of the full range. This enables the physical
+	 * MSI/MSI-X capability, which might have desired side effects. For
+	 * instance when assigning virtio legacy devices, enabling the MSI
+	 * capability modifies the config space layout!
+	 *
+	 * As an optimization, only update MSIs when the guest unmasks the
+	 * capability. This greatly reduces the initialization time for a
+	 * Linux guest with 2048+ MSIs. A Linux guest starts by enabling the
+	 * MSI-X cap masked, then fills in individual vectors, then unmasks
+	 * the whole function. So we only do one VFIO ioctl when enabling for
+	 * the first time, and one more when unmasking.
+	 *
+	 * phys_state is empty when it is enabled but no vector has been
+	 * registered via SET_IRQS yet.
+	 */
+	if (!msi_is_enabled(msis->phys_state) ||
+	    (!msi_is_masked(msis->virt_state) &&
+	     msi_is_empty(msis->phys_state))) {
+		bool empty = true;
+
+		for (i = 0; i < msis->nr_entries; i++) {
+			eventfds[i] = msis->entries[i].gsi >= 0 ?
+				      msis->entries[i].eventfd : -1;
+
+			if (eventfds[i] >= 0)
+				empty = false;
+		}
+
+		ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, msis->irq_set);
+		if (ret < 0) {
+			perror("VFIO_DEVICE_SET_IRQS(multi)");
+			return ret;
+		}
+
+		msi_set_enabled(msis->phys_state, true);
+		msi_set_empty(msis->phys_state, empty);
+
+		return 0;
+	}
+
+	if (msi_is_masked(msis->virt_state)) {
+		/* TODO: if phys_state is not empty nor masked, mask all vectors */
+		return 0;
+	}
+
+	/* Update individual vectors to avoid breaking those in use */
+	for (i = 0; i < msis->nr_entries; i++) {
+		struct vfio_pci_msi_entry *entry = &msis->entries[i];
+		int fd = entry->gsi >= 0 ? entry->eventfd : -1;
+
+		if (fd == eventfds[i])
+			continue;
+
+		single.irq.start = i;
+		set_vfio_irq_eventd_payload(&single, fd);
+
+		ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &single);
+		if (ret < 0) {
+			perror("VFIO_DEVICE_SET_IRQS(single)");
+			break;
+		}
+
+		eventfds[i] = fd;
+
+		if (msi_is_empty(msis->phys_state) && fd >= 0)
+			msi_set_empty(msis->phys_state, false);
+	}
+
+	return ret;
+}
+
+static int vfio_pci_disable_msis(struct kvm *kvm, struct vfio_device *vdev,
+				 bool msix)
+{
+	int ret;
+	struct vfio_pci_device *pdev = &vdev->pci;
+	struct vfio_pci_msi_common *msis = msix ? &pdev->msix : &pdev->msi;
+	struct vfio_irq_set irq_set = {
+		.argsz	= sizeof(irq_set),
+		.flags 	= VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
+		.index 	= msis->info.index,
+		.start 	= 0,
+		.count	= 0,
+	};
+
+	if (!msi_is_enabled(msis->phys_state))
+		return 0;
+
+	ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
+	if (ret < 0) {
+		perror("VFIO_DEVICE_SET_IRQS(NONE)");
+		return ret;
+	}
+
+	msi_set_enabled(msis->phys_state, false);
+	msi_set_empty(msis->phys_state, true);
+
+	/*
+	 * When MSI or MSIX is disabled, this might be called when
+	 * PCI driver detects the MSI interrupt failure and wants to
+	 * rollback to INTx mode.  Thus enable INTx if the device
+	 * supports INTx mode in this case.
+	 */
+	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX)
+		ret = vfio_pci_enable_intx(kvm, vdev);
+
+	return ret >= 0 ? 0 : ret;
+}
+
+static int vfio_pci_update_msi_entry(struct kvm *kvm, struct vfio_device *vdev,
+				     struct vfio_pci_msi_entry *entry)
+{
+	int ret;
+
+	if (entry->eventfd < 0) {
+		entry->eventfd = eventfd(0, 0);
+		if (entry->eventfd < 0) {
+			ret = -errno;
+			vfio_dev_err(vdev, "cannot create eventfd");
+			return ret;
+		}
+	}
+
+	/* Allocate IRQ if necessary */
+	if (entry->gsi < 0) {
+		int ret = irq__add_msix_route(kvm, &entry->config.msg,
+					      vdev->dev_hdr.dev_num << 3);
+		if (ret < 0) {
+			vfio_dev_err(vdev, "cannot create MSI-X route");
+			return ret;
+		}
+		entry->gsi = ret;
+	} else {
+		irq__update_msix_route(kvm, entry->gsi, &entry->config.msg);
+	}
+
+	/*
+	 * MSI masking is unimplemented in VFIO, so we have to handle it by
+	 * disabling/enabling the IRQ route instead. We do it on the KVM side
+	 * rather than in VFIO, because:
+	 * - it is 8x faster,
+	 * - it decouples the masking logic from the capability state,
+	 * - in the masked state, after removing the irqfd route, we could
+	 *   easily plug the eventfd into a local handler, in order to serve
+	 *   Pending Bit reads to the guest.
+	 *
+	 * So entry->phys_state is masked when there is no active irqfd route.
+	 */
+	if (msi_is_masked(entry->virt_state) == msi_is_masked(entry->phys_state))
+		return 0;
+
+	if (msi_is_masked(entry->phys_state)) {
+		ret = irq__add_irqfd(kvm, entry->gsi, entry->eventfd, -1);
+		if (ret < 0) {
+			vfio_dev_err(vdev, "cannot setup irqfd");
+			return ret;
+		}
+	} else {
+		irq__del_irqfd(kvm, entry->gsi, entry->eventfd);
+	}
+
+	msi_set_masked(entry->phys_state, msi_is_masked(entry->virt_state));
+
+	return 0;
+}
+
+static void vfio_pci_msix_pba_access(struct kvm_cpu *vcpu, u64 addr, u8 *data,
+				     u32 len, u8 is_write, void *ptr)
+{
+	struct vfio_pci_device *pdev = ptr;
+	struct vfio_pci_msix_pba *pba = &pdev->msix_pba;
+	u64 offset = addr - pba->guest_phys_addr;
+	struct vfio_device *vdev = container_of(pdev, struct vfio_device, pci);
+
+	if (is_write)
+		return;
+
+	/*
+	 * TODO: emulate PBA. Hardware MSI-X is never masked, so reading the PBA
+	 * is completely useless here. Note that Linux doesn't use PBA.
+	 */
+	if (pread(vdev->fd, data, len, pba->offset + offset) != (ssize_t)len)
+		vfio_dev_err(vdev, "cannot access MSIX PBA\n");
+}
+
+static void vfio_pci_msix_table_access(struct kvm_cpu *vcpu, u64 addr, u8 *data,
+				       u32 len, u8 is_write, void *ptr)
+{
+	struct kvm *kvm = vcpu->kvm;
+	struct vfio_pci_msi_entry *entry;
+	struct vfio_pci_device *pdev = ptr;
+	struct vfio_device *vdev = container_of(pdev, struct vfio_device, pci);
+
+	u64 offset = addr - pdev->msix_table.guest_phys_addr;
+
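+	/* Each MSI-X table entry is 16 bytes: locate the vector and the field within it */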
+	size_t vector = offset / PCI_MSIX_ENTRY_SIZE;
+	off_t field = offset % PCI_MSIX_ENTRY_SIZE;
+
+	/*
+	 * PCI spec says that software must use aligned 4 or 8 bytes accesses
+	 * for the MSI-X tables.
+	 */
+	if ((len != 4 && len != 8) || addr & (len - 1)) {
+		vfio_dev_warn(vdev, "invalid MSI-X table access");
+		return;
+	}
+
+	entry = &pdev->msix.entries[vector];
+
+	mutex_lock(&pdev->msix.mutex);
+
+	if (!is_write) {
+		memcpy(data, (void *)&entry->config + field, len);
+		goto out_unlock;
+	}
+
+	memcpy((void *)&entry->config + field, data, len);
+
+	/*
+	 * Check if access touched the vector control register, which is at the
+	 * end of the MSI-X entry.
+	 */
+	if (field + len <= PCI_MSIX_ENTRY_VECTOR_CTRL)
+		goto out_unlock;
+
+	msi_set_masked(entry->virt_state, entry->config.ctrl &
+		       PCI_MSIX_ENTRY_CTRL_MASKBIT);
+
+	if (vfio_pci_update_msi_entry(kvm, vdev, entry) < 0)
+		/* Not much we can do here. */
+		vfio_dev_err(vdev, "failed to configure MSIX vector %zu", vector);
+
+	/* Update the physical capability if necessary */
+	if (vfio_pci_enable_msis(kvm, vdev, true))
+		vfio_dev_err(vdev, "cannot enable MSIX");
+
+out_unlock:
+	mutex_unlock(&pdev->msix.mutex);
+}
+
+static void vfio_pci_msix_cap_write(struct kvm *kvm,
+				    struct vfio_device *vdev, u8 off,
+				    void *data, int sz)
+{
+	struct vfio_pci_device *pdev = &vdev->pci;
+	off_t enable_pos = PCI_MSIX_FLAGS + 1;
+	bool enable;
+	u16 flags;
+
+	off -= pdev->msix.pos;
+
+	/* Check if access intersects with the MSI-X Enable bit */
+	if (off > enable_pos || off + sz <= enable_pos)
+		return;
+
+	/* Read byte that contains the Enable bit */
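+	/* Shift it up so the 16-bit PCI_MSIX_FLAGS_* masks line up */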
+	flags = *(u8 *)(data + enable_pos - off) << 8;
+
+	mutex_lock(&pdev->msix.mutex);
+
+	msi_set_masked(pdev->msix.virt_state, flags & PCI_MSIX_FLAGS_MASKALL);
+	enable = flags & PCI_MSIX_FLAGS_ENABLE;
+	msi_set_enabled(pdev->msix.virt_state, enable);
+
+	if (enable && vfio_pci_enable_msis(kvm, vdev, true))
+		vfio_dev_err(vdev, "cannot enable MSIX");
+	else if (!enable && vfio_pci_disable_msis(kvm, vdev, true))
+		vfio_dev_err(vdev, "cannot disable MSIX");
+
+	mutex_unlock(&pdev->msix.mutex);
+}
+
+static int vfio_pci_msi_vector_write(struct kvm *kvm, struct vfio_device *vdev,
+				     u8 off, u8 *data, u32 sz)
+{
+	size_t i;
+	u32 mask = 0;
+	size_t mask_pos, start, limit;
+	struct vfio_pci_msi_entry *entry;
+	struct vfio_pci_device *pdev = &vdev->pci;
+	struct msi_cap_64 *msi_cap_64 = PCI_CAP(&pdev->hdr, pdev->msi.pos);
+
+	if (!(msi_cap_64->ctrl & PCI_MSI_FLAGS_MASKBIT))
+		return 0;
+
+	if (msi_cap_64->ctrl & PCI_MSI_FLAGS_64BIT)
+		mask_pos = PCI_MSI_MASK_64;
+	else
+		mask_pos = PCI_MSI_MASK_32;
+
+	if (off >= mask_pos + 4 || off + sz <= mask_pos)
+		return 0;
+
+	/* Set mask to current state */
+	for (i = 0; i < pdev->msi.nr_entries; i++) {
+		entry = &pdev->msi.entries[i];
+		mask |= !!msi_is_masked(entry->virt_state) << i;
+	}
+
+	/* Update mask following the intersection of access and register */
+	start = max_t(size_t, off, mask_pos);
+	limit = min_t(size_t, off + sz, mask_pos + 4);
+
+	memcpy((void *)&mask + start - mask_pos, data + start - off,
+	       limit - start);
+
+	/* Update states if necessary */
+	for (i = 0; i < pdev->msi.nr_entries; i++) {
+		bool masked = mask & (1 << i);
+
+		entry = &pdev->msi.entries[i];
+		if (masked != msi_is_masked(entry->virt_state)) {
+			msi_set_masked(entry->virt_state, masked);
+			vfio_pci_update_msi_entry(kvm, vdev, entry);
+		}
+	}
+
+	return 1;
+}
+
+static void vfio_pci_msi_cap_write(struct kvm *kvm, struct vfio_device *vdev,
+				   u8 off, u8 *data, u32 sz)
+{
+	u8 ctrl;
+	struct msi_msg msg;
+	size_t i, nr_vectors;
+	struct vfio_pci_msi_entry *entry;
+	struct vfio_pci_device *pdev = &vdev->pci;
+	struct msi_cap_64 *msi_cap_64 = PCI_CAP(&pdev->hdr, pdev->msi.pos);
+
+	off -= pdev->msi.pos;
+
+	mutex_lock(&pdev->msi.mutex);
+
+	/* Check if the guest is trying to update mask bits */
+	if (vfio_pci_msi_vector_write(kvm, vdev, off, data, sz))
+		goto out_unlock;
+
+	/* Only modify routes when guest pokes the enable bit */
+	if (off > PCI_MSI_FLAGS || off + sz <= PCI_MSI_FLAGS)
+		goto out_unlock;
+
+	ctrl = *(u8 *)(data + PCI_MSI_FLAGS - off);
+
+	msi_set_enabled(pdev->msi.virt_state, ctrl & PCI_MSI_FLAGS_ENABLE);
+
+	if (!msi_is_enabled(pdev->msi.virt_state)) {
+		vfio_pci_disable_msis(kvm, vdev, false);
+		goto out_unlock;
+	}
+
+	/* Create routes for the requested vectors */
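+	/* Multiple Message Enable (bits 6:4 of Message Control) encodes log2 of the count */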
+	nr_vectors = 1 << ((ctrl & PCI_MSI_FLAGS_QSIZE) >> 4);
+
+	msg.address_lo = msi_cap_64->address_lo;
+	if (msi_cap_64->ctrl & PCI_MSI_FLAGS_64BIT) {
+		msg.address_hi = msi_cap_64->address_hi;
+		msg.data = msi_cap_64->data;
+	} else {
+		struct msi_cap_32 *msi_cap_32 = (void *)msi_cap_64;
+		msg.address_hi = 0;
+		msg.data = msi_cap_32->data;
+	}
+
+	for (i = 0; i < nr_vectors; i++) {
+		entry = &pdev->msi.entries[i];
+
+		/*
+		 * Set the MSI data value as required by the PCI local
+		 * bus specifications, MSI capability, "Message Data".
+		 */
+		msg.data &= ~(nr_vectors - 1);
+		msg.data |= i;
+
+		entry->config.msg = msg;
+		vfio_pci_update_msi_entry(kvm, vdev, entry);
+	}
+
+	/* Update the physical capability if necessary */
+	if (vfio_pci_enable_msis(kvm, vdev, false))
+		vfio_dev_err(vdev, "cannot enable MSI");
+
+out_unlock:
+	mutex_unlock(&pdev->msi.mutex);
+}
+
+static int vfio_pci_bar_activate(struct kvm *kvm,
+				 struct pci_device_header *pci_hdr,
+				 int bar_num, void *data)
+{
+	struct vfio_device *vdev = data;
+	struct vfio_pci_device *pdev = &vdev->pci;
+	struct vfio_pci_msix_pba *pba = &pdev->msix_pba;
+	struct vfio_pci_msix_table *table = &pdev->msix_table;
+	struct vfio_region *region;
+	u32 bar_addr;
+	bool has_msix;
+	int ret;
+
+	assert((u32)bar_num < vdev->info.num_regions);
+
+	region = &vdev->regions[bar_num];
+	has_msix = pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX;
+
+	bar_addr = pci__bar_address(pci_hdr, bar_num);
+	if (pci__bar_is_io(pci_hdr, bar_num))
+		region->port_base = bar_addr;
+	else
+		region->guest_phys_addr = bar_addr;
+
+	if (has_msix && (u32)bar_num == table->bar) {
+		table->guest_phys_addr = region->guest_phys_addr;
+		ret = kvm__register_mmio(kvm, table->guest_phys_addr,
+					 table->size, false,
+					 vfio_pci_msix_table_access, pdev);
+		/*
+		 * The MSIX table and the PBA structure can share the same BAR,
+		 * but for convenience we register different regions for mmio
+		 * emulation. Make sure we update both if they share the same
+		 * BAR.
+		 */
+		if (ret < 0 || table->bar != pba->bar)
+			goto out;
+	}
+
+	if (has_msix && (u32)bar_num == pba->bar) {
+		if (pba->bar == table->bar)
+			pba->guest_phys_addr = table->guest_phys_addr + table->size;
+		else
+			pba->guest_phys_addr = region->guest_phys_addr;
+		ret = kvm__register_mmio(kvm, pba->guest_phys_addr,
+					 pba->size, false,
+					 vfio_pci_msix_pba_access, pdev);
+		goto out;
+	}
+
+	ret = vfio_map_region(kvm, vdev, region);
+out:
+	return ret;
+}
+
+static int vfio_pci_bar_deactivate(struct kvm *kvm,
+				   struct pci_device_header *pci_hdr,
+				   int bar_num, void *data)
+{
+	struct vfio_device *vdev = data;
+	struct vfio_pci_device *pdev = &vdev->pci;
+	struct vfio_pci_msix_pba *pba = &pdev->msix_pba;
+	struct vfio_pci_msix_table *table = &pdev->msix_table;
+	struct vfio_region *region;
+	bool has_msix, success;
+	int ret;
+
+	assert((u32)bar_num < vdev->info.num_regions);
+
+	region = &vdev->regions[bar_num];
+	has_msix = pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX;
+
+	if (has_msix && (u32)bar_num == table->bar) {
+		success = kvm__deregister_mmio(kvm, table->guest_phys_addr);
+		/* kvm__deregister_mmio fails when the region is not found. */
+		ret = (success ? 0 : -ENOENT);
+		/* See vfio_pci_bar_activate(). */
+		if (ret < 0 || table->bar != pba->bar)
+			goto out;
+	}
+
+	if (has_msix && (u32)bar_num == pba->bar) {
+		success = kvm__deregister_mmio(kvm, pba->guest_phys_addr);
+		ret = (success ? 0 : -ENOENT);
+		goto out;
+	}
+
+	vfio_unmap_region(kvm, region);
+	ret = 0;
+
+out:
+	return ret;
+}
+
+static void vfio_pci_cfg_read(struct kvm *kvm, struct pci_device_header *pci_hdr,
+			      u8 offset, void *data, int sz)
+{
+	struct vfio_region_info *info;
+	struct vfio_pci_device *pdev;
+	struct vfio_device *vdev;
+	char base[sz];
+
+	pdev = container_of(pci_hdr, struct vfio_pci_device, hdr);
+	vdev = container_of(pdev, struct vfio_device, pci);
+	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
+
+	/* Dummy read in case of side-effects */
+	if (pread(vdev->fd, base, sz, info->offset + offset) != sz)
+		vfio_dev_warn(vdev, "failed to read %d bytes from Configuration Space at 0x%x",
+			      sz, offset);
+}
+
+static void vfio_pci_cfg_write(struct kvm *kvm, struct pci_device_header *pci_hdr,
+			       u8 offset, void *data, int sz)
+{
+	struct vfio_region_info *info;
+	struct vfio_pci_device *pdev;
+	struct vfio_device *vdev;
+	u32 tmp;
+
+	/* Make sure a larger size will not overrun tmp on the stack. */
+	assert(sz <= 4);
+
+	if (offset == PCI_ROM_ADDRESS)
+		return;
+
+	pdev = container_of(pci_hdr, struct vfio_pci_device, hdr);
+	vdev = container_of(pdev, struct vfio_device, pci);
+	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
+
+	if (pwrite(vdev->fd, data, sz, info->offset + offset) != sz)
+		vfio_dev_warn(vdev, "Failed to write %d bytes to Configuration Space at 0x%x",
+			      sz, offset);
+
+	/* Handle MSI write now, since it might update the hardware capability */
+	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX)
+		vfio_pci_msix_cap_write(kvm, vdev, offset, data, sz);
+
+	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSI)
+		vfio_pci_msi_cap_write(kvm, vdev, offset, data, sz);
+
+	if (pread(vdev->fd, &tmp, sz, info->offset + offset) != sz)
+		vfio_dev_warn(vdev, "Failed to read %d bytes from Configuration Space at 0x%x",
+			      sz, offset);
+}
+
+static ssize_t vfio_pci_msi_cap_size(struct msi_cap_64 *cap_hdr)
+{
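+	/* Base cap: id/next (2) + control (2) + 32-bit address (4) + data (2) */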
+	size_t size = 10;
+
+	if (cap_hdr->ctrl & PCI_MSI_FLAGS_64BIT)
+		size += 4;
+	if (cap_hdr->ctrl & PCI_MSI_FLAGS_MASKBIT)
+		size += 10;
+
+	return size;
+}
+
+static ssize_t vfio_pci_cap_size(struct pci_cap_hdr *cap_hdr)
+{
+	switch (cap_hdr->type) {
+	case PCI_CAP_ID_MSIX:
+		return PCI_CAP_MSIX_SIZEOF;
+	case PCI_CAP_ID_MSI:
+		return vfio_pci_msi_cap_size((void *)cap_hdr);
+	default:
+		pr_err("unknown PCI capability 0x%x", cap_hdr->type);
+		return 0;
+	}
+}
+
+static int vfio_pci_add_cap(struct vfio_device *vdev, u8 *virt_hdr,
+			    struct pci_cap_hdr *cap, off_t pos)
+{
+	struct pci_cap_hdr *last;
+	struct pci_device_header *hdr = &vdev->pci.hdr;
+
+	cap->next = 0;
+
+	if (!hdr->capabilities) {
+		hdr->capabilities = pos;
+		hdr->status |= PCI_STATUS_CAP_LIST;
+	} else {
+		last = PCI_CAP(virt_hdr, hdr->capabilities);
+
+		while (last->next)
+			last = PCI_CAP(virt_hdr, last->next);
+
+		last->next = pos;
+	}
+
+	memcpy(virt_hdr + pos, cap, vfio_pci_cap_size(cap));
+
+	return 0;
+}
+
+static int vfio_pci_parse_caps(struct vfio_device *vdev)
+{
+	int ret;
+	size_t size;
+	u8 pos, next;
+	struct pci_cap_hdr *cap;
+	u8 virt_hdr[PCI_DEV_CFG_SIZE];
+	struct vfio_pci_device *pdev = &vdev->pci;
+
+	if (!(pdev->hdr.status & PCI_STATUS_CAP_LIST))
+		return 0;
+
+	memset(virt_hdr, 0, PCI_DEV_CFG_SIZE);
+
+	pos = pdev->hdr.capabilities & ~3;
+
+	pdev->hdr.status &= ~PCI_STATUS_CAP_LIST;
+	pdev->hdr.capabilities = 0;
+
+	for (; pos; pos = next) {
+		cap = PCI_CAP(&pdev->hdr, pos);
+		next = cap->next;
+
+		switch (cap->type) {
+		case PCI_CAP_ID_MSIX:
+			ret = vfio_pci_add_cap(vdev, virt_hdr, cap, pos);
+			if (ret)
+				return ret;
+
+			pdev->msix.pos = pos;
+			pdev->irq_modes |= VFIO_PCI_IRQ_MODE_MSIX;
+			break;
+		case PCI_CAP_ID_MSI:
+			ret = vfio_pci_add_cap(vdev, virt_hdr, cap, pos);
+			if (ret)
+				return ret;
+
+			pdev->msi.pos = pos;
+			pdev->irq_modes |= VFIO_PCI_IRQ_MODE_MSI;
+			break;
+		}
+	}
+
+	/* Wipe remaining capabilities */
+	pos = PCI_STD_HEADER_SIZEOF;
+	size = PCI_DEV_CFG_SIZE - PCI_STD_HEADER_SIZEOF;
+	memcpy((void *)&pdev->hdr + pos, virt_hdr + pos, size);
+
+	return 0;
+}
+
+static int vfio_pci_parse_cfg_space(struct vfio_device *vdev)
+{
+	ssize_t sz = PCI_DEV_CFG_SIZE;
+	struct vfio_region_info *info;
+	struct vfio_pci_device *pdev = &vdev->pci;
+
+	if (vdev->info.num_regions <= VFIO_PCI_CONFIG_REGION_INDEX) {
+		vfio_dev_err(vdev, "Config Space not found");
+		return -ENODEV;
+	}
+
+	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
+	*info = (struct vfio_region_info) {
+			.argsz = sizeof(*info),
+			.index = VFIO_PCI_CONFIG_REGION_INDEX,
+	};
+
+	ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, info);
+	if (!info->size) {
+		vfio_dev_err(vdev, "Config Space has size zero?!");
+		return -EINVAL;
+	}
+
+	/* Read standard headers and capabilities */
+	if (pread(vdev->fd, &pdev->hdr, sz, info->offset) != sz) {
+		vfio_dev_err(vdev, "failed to read %zd bytes of Config Space", sz);
+		return -EIO;
+	}
+
+	/* Strip bit 7, that indicates multifunction */
+	pdev->hdr.header_type &= 0x7f;
+
+	if (pdev->hdr.header_type != PCI_HEADER_TYPE_NORMAL) {
+		vfio_dev_err(vdev, "unsupported header type %u",
+			     pdev->hdr.header_type);
+		return -EOPNOTSUPP;
+	}
+
+	if (pdev->hdr.irq_pin)
+		pdev->irq_modes |= VFIO_PCI_IRQ_MODE_INTX;
+
+	return vfio_pci_parse_caps(vdev);
+}
+
+static int vfio_pci_fixup_cfg_space(struct vfio_device *vdev)
+{
+	int i;
+	u64 base;
+	ssize_t hdr_sz;
+	struct msix_cap *msix;
+	struct vfio_region_info *info;
+	struct vfio_pci_device *pdev = &vdev->pci;
+	struct vfio_region *region;
+
+	/* Initialise the BARs */
+	for (i = VFIO_PCI_BAR0_REGION_INDEX; i <= VFIO_PCI_BAR5_REGION_INDEX; ++i) {
+		if ((u32)i == vdev->info.num_regions)
+			break;
+
+		region = &vdev->regions[i];
+		/* Construct a fake reg to match what we've mapped. */
+		if (region->is_ioport) {
+			base = (region->port_base & PCI_BASE_ADDRESS_IO_MASK) |
+				PCI_BASE_ADDRESS_SPACE_IO;
+		} else {
+			base = (region->guest_phys_addr &
+				PCI_BASE_ADDRESS_MEM_MASK) |
+				PCI_BASE_ADDRESS_SPACE_MEMORY;
+		}
+
+		pdev->hdr.bar[i] = base;
+
+		if (!base)
+			continue;
+
+		pdev->hdr.bar_size[i] = region->info.size;
+	}
+
+	/* I really can't be bothered to support cardbus. */
+	pdev->hdr.card_bus = 0;
+
+	/*
+	 * Nuke the expansion ROM for now. If we want to do this properly,
+	 * we need to save its size somewhere and map into the guest.
+	 */
+	pdev->hdr.exp_rom_bar = 0;
+
+	/* Plumb in our fake MSI-X capability, if we have it. */
+	msix = pci_find_cap(&pdev->hdr, PCI_CAP_ID_MSIX);
+	if (msix) {
+		/* Add a shortcut to the PBA region for the MMIO handler */
+		int pba_index = VFIO_PCI_BAR0_REGION_INDEX + pdev->msix_pba.bar;
+		pdev->msix_pba.offset = vdev->regions[pba_index].info.offset +
+					(msix->pba_offset & PCI_MSIX_PBA_OFFSET);
+
+		/* Tidy up the capability */
+		msix->table_offset &= PCI_MSIX_TABLE_BIR;
+		msix->pba_offset &= PCI_MSIX_PBA_BIR;
+		if (pdev->msix_table.bar == pdev->msix_pba.bar)
+			msix->pba_offset |= pdev->msix_table.size &
+					    PCI_MSIX_PBA_OFFSET;
+	}
+
+	/* Install our fake Configuration Space */
+	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
+	hdr_sz = PCI_DEV_CFG_SIZE;
+	if (pwrite(vdev->fd, &pdev->hdr, hdr_sz, info->offset) != hdr_sz) {
+		vfio_dev_err(vdev, "failed to write %zd bytes to Config Space",
+			     hdr_sz);
+		return -EIO;
+	}
+
+	/* Register callbacks for cfg accesses */
+	pdev->hdr.cfg_ops = (struct pci_config_operations) {
+		.read	= vfio_pci_cfg_read,
+		.write	= vfio_pci_cfg_write,
+	};
+
+	pdev->hdr.irq_type = IRQ_TYPE_LEVEL_HIGH;
+
+	return 0;
+}
+
+static int vfio_pci_get_region_info(struct vfio_device *vdev, u32 index,
+				    struct vfio_region_info *info)
+{
+	int ret;
+
+	*info = (struct vfio_region_info) {
+		.argsz = sizeof(*info),
+		.index = index,
+	};
+
+	ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, info);
+	if (ret) {
+		ret = -errno;
+		vfio_dev_err(vdev, "cannot get info for BAR %u", index);
+		return ret;
+	}
+
+	if (info->size && !is_power_of_two(info->size)) {
+		vfio_dev_err(vdev, "region is not power of two: 0x%llx",
+				info->size);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int vfio_pci_create_msix_table(struct kvm *kvm, struct vfio_device *vdev)
+{
+	int ret;
+	size_t i;
+	size_t map_size;
+	size_t nr_entries;
+	struct vfio_pci_msi_entry *entries;
+	struct vfio_pci_device *pdev = &vdev->pci;
+	struct vfio_pci_msix_pba *pba = &pdev->msix_pba;
+	struct vfio_pci_msix_table *table = &pdev->msix_table;
+	struct msix_cap *msix = PCI_CAP(&pdev->hdr, pdev->msix.pos);
+	struct vfio_region_info info;
+
+	table->bar = msix->table_offset & PCI_MSIX_TABLE_BIR;
+	pba->bar = msix->pba_offset & PCI_MSIX_TABLE_BIR;
+
+	/*
+	 * KVM needs memory regions to be multiple of and aligned on PAGE_SIZE.
+	 */
+	nr_entries = (msix->ctrl & PCI_MSIX_FLAGS_QSIZE) + 1;
+	table->size = ALIGN(nr_entries * PCI_MSIX_ENTRY_SIZE, PAGE_SIZE);
+	pba->size = ALIGN(DIV_ROUND_UP(nr_entries, 64), PAGE_SIZE);
+
+	entries = calloc(nr_entries, sizeof(struct vfio_pci_msi_entry));
+	if (!entries)
+		return -ENOMEM;
+
+	for (i = 0; i < nr_entries; i++)
+		entries[i].config.ctrl = PCI_MSIX_ENTRY_CTRL_MASKBIT;
+
+	ret = vfio_pci_get_region_info(vdev, table->bar, &info);
+	if (ret)
+		return ret;
+	if (!info.size)
+		return -EINVAL;
+	map_size = info.size;
+
+	if (table->bar != pba->bar) {
+		ret = vfio_pci_get_region_info(vdev, pba->bar, &info);
+		if (ret)
+			return ret;
+		if (!info.size)
+			return -EINVAL;
+		map_size += info.size;
+	}
+
+	/*
+	 * To ease MSI-X cap configuration in case they share the same BAR,
+	 * collapse table and pending array. The size of the BAR regions must be
+	 * powers of two.
+	 */
+	map_size = ALIGN(map_size, PAGE_SIZE);
+	table->guest_phys_addr = pci_get_mmio_block(map_size);
+	if (!table->guest_phys_addr) {
+		pr_err("cannot allocate MMIO space");
+		ret = -ENOMEM;
+		goto out_free;
+	}
+
+	/*
+	 * We could map the physical PBA directly into the guest, but it's
+	 * likely smaller than a page, and we can only hand full pages to the
+	 * guest. Even though the PCI spec disallows sharing a page used for
+	 * MSI-X with any other resource, it allows sharing the same page
+	 * between the MSI-X table and the PBA. For the sake of isolation,
+	 * create a virtual PBA.
+	 */
+	pba->guest_phys_addr = table->guest_phys_addr + table->size;
+
+	pdev->msix.entries = entries;
+	pdev->msix.nr_entries = nr_entries;
+
+	return 0;
+
+out_free:
+	free(entries);
+
+	return ret;
+}
+
+static int vfio_pci_create_msi_cap(struct kvm *kvm, struct vfio_pci_device *pdev)
+{
+	struct msi_cap_64 *cap = PCI_CAP(&pdev->hdr, pdev->msi.pos);
+
+	pdev->msi.nr_entries = 1 << ((cap->ctrl & PCI_MSI_FLAGS_QMASK) >> 1);
+	pdev->msi.entries = calloc(pdev->msi.nr_entries,
+				   sizeof(struct vfio_pci_msi_entry));
+	if (!pdev->msi.entries)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static int vfio_pci_configure_bar(struct kvm *kvm, struct vfio_device *vdev,
+				  size_t nr)
+{
+	int ret;
+	u32 bar;
+	size_t map_size;
+	struct vfio_pci_device *pdev = &vdev->pci;
+	struct vfio_region *region;
+
+	if (nr >= vdev->info.num_regions)
+		return 0;
+
+	region = &vdev->regions[nr];
+	bar = pdev->hdr.bar[nr];
+
+	region->vdev = vdev;
+	region->is_ioport = !!(bar & PCI_BASE_ADDRESS_SPACE_IO);
+
+	ret = vfio_pci_get_region_info(vdev, nr, &region->info);
+	if (ret)
+		return ret;
+
+	/* Ignore invalid or unimplemented regions */
+	if (!region->info.size)
+		return 0;
+
+	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) {
+		/* Trap and emulate MSI-X table */
+		if (nr == pdev->msix_table.bar) {
+			region->guest_phys_addr = pdev->msix_table.guest_phys_addr;
+			return 0;
+		} else if (nr == pdev->msix_pba.bar) {
+			region->guest_phys_addr = pdev->msix_pba.guest_phys_addr;
+			return 0;
+		}
+	}
+
+	if (region->is_ioport) {
+		region->port_base = pci_get_io_port_block(region->info.size);
+	} else {
+		/* Grab some MMIO space in the guest */
+		map_size = ALIGN(region->info.size, PAGE_SIZE);
+		region->guest_phys_addr = pci_get_mmio_block(map_size);
+	}
+
+	return 0;
+}
+
+static int vfio_pci_configure_dev_regions(struct kvm *kvm,
+					  struct vfio_device *vdev)
+{
+	int ret;
+	u32 bar;
+	size_t i;
+	bool is_64bit = false;
+	struct vfio_pci_device *pdev = &vdev->pci;
+
+	ret = vfio_pci_parse_cfg_space(vdev);
+	if (ret)
+		return ret;
+
+	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) {
+		ret = vfio_pci_create_msix_table(kvm, vdev);
+		if (ret)
+			return ret;
+	}
+
+	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSI) {
+		ret = vfio_pci_create_msi_cap(kvm, pdev);
+		if (ret)
+			return ret;
+	}
+
+	for (i = VFIO_PCI_BAR0_REGION_INDEX; i <= VFIO_PCI_BAR5_REGION_INDEX; ++i) {
+		/* Ignore top half of 64-bit BAR */
+		if (is_64bit) {
+			is_64bit = false;
+			continue;
+		}
+
+		ret = vfio_pci_configure_bar(kvm, vdev, i);
+		if (ret)
+			return ret;
+
+		bar = pdev->hdr.bar[i];
+		is_64bit = (bar & PCI_BASE_ADDRESS_SPACE) ==
+			   PCI_BASE_ADDRESS_SPACE_MEMORY &&
+			   bar & PCI_BASE_ADDRESS_MEM_TYPE_64;
+	}
+
+	/* We've configured the BARs, fake up a Configuration Space */
+	ret = vfio_pci_fixup_cfg_space(vdev);
+	if (ret)
+		return ret;
+
+	return pci__register_bar_regions(kvm, &pdev->hdr, vfio_pci_bar_activate,
+					 vfio_pci_bar_deactivate, vdev);
+}
+
+/*
+ * Attempt to update the FD limit if opening an eventfd for each IRQ vector
+ * would hit it, which is likely to happen when a device uses 2048 MSIs.
+ */
+static int vfio_pci_reserve_irq_fds(size_t num)
+{
+	/*
+	 * I counted around 27 fds under normal load. Let's add 100 for good
+	 * measure.
+	 */
+	static size_t needed = 128;
+	struct rlimit fd_limit, new_limit;
+
+	needed += num;
+
+	if (getrlimit(RLIMIT_NOFILE, &fd_limit)) {
+		perror("getrlimit(RLIMIT_NOFILE)");
+		return 0;
+	}
+
+	if (fd_limit.rlim_cur >= needed)
+		return 0;
+
+	new_limit.rlim_cur = needed;
+
+	if (fd_limit.rlim_max < needed)
+		/* Try to bump hard limit (root only) */
+		new_limit.rlim_max = needed;
+	else
+		new_limit.rlim_max = fd_limit.rlim_max;
+
+	if (setrlimit(RLIMIT_NOFILE, &new_limit)) {
+		perror("setrlimit(RLIMIT_NOFILE)");
+		pr_warning("not enough FDs for full MSI-X support (estimated need: %zu)",
+			   (size_t)(needed - fd_limit.rlim_cur));
+	}
+
+	return 0;
+}
+
+static int vfio_pci_init_msis(struct kvm *kvm, struct vfio_device *vdev,
+			     struct vfio_pci_msi_common *msis)
+{
+	int ret;
+	size_t i;
+	int *eventfds;
+	size_t irq_set_size;
+	struct vfio_pci_msi_entry *entry;
+	size_t nr_entries = msis->nr_entries;
+
+	ret = ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, &msis->info);
+	if (ret || msis->info.count == 0) {
+		vfio_dev_err(vdev, "no MSI reported by VFIO");
+		return -ENODEV;
+	}
+
+	if (!(msis->info.flags & VFIO_IRQ_INFO_EVENTFD)) {
+		vfio_dev_err(vdev, "interrupt not EVENTFD capable");
+		return -EINVAL;
+	}
+
+	if (msis->info.count != nr_entries) {
+		vfio_dev_err(vdev, "invalid number of MSIs reported by VFIO");
+		return -EINVAL;
+	}
+
+	mutex_init(&msis->mutex);
+
+	vfio_pci_reserve_irq_fds(nr_entries);
+
+	irq_set_size = sizeof(struct vfio_irq_set) + nr_entries * sizeof(int);
+	msis->irq_set = malloc(irq_set_size);
+	if (!msis->irq_set)
+		return -ENOMEM;
+
+	*msis->irq_set = (struct vfio_irq_set) {
+		.argsz	= irq_set_size,
+		.flags 	= VFIO_IRQ_SET_DATA_EVENTFD |
+			  VFIO_IRQ_SET_ACTION_TRIGGER,
+		.index 	= msis->info.index,
+		.start 	= 0,
+		.count 	= nr_entries,
+	};
+
+	eventfds = (void *)msis->irq_set + sizeof(struct vfio_irq_set);
+
+	for (i = 0; i < nr_entries; i++) {
+		entry = &msis->entries[i];
+		entry->gsi = -1;
+		entry->eventfd = -1;
+		msi_set_masked(entry->virt_state, true);
+		msi_set_masked(entry->phys_state, true);
+		eventfds[i] = -1;
+	}
+
+	return 0;
+}
+
+static void vfio_pci_disable_intx(struct kvm *kvm, struct vfio_device *vdev)
+{
+	struct vfio_pci_device *pdev = &vdev->pci;
+	int gsi = pdev->intx_gsi;
+	struct vfio_irq_set irq_set = {
+		.argsz	= sizeof(irq_set),
+		.flags	= VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
+		.index	= VFIO_PCI_INTX_IRQ_INDEX,
+	};
+
+	if (pdev->intx_fd == -1)
+		return;
+
+	pr_debug("user requested MSI, disabling INTx %d", gsi);
+
+	ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
+	irq__del_irqfd(kvm, gsi, pdev->intx_fd);
+
+	close(pdev->intx_fd);
+	close(pdev->unmask_fd);
+	pdev->intx_fd = -1;
+}
+
+static int vfio_pci_enable_intx(struct kvm *kvm, struct vfio_device *vdev)
+{
+	int ret;
+	int trigger_fd, unmask_fd;
+	union vfio_irq_eventfd	trigger;
+	union vfio_irq_eventfd	unmask;
+	struct vfio_pci_device *pdev = &vdev->pci;
+	int gsi = pdev->intx_gsi;
+
+	if (pdev->intx_fd != -1)
+		return 0;
+
+	/*
+	 * PCI IRQ is level-triggered, so we use two eventfds. trigger_fd
+	 * signals an interrupt from host to guest, and unmask_fd signals the
+	 * deassertion of the line from guest to host.
+	 */
+	trigger_fd = eventfd(0, 0);
+	if (trigger_fd < 0) {
+		vfio_dev_err(vdev, "failed to create trigger eventfd");
+		return trigger_fd;
+	}
+
+	unmask_fd = eventfd(0, 0);
+	if (unmask_fd < 0) {
+		vfio_dev_err(vdev, "failed to create unmask eventfd");
+		close(trigger_fd);
+		return unmask_fd;
+	}
+
+	ret = irq__add_irqfd(kvm, gsi, trigger_fd, unmask_fd);
+	if (ret)
+		goto err_close;
+
+	trigger.irq = (struct vfio_irq_set) {
+		.argsz	= sizeof(trigger),
+		.flags	= VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER,
+		.index	= VFIO_PCI_INTX_IRQ_INDEX,
+		.start	= 0,
+		.count	= 1,
+	};
+	set_vfio_irq_eventd_payload(&trigger, trigger_fd);
+
+	ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &trigger);
+	if (ret < 0) {
+		vfio_dev_err(vdev, "failed to setup VFIO IRQ");
+		goto err_delete_line;
+	}
+
+	unmask.irq = (struct vfio_irq_set) {
+		.argsz	= sizeof(unmask),
+		.flags	= VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_UNMASK,
+		.index	= VFIO_PCI_INTX_IRQ_INDEX,
+		.start	= 0,
+		.count	= 1,
+	};
+	set_vfio_irq_eventd_payload(&unmask, unmask_fd);
+
+	ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &unmask);
+	if (ret < 0) {
+		vfio_dev_err(vdev, "failed to setup unmask IRQ");
+		goto err_remove_event;
+	}
+
+	pdev->intx_fd = trigger_fd;
+	pdev->unmask_fd = unmask_fd;
+
+	return 0;
+
+err_remove_event:
+	/* Remove trigger event */
+	trigger.irq.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
+	trigger.irq.count = 0;
+	ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &trigger);
+
+err_delete_line:
+	irq__del_irqfd(kvm, gsi, trigger_fd);
+
+err_close:
+	close(trigger_fd);
+	close(unmask_fd);
+	return ret;
+}
+
+static int vfio_pci_init_intx(struct kvm *kvm, struct vfio_device *vdev)
+{
+	int ret;
+	struct vfio_pci_device *pdev = &vdev->pci;
+	struct vfio_irq_info irq_info = {
+		.argsz = sizeof(irq_info),
+		.index = VFIO_PCI_INTX_IRQ_INDEX,
+	};
+
+	vfio_pci_reserve_irq_fds(2);
+
+	ret = ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
+	if (ret || irq_info.count == 0) {
+		vfio_dev_err(vdev, "no INTx reported by VFIO");
+		return -ENODEV;
+	}
+
+	if (!(irq_info.flags & VFIO_IRQ_INFO_EVENTFD)) {
+		vfio_dev_err(vdev, "interrupt not eventfd capable");
+		return -EINVAL;
+	}
+
+	if (!(irq_info.flags & VFIO_IRQ_INFO_AUTOMASKED)) {
+		vfio_dev_err(vdev, "INTx interrupt not AUTOMASKED");
+		return -EINVAL;
+	}
+
+	/* The guest is going to overwrite our irq_line... */
+	pdev->intx_gsi = pdev->hdr.irq_line - KVM_IRQ_OFFSET;
+
+	pdev->intx_fd = -1;
+
+	return 0;
+}
+
+static int vfio_pci_configure_dev_irqs(struct kvm *kvm, struct vfio_device *vdev)
+{
+	int ret = 0;
+	struct vfio_pci_device *pdev = &vdev->pci;
+
+	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) {
+		pdev->msix.info = (struct vfio_irq_info) {
+			.argsz = sizeof(pdev->msix.info),
+			.index = VFIO_PCI_MSIX_IRQ_INDEX,
+		};
+		ret = vfio_pci_init_msis(kvm, vdev, &pdev->msix);
+		if (ret)
+			return ret;
+	}
+
+	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSI) {
+		pdev->msi.info = (struct vfio_irq_info) {
+			.argsz = sizeof(pdev->msi.info),
+			.index = VFIO_PCI_MSI_IRQ_INDEX,
+		};
+		ret = vfio_pci_init_msis(kvm, vdev, &pdev->msi);
+		if (ret)
+			return ret;
+	}
+
+	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX) {
+		pci__assign_irq(&vdev->pci.hdr);
+
+		ret = vfio_pci_init_intx(kvm, vdev);
+		if (ret)
+			return ret;
+
+		ret = vfio_pci_enable_intx(kvm, vdev);
+	}
+
+	return ret;
+}
+
+int vfio_pci_setup_device(struct kvm *kvm, struct vfio_device *vdev)
+{
+	int ret;
+
+	ret = vfio_pci_configure_dev_regions(kvm, vdev);
+	if (ret) {
+		vfio_dev_err(vdev, "failed to configure regions");
+		return ret;
+	}
+
+	vdev->dev_hdr = (struct device_header) {
+		.bus_type	= DEVICE_BUS_PCI,
+		.data		= &vdev->pci.hdr,
+	};
+
+	ret = device__register(&vdev->dev_hdr);
+	if (ret) {
+		vfio_dev_err(vdev, "failed to register VFIO device");
+		return ret;
+	}
+
+	ret = vfio_pci_configure_dev_irqs(kvm, vdev);
+	if (ret) {
+		vfio_dev_err(vdev, "failed to configure IRQs");
+		return ret;
+	}
+
+	return 0;
+}
+
+void vfio_pci_teardown_device(struct kvm *kvm, struct vfio_device *vdev)
+{
+	size_t i;
+	struct vfio_pci_device *pdev = &vdev->pci;
+
+	for (i = 0; i < vdev->info.num_regions; i++)
+		vfio_unmap_region(kvm, &vdev->regions[i]);
+
+	device__unregister(&vdev->dev_hdr);
+
+	free(pdev->msix.irq_set);
+	free(pdev->msix.entries);
+	free(pdev->msi.irq_set);
+	free(pdev->msi.entries);
+}
diff --git a/kvmtool/virtio/9p-pdu.c b/kvmtool/virtio/9p-pdu.c
new file mode 100644
index 0000000..c0b95e0
--- /dev/null
+++ b/kvmtool/virtio/9p-pdu.c
@@ -0,0 +1,289 @@
+#include "kvm/util.h"
+#include "kvm/virtio-9p.h"
+
+#include <endian.h>
+#include <stdint.h>
+
+#include <linux/compiler.h>
+#include <linux/9p.h>
+
+static void virtio_p9_pdu_read(struct p9_pdu *pdu, void *data, size_t size)
+{
+	size_t len;
+	int i, copied = 0;
+	u16 iov_cnt = pdu->out_iov_cnt;
+	size_t offset = pdu->read_offset;
+	struct iovec *iov = pdu->out_iov;
+
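+	/* Skip iovecs already consumed by a previous read, then copy piecewise */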
+	for (i = 0; i < iov_cnt && size; i++) {
+		if (offset >= iov[i].iov_len) {
+			offset -= iov[i].iov_len;
+			continue;
+		} else {
+			len = MIN(iov[i].iov_len - offset, size);
+			memcpy(data, iov[i].iov_base + offset, len);
+			size -= len;
+			data += len;
+			offset = 0;
+			copied += len;
+		}
+	}
+	pdu->read_offset += copied;
+}
+
+static void virtio_p9_pdu_write(struct p9_pdu *pdu,
+				const void *data, size_t size)
+{
+	size_t len;
+	int i, copied = 0;
+	u16 iov_cnt = pdu->in_iov_cnt;
+	size_t offset = pdu->write_offset;
+	struct iovec *iov = pdu->in_iov;
+
+	for (i = 0; i < iov_cnt && size; i++) {
+		if (offset >= iov[i].iov_len) {
+			offset -= iov[i].iov_len;
+			continue;
+		} else {
+			len = MIN(iov[i].iov_len - offset, size);
+			memcpy(iov[i].iov_base + offset, data, len);
+			size -= len;
+			data += len;
+			offset = 0;
+			copied += len;
+		}
+	}
+	pdu->write_offset += copied;
+}
+
+static void virtio_p9_wstat_free(struct p9_wstat *stbuf)
+{
+	free(stbuf->name);
+	free(stbuf->uid);
+	free(stbuf->gid);
+	free(stbuf->muid);
+}
+
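+/*
+ * Decode a 9P message according to a printf-like format string:
+ *   b/w/d/q - 8/16/32/64-bit little-endian integers, s - length-prefixed
+ *   string, Q - qid, S - wstat, I - iattr (9P2000.L)
+ */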
+static int virtio_p9_decode(struct p9_pdu *pdu, const char *fmt, va_list ap)
+{
+	int retval = 0;
+	const char *ptr;
+
+	for (ptr = fmt; *ptr; ptr++) {
+		switch (*ptr) {
+		case 'b':
+		{
+			int8_t *val = va_arg(ap, int8_t *);
+			virtio_p9_pdu_read(pdu, val, sizeof(*val));
+		}
+		break;
+		case 'w':
+		{
+			int16_t le_val;
+			int16_t *val = va_arg(ap, int16_t *);
+			virtio_p9_pdu_read(pdu, &le_val, sizeof(le_val));
+			*val = le16toh(le_val);
+		}
+		break;
+		case 'd':
+		{
+			int32_t le_val;
+			int32_t *val = va_arg(ap, int32_t *);
+			virtio_p9_pdu_read(pdu, &le_val, sizeof(le_val));
+			*val = le32toh(le_val);
+		}
+		break;
+		case 'q':
+		{
+			int64_t le_val;
+			int64_t *val = va_arg(ap, int64_t *);
+			virtio_p9_pdu_read(pdu, &le_val, sizeof(le_val));
+			*val = le64toh(le_val);
+		}
+		break;
+		case 's':
+		{
+			int16_t len;
+			char **str = va_arg(ap, char **);
+
+			virtio_p9_pdu_readf(pdu, "w", &len);
+			*str = malloc(len + 1);
+			if (*str == NULL) {
+				retval = ENOMEM;
+				break;
+			}
+			virtio_p9_pdu_read(pdu, *str, len);
+			(*str)[len] = 0;
+		}
+		break;
+		case 'Q':
+		{
+			struct p9_qid *qid = va_arg(ap, struct p9_qid *);
+			retval = virtio_p9_pdu_readf(pdu, "bdq",
+						     &qid->type, &qid->version,
+						     &qid->path);
+		}
+		break;
+		case 'S':
+		{
+			struct p9_wstat *stbuf = va_arg(ap, struct p9_wstat *);
+			memset(stbuf, 0, sizeof(struct p9_wstat));
+			stbuf->n_uid = KUIDT_INIT(-1);
+			stbuf->n_gid = KGIDT_INIT(-1);
+			stbuf->n_muid = KUIDT_INIT(-1);
+			retval = virtio_p9_pdu_readf(pdu, "wwdQdddqssss",
+						&stbuf->size, &stbuf->type,
+						&stbuf->dev, &stbuf->qid,
+						&stbuf->mode, &stbuf->atime,
+						&stbuf->mtime, &stbuf->length,
+						&stbuf->name, &stbuf->uid,
+						&stbuf->gid, &stbuf->muid);
+			if (retval)
+				virtio_p9_wstat_free(stbuf);
+		}
+		break;
+		case 'I':
+		{
+			struct p9_iattr_dotl *p9attr = va_arg(ap,
+						       struct p9_iattr_dotl *);
+
+			retval = virtio_p9_pdu_readf(pdu, "ddddqqqqq",
+						     &p9attr->valid,
+						     &p9attr->mode,
+						     &p9attr->uid,
+						     &p9attr->gid,
+						     &p9attr->size,
+						     &p9attr->atime_sec,
+						     &p9attr->atime_nsec,
+						     &p9attr->mtime_sec,
+						     &p9attr->mtime_nsec);
+		}
+		break;
+		default:
+			retval = EINVAL;
+			break;
+		}
+	}
+	return retval;
+}
+
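+/*
+ * Encode counterpart of virtio_p9_decode; 'A' additionally serializes a
+ * struct p9_stat_dotl.
+ */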
+static int virtio_p9_pdu_encode(struct p9_pdu *pdu, const char *fmt, va_list ap)
+{
+	int retval = 0;
+	const char *ptr;
+
+	for (ptr = fmt; *ptr; ptr++) {
+		switch (*ptr) {
+		case 'b':
+		{
+			int8_t val = va_arg(ap, int);
+			virtio_p9_pdu_write(pdu, &val, sizeof(val));
+		}
+		break;
+		case 'w':
+		{
+			int16_t val = htole16(va_arg(ap, int));
+			virtio_p9_pdu_write(pdu, &val, sizeof(val));
+		}
+		break;
+		case 'd':
+		{
+			int32_t val = htole32(va_arg(ap, int32_t));
+			virtio_p9_pdu_write(pdu, &val, sizeof(val));
+		}
+		break;
+		case 'q':
+		{
+			int64_t val = htole64(va_arg(ap, int64_t));
+			virtio_p9_pdu_write(pdu, &val, sizeof(val));
+		}
+		break;
+		case 's':
+		{
+			uint16_t len = 0;
+			const char *s = va_arg(ap, char *);
+			if (s)
+				len = MIN(strlen(s), USHRT_MAX);
+			virtio_p9_pdu_writef(pdu, "w", len);
+			virtio_p9_pdu_write(pdu, s, len);
+		}
+		break;
+		case 'Q':
+		{
+			struct p9_qid *qid = va_arg(ap, struct p9_qid *);
+			retval = virtio_p9_pdu_writef(pdu, "bdq",
+						      qid->type, qid->version,
+						      qid->path);
+		}
+		break;
+		case 'S':
+		{
+			struct p9_wstat *stbuf = va_arg(ap, struct p9_wstat *);
+			retval = virtio_p9_pdu_writef(pdu, "wwdQdddqssss",
+						stbuf->size, stbuf->type,
+						stbuf->dev, &stbuf->qid,
+						stbuf->mode, stbuf->atime,
+						stbuf->mtime, stbuf->length,
+						stbuf->name, stbuf->uid,
+						stbuf->gid, stbuf->muid);
+		}
+		break;
+		case 'A':
+		{
+			struct p9_stat_dotl *stbuf = va_arg(ap,
+						      struct p9_stat_dotl *);
+			retval  = virtio_p9_pdu_writef(pdu,
+						       "qQdddqqqqqqqqqqqqqqq",
+						       stbuf->st_result_mask,
+						       &stbuf->qid,
+						       stbuf->st_mode,
+						       stbuf->st_uid,
+						       stbuf->st_gid,
+						       stbuf->st_nlink,
+						       stbuf->st_rdev,
+						       stbuf->st_size,
+						       stbuf->st_blksize,
+						       stbuf->st_blocks,
+						       stbuf->st_atime_sec,
+						       stbuf->st_atime_nsec,
+						       stbuf->st_mtime_sec,
+						       stbuf->st_mtime_nsec,
+						       stbuf->st_ctime_sec,
+						       stbuf->st_ctime_nsec,
+						       stbuf->st_btime_sec,
+						       stbuf->st_btime_nsec,
+						       stbuf->st_gen,
+						       stbuf->st_data_version);
+		}
+		break;
+		default:
+			retval = EINVAL;
+			break;
+		}
+	}
+	return retval;
+}
+
+int virtio_p9_pdu_readf(struct p9_pdu *pdu, const char *fmt, ...)
+{
+	int ret;
+	va_list ap;
+
+	va_start(ap, fmt);
+	ret = virtio_p9_decode(pdu, fmt, ap);
+	va_end(ap);
+
+	return ret;
+}
+
+int virtio_p9_pdu_writef(struct p9_pdu *pdu, const char *fmt, ...)
+{
+	int ret;
+	va_list ap;
+
+	va_start(ap, fmt);
+	ret = virtio_p9_pdu_encode(pdu, fmt, ap);
+	va_end(ap);
+
+	return ret;
+}
diff --git a/kvmtool/virtio/9p.c b/kvmtool/virtio/9p.c
new file mode 100644
index 0000000..b78f2b3
--- /dev/null
+++ b/kvmtool/virtio/9p.c
@@ -0,0 +1,1609 @@
+#include "kvm/virtio-pci-dev.h"
+#include "kvm/ioport.h"
+#include "kvm/util.h"
+#include "kvm/threadpool.h"
+#include "kvm/irq.h"
+#include "kvm/virtio-9p.h"
+#include "kvm/guest_compat.h"
+#include "kvm/builtin-setup.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/sysmacros.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/vfs.h>
+
+#include <linux/virtio_ring.h>
+#include <linux/virtio_9p.h>
+#include <linux/9p.h>
+
+static LIST_HEAD(devs);
+static int compat_id = -1;
+
+static int insert_new_fid(struct p9_dev *dev, struct p9_fid *fid);
+static struct p9_fid *find_or_create_fid(struct p9_dev *dev, u32 fid)
+{
+	struct rb_node *node = dev->fids.rb_node;
+	struct p9_fid *pfid = NULL;
+	size_t len;
+
+	while (node) {
+		struct p9_fid *cur = rb_entry(node, struct p9_fid, node);
+
+		if (fid < cur->fid) {
+			node = node->rb_left;
+		} else if (fid > cur->fid) {
+			node = node->rb_right;
+		} else {
+			return cur;
+		}
+	}
+
+	pfid = calloc(1, sizeof(*pfid));
+	if (!pfid)
+		return NULL;
+
+	len = strlen(dev->root_dir);
+	if (len >= sizeof(pfid->abs_path)) {
+		free(pfid);
+		return NULL;
+	}
+
+	pfid->fid = fid;
+	strcpy(pfid->abs_path, dev->root_dir);
+	pfid->path = pfid->abs_path + strlen(pfid->abs_path);
+
+	insert_new_fid(dev, pfid);
+
+	return pfid;
+}
+
+static int insert_new_fid(struct p9_dev *dev, struct p9_fid *fid)
+{
+	struct rb_node **node = &(dev->fids.rb_node), *parent = NULL;
+
+	while (*node) {
+		int result = fid->fid - rb_entry(*node, struct p9_fid, node)->fid;
+
+		parent = *node;
+		if (result < 0)
+			node    = &((*node)->rb_left);
+		else if (result > 0)
+			node    = &((*node)->rb_right);
+		else
+			return -EEXIST;
+	}
+
+	rb_link_node(&fid->node, parent, node);
+	rb_insert_color(&fid->node, &dev->fids);
+	return 0;
+}
+
+static struct p9_fid *get_fid(struct p9_dev *p9dev, int fid)
+{
+	struct p9_fid *new;
+
+	new = find_or_create_fid(p9dev, fid);
+
+	return new;
+}
+
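+/* Build a 9P qid from host stat data: inode number as path, mtime as version */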
+static void stat2qid(struct stat *st, struct p9_qid *qid)
+{
+	*qid = (struct p9_qid) {
+		.path		= st->st_ino,
+		.version	= st->st_mtime,
+	};
+
+	if (S_ISDIR(st->st_mode))
+		qid->type	|= P9_QTDIR;
+}
+
+static void close_fid(struct p9_dev *p9dev, u32 fid)
+{
+	struct p9_fid *pfid = get_fid(p9dev, fid);
+
+	if (pfid->fd > 0)
+		close(pfid->fd);
+
+	if (pfid->dir)
+		closedir(pfid->dir);
+
+	rb_erase(&pfid->node, &p9dev->fids);
+	free(pfid);
+}
+
+static void virtio_p9_set_reply_header(struct p9_pdu *pdu, u32 size)
+{
+	u8 cmd;
+	u16 tag;
+
+	pdu->read_offset = sizeof(u32);
+	virtio_p9_pdu_readf(pdu, "bw", &cmd, &tag);
+	pdu->write_offset = 0;
+	/* cmd + 1 is the reply message */
+	virtio_p9_pdu_writef(pdu, "dbw", size, cmd + 1, tag);
+}
+
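+/* Trim the iov array to cover exactly 'count' bytes; returns the new iov count */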
+static u16 virtio_p9_update_iov_cnt(struct iovec iov[], u32 count, int iov_cnt)
+{
+	int i;
+	u32 total = 0;
+	for (i = 0; (i < iov_cnt) && (total < count); i++) {
+		if (total + iov[i].iov_len > count) {
+			/* we don't need this iov fully */
+			iov[i].iov_len -= ((total + iov[i].iov_len) - count);
+			i++;
+			break;
+		}
+		total += iov[i].iov_len;
+	}
+	return i;
+}
+
+static void virtio_p9_error_reply(struct p9_dev *p9dev,
+				  struct p9_pdu *pdu, int err, u32 *outlen)
+{
+	u16 tag;
+
+	/* EMFILE at server implies ENFILE for the VM */
+	if (err == EMFILE)
+		err = ENFILE;
+
+	pdu->write_offset = VIRTIO_9P_HDR_LEN;
+	virtio_p9_pdu_writef(pdu, "d", err);
+	*outlen = pdu->write_offset;
+
+	/* read the tag from input */
+	pdu->read_offset = sizeof(u32) + sizeof(u8);
+	virtio_p9_pdu_readf(pdu, "w", &tag);
+
+	/* Update the header */
+	pdu->write_offset = 0;
+	virtio_p9_pdu_writef(pdu, "dbw", *outlen, P9_RLERROR, tag);
+}
+
+static void virtio_p9_version(struct p9_dev *p9dev,
+			      struct p9_pdu *pdu, u32 *outlen)
+{
+	u32 msize;
+	char *version;
+	virtio_p9_pdu_readf(pdu, "ds", &msize, &version);
+	/*
+	 * Reply with the same msize the client sent us.
+	 * Error out if the request is not for 9P2000.L.
+	 */
+	if (!strcmp(version, VIRTIO_9P_VERSION_DOTL))
+		virtio_p9_pdu_writef(pdu, "ds", msize, version);
+	else
+		virtio_p9_pdu_writef(pdu, "ds", msize, "unknown");
+
+	*outlen = pdu->write_offset;
+	virtio_p9_set_reply_header(pdu, *outlen);
+	free(version);
+	return;
+}
+
+static void virtio_p9_clunk(struct p9_dev *p9dev,
+			    struct p9_pdu *pdu, u32 *outlen)
+{
+	u32 fid;
+
+	virtio_p9_pdu_readf(pdu, "d", &fid);
+	close_fid(p9dev, fid);
+
+	*outlen = pdu->write_offset;
+	virtio_p9_set_reply_header(pdu, *outlen);
+	return;
+}
+
+/*
+ * FIXME!! Need to map to a protocol-independent value. The upstream
+ * 9p server has the same bug.
+ */
+static int virtio_p9_openflags(int flags)
+{
+	flags &= ~(O_NOCTTY | O_ASYNC | O_CREAT | O_DIRECT);
+	flags |= O_NOFOLLOW;
+	return flags;
+}
+
+static bool is_dir(struct p9_fid *fid)
+{
+	struct stat st;
+
+	/* On stat() failure, treat the path as a non-directory */
+	if (stat(fid->abs_path, &st) != 0)
+		return false;
+
+	return S_ISDIR(st.st_mode);
+}
+
+/* path is always absolute; reject any attempt to escape the root via ".." */
+static bool path_is_illegal(const char *path)
+{
+	size_t len;
+
+	if (strstr(path, "/../") != NULL)
+		return true;
+
+	len = strlen(path);
+	if (len >= 3 && strcmp(path + len - 3, "/..") == 0)
+		return true;
+
+	return false;
+}
+
+static int get_full_path_helper(char *full_path, size_t size,
+			 const char *dirname, const char *name)
+{
+	int ret;
+
+	ret = snprintf(full_path, size, "%s/%s", dirname, name);
+	if (ret >= (int)size) {
+		errno = ENAMETOOLONG;
+		return -1;
+	}
+
+	if (path_is_illegal(full_path)) {
+		errno = EACCES;
+		return -1;
+	}
+
+	return 0;
+}
+
+static int get_full_path(char *full_path, size_t size, struct p9_fid *fid,
+			 const char *name)
+{
+	return get_full_path_helper(full_path, size, fid->abs_path, name);
+}
+
+static int stat_rel(struct p9_dev *p9dev, const char *path, struct stat *st)
+{
+	char full_path[PATH_MAX];
+
+	if (get_full_path_helper(full_path, sizeof(full_path), p9dev->root_dir, path) != 0)
+		return -1;
+
+	if (lstat(full_path, st) != 0)
+		return -1;
+
+	return 0;
+}
+
+static void virtio_p9_open(struct p9_dev *p9dev,
+			   struct p9_pdu *pdu, u32 *outlen)
+{
+	u32 fid, flags;
+	struct stat st;
+	struct p9_qid qid;
+	struct p9_fid *new_fid;
+
+
+	virtio_p9_pdu_readf(pdu, "dd", &fid, &flags);
+	new_fid = get_fid(p9dev, fid);
+
+	if (lstat(new_fid->abs_path, &st) < 0)
+		goto err_out;
+
+	stat2qid(&st, &qid);
+
+	if (is_dir(new_fid)) {
+		new_fid->dir = opendir(new_fid->abs_path);
+		if (!new_fid->dir)
+			goto err_out;
+	} else {
+		new_fid->fd  = open(new_fid->abs_path,
+				    virtio_p9_openflags(flags));
+		if (new_fid->fd < 0)
+			goto err_out;
+	}
+	/* FIXME!! need to send a proper iounit */
+	virtio_p9_pdu_writef(pdu, "Qd", &qid, 0);
+
+	*outlen = pdu->write_offset;
+	virtio_p9_set_reply_header(pdu, *outlen);
+	return;
+err_out:
+	virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+	return;
+}
+
+static void virtio_p9_create(struct p9_dev *p9dev,
+			     struct p9_pdu *pdu, u32 *outlen)
+{
+	int fd, ret;
+	char *name;
+	size_t size;
+	struct stat st;
+	struct p9_qid qid;
+	struct p9_fid *dfid;
+	char full_path[PATH_MAX];
+	char *tmp_path;
+	u32 dfid_val, flags, mode, gid;
+
+	virtio_p9_pdu_readf(pdu, "dsddd", &dfid_val,
+			    &name, &flags, &mode, &gid);
+	dfid = get_fid(p9dev, dfid_val);
+
+	if (get_full_path(full_path, sizeof(full_path), dfid, name) != 0)
+		goto err_out;
+
+	size = sizeof(dfid->abs_path) - (dfid->path - dfid->abs_path);
+
+	tmp_path = strdup(dfid->path);
+	if (!tmp_path)
+		goto err_out;
+
+	ret = snprintf(dfid->path, size, "%s/%s", tmp_path, name);
+	free(tmp_path);
+	if (ret >= (int)size) {
+		errno = ENAMETOOLONG;
+		if (size > 0)
+			dfid->path[size - 1] = '\x00';
+		goto err_out;
+	}
+
+	flags = virtio_p9_openflags(flags);
+
+	fd = open(full_path, flags | O_CREAT, mode);
+	if (fd < 0)
+		goto err_out;
+	dfid->fd = fd;
+
+	if (lstat(full_path, &st) < 0)
+		goto err_out;
+
+	ret = chmod(full_path, mode & 0777);
+	if (ret < 0)
+		goto err_out;
+
+	stat2qid(&st, &qid);
+	virtio_p9_pdu_writef(pdu, "Qd", &qid, 0);
+	*outlen = pdu->write_offset;
+	virtio_p9_set_reply_header(pdu, *outlen);
+	free(name);
+	return;
+err_out:
+	free(name);
+	virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+	return;
+}
+
+static void virtio_p9_mkdir(struct p9_dev *p9dev,
+			    struct p9_pdu *pdu, u32 *outlen)
+{
+	int ret;
+	char *name;
+	struct stat st;
+	struct p9_qid qid;
+	struct p9_fid *dfid;
+	char full_path[PATH_MAX];
+	u32 dfid_val, mode, gid;
+
+	virtio_p9_pdu_readf(pdu, "dsdd", &dfid_val,
+			    &name, &mode, &gid);
+	dfid = get_fid(p9dev, dfid_val);
+
+	if (get_full_path(full_path, sizeof(full_path), dfid, name) != 0)
+		goto err_out;
+
+	ret = mkdir(full_path, mode);
+	if (ret < 0)
+		goto err_out;
+
+	if (lstat(full_path, &st) < 0)
+		goto err_out;
+
+	ret = chmod(full_path, mode & 0777);
+	if (ret < 0)
+		goto err_out;
+
+	stat2qid(&st, &qid);
+	virtio_p9_pdu_writef(pdu, "Qd", &qid, 0);
+	*outlen = pdu->write_offset;
+	virtio_p9_set_reply_header(pdu, *outlen);
+	free(name);
+	return;
+err_out:
+	free(name);
+	virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+	return;
+}
+
+static int join_path(struct p9_fid *fid, const char *name)
+{
+	size_t len, size;
+
+	size = sizeof(fid->abs_path) - (fid->path - fid->abs_path);
+	len = strlen(name);
+	if (len >= size)
+		return -1;
+
+	strncpy(fid->path, name, size);
+	return 0;
+}
+
+static void virtio_p9_walk(struct p9_dev *p9dev,
+			   struct p9_pdu *pdu, u32 *outlen)
+{
+	u8 i;
+	u16 nwqid;
+	u16 nwname;
+	struct p9_qid wqid;
+	struct p9_fid *new_fid, *old_fid;
+	u32 fid_val, newfid_val;
+
+
+	virtio_p9_pdu_readf(pdu, "ddw", &fid_val, &newfid_val, &nwname);
+	new_fid	= get_fid(p9dev, newfid_val);
+
+	nwqid = 0;
+	if (nwname) {
+		struct p9_fid *fid = get_fid(p9dev, fid_val);
+
+		if (join_path(new_fid, fid->path) != 0) {
+			errno = ENAMETOOLONG;
+			goto err_out;
+		}
+
+		/* skip the space for count */
+		pdu->write_offset += sizeof(u16);
+		for (i = 0; i < nwname; i++) {
+			struct stat st;
+			char tmp[PATH_MAX] = {0};
+			char *str;
+			int ret;
+
+			virtio_p9_pdu_readf(pdu, "s", &str);
+
+			/* Format the new path we're 'walk'ing into */
+			ret = snprintf(tmp, sizeof(tmp), "%s/%s", new_fid->path, str);
+			if (ret >= (int)sizeof(tmp)) {
+				errno = ENAMETOOLONG;
+				goto err_out;
+			}
+
+			free(str);
+
+			if (stat_rel(p9dev, tmp, &st) != 0)
+				goto err_out;
+
+			stat2qid(&st, &wqid);
+			if (join_path(new_fid, tmp) != 0) {
+				errno = ENAMETOOLONG;
+				goto err_out;
+			}
+			new_fid->uid = fid->uid;
+			nwqid++;
+			virtio_p9_pdu_writef(pdu, "Q", &wqid);
+		}
+	} else {
+		/*
+		 * update write_offset so outlen gets the correct value
+		 */
+		pdu->write_offset += sizeof(u16);
+		old_fid = get_fid(p9dev, fid_val);
+		if (join_path(new_fid, old_fid->path) != 0) {
+			errno = ENAMETOOLONG;
+			goto err_out;
+		}
+		new_fid->uid    = old_fid->uid;
+	}
+	*outlen = pdu->write_offset;
+	pdu->write_offset = VIRTIO_9P_HDR_LEN;
+	virtio_p9_pdu_writef(pdu, "d", nwqid);
+	virtio_p9_set_reply_header(pdu, *outlen);
+	return;
+err_out:
+	virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+	return;
+}
+
+static void virtio_p9_attach(struct p9_dev *p9dev,
+			     struct p9_pdu *pdu, u32 *outlen)
+{
+	char *uname;
+	char *aname;
+	struct stat st;
+	struct p9_qid qid;
+	struct p9_fid *fid;
+	u32 fid_val, afid, uid;
+
+	virtio_p9_pdu_readf(pdu, "ddssd", &fid_val, &afid,
+			    &uname, &aname, &uid);
+
+	free(uname);
+	free(aname);
+
+	if (lstat(p9dev->root_dir, &st) < 0)
+		goto err_out;
+
+	stat2qid(&st, &qid);
+
+	fid = get_fid(p9dev, fid_val);
+	fid->uid = uid;
+	if (join_path(fid, "/") != 0) {
+		errno = ENAMETOOLONG;
+		goto err_out;
+	}
+
+	virtio_p9_pdu_writef(pdu, "Q", &qid);
+	*outlen = pdu->write_offset;
+	virtio_p9_set_reply_header(pdu, *outlen);
+	return;
+err_out:
+	virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+	return;
+}
+
+static void virtio_p9_fill_stat(struct p9_dev *p9dev,
+				struct stat *st, struct p9_stat_dotl *statl)
+{
+	memset(statl, 0, sizeof(*statl));
+	statl->st_mode		= st->st_mode;
+	statl->st_nlink		= st->st_nlink;
+	statl->st_uid		= KUIDT_INIT(st->st_uid);
+	statl->st_gid		= KGIDT_INIT(st->st_gid);
+	statl->st_rdev		= st->st_rdev;
+	statl->st_size		= st->st_size;
+	statl->st_blksize	= st->st_blksize;
+	statl->st_blocks	= st->st_blocks;
+	statl->st_atime_sec	= st->st_atime;
+	statl->st_atime_nsec	= st->st_atim.tv_nsec;
+	statl->st_mtime_sec	= st->st_mtime;
+	statl->st_mtime_nsec	= st->st_mtim.tv_nsec;
+	statl->st_ctime_sec	= st->st_ctime;
+	statl->st_ctime_nsec	= st->st_ctim.tv_nsec;
+	/* Currently we only support BASIC fields in stat */
+	statl->st_result_mask	= P9_STATS_BASIC;
+	stat2qid(st, &statl->qid);
+}
+
+static void virtio_p9_read(struct p9_dev *p9dev,
+			   struct p9_pdu *pdu, u32 *outlen)
+{
+	u64 offset;
+	u32 fid_val;
+	u16 iov_cnt;
+	void *iov_base;
+	size_t iov_len;
+	u32 count, rcount;
+	ssize_t res;
+	struct p9_fid *fid;
+
+
+	rcount = 0;
+	virtio_p9_pdu_readf(pdu, "dqd", &fid_val, &offset, &count);
+	fid = get_fid(p9dev, fid_val);
+
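+	/*
+	 * Save the first iovec entry, then step past the reply header and
+	 * the u32 count field so preadv() deposits the file data directly
+	 * into the guest buffer.
+	 */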
+	iov_base = pdu->in_iov[0].iov_base;
+	iov_len  = pdu->in_iov[0].iov_len;
+	iov_cnt  = pdu->in_iov_cnt;
+	pdu->in_iov[0].iov_base += VIRTIO_9P_HDR_LEN + sizeof(u32);
+	pdu->in_iov[0].iov_len -= VIRTIO_9P_HDR_LEN + sizeof(u32);
+	pdu->in_iov_cnt = virtio_p9_update_iov_cnt(pdu->in_iov,
+						   count,
+						   pdu->in_iov_cnt);
+	res = preadv(fid->fd, pdu->in_iov,
+		     pdu->in_iov_cnt, offset);
+	/* On error report a zero-byte read instead of a bogus huge count */
+	rcount = res < 0 ? 0 : (u32)res;
+	if (rcount > count)
+		rcount = count;
+	/*
+	 * Restore the original iov_base, so that the rest of
+	 * pdu_writef works correctly.
+	 */
+	pdu->in_iov[0].iov_base = iov_base;
+	pdu->in_iov[0].iov_len  = iov_len;
+	pdu->in_iov_cnt         = iov_cnt;
+
+	pdu->write_offset = VIRTIO_9P_HDR_LEN;
+	virtio_p9_pdu_writef(pdu, "d", rcount);
+	*outlen = pdu->write_offset + rcount;
+	virtio_p9_set_reply_header(pdu, *outlen);
+	return;
+}
+
+static int virtio_p9_dentry_size(struct dirent *dent)
+{
+	/*
+	 * Size of each dirent:
+	 * qid(13) + offset(8) + type(1) + name_len(2) + name
+	 */
+	return 24 + strlen(dent->d_name);
+}
+
+static void virtio_p9_readdir(struct p9_dev *p9dev,
+			      struct p9_pdu *pdu, u32 *outlen)
+{
+	u32 fid_val;
+	u32 count, rcount;
+	struct stat st;
+	struct p9_fid *fid;
+	struct dirent *dent;
+	u64 offset, old_offset;
+
+	rcount = 0;
+	virtio_p9_pdu_readf(pdu, "dqd", &fid_val, &offset, &count);
+	fid = get_fid(p9dev, fid_val);
+
+	if (!is_dir(fid)) {
+		errno = EINVAL;
+		goto err_out;
+	}
+
+	/* Seek to the specified offset */
+	seekdir(fid->dir, offset);
+
+	old_offset = offset;
+	/* If reading a dir, fill the buffer with p9_stat entries */
+	dent = readdir(fid->dir);
+
+	/* Skip the space for writing count */
+	pdu->write_offset += sizeof(u32);
+	while (dent) {
+		u32 read;
+		struct p9_qid qid;
+
+		if ((rcount + virtio_p9_dentry_size(dent)) > count) {
+			/* seek to the previous offset and return */
+			seekdir(fid->dir, old_offset);
+			break;
+		}
+		old_offset = dent->d_off;
+		if (stat_rel(p9dev, dent->d_name, &st) != 0)
+			memset(&st, -1, sizeof(st));
+		stat2qid(&st, &qid);
+		read = pdu->write_offset;
+		virtio_p9_pdu_writef(pdu, "Qqbs", &qid, dent->d_off,
+				     dent->d_type, dent->d_name);
+		rcount += pdu->write_offset - read;
+		dent = readdir(fid->dir);
+	}
+
+	pdu->write_offset = VIRTIO_9P_HDR_LEN;
+	virtio_p9_pdu_writef(pdu, "d", rcount);
+	*outlen = pdu->write_offset + rcount;
+	virtio_p9_set_reply_header(pdu, *outlen);
+	return;
+err_out:
+	virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+	return;
+}
+
+
+static void virtio_p9_getattr(struct p9_dev *p9dev,
+			      struct p9_pdu *pdu, u32 *outlen)
+{
+	u32 fid_val;
+	struct stat st;
+	u64 request_mask;
+	struct p9_fid *fid;
+	struct p9_stat_dotl statl;
+
+	virtio_p9_pdu_readf(pdu, "dq", &fid_val, &request_mask);
+	fid = get_fid(p9dev, fid_val);
+	if (lstat(fid->abs_path, &st) < 0)
+		goto err_out;
+
+	virtio_p9_fill_stat(p9dev, &st, &statl);
+	virtio_p9_pdu_writef(pdu, "A", &statl);
+	*outlen = pdu->write_offset;
+	virtio_p9_set_reply_header(pdu, *outlen);
+	return;
+err_out:
+	virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+	return;
+}
+
+/* FIXME!! from linux/fs.h */
+/*
+ * Attribute flags.  These should be or-ed together to figure out what
+ * has been changed!
+ */
+#define ATTR_MODE	(1 << 0)
+#define ATTR_UID	(1 << 1)
+#define ATTR_GID	(1 << 2)
+#define ATTR_SIZE	(1 << 3)
+#define ATTR_ATIME	(1 << 4)
+#define ATTR_MTIME	(1 << 5)
+#define ATTR_CTIME	(1 << 6)
+#define ATTR_ATIME_SET	(1 << 7)
+#define ATTR_MTIME_SET	(1 << 8)
+#define ATTR_FORCE	(1 << 9) /* Not a change, but force the change */
+#define ATTR_ATTR_FLAG	(1 << 10)
+#define ATTR_KILL_SUID	(1 << 11)
+#define ATTR_KILL_SGID	(1 << 12)
+#define ATTR_FILE	(1 << 13)
+#define ATTR_KILL_PRIV	(1 << 14)
+#define ATTR_OPEN	(1 << 15) /* Truncating from open(O_TRUNC) */
+#define ATTR_TIMES_SET	(1 << 16)
+
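+/* The basic attribute bits: ATTR_MODE through ATTR_CTIME */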
+#define ATTR_MASK    127
+
+static void virtio_p9_setattr(struct p9_dev *p9dev,
+			      struct p9_pdu *pdu, u32 *outlen)
+{
+	int ret = 0;
+	u32 fid_val;
+	struct p9_fid *fid;
+	struct p9_iattr_dotl p9attr;
+
+	virtio_p9_pdu_readf(pdu, "dI", &fid_val, &p9attr);
+	fid = get_fid(p9dev, fid_val);
+
+	if (p9attr.valid & ATTR_MODE) {
+		ret = chmod(fid->abs_path, p9attr.mode);
+		if (ret < 0)
+			goto err_out;
+	}
+	if (p9attr.valid & (ATTR_ATIME | ATTR_MTIME)) {
+		struct timespec times[2];
+		if (p9attr.valid & ATTR_ATIME) {
+			if (p9attr.valid & ATTR_ATIME_SET) {
+				times[0].tv_sec = p9attr.atime_sec;
+				times[0].tv_nsec = p9attr.atime_nsec;
+			} else {
+				times[0].tv_nsec = UTIME_NOW;
+			}
+		} else {
+			times[0].tv_nsec = UTIME_OMIT;
+		}
+		if (p9attr.valid & ATTR_MTIME) {
+			if (p9attr.valid & ATTR_MTIME_SET) {
+				times[1].tv_sec = p9attr.mtime_sec;
+				times[1].tv_nsec = p9attr.mtime_nsec;
+			} else {
+				times[1].tv_nsec = UTIME_NOW;
+			}
+		} else
+			times[1].tv_nsec = UTIME_OMIT;
+
+		ret = utimensat(-1, fid->abs_path, times, AT_SYMLINK_NOFOLLOW);
+		if (ret < 0)
+			goto err_out;
+	}
+	/*
+	 * If the only valid entry in iattr is ctime we can call
+	 * chown(-1,-1) to update the ctime of the file
+	 */
+	if ((p9attr.valid & (ATTR_UID | ATTR_GID)) ||
+	    ((p9attr.valid & ATTR_CTIME)
+	     && !((p9attr.valid & ATTR_MASK) & ~ATTR_CTIME))) {
+		if (!(p9attr.valid & ATTR_UID))
+			p9attr.uid = KUIDT_INIT(-1);
+
+		if (!(p9attr.valid & ATTR_GID))
+			p9attr.gid = KGIDT_INIT(-1);
+
+		ret = lchown(fid->abs_path, __kuid_val(p9attr.uid),
+				__kgid_val(p9attr.gid));
+		if (ret < 0)
+			goto err_out;
+	}
+	if (p9attr.valid & (ATTR_SIZE)) {
+		ret = truncate(fid->abs_path, p9attr.size);
+		if (ret < 0)
+			goto err_out;
+	}
+	*outlen = VIRTIO_9P_HDR_LEN;
+	virtio_p9_set_reply_header(pdu, *outlen);
+	return;
+err_out:
+	virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+	return;
+}
+
+static void virtio_p9_write(struct p9_dev *p9dev,
+			    struct p9_pdu *pdu, u32 *outlen)
+{
+
+	u64 offset;
+	u32 fid_val;
+	u32 count;
+	ssize_t res;
+	u16 iov_cnt;
+	void *iov_base;
+	size_t iov_len;
+	struct p9_fid *fid;
+	/* u32 fid + u64 offset + u32 count */
+	int twrite_size = sizeof(u32) + sizeof(u64) + sizeof(u32);
+
+	virtio_p9_pdu_readf(pdu, "dqd", &fid_val, &offset, &count);
+	fid = get_fid(p9dev, fid_val);
+
+	iov_base = pdu->out_iov[0].iov_base;
+	iov_len  = pdu->out_iov[0].iov_len;
+	iov_cnt  = pdu->out_iov_cnt;
+
+	/* Adjust the iovec to skip the header and meta data */
+	pdu->out_iov[0].iov_base += (sizeof(struct p9_msg) + twrite_size);
+	pdu->out_iov[0].iov_len -=  (sizeof(struct p9_msg) + twrite_size);
+	pdu->out_iov_cnt = virtio_p9_update_iov_cnt(pdu->out_iov, count,
+						    pdu->out_iov_cnt);
+	res = pwritev(fid->fd, pdu->out_iov, pdu->out_iov_cnt, offset);
+	/*
+	 * Restore the original iov_base, so that the rest of
+	 * pdu_readf works correctly.
+	 */
+	pdu->out_iov[0].iov_base = iov_base;
+	pdu->out_iov[0].iov_len  = iov_len;
+	pdu->out_iov_cnt         = iov_cnt;
+
+	if (res < 0)
+		goto err_out;
+	virtio_p9_pdu_writef(pdu, "d", res);
+	*outlen = pdu->write_offset;
+	virtio_p9_set_reply_header(pdu, *outlen);
+	return;
+err_out:
+	virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+	return;
+}
+
+static void virtio_p9_remove(struct p9_dev *p9dev,
+			     struct p9_pdu *pdu, u32 *outlen)
+{
+	int ret;
+	u32 fid_val;
+	struct p9_fid *fid;
+
+	virtio_p9_pdu_readf(pdu, "d", &fid_val);
+	fid = get_fid(p9dev, fid_val);
+
+	ret = remove(fid->abs_path);
+	if (ret < 0)
+		goto err_out;
+	*outlen = pdu->write_offset;
+	virtio_p9_set_reply_header(pdu, *outlen);
+	return;
+
+err_out:
+	virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+	return;
+}
+
+static void virtio_p9_rename(struct p9_dev *p9dev,
+			     struct p9_pdu *pdu, u32 *outlen)
+{
+	int ret;
+	u32 fid_val, new_fid_val;
+	struct p9_fid *fid, *new_fid;
+	char full_path[PATH_MAX], *new_name;
+
+	virtio_p9_pdu_readf(pdu, "dds", &fid_val, &new_fid_val, &new_name);
+	fid = get_fid(p9dev, fid_val);
+	new_fid = get_fid(p9dev, new_fid_val);
+
+	if (get_full_path(full_path, sizeof(full_path), new_fid, new_name) != 0)
+		goto err_out;
+
+	ret = rename(fid->abs_path, full_path);
+	if (ret < 0)
+		goto err_out;
+	*outlen = pdu->write_offset;
+	virtio_p9_set_reply_header(pdu, *outlen);
+	return;
+
+err_out:
+	virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+	return;
+}
+
+static void virtio_p9_readlink(struct p9_dev *p9dev,
+			       struct p9_pdu *pdu, u32 *outlen)
+{
+	int ret;
+	u32 fid_val;
+	struct p9_fid *fid;
+	char target_path[PATH_MAX];
+
+	virtio_p9_pdu_readf(pdu, "d", &fid_val);
+	fid = get_fid(p9dev, fid_val);
+
+	memset(target_path, 0, PATH_MAX);
+	ret = readlink(fid->abs_path, target_path, PATH_MAX - 1);
+	if (ret < 0)
+		goto err_out;
+
+	virtio_p9_pdu_writef(pdu, "s", target_path);
+	*outlen = pdu->write_offset;
+	virtio_p9_set_reply_header(pdu, *outlen);
+	return;
+err_out:
+	virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+	return;
+}
+
+static void virtio_p9_statfs(struct p9_dev *p9dev,
+			     struct p9_pdu *pdu, u32 *outlen)
+{
+	int ret;
+	u64 fsid;
+	u32 fid_val;
+	struct p9_fid *fid;
+	struct statfs stat_buf;
+
+	virtio_p9_pdu_readf(pdu, "d", &fid_val);
+	fid = get_fid(p9dev, fid_val);
+
+	ret = statfs(fid->abs_path, &stat_buf);
+	if (ret < 0)
+		goto err_out;
+	/* FIXME!! f_blocks needs update based on client msize */
+	fsid = (unsigned int) stat_buf.f_fsid.__val[0] |
+		(unsigned long long)stat_buf.f_fsid.__val[1] << 32;
+	virtio_p9_pdu_writef(pdu, "ddqqqqqqd", stat_buf.f_type,
+			     stat_buf.f_bsize, stat_buf.f_blocks,
+			     stat_buf.f_bfree, stat_buf.f_bavail,
+			     stat_buf.f_files, stat_buf.f_ffree,
+			     fsid, stat_buf.f_namelen);
+	*outlen = pdu->write_offset;
+	virtio_p9_set_reply_header(pdu, *outlen);
+	return;
+err_out:
+	virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+	return;
+}
+
+static void virtio_p9_mknod(struct p9_dev *p9dev,
+			    struct p9_pdu *pdu, u32 *outlen)
+{
+	int ret;
+	char *name;
+	struct stat st;
+	struct p9_fid *dfid;
+	struct p9_qid qid;
+	char full_path[PATH_MAX];
+	u32 fid_val, mode, major, minor, gid;
+
+	virtio_p9_pdu_readf(pdu, "dsdddd", &fid_val, &name, &mode,
+			    &major, &minor, &gid);
+
+	dfid = get_fid(p9dev, fid_val);
+
+	if (get_full_path(full_path, sizeof(full_path), dfid, name) != 0)
+		goto err_out;
+
+	ret = mknod(full_path, mode, makedev(major, minor));
+	if (ret < 0)
+		goto err_out;
+
+	if (lstat(full_path, &st) < 0)
+		goto err_out;
+
+	ret = chmod(full_path, mode & 0777);
+	if (ret < 0)
+		goto err_out;
+
+	stat2qid(&st, &qid);
+	virtio_p9_pdu_writef(pdu, "Q", &qid);
+	free(name);
+	*outlen = pdu->write_offset;
+	virtio_p9_set_reply_header(pdu, *outlen);
+	return;
+err_out:
+	free(name);
+	virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+	return;
+}
+
+static void virtio_p9_fsync(struct p9_dev *p9dev,
+			    struct p9_pdu *pdu, u32 *outlen)
+{
+	int ret, fd;
+	struct p9_fid *fid;
+	u32 fid_val, datasync;
+
+	virtio_p9_pdu_readf(pdu, "dd", &fid_val, &datasync);
+	fid = get_fid(p9dev, fid_val);
+
+	if (fid->dir)
+		fd = dirfd(fid->dir);
+	else
+		fd = fid->fd;
+
+	if (datasync)
+		ret = fdatasync(fd);
+	else
+		ret = fsync(fd);
+	if (ret < 0)
+		goto err_out;
+	*outlen = pdu->write_offset;
+	virtio_p9_set_reply_header(pdu, *outlen);
+	return;
+err_out:
+	virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+	return;
+}
+
+static void virtio_p9_symlink(struct p9_dev *p9dev,
+			      struct p9_pdu *pdu, u32 *outlen)
+{
+	int ret;
+	struct stat st;
+	u32 fid_val, gid;
+	struct p9_qid qid;
+	struct p9_fid *dfid;
+	char new_name[PATH_MAX];
+	char *old_path, *name;
+
+	virtio_p9_pdu_readf(pdu, "dssd", &fid_val, &name, &old_path, &gid);
+
+	dfid = get_fid(p9dev, fid_val);
+
+	if (get_full_path(new_name, sizeof(new_name), dfid, name) != 0)
+		goto err_out;
+
+	ret = symlink(old_path, new_name);
+	if (ret < 0)
+		goto err_out;
+
+	if (lstat(new_name, &st) < 0)
+		goto err_out;
+
+	stat2qid(&st, &qid);
+	virtio_p9_pdu_writef(pdu, "Q", &qid);
+	free(name);
+	free(old_path);
+	*outlen = pdu->write_offset;
+	virtio_p9_set_reply_header(pdu, *outlen);
+	return;
+err_out:
+	free(name);
+	free(old_path);
+	virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+	return;
+}
+
+static void virtio_p9_link(struct p9_dev *p9dev,
+			   struct p9_pdu *pdu, u32 *outlen)
+{
+	int ret;
+	char *name;
+	u32 fid_val, dfid_val;
+	struct p9_fid *dfid, *fid;
+	char full_path[PATH_MAX];
+
+	virtio_p9_pdu_readf(pdu, "dds", &dfid_val, &fid_val, &name);
+
+	dfid = get_fid(p9dev, dfid_val);
+	fid =  get_fid(p9dev, fid_val);
+
+	if (get_full_path(full_path, sizeof(full_path), dfid, name) != 0)
+		goto err_out;
+
+	ret = link(fid->abs_path, full_path);
+	if (ret < 0)
+		goto err_out;
+	free(name);
+	*outlen = pdu->write_offset;
+	virtio_p9_set_reply_header(pdu, *outlen);
+	return;
+err_out:
+	free(name);
+	virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+	return;
+
+}
+
+static void virtio_p9_lock(struct p9_dev *p9dev,
+			   struct p9_pdu *pdu, u32 *outlen)
+{
+	u8 ret;
+	u32 fid_val;
+	struct p9_flock flock;
+
+	virtio_p9_pdu_readf(pdu, "dbdqqds", &fid_val, &flock.type,
+			    &flock.flags, &flock.start, &flock.length,
+			    &flock.proc_id, &flock.client_id);
+
+	/* Just return success */
+	ret = P9_LOCK_SUCCESS;
+	virtio_p9_pdu_writef(pdu, "d", ret);
+	*outlen = pdu->write_offset;
+	virtio_p9_set_reply_header(pdu, *outlen);
+	free(flock.client_id);
+	return;
+}
+
+static void virtio_p9_getlock(struct p9_dev *p9dev,
+			      struct p9_pdu *pdu, u32 *outlen)
+{
+	u32 fid_val;
+	struct p9_getlock glock;
+	virtio_p9_pdu_readf(pdu, "dbqqds", &fid_val, &glock.type,
+			    &glock.start, &glock.length, &glock.proc_id,
+			    &glock.client_id);
+
+	/* Just return success */
+	glock.type = F_UNLCK;
+	virtio_p9_pdu_writef(pdu, "bqqds", glock.type,
+			     glock.start, glock.length, glock.proc_id,
+			     glock.client_id);
+	*outlen = pdu->write_offset;
+	virtio_p9_set_reply_header(pdu, *outlen);
+	free(glock.client_id);
+	return;
+}
+
+static int virtio_p9_ancestor(char *path, char *ancestor)
+{
+	int size = strlen(ancestor);
+	if (!strncmp(path, ancestor, size)) {
+		/*
+		 * Now check whether ancestor is a full name or
+		 * directory component and not just part
+		 * of a name.
+		 */
+		if (path[size] == '\0' || path[size] == '/')
+			return 1;
+	}
+	return 0;
+}
+
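+/*
+ * Rewrite the leading old_name component of the fid's path with new_name,
+ * preserving any trailing path components.
+ */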
+static int virtio_p9_fix_path(struct p9_fid *fid, char *old_name, char *new_name)
+{
+	int ret;
+	char *p, tmp_name[PATH_MAX];
+	size_t rp_sz = strlen(old_name);
+
+	if (rp_sz == strlen(fid->path)) {
+		/* replace the full name */
+		p = new_name;
+	} else {
+		/* save the trailing path details */
+		ret = snprintf(tmp_name, sizeof(tmp_name), "%s%s", new_name, fid->path + rp_sz);
+		if (ret >= (int)sizeof(tmp_name))
+			return -1;
+		p = tmp_name;
+	}
+
+	return join_path(fid, p);
+}
+
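+/*
+ * After a rename, fix up every fid whose path lies underneath the old
+ * name so it points into the new location.
+ */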
+static void rename_fids(struct p9_dev *p9dev, char *old_name, char *new_name)
+{
+	struct rb_node *node = rb_first(&p9dev->fids);
+
+	while (node) {
+		struct p9_fid *fid = rb_entry(node, struct p9_fid, node);
+
+		if (fid->fid != P9_NOFID && virtio_p9_ancestor(fid->path, old_name)) {
+				virtio_p9_fix_path(fid, old_name, new_name);
+		}
+		node = rb_next(node);
+	}
+}
+
+static void virtio_p9_renameat(struct p9_dev *p9dev,
+			       struct p9_pdu *pdu, u32 *outlen)
+{
+	int ret;
+	char *old_name, *new_name;
+	u32 old_dfid_val, new_dfid_val;
+	struct p9_fid *old_dfid, *new_dfid;
+	char old_full_path[PATH_MAX], new_full_path[PATH_MAX];
+
+
+	virtio_p9_pdu_readf(pdu, "dsds", &old_dfid_val, &old_name,
+			    &new_dfid_val, &new_name);
+
+	old_dfid = get_fid(p9dev, old_dfid_val);
+	new_dfid = get_fid(p9dev, new_dfid_val);
+
+	if (get_full_path(old_full_path, sizeof(old_full_path), old_dfid, old_name) != 0)
+		goto err_out;
+
+	if (get_full_path(new_full_path, sizeof(new_full_path), new_dfid, new_name) != 0)
+		goto err_out;
+
+	ret = rename(old_full_path, new_full_path);
+	if (ret < 0)
+		goto err_out;
+	/*
+	 * Now fix path in other fids, if the renamed path is part of
+	 * that.
+	 */
+	rename_fids(p9dev, old_name, new_name);
+	free(old_name);
+	free(new_name);
+	*outlen = pdu->write_offset;
+	virtio_p9_set_reply_header(pdu, *outlen);
+	return;
+err_out:
+	free(old_name);
+	free(new_name);
+	virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+	return;
+}
+
+static void virtio_p9_unlinkat(struct p9_dev *p9dev,
+			       struct p9_pdu *pdu, u32 *outlen)
+{
+	int ret;
+	char *name;
+	u32 fid_val, flags;
+	struct p9_fid *fid;
+	char full_path[PATH_MAX];
+
+	virtio_p9_pdu_readf(pdu, "dsd", &fid_val, &name, &flags);
+	fid = get_fid(p9dev, fid_val);
+
+	if (get_full_path(full_path, sizeof(full_path), fid, name) != 0)
+		goto err_out;
+
+	ret = remove(full_path);
+	if (ret < 0)
+		goto err_out;
+	free(name);
+	*outlen = pdu->write_offset;
+	virtio_p9_set_reply_header(pdu, *outlen);
+	return;
+err_out:
+	free(name);
+	virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+	return;
+}
+
+static void virtio_p9_flush(struct p9_dev *p9dev,
+				struct p9_pdu *pdu, u32 *outlen)
+{
+	u16 tag, oldtag;
+
+	virtio_p9_pdu_readf(pdu, "ww", &tag, &oldtag);
+	virtio_p9_pdu_writef(pdu, "w", tag);
+	*outlen = pdu->write_offset;
+	virtio_p9_set_reply_header(pdu, *outlen);
+
+	return;
+}
+
+static void virtio_p9_eopnotsupp(struct p9_dev *p9dev,
+				 struct p9_pdu *pdu, u32 *outlen)
+{
+	virtio_p9_error_reply(p9dev, pdu, EOPNOTSUPP, outlen);
+}
+
+typedef void p9_handler(struct p9_dev *p9dev,
+			struct p9_pdu *pdu, u32 *outlen);
+
+/* FIXME should be removed when merging with latest linus tree */
+#define P9_TRENAMEAT 74
+#define P9_TUNLINKAT 76
+
+static p9_handler *virtio_9p_dotl_handler [] = {
+	[P9_TREADDIR]     = virtio_p9_readdir,
+	[P9_TSTATFS]      = virtio_p9_statfs,
+	[P9_TGETATTR]     = virtio_p9_getattr,
+	[P9_TSETATTR]     = virtio_p9_setattr,
+	[P9_TXATTRWALK]   = virtio_p9_eopnotsupp,
+	[P9_TXATTRCREATE] = virtio_p9_eopnotsupp,
+	[P9_TMKNOD]       = virtio_p9_mknod,
+	[P9_TLOCK]        = virtio_p9_lock,
+	[P9_TGETLOCK]     = virtio_p9_getlock,
+	[P9_TRENAMEAT]    = virtio_p9_renameat,
+	[P9_TREADLINK]    = virtio_p9_readlink,
+	[P9_TUNLINKAT]    = virtio_p9_unlinkat,
+	[P9_TMKDIR]       = virtio_p9_mkdir,
+	[P9_TVERSION]     = virtio_p9_version,
+	[P9_TLOPEN]       = virtio_p9_open,
+	[P9_TATTACH]      = virtio_p9_attach,
+	[P9_TWALK]        = virtio_p9_walk,
+	[P9_TCLUNK]       = virtio_p9_clunk,
+	[P9_TFSYNC]       = virtio_p9_fsync,
+	[P9_TREAD]        = virtio_p9_read,
+	[P9_TFLUSH]       = virtio_p9_flush,
+	[P9_TLINK]        = virtio_p9_link,
+	[P9_TSYMLINK]     = virtio_p9_symlink,
+	[P9_TLCREATE]     = virtio_p9_create,
+	[P9_TWRITE]       = virtio_p9_write,
+	[P9_TREMOVE]      = virtio_p9_remove,
+	[P9_TRENAME]      = virtio_p9_rename,
+};
+
+static struct p9_pdu *virtio_p9_pdu_init(struct kvm *kvm, struct virt_queue *vq)
+{
+	struct p9_pdu *pdu = calloc(1, sizeof(*pdu));
+	if (!pdu)
+		return NULL;
+
+	/* skip the pdu header p9_msg */
+	pdu->read_offset	= VIRTIO_9P_HDR_LEN;
+	pdu->write_offset	= VIRTIO_9P_HDR_LEN;
+	pdu->queue_head		= virt_queue__get_inout_iov(kvm, vq, pdu->in_iov,
+					pdu->out_iov, &pdu->in_iov_cnt, &pdu->out_iov_cnt);
+	return pdu;
+}
+
+static u8 virtio_p9_get_cmd(struct p9_pdu *pdu)
+{
+	struct p9_msg *msg;
+	/*
+	 * We can peek directly into the pdu for a u8
+	 * value; host endianness won't be an issue.
+	 */
+	msg = pdu->out_iov[0].iov_base;
+	return msg->cmd;
+}
+
+static bool virtio_p9_do_io_request(struct kvm *kvm, struct p9_dev_job *job)
+{
+	u8 cmd;
+	u32 len = 0;
+	p9_handler *handler;
+	struct p9_dev *p9dev;
+	struct virt_queue *vq;
+	struct p9_pdu *p9pdu;
+
+	vq = job->vq;
+	p9dev = job->p9dev;
+
+	p9pdu = virtio_p9_pdu_init(kvm, vq);
+	cmd = virtio_p9_get_cmd(p9pdu);
+
+	if ((cmd >= ARRAY_SIZE(virtio_9p_dotl_handler)) ||
+	    !virtio_9p_dotl_handler[cmd])
+		handler = virtio_p9_eopnotsupp;
+	else
+		handler = virtio_9p_dotl_handler[cmd];
+
+	handler(p9dev, p9pdu, &len);
+	virt_queue__set_used_elem(vq, p9pdu->queue_head, len);
+	free(p9pdu);
+	return true;
+}
+
+static void virtio_p9_do_io(struct kvm *kvm, void *param)
+{
+	struct p9_dev_job *job = (struct p9_dev_job *)param;
+	struct p9_dev *p9dev   = job->p9dev;
+	struct virt_queue *vq  = job->vq;
+
+	while (virt_queue__available(vq)) {
+		virtio_p9_do_io_request(kvm, job);
+		p9dev->vdev.ops->signal_vq(kvm, &p9dev->vdev, vq - p9dev->vqs);
+	}
+}
+
+static u8 *get_config(struct kvm *kvm, void *dev)
+{
+	struct p9_dev *p9dev = dev;
+
+	return ((u8 *)(p9dev->config));
+}
+
+static u32 get_host_features(struct kvm *kvm, void *dev)
+{
+	return 1 << VIRTIO_9P_MOUNT_TAG;
+}
+
+static void set_guest_features(struct kvm *kvm, void *dev, u32 features)
+{
+	struct p9_dev *p9dev = dev;
+	struct virtio_9p_config *conf = p9dev->config;
+
+	p9dev->features = features;
+	conf->tag_len = virtio_host_to_guest_u16(&p9dev->vdev, conf->tag_len);
+}
+
+static void notify_status(struct kvm *kvm, void *dev, u32 status)
+{
+	struct p9_dev *p9dev = dev;
+	struct p9_fid *pfid, *next;
+
+	if (!(status & VIRTIO__STATUS_STOP))
+		return;
+
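+	/* The device is being reset: close every fid the guest had open */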
+	rbtree_postorder_for_each_entry_safe(pfid, next, &p9dev->fids, node)
+		close_fid(p9dev, pfid->fid);
+}
+
+static int init_vq(struct kvm *kvm, void *dev, u32 vq, u32 page_size, u32 align,
+		   u32 pfn)
+{
+	struct p9_dev *p9dev = dev;
+	struct p9_dev_job *job;
+	struct virt_queue *queue;
+	void *p;
+
+	compat__remove_message(compat_id);
+
+	queue		= &p9dev->vqs[vq];
+	queue->pfn	= pfn;
+	p		= virtio_get_vq(kvm, queue->pfn, page_size);
+	job		= &p9dev->jobs[vq];
+
+	vring_init(&queue->vring, VIRTQUEUE_NUM, p, align);
+	virtio_init_device_vq(&p9dev->vdev, queue);
+
+	*job		= (struct p9_dev_job) {
+		.vq		= queue,
+		.p9dev		= p9dev,
+	};
+	thread_pool__init_job(&job->job_id, kvm, virtio_p9_do_io, job);
+
+	return 0;
+}
+
+static void exit_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+	struct p9_dev *p9dev = dev;
+
+	thread_pool__cancel_job(&p9dev->jobs[vq].job_id);
+}
+
+static int notify_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+	struct p9_dev *p9dev = dev;
+
+	thread_pool__do_job(&p9dev->jobs[vq].job_id);
+
+	return 0;
+}
+
+static struct virt_queue *get_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+	struct p9_dev *p9dev = dev;
+
+	return &p9dev->vqs[vq];
+}
+
+static int get_size_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+	return VIRTQUEUE_NUM;
+}
+
+static int set_size_vq(struct kvm *kvm, void *dev, u32 vq, int size)
+{
+	/* FIXME: dynamic */
+	return size;
+}
+
+static int get_vq_count(struct kvm *kvm, void *dev)
+{
+	return NUM_VIRT_QUEUES;
+}
+
+struct virtio_ops p9_dev_virtio_ops = {
+	.get_config		= get_config,
+	.get_host_features	= get_host_features,
+	.set_guest_features	= set_guest_features,
+	.init_vq		= init_vq,
+	.exit_vq		= exit_vq,
+	.notify_status		= notify_status,
+	.notify_vq		= notify_vq,
+	.get_vq			= get_vq,
+	.get_size_vq		= get_size_vq,
+	.set_size_vq		= set_size_vq,
+	.get_vq_count		= get_vq_count,
+};
+
+int virtio_9p_rootdir_parser(const struct option *opt, const char *arg, int unset)
+{
+	char *tag_name;
+	char tmp[PATH_MAX];
+	struct kvm *kvm = opt->ptr;
+
+	/*
+	 * 9p dir can be of the form dirname,tag_name or
+	 * just dirname. In the latter case we use the
+	 * default tag name.
+	 */
+	tag_name = strchr(arg, ',');
+	if (tag_name) {
+		*tag_name = '\0';
+		tag_name++;
+	}
+	if (realpath(arg, tmp)) {
+		if (virtio_9p__register(kvm, tmp, tag_name) < 0)
+			die("Unable to initialize virtio 9p");
+	} else
+		die("Failed resolving 9p path");
+	return 0;
+}
+
+int virtio_9p_img_name_parser(const struct option *opt, const char *arg, int unset)
+{
+	char path[PATH_MAX];
+	struct stat st;
+	struct kvm *kvm = opt->ptr;
+
+	if (stat(arg, &st) == 0 &&
+	    S_ISDIR(st.st_mode)) {
+		char tmp[PATH_MAX];
+
+		if (kvm->cfg.using_rootfs)
+			die("Please use only one rootfs directory atmost");
+
+		if (realpath(arg, tmp) == 0 ||
+		    virtio_9p__register(kvm, tmp, "/dev/root") < 0)
+			die("Unable to initialize virtio 9p");
+		kvm->cfg.using_rootfs = 1;
+		return 0;
+	}
+
+	snprintf(path, PATH_MAX, "%s%s", kvm__get_dir(), arg);
+
+	if (stat(path, &st) == 0 &&
+	    S_ISDIR(st.st_mode)) {
+		char tmp[PATH_MAX];
+
+		if (kvm->cfg.using_rootfs)
+			die("Please use only one rootfs directory atmost");
+
+		if (realpath(path, tmp) == 0 ||
+		    virtio_9p__register(kvm, tmp, "/dev/root") < 0)
+			die("Unable to initialize virtio 9p");
+		if (virtio_9p__register(kvm, "/", "hostfs") < 0)
+			die("Unable to initialize virtio 9p");
+		kvm_setup_resolv(arg);
+		kvm->cfg.using_rootfs = kvm->cfg.custom_rootfs = 1;
+		kvm->cfg.custom_rootfs_name = arg;
+		return 0;
+	}
+
+	return -1;
+}
+
+int virtio_9p__init(struct kvm *kvm)
+{
+	struct p9_dev *p9dev;
+	int r;
+
+	list_for_each_entry(p9dev, &devs, list) {
+		r = virtio_init(kvm, p9dev, &p9dev->vdev, &p9_dev_virtio_ops,
+				VIRTIO_DEFAULT_TRANS(kvm), PCI_DEVICE_ID_VIRTIO_9P,
+				VIRTIO_ID_9P, PCI_CLASS_9P);
+		if (r < 0)
+			return r;
+	}
+
+	return 0;
+}
+virtio_dev_init(virtio_9p__init);
+
+int virtio_9p__register(struct kvm *kvm, const char *root, const char *tag_name)
+{
+	struct p9_dev *p9dev;
+	int err = 0;
+
+	p9dev = calloc(1, sizeof(*p9dev));
+	if (!p9dev)
+		return -ENOMEM;
+
+	if (!tag_name)
+		tag_name = VIRTIO_9P_DEFAULT_TAG;
+
+	p9dev->config = calloc(1, sizeof(*p9dev->config) + strlen(tag_name) + 1);
+	if (p9dev->config == NULL) {
+		err = -ENOMEM;
+		goto free_p9dev;
+	}
+
+	strncpy(p9dev->root_dir, root, sizeof(p9dev->root_dir));
+	p9dev->root_dir[sizeof(p9dev->root_dir)-1] = '\x00';
+
+	p9dev->config->tag_len = strlen(tag_name);
+	if (p9dev->config->tag_len > MAX_TAG_LEN) {
+		err = -EINVAL;
+		goto free_p9dev_config;
+	}
+
+	memcpy(&p9dev->config->tag, tag_name, strlen(tag_name));
+
+	list_add(&p9dev->list, &devs);
+
+	if (compat_id == -1)
+		compat_id = virtio_compat_add_message("virtio-9p", "CONFIG_NET_9P_VIRTIO");
+
+	return err;
+
+free_p9dev_config:
+	free(p9dev->config);
+free_p9dev:
+	free(p9dev);
+	return err;
+}
diff --git a/kvmtool/virtio/balloon.c b/kvmtool/virtio/balloon.c
new file mode 100644
index 0000000..8e8803f
--- /dev/null
+++ b/kvmtool/virtio/balloon.c
@@ -0,0 +1,295 @@
+#include "kvm/virtio-balloon.h"
+
+#include "kvm/virtio-pci-dev.h"
+
+#include "kvm/virtio.h"
+#include "kvm/util.h"
+#include "kvm/kvm.h"
+#include "kvm/pci.h"
+#include "kvm/threadpool.h"
+#include "kvm/guest_compat.h"
+#include "kvm/kvm-ipc.h"
+
+#include <linux/virtio_ring.h>
+#include <linux/virtio_balloon.h>
+
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <pthread.h>
+#include <sys/eventfd.h>
+
+#define NUM_VIRT_QUEUES		3
+#define VIRTIO_BLN_QUEUE_SIZE	128
+#define VIRTIO_BLN_INFLATE	0
+#define VIRTIO_BLN_DEFLATE	1
+#define VIRTIO_BLN_STATS	2
+
+struct bln_dev {
+	struct list_head	list;
+	struct virtio_device	vdev;
+
+	u32			features;
+
+	/* virtio queue */
+	struct virt_queue	vqs[NUM_VIRT_QUEUES];
+	struct thread_pool__job	jobs[NUM_VIRT_QUEUES];
+
+	struct virtio_balloon_stat stats[VIRTIO_BALLOON_S_NR];
+	struct virtio_balloon_stat *cur_stat;
+	u32			cur_stat_head;
+	u16			stat_count;
+	int			stat_waitfd;
+
+	struct virtio_balloon_config config;
+};
+
+static struct bln_dev bdev;
+static int compat_id = -1;
+
+static bool virtio_bln_do_io_request(struct kvm *kvm, struct bln_dev *bdev, struct virt_queue *queue)
+{
+	struct iovec iov[VIRTIO_BLN_QUEUE_SIZE];
+	unsigned int len = 0;
+	u16 out, in, head;
+	u32 *ptrs, i;
+
+	head	= virt_queue__get_iov(queue, iov, &out, &in, kvm);
+	ptrs	= iov[0].iov_base;
+	len	= iov[0].iov_len / sizeof(u32);
+
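+	/* Each u32 in the buffer is a guest page frame number to process */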
+	for (i = 0 ; i < len ; i++) {
+		void *guest_ptr;
+
+		guest_ptr = guest_flat_to_host(kvm, (u64)ptrs[i] << VIRTIO_BALLOON_PFN_SHIFT);
+		if (queue == &bdev->vqs[VIRTIO_BLN_INFLATE]) {
+			madvise(guest_ptr, 1 << VIRTIO_BALLOON_PFN_SHIFT, MADV_DONTNEED);
+			bdev->config.actual++;
+		} else if (queue == &bdev->vqs[VIRTIO_BLN_DEFLATE]) {
+			bdev->config.actual--;
+		}
+	}
+
+	virt_queue__set_used_elem(queue, head, len);
+
+	return true;
+}
+
+static bool virtio_bln_do_stat_request(struct kvm *kvm, struct bln_dev *bdev, struct virt_queue *queue)
+{
+	struct iovec iov[VIRTIO_BLN_QUEUE_SIZE];
+	u16 out, in, head;
+	struct virtio_balloon_stat *stat;
+	u64 wait_val = 1;
+
+	head = virt_queue__get_iov(queue, iov, &out, &in, kvm);
+	stat = iov[0].iov_base;
+
+	/* Initial empty stat buffer */
+	if (bdev->cur_stat == NULL) {
+		bdev->cur_stat = stat;
+		bdev->cur_stat_head = head;
+
+		return true;
+	}
+
+	memcpy(bdev->stats, stat, iov[0].iov_len);
+
+	bdev->stat_count = iov[0].iov_len / sizeof(struct virtio_balloon_stat);
+	bdev->cur_stat = stat;
+	bdev->cur_stat_head = head;
+
+	if (write(bdev->stat_waitfd, &wait_val, sizeof(wait_val)) <= 0)
+		return false;
+
+	return true;
+}
+
+static void virtio_bln_do_io(struct kvm *kvm, void *param)
+{
+	struct virt_queue *vq = param;
+
+	if (vq == &bdev.vqs[VIRTIO_BLN_STATS]) {
+		virtio_bln_do_stat_request(kvm, &bdev, vq);
+		bdev.vdev.ops->signal_vq(kvm, &bdev.vdev, VIRTIO_BLN_STATS);
+		return;
+	}
+
+	while (virt_queue__available(vq)) {
+		virtio_bln_do_io_request(kvm, &bdev, vq);
+		bdev.vdev.ops->signal_vq(kvm, &bdev.vdev, vq - bdev.vqs);
+	}
+}
+
+static int virtio_bln__collect_stats(struct kvm *kvm)
+{
+	u64 tmp;
+
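+	/*
+	 * Hand the previous stats buffer back to the guest, then block on
+	 * the eventfd until virtio_bln_do_stat_request() sees fresh stats.
+	 */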
+	virt_queue__set_used_elem(&bdev.vqs[VIRTIO_BLN_STATS], bdev.cur_stat_head,
+				  sizeof(struct virtio_balloon_stat));
+	bdev.vdev.ops->signal_vq(kvm, &bdev.vdev, VIRTIO_BLN_STATS);
+
+	if (read(bdev.stat_waitfd, &tmp, sizeof(tmp)) <= 0)
+		return -EFAULT;
+
+	return 0;
+}
+
+static void virtio_bln__print_stats(struct kvm *kvm, int fd, u32 type, u32 len, u8 *msg)
+{
+	int r;
+
+	if (WARN_ON(type != KVM_IPC_STAT || len))
+		return;
+
+	if (virtio_bln__collect_stats(kvm) < 0)
+		return;
+
+	r = write(fd, bdev.stats, sizeof(bdev.stats));
+	if (r < 0)
+		pr_warning("Failed sending memory stats");
+}
+
+static void handle_mem(struct kvm *kvm, int fd, u32 type, u32 len, u8 *msg)
+{
+	int mem;
+
+	if (WARN_ON(type != KVM_IPC_BALLOON || len != sizeof(int)))
+		return;
+
+	mem = *(int *)msg;
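+	/* The request is in megabytes; 256 4k pages make up 1MB */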
+	if (mem > 0) {
+		bdev.config.num_pages += 256 * mem;
+	} else if (mem < 0) {
+		if (bdev.config.num_pages < (u32)(256 * (-mem)))
+			return;
+
+		bdev.config.num_pages += 256 * mem;
+	}
+
+	/* Notify that the configuration space has changed */
+	bdev.vdev.ops->signal_config(kvm, &bdev.vdev);
+}
+
+static u8 *get_config(struct kvm *kvm, void *dev)
+{
+	struct bln_dev *bdev = dev;
+
+	return ((u8 *)(&bdev->config));
+}
+
+static u32 get_host_features(struct kvm *kvm, void *dev)
+{
+	return 1 << VIRTIO_BALLOON_F_STATS_VQ;
+}
+
+static void set_guest_features(struct kvm *kvm, void *dev, u32 features)
+{
+	struct bln_dev *bdev = dev;
+
+	bdev->features = features;
+}
+
+static void notify_status(struct kvm *kvm, void *dev, u32 status)
+{
+}
+
+static int init_vq(struct kvm *kvm, void *dev, u32 vq, u32 page_size, u32 align,
+		   u32 pfn)
+{
+	struct bln_dev *bdev = dev;
+	struct virt_queue *queue;
+	void *p;
+
+	compat__remove_message(compat_id);
+
+	queue		= &bdev->vqs[vq];
+	queue->pfn	= pfn;
+	p		= virtio_get_vq(kvm, queue->pfn, page_size);
+
+	thread_pool__init_job(&bdev->jobs[vq], kvm, virtio_bln_do_io, queue);
+	vring_init(&queue->vring, VIRTIO_BLN_QUEUE_SIZE, p, align);
+	virtio_init_device_vq(&bdev->vdev, queue);
+
+	return 0;
+}
+
+static int notify_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+	struct bln_dev *bdev = dev;
+
+	thread_pool__do_job(&bdev->jobs[vq]);
+
+	return 0;
+}
+
+static struct virt_queue *get_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+	struct bln_dev *bdev = dev;
+
+	return &bdev->vqs[vq];
+}
+
+static int get_size_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+	return VIRTIO_BLN_QUEUE_SIZE;
+}
+
+static int set_size_vq(struct kvm *kvm, void *dev, u32 vq, int size)
+{
+	/* FIXME: dynamic */
+	return size;
+}
+
+static int get_vq_count(struct kvm *kvm, void *dev)
+{
+	return NUM_VIRT_QUEUES;
+}
+
+struct virtio_ops bln_dev_virtio_ops = {
+	.get_config		= get_config,
+	.get_host_features	= get_host_features,
+	.set_guest_features	= set_guest_features,
+	.init_vq		= init_vq,
+	.notify_status		= notify_status,
+	.notify_vq		= notify_vq,
+	.get_vq			= get_vq,
+	.get_size_vq		= get_size_vq,
+	.set_size_vq            = set_size_vq,
+	.get_vq_count		= get_vq_count,
+};
+
+int virtio_bln__init(struct kvm *kvm)
+{
+	int r;
+
+	if (!kvm->cfg.balloon)
+		return 0;
+
+	kvm_ipc__register_handler(KVM_IPC_BALLOON, handle_mem);
+	kvm_ipc__register_handler(KVM_IPC_STAT, virtio_bln__print_stats);
+
+	bdev.stat_waitfd	= eventfd(0, 0);
+	memset(&bdev.config, 0, sizeof(struct virtio_balloon_config));
+
+	r = virtio_init(kvm, &bdev, &bdev.vdev, &bln_dev_virtio_ops,
+			VIRTIO_DEFAULT_TRANS(kvm), PCI_DEVICE_ID_VIRTIO_BLN,
+			VIRTIO_ID_BALLOON, PCI_CLASS_BLN);
+	if (r < 0)
+		return r;
+
+	if (compat_id == -1)
+		compat_id = virtio_compat_add_message("virtio-balloon", "CONFIG_VIRTIO_BALLOON");
+
+	return 0;
+}
+virtio_dev_init(virtio_bln__init);
+
+int virtio_bln__exit(struct kvm *kvm)
+{
+	return 0;
+}
+virtio_dev_exit(virtio_bln__exit);
diff --git a/kvmtool/virtio/blk.c b/kvmtool/virtio/blk.c
new file mode 100644
index 0000000..4d02d10
--- /dev/null
+++ b/kvmtool/virtio/blk.c
@@ -0,0 +1,381 @@
+#include "kvm/virtio-blk.h"
+
+#include "kvm/virtio-pci-dev.h"
+#include "kvm/disk-image.h"
+#include "kvm/mutex.h"
+#include "kvm/util.h"
+#include "kvm/kvm.h"
+#include "kvm/pci.h"
+#include "kvm/threadpool.h"
+#include "kvm/ioeventfd.h"
+#include "kvm/guest_compat.h"
+#include "kvm/virtio-pci.h"
+#include "kvm/virtio.h"
+
+#include <linux/virtio_ring.h>
+#include <linux/virtio_blk.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/types.h>
+#include <pthread.h>
+
+#define VIRTIO_BLK_MAX_DEV		4
+
+/*
+ * the header and status consume too entries
+ */
+#define DISK_SEG_MAX			(VIRTIO_BLK_QUEUE_SIZE - 2)
+#define VIRTIO_BLK_QUEUE_SIZE		256
+#define NUM_VIRT_QUEUES			1
+
+struct blk_dev_req {
+	struct virt_queue		*vq;
+	struct blk_dev			*bdev;
+	struct iovec			iov[VIRTIO_BLK_QUEUE_SIZE];
+	u16				out, in, head;
+	struct kvm			*kvm;
+};
+
+struct blk_dev {
+	struct mutex			mutex;
+
+	struct list_head		list;
+
+	struct virtio_device		vdev;
+	struct virtio_blk_config	blk_config;
+	struct disk_image		*disk;
+	u32				features;
+
+	struct virt_queue		vqs[NUM_VIRT_QUEUES];
+	struct blk_dev_req		reqs[VIRTIO_BLK_QUEUE_SIZE];
+
+	pthread_t			io_thread;
+	int				io_efd;
+
+	struct kvm			*kvm;
+};
+
+static LIST_HEAD(bdevs);
+static int compat_id = -1;
+
+void virtio_blk_complete(void *param, long len)
+{
+	struct blk_dev_req *req = param;
+	struct blk_dev *bdev = req->bdev;
+	int queueid = req->vq - bdev->vqs;
+	u8 *status;
+
+	/* status */
+	status	= req->iov[req->out + req->in - 1].iov_base;
+	*status	= (len < 0) ? VIRTIO_BLK_S_IOERR : VIRTIO_BLK_S_OK;
+
+	mutex_lock(&bdev->mutex);
+	virt_queue__set_used_elem(req->vq, req->head, len);
+	mutex_unlock(&bdev->mutex);
+
+	if (virtio_queue__should_signal(&bdev->vqs[queueid]))
+		bdev->vdev.ops->signal_vq(req->kvm, &bdev->vdev, queueid);
+}
+
+static void virtio_blk_do_io_request(struct kvm *kvm, struct virt_queue *vq, struct blk_dev_req *req)
+{
+	struct virtio_blk_outhdr *req_hdr;
+	ssize_t block_cnt;
+	struct blk_dev *bdev;
+	struct iovec *iov;
+	u16 out, in;
+	u32 type;
+	u64 sector;
+
+	block_cnt	= -1;
+	bdev		= req->bdev;
+	iov		= req->iov;
+	out		= req->out;
+	in		= req->in;
+	req_hdr		= iov[0].iov_base;
+
+	type = virtio_guest_to_host_u32(vq, req_hdr->type);
+	sector = virtio_guest_to_host_u64(vq, req_hdr->sector);
+
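+	/*
+	 * iov[0] holds the request header and the last entry the status
+	 * byte, hence the iov + 1 / in + out - 2 data ranges below.
+	 */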
+	switch (type) {
+	case VIRTIO_BLK_T_IN:
+		block_cnt = disk_image__read(bdev->disk, sector,
+				iov + 1, in + out - 2, req);
+		break;
+	case VIRTIO_BLK_T_OUT:
+		block_cnt = disk_image__write(bdev->disk, sector,
+				iov + 1, in + out - 2, req);
+		break;
+	case VIRTIO_BLK_T_FLUSH:
+		block_cnt = disk_image__flush(bdev->disk);
+		virtio_blk_complete(req, block_cnt);
+		break;
+	case VIRTIO_BLK_T_GET_ID:
+		block_cnt = VIRTIO_BLK_ID_BYTES;
+		disk_image__get_serial(bdev->disk,
+				(iov + 1)->iov_base, &block_cnt);
+		virtio_blk_complete(req, block_cnt);
+		break;
+	default:
+		pr_warning("request type %d", type);
+		block_cnt	= -1;
+		break;
+	}
+}
+
+static void virtio_blk_do_io(struct kvm *kvm, struct virt_queue *vq, struct blk_dev *bdev)
+{
+	struct blk_dev_req *req;
+	u16 head;
+
+	while (virt_queue__available(vq)) {
+		head		= virt_queue__pop(vq);
+		req		= &bdev->reqs[head];
+		req->head	= virt_queue__get_head_iov(vq, req->iov, &req->out,
+					&req->in, head, kvm);
+		req->vq		= vq;
+
+		virtio_blk_do_io_request(kvm, vq, req);
+	}
+}
+
+static u8 *get_config(struct kvm *kvm, void *dev)
+{
+	struct blk_dev *bdev = dev;
+
+	return ((u8 *)(&bdev->blk_config));
+}
+
+static u32 get_host_features(struct kvm *kvm, void *dev)
+{
+	struct blk_dev *bdev = dev;
+
+	return	1UL << VIRTIO_BLK_F_SEG_MAX
+		| 1UL << VIRTIO_BLK_F_FLUSH
+		| 1UL << VIRTIO_RING_F_EVENT_IDX
+		| 1UL << VIRTIO_RING_F_INDIRECT_DESC
+		| (bdev->disk->readonly ? 1UL << VIRTIO_BLK_F_RO : 0);
+}
+
+static void set_guest_features(struct kvm *kvm, void *dev, u32 features)
+{
+	struct blk_dev *bdev = dev;
+	struct virtio_blk_config *conf = &bdev->blk_config;
+
+	bdev->features = features;
+
+	conf->capacity = virtio_host_to_guest_u64(&bdev->vdev, conf->capacity);
+	conf->size_max = virtio_host_to_guest_u32(&bdev->vdev, conf->size_max);
+	conf->seg_max = virtio_host_to_guest_u32(&bdev->vdev, conf->seg_max);
+
+	/* Geometry */
+	conf->geometry.cylinders = virtio_host_to_guest_u16(&bdev->vdev,
+						conf->geometry.cylinders);
+
+	conf->blk_size = virtio_host_to_guest_u32(&bdev->vdev, conf->blk_size);
+	conf->min_io_size = virtio_host_to_guest_u16(&bdev->vdev, conf->min_io_size);
+	conf->opt_io_size = virtio_host_to_guest_u32(&bdev->vdev, conf->opt_io_size);
+}
+
+static void notify_status(struct kvm *kvm, void *dev, u32 status)
+{
+}
+
+static void *virtio_blk_thread(void *dev)
+{
+	struct blk_dev *bdev = dev;
+	u64 data;
+	int r;
+
+	kvm__set_thread_name("virtio-blk-io");
+
+	while (1) {
+		r = read(bdev->io_efd, &data, sizeof(u64));
+		if (r < 0)
+			continue;
+		virtio_blk_do_io(bdev->kvm, &bdev->vqs[0], bdev);
+	}
+
+	pthread_exit(NULL);
+	return NULL;
+}
+
+static int init_vq(struct kvm *kvm, void *dev, u32 vq, u32 page_size, u32 align,
+		   u32 pfn)
+{
+	unsigned int i;
+	struct blk_dev *bdev = dev;
+	struct virt_queue *queue;
+	void *p;
+
+	compat__remove_message(compat_id);
+
+	queue		= &bdev->vqs[vq];
+	queue->pfn	= pfn;
+	p		= virtio_get_vq(kvm, queue->pfn, page_size);
+
+	vring_init(&queue->vring, VIRTIO_BLK_QUEUE_SIZE, p, align);
+	virtio_init_device_vq(&bdev->vdev, queue);
+
+	if (vq != 0)
+		return 0;
+
+	for (i = 0; i < ARRAY_SIZE(bdev->reqs); i++) {
+		bdev->reqs[i] = (struct blk_dev_req) {
+			.bdev = bdev,
+			.kvm = kvm,
+		};
+	}
+
+	mutex_init(&bdev->mutex);
+	bdev->io_efd = eventfd(0, 0);
+	if (bdev->io_efd < 0)
+		return -errno;
+
+	if (pthread_create(&bdev->io_thread, NULL, virtio_blk_thread, bdev))
+		return -errno;
+
+	return 0;
+}
+
+static void exit_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+	struct blk_dev *bdev = dev;
+
+	if (vq != 0)
+		return;
+
+	close(bdev->io_efd);
+	pthread_cancel(bdev->io_thread);
+	pthread_join(bdev->io_thread, NULL);
+
+	disk_image__wait(bdev->disk);
+}
+
+static int notify_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+	struct blk_dev *bdev = dev;
+	u64 data = 1;
+	int r;
+
+	r = write(bdev->io_efd, &data, sizeof(data));
+	if (r < 0)
+		return r;
+
+	return 0;
+}
+
+static struct virt_queue *get_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+	struct blk_dev *bdev = dev;
+
+	return &bdev->vqs[vq];
+}
+
+static int get_size_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+	/* FIXME: dynamic */
+	return VIRTIO_BLK_QUEUE_SIZE;
+}
+
+static int set_size_vq(struct kvm *kvm, void *dev, u32 vq, int size)
+{
+	/* FIXME: dynamic */
+	return size;
+}
+
+static int get_vq_count(struct kvm *kvm, void *dev)
+{
+	return NUM_VIRT_QUEUES;
+}
+
+static struct virtio_ops blk_dev_virtio_ops = {
+	.get_config		= get_config,
+	.get_host_features	= get_host_features,
+	.set_guest_features	= set_guest_features,
+	.get_vq_count		= get_vq_count,
+	.init_vq		= init_vq,
+	.exit_vq		= exit_vq,
+	.notify_status		= notify_status,
+	.notify_vq		= notify_vq,
+	.get_vq			= get_vq,
+	.get_size_vq		= get_size_vq,
+	.set_size_vq		= set_size_vq,
+};
+
+static int virtio_blk__init_one(struct kvm *kvm, struct disk_image *disk)
+{
+	struct blk_dev *bdev;
+	int r;
+
+	if (!disk)
+		return -EINVAL;
+
+	bdev = calloc(1, sizeof(struct blk_dev));
+	if (bdev == NULL)
+		return -ENOMEM;
+
+	*bdev = (struct blk_dev) {
+		.disk			= disk,
+		.blk_config		= (struct virtio_blk_config) {
+			.capacity	= disk->size / SECTOR_SIZE,
+			.seg_max	= DISK_SEG_MAX,
+		},
+		.kvm			= kvm,
+	};
+
+	list_add_tail(&bdev->list, &bdevs);
+
+	r = virtio_init(kvm, bdev, &bdev->vdev, &blk_dev_virtio_ops,
+			VIRTIO_DEFAULT_TRANS(kvm), PCI_DEVICE_ID_VIRTIO_BLK,
+			VIRTIO_ID_BLOCK, PCI_CLASS_BLK);
+	if (r < 0)
+		return r;
+
+	disk_image__set_callback(bdev->disk, virtio_blk_complete);
+
+	if (compat_id == -1)
+		compat_id = virtio_compat_add_message("virtio-blk", "CONFIG_VIRTIO_BLK");
+
+	return 0;
+}
+
+static int virtio_blk__exit_one(struct kvm *kvm, struct blk_dev *bdev)
+{
+	list_del(&bdev->list);
+	free(bdev);
+
+	return 0;
+}
+
+int virtio_blk__init(struct kvm *kvm)
+{
+	int i, r = 0;
+
+	for (i = 0; i < kvm->nr_disks; i++) {
+		if (kvm->disks[i]->wwpn)
+			continue;
+		r = virtio_blk__init_one(kvm, kvm->disks[i]);
+		if (r < 0)
+			goto cleanup;
+	}
+
+	return 0;
+cleanup:
+	virtio_blk__exit(kvm);
+	return r;
+}
+virtio_dev_init(virtio_blk__init);
+
+int virtio_blk__exit(struct kvm *kvm)
+{
+	while (!list_empty(&bdevs)) {
+		struct blk_dev *bdev;
+
+		bdev = list_first_entry(&bdevs, struct blk_dev, list);
+		virtio_blk__exit_one(kvm, bdev);
+	}
+
+	return 0;
+}
+virtio_dev_exit(virtio_blk__exit);
diff --git a/kvmtool/virtio/console.c b/kvmtool/virtio/console.c
new file mode 100644
index 0000000..e0b98df
--- /dev/null
+++ b/kvmtool/virtio/console.c
@@ -0,0 +1,255 @@
+#include "kvm/virtio-console.h"
+#include "kvm/virtio-pci-dev.h"
+#include "kvm/disk-image.h"
+#include "kvm/virtio.h"
+#include "kvm/ioport.h"
+#include "kvm/util.h"
+#include "kvm/term.h"
+#include "kvm/mutex.h"
+#include "kvm/kvm.h"
+#include "kvm/pci.h"
+#include "kvm/threadpool.h"
+#include "kvm/irq.h"
+#include "kvm/guest_compat.h"
+
+#include <linux/virtio_console.h>
+#include <linux/virtio_ring.h>
+#include <linux/virtio_blk.h>
+
+#include <sys/uio.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <termios.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+#define VIRTIO_CONSOLE_QUEUE_SIZE	128
+#define VIRTIO_CONSOLE_NUM_QUEUES	2
+#define VIRTIO_CONSOLE_RX_QUEUE		0
+#define VIRTIO_CONSOLE_TX_QUEUE		1
+
+struct con_dev {
+	struct mutex			mutex;
+
+	struct virtio_device		vdev;
+	struct virt_queue		vqs[VIRTIO_CONSOLE_NUM_QUEUES];
+	struct virtio_console_config	config;
+	u32				features;
+	int				vq_ready;
+
+	struct thread_pool__job		jobs[VIRTIO_CONSOLE_NUM_QUEUES];
+};
+
+static struct con_dev cdev = {
+	.mutex				= MUTEX_INITIALIZER,
+
+	.vq_ready			= 0,
+
+	.config = {
+		.cols			= 80,
+		.rows			= 24,
+		.max_nr_ports		= 1,
+	},
+};
+
+static int compat_id = -1;
+
+/*
+ * Interrupts are injected for hvc0 only.
+ */
+static void virtio_console__inject_interrupt_callback(struct kvm *kvm, void *param)
+{
+	struct iovec iov[VIRTIO_CONSOLE_QUEUE_SIZE];
+	struct virt_queue *vq;
+	u16 out, in;
+	u16 head;
+	int len;
+
+	mutex_lock(&cdev.mutex);
+
+	vq = param;
+
+	if (term_readable(0) && virt_queue__available(vq)) {
+		head = virt_queue__get_iov(vq, iov, &out, &in, kvm);
+		len = term_getc_iov(kvm, iov, in, 0);
+		virt_queue__set_used_elem(vq, head, len);
+		cdev.vdev.ops->signal_vq(kvm, &cdev.vdev, vq - cdev.vqs);
+	}
+
+	mutex_unlock(&cdev.mutex);
+}
+
+void virtio_console__inject_interrupt(struct kvm *kvm)
+{
+	if (kvm->cfg.active_console != CONSOLE_VIRTIO)
+		return;
+
+	mutex_lock(&cdev.mutex);
+	if (cdev.vq_ready)
+		thread_pool__do_job(&cdev.jobs[VIRTIO_CONSOLE_RX_QUEUE]);
+	mutex_unlock(&cdev.mutex);
+}
+
+static void virtio_console_handle_callback(struct kvm *kvm, void *param)
+{
+	struct iovec iov[VIRTIO_CONSOLE_QUEUE_SIZE];
+	struct virt_queue *vq;
+	u16 out, in;
+	u16 head;
+	u32 len;
+
+	vq = param;
+
+	/*
+	 * The current Linux implementation polls for the buffer
+	 * to be used, rather than waiting for an interrupt.
+	 * So there is no need to inject an interrupt for the tx path.
+	 */
+
+	while (virt_queue__available(vq)) {
+		head = virt_queue__get_iov(vq, iov, &out, &in, kvm);
+		len = term_putc_iov(iov, out, 0);
+		virt_queue__set_used_elem(vq, head, len);
+	}
+
+}
+
+static u8 *get_config(struct kvm *kvm, void *dev)
+{
+	struct con_dev *cdev = dev;
+
+	return ((u8 *)(&cdev->config));
+}
+
+static u32 get_host_features(struct kvm *kvm, void *dev)
+{
+	return 0;
+}
+
+static void set_guest_features(struct kvm *kvm, void *dev, u32 features)
+{
+	struct con_dev *cdev = dev;
+	struct virtio_console_config *conf = &cdev->config;
+
+	conf->cols = virtio_host_to_guest_u16(&cdev->vdev, conf->cols);
+	conf->rows = virtio_host_to_guest_u16(&cdev->vdev, conf->rows);
+	conf->max_nr_ports = virtio_host_to_guest_u32(&cdev->vdev, conf->max_nr_ports);
+}
+
+static void notify_status(struct kvm *kvm, void *dev, u32 status)
+{
+}
+
+static int init_vq(struct kvm *kvm, void *dev, u32 vq, u32 page_size, u32 align,
+		   u32 pfn)
+{
+	struct virt_queue *queue;
+	void *p;
+
+	BUG_ON(vq >= VIRTIO_CONSOLE_NUM_QUEUES);
+
+	compat__remove_message(compat_id);
+
+	queue		= &cdev.vqs[vq];
+	queue->pfn	= pfn;
+	p		= virtio_get_vq(kvm, queue->pfn, page_size);
+
+	vring_init(&queue->vring, VIRTIO_CONSOLE_QUEUE_SIZE, p, align);
+	virtio_init_device_vq(&cdev.vdev, queue);
+
+	if (vq == VIRTIO_CONSOLE_TX_QUEUE) {
+		thread_pool__init_job(&cdev.jobs[vq], kvm, virtio_console_handle_callback, queue);
+	} else if (vq == VIRTIO_CONSOLE_RX_QUEUE) {
+		thread_pool__init_job(&cdev.jobs[vq], kvm, virtio_console__inject_interrupt_callback, queue);
+		/* Tell the waiting poll thread that we're ready to go */
+		mutex_lock(&cdev.mutex);
+		cdev.vq_ready = 1;
+		mutex_unlock(&cdev.mutex);
+	}
+
+	return 0;
+}
+
+static void exit_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+	if (vq == VIRTIO_CONSOLE_RX_QUEUE) {
+		mutex_lock(&cdev.mutex);
+		cdev.vq_ready = 0;
+		mutex_unlock(&cdev.mutex);
+		thread_pool__cancel_job(&cdev.jobs[vq]);
+	} else if (vq == VIRTIO_CONSOLE_TX_QUEUE) {
+		thread_pool__cancel_job(&cdev.jobs[vq]);
+	}
+}
+
+static int notify_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+	struct con_dev *cdev = dev;
+
+	thread_pool__do_job(&cdev->jobs[vq]);
+
+	return 0;
+}
+
+static struct virt_queue *get_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+	struct con_dev *cdev = dev;
+
+	return &cdev->vqs[vq];
+}
+
+static int get_size_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+	return VIRTIO_CONSOLE_QUEUE_SIZE;
+}
+
+static int set_size_vq(struct kvm *kvm, void *dev, u32 vq, int size)
+{
+	/* FIXME: dynamic */
+	return size;
+}
+
+static int get_vq_count(struct kvm *kvm, void *dev)
+{
+	return VIRTIO_CONSOLE_NUM_QUEUES;
+}
+
+static struct virtio_ops con_dev_virtio_ops = {
+	.get_config		= get_config,
+	.get_host_features	= get_host_features,
+	.set_guest_features	= set_guest_features,
+	.get_vq_count		= get_vq_count,
+	.init_vq		= init_vq,
+	.exit_vq		= exit_vq,
+	.notify_status		= notify_status,
+	.notify_vq		= notify_vq,
+	.get_vq			= get_vq,
+	.get_size_vq		= get_size_vq,
+	.set_size_vq		= set_size_vq,
+};
+
+int virtio_console__init(struct kvm *kvm)
+{
+	int r;
+
+	if (kvm->cfg.active_console != CONSOLE_VIRTIO)
+		return 0;
+
+	r = virtio_init(kvm, &cdev, &cdev.vdev, &con_dev_virtio_ops,
+			VIRTIO_DEFAULT_TRANS(kvm), PCI_DEVICE_ID_VIRTIO_CONSOLE,
+			VIRTIO_ID_CONSOLE, PCI_CLASS_CONSOLE);
+	if (r < 0)
+		return r;
+
+	if (compat_id == -1)
+		compat_id = virtio_compat_add_message("virtio-console", "CONFIG_VIRTIO_CONSOLE");
+
+	return 0;
+}
+virtio_dev_init(virtio_console__init);
+
+int virtio_console__exit(struct kvm *kvm)
+{
+	return 0;
+}
+virtio_dev_exit(virtio_console__exit);
diff --git a/kvmtool/virtio/core.c b/kvmtool/virtio/core.c
new file mode 100644
index 0000000..f5b3c07
--- /dev/null
+++ b/kvmtool/virtio/core.c
@@ -0,0 +1,328 @@
+#include <linux/virtio_ring.h>
+#include <linux/types.h>
+#include <sys/uio.h>
+#include <stdlib.h>
+
+#include "kvm/guest_compat.h"
+#include "kvm/barrier.h"
+#include "kvm/virtio.h"
+#include "kvm/virtio-pci.h"
+#include "kvm/virtio-mmio.h"
+#include "kvm/util.h"
+#include "kvm/kvm.h"
+
+
+const char* virtio_trans_name(enum virtio_trans trans)
+{
+	if (trans == VIRTIO_PCI)
+		return "pci";
+	else if (trans == VIRTIO_MMIO)
+		return "mmio";
+	return "unknown";
+}
+
+void virt_queue__used_idx_advance(struct virt_queue *queue, u16 jump)
+{
+	u16 idx = virtio_guest_to_host_u16(queue, queue->vring.used->idx);
+
+	/*
+	 * Use wmb to ensure the used element was updated with head and len.
+	 * We need a wmb here since we can't advance idx until we're ready
+	 * to pass the used element to the guest.
+	 */
+	wmb();
+	idx += jump;
+	queue->vring.used->idx = virtio_host_to_guest_u16(queue, idx);
+
+	/*
+	 * Use wmb to ensure the used idx has been increased before we signal
+	 * the guest. Without a wmb here the guest may ignore the queue, since
+	 * it won't see the updated idx.
+	 */
+	wmb();
+}
+
+struct vring_used_elem *
+virt_queue__set_used_elem_no_update(struct virt_queue *queue, u32 head,
+				    u32 len, u16 offset)
+{
+	struct vring_used_elem *used_elem;
+	u16 idx = virtio_guest_to_host_u16(queue, queue->vring.used->idx);
+
+	idx += offset;
+	used_elem	= &queue->vring.used->ring[idx % queue->vring.num];
+	used_elem->id	= virtio_host_to_guest_u32(queue, head);
+	used_elem->len	= virtio_host_to_guest_u32(queue, len);
+
+	return used_elem;
+}
+
+struct vring_used_elem *virt_queue__set_used_elem(struct virt_queue *queue, u32 head, u32 len)
+{
+	struct vring_used_elem *used_elem;
+
+	used_elem = virt_queue__set_used_elem_no_update(queue, head, len, 0);
+	virt_queue__used_idx_advance(queue, 1);
+
+	return used_elem;
+}
+
+static inline bool virt_desc__test_flag(struct virt_queue *vq,
+					struct vring_desc *desc, u16 flag)
+{
+	return !!(virtio_guest_to_host_u16(vq, desc->flags) & flag);
+}
+
+/*
+ * Each buffer in the virtqueues is actually a chain of descriptors.  This
+ * function returns the next descriptor in the chain, or max if we're at the
+ * end.
+ */
+static unsigned next_desc(struct virt_queue *vq, struct vring_desc *desc,
+			  unsigned int i, unsigned int max)
+{
+	unsigned int next;
+
+	/* If this descriptor says it doesn't chain, we're done. */
+	if (!virt_desc__test_flag(vq, &desc[i], VRING_DESC_F_NEXT))
+		return max;
+
+	next = virtio_guest_to_host_u16(vq, desc[i].next);
+
+	/* Ensure they're not leading us off the end of the descriptor table. */
+	return min(next, max);
+}
+
+u16 virt_queue__get_head_iov(struct virt_queue *vq, struct iovec iov[], u16 *out, u16 *in, u16 head, struct kvm *kvm)
+{
+	struct vring_desc *desc;
+	u16 idx;
+	u16 max;
+
+	idx = head;
+	*out = *in = 0;
+	max = vq->vring.num;
+	desc = vq->vring.desc;
+
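+	/* An indirect descriptor points to a further table of descriptors. */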
+	if (virt_desc__test_flag(vq, &desc[idx], VRING_DESC_F_INDIRECT)) {
+		max = virtio_guest_to_host_u32(vq, desc[idx].len) / sizeof(struct vring_desc);
+		desc = guest_flat_to_host(kvm, virtio_guest_to_host_u64(vq, desc[idx].addr));
+		idx = 0;
+	}
+
+	do {
+		/* Collect this descriptor's buffer into the iov. */
+		iov[*out + *in].iov_len = virtio_guest_to_host_u32(vq, desc[idx].len);
+		iov[*out + *in].iov_base = guest_flat_to_host(kvm,
+							      virtio_guest_to_host_u64(vq, desc[idx].addr));
+		/* If this is an input descriptor, increment that count. */
+		if (virt_desc__test_flag(vq, &desc[idx], VRING_DESC_F_WRITE))
+			(*in)++;
+		else
+			(*out)++;
+	} while ((idx = next_desc(vq, desc, idx, max)) != max);
+
+	return head;
+}
+
+u16 virt_queue__get_iov(struct virt_queue *vq, struct iovec iov[], u16 *out, u16 *in, struct kvm *kvm)
+{
+	u16 head;
+
+	head = virt_queue__pop(vq);
+
+	return virt_queue__get_head_iov(vq, iov, out, in, head, kvm);
+}
+
+/* in and out are relative to guest */
+u16 virt_queue__get_inout_iov(struct kvm *kvm, struct virt_queue *queue,
+			      struct iovec in_iov[], struct iovec out_iov[],
+			      u16 *in, u16 *out)
+{
+	struct vring_desc *desc;
+	u16 head, idx;
+
+	idx = head = virt_queue__pop(queue);
+	*out = *in = 0;
+	do {
+		u64 addr;
+		desc = virt_queue__get_desc(queue, idx);
+		addr = virtio_guest_to_host_u64(queue, desc->addr);
+		if (virt_desc__test_flag(queue, desc, VRING_DESC_F_WRITE)) {
+			in_iov[*in].iov_base = guest_flat_to_host(kvm, addr);
+			in_iov[*in].iov_len = virtio_guest_to_host_u32(queue, desc->len);
+			(*in)++;
+		} else {
+			out_iov[*out].iov_base = guest_flat_to_host(kvm, addr);
+			out_iov[*out].iov_len = virtio_guest_to_host_u32(queue, desc->len);
+			(*out)++;
+		}
+		if (virt_desc__test_flag(queue, desc, VRING_DESC_F_NEXT))
+			idx = virtio_guest_to_host_u16(queue, desc->next);
+		else
+			break;
+	} while (1);
+
+	return head;
+}
+
+void virtio_exit_vq(struct kvm *kvm, struct virtio_device *vdev,
+			   void *dev, int num)
+{
+	struct virt_queue *vq = vdev->ops->get_vq(kvm, dev, num);
+
+	if (vq->enabled && vdev->ops->exit_vq)
+		vdev->ops->exit_vq(kvm, dev, num);
+	memset(vq, 0, sizeof(*vq));
+}
+
+int virtio__get_dev_specific_field(int offset, bool msix, u32 *config_off)
+{
+	if (msix) {
+		if (offset < 4)
+			return VIRTIO_PCI_O_MSIX;
+		else
+			offset -= 4;
+	}
+
+	*config_off = offset;
+
+	return VIRTIO_PCI_O_CONFIG;
+}
+
+bool virtio_queue__should_signal(struct virt_queue *vq)
+{
+	u16 old_idx, new_idx, event_idx;
+
+	if (!vq->use_event_idx) {
+		/*
+		 * When VIRTIO_RING_F_EVENT_IDX isn't negotiated, interrupt the
+		 * guest if it didn't explicitly request to be left alone.
+		 */
+		return !(virtio_guest_to_host_u16(vq, vq->vring.avail->flags) &
+			 VRING_AVAIL_F_NO_INTERRUPT);
+	}
+
+	old_idx		= vq->last_used_signalled;
+	new_idx		= virtio_guest_to_host_u16(vq, vq->vring.used->idx);
+	event_idx	= virtio_guest_to_host_u16(vq, vring_used_event(&vq->vring));
+
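+	/*
+	 * With VIRTIO_RING_F_EVENT_IDX, only signal once the used index
+	 * moves past the used_event threshold published by the guest.
+	 */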
+	if (vring_need_event(event_idx, new_idx, old_idx)) {
+		vq->last_used_signalled = new_idx;
+		return true;
+	}
+
+	return false;
+}
+
+void virtio_set_guest_features(struct kvm *kvm, struct virtio_device *vdev,
+			       void *dev, u32 features)
+{
+	/* TODO: fail negotiation if features & ~host_features */
+
+	vdev->features = features;
+	vdev->ops->set_guest_features(kvm, dev, features);
+}
+
+void virtio_notify_status(struct kvm *kvm, struct virtio_device *vdev,
+			  void *dev, u8 status)
+{
+	u32 ext_status = status;
+
+	vdev->status &= ~VIRTIO_CONFIG_S_MASK;
+	vdev->status |= status;
+
+	/* Add a few hints to help devices */
+	if ((status & VIRTIO_CONFIG_S_DRIVER_OK) &&
+	    !(vdev->status & VIRTIO__STATUS_START)) {
+		vdev->status |= VIRTIO__STATUS_START;
+		ext_status |= VIRTIO__STATUS_START;
+
+	} else if (!status && (vdev->status & VIRTIO__STATUS_START)) {
+		vdev->status &= ~VIRTIO__STATUS_START;
+		ext_status |= VIRTIO__STATUS_STOP;
+
+		/*
+		 * Reset virtqueues and stop all traffic now, so that the device
+		 * can safely reset the backend in notify_status().
+		 */
+		if (ext_status & VIRTIO__STATUS_STOP)
+			vdev->ops->reset(kvm, vdev);
+	}
+
+	if (vdev->ops->notify_status)
+		vdev->ops->notify_status(kvm, dev, ext_status);
+}
+
+int virtio_init(struct kvm *kvm, void *dev, struct virtio_device *vdev,
+		struct virtio_ops *ops, enum virtio_trans trans,
+		int device_id, int subsys_id, int class)
+{
+	void *virtio;
+	int r;
+
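+	/* Bind the chosen transport: allocate its state and fill in the common ops. */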
+	switch (trans) {
+	case VIRTIO_PCI:
+		virtio = calloc(sizeof(struct virtio_pci), 1);
+		if (!virtio)
+			return -ENOMEM;
+		vdev->virtio			= virtio;
+		vdev->ops			= ops;
+		vdev->ops->signal_vq		= virtio_pci__signal_vq;
+		vdev->ops->signal_config	= virtio_pci__signal_config;
+		vdev->ops->init			= virtio_pci__init;
+		vdev->ops->exit			= virtio_pci__exit;
+		vdev->ops->reset		= virtio_pci__reset;
+		r = vdev->ops->init(kvm, dev, vdev, device_id, subsys_id, class);
+		break;
+	case VIRTIO_MMIO:
+		virtio = calloc(sizeof(struct virtio_mmio), 1);
+		if (!virtio)
+			return -ENOMEM;
+		vdev->virtio			= virtio;
+		vdev->ops			= ops;
+		vdev->ops->signal_vq		= virtio_mmio_signal_vq;
+		vdev->ops->signal_config	= virtio_mmio_signal_config;
+		vdev->ops->init			= virtio_mmio_init;
+		vdev->ops->exit			= virtio_mmio_exit;
+		vdev->ops->reset		= virtio_mmio_reset;
+		r = vdev->ops->init(kvm, dev, vdev, device_id, subsys_id, class);
+		break;
+	default:
+		r = -1;
+	}
+
+	return r;
+}
+
+int virtio_compat_add_message(const char *device, const char *config)
+{
+	int len = 1024;
+	int compat_id;
+	char *title;
+	char *desc;
+
+	title = malloc(len);
+	if (!title)
+		return -ENOMEM;
+
+	desc = malloc(len);
+	if (!desc) {
+		free(title);
+		return -ENOMEM;
+	}
+
+	snprintf(title, len, "%s device was not detected.", device);
+	snprintf(desc,  len, "While you have requested a %s device, "
+			     "the guest kernel did not initialize it.\n"
+			     "\tPlease make sure that the guest kernel was "
+			     "compiled with %s=y enabled in .config.",
+			     device, config);
+
+	compat_id = compat__add_message(title, desc);
+
+	free(desc);
+	free(title);
+
+	return compat_id;
+}
diff --git a/kvmtool/virtio/mmio.c b/kvmtool/virtio/mmio.c
new file mode 100644
index 0000000..875a288
--- /dev/null
+++ b/kvmtool/virtio/mmio.c
@@ -0,0 +1,350 @@
+#include "kvm/devices.h"
+#include "kvm/virtio-mmio.h"
+#include "kvm/ioeventfd.h"
+#include "kvm/ioport.h"
+#include "kvm/virtio.h"
+#include "kvm/kvm.h"
+#include "kvm/kvm-cpu.h"
+#include "kvm/irq.h"
+#include "kvm/fdt.h"
+
+#include <linux/virtio_mmio.h>
+#include <string.h>
+
+static u32 virtio_mmio_io_space_blocks = KVM_VIRTIO_MMIO_AREA;
+
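+/* Carve per-device windows out of the virtio-mmio area, linearly. */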
+static u32 virtio_mmio_get_io_space_block(u32 size)
+{
+	u32 block = virtio_mmio_io_space_blocks;
+	virtio_mmio_io_space_blocks += size;
+
+	return block;
+}
+
+static void virtio_mmio_ioevent_callback(struct kvm *kvm, void *param)
+{
+	struct virtio_mmio_ioevent_param *ioeventfd = param;
+	struct virtio_mmio *vmmio = ioeventfd->vdev->virtio;
+
+	ioeventfd->vdev->ops->notify_vq(kvm, vmmio->dev, ioeventfd->vq);
+}
+
+static int virtio_mmio_init_ioeventfd(struct kvm *kvm,
+				      struct virtio_device *vdev, u32 vq)
+{
+	struct virtio_mmio *vmmio = vdev->virtio;
+	struct ioevent ioevent;
+	int err;
+
+	vmmio->ioeventfds[vq] = (struct virtio_mmio_ioevent_param) {
+		.vdev		= vdev,
+		.vq		= vq,
+	};
+
+	ioevent = (struct ioevent) {
+		.io_addr	= vmmio->addr + VIRTIO_MMIO_QUEUE_NOTIFY,
+		.io_len		= sizeof(u32),
+		.fn		= virtio_mmio_ioevent_callback,
+		.fn_ptr		= &vmmio->ioeventfds[vq],
+		.datamatch	= vq,
+		.fn_kvm		= kvm,
+		.fd		= eventfd(0, 0),
+	};
+
+	if (vdev->use_vhost)
+		/*
+		 * Vhost polls the eventfd on the host kernel side,
+		 * no need to poll in userspace.
+		 */
+		err = ioeventfd__add_event(&ioevent, 0);
+	else
+		/* Need to poll in userspace. */
+		err = ioeventfd__add_event(&ioevent, IOEVENTFD_FLAG_USER_POLL);
+	if (err)
+		return err;
+
+	if (vdev->ops->notify_vq_eventfd)
+		vdev->ops->notify_vq_eventfd(kvm, vmmio->dev, vq, ioevent.fd);
+
+	return 0;
+}
+
+int virtio_mmio_signal_vq(struct kvm *kvm, struct virtio_device *vdev, u32 vq)
+{
+	struct virtio_mmio *vmmio = vdev->virtio;
+
+	vmmio->hdr.interrupt_state |= VIRTIO_MMIO_INT_VRING;
+	kvm__irq_trigger(vmmio->kvm, vmmio->irq);
+
+	return 0;
+}
+
+static void virtio_mmio_exit_vq(struct kvm *kvm, struct virtio_device *vdev,
+				int vq)
+{
+	struct virtio_mmio *vmmio = vdev->virtio;
+
+	ioeventfd__del_event(vmmio->addr + VIRTIO_MMIO_QUEUE_NOTIFY, vq);
+	virtio_exit_vq(kvm, vdev, vmmio->dev, vq);
+}
+
+int virtio_mmio_signal_config(struct kvm *kvm, struct virtio_device *vdev)
+{
+	struct virtio_mmio *vmmio = vdev->virtio;
+
+	vmmio->hdr.interrupt_state |= VIRTIO_MMIO_INT_CONFIG;
+	kvm__irq_trigger(vmmio->kvm, vmmio->irq);
+
+	return 0;
+}
+
+static void virtio_mmio_device_specific(struct kvm_cpu *vcpu,
+					u64 addr, u8 *data, u32 len,
+					u8 is_write, struct virtio_device *vdev)
+{
+	struct virtio_mmio *vmmio = vdev->virtio;
+	u32 i;
+
+	for (i = 0; i < len; i++) {
+		if (is_write)
+			vdev->ops->get_config(vmmio->kvm, vmmio->dev)[addr + i] =
+					      data[i];
+		else
+			data[i] = vdev->ops->get_config(vmmio->kvm,
+							vmmio->dev)[addr + i];
+	}
+}
+
+static void virtio_mmio_config_in(struct kvm_cpu *vcpu,
+				  u64 addr, void *data, u32 len,
+				  struct virtio_device *vdev)
+{
+	struct virtio_mmio *vmmio = vdev->virtio;
+	struct virt_queue *vq;
+	u32 val = 0;
+
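+	/* Registers that mirror struct virtio_mmio_hdr are read straight from it. */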
+	switch (addr) {
+	case VIRTIO_MMIO_MAGIC_VALUE:
+	case VIRTIO_MMIO_VERSION:
+	case VIRTIO_MMIO_DEVICE_ID:
+	case VIRTIO_MMIO_VENDOR_ID:
+	case VIRTIO_MMIO_STATUS:
+	case VIRTIO_MMIO_INTERRUPT_STATUS:
+		ioport__write32(data, *(u32 *)(((void *)&vmmio->hdr) + addr));
+		break;
+	case VIRTIO_MMIO_HOST_FEATURES:
+		if (vmmio->hdr.host_features_sel == 0)
+			val = vdev->ops->get_host_features(vmmio->kvm,
+							   vmmio->dev);
+		ioport__write32(data, val);
+		break;
+	case VIRTIO_MMIO_QUEUE_PFN:
+		vq = vdev->ops->get_vq(vmmio->kvm, vmmio->dev,
+				       vmmio->hdr.queue_sel);
+		ioport__write32(data, vq->pfn);
+		break;
+	case VIRTIO_MMIO_QUEUE_NUM_MAX:
+		val = vdev->ops->get_size_vq(vmmio->kvm, vmmio->dev,
+					     vmmio->hdr.queue_sel);
+		ioport__write32(data, val);
+		break;
+	default:
+		break;
+	}
+}
+
+static void virtio_mmio_config_out(struct kvm_cpu *vcpu,
+				   u64 addr, void *data, u32 len,
+				   struct virtio_device *vdev)
+{
+	struct virtio_mmio *vmmio = vdev->virtio;
+	struct kvm *kvm = vmmio->kvm;
+	u32 val = 0;
+
+	switch (addr) {
+	case VIRTIO_MMIO_HOST_FEATURES_SEL:
+	case VIRTIO_MMIO_GUEST_FEATURES_SEL:
+	case VIRTIO_MMIO_QUEUE_SEL:
+		val = ioport__read32(data);
+		*(u32 *)(((void *)&vmmio->hdr) + addr) = val;
+		break;
+	case VIRTIO_MMIO_STATUS:
+		vmmio->hdr.status = ioport__read32(data);
+		if (!vmmio->hdr.status) /* Sample endianness on reset */
+			vdev->endian = kvm_cpu__get_endianness(vcpu);
+		virtio_notify_status(kvm, vdev, vmmio->dev, vmmio->hdr.status);
+		break;
+	case VIRTIO_MMIO_GUEST_FEATURES:
+		if (vmmio->hdr.guest_features_sel == 0) {
+			val = ioport__read32(data);
+			virtio_set_guest_features(vmmio->kvm, vdev,
+						  vmmio->dev, val);
+		}
+		break;
+	case VIRTIO_MMIO_GUEST_PAGE_SIZE:
+		val = ioport__read32(data);
+		vmmio->hdr.guest_page_size = val;
+		break;
+	case VIRTIO_MMIO_QUEUE_NUM:
+		val = ioport__read32(data);
+		vmmio->hdr.queue_num = val;
+		vdev->ops->set_size_vq(vmmio->kvm, vmmio->dev,
+				       vmmio->hdr.queue_sel, val);
+		break;
+	case VIRTIO_MMIO_QUEUE_ALIGN:
+		val = ioport__read32(data);
+		vmmio->hdr.queue_align = val;
+		break;
+	case VIRTIO_MMIO_QUEUE_PFN:
+		val = ioport__read32(data);
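+		/* A non-zero PFN places the ring; writing zero tears the queue down. */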
+		if (val) {
+			virtio_mmio_init_ioeventfd(vmmio->kvm, vdev,
+						   vmmio->hdr.queue_sel);
+			vdev->ops->init_vq(vmmio->kvm, vmmio->dev,
+					   vmmio->hdr.queue_sel,
+					   vmmio->hdr.guest_page_size,
+					   vmmio->hdr.queue_align,
+					   val);
+		} else {
+			virtio_mmio_exit_vq(kvm, vdev, vmmio->hdr.queue_sel);
+		}
+		break;
+	case VIRTIO_MMIO_QUEUE_NOTIFY:
+		val = ioport__read32(data);
+		vdev->ops->notify_vq(vmmio->kvm, vmmio->dev, val);
+		break;
+	case VIRTIO_MMIO_INTERRUPT_ACK:
+		val = ioport__read32(data);
+		vmmio->hdr.interrupt_state &= ~val;
+		break;
+	default:
+		break;
+	}
+}
+
+static void virtio_mmio_mmio_callback(struct kvm_cpu *vcpu,
+				      u64 addr, u8 *data, u32 len,
+				      u8 is_write, void *ptr)
+{
+	struct virtio_device *vdev = ptr;
+	struct virtio_mmio *vmmio = vdev->virtio;
+	u32 offset = addr - vmmio->addr;
+
+	if (offset >= VIRTIO_MMIO_CONFIG) {
+		offset -= VIRTIO_MMIO_CONFIG;
+		virtio_mmio_device_specific(vcpu, offset, data, len, is_write, ptr);
+		return;
+	}
+
+	if (is_write)
+		virtio_mmio_config_out(vcpu, offset, data, len, ptr);
+	else
+		virtio_mmio_config_in(vcpu, offset, data, len, ptr);
+}
+
+#ifdef CONFIG_HAS_LIBFDT
+#define DEVICE_NAME_MAX_LEN 32
+static
+void generate_virtio_mmio_fdt_node(void *fdt,
+				   struct device_header *dev_hdr,
+				   void (*generate_irq_prop)(void *fdt,
+							     u8 irq,
+							     enum irq_type))
+{
+	char dev_name[DEVICE_NAME_MAX_LEN];
+	struct virtio_mmio *vmmio = container_of(dev_hdr,
+						 struct virtio_mmio,
+						 dev_hdr);
+	u64 addr = vmmio->addr;
+	u64 reg_prop[] = {
+		cpu_to_fdt64(addr),
+		cpu_to_fdt64(VIRTIO_MMIO_IO_SIZE),
+	};
+
+	snprintf(dev_name, DEVICE_NAME_MAX_LEN, "virtio@%llx", addr);
+
+	_FDT(fdt_begin_node(fdt, dev_name));
+	_FDT(fdt_property_string(fdt, "compatible", "virtio,mmio"));
+	_FDT(fdt_property(fdt, "reg", reg_prop, sizeof(reg_prop)));
+	_FDT(fdt_property(fdt, "dma-coherent", NULL, 0));
+	generate_irq_prop(fdt, vmmio->irq, IRQ_TYPE_EDGE_RISING);
+	_FDT(fdt_end_node(fdt));
+}
+#else
+static void generate_virtio_mmio_fdt_node(void *fdt,
+					  struct device_header *dev_hdr,
+					  void (*generate_irq_prop)(void *fdt,
+								    u8 irq))
+{
+	die("Unable to generate device tree nodes without libfdt\n");
+}
+#endif
+
+int virtio_mmio_init(struct kvm *kvm, void *dev, struct virtio_device *vdev,
+		     int device_id, int subsys_id, int class)
+{
+	struct virtio_mmio *vmmio = vdev->virtio;
+	int r;
+
+	vmmio->addr	= virtio_mmio_get_io_space_block(VIRTIO_MMIO_IO_SIZE);
+	vmmio->kvm	= kvm;
+	vmmio->dev	= dev;
+
+	r = kvm__register_mmio(kvm, vmmio->addr, VIRTIO_MMIO_IO_SIZE,
+			       false, virtio_mmio_mmio_callback, vdev);
+	if (r < 0)
+		return r;
+
+	vmmio->hdr = (struct virtio_mmio_hdr) {
+		.magic		= {'v', 'i', 'r', 't'},
+		.version	= 1,
+		.device_id	= subsys_id,
+		.vendor_id	= 0x4d564b4c, /* 'LKVM' */
+		.queue_num_max	= 256,
+	};
+
+	vmmio->dev_hdr = (struct device_header) {
+		.bus_type	= DEVICE_BUS_MMIO,
+		.data		= generate_virtio_mmio_fdt_node,
+	};
+
+	vmmio->irq = irq__alloc_line();
+
+	r = device__register(&vmmio->dev_hdr);
+	if (r < 0) {
+		kvm__deregister_mmio(kvm, vmmio->addr);
+		return r;
+	}
+
+	/*
+	 * Instantiate guest virtio-mmio devices using the kernel command line
+	 * (or module) parameter, e.g.:
+	 *
+	 * virtio_mmio.devices=0x200@0xd2000000:5,0x200@0xd2000200:6
+	 */
+	pr_debug("virtio-mmio.devices=0x%x@0x%x:%d", VIRTIO_MMIO_IO_SIZE,
+		 vmmio->addr, vmmio->irq);
+
+	return 0;
+}
+
+int virtio_mmio_reset(struct kvm *kvm, struct virtio_device *vdev)
+{
+	int vq;
+	struct virtio_mmio *vmmio = vdev->virtio;
+
+	for (vq = 0; vq < vdev->ops->get_vq_count(kvm, vmmio->dev); vq++)
+		virtio_mmio_exit_vq(kvm, vdev, vq);
+
+	return 0;
+}
+
+int virtio_mmio_exit(struct kvm *kvm, struct virtio_device *vdev)
+{
+	struct virtio_mmio *vmmio = vdev->virtio;
+
+	virtio_mmio_reset(kvm, vdev);
+	kvm__deregister_mmio(kvm, vmmio->addr);
+
+	return 0;
+}
diff --git a/kvmtool/virtio/net.c b/kvmtool/virtio/net.c
new file mode 100644
index 0000000..1ee3c19
--- /dev/null
+++ b/kvmtool/virtio/net.c
@@ -0,0 +1,1042 @@
+#include "kvm/virtio-pci-dev.h"
+#include "kvm/virtio-net.h"
+#include "kvm/virtio.h"
+#include "kvm/mutex.h"
+#include "kvm/util.h"
+#include "kvm/kvm.h"
+#include "kvm/irq.h"
+#include "kvm/uip.h"
+#include "kvm/guest_compat.h"
+#include "kvm/iovec.h"
+#include "kvm/strbuf.h"
+
+#include <linux/vhost.h>
+#include <linux/virtio_net.h>
+#include <linux/if_tun.h>
+#include <linux/types.h>
+
+#include <arpa/inet.h>
+#include <net/if.h>
+
+#include <unistd.h>
+#include <fcntl.h>
+
+#include <sys/socket.h>
+#include <sys/ioctl.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <sys/eventfd.h>
+
+#define VIRTIO_NET_QUEUE_SIZE		256
+#define VIRTIO_NET_NUM_QUEUES		8
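+/*
+ * VIRTIO_NET_NUM_QUEUES is the maximum number of rx/tx queue pairs;
+ * one control queue is added on top (see get_vq_count()).
+ */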
+
+struct net_dev;
+
+struct net_dev_operations {
+	int (*rx)(struct iovec *iov, u16 in, struct net_dev *ndev);
+	int (*tx)(struct iovec *iov, u16 in, struct net_dev *ndev);
+};
+
+struct net_dev_queue {
+	int				id;
+	struct net_dev			*ndev;
+	struct virt_queue		vq;
+	pthread_t			thread;
+	struct mutex			lock;
+	pthread_cond_t			cond;
+	int				gsi;
+	int				irqfd;
+};
+
+struct net_dev {
+	struct mutex			mutex;
+	struct virtio_device		vdev;
+	struct list_head		list;
+
+	struct net_dev_queue		queues[VIRTIO_NET_NUM_QUEUES * 2 + 1];
+	struct virtio_net_config	config;
+	u32				features, queue_pairs;
+
+	int				vhost_fd;
+	int				tap_fd;
+	char				tap_name[IFNAMSIZ];
+	bool				tap_ufo;
+
+	int				mode;
+
+	struct uip_info			info;
+	struct net_dev_operations	*ops;
+	struct kvm			*kvm;
+
+	struct virtio_net_params	*params;
+};
+
+static LIST_HEAD(ndevs);
+static int compat_id = -1;
+
+#define MAX_PACKET_SIZE 65550
+
+static bool has_virtio_feature(struct net_dev *ndev, u32 feature)
+{
+	return ndev->features & (1 << feature);
+}
+
+static void virtio_net_fix_tx_hdr(struct virtio_net_hdr *hdr, struct net_dev *ndev)
+{
+	hdr->hdr_len		= virtio_guest_to_host_u16(&ndev->vdev, hdr->hdr_len);
+	hdr->gso_size		= virtio_guest_to_host_u16(&ndev->vdev, hdr->gso_size);
+	hdr->csum_start		= virtio_guest_to_host_u16(&ndev->vdev, hdr->csum_start);
+	hdr->csum_offset	= virtio_guest_to_host_u16(&ndev->vdev, hdr->csum_offset);
+}
+
+static void virtio_net_fix_rx_hdr(struct virtio_net_hdr *hdr, struct net_dev *ndev)
+{
+	hdr->hdr_len		= virtio_host_to_guest_u16(&ndev->vdev, hdr->hdr_len);
+	hdr->gso_size		= virtio_host_to_guest_u16(&ndev->vdev, hdr->gso_size);
+	hdr->csum_start		= virtio_host_to_guest_u16(&ndev->vdev, hdr->csum_start);
+	hdr->csum_offset	= virtio_host_to_guest_u16(&ndev->vdev, hdr->csum_offset);
+}
+
+static void *virtio_net_rx_thread(void *p)
+{
+	struct iovec iov[VIRTIO_NET_QUEUE_SIZE];
+	struct net_dev_queue *queue = p;
+	struct virt_queue *vq = &queue->vq;
+	struct net_dev *ndev = queue->ndev;
+	struct kvm *kvm;
+	u16 out, in;
+	u16 head;
+	int len, copied;
+
+	kvm__set_thread_name("virtio-net-rx");
+
+	kvm = ndev->kvm;
+	while (1) {
+		mutex_lock(&queue->lock);
+		if (!virt_queue__available(vq))
+			pthread_cond_wait(&queue->cond, &queue->lock.mutex);
+		mutex_unlock(&queue->lock);
+
+		while (virt_queue__available(vq)) {
+			unsigned char buffer[MAX_PACKET_SIZE + sizeof(struct virtio_net_hdr_mrg_rxbuf)];
+			struct iovec dummy_iov = {
+				.iov_base = buffer,
+				.iov_len  = sizeof(buffer),
+			};
+			struct virtio_net_hdr_mrg_rxbuf *hdr;
+			u16 num_buffers;
+
+			len = ndev->ops->rx(&dummy_iov, 1, ndev);
+			if (len < 0) {
+				pr_warning("%s: rx on vq %u failed (%d), exiting thread\n",
+						__func__, queue->id, len);
+				goto out_err;
+			}
+
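+			/*
+			 * Scatter the packet across as many descriptor
+			 * chains as needed; with MRG_RXBUF the guest is
+			 * told how many buffers were consumed.
+			 */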
+			copied = num_buffers = 0;
+			head = virt_queue__get_iov(vq, iov, &out, &in, kvm);
+			hdr = iov[0].iov_base;
+			while (copied < len) {
+				size_t iovsize = min_t(size_t, len - copied, iov_size(iov, in));
+
+				memcpy_toiovec(iov, buffer + copied, iovsize);
+				copied += iovsize;
+				virt_queue__set_used_elem_no_update(vq, head, iovsize, num_buffers++);
+				if (copied == len)
+					break;
+				while (!virt_queue__available(vq))
+					sleep(0);
+				head = virt_queue__get_iov(vq, iov, &out, &in, kvm);
+			}
+
+			virtio_net_fix_rx_hdr(&hdr->hdr, ndev);
+			if (has_virtio_feature(ndev, VIRTIO_NET_F_MRG_RXBUF))
+				hdr->num_buffers = virtio_host_to_guest_u16(vq, num_buffers);
+
+			virt_queue__used_idx_advance(vq, num_buffers);
+
+			/* Signal the guest right away, otherwise latency will suffer. */
+			if (virtio_queue__should_signal(vq))
+				ndev->vdev.ops->signal_vq(kvm, &ndev->vdev, queue->id);
+		}
+	}
+
+out_err:
+	pthread_exit(NULL);
+	return NULL;
+}
+
+static void *virtio_net_tx_thread(void *p)
+{
+	struct iovec iov[VIRTIO_NET_QUEUE_SIZE];
+	struct net_dev_queue *queue = p;
+	struct virt_queue *vq = &queue->vq;
+	struct net_dev *ndev = queue->ndev;
+	struct kvm *kvm;
+	u16 out, in;
+	u16 head;
+	int len;
+
+	kvm__set_thread_name("virtio-net-tx");
+
+	kvm = ndev->kvm;
+
+	while (1) {
+		mutex_lock(&queue->lock);
+		if (!virt_queue__available(vq))
+			pthread_cond_wait(&queue->cond, &queue->lock.mutex);
+		mutex_unlock(&queue->lock);
+
+		while (virt_queue__available(vq)) {
+			struct virtio_net_hdr *hdr;
+			head = virt_queue__get_iov(vq, iov, &out, &in, kvm);
+			hdr = iov[0].iov_base;
+			virtio_net_fix_tx_hdr(hdr, ndev);
+			len = ndev->ops->tx(iov, out, ndev);
+			if (len < 0) {
+				pr_warning("%s: tx on vq %u failed (%d)\n",
+						__func__, queue->id, errno);
+				goto out_err;
+			}
+
+			virt_queue__set_used_elem(vq, head, len);
+		}
+
+		if (virtio_queue__should_signal(vq))
+			ndev->vdev.ops->signal_vq(kvm, &ndev->vdev, queue->id);
+	}
+
+out_err:
+	pthread_exit(NULL);
+	return NULL;
+}
+
+static virtio_net_ctrl_ack virtio_net_handle_mq(struct kvm* kvm, struct net_dev *ndev, struct virtio_net_ctrl_hdr *ctrl)
+{
+	/* Not much to do here */
+	return VIRTIO_NET_OK;
+}
+
+static void *virtio_net_ctrl_thread(void *p)
+{
+	struct iovec iov[VIRTIO_NET_QUEUE_SIZE];
+	struct net_dev_queue *queue = p;
+	struct virt_queue *vq = &queue->vq;
+	struct net_dev *ndev = queue->ndev;
+	u16 out, in, head;
+	struct kvm *kvm = ndev->kvm;
+	struct virtio_net_ctrl_hdr *ctrl;
+	virtio_net_ctrl_ack *ack;
+
+	kvm__set_thread_name("virtio-net-ctrl");
+
+	while (1) {
+		mutex_lock(&queue->lock);
+		if (!virt_queue__available(vq))
+			pthread_cond_wait(&queue->cond, &queue->lock.mutex);
+		mutex_unlock(&queue->lock);
+
+		while (virt_queue__available(vq)) {
+			head = virt_queue__get_iov(vq, iov, &out, &in, kvm);
+			ctrl = iov[0].iov_base;
+			ack = iov[out].iov_base;
+
+			switch (ctrl->class) {
+			case VIRTIO_NET_CTRL_MQ:
+				*ack = virtio_net_handle_mq(kvm, ndev, ctrl);
+				break;
+			default:
+				*ack = VIRTIO_NET_ERR;
+				break;
+			}
+			virt_queue__set_used_elem(vq, head, iov[out].iov_len);
+		}
+
+		if (virtio_queue__should_signal(vq))
+			ndev->vdev.ops->signal_vq(kvm, &ndev->vdev, queue->id);
+	}
+
+	pthread_exit(NULL);
+
+	return NULL;
+}
+
+static void virtio_net_handle_callback(struct kvm *kvm, struct net_dev *ndev, int queue)
+{
+	struct net_dev_queue *net_queue = &ndev->queues[queue];
+
+	if ((u32)queue >= (ndev->queue_pairs * 2 + 1)) {
+		pr_warning("Unknown queue index %u", queue);
+		return;
+	}
+
+	mutex_lock(&net_queue->lock);
+	pthread_cond_signal(&net_queue->cond);
+	mutex_unlock(&net_queue->lock);
+}
+
+static int virtio_net_request_tap(struct net_dev *ndev, struct ifreq *ifr,
+				  const char *tapname)
+{
+	int ret;
+
+	memset(ifr, 0, sizeof(*ifr));
+	ifr->ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR;
+	if (tapname)
+		strlcpy(ifr->ifr_name, tapname, sizeof(ifr->ifr_name));
+
+	ret = ioctl(ndev->tap_fd, TUNSETIFF, ifr);
+
+	if (ret >= 0)
+		strlcpy(ndev->tap_name, ifr->ifr_name, sizeof(ndev->tap_name));
+	return ret;
+}
+
+static int virtio_net_exec_script(const char* script, const char *tap_name)
+{
+	pid_t pid;
+	int status;
+
+	pid = fork();
+	if (pid == 0) {
+		execl(script, script, tap_name, NULL);
+		_exit(1);
+	} else {
+		waitpid(pid, &status, 0);
+		if (WIFEXITED(status) && WEXITSTATUS(status) != 0) {
+			pr_warning("Fail to setup tap by %s", script);
+			return -1;
+		}
+	}
+	return 0;
+}
+
+static bool virtio_net__tap_init(struct net_dev *ndev)
+{
+	int sock = socket(AF_INET, SOCK_STREAM, 0);
+	int hdr_len;
+	struct sockaddr_in sin = {0};
+	struct ifreq ifr;
+	const struct virtio_net_params *params = ndev->params;
+	bool skipconf = !!params->tapif;
+
+	hdr_len = has_virtio_feature(ndev, VIRTIO_NET_F_MRG_RXBUF) ?
+			sizeof(struct virtio_net_hdr_mrg_rxbuf) :
+			sizeof(struct virtio_net_hdr);
+	if (ioctl(ndev->tap_fd, TUNSETVNETHDRSZ, &hdr_len) < 0)
+		pr_warning("Config tap device TUNSETVNETHDRSZ error");
+
+	if (strcmp(params->script, "none")) {
+		if (virtio_net_exec_script(params->script, ndev->tap_name) < 0)
+			goto fail;
+	} else if (!skipconf) {
+		memset(&ifr, 0, sizeof(ifr));
+		strncpy(ifr.ifr_name, ndev->tap_name, sizeof(ifr.ifr_name));
+		sin.sin_addr.s_addr = inet_addr(params->host_ip);
+		memcpy(&(ifr.ifr_addr), &sin, sizeof(ifr.ifr_addr));
+		ifr.ifr_addr.sa_family = AF_INET;
+		if (ioctl(sock, SIOCSIFADDR, &ifr) < 0) {
+			pr_warning("Could not set ip address on tap device");
+			goto fail;
+		}
+	}
+
+	if (!skipconf) {
+		memset(&ifr, 0, sizeof(ifr));
+		strncpy(ifr.ifr_name, ndev->tap_name, sizeof(ifr.ifr_name));
+		ioctl(sock, SIOCGIFFLAGS, &ifr);
+		ifr.ifr_flags |= IFF_UP | IFF_RUNNING;
+		if (ioctl(sock, SIOCSIFFLAGS, &ifr) < 0)
+			pr_warning("Could not bring tap device up");
+	}
+
+	close(sock);
+
+	return 1;
+
+fail:
+	if (sock >= 0)
+		close(sock);
+	if (ndev->tap_fd >= 0)
+		close(ndev->tap_fd);
+
+	return 0;
+}
+
+static void virtio_net__tap_exit(struct net_dev *ndev)
+{
+	int sock;
+	struct ifreq ifr;
+
+	if (ndev->params->tapif)
+		return;
+
+	sock = socket(AF_INET, SOCK_STREAM, 0);
+	strncpy(ifr.ifr_name, ndev->tap_name, sizeof(ifr.ifr_name));
+	ioctl(sock, SIOCGIFFLAGS, &ifr);
+	ifr.ifr_flags &= ~(IFF_UP | IFF_RUNNING);
+	if (ioctl(sock, SIOCGIFFLAGS, &ifr) < 0)
+		pr_warning("Count not bring tap device down");
+	close(sock);
+}
+
+static bool virtio_net__tap_create(struct net_dev *ndev)
+{
+	int offload;
+	struct ifreq ifr;
+	const struct virtio_net_params *params = ndev->params;
+	bool macvtap = (!!params->tapif) && (params->tapif[0] == '/');
+
+	/* Did the user already give us an FD? */
+	if (params->fd)
+		ndev->tap_fd = params->fd;
+	else {
+		const char *tap_file = "/dev/net/tun";
+
+		/* Did the user ask us to use macvtap? */
+		if (macvtap)
+			tap_file = params->tapif;
+
+		ndev->tap_fd = open(tap_file, O_RDWR);
+		if (ndev->tap_fd < 0) {
+			pr_warning("Unable to open %s", tap_file);
+			return 0;
+		}
+	}
+
+	if (!macvtap &&
+	    virtio_net_request_tap(ndev, &ifr, params->tapif) < 0) {
+		pr_warning("Config tap device error. Are you root?");
+		goto fail;
+	}
+
+	/*
+	 * UFO support was removed from the kernel in commit:
+	 * ID: fb652fdfe83710da0ca13448a41b7ed027d0a984
+	 * https://www.spinics.net/lists/netdev/msg443562.html
+	 * In order to support older kernels without this commit, we include
+	 * TUN_F_UFO in the offload flags by default to probe for kernel UFO
+	 * support.
+	 */
+	ndev->tap_ufo = true;
+	offload = TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 | TUN_F_UFO;
+	if (ioctl(ndev->tap_fd, TUNSETOFFLOAD, offload) < 0) {
+		/*
+		 * Did this fail because the kernel removed UFO support?
+		 * Try TUNSETOFFLOAD without TUN_F_UFO.
+		 */
+		offload &= ~TUN_F_UFO;
+		if (ioctl(ndev->tap_fd, TUNSETOFFLOAD, offload) < 0) {
+			pr_warning("Config tap device TUNSETOFFLOAD error");
+			goto fail;
+		}
+		ndev->tap_ufo = false;
+	}
+
+	return 1;
+
+fail:
+	if ((ndev->tap_fd >= 0) || (!params->fd) )
+		close(ndev->tap_fd);
+
+	return 0;
+}
+
+static inline int tap_ops_tx(struct iovec *iov, u16 out, struct net_dev *ndev)
+{
+	return writev(ndev->tap_fd, iov, out);
+}
+
+static inline int tap_ops_rx(struct iovec *iov, u16 in, struct net_dev *ndev)
+{
+	return readv(ndev->tap_fd, iov, in);
+}
+
+static inline int uip_ops_tx(struct iovec *iov, u16 out, struct net_dev *ndev)
+{
+	return uip_tx(iov, out, &ndev->info);
+}
+
+static inline int uip_ops_rx(struct iovec *iov, u16 in, struct net_dev *ndev)
+{
+	return uip_rx(iov, in, &ndev->info);
+}
+
+static struct net_dev_operations tap_ops = {
+	.rx	= tap_ops_rx,
+	.tx	= tap_ops_tx,
+};
+
+static struct net_dev_operations uip_ops = {
+	.rx	= uip_ops_rx,
+	.tx	= uip_ops_tx,
+};
+
+static u8 *get_config(struct kvm *kvm, void *dev)
+{
+	struct net_dev *ndev = dev;
+
+	return ((u8 *)(&ndev->config));
+}
+
+static u32 get_host_features(struct kvm *kvm, void *dev)
+{
+	u32 features;
+	struct net_dev *ndev = dev;
+
+	features = 1UL << VIRTIO_NET_F_MAC
+		| 1UL << VIRTIO_NET_F_CSUM
+		| 1UL << VIRTIO_NET_F_HOST_TSO4
+		| 1UL << VIRTIO_NET_F_HOST_TSO6
+		| 1UL << VIRTIO_NET_F_GUEST_TSO4
+		| 1UL << VIRTIO_NET_F_GUEST_TSO6
+		| 1UL << VIRTIO_RING_F_EVENT_IDX
+		| 1UL << VIRTIO_RING_F_INDIRECT_DESC
+		| 1UL << VIRTIO_NET_F_CTRL_VQ
+		| 1UL << VIRTIO_NET_F_MRG_RXBUF
+		| 1UL << (ndev->queue_pairs > 1 ? VIRTIO_NET_F_MQ : 0);
+
+	/*
+	 * The host and guest UFO features can only be enabled when the
+	 * kernel has TAP UFO support.
+	 */
+	if (ndev->tap_ufo)
+		features |= (1UL << VIRTIO_NET_F_HOST_UFO
+				| 1UL << VIRTIO_NET_F_GUEST_UFO);
+
+	return features;
+}
+
+static int virtio_net__vhost_set_features(struct net_dev *ndev)
+{
+	u64 features = 1UL << VIRTIO_RING_F_EVENT_IDX;
+	u64 vhost_features;
+
+	if (ioctl(ndev->vhost_fd, VHOST_GET_FEATURES, &vhost_features) != 0)
+		die_perror("VHOST_GET_FEATURES failed");
+
+	/* Make sure both sides support mergeable rx buffers */
+	if (vhost_features & 1UL << VIRTIO_NET_F_MRG_RXBUF &&
+			has_virtio_feature(ndev, VIRTIO_NET_F_MRG_RXBUF))
+		features |= 1UL << VIRTIO_NET_F_MRG_RXBUF;
+
+	return ioctl(ndev->vhost_fd, VHOST_SET_FEATURES, &features);
+}
+
+static void set_guest_features(struct kvm *kvm, void *dev, u32 features)
+{
+	struct net_dev *ndev = dev;
+	struct virtio_net_config *conf = &ndev->config;
+
+	ndev->features = features;
+
+	conf->status = virtio_host_to_guest_u16(&ndev->vdev, conf->status);
+	conf->max_virtqueue_pairs = virtio_host_to_guest_u16(&ndev->vdev,
+							     conf->max_virtqueue_pairs);
+}
+
+static void virtio_net_start(struct net_dev *ndev)
+{
+	if (ndev->mode == NET_MODE_TAP) {
+		if (!virtio_net__tap_init(ndev))
+			die_perror("TAP device initialized failed because");
+
+		if (ndev->vhost_fd &&
+				virtio_net__vhost_set_features(ndev) != 0)
+			die_perror("VHOST_SET_FEATURES failed");
+	} else {
+		ndev->info.vnet_hdr_len = has_virtio_feature(ndev, VIRTIO_NET_F_MRG_RXBUF) ?
+						sizeof(struct virtio_net_hdr_mrg_rxbuf) :
+						sizeof(struct virtio_net_hdr);
+		uip_init(&ndev->info);
+	}
+}
+
+static void virtio_net_stop(struct net_dev *ndev)
+{
+	/* Undo whatever start() did */
+	if (ndev->mode == NET_MODE_TAP)
+		virtio_net__tap_exit(ndev);
+	else
+		uip_exit(&ndev->info);
+}
+
+static void notify_status(struct kvm *kvm, void *dev, u32 status)
+{
+	if (status & VIRTIO__STATUS_START)
+		virtio_net_start(dev);
+	else if (status & VIRTIO__STATUS_STOP)
+		virtio_net_stop(dev);
+}
+
+static bool is_ctrl_vq(struct net_dev *ndev, u32 vq)
+{
+	return vq == (u32)(ndev->queue_pairs * 2);
+}
+
+static int init_vq(struct kvm *kvm, void *dev, u32 vq, u32 page_size, u32 align,
+		   u32 pfn)
+{
+	struct vhost_vring_state state = { .index = vq };
+	struct net_dev_queue *net_queue;
+	struct vhost_vring_addr addr;
+	struct net_dev *ndev = dev;
+	struct virt_queue *queue;
+	void *p;
+	int r;
+
+	compat__remove_message(compat_id);
+
+	net_queue	= &ndev->queues[vq];
+	net_queue->id	= vq;
+	net_queue->ndev	= ndev;
+	queue		= &net_queue->vq;
+	queue->pfn	= pfn;
+	p		= virtio_get_vq(kvm, queue->pfn, page_size);
+
+	vring_init(&queue->vring, VIRTIO_NET_QUEUE_SIZE, p, align);
+	virtio_init_device_vq(&ndev->vdev, queue);
+
+	mutex_init(&net_queue->lock);
+	pthread_cond_init(&net_queue->cond, NULL);
+	if (is_ctrl_vq(ndev, vq)) {
+		pthread_create(&net_queue->thread, NULL, virtio_net_ctrl_thread,
+			       net_queue);
+
+		return 0;
+	} else if (ndev->vhost_fd == 0 ) {
+		if (vq & 1)
+			pthread_create(&net_queue->thread, NULL,
+				       virtio_net_tx_thread, net_queue);
+		else
+			pthread_create(&net_queue->thread, NULL,
+				       virtio_net_rx_thread, net_queue);
+
+		return 0;
+	}
+
+	if (queue->endian != VIRTIO_ENDIAN_HOST)
+		die_perror("VHOST requires the same endianness in guest and host");
+
+	state.num = queue->vring.num;
+	r = ioctl(ndev->vhost_fd, VHOST_SET_VRING_NUM, &state);
+	if (r < 0)
+		die_perror("VHOST_SET_VRING_NUM failed");
+	state.num = 0;
+	r = ioctl(ndev->vhost_fd, VHOST_SET_VRING_BASE, &state);
+	if (r < 0)
+		die_perror("VHOST_SET_VRING_BASE failed");
+
+	addr = (struct vhost_vring_addr) {
+		.index = vq,
+		.desc_user_addr = (u64)(unsigned long)queue->vring.desc,
+		.avail_user_addr = (u64)(unsigned long)queue->vring.avail,
+		.used_user_addr = (u64)(unsigned long)queue->vring.used,
+	};
+
+	r = ioctl(ndev->vhost_fd, VHOST_SET_VRING_ADDR, &addr);
+	if (r < 0)
+		die_perror("VHOST_SET_VRING_ADDR failed");
+
+	return 0;
+}
+
+static void exit_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+	struct net_dev *ndev = dev;
+	struct net_dev_queue *queue = &ndev->queues[vq];
+
+	if (!is_ctrl_vq(ndev, vq) && queue->gsi) {
+		irq__del_irqfd(kvm, queue->gsi, queue->irqfd);
+		close(queue->irqfd);
+		queue->gsi = queue->irqfd = 0;
+	}
+
+	/*
+	 * TODO: vhost reset owner. It's the only way to cleanly stop vhost, but
+	 * we can't restart it at the moment.
+	 */
+	if (ndev->vhost_fd && !is_ctrl_vq(ndev, vq)) {
+		pr_warning("Cannot reset VHOST queue");
+		ioctl(ndev->vhost_fd, VHOST_RESET_OWNER);
+		return;
+	}
+
+	/*
+	 * Threads are waiting on cancellation points (readv or
+	 * pthread_cond_wait) and should stop gracefully.
+	 */
+	pthread_cancel(queue->thread);
+	pthread_join(queue->thread, NULL);
+}
+
+static void notify_vq_gsi(struct kvm *kvm, void *dev, u32 vq, u32 gsi)
+{
+	struct net_dev *ndev = dev;
+	struct net_dev_queue *queue = &ndev->queues[vq];
+	struct vhost_vring_file file;
+	int r;
+
+	if (ndev->vhost_fd == 0)
+		return;
+
+	file = (struct vhost_vring_file) {
+		.index	= vq,
+		.fd	= eventfd(0, 0),
+	};
+
+	r = irq__add_irqfd(kvm, gsi, file.fd, -1);
+	if (r < 0)
+		die_perror("KVM_IRQFD failed");
+
+	queue->irqfd = file.fd;
+	queue->gsi = gsi;
+
+	r = ioctl(ndev->vhost_fd, VHOST_SET_VRING_CALL, &file);
+	if (r < 0)
+		die_perror("VHOST_SET_VRING_CALL failed");
+	file.fd = ndev->tap_fd;
+	r = ioctl(ndev->vhost_fd, VHOST_NET_SET_BACKEND, &file);
+	if (r != 0)
+		die("VHOST_NET_SET_BACKEND failed %d", errno);
+}
+
+static void notify_vq_eventfd(struct kvm *kvm, void *dev, u32 vq, u32 efd)
+{
+	struct net_dev *ndev = dev;
+	struct vhost_vring_file file = {
+		.index	= vq,
+		.fd	= efd,
+	};
+	int r;
+
+	if (ndev->vhost_fd == 0 || is_ctrl_vq(ndev, vq))
+		return;
+
+	r = ioctl(ndev->vhost_fd, VHOST_SET_VRING_KICK, &file);
+	if (r < 0)
+		die_perror("VHOST_SET_VRING_KICK failed");
+}
+
+static int notify_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+	struct net_dev *ndev = dev;
+
+	virtio_net_handle_callback(kvm, ndev, vq);
+
+	return 0;
+}
+
+static struct virt_queue *get_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+	struct net_dev *ndev = dev;
+
+	return &ndev->queues[vq].vq;
+}
+
+static int get_size_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+	/* FIXME: dynamic */
+	return VIRTIO_NET_QUEUE_SIZE;
+}
+
+static int set_size_vq(struct kvm *kvm, void *dev, u32 vq, int size)
+{
+	/* FIXME: dynamic */
+	return size;
+}
+
+static int get_vq_count(struct kvm *kvm, void *dev)
+{
+	struct net_dev *ndev = dev;
+
+	return ndev->queue_pairs * 2 + 1;
+}
+
+static struct virtio_ops net_dev_virtio_ops = {
+	.get_config		= get_config,
+	.get_host_features	= get_host_features,
+	.set_guest_features	= set_guest_features,
+	.get_vq_count		= get_vq_count,
+	.init_vq		= init_vq,
+	.exit_vq		= exit_vq,
+	.get_vq			= get_vq,
+	.get_size_vq		= get_size_vq,
+	.set_size_vq		= set_size_vq,
+	.notify_vq		= notify_vq,
+	.notify_vq_gsi		= notify_vq_gsi,
+	.notify_vq_eventfd	= notify_vq_eventfd,
+	.notify_status		= notify_status,
+};
+
+static void virtio_net__vhost_init(struct kvm *kvm, struct net_dev *ndev)
+{
+	struct kvm_mem_bank *bank;
+	struct vhost_memory *mem;
+	int r, i;
+
+	ndev->vhost_fd = open("/dev/vhost-net", O_RDWR);
+	if (ndev->vhost_fd < 0)
+		die_perror("Failed openning vhost-net device");
+
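+	/*
+	 * Hand the guest memory layout to vhost so the kernel can translate
+	 * guest physical ring addresses by itself.
+	 */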
+	mem = calloc(1, sizeof(*mem) + kvm->mem_slots * sizeof(struct vhost_memory_region));
+	if (mem == NULL)
+		die("Failed allocating memory for vhost memory map");
+
+	i = 0;
+	list_for_each_entry(bank, &kvm->mem_banks, list) {
+		mem->regions[i] = (struct vhost_memory_region) {
+			.guest_phys_addr = bank->guest_phys_addr,
+			.memory_size	 = bank->size,
+			.userspace_addr	 = (unsigned long)bank->host_addr,
+		};
+		i++;
+	}
+	mem->nregions = i;
+
+	r = ioctl(ndev->vhost_fd, VHOST_SET_OWNER);
+	if (r != 0)
+		die_perror("VHOST_SET_OWNER failed");
+
+	r = ioctl(ndev->vhost_fd, VHOST_SET_MEM_TABLE, mem);
+	if (r != 0)
+		die_perror("VHOST_SET_MEM_TABLE failed");
+
+	ndev->vdev.use_vhost = true;
+
+	free(mem);
+}
+
+static inline void str_to_mac(const char *str, char *mac)
+{
+	sscanf(str, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx",
+		mac, mac+1, mac+2, mac+3, mac+4, mac+5);
+}
+static int set_net_param(struct kvm *kvm, struct virtio_net_params *p,
+			const char *param, const char *val)
+{
+	if (strcmp(param, "guest_mac") == 0) {
+		str_to_mac(val, p->guest_mac);
+	} else if (strcmp(param, "mode") == 0) {
+		if (!strncmp(val, "user", 4)) {
+			int i;
+
+			for (i = 0; i < kvm->cfg.num_net_devices; i++)
+				if (kvm->cfg.net_params[i].mode == NET_MODE_USER)
+					die("Only one usermode network device allowed at a time");
+			p->mode = NET_MODE_USER;
+		} else if (!strncmp(val, "tap", 3)) {
+			p->mode = NET_MODE_TAP;
+		} else if (!strncmp(val, "none", 4)) {
+			kvm->cfg.no_net = 1;
+			return -1;
+		} else
+			die("Unknown network mode %s, please use user, tap or none", kvm->cfg.network);
+	} else if (strcmp(param, "script") == 0) {
+		p->script = strdup(val);
+	} else if (strcmp(param, "downscript") == 0) {
+		p->downscript = strdup(val);
+	} else if (strcmp(param, "guest_ip") == 0) {
+		p->guest_ip = strdup(val);
+	} else if (strcmp(param, "host_ip") == 0) {
+		p->host_ip = strdup(val);
+	} else if (strcmp(param, "trans") == 0) {
+		p->trans = strdup(val);
+	} else if (strcmp(param, "tapif") == 0) {
+		p->tapif = strdup(val);
+	} else if (strcmp(param, "vhost") == 0) {
+		p->vhost = atoi(val);
+	} else if (strcmp(param, "fd") == 0) {
+		p->fd = atoi(val);
+	} else if (strcmp(param, "mq") == 0) {
+		p->mq = atoi(val);
+	} else
+		die("Unknown network parameter %s", param);
+
+	return 0;
+}
+
+int netdev_parser(const struct option *opt, const char *arg, int unset)
+{
+	struct virtio_net_params p;
+	char *buf = NULL, *cmd = NULL, *cur = NULL;
+	bool on_cmd = true;
+	struct kvm *kvm = opt->ptr;
+
+	if (arg) {
+		buf = strdup(arg);
+		if (buf == NULL)
+			die("Failed allocating new net buffer");
+		cur = strtok(buf, ",=");
+	}
+
+	p = (struct virtio_net_params) {
+		.guest_ip	= DEFAULT_GUEST_ADDR,
+		.host_ip	= DEFAULT_HOST_ADDR,
+		.script		= DEFAULT_SCRIPT,
+		.downscript	= DEFAULT_SCRIPT,
+		.mode		= NET_MODE_TAP,
+	};
+
+	str_to_mac(DEFAULT_GUEST_MAC, p.guest_mac);
+	p.guest_mac[5] += kvm->cfg.num_net_devices;
+
+	while (cur) {
+		if (on_cmd) {
+			cmd = cur;
+		} else {
+			if (set_net_param(kvm, &p, cmd, cur) < 0)
+				goto done;
+		}
+		on_cmd = !on_cmd;
+
+		cur = strtok(NULL, ",=");
+	}
+
+	kvm->cfg.num_net_devices++;
+
+	kvm->cfg.net_params = realloc(kvm->cfg.net_params, kvm->cfg.num_net_devices * sizeof(*kvm->cfg.net_params));
+	if (kvm->cfg.net_params == NULL)
+		die("Failed adding new network device");
+
+	kvm->cfg.net_params[kvm->cfg.num_net_devices - 1] = p;
+
+done:
+	free(buf);
+	return 0;
+}
+
+static int virtio_net__init_one(struct virtio_net_params *params)
+{
+	int i, r;
+	struct net_dev *ndev;
+	struct virtio_ops *ops;
+	enum virtio_trans trans = VIRTIO_DEFAULT_TRANS(params->kvm);
+
+	ndev = calloc(1, sizeof(struct net_dev));
+	if (ndev == NULL)
+		return -ENOMEM;
+
+	list_add_tail(&ndev->list, &ndevs);
+
+	ops = malloc(sizeof(*ops));
+	if (ops == NULL)
+		return -ENOMEM;
+
+	ndev->kvm = params->kvm;
+	ndev->params = params;
+
+	mutex_init(&ndev->mutex);
+	ndev->queue_pairs = max(1, min(VIRTIO_NET_NUM_QUEUES, params->mq));
+	ndev->config.status = VIRTIO_NET_S_LINK_UP;
+	if (ndev->queue_pairs > 1)
+		ndev->config.max_virtqueue_pairs = ndev->queue_pairs;
+
+	for (i = 0 ; i < 6 ; i++) {
+		ndev->config.mac[i]		= params->guest_mac[i];
+		ndev->info.guest_mac.addr[i]	= params->guest_mac[i];
+		ndev->info.host_mac.addr[i]	= params->host_mac[i];
+	}
+
+	ndev->mode = params->mode;
+	if (ndev->mode == NET_MODE_TAP) {
+		ndev->ops = &tap_ops;
+		if (!virtio_net__tap_create(ndev))
+			die_perror("You have requested a TAP device, but creation of one has failed because");
+	} else {
+		ndev->info.host_ip		= ntohl(inet_addr(params->host_ip));
+		ndev->info.guest_ip		= ntohl(inet_addr(params->guest_ip));
+		ndev->info.guest_netmask	= ntohl(inet_addr("255.255.255.0"));
+		ndev->info.buf_nr		= 20,
+		ndev->ops = &uip_ops;
+		uip_static_init(&ndev->info);
+	}
+
+	*ops = net_dev_virtio_ops;
+
+	if (params->trans) {
+		if (strcmp(params->trans, "mmio") == 0)
+			trans = VIRTIO_MMIO;
+		else if (strcmp(params->trans, "pci") == 0)
+			trans = VIRTIO_PCI;
+		else
+			pr_warning("virtio-net: Unknown transport method : %s, "
+				   "falling back to %s.", params->trans,
+				   virtio_trans_name(trans));
+	}
+
+	r = virtio_init(params->kvm, ndev, &ndev->vdev, ops, trans,
+			PCI_DEVICE_ID_VIRTIO_NET, VIRTIO_ID_NET, PCI_CLASS_NET);
+	if (r < 0) {
+		free(ops);
+		return r;
+	}
+
+	if (params->vhost)
+		virtio_net__vhost_init(params->kvm, ndev);
+
+	if (compat_id == -1)
+		compat_id = virtio_compat_add_message("virtio-net", "CONFIG_VIRTIO_NET");
+
+	return 0;
+}
+
+int virtio_net__init(struct kvm *kvm)
+{
+	int i, r;
+
+	for (i = 0; i < kvm->cfg.num_net_devices; i++) {
+		kvm->cfg.net_params[i].kvm = kvm;
+		r = virtio_net__init_one(&kvm->cfg.net_params[i]);
+		if (r < 0)
+			goto cleanup;
+	}
+
+	if (kvm->cfg.num_net_devices == 0 && kvm->cfg.no_net == 0) {
+		static struct virtio_net_params net_params;
+
+		net_params = (struct virtio_net_params) {
+			.guest_ip	= kvm->cfg.guest_ip,
+			.host_ip	= kvm->cfg.host_ip,
+			.kvm		= kvm,
+			.script		= kvm->cfg.script,
+			.mode		= NET_MODE_USER,
+		};
+		str_to_mac(kvm->cfg.guest_mac, net_params.guest_mac);
+		str_to_mac(kvm->cfg.host_mac, net_params.host_mac);
+
+		r = virtio_net__init_one(&net_params);
+		if (r < 0)
+			goto cleanup;
+	}
+
+	return 0;
+
+cleanup:
+	virtio_net__exit(kvm);
+	return r;
+}
+virtio_dev_init(virtio_net__init);
+
+int virtio_net__exit(struct kvm *kvm)
+{
+	struct virtio_net_params *params;
+	struct net_dev *ndev;
+	struct list_head *ptr, *n;
+
+	list_for_each_safe(ptr, n, &ndevs) {
+		ndev = list_entry(ptr, struct net_dev, list);
+		params = ndev->params;
+		/* Clean up any tap device that was attached to a bridge */
+		if (ndev->mode == NET_MODE_TAP &&
+		    strcmp(params->downscript, "none"))
+			virtio_net_exec_script(params->downscript, ndev->tap_name);
+
+		list_del(&ndev->list);
+		free(ndev);
+	}
+	return 0;
+}
+virtio_dev_exit(virtio_net__exit);
diff --git a/kvmtool/virtio/pci.c b/kvmtool/virtio/pci.c
new file mode 100644
index 0000000..6eea6c6
--- /dev/null
+++ b/kvmtool/virtio/pci.c
@@ -0,0 +1,631 @@
+#include "kvm/virtio-pci.h"
+
+#include "kvm/ioport.h"
+#include "kvm/kvm.h"
+#include "kvm/kvm-cpu.h"
+#include "kvm/virtio-pci-dev.h"
+#include "kvm/irq.h"
+#include "kvm/virtio.h"
+#include "kvm/ioeventfd.h"
+
+#include <sys/ioctl.h>
+#include <linux/virtio_pci.h>
+#include <linux/byteorder.h>
+#include <assert.h>
+#include <string.h>
+
+static u16 virtio_pci__port_addr(struct virtio_pci *vpci)
+{
+	return pci__bar_address(&vpci->pci_hdr, 0);
+}
+
+static u32 virtio_pci__mmio_addr(struct virtio_pci *vpci)
+{
+	return pci__bar_address(&vpci->pci_hdr, 1);
+}
+
+static u32 virtio_pci__msix_io_addr(struct virtio_pci *vpci)
+{
+	return pci__bar_address(&vpci->pci_hdr, 2);
+}
+
+static void virtio_pci__ioevent_callback(struct kvm *kvm, void *param)
+{
+	struct virtio_pci_ioevent_param *ioeventfd = param;
+	struct virtio_pci *vpci = ioeventfd->vdev->virtio;
+
+	ioeventfd->vdev->ops->notify_vq(kvm, vpci->dev, ioeventfd->vq);
+}
+
+static int virtio_pci__init_ioeventfd(struct kvm *kvm, struct virtio_device *vdev, u32 vq)
+{
+	struct ioevent ioevent;
+	struct virtio_pci *vpci = vdev->virtio;
+	u32 mmio_addr = virtio_pci__mmio_addr(vpci);
+	u16 port_addr = virtio_pci__port_addr(vpci);
+	int r, flags = 0;
+	int fd;
+
+	vpci->ioeventfds[vq] = (struct virtio_pci_ioevent_param) {
+		.vdev		= vdev,
+		.vq		= vq,
+	};
+
+	ioevent = (struct ioevent) {
+		.fn		= virtio_pci__ioevent_callback,
+		.fn_ptr		= &vpci->ioeventfds[vq],
+		.datamatch	= vq,
+		.fn_kvm		= kvm,
+	};
+
+	/*
+	 * Vhost polls the eventfd on the host kernel side; otherwise we
+	 * need to poll in userspace.
+	 */
+	if (!vdev->use_vhost)
+		flags |= IOEVENTFD_FLAG_USER_POLL;
+
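+	/* The queue can be kicked through BAR0 (PIO) or BAR1 (MMIO); watch both. */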
+	/* ioport */
+	ioevent.io_addr	= port_addr + VIRTIO_PCI_QUEUE_NOTIFY;
+	ioevent.io_len	= sizeof(u16);
+	ioevent.fd	= fd = eventfd(0, 0);
+	r = ioeventfd__add_event(&ioevent, flags | IOEVENTFD_FLAG_PIO);
+	if (r)
+		return r;
+
+	/* mmio */
+	ioevent.io_addr	= mmio_addr + VIRTIO_PCI_QUEUE_NOTIFY;
+	ioevent.io_len	= sizeof(u16);
+	ioevent.fd	= eventfd(0, 0);
+	r = ioeventfd__add_event(&ioevent, flags);
+	if (r)
+		goto free_ioport_evt;
+
+	if (vdev->ops->notify_vq_eventfd)
+		vdev->ops->notify_vq_eventfd(kvm, vpci->dev, vq, fd);
+	return 0;
+
+free_ioport_evt:
+	ioeventfd__del_event(port_addr + VIRTIO_PCI_QUEUE_NOTIFY, vq);
+	return r;
+}
+
+static void virtio_pci_exit_vq(struct kvm *kvm, struct virtio_device *vdev,
+			       int vq)
+{
+	struct virtio_pci *vpci = vdev->virtio;
+	u32 mmio_addr = virtio_pci__mmio_addr(vpci);
+	u16 port_addr = virtio_pci__port_addr(vpci);
+
+	ioeventfd__del_event(mmio_addr + VIRTIO_PCI_QUEUE_NOTIFY, vq);
+	ioeventfd__del_event(port_addr + VIRTIO_PCI_QUEUE_NOTIFY, vq);
+	virtio_exit_vq(kvm, vdev, vpci->dev, vq);
+}
+
+static inline bool virtio_pci__msix_enabled(struct virtio_pci *vpci)
+{
+	return vpci->pci_hdr.msix.ctrl & cpu_to_le16(PCI_MSIX_FLAGS_ENABLE);
+}
+
+static bool virtio_pci__specific_data_in(struct kvm *kvm, struct virtio_device *vdev,
+					 void *data, int size, unsigned long offset)
+{
+	u32 config_offset;
+	struct virtio_pci *vpci = vdev->virtio;
+	int type = virtio__get_dev_specific_field(offset - 20,
+							virtio_pci__msix_enabled(vpci),
+							&config_offset);
+	if (type == VIRTIO_PCI_O_MSIX) {
+		switch (offset) {
+		case VIRTIO_MSI_CONFIG_VECTOR:
+			ioport__write16(data, vpci->config_vector);
+			break;
+		case VIRTIO_MSI_QUEUE_VECTOR:
+			ioport__write16(data, vpci->vq_vector[vpci->queue_selector]);
+			break;
+		}
+
+		return true;
+	} else if (type == VIRTIO_PCI_O_CONFIG) {
+		u8 cfg;
+
+		cfg = vdev->ops->get_config(kvm, vpci->dev)[config_offset];
+		ioport__write8(data, cfg);
+		return true;
+	}
+
+	return false;
+}
+
+static bool virtio_pci__data_in(struct kvm_cpu *vcpu, struct virtio_device *vdev,
+				unsigned long offset, void *data, int size)
+{
+	bool ret = true;
+	struct virtio_pci *vpci;
+	struct virt_queue *vq;
+	struct kvm *kvm;
+	u32 val;
+
+	kvm = vcpu->kvm;
+	vpci = vdev->virtio;
+
+	switch (offset) {
+	case VIRTIO_PCI_HOST_FEATURES:
+		val = vdev->ops->get_host_features(kvm, vpci->dev);
+		ioport__write32(data, val);
+		break;
+	case VIRTIO_PCI_QUEUE_PFN:
+		vq = vdev->ops->get_vq(kvm, vpci->dev, vpci->queue_selector);
+		ioport__write32(data, vq->pfn);
+		break;
+	case VIRTIO_PCI_QUEUE_NUM:
+		val = vdev->ops->get_size_vq(kvm, vpci->dev, vpci->queue_selector);
+		ioport__write16(data, val);
+		break;
+	case VIRTIO_PCI_STATUS:
+		ioport__write8(data, vpci->status);
+		break;
+	case VIRTIO_PCI_ISR:
+		ioport__write8(data, vpci->isr);
+		kvm__irq_line(kvm, vpci->legacy_irq_line, VIRTIO_IRQ_LOW);
+		vpci->isr = VIRTIO_IRQ_LOW;
+		break;
+	default:
+		ret = virtio_pci__specific_data_in(kvm, vdev, data, size, offset);
+		break;
+	}
+
+	return ret;
+}
+
+static bool virtio_pci__io_in(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port, void *data, int size)
+{
+	struct virtio_device *vdev = ioport->priv;
+	struct virtio_pci *vpci = vdev->virtio;
+	unsigned long offset = port - virtio_pci__port_addr(vpci);
+
+	return virtio_pci__data_in(vcpu, vdev, offset, data, size);
+}
+
+static void update_msix_map(struct virtio_pci *vpci,
+			    struct msix_table *msix_entry, u32 vecnum)
+{
+	u32 gsi, i;
+
+	/* Find the GSI number used for that vector */
+	if (vecnum == vpci->config_vector) {
+		gsi = vpci->config_gsi;
+	} else {
+		for (i = 0; i < VIRTIO_PCI_MAX_VQ; i++)
+			if (vpci->vq_vector[i] == vecnum)
+				break;
+		if (i == VIRTIO_PCI_MAX_VQ)
+			return;
+		gsi = vpci->gsis[i];
+	}
+
+	if (gsi == 0)
+		return;
+
+	msix_entry = &msix_entry[vecnum];
+	irq__update_msix_route(vpci->kvm, gsi, &msix_entry->msg);
+}
+
+static bool virtio_pci__specific_data_out(struct kvm *kvm, struct virtio_device *vdev,
+					  void *data, int size, unsigned long offset)
+{
+	struct virtio_pci *vpci = vdev->virtio;
+	u32 config_offset, vec;
+	int gsi;
+	int type = virtio__get_dev_specific_field(offset - 20, virtio_pci__msix_enabled(vpci),
+							&config_offset);
+	if (type == VIRTIO_PCI_O_MSIX) {
+		switch (offset) {
+		case VIRTIO_MSI_CONFIG_VECTOR:
+			vec = vpci->config_vector = ioport__read16(data);
+			if (vec == VIRTIO_MSI_NO_VECTOR)
+				break;
+
+			gsi = irq__add_msix_route(kvm,
+						  &vpci->msix_table[vec].msg,
+						  vpci->dev_hdr.dev_num << 3);
+			/*
+			 * We don't need IRQ routing if we can use
+			 * MSI injection via the KVM_SIGNAL_MSI ioctl.
+			 */
+			if (gsi == -ENXIO &&
+			    vpci->features & VIRTIO_PCI_F_SIGNAL_MSI)
+				break;
+
+			if (gsi < 0) {
+				die("failed to configure MSIs");
+				break;
+			}
+
+			vpci->config_gsi = gsi;
+			break;
+		case VIRTIO_MSI_QUEUE_VECTOR:
+			vec = ioport__read16(data);
+			vpci->vq_vector[vpci->queue_selector] = vec;
+
+			if (vec == VIRTIO_MSI_NO_VECTOR)
+				break;
+
+			gsi = irq__add_msix_route(kvm,
+						  &vpci->msix_table[vec].msg,
+						  vpci->dev_hdr.dev_num << 3);
+			/*
+			 * We don't need IRQ routing if we can use
+			 * MSI injection via the KVM_SIGNAL_MSI ioctl.
+			 */
+			if (gsi == -ENXIO &&
+			    vpci->features & VIRTIO_PCI_F_SIGNAL_MSI)
+				break;
+
+			if (gsi < 0) {
+				die("failed to configure MSIs");
+				break;
+			}
+
+			vpci->gsis[vpci->queue_selector] = gsi;
+			if (vdev->ops->notify_vq_gsi)
+				vdev->ops->notify_vq_gsi(kvm, vpci->dev,
+							 vpci->queue_selector,
+							 gsi);
+			break;
+		}
+
+		return true;
+	} else if (type == VIRTIO_PCI_O_CONFIG) {
+		vdev->ops->get_config(kvm, vpci->dev)[config_offset] = *(u8 *)data;
+
+		return true;
+	}
+
+	return false;
+}
+
+static bool virtio_pci__data_out(struct kvm_cpu *vcpu, struct virtio_device *vdev,
+				 unsigned long offset, void *data, int size)
+{
+	bool ret = true;
+	struct virtio_pci *vpci;
+	struct kvm *kvm;
+	u32 val;
+
+	kvm = vcpu->kvm;
+	vpci = vdev->virtio;
+
+	switch (offset) {
+	case VIRTIO_PCI_GUEST_FEATURES:
+		val = ioport__read32(data);
+		virtio_set_guest_features(kvm, vdev, vpci->dev, val);
+		break;
+	case VIRTIO_PCI_QUEUE_PFN:
+		val = ioport__read32(data);
+		if (val) {
+			virtio_pci__init_ioeventfd(kvm, vdev,
+						   vpci->queue_selector);
+			vdev->ops->init_vq(kvm, vpci->dev, vpci->queue_selector,
+					   1 << VIRTIO_PCI_QUEUE_ADDR_SHIFT,
+					   VIRTIO_PCI_VRING_ALIGN, val);
+		} else {
+			virtio_pci_exit_vq(kvm, vdev, vpci->queue_selector);
+		}
+		break;
+	case VIRTIO_PCI_QUEUE_SEL:
+		vpci->queue_selector = ioport__read16(data);
+		break;
+	case VIRTIO_PCI_QUEUE_NOTIFY:
+		val = ioport__read16(data);
+		vdev->ops->notify_vq(kvm, vpci->dev, val);
+		break;
+	case VIRTIO_PCI_STATUS:
+		vpci->status = ioport__read8(data);
+		if (!vpci->status) /* Sample endianness on reset */
+			vdev->endian = kvm_cpu__get_endianness(vcpu);
+		virtio_notify_status(kvm, vdev, vpci->dev, vpci->status);
+		break;
+	default:
+		ret = virtio_pci__specific_data_out(kvm, vdev, data, size, offset);
+		break;
+	}
+
+	return ret;
+}
+
+static bool virtio_pci__io_out(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port, void *data, int size)
+{
+	struct virtio_device *vdev = ioport->priv;
+	struct virtio_pci *vpci = vdev->virtio;
+	unsigned long offset = port - virtio_pci__port_addr(vpci);
+
+	return virtio_pci__data_out(vcpu, vdev, offset, data, size);
+}
+
+static struct ioport_operations virtio_pci__io_ops = {
+	.io_in	= virtio_pci__io_in,
+	.io_out	= virtio_pci__io_out,
+};
+
+static void virtio_pci__msix_mmio_callback(struct kvm_cpu *vcpu,
+					   u64 addr, u8 *data, u32 len,
+					   u8 is_write, void *ptr)
+{
+	struct virtio_device *vdev = ptr;
+	struct virtio_pci *vpci = vdev->virtio;
+	struct msix_table *table;
+	u32 msix_io_addr = virtio_pci__msix_io_addr(vpci);
+	int vecnum;
+	size_t offset;
+
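+	/* The PBA is mapped PCI_IO_SIZE bytes above the MSI-X table in BAR2. */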
+	if (addr > msix_io_addr + PCI_IO_SIZE) {
+		if (is_write)
+			return;
+		table  = (struct msix_table *)&vpci->msix_pba;
+		offset = addr - (msix_io_addr + PCI_IO_SIZE);
+	} else {
+		table  = vpci->msix_table;
+		offset = addr - msix_io_addr;
+	}
+	vecnum = offset / sizeof(struct msix_table);
+	offset = offset % sizeof(struct msix_table);
+
+	if (!is_write) {
+		memcpy(data, (void *)&table[vecnum] + offset, len);
+		return;
+	}
+
+	memcpy((void *)&table[vecnum] + offset, data, len);
+
+	/* Did we just update the address or payload? */
+	if (offset < offsetof(struct msix_table, ctrl))
+		update_msix_map(vpci, table, vecnum);
+}
+
+static void virtio_pci__signal_msi(struct kvm *kvm, struct virtio_pci *vpci,
+				   int vec)
+{
+	struct kvm_msi msi = {
+		.address_lo = vpci->msix_table[vec].msg.address_lo,
+		.address_hi = vpci->msix_table[vec].msg.address_hi,
+		.data = vpci->msix_table[vec].msg.data,
+	};
+
+	if (kvm->msix_needs_devid) {
+		msi.flags = KVM_MSI_VALID_DEVID;
+		msi.devid = vpci->dev_hdr.dev_num << 3;
+	}
+
+	irq__signal_msi(kvm, &msi);
+}
+
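+/*
+ * If the target vector is masked -- per entry or via the function-wide
+ * MASKALL bit -- the interrupt is latched in the PBA instead of being
+ * delivered; otherwise it goes out as an MSI, or as the GSI routed for
+ * the queue when MSI signalling is unavailable.
+ */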
+int virtio_pci__signal_vq(struct kvm *kvm, struct virtio_device *vdev, u32 vq)
+{
+	struct virtio_pci *vpci = vdev->virtio;
+	int tbl = vpci->vq_vector[vq];
+
+	if (virtio_pci__msix_enabled(vpci) && tbl != VIRTIO_MSI_NO_VECTOR) {
+		if (vpci->pci_hdr.msix.ctrl & cpu_to_le16(PCI_MSIX_FLAGS_MASKALL) ||
+		    vpci->msix_table[tbl].ctrl & cpu_to_le16(PCI_MSIX_ENTRY_CTRL_MASKBIT)) {
+
+			vpci->msix_pba |= 1 << tbl;
+			return 0;
+		}
+
+		if (vpci->features & VIRTIO_PCI_F_SIGNAL_MSI)
+			virtio_pci__signal_msi(kvm, vpci, vpci->vq_vector[vq]);
+		else
+			kvm__irq_trigger(kvm, vpci->gsis[vq]);
+	} else {
+		vpci->isr = VIRTIO_IRQ_HIGH;
+		kvm__irq_trigger(kvm, vpci->legacy_irq_line);
+	}
+	return 0;
+}
+
+int virtio_pci__signal_config(struct kvm *kvm, struct virtio_device *vdev)
+{
+	struct virtio_pci *vpci = vdev->virtio;
+	int tbl = vpci->config_vector;
+
+	if (virtio_pci__msix_enabled(vpci) && tbl != VIRTIO_MSI_NO_VECTOR) {
+		if (vpci->pci_hdr.msix.ctrl & cpu_to_le16(PCI_MSIX_FLAGS_MASKALL) ||
+		    vpci->msix_table[tbl].ctrl & cpu_to_le16(PCI_MSIX_ENTRY_CTRL_MASKBIT)) {
+
+			vpci->msix_pba |= 1 << tbl;
+			return 0;
+		}
+
+		if (vpci->features & VIRTIO_PCI_F_SIGNAL_MSI)
+			virtio_pci__signal_msi(kvm, vpci, tbl);
+		else
+			kvm__irq_trigger(kvm, vpci->config_gsi);
+	} else {
+		vpci->isr = VIRTIO_PCI_ISR_CONFIG;
+		kvm__irq_trigger(kvm, vpci->legacy_irq_line);
+	}
+
+	return 0;
+}
+
+static void virtio_pci__io_mmio_callback(struct kvm_cpu *vcpu,
+					 u64 addr, u8 *data, u32 len,
+					 u8 is_write, void *ptr)
+{
+	struct virtio_device *vdev = ptr;
+	struct virtio_pci *vpci = vdev->virtio;
+	u32 mmio_addr = virtio_pci__mmio_addr(vpci);
+
+	if (!is_write)
+		virtio_pci__data_in(vcpu, vdev, addr - mmio_addr, data, len);
+	else
+		virtio_pci__data_out(vcpu, vdev, addr - mmio_addr, data, len);
+}
+
+static int virtio_pci__bar_activate(struct kvm *kvm,
+				    struct pci_device_header *pci_hdr,
+				    int bar_num, void *data)
+{
+	struct virtio_device *vdev = data;
+	u32 bar_addr, bar_size;
+	int r = -EINVAL;
+
+	assert(bar_num <= 2);
+
+	bar_addr = pci__bar_address(pci_hdr, bar_num);
+	bar_size = pci__bar_size(pci_hdr, bar_num);
+
+	switch (bar_num) {
+	case 0:
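+		/*
+		 * ioport__register() returns the registered port number on
+		 * success; only success matters here, so fold positive
+		 * return values into 0.
+		 */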
+		r = ioport__register(kvm, bar_addr, &virtio_pci__io_ops,
+				     bar_size, vdev);
+		if (r > 0)
+			r = 0;
+		break;
+	case 1:
+		r =  kvm__register_mmio(kvm, bar_addr, bar_size, false,
+					virtio_pci__io_mmio_callback, vdev);
+		break;
+	case 2:
+		r =  kvm__register_mmio(kvm, bar_addr, bar_size, false,
+					virtio_pci__msix_mmio_callback, vdev);
+		break;
+	}
+
+	return r;
+}
+
+static int virtio_pci__bar_deactivate(struct kvm *kvm,
+				      struct pci_device_header *pci_hdr,
+				      int bar_num, void *data)
+{
+	u32 bar_addr;
+	bool success;
+	int r = -EINVAL;
+
+	assert(bar_num <= 2);
+
+	bar_addr = pci__bar_address(pci_hdr, bar_num);
+
+	switch (bar_num) {
+	case 0:
+		r = ioport__unregister(kvm, bar_addr);
+		break;
+	case 1:
+	case 2:
+		success = kvm__deregister_mmio(kvm, bar_addr);
+		/* kvm__deregister_mmio fails when the region is not found. */
+		r = (success ? 0 : -ENOENT);
+		break;
+	}
+
+	return r;
+}
+
+int virtio_pci__init(struct kvm *kvm, void *dev, struct virtio_device *vdev,
+		     int device_id, int subsys_id, int class)
+{
+	struct virtio_pci *vpci = vdev->virtio;
+	u32 mmio_addr, msix_io_block;
+	u16 port_addr;
+	int r;
+
+	vpci->kvm = kvm;
+	vpci->dev = dev;
+
+	BUILD_BUG_ON(!is_power_of_two(PCI_IO_SIZE));
+
+	port_addr = pci_get_io_port_block(PCI_IO_SIZE);
+	mmio_addr = pci_get_mmio_block(PCI_IO_SIZE);
+	msix_io_block = pci_get_mmio_block(PCI_IO_SIZE * 2);
+
+	vpci->pci_hdr = (struct pci_device_header) {
+		.vendor_id		= cpu_to_le16(PCI_VENDOR_ID_REDHAT_QUMRANET),
+		.device_id		= cpu_to_le16(device_id),
+		.command		= PCI_COMMAND_IO | PCI_COMMAND_MEMORY,
+		.header_type		= PCI_HEADER_TYPE_NORMAL,
+		.revision_id		= 0,
+		.class[0]		= class & 0xff,
+		.class[1]		= (class >> 8) & 0xff,
+		.class[2]		= (class >> 16) & 0xff,
+		.subsys_vendor_id	= cpu_to_le16(PCI_SUBSYSTEM_VENDOR_ID_REDHAT_QUMRANET),
+		.subsys_id		= cpu_to_le16(subsys_id),
+		.bar[0]			= cpu_to_le32(port_addr
+							| PCI_BASE_ADDRESS_SPACE_IO),
+		.bar[1]			= cpu_to_le32(mmio_addr
+							| PCI_BASE_ADDRESS_SPACE_MEMORY),
+		.bar[2]			= cpu_to_le32(msix_io_block
+							| PCI_BASE_ADDRESS_SPACE_MEMORY),
+		.status			= cpu_to_le16(PCI_STATUS_CAP_LIST),
+		.capabilities		= (void *)&vpci->pci_hdr.msix - (void *)&vpci->pci_hdr,
+		.bar_size[0]		= cpu_to_le32(PCI_IO_SIZE),
+		.bar_size[1]		= cpu_to_le32(PCI_IO_SIZE),
+		.bar_size[2]		= cpu_to_le32(PCI_IO_SIZE*2),
+	};
+
+	r = pci__register_bar_regions(kvm, &vpci->pci_hdr,
+				      virtio_pci__bar_activate,
+				      virtio_pci__bar_deactivate, vdev);
+	if (r < 0)
+		return r;
+
+	vpci->dev_hdr = (struct device_header) {
+		.bus_type		= DEVICE_BUS_PCI,
+		.data			= &vpci->pci_hdr,
+	};
+
+	vpci->pci_hdr.msix.cap = PCI_CAP_ID_MSIX;
+	vpci->pci_hdr.msix.next = 0;
+	/*
+	 * We at most have VIRTIO_PCI_MAX_VQ entries for virt queue,
+	 * VIRTIO_PCI_MAX_CONFIG entries for config.
+	 *
+	 * To quote the PCI spec:
+	 *
+	 * System software reads this field to determine the
+	 * MSI-X Table Size N, which is encoded as N-1.
+	 * For example, a returned value of "00000000011"
+	 * indicates a table size of 4.
+	 */
+	vpci->pci_hdr.msix.ctrl = cpu_to_le16(VIRTIO_PCI_MAX_VQ + VIRTIO_PCI_MAX_CONFIG - 1);
+
+	/* Both table and PBA are mapped to the same BAR (2) */
+	vpci->pci_hdr.msix.table_offset = cpu_to_le32(2);
+	vpci->pci_hdr.msix.pba_offset = cpu_to_le32(2 | PCI_IO_SIZE);
+	vpci->config_vector = 0;
+
+	if (irq__can_signal_msi(kvm))
+		vpci->features |= VIRTIO_PCI_F_SIGNAL_MSI;
+
+	vpci->legacy_irq_line = pci__assign_irq(&vpci->pci_hdr);
+
+	r = device__register(&vpci->dev_hdr);
+	if (r < 0)
+		return r;
+
+	return 0;
+}
+
+int virtio_pci__reset(struct kvm *kvm, struct virtio_device *vdev)
+{
+	int vq;
+	struct virtio_pci *vpci = vdev->virtio;
+
+	for (vq = 0; vq < vdev->ops->get_vq_count(kvm, vpci->dev); vq++)
+		virtio_pci_exit_vq(kvm, vdev, vq);
+
+	return 0;
+}
+
+int virtio_pci__exit(struct kvm *kvm, struct virtio_device *vdev)
+{
+	struct virtio_pci *vpci = vdev->virtio;
+
+	virtio_pci__reset(kvm, vdev);
+	kvm__deregister_mmio(kvm, virtio_pci__mmio_addr(vpci));
+	kvm__deregister_mmio(kvm, virtio_pci__msix_io_addr(vpci));
+	ioport__unregister(kvm, virtio_pci__port_addr(vpci));
+
+	return 0;
+}
diff --git a/kvmtool/virtio/rng.c b/kvmtool/virtio/rng.c
new file mode 100644
index 0000000..78eaa64
--- /dev/null
+++ b/kvmtool/virtio/rng.c
@@ -0,0 +1,211 @@
+#include "kvm/virtio-rng.h"
+
+#include "kvm/virtio-pci-dev.h"
+
+#include "kvm/virtio.h"
+#include "kvm/util.h"
+#include "kvm/kvm.h"
+#include "kvm/threadpool.h"
+#include "kvm/guest_compat.h"
+
+#include <linux/virtio_ring.h>
+#include <linux/virtio_rng.h>
+
+#include <linux/list.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <pthread.h>
+#include <linux/kernel.h>
+
+#define NUM_VIRT_QUEUES		1
+#define VIRTIO_RNG_QUEUE_SIZE	128
+
+struct rng_dev_job {
+	struct virt_queue	*vq;
+	struct rng_dev		*rdev;
+	struct thread_pool__job	job_id;
+};
+
+struct rng_dev {
+	struct list_head	list;
+	struct virtio_device	vdev;
+
+	int			fd;
+
+	/* virtio queue */
+	struct virt_queue	vqs[NUM_VIRT_QUEUES];
+	struct rng_dev_job	jobs[NUM_VIRT_QUEUES];
+};
+
+static LIST_HEAD(rdevs);
+static int compat_id = -1;
+
+static u8 *get_config(struct kvm *kvm, void *dev)
+{
+	/* Unused */
+	return NULL;
+}
+
+static u32 get_host_features(struct kvm *kvm, void *dev)
+{
+	/* Unused */
+	return 0;
+}
+
+static void set_guest_features(struct kvm *kvm, void *dev, u32 features)
+{
+	/* Unused */
+}
+
+static bool virtio_rng_do_io_request(struct kvm *kvm, struct rng_dev *rdev, struct virt_queue *queue)
+{
+	struct iovec iov[VIRTIO_RNG_QUEUE_SIZE];
+	ssize_t len = 0;
+	u16 out, in, head;
+
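+	/*
+	 * An rng request carries only guest-writable buffers, so every
+	 * iovec returned here is an 'in' buffer to be filled from the
+	 * host's entropy source.
+	 */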
+	head	= virt_queue__get_iov(queue, iov, &out, &in, kvm);
+	len	= readv(rdev->fd, iov, in);
+	if (len < 0 && errno == EAGAIN)
+		len = 0;
+
+	virt_queue__set_used_elem(queue, head, len);
+
+	return true;
+}
+
+static void virtio_rng_do_io(struct kvm *kvm, void *param)
+{
+	struct rng_dev_job *job	= param;
+	struct virt_queue *vq	= job->vq;
+	struct rng_dev *rdev	= job->rdev;
+
+	while (virt_queue__available(vq))
+		virtio_rng_do_io_request(kvm, rdev, vq);
+
+	rdev->vdev.ops->signal_vq(kvm, &rdev->vdev, vq - rdev->vqs);
+}
+
+static int init_vq(struct kvm *kvm, void *dev, u32 vq, u32 page_size, u32 align,
+		   u32 pfn)
+{
+	struct rng_dev *rdev = dev;
+	struct virt_queue *queue;
+	struct rng_dev_job *job;
+	void *p;
+
+	compat__remove_message(compat_id);
+
+	queue		= &rdev->vqs[vq];
+	queue->pfn	= pfn;
+	p		= virtio_get_vq(kvm, queue->pfn, page_size);
+
+	job = &rdev->jobs[vq];
+
+	vring_init(&queue->vring, VIRTIO_RNG_QUEUE_SIZE, p, align);
+	virtio_init_device_vq(&rdev->vdev, queue);
+
+	*job = (struct rng_dev_job) {
+		.vq	= queue,
+		.rdev	= rdev,
+	};
+
+	thread_pool__init_job(&job->job_id, kvm, virtio_rng_do_io, job);
+
+	return 0;
+}
+
+static int notify_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+	struct rng_dev *rdev = dev;
+
+	thread_pool__do_job(&rdev->jobs[vq].job_id);
+
+	return 0;
+}
+
+static struct virt_queue *get_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+	struct rng_dev *rdev = dev;
+
+	return &rdev->vqs[vq];
+}
+
+static int get_size_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+	return VIRTIO_RNG_QUEUE_SIZE;
+}
+
+static int set_size_vq(struct kvm *kvm, void *dev, u32 vq, int size)
+{
+	/* FIXME: dynamic */
+	return size;
+}
+
+static int get_vq_count(struct kvm *kvm, void *dev)
+{
+	return NUM_VIRT_QUEUES;
+}
+
+static struct virtio_ops rng_dev_virtio_ops = {
+	.get_config		= get_config,
+	.get_host_features	= get_host_features,
+	.set_guest_features	= set_guest_features,
+	.init_vq		= init_vq,
+	.notify_vq		= notify_vq,
+	.get_vq			= get_vq,
+	.get_size_vq		= get_size_vq,
+	.set_size_vq		= set_size_vq,
+	.get_vq_count		= get_vq_count,
+};
+
+int virtio_rng__init(struct kvm *kvm)
+{
+	struct rng_dev *rdev;
+	int r;
+
+	if (!kvm->cfg.virtio_rng)
+		return 0;
+
+	rdev = malloc(sizeof(*rdev));
+	if (rdev == NULL)
+		return -ENOMEM;
+
+	rdev->fd = open("/dev/random", O_RDONLY | O_NONBLOCK);
+	if (rdev->fd < 0) {
+		r = -errno;
+		free(rdev);
+		return r;
+	}
+
+	r = virtio_init(kvm, rdev, &rdev->vdev, &rng_dev_virtio_ops,
+			VIRTIO_DEFAULT_TRANS(kvm), PCI_DEVICE_ID_VIRTIO_RNG,
+			VIRTIO_ID_RNG, PCI_CLASS_RNG);
+	if (r < 0)
+		goto cleanup;
+
+	list_add_tail(&rdev->list, &rdevs);
+
+	if (compat_id == -1)
+		compat_id = virtio_compat_add_message("virtio-rng", "CONFIG_HW_RANDOM_VIRTIO");
+	return 0;
+cleanup:
+	close(rdev->fd);
+	free(rdev);
+
+	return r;
+}
+virtio_dev_init(virtio_rng__init);
+
+int virtio_rng__exit(struct kvm *kvm)
+{
+	struct rng_dev *rdev, *tmp;
+
+	list_for_each_entry_safe(rdev, tmp, &rdevs, list) {
+		list_del(&rdev->list);
+		rdev->vdev.ops->exit(kvm, &rdev->vdev);
+		free(rdev);
+	}
+
+	return 0;
+}
+virtio_dev_exit(virtio_rng__exit);
diff --git a/kvmtool/virtio/scsi.c b/kvmtool/virtio/scsi.c
new file mode 100644
index 0000000..16a86cb
--- /dev/null
+++ b/kvmtool/virtio/scsi.c
@@ -0,0 +1,324 @@
+#include "kvm/virtio-scsi.h"
+#include "kvm/virtio-pci-dev.h"
+#include "kvm/disk-image.h"
+#include "kvm/irq.h"
+#include "kvm/kvm.h"
+#include "kvm/pci.h"
+#include "kvm/ioeventfd.h"
+#include "kvm/guest_compat.h"
+#include "kvm/virtio-pci.h"
+#include "kvm/virtio.h"
+#include "kvm/strbuf.h"
+
+#include <linux/kernel.h>
+#include <linux/virtio_scsi.h>
+#include <linux/vhost.h>
+
+#define VIRTIO_SCSI_QUEUE_SIZE		128
+#define NUM_VIRT_QUEUES			3
+
+static LIST_HEAD(sdevs);
+static int compat_id = -1;
+
+struct scsi_dev {
+	struct virt_queue		vqs[NUM_VIRT_QUEUES];
+	struct virtio_scsi_config	config;
+	struct vhost_scsi_target	target;
+	u32				features;
+	int				vhost_fd;
+	struct virtio_device		vdev;
+	struct list_head		list;
+	struct kvm			*kvm;
+};
+
+static u8 *get_config(struct kvm *kvm, void *dev)
+{
+	struct scsi_dev *sdev = dev;
+
+	return ((u8 *)(&sdev->config));
+}
+
+static u32 get_host_features(struct kvm *kvm, void *dev)
+{
+	return	1UL << VIRTIO_RING_F_EVENT_IDX |
+		1UL << VIRTIO_RING_F_INDIRECT_DESC;
+}
+
+static void set_guest_features(struct kvm *kvm, void *dev, u32 features)
+{
+	struct scsi_dev *sdev = dev;
+
+	sdev->features = features;
+}
+
+static void notify_status(struct kvm *kvm, void *dev, u32 status)
+{
+}
+
+static int init_vq(struct kvm *kvm, void *dev, u32 vq, u32 page_size, u32 align,
+		   u32 pfn)
+{
+	struct vhost_vring_state state = { .index = vq };
+	struct vhost_vring_addr addr;
+	struct scsi_dev *sdev = dev;
+	struct virt_queue *queue;
+	void *p;
+	int r;
+
+	compat__remove_message(compat_id);
+
+	queue		= &sdev->vqs[vq];
+	queue->pfn	= pfn;
+	p		= virtio_get_vq(kvm, queue->pfn, page_size);
+
+	vring_init(&queue->vring, VIRTIO_SCSI_QUEUE_SIZE, p, align);
+	virtio_init_device_vq(&sdev->vdev, queue);
+
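+	/*
+	 * sdev is zero-initialized, so a zero fd means the vhost-scsi
+	 * backend was never opened; in that case only the userspace vring
+	 * state is set up.
+	 */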
+	if (sdev->vhost_fd == 0)
+		return 0;
+
+	state.num = queue->vring.num;
+	r = ioctl(sdev->vhost_fd, VHOST_SET_VRING_NUM, &state);
+	if (r < 0)
+		die_perror("VHOST_SET_VRING_NUM failed");
+	state.num = 0;
+	r = ioctl(sdev->vhost_fd, VHOST_SET_VRING_BASE, &state);
+	if (r < 0)
+		die_perror("VHOST_SET_VRING_BASE failed");
+
+	addr = (struct vhost_vring_addr) {
+		.index = vq,
+		.desc_user_addr = (u64)(unsigned long)queue->vring.desc,
+		.avail_user_addr = (u64)(unsigned long)queue->vring.avail,
+		.used_user_addr = (u64)(unsigned long)queue->vring.used,
+	};
+
+	r = ioctl(sdev->vhost_fd, VHOST_SET_VRING_ADDR, &addr);
+	if (r < 0)
+		die_perror("VHOST_SET_VRING_ADDR failed");
+
+	return 0;
+}
+
+static void notify_vq_gsi(struct kvm *kvm, void *dev, u32 vq, u32 gsi)
+{
+	struct vhost_vring_file file;
+	struct scsi_dev *sdev = dev;
+	int r;
+
+	if (sdev->vhost_fd == 0)
+		return;
+
+	file = (struct vhost_vring_file) {
+		.index	= vq,
+		.fd	= eventfd(0, 0),
+	};
+
+	r = irq__add_irqfd(kvm, gsi, file.fd, -1);
+	if (r < 0)
+		die_perror("KVM_IRQFD failed");
+
+	r = ioctl(sdev->vhost_fd, VHOST_SET_VRING_CALL, &file);
+	if (r < 0)
+		die_perror("VHOST_SET_VRING_CALL failed");
+
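+	/*
+	 * The vhost endpoint only needs to be configured once, so do it
+	 * when the first queue (vq 0) has its irqfd wired up.
+	 */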
+	if (vq > 0)
+		return;
+
+	r = ioctl(sdev->vhost_fd, VHOST_SCSI_SET_ENDPOINT, &sdev->target);
+	if (r != 0)
+		die("VHOST_SCSI_SET_ENDPOINT failed %d", errno);
+}
+
+static void notify_vq_eventfd(struct kvm *kvm, void *dev, u32 vq, u32 efd)
+{
+	struct scsi_dev *sdev = dev;
+	struct vhost_vring_file file = {
+		.index	= vq,
+		.fd	= efd,
+	};
+	int r;
+
+	if (sdev->vhost_fd == 0)
+		return;
+
+	r = ioctl(sdev->vhost_fd, VHOST_SET_VRING_KICK, &file);
+	if (r < 0)
+		die_perror("VHOST_SET_VRING_KICK failed");
+}
+
+static int notify_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+	return 0;
+}
+
+static struct virt_queue *get_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+	struct scsi_dev *sdev = dev;
+
+	return &sdev->vqs[vq];
+}
+
+static int get_size_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+	return VIRTIO_SCSI_QUEUE_SIZE;
+}
+
+static int set_size_vq(struct kvm *kvm, void *dev, u32 vq, int size)
+{
+	return size;
+}
+
+static int get_vq_count(struct kvm *kvm, void *dev)
+{
+	return NUM_VIRT_QUEUES;
+}
+
+static struct virtio_ops scsi_dev_virtio_ops = {
+	.get_config		= get_config,
+	.get_host_features	= get_host_features,
+	.set_guest_features	= set_guest_features,
+	.init_vq		= init_vq,
+	.get_vq			= get_vq,
+	.get_size_vq		= get_size_vq,
+	.set_size_vq		= set_size_vq,
+	.notify_status		= notify_status,
+	.notify_vq		= notify_vq,
+	.notify_vq_gsi		= notify_vq_gsi,
+	.notify_vq_eventfd	= notify_vq_eventfd,
+	.get_vq_count		= get_vq_count,
+};
+
+static void virtio_scsi_vhost_init(struct kvm *kvm, struct scsi_dev *sdev)
+{
+	struct vhost_memory *mem;
+	u64 features;
+	int r;
+
+	sdev->vhost_fd = open("/dev/vhost-scsi", O_RDWR);
+	if (sdev->vhost_fd < 0)
+		die_perror("Failed openning vhost-scsi device");
+
+	mem = calloc(1, sizeof(*mem) + sizeof(struct vhost_memory_region));
+	if (mem == NULL)
+		die("Failed allocating memory for vhost memory map");
+
+	mem->nregions = 1;
+	mem->regions[0] = (struct vhost_memory_region) {
+		.guest_phys_addr	= 0,
+		.memory_size		= kvm->ram_size,
+		.userspace_addr		= (unsigned long)kvm->ram_start,
+	};
+
+	r = ioctl(sdev->vhost_fd, VHOST_SET_OWNER);
+	if (r != 0)
+		die_perror("VHOST_SET_OWNER failed");
+
+	r = ioctl(sdev->vhost_fd, VHOST_GET_FEATURES, &features);
+	if (r != 0)
+		die_perror("VHOST_GET_FEATURES failed");
+
+	r = ioctl(sdev->vhost_fd, VHOST_SET_FEATURES, &features);
+	if (r != 0)
+		die_perror("VHOST_SET_FEATURES failed");
+	r = ioctl(sdev->vhost_fd, VHOST_SET_MEM_TABLE, mem);
+	if (r != 0)
+		die_perror("VHOST_SET_MEM_TABLE failed");
+
+	sdev->vdev.use_vhost = true;
+
+	free(mem);
+}
+
+static int virtio_scsi_init_one(struct kvm *kvm, struct disk_image *disk)
+{
+	struct scsi_dev *sdev;
+	int r;
+
+	if (!disk)
+		return -EINVAL;
+
+	sdev = calloc(1, sizeof(struct scsi_dev));
+	if (sdev == NULL)
+		return -ENOMEM;
+
+	*sdev = (struct scsi_dev) {
+		.config	= (struct virtio_scsi_config) {
+			.num_queues	= NUM_VIRT_QUEUES - 2,
+			.seg_max	= VIRTIO_SCSI_CDB_SIZE - 2,
+			.max_sectors	= 65535,
+			.cmd_per_lun	= 128,
+			.sense_size	= VIRTIO_SCSI_SENSE_SIZE,
+			.cdb_size	= VIRTIO_SCSI_CDB_SIZE,
+			.max_channel	= 0,
+			.max_target	= 0,
+			.max_lun	= 16383,
+			.event_info_size = sizeof(struct virtio_scsi_event),
+		},
+		.kvm			= kvm,
+	};
+	strlcpy((char *)&sdev->target.vhost_wwpn, disk->wwpn, sizeof(sdev->target.vhost_wwpn));
+	sdev->target.vhost_tpgt = strtol(disk->tpgt, NULL, 0);
+
+	list_add_tail(&sdev->list, &sdevs);
+
+	r = virtio_init(kvm, sdev, &sdev->vdev, &scsi_dev_virtio_ops,
+			VIRTIO_DEFAULT_TRANS(kvm), PCI_DEVICE_ID_VIRTIO_SCSI,
+			VIRTIO_ID_SCSI, PCI_CLASS_BLK);
+	if (r < 0)
+		return r;
+
+	virtio_scsi_vhost_init(kvm, sdev);
+
+	if (compat_id == -1)
+		compat_id = virtio_compat_add_message("virtio-scsi", "CONFIG_VIRTIO_SCSI");
+
+	return 0;
+}
+
+static int virtio_scsi_exit_one(struct kvm *kvm, struct scsi_dev *sdev)
+{
+	int r;
+
+	r = ioctl(sdev->vhost_fd, VHOST_SCSI_CLEAR_ENDPOINT, &sdev->target);
+	if (r != 0)
+		die("VHOST_SCSI_CLEAR_ENDPOINT failed %d", errno);
+
+	list_del(&sdev->list);
+	free(sdev);
+
+	return 0;
+}
+
+int virtio_scsi_init(struct kvm *kvm)
+{
+	int i, r = 0;
+
+	for (i = 0; i < kvm->nr_disks; i++) {
+		if (!kvm->disks[i]->wwpn)
+			continue;
+		r = virtio_scsi_init_one(kvm, kvm->disks[i]);
+		if (r < 0)
+			goto cleanup;
+	}
+
+	return 0;
+cleanup:
+	virtio_scsi_exit(kvm);
+	return r;
+}
+virtio_dev_init(virtio_scsi_init);
+
+int virtio_scsi_exit(struct kvm *kvm)
+{
+	while (!list_empty(&sdevs)) {
+		struct scsi_dev *sdev;
+
+		sdev = list_first_entry(&sdevs, struct scsi_dev, list);
+		virtio_scsi_exit_one(kvm, sdev);
+	}
+
+	return 0;
+}
+virtio_dev_exit(virtio_scsi_exit);
diff --git a/kvmtool/x86/bios.c b/kvmtool/x86/bios.c
new file mode 100644
index 0000000..5ac9e24
--- /dev/null
+++ b/kvmtool/x86/bios.c
@@ -0,0 +1,173 @@
+#include "kvm/kvm.h"
+#include "kvm/boot-protocol.h"
+#include "kvm/e820.h"
+#include "kvm/interrupt.h"
+#include "kvm/util.h"
+
+#include <string.h>
+
+#include "bios/bios-rom.h"
+
+struct irq_handler {
+	unsigned long		address;
+	unsigned int		irq;
+	void			*handler;
+	size_t			size;
+};
+
+#define BIOS_IRQ_PA_ADDR(name)	(MB_BIOS_BEGIN + BIOS_OFFSET__##name)
+#define BIOS_IRQ_FUNC(name)	((char *)&bios_rom[BIOS_OFFSET__##name])
+#define BIOS_IRQ_SIZE(name)	(BIOS_ENTRY_SIZE(BIOS_OFFSET__##name))
+
+#define DEFINE_BIOS_IRQ_HANDLER(_irq, _handler)			\
+	{							\
+		.irq		= _irq,				\
+		.address	= BIOS_IRQ_PA_ADDR(_handler),	\
+		.handler	= BIOS_IRQ_FUNC(_handler),	\
+		.size		= BIOS_IRQ_SIZE(_handler),	\
+	}
+
+static struct irq_handler bios_irq_handlers[] = {
+	DEFINE_BIOS_IRQ_HANDLER(0x10, bios_int10),
+	DEFINE_BIOS_IRQ_HANDLER(0x15, bios_int15),
+};
+
+static void setup_irq_handler(struct kvm *kvm, struct irq_handler *handler)
+{
+	struct real_intr_desc intr_desc;
+	void *p;
+
+	p = guest_flat_to_host(kvm, handler->address);
+	memcpy(p, handler->handler, handler->size);
+
+	intr_desc = (struct real_intr_desc) {
+		.segment	= REAL_SEGMENT(MB_BIOS_BEGIN),
+		.offset		= handler->address - MB_BIOS_BEGIN,
+	};
+
+	DIE_IF((handler->address - MB_BIOS_BEGIN) > 0xffffUL);
+
+	interrupt_table__set(&kvm->arch.interrupt_table, &intr_desc, handler->irq);
+}
+
+/**
+ * e820_setup - set up a simple E820 memory map
+ * @kvm - guest system descriptor
+ */
+static void e820_setup(struct kvm *kvm)
+{
+	struct e820map *e820;
+	struct e820entry *mem_map;
+	unsigned int i = 0;
+
+	e820		= guest_flat_to_host(kvm, E820_MAP_START);
+	mem_map		= e820->map;
+
+	mem_map[i++]	= (struct e820entry) {
+		.addr		= REAL_MODE_IVT_BEGIN,
+		.size		= EBDA_START - REAL_MODE_IVT_BEGIN,
+		.type		= E820_RAM,
+	};
+	mem_map[i++]	= (struct e820entry) {
+		.addr		= EBDA_START,
+		.size		= VGA_RAM_BEGIN - EBDA_START,
+		.type		= E820_RESERVED,
+	};
+	mem_map[i++]	= (struct e820entry) {
+		.addr		= MB_BIOS_BEGIN,
+		.size		= MB_BIOS_END - MB_BIOS_BEGIN,
+		.type		= E820_RESERVED,
+	};
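+	/*
+	 * RAM that doesn't fit below the 32-bit MMIO gap is reported in
+	 * two chunks: one ending at the start of the gap and the rest
+	 * starting above it, at KVM_32BIT_MAX_MEM_SIZE.
+	 */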
+	if (kvm->ram_size < KVM_32BIT_GAP_START) {
+		mem_map[i++]	= (struct e820entry) {
+			.addr		= BZ_KERNEL_START,
+			.size		= kvm->ram_size - BZ_KERNEL_START,
+			.type		= E820_RAM,
+		};
+	} else {
+		mem_map[i++]	= (struct e820entry) {
+			.addr		= BZ_KERNEL_START,
+			.size		= KVM_32BIT_GAP_START - BZ_KERNEL_START,
+			.type		= E820_RAM,
+		};
+		mem_map[i++]	= (struct e820entry) {
+			.addr		= KVM_32BIT_MAX_MEM_SIZE,
+			.size		= kvm->ram_size - KVM_32BIT_MAX_MEM_SIZE,
+			.type		= E820_RAM,
+		};
+	}
+
+	BUG_ON(i > E820_X_MAX);
+
+	e820->nr_map = i;
+}
+
+static void setup_vga_rom(struct kvm *kvm)
+{
+	u16 *mode;
+	void *p;
+
+	p = guest_flat_to_host(kvm, VGA_ROM_OEM_STRING);
+	memset(p, 0, VGA_ROM_OEM_STRING_SIZE);
+	strncpy(p, "KVM VESA", VGA_ROM_OEM_STRING_SIZE);
+
+	mode = guest_flat_to_host(kvm, VGA_ROM_MODES);
+	mode[0]	= 0x0112;
+	mode[1] = 0xffff;
+}
+
+/**
+ * setup_bios - inject BIOS into guest memory
+ * @kvm - guest system descriptor
+ */
+void setup_bios(struct kvm *kvm)
+{
+	unsigned long address = MB_BIOS_BEGIN;
+	struct real_intr_desc intr_desc;
+	unsigned int i;
+	void *p;
+
+	/*
+	 * Before anything else, clear some known areas -- we definitely
+	 * don't want any stale data in them.
+	 */
+	p = guest_flat_to_host(kvm, BDA_START);
+	memset(p, 0, BDA_END - BDA_START);
+
+	p = guest_flat_to_host(kvm, EBDA_START);
+	memset(p, 0, EBDA_END - EBDA_START);
+
+	p = guest_flat_to_host(kvm, MB_BIOS_BEGIN);
+	memset(p, 0, MB_BIOS_END - MB_BIOS_BEGIN);
+
+	p = guest_flat_to_host(kvm, VGA_ROM_BEGIN);
+	memset(p, 0, VGA_ROM_END - VGA_ROM_BEGIN);
+
+	/* just copy the BIOS ROM into place */
+	p = guest_flat_to_host(kvm, MB_BIOS_BEGIN);
+	memcpy(p, bios_rom, bios_rom_size);
+
+	/* E820 memory map must be present */
+	e820_setup(kvm);
+
+	/* VESA needs its own tricks */
+	setup_vga_rom(kvm);
+
+	/*
+	 * Set up a *fake* real-mode vector table; it has only one
+	 * real handler, which simply does an iret.
+	 */
+	address = BIOS_IRQ_PA_ADDR(bios_intfake);
+	intr_desc = (struct real_intr_desc) {
+		.segment	= REAL_SEGMENT(MB_BIOS_BEGIN),
+		.offset		= address - MB_BIOS_BEGIN,
+	};
+	interrupt_table__setup(&kvm->arch.interrupt_table, &intr_desc);
+
+	for (i = 0; i < ARRAY_SIZE(bios_irq_handlers); i++)
+		setup_irq_handler(kvm, &bios_irq_handlers[i]);
+
+	/* we're almost done */
+	p = guest_flat_to_host(kvm, 0);
+	interrupt_table__copy(&kvm->arch.interrupt_table, p, REAL_INTR_SIZE);
+}
diff --git a/kvmtool/x86/bios/.gitignore b/kvmtool/x86/bios/.gitignore
new file mode 100644
index 0000000..1f0080b
--- /dev/null
+++ b/kvmtool/x86/bios/.gitignore
@@ -0,0 +1,3 @@
+bios-rom.bin
+bios-rom.bin.elf
+bios-rom.h
diff --git a/kvmtool/x86/bios/bios-rom.S b/kvmtool/x86/bios/bios-rom.S
new file mode 100644
index 0000000..3269ce9
--- /dev/null
+++ b/kvmtool/x86/bios/bios-rom.S
@@ -0,0 +1,12 @@
+#include <kvm/assembly.h>
+
+	.org 0
+#ifdef CONFIG_X86_64
+	.code64
+#else
+	.code32
+#endif
+
+GLOBAL(bios_rom)
+	.incbin "x86/bios/bios.bin"
+END(bios_rom)
diff --git a/kvmtool/x86/bios/e820.c b/kvmtool/x86/bios/e820.c
new file mode 100644
index 0000000..51576b4
--- /dev/null
+++ b/kvmtool/x86/bios/e820.c
@@ -0,0 +1,81 @@
+#include "asm/bios/types.h"
+#include "kvm/e820.h"
+
+#include "kvm/bios.h"
+
+#include <asm/processor-flags.h>
+
+static inline u16 flat_to_seg16(u32 address)
+{
+	return address >> 4;
+}
+
+static inline u16 flat_to_off16(u32 address, u32 segment)
+{
+	return address - (segment << 4);
+}
+
+static inline void set_fs(u16 seg)
+{
+	asm volatile("movw %0,%%fs" : : "rm" (seg));
+}
+
+static inline u8 rdfs8(unsigned long addr)
+{
+	u8 v;
+
+	asm volatile("addr32 movb %%fs:%1,%0" : "=q" (v) : "m" (*(u8 *)addr));
+
+	return v;
+}
+
+static inline u32 rdfs32(unsigned long addr)
+{
+	u32 v;
+
+	asm volatile("addr32 movl %%fs:%1,%0" : "=q" (v) : "m" (*(u32 *)addr));
+
+	return v;
+}
+
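+/*
+ * INT 15h, AX=E820h: EBX carries the continuation index and EDI points at
+ * the caller's buffer. One map entry is copied per call; EAX returns the
+ * 'SMAP' signature, ECX the entry size and EBX the next index (zero once
+ * the whole map has been read).
+ */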
+bioscall void e820_query_map(struct biosregs *regs)
+{
+	struct e820map *e820;
+	u32 map_size;
+	u16 fs_seg;
+	u32 ndx;
+
+	e820		= (struct e820map *)E820_MAP_START;
+	fs_seg		= flat_to_seg16(E820_MAP_START);
+	set_fs(fs_seg);
+
+	ndx		= regs->ebx;
+
+	map_size	= rdfs32(flat_to_off16((u32)&e820->nr_map, fs_seg));
+
+	if (ndx < map_size) {
+		u32 start;
+		unsigned int i;
+		u8 *p;
+
+		fs_seg	= flat_to_seg16(E820_MAP_START);
+		set_fs(fs_seg);
+
+		start	= (u32)&e820->map[ndx];
+
+		p	= (void *) regs->edi;
+
+		for (i = 0; i < sizeof(struct e820entry); i++)
+			*p++	= rdfs8(flat_to_off16(start + i, fs_seg));
+	}
+
+	regs->eax	= SMAP;
+	regs->ecx	= sizeof(struct e820entry);
+	regs->ebx	= ++ndx;
+
+	/* Clear CF to indicate success.  */
+	regs->eflags	&= ~X86_EFLAGS_CF;
+
+	if (ndx >= map_size)
+		regs->ebx	= 0;	/* end of map */
+}
diff --git a/kvmtool/x86/bios/entry.S b/kvmtool/x86/bios/entry.S
new file mode 100644
index 0000000..85056e9
--- /dev/null
+++ b/kvmtool/x86/bios/entry.S
@@ -0,0 +1,92 @@
+/*
+ * Our pretty trivial BIOS emulation
+ */
+
+#include <kvm/bios.h>
+#include <kvm/assembly.h>
+
+	.org 0
+	.code16gcc
+
+#define EFLAGS_CF	(1 << 0)
+
+#include "macro.S"
+
+/* If you change these macros, remember to update 'struct biosregs' */
+.macro SAVE_BIOSREGS
+	pushl	%fs
+	pushl	%es
+	pushl	%ds
+	pushl	%edi
+	pushl	%esi
+	pushl	%ebp
+	pushl	%esp
+	pushl	%edx
+	pushl	%ecx
+	pushl	%ebx
+	pushl	%eax
+.endm
+
+.macro RESTORE_BIOSREGS
+	popl	%eax
+	popl	%ebx
+	popl	%ecx
+	popl	%edx
+	popl	%esp
+	popl	%ebp
+	popl	%esi
+	popl	%edi
+	popl	%ds
+	popl	%es
+	popl	%fs
+.endm
+
+/*
+ * Fake interrupt handler; nothing can ever be faster.
+ */
+ENTRY(bios_intfake)
+	/*
+	 * Set CF to indicate failure. We don't want callers to think that the
+	 * interrupt handler succeeded and then treat the return values in
+	 * registers as valid data.
+	 */
+	orl	$EFLAGS_CF, 0x4(%esp)
+
+	IRET
+ENTRY_END(bios_intfake)
+
+/*
+ * int 10 - video - service
+ */
+ENTRY(bios_int10)
+	SAVE_BIOSREGS
+
+	movl	%esp, %eax
+	/*
+	 * This is way easier than doing it in assembly: just push all
+	 * the regs and jump to a C handler.
+	 */
+	call	int10_handler
+
+	RESTORE_BIOSREGS
+
+	/* Clear CF to indicate success.  */
+	andl	$~EFLAGS_CF, 0x4(%esp)
+
+	IRET
+ENTRY_END(bios_int10)
+
+ENTRY(bios_int15)
+	SAVE_BIOSREGS
+
+	movl	%esp, %eax
+	call	int15_handler
+
+	RESTORE_BIOSREGS
+
+	IRET
+ENTRY_END(bios_int15)
+
+GLOBAL(__locals)
+
+#include "local.S"
+
+END(__locals)
diff --git a/kvmtool/x86/bios/gen-offsets.sh b/kvmtool/x86/bios/gen-offsets.sh
new file mode 100644
index 0000000..8771bbe
--- /dev/null
+++ b/kvmtool/x86/bios/gen-offsets.sh
@@ -0,0 +1,14 @@
+#!/bin/sh
+
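+# Emit one BIOS_OFFSET__<symbol> define per text symbol in bios.bin.elf;
+# x86/bios.c uses these offsets to locate the handlers inside the flat
+# ROM blob.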
+echo "/* Autogenerated file, don't edit */"
+echo "#ifndef BIOS_OFFSETS_H"
+echo "#define BIOS_OFFSETS_H"
+
+echo ""
+echo "#define BIOS_ENTRY_SIZE(name) (name##_end - name)"
+echo ""
+
+nm bios.bin.elf | grep ' [Tt] ' | awk '{ print "#define BIOS_OFFSET__" $3 " 0x" $1; }'
+
+echo ""
+echo "#endif"
diff --git a/kvmtool/x86/bios/int10.c b/kvmtool/x86/bios/int10.c
new file mode 100644
index 0000000..d164cf5
--- /dev/null
+++ b/kvmtool/x86/bios/int10.c
@@ -0,0 +1,109 @@
+#include "kvm/bios.h"
+#include "kvm/vesa.h"
+
+#include "asm/bios/memcpy.h"
+
+#include "asm/bios/vesa.h"
+
+static far_ptr gen_far_ptr(unsigned int pa)
+{
+	far_ptr ptr;
+
+	ptr.seg = (pa >> 4);
+	ptr.off = pa - (ptr.seg << 4);
+
+	return ptr;
+}
+
+static inline void outb(unsigned short port, unsigned char val)
+{
+	asm volatile("outb %0, %1" : : "a"(val), "Nd"(port));
+}
+
+/*
+ * It's probably much more useful to make this print to the serial
+ * line rather than to non-displayed VGA memory.
+ */
+static inline void int10_putchar(struct biosregs *args)
+{
+	u8 al = args->eax & 0xFF;
+
+	outb(0x3f8, al);
+}
+
+static void vbe_get_mode(struct biosregs *args)
+{
+	struct vesa_mode_info *info = (struct vesa_mode_info *) args->edi;
+
+	*info = (struct vesa_mode_info) {
+		.mode_attr		= 0xd9, /* 11011001 */
+		.logical_scan		= VESA_WIDTH*4,
+		.h_res			= VESA_WIDTH,
+		.v_res			= VESA_HEIGHT,
+		.bpp			= VESA_BPP,
+		.memory_layout		= 6,
+		.memory_planes		= 1,
+		.lfb_ptr		= VESA_MEM_ADDR,
+		.rmask			= 8,
+		.gmask			= 8,
+		.bmask			= 8,
+		.resv_mask		= 8,
+		.resv_pos		= 24,
+		.bpos			= 16,
+		.gpos			= 8,
+	};
+}
+
+static void vbe_get_info(struct biosregs *args)
+{
+	struct vesa_general_info *infop = (struct vesa_general_info *) args->edi;
+	struct vesa_general_info info;
+
+	info = (struct vesa_general_info) {
+		.signature		= VESA_MAGIC,
+		.version		= 0x102,
+		.vendor_string		= gen_far_ptr(VGA_ROM_BEGIN),
+		.capabilities		= 0x10,
+		.video_mode_ptr		= gen_far_ptr(VGA_ROM_MODES),
+		.total_memory		= (4 * VESA_WIDTH * VESA_HEIGHT) / 0x10000,
+	};
+
+	memcpy16(args->es, infop, args->ds, &info, sizeof(info));
+}
+
+#define VBE_STATUS_OK		0x004F
+
+static void int10_vesa(struct biosregs *args)
+{
+	u8 al;
+
+	al = args->eax & 0xff;
+
+	switch (al) {
+	case 0x00:
+		vbe_get_info(args);
+		break;
+	case 0x01:
+		vbe_get_mode(args);
+		break;
+	}
+
+	args->eax = VBE_STATUS_OK;
+}
+
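+/*
+ * INT 10h dispatch: AH=0Eh is teletype output (redirected to the serial
+ * port above), AH=4Fh selects the VESA BIOS Extensions.
+ */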
+bioscall void int10_handler(struct biosregs *args)
+{
+	u8 ah;
+
+	ah = (args->eax & 0xff00) >> 8;
+
+	switch (ah) {
+	case 0x0e:
+		int10_putchar(args);
+		break;
+	case 0x4f:
+		int10_vesa(args);
+		break;
+	}
+}
diff --git a/kvmtool/x86/bios/int15.c b/kvmtool/x86/bios/int15.c
new file mode 100644
index 0000000..faf5343
--- /dev/null
+++ b/kvmtool/x86/bios/int15.c
@@ -0,0 +1,18 @@
+#include "kvm/bios.h"
+
+#include "kvm/e820.h"
+
+#include <asm/processor-flags.h>
+
+bioscall void int15_handler(struct biosregs *regs)
+{
+	switch (regs->eax) {
+	case 0xe820:
+		e820_query_map(regs);
+		break;
+	default:
+		/* Set CF to indicate failure.  */
+		regs->eflags	|= X86_EFLAGS_CF;
+		break;
+	}
+}
diff --git a/kvmtool/x86/bios/local.S b/kvmtool/x86/bios/local.S
new file mode 100644
index 0000000..f2cdbf4
--- /dev/null
+++ b/kvmtool/x86/bios/local.S
@@ -0,0 +1,7 @@
+/*
+ * Local variables used by almost every BIOS IRQ handler.
+ * They must be placed somewhere inside the IRQ handler body.
+ */
+__CALLER_SS:		.int  0
+__CALLER_SP:		.long 0
+__CALLER_CLOBBER:	.long 0
diff --git a/kvmtool/x86/bios/macro.S b/kvmtool/x86/bios/macro.S
new file mode 100644
index 0000000..0d5e567
--- /dev/null
+++ b/kvmtool/x86/bios/macro.S
@@ -0,0 +1,25 @@
+/*
+ * handy BIOS macros
+ */
+
+/*
+ * switch to BIOS stack
+ */
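+/*
+ * %edx is borrowed as a scratch register to load the new %ss, so its
+ * value is parked in __CALLER_CLOBBER and restored at the end of the
+ * macro.
+ */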
+.macro stack_swap
+	movw %ss, %cs:(__CALLER_SS)
+	movl %esp, %cs:(__CALLER_SP)
+	movl %edx, %cs:(__CALLER_CLOBBER)
+	movw $MB_BIOS_SS, %dx
+	movw %dx, %ss
+	movw $MB_BIOS_SP, %sp
+	movl %cs:(__CALLER_CLOBBER), %edx
+.endm
+
+/*
+ * restore the original stack
+ */
+.macro stack_restore
+	movl %cs:(__CALLER_SP), %esp
+	movw %cs:(__CALLER_SS), %ss
+.endm
diff --git a/kvmtool/x86/bios/memcpy.c b/kvmtool/x86/bios/memcpy.c
new file mode 100644
index 0000000..2be3a27
--- /dev/null
+++ b/kvmtool/x86/bios/memcpy.c
@@ -0,0 +1,23 @@
+#include "asm/bios/memcpy.h"
+
+/*
+ *  Copy memory area in 16-bit real mode.
+ */
+void memcpy16(u16 dst_seg, void *dst, u16 src_seg, const void *src, size_t len)
+{
+	__asm__ __volatile__ (
+		"pushw	%%ds				\n"
+		"pushw	%%es				\n"
+		"movw	%[src_seg], %%ds		\n"
+		"movw	%[dst_seg], %%es		\n"
+		"rep movsb %%ds:(%%si), %%es:(%%di)	\n"
+		"popw	%%es				\n"
+		"popw	%%ds				\n"
+		:
+		: "S"(src),
+		  "D"(dst),
+		  "c"(len),
+		  [src_seg] "r"(src_seg),
+		  [dst_seg] "r"(dst_seg)
+		: "cc", "memory");
+}
diff --git a/kvmtool/x86/bios/rom.ld.S b/kvmtool/x86/bios/rom.ld.S
new file mode 100644
index 0000000..f4f1835
--- /dev/null
+++ b/kvmtool/x86/bios/rom.ld.S
@@ -0,0 +1,16 @@
+OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
+OUTPUT_ARCH(i386)
+
+SECTIONS {
+	.text 0 : {
+		*(.text)
+	}
+
+	/DISCARD/ : {
+		*(.debug*)
+		*(.data)
+		*(.bss)
+		*(.eh_frame*)
+	}
+}
diff --git a/kvmtool/x86/boot.c b/kvmtool/x86/boot.c
new file mode 100644
index 0000000..61535eb
--- /dev/null
+++ b/kvmtool/x86/boot.c
@@ -0,0 +1,41 @@
+#include "kvm/kvm.h"
+
+#include "kvm/util.h"
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <stdbool.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+#define BIOS_SELECTOR	0xf000
+#define BIOS_IP		0xfff0
+#define BIOS_SP		0x8000
+
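+/*
+ * Firmware images are entered through the conventional x86 reset vector,
+ * F000:FFF0 -- the same address real hardware starts fetching from.
+ */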
+bool kvm__load_firmware(struct kvm *kvm, const char *firmware_filename)
+{
+	struct stat st;
+	void *p;
+	int fd;
+	int nr;
+
+	fd = open(firmware_filename, O_RDONLY);
+	if (fd < 0)
+		return false;
+
+	if (fstat(fd, &st)) {
+		close(fd);
+		return false;
+	}
+
+	if (st.st_size > MB_FIRMWARE_BIOS_SIZE)
+		die("firmware image %s is too big to fit in memory (%Lu KB).\n", firmware_filename, (u64)(st.st_size / 1024));
+
+	p = guest_flat_to_host(kvm, MB_FIRMWARE_BIOS_BEGIN);
+
+	while ((nr = read(fd, p, st.st_size)) > 0)
+		p += nr;
+
+	close(fd);
+
+	kvm->arch.boot_selector	= BIOS_SELECTOR;
+	kvm->arch.boot_ip	= BIOS_IP;
+	kvm->arch.boot_sp	= BIOS_SP;
+
+	return true;
+}
diff --git a/kvmtool/x86/cpuid.c b/kvmtool/x86/cpuid.c
new file mode 100644
index 0000000..c3b67d9
--- /dev/null
+++ b/kvmtool/x86/cpuid.c
@@ -0,0 +1,89 @@
+#include "kvm/kvm-cpu.h"
+
+#include "kvm/kvm.h"
+#include "kvm/util.h"
+
+#include <sys/ioctl.h>
+#include <stdlib.h>
+
+#define	MAX_KVM_CPUID_ENTRIES		100
+
+static void filter_cpuid(struct kvm_cpuid2 *kvm_cpuid)
+{
+	unsigned int signature[3];
+	unsigned int i;
+
+	/*
+	 * Tweak the CPUID entries reported by KVM before they are
+	 * handed to the guest.
+	 */
+	for (i = 0; i < kvm_cpuid->nent; i++) {
+		struct kvm_cpuid_entry2 *entry = &kvm_cpuid->entries[i];
+
+		switch (entry->function) {
+		case 0:
+			/* Vendor name */
+			memcpy(signature, "LKVMLKVMLKVM", 12);
+			entry->ebx = signature[0];
+			entry->ecx = signature[1];
+			entry->edx = signature[2];
+			break;
+		case 1:
+			/* Set X86_FEATURE_HYPERVISOR */
+			if (entry->index == 0)
+				entry->ecx |= (1 << 31);
+			break;
+		case 6:
+			/* Clear X86_FEATURE_EPB */
+			entry->ecx = entry->ecx & ~(1 << 3);
+			break;
+		case 10: { /* Architectural Performance Monitoring */
+			union cpuid10_eax {
+				struct {
+					unsigned int version_id		:8;
+					unsigned int num_counters	:8;
+					unsigned int bit_width		:8;
+					unsigned int mask_length	:8;
+				} split;
+				unsigned int full;
+			} eax;
+
+			/*
+			 * If the host has the perf subsystem running but no
+			 * architectural events are available through the KVM
+			 * PMU, disable perf support so the guest won't even
+			 * try to access the MSR registers.
+			 */
+			if (entry->eax) {
+				eax.full = entry->eax;
+				if (eax.split.version_id != 2 ||
+				    !eax.split.num_counters)
+					entry->eax = 0;
+			}
+			break;
+		}
+		default:
+			/* Keep the CPUID function as-is */
+			break;
+		}
+	}
+}
+
+void kvm_cpu__setup_cpuid(struct kvm_cpu *vcpu)
+{
+	struct kvm_cpuid2 *kvm_cpuid;
+
+	kvm_cpuid = calloc(1, sizeof(*kvm_cpuid) +
+				MAX_KVM_CPUID_ENTRIES * sizeof(*kvm_cpuid->entries));
+
+	kvm_cpuid->nent = MAX_KVM_CPUID_ENTRIES;
+	if (ioctl(vcpu->kvm->sys_fd, KVM_GET_SUPPORTED_CPUID, kvm_cpuid) < 0)
+		die_perror("KVM_GET_SUPPORTED_CPUID failed");
+
+	filter_cpuid(kvm_cpuid);
+
+	if (ioctl(vcpu->vcpu_fd, KVM_SET_CPUID2, kvm_cpuid) < 0)
+		die_perror("KVM_SET_CPUID2 failed");
+
+	free(kvm_cpuid);
+}
diff --git a/kvmtool/x86/include/asm/apicdef.h b/kvmtool/x86/include/asm/apicdef.h
new file mode 100644
index 0000000..c46bb99
--- /dev/null
+++ b/kvmtool/x86/include/asm/apicdef.h
@@ -0,0 +1,445 @@
+#ifndef _ASM_X86_APICDEF_H
+#define _ASM_X86_APICDEF_H
+
+/*
+ * Constants for various Intel APICs. (local APIC, IOAPIC, etc.)
+ *
+ * Alan Cox <Alan.Cox@linux.org>, 1995.
+ * Ingo Molnar <mingo@redhat.com>, 1999, 2000
+ */
+
+#define IO_APIC_DEFAULT_PHYS_BASE	0xfec00000
+#define	APIC_DEFAULT_PHYS_BASE		0xfee00000
+
+/*
+ * This is the IO-APIC register space as specified
+ * by Intel docs:
+ */
+#define IO_APIC_SLOT_SIZE		1024
+
+#define	APIC_ID		0x20
+
+#define	APIC_LVR	0x30
+#define		APIC_LVR_MASK		0xFF00FF
+#define		APIC_LVR_DIRECTED_EOI	(1 << 24)
+#define		GET_APIC_VERSION(x)	((x) & 0xFFu)
+#define		GET_APIC_MAXLVT(x)	(((x) >> 16) & 0xFFu)
+#ifdef CONFIG_X86_32
+#  define	APIC_INTEGRATED(x)	((x) & 0xF0u)
+#else
+#  define	APIC_INTEGRATED(x)	(1)
+#endif
+#define		APIC_XAPIC(x)		((x) >= 0x14)
+#define		APIC_EXT_SPACE(x)	((x) & 0x80000000)
+#define	APIC_TASKPRI	0x80
+#define		APIC_TPRI_MASK		0xFFu
+#define	APIC_ARBPRI	0x90
+#define		APIC_ARBPRI_MASK	0xFFu
+#define	APIC_PROCPRI	0xA0
+#define	APIC_EOI	0xB0
+#define		APIC_EOI_ACK		0x0 /* Docs say 0 for future compat. */
+#define	APIC_RRR	0xC0
+#define	APIC_LDR	0xD0
+#define		APIC_LDR_MASK		(0xFFu << 24)
+#define		GET_APIC_LOGICAL_ID(x)	(((x) >> 24) & 0xFFu)
+#define		SET_APIC_LOGICAL_ID(x)	(((x) << 24))
+#define		APIC_ALL_CPUS		0xFFu
+#define	APIC_DFR	0xE0
+#define		APIC_DFR_CLUSTER		0x0FFFFFFFul
+#define		APIC_DFR_FLAT			0xFFFFFFFFul
+#define	APIC_SPIV	0xF0
+#define		APIC_SPIV_DIRECTED_EOI		(1 << 12)
+#define		APIC_SPIV_FOCUS_DISABLED	(1 << 9)
+#define		APIC_SPIV_APIC_ENABLED		(1 << 8)
+#define	APIC_ISR	0x100
+#define	APIC_ISR_NR     0x8     /* Number of 32 bit ISR registers. */
+#define	APIC_TMR	0x180
+#define	APIC_IRR	0x200
+#define	APIC_ESR	0x280
+#define		APIC_ESR_SEND_CS	0x00001
+#define		APIC_ESR_RECV_CS	0x00002
+#define		APIC_ESR_SEND_ACC	0x00004
+#define		APIC_ESR_RECV_ACC	0x00008
+#define		APIC_ESR_SENDILL	0x00020
+#define		APIC_ESR_RECVILL	0x00040
+#define		APIC_ESR_ILLREGA	0x00080
+#define 	APIC_LVTCMCI	0x2f0
+#define	APIC_ICR	0x300
+#define		APIC_DEST_SELF		0x40000
+#define		APIC_DEST_ALLINC	0x80000
+#define		APIC_DEST_ALLBUT	0xC0000
+#define		APIC_ICR_RR_MASK	0x30000
+#define		APIC_ICR_RR_INVALID	0x00000
+#define		APIC_ICR_RR_INPROG	0x10000
+#define		APIC_ICR_RR_VALID	0x20000
+#define		APIC_INT_LEVELTRIG	0x08000
+#define		APIC_INT_ASSERT		0x04000
+#define		APIC_ICR_BUSY		0x01000
+#define		APIC_DEST_LOGICAL	0x00800
+#define		APIC_DEST_PHYSICAL	0x00000
+#define		APIC_DM_FIXED		0x00000
+#define		APIC_DM_FIXED_MASK	0x00700
+#define		APIC_DM_LOWEST		0x00100
+#define		APIC_DM_SMI		0x00200
+#define		APIC_DM_REMRD		0x00300
+#define		APIC_DM_NMI		0x00400
+#define		APIC_DM_INIT		0x00500
+#define		APIC_DM_STARTUP		0x00600
+#define		APIC_DM_EXTINT		0x00700
+#define		APIC_VECTOR_MASK	0x000FF
+#define	APIC_ICR2	0x310
+#define		GET_APIC_DEST_FIELD(x)	(((x) >> 24) & 0xFF)
+#define		SET_APIC_DEST_FIELD(x)	((x) << 24)
+#define	APIC_LVTT	0x320
+#define	APIC_LVTTHMR	0x330
+#define	APIC_LVTPC	0x340
+#define	APIC_LVT0	0x350
+#define		APIC_LVT_TIMER_BASE_MASK	(0x3 << 18)
+#define		GET_APIC_TIMER_BASE(x)		(((x) >> 18) & 0x3)
+#define		SET_APIC_TIMER_BASE(x)		(((x) << 18))
+#define		APIC_TIMER_BASE_CLKIN		0x0
+#define		APIC_TIMER_BASE_TMBASE		0x1
+#define		APIC_TIMER_BASE_DIV		0x2
+#define		APIC_LVT_TIMER_ONESHOT		(0 << 17)
+#define		APIC_LVT_TIMER_PERIODIC		(1 << 17)
+#define		APIC_LVT_TIMER_TSCDEADLINE	(2 << 17)
+#define		APIC_LVT_MASKED			(1 << 16)
+#define		APIC_LVT_LEVEL_TRIGGER		(1 << 15)
+#define		APIC_LVT_REMOTE_IRR		(1 << 14)
+#define		APIC_INPUT_POLARITY		(1 << 13)
+#define		APIC_SEND_PENDING		(1 << 12)
+#define		APIC_MODE_MASK			0x700
+#define		GET_APIC_DELIVERY_MODE(x)	(((x) >> 8) & 0x7)
+#define		SET_APIC_DELIVERY_MODE(x, y)	(((x) & ~0x700) | ((y) << 8))
+#define			APIC_MODE_FIXED		0x0
+#define			APIC_MODE_NMI		0x4
+#define			APIC_MODE_EXTINT	0x7
+#define	APIC_LVT1	0x360
+#define	APIC_LVTERR	0x370
+#define	APIC_TMICT	0x380
+#define	APIC_TMCCT	0x390
+#define	APIC_TDCR	0x3E0
+#define APIC_SELF_IPI	0x3F0
+#define		APIC_TDR_DIV_TMBASE	(1 << 2)
+#define		APIC_TDR_DIV_1		0xB
+#define		APIC_TDR_DIV_2		0x0
+#define		APIC_TDR_DIV_4		0x1
+#define		APIC_TDR_DIV_8		0x2
+#define		APIC_TDR_DIV_16		0x3
+#define		APIC_TDR_DIV_32		0x8
+#define		APIC_TDR_DIV_64		0x9
+#define		APIC_TDR_DIV_128	0xA
+#define	APIC_EFEAT	0x400
+#define	APIC_ECTRL	0x410
+#define APIC_EILVTn(n)	(0x500 + 0x10 * n)
+#define		APIC_EILVT_NR_AMD_K8	1	/* # of extended interrupts */
+#define		APIC_EILVT_NR_AMD_10H	4
+#define		APIC_EILVT_NR_MAX	APIC_EILVT_NR_AMD_10H
+#define		APIC_EILVT_LVTOFF(x)	(((x) >> 4) & 0xF)
+#define		APIC_EILVT_MSG_FIX	0x0
+#define		APIC_EILVT_MSG_SMI	0x2
+#define		APIC_EILVT_MSG_NMI	0x4
+#define		APIC_EILVT_MSG_EXT	0x7
+#define		APIC_EILVT_MASKED	(1 << 16)
+
+#define APIC_BASE (fix_to_virt(FIX_APIC_BASE))
+#define APIC_BASE_MSR	0x800
+#define XAPIC_ENABLE	(1UL << 11)
+#define X2APIC_ENABLE	(1UL << 10)
+
+#ifdef CONFIG_X86_32
+# define MAX_IO_APICS 64
+# define MAX_LOCAL_APIC 256
+#else
+# define MAX_IO_APICS 128
+# define MAX_LOCAL_APIC 32768
+#endif
+
+/*
+ * All x86-64 systems are xAPIC compatible.
+ * In the following, "apicid" is a physical APIC ID.
+ */
+#define XAPIC_DEST_CPUS_SHIFT	4
+#define XAPIC_DEST_CPUS_MASK	((1u << XAPIC_DEST_CPUS_SHIFT) - 1)
+#define XAPIC_DEST_CLUSTER_MASK	(XAPIC_DEST_CPUS_MASK << XAPIC_DEST_CPUS_SHIFT)
+#define APIC_CLUSTER(apicid)	((apicid) & XAPIC_DEST_CLUSTER_MASK)
+#define APIC_CLUSTERID(apicid)	(APIC_CLUSTER(apicid) >> XAPIC_DEST_CPUS_SHIFT)
+#define APIC_CPUID(apicid)	((apicid) & XAPIC_DEST_CPUS_MASK)
+#define NUM_APIC_CLUSTERS	((BAD_APICID + 1) >> XAPIC_DEST_CPUS_SHIFT)
+
+/*
+ * the local APIC register structure, memory mapped. Not terribly well
+ * tested, but we might eventually use this one in the future - the
+ * problem why we cannot use it right now is the P5 APIC, it has an
+ * errata which cannot take 8-bit reads and writes, only 32-bit ones ...
+ */
+#define u32 unsigned int
+
+struct local_apic {
+
+/*000*/	struct { u32 __reserved[4]; } __reserved_01;
+
+/*010*/	struct { u32 __reserved[4]; } __reserved_02;
+
+/*020*/	struct { /* APIC ID Register */
+		u32   __reserved_1	: 24,
+			phys_apic_id	:  4,
+			__reserved_2	:  4;
+		u32 __reserved[3];
+	} id;
+
+/*030*/	const
+	struct { /* APIC Version Register */
+		u32   version		:  8,
+			__reserved_1	:  8,
+			max_lvt		:  8,
+			__reserved_2	:  8;
+		u32 __reserved[3];
+	} version;
+
+/*040*/	struct { u32 __reserved[4]; } __reserved_03;
+
+/*050*/	struct { u32 __reserved[4]; } __reserved_04;
+
+/*060*/	struct { u32 __reserved[4]; } __reserved_05;
+
+/*070*/	struct { u32 __reserved[4]; } __reserved_06;
+
+/*080*/	struct { /* Task Priority Register */
+		u32   priority	:  8,
+			__reserved_1	: 24;
+		u32 __reserved_2[3];
+	} tpr;
+
+/*090*/	const
+	struct { /* Arbitration Priority Register */
+		u32   priority	:  8,
+			__reserved_1	: 24;
+		u32 __reserved_2[3];
+	} apr;
+
+/*0A0*/	const
+	struct { /* Processor Priority Register */
+		u32   priority	:  8,
+			__reserved_1	: 24;
+		u32 __reserved_2[3];
+	} ppr;
+
+/*0B0*/	struct { /* End Of Interrupt Register */
+		u32   eoi;
+		u32 __reserved[3];
+	} eoi;
+
+/*0C0*/	struct { u32 __reserved[4]; } __reserved_07;
+
+/*0D0*/	struct { /* Logical Destination Register */
+		u32   __reserved_1	: 24,
+			logical_dest	:  8;
+		u32 __reserved_2[3];
+	} ldr;
+
+/*0E0*/	struct { /* Destination Format Register */
+		u32   __reserved_1	: 28,
+			model		:  4;
+		u32 __reserved_2[3];
+	} dfr;
+
+/*0F0*/	struct { /* Spurious Interrupt Vector Register */
+		u32	spurious_vector	:  8,
+			apic_enabled	:  1,
+			focus_cpu	:  1,
+			__reserved_2	: 22;
+		u32 __reserved_3[3];
+	} svr;
+
+/*100*/	struct { /* In Service Register */
+/*170*/		u32 bitfield;
+		u32 __reserved[3];
+	} isr [8];
+
+/*180*/	struct { /* Trigger Mode Register */
+/*1F0*/		u32 bitfield;
+		u32 __reserved[3];
+	} tmr [8];
+
+/*200*/	struct { /* Interrupt Request Register */
+/*270*/		u32 bitfield;
+		u32 __reserved[3];
+	} irr [8];
+
+/*280*/	union { /* Error Status Register */
+		struct {
+			u32   send_cs_error			:  1,
+				receive_cs_error		:  1,
+				send_accept_error		:  1,
+				receive_accept_error		:  1,
+				__reserved_1			:  1,
+				send_illegal_vector		:  1,
+				receive_illegal_vector		:  1,
+				illegal_register_address	:  1,
+				__reserved_2			: 24;
+			u32 __reserved_3[3];
+		} error_bits;
+		struct {
+			u32 errors;
+			u32 __reserved_3[3];
+		} all_errors;
+	} esr;
+
+/*290*/	struct { u32 __reserved[4]; } __reserved_08;
+
+/*2A0*/	struct { u32 __reserved[4]; } __reserved_09;
+
+/*2B0*/	struct { u32 __reserved[4]; } __reserved_10;
+
+/*2C0*/	struct { u32 __reserved[4]; } __reserved_11;
+
+/*2D0*/	struct { u32 __reserved[4]; } __reserved_12;
+
+/*2E0*/	struct { u32 __reserved[4]; } __reserved_13;
+
+/*2F0*/	struct { u32 __reserved[4]; } __reserved_14;
+
+/*300*/	struct { /* Interrupt Command Register 1 */
+		u32   vector			:  8,
+			delivery_mode		:  3,
+			destination_mode	:  1,
+			delivery_status		:  1,
+			__reserved_1		:  1,
+			level			:  1,
+			trigger			:  1,
+			__reserved_2		:  2,
+			shorthand		:  2,
+			__reserved_3		:  12;
+		u32 __reserved_4[3];
+	} icr1;
+
+/*310*/	struct { /* Interrupt Command Register 2 */
+		union {
+			u32   __reserved_1	: 24,
+				phys_dest	:  4,
+				__reserved_2	:  4;
+			u32   __reserved_3	: 24,
+				logical_dest	:  8;
+		} dest;
+		u32 __reserved_4[3];
+	} icr2;
+
+/*320*/	struct { /* LVT - Timer */
+		u32   vector		:  8,
+			__reserved_1	:  4,
+			delivery_status	:  1,
+			__reserved_2	:  3,
+			mask		:  1,
+			timer_mode	:  1,
+			__reserved_3	: 14;
+		u32 __reserved_4[3];
+	} lvt_timer;
+
+/*330*/	struct { /* LVT - Thermal Sensor */
+		u32  vector		:  8,
+			delivery_mode	:  3,
+			__reserved_1	:  1,
+			delivery_status	:  1,
+			__reserved_2	:  3,
+			mask		:  1,
+			__reserved_3	: 15;
+		u32 __reserved_4[3];
+	} lvt_thermal;
+
+/*340*/	struct { /* LVT - Performance Counter */
+		u32   vector		:  8,
+			delivery_mode	:  3,
+			__reserved_1	:  1,
+			delivery_status	:  1,
+			__reserved_2	:  3,
+			mask		:  1,
+			__reserved_3	: 15;
+		u32 __reserved_4[3];
+	} lvt_pc;
+
+/*350*/	struct { /* LVT - LINT0 */
+		u32   vector		:  8,
+			delivery_mode	:  3,
+			__reserved_1	:  1,
+			delivery_status	:  1,
+			polarity	:  1,
+			remote_irr	:  1,
+			trigger		:  1,
+			mask		:  1,
+			__reserved_2	: 15;
+		u32 __reserved_3[3];
+	} lvt_lint0;
+
+/*360*/	struct { /* LVT - LINT1 */
+		u32   vector		:  8,
+			delivery_mode	:  3,
+			__reserved_1	:  1,
+			delivery_status	:  1,
+			polarity	:  1,
+			remote_irr	:  1,
+			trigger		:  1,
+			mask		:  1,
+			__reserved_2	: 15;
+		u32 __reserved_3[3];
+	} lvt_lint1;
+
+/*370*/	struct { /* LVT - Error */
+		u32   vector		:  8,
+			__reserved_1	:  4,
+			delivery_status	:  1,
+			__reserved_2	:  3,
+			mask		:  1,
+			__reserved_3	: 15;
+		u32 __reserved_4[3];
+	} lvt_error;
+
+/*380*/	struct { /* Timer Initial Count Register */
+		u32   initial_count;
+		u32 __reserved_2[3];
+	} timer_icr;
+
+/*390*/	const
+	struct { /* Timer Current Count Register */
+		u32   curr_count;
+		u32 __reserved_2[3];
+	} timer_ccr;
+
+/*3A0*/	struct { u32 __reserved[4]; } __reserved_16;
+
+/*3B0*/	struct { u32 __reserved[4]; } __reserved_17;
+
+/*3C0*/	struct { u32 __reserved[4]; } __reserved_18;
+
+/*3D0*/	struct { u32 __reserved[4]; } __reserved_19;
+
+/*3E0*/	struct { /* Timer Divide Configuration Register */
+		u32   divisor		:  4,
+			__reserved_1	: 28;
+		u32 __reserved_2[3];
+	} timer_dcr;
+
+/*3F0*/	struct { u32 __reserved[4]; } __reserved_20;
+
+} __attribute__ ((packed));
+
+#undef u32
+
+#ifdef CONFIG_X86_32
+ #define BAD_APICID 0xFFu
+#else
+ #define BAD_APICID 0xFFFFu
+#endif
+
+enum ioapic_irq_destination_types {
+	dest_Fixed		= 0,
+	dest_LowestPrio		= 1,
+	dest_SMI		= 2,
+	dest__reserved_1	= 3,
+	dest_NMI		= 4,
+	dest_INIT		= 5,
+	dest__reserved_2	= 6,
+	dest_ExtINT		= 7
+};
+
+#endif /* _ASM_X86_APICDEF_H */
diff --git a/kvmtool/x86/include/asm/bios/memcpy.h b/kvmtool/x86/include/asm/bios/memcpy.h
new file mode 100644
index 0000000..eec9d20
--- /dev/null
+++ b/kvmtool/x86/include/asm/bios/memcpy.h
@@ -0,0 +1,9 @@
+#ifndef KVM_BIOS_MEMCPY_H
+#define KVM_BIOS_MEMCPY_H
+
+#include <asm/bios/types.h>
+#include <stddef.h>
+
+void memcpy16(u16 dst_seg, void *dst, u16 src_seg, const void *src, size_t len);
+
+#endif /* KVM_BIOS_MEMCPY_H */
diff --git a/kvmtool/x86/include/asm/bios/types.h b/kvmtool/x86/include/asm/bios/types.h
new file mode 100644
index 0000000..19e55df
--- /dev/null
+++ b/kvmtool/x86/include/asm/bios/types.h
@@ -0,0 +1,9 @@
+#ifndef __X86_ASM_BIOS_TYPES_H
+#define __X86_ASM_BIOS_TYPES_H
+
+typedef unsigned char  u8;
+typedef unsigned short u16;
+typedef unsigned int   u32;
+typedef unsigned long long u64;
+
+#endif
diff --git a/kvmtool/x86/include/asm/bios/vesa.h b/kvmtool/x86/include/asm/bios/vesa.h
new file mode 100644
index 0000000..468e444
--- /dev/null
+++ b/kvmtool/x86/include/asm/bios/vesa.h
@@ -0,0 +1,72 @@
+/* ----------------------------------------------------------------------- *
+ *
+ *   Copyright 1999-2007 H. Peter Anvin - All Rights Reserved
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, Inc., 59 Temple Place Ste 330,
+ *   Boston MA 02111-1307, USA; either version 2 of the License, or
+ *   (at your option) any later version; incorporated herein by reference.
+ *
+ * ----------------------------------------------------------------------- */
+
+#ifndef BOOT_VESA_H
+#define BOOT_VESA_H
+
+typedef struct {
+	u16 off, seg;
+} far_ptr;
+
+/* VESA General Information table */
+struct vesa_general_info {
+	u32 signature;		/* 0 Magic number = "VESA" */
+	u16 version;		/* 4 */
+	far_ptr vendor_string;	/* 6 */
+	u32 capabilities;	/* 10 */
+	far_ptr video_mode_ptr;	/* 14 */
+	u16 total_memory;	/* 18 */
+
+	u8 reserved[236];	/* 20 */
+} __attribute__ ((packed));
+
+#define VESA_MAGIC ('V' + ('E' << 8) + ('S' << 16) + ('A' << 24))
+
+struct vesa_mode_info {
+	u16 mode_attr;		/* 0 */
+	u8 win_attr[2];		/* 2 */
+	u16 win_grain;		/* 4 */
+	u16 win_size;		/* 6 */
+	u16 win_seg[2];		/* 8 */
+	far_ptr win_scheme;	/* 12 */
+	u16 logical_scan;	/* 16 */
+
+	u16 h_res;		/* 18 */
+	u16 v_res;		/* 20 */
+	u8 char_width;		/* 22 */
+	u8 char_height;		/* 23 */
+	u8 memory_planes;	/* 24 */
+	u8 bpp;			/* 25 */
+	u8 banks;		/* 26 */
+	u8 memory_layout;	/* 27 */
+	u8 bank_size;		/* 28 */
+	u8 image_planes;	/* 29 */
+	u8 page_function;	/* 30 */
+
+	u8 rmask;		/* 31 */
+	u8 rpos;		/* 32 */
+	u8 gmask;		/* 33 */
+	u8 gpos;		/* 34 */
+	u8 bmask;		/* 35 */
+	u8 bpos;		/* 36 */
+	u8 resv_mask;		/* 37 */
+	u8 resv_pos;		/* 38 */
+	u8 dcm_info;		/* 39 */
+
+	u32 lfb_ptr;		/* 40 Linear frame buffer address */
+	u32 offscreen_ptr;	/* 44 Offscreen memory address */
+	u16 offscreen_size;	/* 48 */
+
+	u8 reserved[206];	/* 50 */
+} __attribute__ ((packed));
+
+#endif				/* BOOT_VESA_H */
diff --git a/kvmtool/x86/include/asm/kvm.h b/kvmtool/x86/include/asm/kvm.h
new file mode 100644
index 0000000..503d3f4
--- /dev/null
+++ b/kvmtool/x86/include/asm/kvm.h
@@ -0,0 +1,449 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _ASM_X86_KVM_H
+#define _ASM_X86_KVM_H
+
+/*
+ * KVM x86 specific structures and definitions
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+
+#define KVM_PIO_PAGE_OFFSET 1
+#define KVM_COALESCED_MMIO_PAGE_OFFSET 2
+
+#define DE_VECTOR 0
+#define DB_VECTOR 1
+#define BP_VECTOR 3
+#define OF_VECTOR 4
+#define BR_VECTOR 5
+#define UD_VECTOR 6
+#define NM_VECTOR 7
+#define DF_VECTOR 8
+#define TS_VECTOR 10
+#define NP_VECTOR 11
+#define SS_VECTOR 12
+#define GP_VECTOR 13
+#define PF_VECTOR 14
+#define MF_VECTOR 16
+#define AC_VECTOR 17
+#define MC_VECTOR 18
+#define XM_VECTOR 19
+#define VE_VECTOR 20
+
+/* Select x86 specific features in <linux/kvm.h> */
+#define __KVM_HAVE_PIT
+#define __KVM_HAVE_IOAPIC
+#define __KVM_HAVE_IRQ_LINE
+#define __KVM_HAVE_MSI
+#define __KVM_HAVE_USER_NMI
+#define __KVM_HAVE_GUEST_DEBUG
+#define __KVM_HAVE_MSIX
+#define __KVM_HAVE_MCE
+#define __KVM_HAVE_PIT_STATE2
+#define __KVM_HAVE_XEN_HVM
+#define __KVM_HAVE_VCPU_EVENTS
+#define __KVM_HAVE_DEBUGREGS
+#define __KVM_HAVE_XSAVE
+#define __KVM_HAVE_XCRS
+#define __KVM_HAVE_READONLY_MEM
+
+/* Architectural interrupt line count. */
+#define KVM_NR_INTERRUPTS 256
+
+struct kvm_memory_alias {
+	__u32 slot;  /* this has a different namespace than memory slots */
+	__u32 flags;
+	__u64 guest_phys_addr;
+	__u64 memory_size;
+	__u64 target_phys_addr;
+};
+
+/* for KVM_GET_IRQCHIP and KVM_SET_IRQCHIP */
+struct kvm_pic_state {
+	__u8 last_irr;	/* edge detection */
+	__u8 irr;		/* interrupt request register */
+	__u8 imr;		/* interrupt mask register */
+	__u8 isr;		/* interrupt service register */
+	__u8 priority_add;	/* highest irq priority */
+	__u8 irq_base;
+	__u8 read_reg_select;
+	__u8 poll;
+	__u8 special_mask;
+	__u8 init_state;
+	__u8 auto_eoi;
+	__u8 rotate_on_auto_eoi;
+	__u8 special_fully_nested_mode;
+	__u8 init4;		/* true if 4 byte init */
+	__u8 elcr;		/* PIIX edge/trigger selection */
+	__u8 elcr_mask;
+};
+
+#define KVM_IOAPIC_NUM_PINS  24
+struct kvm_ioapic_state {
+	__u64 base_address;
+	__u32 ioregsel;
+	__u32 id;
+	__u32 irr;
+	__u32 pad;
+	union {
+		__u64 bits;
+		struct {
+			__u8 vector;
+			__u8 delivery_mode:3;
+			__u8 dest_mode:1;
+			__u8 delivery_status:1;
+			__u8 polarity:1;
+			__u8 remote_irr:1;
+			__u8 trig_mode:1;
+			__u8 mask:1;
+			__u8 reserve:7;
+			__u8 reserved[4];
+			__u8 dest_id;
+		} fields;
+	} redirtbl[KVM_IOAPIC_NUM_PINS];
+};
+
+#define KVM_IRQCHIP_PIC_MASTER   0
+#define KVM_IRQCHIP_PIC_SLAVE    1
+#define KVM_IRQCHIP_IOAPIC       2
+#define KVM_NR_IRQCHIPS          3
+
+#define KVM_RUN_X86_SMM		 (1 << 0)
+
+/* for KVM_GET_REGS and KVM_SET_REGS */
+struct kvm_regs {
+	/* out (KVM_GET_REGS) / in (KVM_SET_REGS) */
+	__u64 rax, rbx, rcx, rdx;
+	__u64 rsi, rdi, rsp, rbp;
+	__u64 r8,  r9,  r10, r11;
+	__u64 r12, r13, r14, r15;
+	__u64 rip, rflags;
+};
+
+/* for KVM_GET_LAPIC and KVM_SET_LAPIC */
+#define KVM_APIC_REG_SIZE 0x400
+struct kvm_lapic_state {
+	char regs[KVM_APIC_REG_SIZE];
+};
+
+struct kvm_segment {
+	__u64 base;
+	__u32 limit;
+	__u16 selector;
+	__u8  type;
+	__u8  present, dpl, db, s, l, g, avl;
+	__u8  unusable;
+	__u8  padding;
+};
+
+struct kvm_dtable {
+	__u64 base;
+	__u16 limit;
+	__u16 padding[3];
+};
+
+
+/* for KVM_GET_SREGS and KVM_SET_SREGS */
+struct kvm_sregs {
+	/* out (KVM_GET_SREGS) / in (KVM_SET_SREGS) */
+	struct kvm_segment cs, ds, es, fs, gs, ss;
+	struct kvm_segment tr, ldt;
+	struct kvm_dtable gdt, idt;
+	__u64 cr0, cr2, cr3, cr4, cr8;
+	__u64 efer;
+	__u64 apic_base;
+	__u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64];
+};
+
+/* for KVM_GET_FPU and KVM_SET_FPU */
+struct kvm_fpu {
+	__u8  fpr[8][16];
+	__u16 fcw;
+	__u16 fsw;
+	__u8  ftwx;  /* in fxsave format */
+	__u8  pad1;
+	__u16 last_opcode;
+	__u64 last_ip;
+	__u64 last_dp;
+	__u8  xmm[16][16];
+	__u32 mxcsr;
+	__u32 pad2;
+};
+
+struct kvm_msr_entry {
+	__u32 index;
+	__u32 reserved;
+	__u64 data;
+};
+
+/* for KVM_GET_MSRS and KVM_SET_MSRS */
+struct kvm_msrs {
+	__u32 nmsrs; /* number of msrs in entries */
+	__u32 pad;
+
+	struct kvm_msr_entry entries[0];
+};
+
+/* for KVM_GET_MSR_INDEX_LIST */
+struct kvm_msr_list {
+	__u32 nmsrs; /* number of msrs in entries */
+	__u32 indices[0];
+};
+
+
+struct kvm_cpuid_entry {
+	__u32 function;
+	__u32 eax;
+	__u32 ebx;
+	__u32 ecx;
+	__u32 edx;
+	__u32 padding;
+};
+
+/* for KVM_SET_CPUID */
+struct kvm_cpuid {
+	__u32 nent;
+	__u32 padding;
+	struct kvm_cpuid_entry entries[0];
+};
+
+struct kvm_cpuid_entry2 {
+	__u32 function;
+	__u32 index;
+	__u32 flags;
+	__u32 eax;
+	__u32 ebx;
+	__u32 ecx;
+	__u32 edx;
+	__u32 padding[3];
+};
+
+#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX		(1 << 0)
+#define KVM_CPUID_FLAG_STATEFUL_FUNC		(1 << 1)
+#define KVM_CPUID_FLAG_STATE_READ_NEXT		(1 << 2)
+
+/* for KVM_SET_CPUID2 */
+struct kvm_cpuid2 {
+	__u32 nent;
+	__u32 padding;
+	struct kvm_cpuid_entry2 entries[0];
+};
+
+/* for KVM_GET_PIT and KVM_SET_PIT */
+struct kvm_pit_channel_state {
+	__u32 count; /* can be 65536 */
+	__u16 latched_count;
+	__u8 count_latched;
+	__u8 status_latched;
+	__u8 status;
+	__u8 read_state;
+	__u8 write_state;
+	__u8 write_latch;
+	__u8 rw_mode;
+	__u8 mode;
+	__u8 bcd;
+	__u8 gate;
+	__s64 count_load_time;
+};
+
+struct kvm_debug_exit_arch {
+	__u32 exception;
+	__u32 pad;
+	__u64 pc;
+	__u64 dr6;
+	__u64 dr7;
+};
+
+#define KVM_GUESTDBG_USE_SW_BP		0x00010000
+#define KVM_GUESTDBG_USE_HW_BP		0x00020000
+#define KVM_GUESTDBG_INJECT_DB		0x00040000
+#define KVM_GUESTDBG_INJECT_BP		0x00080000
+
+/* for KVM_SET_GUEST_DEBUG */
+struct kvm_guest_debug_arch {
+	__u64 debugreg[8];
+};
+
+struct kvm_pit_state {
+	struct kvm_pit_channel_state channels[3];
+};
+
+#define KVM_PIT_FLAGS_HPET_LEGACY  0x00000001
+
+struct kvm_pit_state2 {
+	struct kvm_pit_channel_state channels[3];
+	__u32 flags;
+	__u32 reserved[9];
+};
+
+struct kvm_reinject_control {
+	__u8 pit_reinject;
+	__u8 reserved[31];
+};
+
+/* When set in flags, include corresponding fields on KVM_SET_VCPU_EVENTS */
+#define KVM_VCPUEVENT_VALID_NMI_PENDING	0x00000001
+#define KVM_VCPUEVENT_VALID_SIPI_VECTOR	0x00000002
+#define KVM_VCPUEVENT_VALID_SHADOW	0x00000004
+#define KVM_VCPUEVENT_VALID_SMM		0x00000008
+#define KVM_VCPUEVENT_VALID_PAYLOAD	0x00000010
+
+/* Interrupt shadow states */
+#define KVM_X86_SHADOW_INT_MOV_SS	0x01
+#define KVM_X86_SHADOW_INT_STI		0x02
+
+/* for KVM_GET/SET_VCPU_EVENTS */
+struct kvm_vcpu_events {
+	struct {
+		__u8 injected;
+		__u8 nr;
+		__u8 has_error_code;
+		__u8 pending;
+		__u32 error_code;
+	} exception;
+	struct {
+		__u8 injected;
+		__u8 nr;
+		__u8 soft;
+		__u8 shadow;
+	} interrupt;
+	struct {
+		__u8 injected;
+		__u8 pending;
+		__u8 masked;
+		__u8 pad;
+	} nmi;
+	__u32 sipi_vector;
+	__u32 flags;
+	struct {
+		__u8 smm;
+		__u8 pending;
+		__u8 smm_inside_nmi;
+		__u8 latched_init;
+	} smi;
+	__u8 reserved[27];
+	__u8 exception_has_payload;
+	__u64 exception_payload;
+};
+
+/* for KVM_GET/SET_DEBUGREGS */
+struct kvm_debugregs {
+	__u64 db[4];
+	__u64 dr6;
+	__u64 dr7;
+	__u64 flags;
+	__u64 reserved[9];
+};
+
+/* for KVM_CAP_XSAVE */
+struct kvm_xsave {
+	__u32 region[1024];
+};
+
+#define KVM_MAX_XCRS	16
+
+struct kvm_xcr {
+	__u32 xcr;
+	__u32 reserved;
+	__u64 value;
+};
+
+struct kvm_xcrs {
+	__u32 nr_xcrs;
+	__u32 flags;
+	struct kvm_xcr xcrs[KVM_MAX_XCRS];
+	__u64 padding[16];
+};
+
+#define KVM_SYNC_X86_REGS      (1UL << 0)
+#define KVM_SYNC_X86_SREGS     (1UL << 1)
+#define KVM_SYNC_X86_EVENTS    (1UL << 2)
+
+#define KVM_SYNC_X86_VALID_FIELDS \
+	(KVM_SYNC_X86_REGS| \
+	 KVM_SYNC_X86_SREGS| \
+	 KVM_SYNC_X86_EVENTS)
+
+/* kvm_sync_regs struct included by kvm_run struct */
+struct kvm_sync_regs {
+	/* Members of this structure are potentially malicious.
+	 * Care must be taken by code reading, esp. interpreting,
+	 * data fields from them inside KVM to prevent TOCTOU and
+	 * double-fetch types of vulnerabilities.
+	 */
+	struct kvm_regs regs;
+	struct kvm_sregs sregs;
+	struct kvm_vcpu_events events;
+};
+
+#define KVM_X86_QUIRK_LINT0_REENABLED	   (1 << 0)
+#define KVM_X86_QUIRK_CD_NW_CLEARED	   (1 << 1)
+#define KVM_X86_QUIRK_LAPIC_MMIO_HOLE	   (1 << 2)
+#define KVM_X86_QUIRK_OUT_7E_INC_RIP	   (1 << 3)
+#define KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT (1 << 4)
+
+#define KVM_STATE_NESTED_FORMAT_VMX	0
+#define KVM_STATE_NESTED_FORMAT_SVM	1	/* unused */
+
+#define KVM_STATE_NESTED_GUEST_MODE	0x00000001
+#define KVM_STATE_NESTED_RUN_PENDING	0x00000002
+#define KVM_STATE_NESTED_EVMCS		0x00000004
+
+#define KVM_STATE_NESTED_SMM_GUEST_MODE	0x00000001
+#define KVM_STATE_NESTED_SMM_VMXON	0x00000002
+
+#define KVM_STATE_NESTED_VMX_VMCS_SIZE	0x1000
+
+struct kvm_vmx_nested_state_data {
+	__u8 vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE];
+	__u8 shadow_vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE];
+};
+
+struct kvm_vmx_nested_state_hdr {
+	__u64 vmxon_pa;
+	__u64 vmcs12_pa;
+
+	struct {
+		__u16 flags;
+	} smm;
+};
+
+/* for KVM_CAP_NESTED_STATE */
+struct kvm_nested_state {
+	__u16 flags;
+	__u16 format;
+	__u32 size;
+
+	union {
+		struct kvm_vmx_nested_state_hdr vmx;
+
+		/* Pad the header to 128 bytes.  */
+		__u8 pad[120];
+	} hdr;
+
+	/*
+	 * Define data region as 0 bytes to preserve backwards-compatibility
+	 * with the old definition of kvm_nested_state in order to avoid
+	 * changing the KVM_{GET,PUT}_NESTED_STATE ioctl values.
+	 */
+	union {
+		struct kvm_vmx_nested_state_data vmx[0];
+	} data;
+};
+
+/* for KVM_CAP_PMU_EVENT_FILTER */
+struct kvm_pmu_event_filter {
+	__u32 action;
+	__u32 nevents;
+	__u32 fixed_counter_bitmap;
+	__u32 flags;
+	__u32 pad[4];
+	__u64 events[0];
+};
+
+#define KVM_PMU_EVENT_ALLOW 0
+#define KVM_PMU_EVENT_DENY 1
+
+#endif /* _ASM_X86_KVM_H */
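
These UAPI structures are driven through vcpu ioctls. A minimal sketch, assuming
a vcpu_fd obtained from a prior KVM_CREATE_VCPU ioctl and with error handling
trimmed, of the usual read-modify-write pattern on struct kvm_regs
(skip_one_byte is an illustrative helper, not part of kvmtool):

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Sketch: bump the guest instruction pointer by one byte,
 * e.g. to step past a trapped one-byte instruction. */
static int skip_one_byte(int vcpu_fd)
{
	struct kvm_regs regs;

	if (ioctl(vcpu_fd, KVM_GET_REGS, &regs) < 0)
		return -1;

	regs.rip += 1;

	return ioctl(vcpu_fd, KVM_SET_REGS, &regs);
}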
diff --git a/kvmtool/x86/include/asm/mpspec_def.h b/kvmtool/x86/include/asm/mpspec_def.h
new file mode 100644
index 0000000..b31f8c0
--- /dev/null
+++ b/kvmtool/x86/include/asm/mpspec_def.h
@@ -0,0 +1,173 @@
+#ifndef _ASM_X86_MPSPEC_DEF_H
+#define _ASM_X86_MPSPEC_DEF_H
+
+/*
+ * Structure definitions for SMP machines following the
+ * Intel Multiprocessing Specification 1.1 and 1.4.
+ */
+
+/*
+ * This tag identifies where the SMP configuration
+ * information is.
+ */
+
+#define SMP_MAGIC_IDENT	(('_'<<24) | ('P'<<16) | ('M'<<8) | '_')
+
+#ifdef CONFIG_X86_32
+# define MAX_MPC_ENTRY 1024
+#endif
+
+/* Intel MP Floating Pointer Structure */
+struct mpf_intel {
+	char signature[4];		/* "_MP_"			*/
+	unsigned int physptr;		/* Configuration table address	*/
+	unsigned char length;		/* Our length (paragraphs)	*/
+	unsigned char specification;	/* Specification version	*/
+	unsigned char checksum;		/* Checksum (makes sum 0)	*/
+	unsigned char feature1;		/* Standard or configuration ?	*/
+	unsigned char feature2;		/* Bit7 set for IMCR|PIC	*/
+	unsigned char feature3;		/* Unused (0)			*/
+	unsigned char feature4;		/* Unused (0)			*/
+	unsigned char feature5;		/* Unused (0)			*/
+};
+
+#define MPC_SIGNATURE "PCMP"
+
+struct mpc_table {
+	char signature[4];
+	unsigned short length;		/* Size of table */
+	char spec;			/* 0x01 */
+	char checksum;
+	char oem[8];
+	char productid[12];
+	unsigned int oemptr;		/* 0 if not present */
+	unsigned short oemsize;		/* 0 if not present */
+	unsigned short oemcount;
+	unsigned int lapic;		/* APIC address */
+	unsigned int reserved;
+};
+
+/* Followed by entries */
+
+#define	MP_PROCESSOR		0
+#define	MP_BUS			1
+#define	MP_IOAPIC		2
+#define	MP_INTSRC		3
+#define	MP_LINTSRC		4
+/* Used by IBM NUMA-Q to describe node locality */
+#define	MP_TRANSLATION		192
+
+#define CPU_ENABLED		1	/* Processor is available */
+#define CPU_BOOTPROCESSOR	2	/* Processor is the BP */
+
+#define CPU_STEPPING_MASK	0x000F
+#define CPU_MODEL_MASK		0x00F0
+#define CPU_FAMILY_MASK		0x0F00
+
+struct mpc_cpu {
+	unsigned char type;
+	unsigned char apicid;		/* Local APIC number */
+	unsigned char apicver;		/* Local APIC version	*/
+	unsigned char cpuflag;
+	unsigned int cpufeature;
+	unsigned int featureflag;	/* CPUID feature value */
+	unsigned int reserved[2];
+};
+
+struct mpc_bus {
+	unsigned char type;
+	unsigned char busid;
+	unsigned char bustype[6];
+};
+
+/* List of Bus Type string values, Intel MP Spec. */
+#define BUSTYPE_EISA	"EISA"
+#define BUSTYPE_ISA	"ISA"
+#define BUSTYPE_INTERN	"INTERN"	/* Internal BUS */
+#define BUSTYPE_MCA	"MCA"		/* Obsolete */
+#define BUSTYPE_VL	"VL"		/* Local bus */
+#define BUSTYPE_PCI	"PCI"
+#define BUSTYPE_PCMCIA	"PCMCIA"
+#define BUSTYPE_CBUS	"CBUS"
+#define BUSTYPE_CBUSII	"CBUSII"
+#define BUSTYPE_FUTURE	"FUTURE"
+#define BUSTYPE_MBI	"MBI"
+#define BUSTYPE_MBII	"MBII"
+#define BUSTYPE_MPI	"MPI"
+#define BUSTYPE_MPSA	"MPSA"
+#define BUSTYPE_NUBUS	"NUBUS"
+#define BUSTYPE_TC	"TC"
+#define BUSTYPE_VME	"VME"
+#define BUSTYPE_XPRESS	"XPRESS"
+
+#define MPC_APIC_USABLE		0x01
+
+struct mpc_ioapic {
+	unsigned char type;
+	unsigned char apicid;
+	unsigned char apicver;
+	unsigned char flags;
+	unsigned int apicaddr;
+};
+
+struct mpc_intsrc {
+	unsigned char type;
+	unsigned char irqtype;
+	unsigned short irqflag;
+	unsigned char srcbus;
+	unsigned char srcbusirq;
+	unsigned char dstapic;
+	unsigned char dstirq;
+};
+
+enum mp_irq_source_types {
+	mp_INT = 0,
+	mp_NMI = 1,
+	mp_SMI = 2,
+	mp_ExtINT = 3
+};
+
+#define MP_IRQDIR_DEFAULT	0
+#define MP_IRQDIR_HIGH		1
+#define MP_IRQDIR_LOW		3
+
+#define MP_APIC_ALL	0xFF
+
+struct mpc_lintsrc {
+	unsigned char type;
+	unsigned char irqtype;
+	unsigned short irqflag;
+	unsigned char srcbusid;
+	unsigned char srcbusirq;
+	unsigned char destapic;
+	unsigned char destapiclint;
+};
+
+#define MPC_OEM_SIGNATURE "_OEM"
+
+struct mpc_oemtable {
+	char signature[4];
+	unsigned short length;		/* Size of table */
+	char  rev;			/* 0x01 */
+	char  checksum;
+	char  mpc[8];
+};
+
+/*
+ *	Default configurations
+ *
+ *	1	2 CPU ISA 82489DX
+ *	2	2 CPU EISA 82489DX neither IRQ 0 timer nor IRQ 13 DMA chaining
+ *	3	2 CPU EISA 82489DX
+ *	4	2 CPU MCA 82489DX
+ *	5	2 CPU ISA+PCI
+ *	6	2 CPU EISA+PCI
+ *	7	2 CPU MCA+PCI
+ */
+
+enum mp_bustype {
+	MP_BUS_ISA = 1,
+	MP_BUS_EISA,
+	MP_BUS_PCI,
+};
+#endif /* _ASM_X86_MPSPEC_DEF_H */
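
The floating pointer structure above is found by scanning 16-byte-aligned
addresses for the "_MP_" signature; its checksum byte makes the whole structure
sum to zero. A minimal sketch of how a guest might locate and validate it,
assuming an identity-mapped view of the BIOS area (mpf_find and base are
illustrative):

#include <string.h>

/* Sketch: scan for "_MP_" and verify the structure sums to zero mod 256.
 * The structure length is given in 16-byte paragraphs. */
static struct mpf_intel *mpf_find(unsigned char *base, unsigned long len)
{
	unsigned long off;

	for (off = 0; off + sizeof(struct mpf_intel) <= len; off += 16) {
		struct mpf_intel *mpf = (struct mpf_intel *)(base + off);
		unsigned int i, sum = 0;

		if (memcmp(mpf->signature, "_MP_", 4))
			continue;

		for (i = 0; i < mpf->length * 16U; i++)
			sum += ((unsigned char *)mpf)[i];

		if ((sum & 0xff) == 0)
			return mpf;
	}
	return NULL;
}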
diff --git a/kvmtool/x86/include/asm/processor-flags.h b/kvmtool/x86/include/asm/processor-flags.h
new file mode 100644
index 0000000..07c2834
--- /dev/null
+++ b/kvmtool/x86/include/asm/processor-flags.h
@@ -0,0 +1,10 @@
+#ifndef _ASM_X86_PROCESSOR_FLAGS_H
+#define _ASM_X86_PROCESSOR_FLAGS_H
+/* Various flags defined: can be included from assembler. */
+
+/*
+ * EFLAGS bits
+ */
+#define X86_EFLAGS_CF	0x00000001 /* Carry Flag */
+
+#endif
diff --git a/kvmtool/x86/include/kvm/assembly.h b/kvmtool/x86/include/kvm/assembly.h
new file mode 100644
index 0000000..e70baab
--- /dev/null
+++ b/kvmtool/x86/include/kvm/assembly.h
@@ -0,0 +1,24 @@
+#ifndef ASSEMBLY_H_
+#define ASSEMBLY_H_
+
+#define __ALIGN	.p2align 4, 0x90
+#define ENTRY(name)	\
+	__ALIGN;	\
+	.globl name;	\
+	name:
+
+#define GLOBAL(name)	\
+	.globl name;	\
+	name:
+
+#define ENTRY_END(name)	GLOBAL(name##_end)
+#define END(name)	GLOBAL(name##_end)
+
+/*
+ * gas produces a size override prefix here which we
+ * don't want, so hardcode the instruction encoding
+ * for 16-bit mode
+ */
+#define IRET	.byte 0xcf
+
+#endif /* ASSEMBLY_H_ */
diff --git a/kvmtool/x86/include/kvm/barrier.h b/kvmtool/x86/include/kvm/barrier.h
new file mode 100644
index 0000000..46d14f6
--- /dev/null
+++ b/kvmtool/x86/include/kvm/barrier.h
@@ -0,0 +1,20 @@
+#ifndef _KVM_BARRIER_H_
+#define _KVM_BARRIER_H_
+
+#define barrier() asm volatile("": : :"memory")
+
+#define mb()	asm volatile ("mfence": : :"memory")
+#define rmb()	asm volatile ("lfence": : :"memory")
+#define wmb()	asm volatile ("sfence": : :"memory")
+
+#ifdef CONFIG_SMP
+#define smp_mb()	mb()
+#define smp_rmb()	rmb()
+#define smp_wmb()	wmb()
+#else
+#define smp_mb()	barrier()
+#define smp_rmb()	barrier()
+#define smp_wmb()	barrier()
+#endif
+
+#endif /* _KVM_BARRIER_H_ */
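
These barriers pair up in the usual publish/consume pattern: the producer
orders its payload store before the flag store, and the consumer orders the
flag load before the payload load. A minimal sketch using the macros above
(payload/ready are illustrative; the asm memory clobbers double as compiler
barriers):

static int payload, ready;

static void produce(int value)
{
	payload = value;
	smp_wmb();		/* payload visible before the flag */
	ready = 1;
}

static int consume(void)
{
	if (!ready)
		return -1;
	smp_rmb();		/* flag read ordered before payload read */
	return payload;
}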
diff --git a/kvmtool/x86/include/kvm/bios-export.h b/kvmtool/x86/include/kvm/bios-export.h
new file mode 100644
index 0000000..23825aa
--- /dev/null
+++ b/kvmtool/x86/include/kvm/bios-export.h
@@ -0,0 +1,13 @@
+#ifndef BIOS_EXPORT_H_
+#define BIOS_EXPORT_H_
+
+struct kvm;
+
+extern char bios_rom[0];
+extern char bios_rom_end[0];
+
+#define bios_rom_size		(bios_rom_end - bios_rom)
+
+extern void setup_bios(struct kvm *kvm);
+
+#endif /* BIOS_EXPORT_H_ */
diff --git a/kvmtool/x86/include/kvm/bios.h b/kvmtool/x86/include/kvm/bios.h
new file mode 100644
index 0000000..edeab17
--- /dev/null
+++ b/kvmtool/x86/include/kvm/bios.h
@@ -0,0 +1,93 @@
+#ifndef BIOS_H_
+#define BIOS_H_
+
+/*
+ * X86-32 Memory Map (typical)
+ *					start      end
+ * Real Mode Interrupt Vector Table	0x00000000 0x000003FF
+ * BDA area				0x00000400 0x000004FF
+ * Conventional Low Memory		0x00000500 0x0009FBFF
+ * EBDA area				0x0009FC00 0x0009FFFF
+ * VIDEO RAM				0x000A0000 0x000BFFFF
+ * VIDEO ROM (BIOS)			0x000C0000 0x000C7FFF
+ * ROMs & unused space (mapped hw, misc)	0x000C8000 0x000EFFFF 160 KiB (typically)
+ * Motherboard BIOS			0x000F0000 0x000FFFFF
+ * Extended Memory			0x00100000 0xFEBFFFFF
+ * Reserved (configs, ACPI, PnP, etc)	0xFEC00000 0xFFFFFFFF
+ */
+
+#define REAL_MODE_IVT_BEGIN		0x00000000
+#define REAL_MODE_IVT_END		0x000003ff
+
+#define BDA_START			0x00000400
+#define BDA_END				0x000004ff
+
+#define EBDA_START			0x0009fc00
+#define EBDA_END			0x0009ffff
+
+#define E820_MAP_START			EBDA_START
+
+#define MB_BIOS_BEGIN			0x000f0000
+#define MB_FIRMWARE_BIOS_BEGIN		0x000e0000
+#define MB_BIOS_END			0x000fffff
+
+#define MB_BIOS_SIZE			(MB_BIOS_END - MB_BIOS_BEGIN + 1)
+#define MB_FIRMWARE_BIOS_SIZE		(MB_BIOS_END - MB_FIRMWARE_BIOS_BEGIN + 1)
+
+#define VGA_RAM_BEGIN			0x000a0000
+#define VGA_RAM_END			0x000bffff
+
+#define VGA_ROM_BEGIN			0x000c0000
+#define VGA_ROM_OEM_STRING		VGA_ROM_BEGIN
+#define VGA_ROM_OEM_STRING_SIZE		16
+#define VGA_ROM_MODES			(VGA_ROM_OEM_STRING + VGA_ROM_OEM_STRING_SIZE)
+#define VGA_ROM_MODES_SIZE		32
+#define VGA_ROM_END			0x000c7fff
+
+/* we handle one page only */
+#define VGA_RAM_SEG			(VGA_RAM_BEGIN >> 4)
+#define VGA_PAGE_SIZE			0x007d0 /* 80x25 */
+
+/* real mode interrupt vector table */
+#define REAL_INTR_BASE			REAL_MODE_IVT_BEGIN
+#define REAL_INTR_VECTORS		256
+
+/*
+ * The BIOS stack must be at a predefined absolute memory address.
+ * We reserve 64 bytes for it.
+ */
+#define MB_BIOS_SS			0xfff7
+#define MB_BIOS_SP			0x40
+
+/*
+ * When interfacing with assembler code we need to be sure how
+ * arguments are passed in real mode.
+ */
+#define bioscall __attribute__((regparm(3)))
+
+#ifndef __ASSEMBLER__
+
+#include <asm/bios/types.h>
+
+struct biosregs {
+	u32			eax;
+	u32			ebx;
+	u32			ecx;
+	u32			edx;
+	u32			esp;
+	u32			ebp;
+	u32			esi;
+	u32			edi;
+	u32			ds;
+	u32			es;
+	u32			fs;
+	u32			eip;
+	u32			eflags;
+};
+
+extern bioscall void int10_handler(struct biosregs *regs);
+extern bioscall void int15_handler(struct biosregs *regs);
+
+#endif
+
+#endif /* BIOS_H_ */
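
MB_BIOS_SS:MB_BIOS_SP follows the usual real-mode segment arithmetic,
flat = (seg << 4) + off, which puts the BIOS stack at flat address 0xfffb0,
just below the 1 MB boundary inside the motherboard BIOS area. A small sketch
of the conversion (real_to_flat is an illustrative helper):

/* Sketch: real-mode seg:off to flat address.
 * real_to_flat(MB_BIOS_SS, MB_BIOS_SP) == 0xfffb0 */
static inline unsigned int real_to_flat(unsigned short seg, unsigned short off)
{
	return ((unsigned int)seg << 4) + off;
}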
diff --git a/kvmtool/x86/include/kvm/boot-protocol.h b/kvmtool/x86/include/kvm/boot-protocol.h
new file mode 100644
index 0000000..85b637f
--- /dev/null
+++ b/kvmtool/x86/include/kvm/boot-protocol.h
@@ -0,0 +1,16 @@
+/*
+ * Linux boot protocol specifics
+ */
+
+#ifndef BOOT_PROTOCOL_H_
+#define BOOT_PROTOCOL_H_
+
+/*
+ * The protected mode kernel part of a modern bzImage is loaded
+ * at 1 MB by default.
+ */
+#define BZ_DEFAULT_SETUP_SECTS		4
+#define BZ_KERNEL_START			0x100000UL
+#define INITRD_START			0x1000000UL
+
+#endif /* BOOT_PROTOCOL_H_ */
diff --git a/kvmtool/x86/include/kvm/cpufeature.h b/kvmtool/x86/include/kvm/cpufeature.h
new file mode 100644
index 0000000..bc4abbb
--- /dev/null
+++ b/kvmtool/x86/include/kvm/cpufeature.h
@@ -0,0 +1,41 @@
+#ifndef KVM__CPUFEATURE_H
+#define KVM__CPUFEATURE_H
+
+#define CPUID_VENDOR_INTEL_1 0x756e6547 /* "Genu" */
+#define CPUID_VENDOR_INTEL_2 0x49656e69 /* "ineI" */
+#define CPUID_VENDOR_INTEL_3 0x6c65746e /* "ntel" */
+
+#define CPUID_VENDOR_AMD_1   0x68747541 /* "Auth" */
+#define CPUID_VENDOR_AMD_2   0x69746e65 /* "enti" */
+#define CPUID_VENDOR_AMD_3   0x444d4163 /* "cAMD" */
+
+/*
+ * CPUID flags we need to deal with
+ */
+#define KVM__X86_FEATURE_VMX		5	/* Hardware virtualization */
+#define KVM__X86_FEATURE_SVM		2	/* Secure virtual machine */
+#define KVM__X86_FEATURE_XSAVE		26	/* XSAVE/XRSTOR/XSETBV/XGETBV */
+
+#define cpu_feature_disable(reg, feature)	\
+	((reg) & ~(1 << (feature)))
+#define cpu_feature_enable(reg, feature)	\
+	((reg) |  (1 << (feature)))
+
+struct cpuid_regs {
+	u32	eax;
+	u32	ebx;
+	u32	ecx;
+	u32	edx;
+};
+
+static inline void host_cpuid(struct cpuid_regs *regs)
+{
+	asm volatile("cpuid"
+		: "=a" (regs->eax),
+		  "=b" (regs->ebx),
+		  "=c" (regs->ecx),
+		  "=d" (regs->edx)
+		: "0" (regs->eax), "2" (regs->ecx));
+}
+
+#endif /* KVM__CPUFEATURE_H */
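
Vendor identification with host_cpuid() works on leaf 0, which returns the
vendor string split across ebx, edx, ecx in that order ("Genu" "ineI" "ntel").
A minimal sketch using the constants above (host_is_intel is illustrative):

/* Sketch: check for "GenuineIntel" via CPUID leaf 0. */
static int host_is_intel(void)
{
	struct cpuid_regs regs = { .eax = 0x00 };

	host_cpuid(&regs);

	return regs.ebx == CPUID_VENDOR_INTEL_1 &&
	       regs.edx == CPUID_VENDOR_INTEL_2 &&
	       regs.ecx == CPUID_VENDOR_INTEL_3;
}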
diff --git a/kvmtool/x86/include/kvm/e820.h b/kvmtool/x86/include/kvm/e820.h
new file mode 100644
index 0000000..f2889da
--- /dev/null
+++ b/kvmtool/x86/include/kvm/e820.h
@@ -0,0 +1,29 @@
+#ifndef KVM_E820_H
+#define KVM_E820_H
+
+#include <kvm/bios.h>
+
+#define SMAP    0x534d4150      /* ASCII "SMAP" */
+
+#define E820MAX 128             /* number of entries in E820MAP */
+#define E820_X_MAX E820MAX
+
+#define E820_RAM        1
+#define E820_RESERVED   2
+
+struct e820entry {
+	u64 addr;     /* start of memory segment */
+	u64 size;     /* size of memory segment */
+	u32 type;     /* type of memory segment */
+} __attribute__((packed));
+
+struct e820map {
+	u32 nr_map;
+	struct e820entry map[E820_X_MAX];
+};
+
+struct biosregs;
+
+extern bioscall void e820_query_map(struct biosregs *regs);
+
+#endif /* KVM_E820_H */
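
A sketch of how a map built from these definitions might look: low memory
reported as RAM up to the EBDA, with the EBDA itself marked reserved. The
entry layout is illustrative, not necessarily what kvmtool's BIOS reports:

/* Sketch: populate a minimal two-entry e820 map. */
static void e820_fill_minimal(struct e820map *e820)
{
	e820->nr_map = 0;

	e820->map[e820->nr_map++] = (struct e820entry) {
		.addr = 0,
		.size = EBDA_START,
		.type = E820_RAM,
	};
	e820->map[e820->nr_map++] = (struct e820entry) {
		.addr = EBDA_START,
		.size = EBDA_END - EBDA_START + 1,
		.type = E820_RESERVED,
	};
}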
diff --git a/kvmtool/x86/include/kvm/fdt-arch.h b/kvmtool/x86/include/kvm/fdt-arch.h
new file mode 100644
index 0000000..eebd73f
--- /dev/null
+++ b/kvmtool/x86/include/kvm/fdt-arch.h
@@ -0,0 +1,6 @@
+#ifndef X86__FDT_ARCH_H
+#define X86__FDT_ARCH_H
+
+enum phandles {PHANDLE_RESERVED = 0, PHANDLES_MAX};
+
+#endif /* X86__FDT_ARCH_H */
diff --git a/kvmtool/x86/include/kvm/interrupt.h b/kvmtool/x86/include/kvm/interrupt.h
new file mode 100644
index 0000000..00c7ed7
--- /dev/null
+++ b/kvmtool/x86/include/kvm/interrupt.h
@@ -0,0 +1,26 @@
+#ifndef KVM__INTERRUPT_H
+#define KVM__INTERRUPT_H
+
+#include <linux/types.h>
+#include "kvm/bios.h"
+#include "kvm/bios-export.h"
+
+struct real_intr_desc {
+	u16 offset;
+	u16 segment;
+} __attribute__((packed));
+
+#define REAL_SEGMENT_SHIFT	4
+#define REAL_SEGMENT(addr)	((addr) >> REAL_SEGMENT_SHIFT)
+#define REAL_OFFSET(addr)	((addr) & ((1 << REAL_SEGMENT_SHIFT) - 1))
+#define REAL_INTR_SIZE		(REAL_INTR_VECTORS * sizeof(struct real_intr_desc))
+
+struct interrupt_table {
+	struct real_intr_desc entries[REAL_INTR_VECTORS];
+};
+
+void interrupt_table__copy(struct interrupt_table *itable, void *dst, unsigned int size);
+void interrupt_table__setup(struct interrupt_table *itable, struct real_intr_desc *entry);
+void interrupt_table__set(struct interrupt_table *itable, struct real_intr_desc *entry, unsigned int num);
+
+#endif /* KVM__INTERRUPT_H */
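
REAL_SEGMENT()/REAL_OFFSET() split a flat address into a seg:off pair that
reconstructs to the same flat address. A minimal sketch that installs a
handler for vector 0x10 (handler_flat is an illustrative flat address inside
the BIOS image):

/* Sketch: build a real-mode IVT entry from a flat handler address
 * and install it for INT 10h. */
static void set_int10(struct interrupt_table *itable, unsigned int handler_flat)
{
	struct real_intr_desc desc = {
		.segment = REAL_SEGMENT(handler_flat),
		.offset  = REAL_OFFSET(handler_flat),
	};

	interrupt_table__set(itable, &desc, 0x10);
}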
diff --git a/kvmtool/x86/include/kvm/kvm-arch.h b/kvmtool/x86/include/kvm/kvm-arch.h
new file mode 100644
index 0000000..85cd336
--- /dev/null
+++ b/kvmtool/x86/include/kvm/kvm-arch.h
@@ -0,0 +1,43 @@
+#ifndef KVM__KVM_ARCH_H
+#define KVM__KVM_ARCH_H
+
+#include "kvm/interrupt.h"
+
+#include <stdbool.h>
+#include <linux/types.h>
+#include <time.h>
+
+/*
+ * The hole includes VESA framebuffer and PCI memory.
+ */
+#define KVM_32BIT_MAX_MEM_SIZE  (1ULL << 32)
+#define KVM_32BIT_GAP_SIZE	(768 << 20)
+#define KVM_32BIT_GAP_START	(KVM_32BIT_MAX_MEM_SIZE - KVM_32BIT_GAP_SIZE)
+
+#define KVM_MMIO_START		KVM_32BIT_GAP_START
+
+/* This is the address that pci_get_io_port_block() starts allocating
+ * from.  Note that this is a PCI bus address (though same on x86).
+ */
+#define KVM_IOPORT_AREA		0x0
+#define KVM_PCI_CFG_AREA	(KVM_MMIO_START + 0x1000000)
+#define KVM_PCI_MMIO_AREA	(KVM_MMIO_START + 0x2000000)
+#define KVM_VIRTIO_MMIO_AREA	(KVM_MMIO_START + 0x3000000)
+
+#define KVM_IRQ_OFFSET		5
+
+#define KVM_VM_TYPE		0
+
+#define KVM_IOEVENTFD_HAS_PIO	1
+
+#define VIRTIO_DEFAULT_TRANS(kvm)	VIRTIO_PCI
+
+struct kvm_arch {
+	u16			boot_selector;
+	u16			boot_ip;
+	u16			boot_sp;
+
+	struct interrupt_table	interrupt_table;
+};
+
+#endif /* KVM__KVM_ARCH_H */
diff --git a/kvmtool/x86/include/kvm/kvm-config-arch.h b/kvmtool/x86/include/kvm/kvm-config-arch.h
new file mode 100644
index 0000000..3eae8db
--- /dev/null
+++ b/kvmtool/x86/include/kvm/kvm-config-arch.h
@@ -0,0 +1,15 @@
+#ifndef KVM__KVM_CONFIG_ARCH_H
+#define KVM__KVM_CONFIG_ARCH_H
+
+#include "kvm/parse-options.h"
+
+struct kvm_config_arch {
+	int vidmode;
+};
+
+#define OPT_ARCH_RUN(pfx, cfg)						\
+	pfx,								\
+	OPT_GROUP("BIOS options:"),					\
+	OPT_INTEGER('\0', "vidmode", &(cfg)->vidmode, "Video mode"),
+
+#endif /* KVM__KVM_CONFIG_ARCH_H */
diff --git a/kvmtool/x86/include/kvm/kvm-cpu-arch.h b/kvmtool/x86/include/kvm/kvm-cpu-arch.h
new file mode 100644
index 0000000..05e5bb6
--- /dev/null
+++ b/kvmtool/x86/include/kvm/kvm-cpu-arch.h
@@ -0,0 +1,50 @@
+#ifndef KVM__KVM_CPU_ARCH_H
+#define KVM__KVM_CPU_ARCH_H
+
+/* Architecture-specific kvm_cpu definitions. */
+
+#include <linux/kvm.h>	/* for struct kvm_regs */
+#include "kvm/kvm.h"	/* for kvm__emulate_{mm}io() */
+#include <stdbool.h>
+#include <pthread.h>
+
+struct kvm;
+
+struct kvm_cpu {
+	pthread_t		thread;		/* VCPU thread */
+
+	unsigned long		cpu_id;
+
+	struct kvm		*kvm;		/* parent KVM */
+	int			vcpu_fd;	/* For VCPU ioctls() */
+	struct kvm_run		*kvm_run;
+	struct kvm_cpu_task	*task;
+
+	struct kvm_regs		regs;
+	struct kvm_sregs	sregs;
+	struct kvm_fpu		fpu;
+
+	struct kvm_msrs		*msrs;		/* dynamically allocated */
+
+	u8			is_running;
+	u8			paused;
+	u8			needs_nmi;
+
+	struct kvm_coalesced_mmio_ring	*ring;
+};
+
+/*
+ * As these are such simple wrappers, let's have them in the header so they'll
+ * be cheaper to call:
+ */
+static inline bool kvm_cpu__emulate_io(struct kvm_cpu *vcpu, u16 port, void *data, int direction, int size, u32 count)
+{
+	return kvm__emulate_io(vcpu, port, data, direction, size, count);
+}
+
+static inline bool kvm_cpu__emulate_mmio(struct kvm_cpu *vcpu, u64 phys_addr, u8 *data, u32 len, u8 is_write)
+{
+	return kvm__emulate_mmio(vcpu, phys_addr, data, len, is_write);
+}
+
+#endif /* KVM__KVM_CPU_ARCH_H */
diff --git a/kvmtool/x86/include/kvm/mptable.h b/kvmtool/x86/include/kvm/mptable.h
new file mode 100644
index 0000000..9e3cfa6
--- /dev/null
+++ b/kvmtool/x86/include/kvm/mptable.h
@@ -0,0 +1,9 @@
+#ifndef KVM_MPTABLE_H_
+#define KVM_MPTABLE_H_
+
+struct kvm;
+
+int mptable__init(struct kvm *kvm);
+int mptable__exit(struct kvm *kvm);
+
+#endif /* KVM_MPTABLE_H_ */
diff --git a/kvmtool/x86/init.S b/kvmtool/x86/init.S
new file mode 100644
index 0000000..488a93f
--- /dev/null
+++ b/kvmtool/x86/init.S
@@ -0,0 +1,38 @@
+.data
+
+.m_dev:
+.string "hostfs"
+.m_dir:
+.string "/host"
+.m_typ:
+.string "9p"
+.m_opt:
+.string "trans=virtio,version=9p2000.L"
+
+.e_nam:
+.string "/virt/init"
+
+.text
+.globl _start
+_start:
+
+	mov $165, %rax		# __NR_mount
+	mov $.m_dev, %rdi
+	mov $.m_dir, %rsi
+	mov $.m_typ, %rdx
+	mov $1, %r10		# MS_RDONLY
+	mov $.m_opt, %r8
+	syscall
+
+	mov $59, %rax		# __NR_execve
+	mov $.e_nam, %rdi
+	lea 8(%rsp), %rsi	# argv[]
+	mov %rdi, (%rsi)	# change argv[0]
+	pop %rcx		# argc
+	inc %rcx
+	lea (%rsi,%rcx,8), %rdx # envp[]
+	syscall
+
+	mov $60, %rax		# __NR_exit
+	mov $1, %rdi
+	syscall			# panic
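
For reference, a rough C equivalent of the stub above: mount the host tree
over virtio-9p read-only, then exec the real init. One simplification to note:
the assembly rebuilds envp from the stack, while this sketch just inherits the
caller's environment through execv.

#include <sys/mount.h>
#include <unistd.h>

int main(int argc, char *argv[])
{
	/* __NR_mount with MS_RDONLY, as in the stub */
	mount("hostfs", "/host", "9p", MS_RDONLY,
	      "trans=virtio,version=9p2000.L");

	argv[0] = "/virt/init";	/* change argv[0], as the stub does */
	execv("/virt/init", argv);

	return 1;		/* panic: exec failed */
}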
diff --git a/kvmtool/x86/interrupt.c b/kvmtool/x86/interrupt.c
new file mode 100644
index 0000000..7d47869
--- /dev/null
+++ b/kvmtool/x86/interrupt.c
@@ -0,0 +1,28 @@
+#include "kvm/interrupt.h"
+
+#include "kvm/util.h"
+
+#include <string.h>
+
+void interrupt_table__copy(struct interrupt_table *itable, void *dst, unsigned int size)
+{
+	if (size < sizeof(itable->entries))
+		die("An attempt to overwrite host memory");
+
+	memcpy(dst, itable->entries, sizeof(itable->entries));
+}
+
+void interrupt_table__setup(struct interrupt_table *itable, struct real_intr_desc *entry)
+{
+	unsigned int i;
+
+	for (i = 0; i < REAL_INTR_VECTORS; i++)
+		itable->entries[i] = *entry;
+}
+
+void interrupt_table__set(struct interrupt_table *itable,
+				struct real_intr_desc *entry, unsigned int num)
+{
+	if (num < REAL_INTR_VECTORS)
+		itable->entries[num] = *entry;
+}
diff --git a/kvmtool/x86/ioport.c b/kvmtool/x86/ioport.c
new file mode 100644
index 0000000..7ad7b8f
--- /dev/null
+++ b/kvmtool/x86/ioport.c
@@ -0,0 +1,152 @@
+#include "kvm/ioport.h"
+
+#include <stdlib.h>
+#include <stdio.h>
+
+static bool debug_io_out(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port, void *data, int size)
+{
+	return 0;
+}
+
+static struct ioport_operations debug_ops = {
+	.io_out		= debug_io_out,
+};
+
+static bool seabios_debug_io_out(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port, void *data, int size)
+{
+	char ch;
+
+	ch = ioport__read8(data);
+
+	putchar(ch);
+
+	return true;
+}
+
+static struct ioport_operations seabios_debug_ops = {
+	.io_out		= seabios_debug_io_out,
+};
+
+static bool dummy_io_in(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port, void *data, int size)
+{
+	return true;
+}
+
+static bool dummy_io_out(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port, void *data, int size)
+{
+	return true;
+}
+
+static struct ioport_operations dummy_read_write_ioport_ops = {
+	.io_in		= dummy_io_in,
+	.io_out		= dummy_io_out,
+};
+
+static struct ioport_operations dummy_write_only_ioport_ops = {
+	.io_out		= dummy_io_out,
+};
+
+/*
+ * The "fast A20 gate"
+ */
+
+static bool ps2_control_a_io_in(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port, void *data, int size)
+{
+	/*
+	 * A20 is always enabled.
+	 */
+	ioport__write8(data, 0x02);
+
+	return true;
+}
+
+static struct ioport_operations ps2_control_a_ops = {
+	.io_in		= ps2_control_a_io_in,
+	.io_out		= dummy_io_out,
+};
+
+void ioport__map_irq(u8 *irq)
+{
+}
+
+int ioport__setup_arch(struct kvm *kvm)
+{
+	int r;
+
+	/* Legacy ioport setup */
+
+	/* 0000 - 001F - DMA1 controller */
+	r = ioport__register(kvm, 0x0000, &dummy_read_write_ioport_ops, 32, NULL);
+	if (r < 0)
+		return r;
+
+	/* 0x0020 - 0x003F - 8259A PIC 1 */
+	r = ioport__register(kvm, 0x0020, &dummy_read_write_ioport_ops, 2, NULL);
+	if (r < 0)
+		return r;
+
+	/* PORT 0040-005F - PIT - PROGRAMMABLE INTERVAL TIMER (8253, 8254) */
+	r = ioport__register(kvm, 0x0040, &dummy_read_write_ioport_ops, 4, NULL);
+	if (r < 0)
+		return r;
+
+	/* 0092 - PS/2 system control port A */
+	r = ioport__register(kvm, 0x0092, &ps2_control_a_ops, 1, NULL);
+	if (r < 0)
+		return r;
+
+	/* 0x00A0 - 0x00AF - 8259A PIC 2 */
+	r = ioport__register(kvm, 0x00A0, &dummy_read_write_ioport_ops, 2, NULL);
+	if (r < 0)
+		return r;
+
+	/* 00C0 - 00DF - DMA2 controller */
+	r = ioport__register(kvm, 0x00C0, &dummy_read_write_ioport_ops, 32, NULL);
+	if (r < 0)
+		return r;
+
+	/* PORT 00E0-00EF are 'motherboard specific' so we use them for our
+	   internal debugging purposes.  */
+	r = ioport__register(kvm, IOPORT_DBG, &debug_ops, 1, NULL);
+	if (r < 0)
+		return r;
+
+	/* PORT 00ED - dummy port, commonly used for small I/O delays */
+	r = ioport__register(kvm, 0x00ED, &dummy_write_only_ioport_ops, 1, NULL);
+	if (r < 0)
+		return r;
+
+	/* 0x00F0 - 0x00FF - Math co-processor */
+	r = ioport__register(kvm, 0x00F0, &dummy_write_only_ioport_ops, 2, NULL);
+	if (r < 0)
+		return r;
+
+	/* PORT 0278-027A - PARALLEL PRINTER PORT (usually LPT1, sometimes LPT2) */
+	r = ioport__register(kvm, 0x0278, &dummy_read_write_ioport_ops, 3, NULL);
+	if (r < 0)
+		return r;
+
+	/* PORT 0378-037A - PARALLEL PRINTER PORT (usually LPT2, sometimes LPT3) */
+	r = ioport__register(kvm, 0x0378, &dummy_read_write_ioport_ops, 3, NULL);
+	if (r < 0)
+		return r;
+
+	/* PORT 03D4-03D5 - COLOR VIDEO - CRT CONTROL REGISTERS */
+	r = ioport__register(kvm, 0x03D4, &dummy_read_write_ioport_ops, 1, NULL);
+	if (r < 0)
+		return r;
+	r = ioport__register(kvm, 0x03D5, &dummy_write_only_ioport_ops, 1, NULL);
+	if (r < 0)
+		return r;
+
+	r = ioport__register(kvm, 0x402, &seabios_debug_ops, 1, NULL);
+	if (r < 0)
+		return r;
+
+	/* 0510 - QEMU BIOS configuration register */
+	r = ioport__register(kvm, 0x510, &dummy_read_write_ioport_ops, 2, NULL);
+	if (r < 0)
+		return r;
+
+	return 0;
+}
diff --git a/kvmtool/x86/irq.c b/kvmtool/x86/irq.c
new file mode 100644
index 0000000..db465a1
--- /dev/null
+++ b/kvmtool/x86/irq.c
@@ -0,0 +1,66 @@
+#include "kvm/irq.h"
+#include "kvm/kvm.h"
+#include "kvm/util.h"
+
+#include <linux/types.h>
+#include <linux/rbtree.h>
+#include <linux/list.h>
+#include <linux/kvm.h>
+#include <sys/ioctl.h>
+
+#include <stddef.h>
+#include <stdlib.h>
+
+#define IRQCHIP_MASTER			0
+#define IRQCHIP_SLAVE			1
+#define IRQCHIP_IOAPIC			2
+
+static int irq__add_routing(u32 gsi, u32 type, u32 irqchip, u32 pin)
+{
+	int r = irq__allocate_routing_entry();
+	if (r)
+		return r;
+
+	irq_routing->entries[irq_routing->nr++] =
+		(struct kvm_irq_routing_entry) {
+			.gsi = gsi,
+			.type = type,
+			.u.irqchip.irqchip = irqchip,
+			.u.irqchip.pin = pin,
+		};
+
+	return 0;
+}
+
+int irq__init(struct kvm *kvm)
+{
+	int i, r;
+
+	/* Hook first 8 GSIs to master IRQCHIP */
+	for (i = 0; i < 8; i++)
+		if (i != 2)
+			irq__add_routing(i, KVM_IRQ_ROUTING_IRQCHIP, IRQCHIP_MASTER, i);
+
+	/* Hook next 8 GSIs to slave IRQCHIP */
+	for (i = 8; i < 16; i++)
+		irq__add_routing(i, KVM_IRQ_ROUTING_IRQCHIP, IRQCHIP_SLAVE, i - 8);
+
+	/* Last but not least, IOAPIC */
+	for (i = 0; i < 24; i++) {
+		if (i == 0)
+			irq__add_routing(i, KVM_IRQ_ROUTING_IRQCHIP, IRQCHIP_IOAPIC, 2);
+		else if (i != 2)
+			irq__add_routing(i, KVM_IRQ_ROUTING_IRQCHIP, IRQCHIP_IOAPIC, i);
+	}
+
+	r = ioctl(kvm->vm_fd, KVM_SET_GSI_ROUTING, irq_routing);
+	if (r) {
+		free(irq_routing);
+		return errno;
+	}
+
+	next_gsi = i;
+
+	return 0;
+}
+dev_base_init(irq__init);
diff --git a/kvmtool/x86/kvm-cpu.c b/kvmtool/x86/kvm-cpu.c
new file mode 100644
index 0000000..b02ff65
--- /dev/null
+++ b/kvmtool/x86/kvm-cpu.c
@@ -0,0 +1,446 @@
+#include "kvm/kvm-cpu.h"
+
+#include "kvm/symbol.h"
+#include "kvm/util.h"
+#include "kvm/kvm.h"
+
+#include <asm/apicdef.h>
+#include <linux/err.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <stdio.h>
+
+static int debug_fd;
+
+void kvm_cpu__set_debug_fd(int fd)
+{
+	debug_fd = fd;
+}
+
+int kvm_cpu__get_debug_fd(void)
+{
+	return debug_fd;
+}
+
+static inline bool is_in_protected_mode(struct kvm_cpu *vcpu)
+{
+	return vcpu->sregs.cr0 & 0x01;
+}
+
+static inline u64 ip_to_flat(struct kvm_cpu *vcpu, u64 ip)
+{
+	u64 cs;
+
+	/*
+	 * NOTE! We should take code segment base address into account here.
+	 * Luckily it's usually zero because Linux uses flat memory model.
+	 */
+	if (is_in_protected_mode(vcpu))
+		return ip;
+
+	cs = vcpu->sregs.cs.selector;
+
+	return ip + (cs << 4);
+}
+
+static inline u32 selector_to_base(u16 selector)
+{
+	/*
+	 * KVM on Intel requires 'base' to be 'selector * 16' in real mode.
+	 */
+	return (u32)selector << 4;
+}
+
+static struct kvm_cpu *kvm_cpu__new(struct kvm *kvm)
+{
+	struct kvm_cpu *vcpu;
+
+	vcpu = calloc(1, sizeof(*vcpu));
+	if (!vcpu)
+		return NULL;
+
+	vcpu->kvm = kvm;
+
+	return vcpu;
+}
+
+void kvm_cpu__delete(struct kvm_cpu *vcpu)
+{
+	if (vcpu->msrs)
+		free(vcpu->msrs);
+
+	free(vcpu);
+}
+
+static int kvm_cpu__set_lint(struct kvm_cpu *vcpu)
+{
+	struct local_apic lapic;
+
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_LAPIC, &lapic))
+		return -1;
+
+	lapic.lvt_lint0.delivery_mode = APIC_MODE_EXTINT;
+	lapic.lvt_lint1.delivery_mode = APIC_MODE_NMI;
+
+	return ioctl(vcpu->vcpu_fd, KVM_SET_LAPIC, &lapic);
+}
+
+struct kvm_cpu *kvm_cpu__arch_init(struct kvm *kvm, unsigned long cpu_id)
+{
+	struct kvm_cpu *vcpu;
+	int mmap_size;
+	int coalesced_offset;
+
+	vcpu = kvm_cpu__new(kvm);
+	if (!vcpu)
+		return NULL;
+
+	vcpu->cpu_id = cpu_id;
+
+	vcpu->vcpu_fd = ioctl(vcpu->kvm->vm_fd, KVM_CREATE_VCPU, cpu_id);
+	if (vcpu->vcpu_fd < 0)
+		die_perror("KVM_CREATE_VCPU ioctl");
+
+	mmap_size = ioctl(vcpu->kvm->sys_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
+	if (mmap_size < 0)
+		die_perror("KVM_GET_VCPU_MMAP_SIZE ioctl");
+
+	vcpu->kvm_run = mmap(NULL, mmap_size, PROT_RW, MAP_SHARED, vcpu->vcpu_fd, 0);
+	if (vcpu->kvm_run == MAP_FAILED)
+		die("unable to mmap vcpu fd");
+
+	coalesced_offset = ioctl(kvm->sys_fd, KVM_CHECK_EXTENSION, KVM_CAP_COALESCED_MMIO);
+	if (coalesced_offset)
+		vcpu->ring = (void *)vcpu->kvm_run + (coalesced_offset * PAGE_SIZE);
+
+	if (kvm_cpu__set_lint(vcpu))
+		die_perror("KVM_SET_LAPIC failed");
+
+	vcpu->is_running = true;
+
+	return vcpu;
+}
+
+static struct kvm_msrs *kvm_msrs__new(size_t nmsrs)
+{
+	struct kvm_msrs *msrs = calloc(1, sizeof(*msrs) + (sizeof(struct kvm_msr_entry) * nmsrs));
+
+	if (!msrs)
+		die("out of memory");
+
+	return msrs;
+}
+
+#define MSR_IA32_SYSENTER_CS            0x00000174
+#define MSR_IA32_SYSENTER_ESP           0x00000175
+#define MSR_IA32_SYSENTER_EIP           0x00000176
+
+#define MSR_STAR                0xc0000081 /* legacy mode SYSCALL target */
+#define MSR_LSTAR               0xc0000082 /* long mode SYSCALL target */
+#define MSR_CSTAR               0xc0000083 /* compat mode SYSCALL target */
+#define MSR_SYSCALL_MASK        0xc0000084 /* EFLAGS mask for syscall */
+#define MSR_KERNEL_GS_BASE      0xc0000102 /* SwapGS GS shadow */
+
+#define MSR_IA32_TSC                    0x00000010
+#define MSR_IA32_MISC_ENABLE            0x000001a0
+
+#define MSR_IA32_MISC_ENABLE_FAST_STRING_BIT            0
+#define MSR_IA32_MISC_ENABLE_FAST_STRING                (1ULL << MSR_IA32_MISC_ENABLE_FAST_STRING_BIT)
+
+#define KVM_MSR_ENTRY(_index, _data)	\
+	(struct kvm_msr_entry) { .index = _index, .data = _data }
+
+static void kvm_cpu__setup_msrs(struct kvm_cpu *vcpu)
+{
+	unsigned long ndx = 0;
+
+	vcpu->msrs = kvm_msrs__new(100);
+
+	vcpu->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_IA32_SYSENTER_CS,	0x0);
+	vcpu->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_IA32_SYSENTER_ESP,	0x0);
+	vcpu->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_IA32_SYSENTER_EIP,	0x0);
+#ifdef CONFIG_X86_64
+	vcpu->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_STAR,			0x0);
+	vcpu->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_CSTAR,			0x0);
+	vcpu->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_KERNEL_GS_BASE,		0x0);
+	vcpu->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_SYSCALL_MASK,		0x0);
+	vcpu->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_LSTAR,			0x0);
+#endif
+	vcpu->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_IA32_TSC,		0x0);
+	vcpu->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_IA32_MISC_ENABLE,
+						MSR_IA32_MISC_ENABLE_FAST_STRING);
+
+	vcpu->msrs->nmsrs = ndx;
+
+	if (ioctl(vcpu->vcpu_fd, KVM_SET_MSRS, vcpu->msrs) < 0)
+		die_perror("KVM_SET_MSRS failed");
+}
+
+static void kvm_cpu__setup_fpu(struct kvm_cpu *vcpu)
+{
+	vcpu->fpu = (struct kvm_fpu) {
+		.fcw	= 0x37f,
+		.mxcsr	= 0x1f80,
+	};
+
+	if (ioctl(vcpu->vcpu_fd, KVM_SET_FPU, &vcpu->fpu) < 0)
+		die_perror("KVM_SET_FPU failed");
+}
+
+static void kvm_cpu__setup_regs(struct kvm_cpu *vcpu)
+{
+	vcpu->regs = (struct kvm_regs) {
+		/* We start the guest in 16-bit real mode  */
+		.rflags	= 0x0000000000000002ULL,
+
+		.rip	= vcpu->kvm->arch.boot_ip,
+		.rsp	= vcpu->kvm->arch.boot_sp,
+		.rbp	= vcpu->kvm->arch.boot_sp,
+	};
+
+	if (vcpu->regs.rip > USHRT_MAX)
+		die("ip 0x%llx is too high for real mode", (u64)vcpu->regs.rip);
+
+	if (ioctl(vcpu->vcpu_fd, KVM_SET_REGS, &vcpu->regs) < 0)
+		die_perror("KVM_SET_REGS failed");
+}
+
+static void kvm_cpu__setup_sregs(struct kvm_cpu *vcpu)
+{
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_SREGS, &vcpu->sregs) < 0)
+		die_perror("KVM_GET_SREGS failed");
+
+	vcpu->sregs.cs.selector	= vcpu->kvm->arch.boot_selector;
+	vcpu->sregs.cs.base	= selector_to_base(vcpu->kvm->arch.boot_selector);
+	vcpu->sregs.ss.selector	= vcpu->kvm->arch.boot_selector;
+	vcpu->sregs.ss.base	= selector_to_base(vcpu->kvm->arch.boot_selector);
+	vcpu->sregs.ds.selector	= vcpu->kvm->arch.boot_selector;
+	vcpu->sregs.ds.base	= selector_to_base(vcpu->kvm->arch.boot_selector);
+	vcpu->sregs.es.selector	= vcpu->kvm->arch.boot_selector;
+	vcpu->sregs.es.base	= selector_to_base(vcpu->kvm->arch.boot_selector);
+	vcpu->sregs.fs.selector	= vcpu->kvm->arch.boot_selector;
+	vcpu->sregs.fs.base	= selector_to_base(vcpu->kvm->arch.boot_selector);
+	vcpu->sregs.gs.selector	= vcpu->kvm->arch.boot_selector;
+	vcpu->sregs.gs.base	= selector_to_base(vcpu->kvm->arch.boot_selector);
+
+	if (ioctl(vcpu->vcpu_fd, KVM_SET_SREGS, &vcpu->sregs) < 0)
+		die_perror("KVM_SET_SREGS failed");
+}
+
+/**
+ * kvm_cpu__reset_vcpu - reset virtual CPU to a known state
+ */
+void kvm_cpu__reset_vcpu(struct kvm_cpu *vcpu)
+{
+	kvm_cpu__setup_cpuid(vcpu);
+	kvm_cpu__setup_sregs(vcpu);
+	kvm_cpu__setup_regs(vcpu);
+	kvm_cpu__setup_fpu(vcpu);
+	kvm_cpu__setup_msrs(vcpu);
+}
+
+bool kvm_cpu__handle_exit(struct kvm_cpu *vcpu)
+{
+	return false;
+}
+
+static void print_dtable(const char *name, struct kvm_dtable *dtable)
+{
+	dprintf(debug_fd, " %s                 %016llx  %08hx\n",
+		name, (u64) dtable->base, (u16) dtable->limit);
+}
+
+static void print_segment(const char *name, struct kvm_segment *seg)
+{
+	dprintf(debug_fd, " %s       %04hx      %016llx  %08x  %02hhx    %x %x   %x  %x %x %x %x\n",
+		name, (u16) seg->selector, (u64) seg->base, (u32) seg->limit,
+		(u8) seg->type, seg->present, seg->dpl, seg->db, seg->s, seg->l, seg->g, seg->avl);
+}
+
+void kvm_cpu__show_registers(struct kvm_cpu *vcpu)
+{
+	unsigned long cr0, cr2, cr3;
+	unsigned long cr4, cr8;
+	unsigned long rax, rbx, rcx;
+	unsigned long rdx, rsi, rdi;
+	unsigned long rbp,  r8,  r9;
+	unsigned long r10, r11, r12;
+	unsigned long r13, r14, r15;
+	unsigned long rip, rsp;
+	struct kvm_sregs sregs;
+	unsigned long rflags;
+	struct kvm_regs regs;
+	int i;
+
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_REGS, &regs) < 0)
+		die("KVM_GET_REGS failed");
+
+	rflags = regs.rflags;
+
+	rip = regs.rip; rsp = regs.rsp;
+	rax = regs.rax; rbx = regs.rbx; rcx = regs.rcx;
+	rdx = regs.rdx; rsi = regs.rsi; rdi = regs.rdi;
+	rbp = regs.rbp; r8  = regs.r8;  r9  = regs.r9;
+	r10 = regs.r10; r11 = regs.r11; r12 = regs.r12;
+	r13 = regs.r13; r14 = regs.r14; r15 = regs.r15;
+
+	dprintf(debug_fd, "\n Registers:\n");
+	dprintf(debug_fd,   " ----------\n");
+	dprintf(debug_fd, " rip: %016lx   rsp: %016lx flags: %016lx\n", rip, rsp, rflags);
+	dprintf(debug_fd, " rax: %016lx   rbx: %016lx   rcx: %016lx\n", rax, rbx, rcx);
+	dprintf(debug_fd, " rdx: %016lx   rsi: %016lx   rdi: %016lx\n", rdx, rsi, rdi);
+	dprintf(debug_fd, " rbp: %016lx    r8: %016lx    r9: %016lx\n", rbp, r8,  r9);
+	dprintf(debug_fd, " r10: %016lx   r11: %016lx   r12: %016lx\n", r10, r11, r12);
+	dprintf(debug_fd, " r13: %016lx   r14: %016lx   r15: %016lx\n", r13, r14, r15);
+
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_SREGS, &sregs) < 0)
+		die("KVM_GET_REGS failed");
+
+	cr0 = sregs.cr0; cr2 = sregs.cr2; cr3 = sregs.cr3;
+	cr4 = sregs.cr4; cr8 = sregs.cr8;
+
+	dprintf(debug_fd, " cr0: %016lx   cr2: %016lx   cr3: %016lx\n", cr0, cr2, cr3);
+	dprintf(debug_fd, " cr4: %016lx   cr8: %016lx\n", cr4, cr8);
+	dprintf(debug_fd, "\n Segment registers:\n");
+	dprintf(debug_fd,   " ------------------\n");
+	dprintf(debug_fd, " register  selector  base              limit     type  p dpl db s l g avl\n");
+	print_segment("cs ", &sregs.cs);
+	print_segment("ss ", &sregs.ss);
+	print_segment("ds ", &sregs.ds);
+	print_segment("es ", &sregs.es);
+	print_segment("fs ", &sregs.fs);
+	print_segment("gs ", &sregs.gs);
+	print_segment("tr ", &sregs.tr);
+	print_segment("ldt", &sregs.ldt);
+	print_dtable("gdt", &sregs.gdt);
+	print_dtable("idt", &sregs.idt);
+
+	dprintf(debug_fd, "\n APIC:\n");
+	dprintf(debug_fd,   " -----\n");
+	dprintf(debug_fd, " efer: %016llx  apic base: %016llx  nmi: %s\n",
+		(u64) sregs.efer, (u64) sregs.apic_base,
+		(vcpu->kvm->nmi_disabled ? "disabled" : "enabled"));
+
+	dprintf(debug_fd, "\n Interrupt bitmap:\n");
+	dprintf(debug_fd,   " -----------------\n");
+	for (i = 0; i < (KVM_NR_INTERRUPTS + 63) / 64; i++)
+		dprintf(debug_fd, " %016llx", (u64) sregs.interrupt_bitmap[i]);
+	dprintf(debug_fd, "\n");
+}
+
+#define MAX_SYM_LEN 128
+
+void kvm_cpu__show_code(struct kvm_cpu *vcpu)
+{
+	unsigned int code_bytes = 64;
+	unsigned int code_prologue = 43;
+	unsigned int code_len = code_bytes;
+	char sym[MAX_SYM_LEN] = SYMBOL_DEFAULT_UNKNOWN, *psym;
+	unsigned char c;
+	unsigned int i;
+	u8 *ip;
+
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_REGS, &vcpu->regs) < 0)
+		die("KVM_GET_REGS failed");
+
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_SREGS, &vcpu->sregs) < 0)
+		die("KVM_GET_SREGS failed");
+
+	ip = guest_flat_to_host(vcpu->kvm, ip_to_flat(vcpu, vcpu->regs.rip) - code_prologue);
+
+	dprintf(debug_fd, "\n Code:\n");
+	dprintf(debug_fd,   " -----\n");
+
+	psym = symbol_lookup(vcpu->kvm, vcpu->regs.rip, sym, MAX_SYM_LEN);
+	if (IS_ERR(psym))
+		dprintf(debug_fd,
+			"Warning: symbol_lookup() failed to find symbol "
+			"with error: %ld\n", PTR_ERR(psym));
+
+	dprintf(debug_fd, " rip: [<%016lx>] %s\n\n", (unsigned long) vcpu->regs.rip, sym);
+
+	for (i = 0; i < code_len; i++, ip++) {
+		if (!host_ptr_in_ram(vcpu->kvm, ip))
+			break;
+
+		c = *ip;
+
+		if (ip == guest_flat_to_host(vcpu->kvm, ip_to_flat(vcpu, vcpu->regs.rip)))
+			dprintf(debug_fd, " <%02x>", c);
+		else
+			dprintf(debug_fd, " %02x", c);
+	}
+
+	dprintf(debug_fd, "\n");
+
+	dprintf(debug_fd, "\n Stack:\n");
+	dprintf(debug_fd,   " ------\n");
+	dprintf(debug_fd, " rsp: [<%016lx>] \n", (unsigned long) vcpu->regs.rsp);
+	kvm__dump_mem(vcpu->kvm, vcpu->regs.rsp, 32, debug_fd);
+}
+
+void kvm_cpu__show_page_tables(struct kvm_cpu *vcpu)
+{
+	u64 *pte1;
+	u64 *pte2;
+	u64 *pte3;
+	u64 *pte4;
+
+	if (!is_in_protected_mode(vcpu)) {
+		dprintf(debug_fd, "\n Page Tables:\n");
+		dprintf(debug_fd, " ------\n");
+		dprintf(debug_fd, " Not in protected mode\n");
+		return;
+	}
+
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_SREGS, &vcpu->sregs) < 0)
+		die("KVM_GET_SREGS failed");
+
+	pte4 = guest_flat_to_host(vcpu->kvm, vcpu->sregs.cr3);
+	if (!host_ptr_in_ram(vcpu->kvm, pte4))
+		return;
+
+	pte3 = guest_flat_to_host(vcpu->kvm, (*pte4 & ~0xfff));
+	if (!host_ptr_in_ram(vcpu->kvm, pte3))
+		return;
+
+	pte2 = guest_flat_to_host(vcpu->kvm, (*pte3 & ~0xfff));
+	if (!host_ptr_in_ram(vcpu->kvm, pte2))
+		return;
+
+	pte1 = guest_flat_to_host(vcpu->kvm, (*pte2 & ~0xfff));
+	if (!host_ptr_in_ram(vcpu->kvm, pte1))
+		return;
+
+	dprintf(debug_fd, "\n Page Tables:\n");
+	dprintf(debug_fd, " ------\n");
+	if (*pte2 & (1 << 7))
+		dprintf(debug_fd, " pte4: %016llx   pte3: %016llx"
+			"   pte2: %016llx\n",
+			*pte4, *pte3, *pte2);
+	else
+		dprintf(debug_fd, " pte4: %016llx  pte3: %016llx   pte2: %016"
+			"llx   pte1: %016llx\n",
+			*pte4, *pte3, *pte2, *pte1);
+}
+
+void kvm_cpu__arch_nmi(struct kvm_cpu *cpu)
+{
+	struct kvm_lapic_state klapic;
+	struct local_apic *lapic = (void *)&klapic;
+
+	if (ioctl(cpu->vcpu_fd, KVM_GET_LAPIC, &klapic) != 0)
+		return;
+
+	if (lapic->lvt_lint1.mask)
+		return;
+
+	if (lapic->lvt_lint1.delivery_mode != APIC_MODE_NMI)
+		return;
+
+	ioctl(cpu->vcpu_fd, KVM_NMI);
+}
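
The page-table dump above only ever follows entry 0 of each level. For
orientation, a sketch of the standard x86-64 index split a full 4-level walk
would use, 9 bits per level starting at bit 39 (helper names are illustrative):

/* Sketch: index extraction for a 4-level x86-64 page walk. */
static unsigned int pml4_index(unsigned long vaddr) { return (vaddr >> 39) & 0x1ff; }
static unsigned int pdpt_index(unsigned long vaddr) { return (vaddr >> 30) & 0x1ff; }
static unsigned int pd_index(unsigned long vaddr)   { return (vaddr >> 21) & 0x1ff; }
static unsigned int pt_index(unsigned long vaddr)   { return (vaddr >> 12) & 0x1ff; }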
diff --git a/kvmtool/x86/kvm.c b/kvmtool/x86/kvm.c
new file mode 100644
index 0000000..3e0f0b7
--- /dev/null
+++ b/kvmtool/x86/kvm.c
@@ -0,0 +1,378 @@
+#include "kvm/kvm.h"
+#include "kvm/boot-protocol.h"
+#include "kvm/cpufeature.h"
+#include "kvm/interrupt.h"
+#include "kvm/mptable.h"
+#include "kvm/util.h"
+#include "kvm/8250-serial.h"
+#include "kvm/virtio-console.h"
+
+#include <asm/bootparam.h>
+#include <linux/kvm.h>
+#include <linux/kernel.h>
+
+#include <sys/types.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <fcntl.h>
+
+struct kvm_ext kvm_req_ext[] = {
+	{ DEFINE_KVM_EXT(KVM_CAP_COALESCED_MMIO) },
+	{ DEFINE_KVM_EXT(KVM_CAP_SET_TSS_ADDR) },
+	{ DEFINE_KVM_EXT(KVM_CAP_PIT2) },
+	{ DEFINE_KVM_EXT(KVM_CAP_USER_MEMORY) },
+	{ DEFINE_KVM_EXT(KVM_CAP_IRQ_ROUTING) },
+	{ DEFINE_KVM_EXT(KVM_CAP_IRQCHIP) },
+	{ DEFINE_KVM_EXT(KVM_CAP_HLT) },
+	{ DEFINE_KVM_EXT(KVM_CAP_IRQ_INJECT_STATUS) },
+	{ DEFINE_KVM_EXT(KVM_CAP_EXT_CPUID) },
+	{ 0, 0 }
+};
+
+bool kvm__arch_cpu_supports_vm(void)
+{
+	struct cpuid_regs regs;
+	u32 eax_base;
+	int feature;
+
+	regs	= (struct cpuid_regs) {
+		.eax		= 0x00,
+	};
+	host_cpuid(&regs);
+
+	switch (regs.ebx) {
+	case CPUID_VENDOR_INTEL_1:
+		eax_base	= 0x00;
+		feature		= KVM__X86_FEATURE_VMX;
+		break;
+
+	case CPUID_VENDOR_AMD_1:
+		eax_base	= 0x80000000;
+		feature		= KVM__X86_FEATURE_SVM;
+		break;
+
+	default:
+		return false;
+	}
+
+	regs	= (struct cpuid_regs) {
+		.eax		= eax_base,
+	};
+	host_cpuid(&regs);
+
+	if (regs.eax < eax_base + 0x01)
+		return false;
+
+	regs	= (struct cpuid_regs) {
+		.eax		= eax_base + 0x01
+	};
+	host_cpuid(&regs);
+
+	return regs.ecx & (1 << feature);
+}
+
+/*
+ * Allocating RAM size bigger than 4GB requires us to leave a gap
+ * in the RAM which is used for PCI MMIO, hotplug, and unconfigured
+ * devices (see documentation of e820_setup_gap() for details).
+ *
+ * If we're required to initialize RAM bigger than 4GB, we will create
+ * a gap from KVM_32BIT_GAP_START (0xd0000000) to 4GB in the guest physical address space.
+ */
+
+void kvm__init_ram(struct kvm *kvm)
+{
+	u64	phys_start, phys_size;
+	void	*host_mem;
+
+	if (kvm->ram_size < KVM_32BIT_GAP_START) {
+		/* Use a single block of RAM for 32bit RAM */
+
+		phys_start = 0;
+		phys_size  = kvm->ram_size;
+		host_mem   = kvm->ram_start;
+
+		kvm__register_ram(kvm, phys_start, phys_size, host_mem);
+	} else {
+		/* First RAM range from zero to the PCI gap: */
+
+		phys_start = 0;
+		phys_size  = KVM_32BIT_GAP_START;
+		host_mem   = kvm->ram_start;
+
+		kvm__register_ram(kvm, phys_start, phys_size, host_mem);
+
+		/* Second RAM range from 4GB to the end of RAM: */
+
+		phys_start = KVM_32BIT_MAX_MEM_SIZE;
+		phys_size  = kvm->ram_size - phys_start;
+		host_mem   = kvm->ram_start + phys_start;
+
+		kvm__register_ram(kvm, phys_start, phys_size, host_mem);
+	}
+}
+
+/* Arch-specific commandline setup */
+void kvm__arch_set_cmdline(char *cmdline, bool video)
+{
+	strcpy(cmdline, "noapic noacpi pci=conf1 reboot=k panic=1 i8042.direct=1 "
+				"i8042.dumbkbd=1 i8042.nopnp=1");
+	if (video)
+		strcat(cmdline, " video=vesafb");
+	else
+		strcat(cmdline, " earlyprintk=serial i8042.noaux=1");
+}
+
+/* Architecture-specific KVM init */
+void kvm__arch_init(struct kvm *kvm, const char *hugetlbfs_path, u64 ram_size)
+{
+	struct kvm_pit_config pit_config = { .flags = 0, };
+	int ret;
+
+	ret = ioctl(kvm->vm_fd, KVM_SET_TSS_ADDR, 0xfffbd000);
+	if (ret < 0)
+		die_perror("KVM_SET_TSS_ADDR ioctl");
+
+	ret = ioctl(kvm->vm_fd, KVM_CREATE_PIT2, &pit_config);
+	if (ret < 0)
+		die_perror("KVM_CREATE_PIT2 ioctl");
+
+	if (ram_size < KVM_32BIT_GAP_START) {
+		kvm->ram_size = ram_size;
+		kvm->ram_start = mmap_anon_or_hugetlbfs(kvm, hugetlbfs_path, ram_size);
+	} else {
+		kvm->ram_start = mmap_anon_or_hugetlbfs(kvm, hugetlbfs_path, ram_size + KVM_32BIT_GAP_SIZE);
+		kvm->ram_size = ram_size + KVM_32BIT_GAP_SIZE;
+		if (kvm->ram_start != MAP_FAILED)
+			/*
+			 * We mprotect the gap (see kvm__init_ram() for details) PROT_NONE so that
+			 * if we accidentally write to it, we will know.
+			 */
+			mprotect(kvm->ram_start + KVM_32BIT_GAP_START, KVM_32BIT_GAP_SIZE, PROT_NONE);
+	}
+	if (kvm->ram_start == MAP_FAILED)
+		die("out of memory");
+
+	madvise(kvm->ram_start, kvm->ram_size, MADV_MERGEABLE);
+
+	ret = ioctl(kvm->vm_fd, KVM_CREATE_IRQCHIP);
+	if (ret < 0)
+		die_perror("KVM_CREATE_IRQCHIP ioctl");
+}
+
+void kvm__arch_delete_ram(struct kvm *kvm)
+{
+	munmap(kvm->ram_start, kvm->ram_size);
+}
+
+void kvm__irq_line(struct kvm *kvm, int irq, int level)
+{
+	struct kvm_irq_level irq_level;
+
+	irq_level	= (struct kvm_irq_level) {
+		{
+			.irq		= irq,
+		},
+		.level		= level,
+	};
+
+	if (ioctl(kvm->vm_fd, KVM_IRQ_LINE, &irq_level) < 0)
+		die_perror("KVM_IRQ_LINE failed");
+}
+
+void kvm__irq_trigger(struct kvm *kvm, int irq)
+{
+	kvm__irq_line(kvm, irq, 1);
+	kvm__irq_line(kvm, irq, 0);
+}
+
+#define BOOT_LOADER_SELECTOR	0x1000
+#define BOOT_LOADER_IP		0x0000
+#define BOOT_LOADER_SP		0x8000
+#define BOOT_CMDLINE_OFFSET	0x20000
+
+#define BOOT_PROTOCOL_REQUIRED	0x206
+#define LOAD_HIGH		0x01
+
+static inline void *guest_real_to_host(struct kvm *kvm, u16 selector, u16 offset)
+{
+	unsigned long flat = ((u32)selector << 4) + offset;
+
+	return guest_flat_to_host(kvm, flat);
+}
+
+static bool load_flat_binary(struct kvm *kvm, int fd_kernel)
+{
+	void *p;
+
+	if (lseek(fd_kernel, 0, SEEK_SET) < 0)
+		die_perror("lseek");
+
+	p = guest_real_to_host(kvm, BOOT_LOADER_SELECTOR, BOOT_LOADER_IP);
+
+	if (read_file(fd_kernel, p, kvm->cfg.ram_size) < 0)
+		die_perror("read");
+
+	kvm->arch.boot_selector	= BOOT_LOADER_SELECTOR;
+	kvm->arch.boot_ip	= BOOT_LOADER_IP;
+	kvm->arch.boot_sp	= BOOT_LOADER_SP;
+
+	return true;
+}
+
+static const char *BZIMAGE_MAGIC = "HdrS";
+
+static bool load_bzimage(struct kvm *kvm, int fd_kernel, int fd_initrd,
+			 const char *kernel_cmdline)
+{
+	struct boot_params *kern_boot;
+	struct boot_params boot;
+	size_t cmdline_size;
+	ssize_t file_size;
+	void *p;
+	u16 vidmode;
+
+	/*
+	 * See Documentation/x86/boot.txt for details on the bzImage on-disk and
+	 * memory layout.
+	 */
+
+	if (read_in_full(fd_kernel, &boot, sizeof(boot)) != sizeof(boot))
+		return false;
+
+	if (memcmp(&boot.hdr.header, BZIMAGE_MAGIC, strlen(BZIMAGE_MAGIC)))
+		return false;
+
+	if (boot.hdr.version < BOOT_PROTOCOL_REQUIRED)
+		die("Too old kernel");
+
+	if (lseek(fd_kernel, 0, SEEK_SET) < 0)
+		die_perror("lseek");
+
+	if (!boot.hdr.setup_sects)
+		boot.hdr.setup_sects = BZ_DEFAULT_SETUP_SECTS;
+	file_size = (boot.hdr.setup_sects + 1) << 9;
+	p = guest_real_to_host(kvm, BOOT_LOADER_SELECTOR, BOOT_LOADER_IP);
+	if (read_in_full(fd_kernel, p, file_size) != file_size)
+		die_perror("kernel setup read");
+
+	/* read actual kernel image (vmlinux.bin) to BZ_KERNEL_START */
+	p = guest_flat_to_host(kvm, BZ_KERNEL_START);
+	file_size = read_file(fd_kernel, p,
+			      kvm->cfg.ram_size - BZ_KERNEL_START);
+	if (file_size < 0)
+		die_perror("kernel read");
+
+	p = guest_flat_to_host(kvm, BOOT_CMDLINE_OFFSET);
+	if (kernel_cmdline) {
+		cmdline_size = strlen(kernel_cmdline) + 1;
+		if (cmdline_size > boot.hdr.cmdline_size)
+			cmdline_size = boot.hdr.cmdline_size;
+
+		memset(p, 0, boot.hdr.cmdline_size);
+		memcpy(p, kernel_cmdline, cmdline_size - 1);
+	}
+
+	/* vidmode should be either specified or set by default */
+	if (kvm->cfg.vnc || kvm->cfg.sdl || kvm->cfg.gtk) {
+		if (!kvm->cfg.arch.vidmode)
+			vidmode = 0x312;
+		else
+			vidmode = kvm->cfg.arch.vidmode;
+	} else {
+		vidmode = 0;
+	}
+
+	kern_boot	= guest_real_to_host(kvm, BOOT_LOADER_SELECTOR, 0x00);
+
+	kern_boot->hdr.cmd_line_ptr	= BOOT_CMDLINE_OFFSET;
+	kern_boot->hdr.type_of_loader	= 0xff;
+	kern_boot->hdr.heap_end_ptr	= 0xfe00;
+	kern_boot->hdr.loadflags	|= CAN_USE_HEAP;
+	kern_boot->hdr.vid_mode		= vidmode;
+
+	/*
+	 * Read initrd image into guest memory
+	 */
+	if (fd_initrd >= 0) {
+		struct stat initrd_stat;
+		unsigned long addr;
+
+		if (fstat(fd_initrd, &initrd_stat))
+			die_perror("fstat");
+
+		addr = boot.hdr.initrd_addr_max & ~0xfffff;
+		for (;;) {
+			if (addr < BZ_KERNEL_START)
+				die("Not enough memory for initrd");
+			else if (addr < (kvm->ram_size - initrd_stat.st_size))
+				break;
+			addr -= 0x100000;
+		}
+
+		p = guest_flat_to_host(kvm, addr);
+		if (read_in_full(fd_initrd, p, initrd_stat.st_size) < 0)
+			die("Failed to read initrd");
+
+		kern_boot->hdr.ramdisk_image	= addr;
+		kern_boot->hdr.ramdisk_size	= initrd_stat.st_size;
+	}
+
+	kvm->arch.boot_selector = BOOT_LOADER_SELECTOR;
+	/*
+	 * The real-mode setup code starts at offset 0x200 of a bzImage. See
+	 * Documentation/x86/boot.txt for details.
+	 */
+	kvm->arch.boot_ip = BOOT_LOADER_IP + 0x200;
+	kvm->arch.boot_sp = BOOT_LOADER_SP;
+
+	return true;
+}
+
+bool kvm__arch_load_kernel_image(struct kvm *kvm, int fd_kernel, int fd_initrd,
+				 const char *kernel_cmdline)
+{
+	if (load_bzimage(kvm, fd_kernel, fd_initrd, kernel_cmdline))
+		return true;
+	pr_warning("Kernel image is not a bzImage.");
+	pr_warning("Trying to load it as a flat binary (no cmdline support)");
+
+	if (fd_initrd != -1)
+		pr_warning("Loading initrd with flat binary not supported.");
+
+	return load_flat_binary(kvm, fd_kernel);
+}
+
+/**
+ * kvm__arch_setup_firmware - inject BIOS into guest system memory
+ * @kvm - guest system descriptor
+ *
+ * This function is a main routine where we poke guest memory
+ * and install BIOS there.
+ */
+int kvm__arch_setup_firmware(struct kvm *kvm)
+{
+	/* standard minimal configuration */
+	setup_bios(kvm);
+
+	/* FIXME: SMP, ACPI and friends here */
+
+	return 0;
+}
+
+int kvm__arch_free_firmware(struct kvm *kvm)
+{
+	return 0;
+}
+
+void kvm__arch_read_term(struct kvm *kvm)
+{
+	serial8250__update_consoles(kvm);
+	virtio_console__inject_interrupt(kvm);
+}
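
With the boot selector and IP chosen above, the first guest instruction
executes just past the 512-byte boot sector of the setup code. A quick sketch
of the arithmetic (boot_entry_flat is an illustrative helper):

/* Sketch: where the vcpu starts after load_bzimage().
 * 0x1000:0x0200 -> flat 0x10200, i.e. BOOT_LOADER_SELECTOR * 16
 * plus the skipped 512-byte boot sector. */
static unsigned long boot_entry_flat(void)
{
	return ((unsigned long)BOOT_LOADER_SELECTOR << 4) +
	       BOOT_LOADER_IP + 0x200;	/* == 0x10200 */
}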
diff --git a/kvmtool/x86/mptable.c b/kvmtool/x86/mptable.c
new file mode 100644
index 0000000..a984de9
--- /dev/null
+++ b/kvmtool/x86/mptable.c
@@ -0,0 +1,280 @@
+#include "kvm/kvm.h"
+#include "kvm/bios.h"
+#include "kvm/apic.h"
+#include "kvm/mptable.h"
+#include "kvm/util.h"
+#include "kvm/devices.h"
+#include "kvm/pci.h"
+
+#include <linux/kernel.h>
+#include <string.h>
+
+#include <asm/mpspec_def.h>
+#include <linux/types.h>
+
+/*
+ * FIXME: please make sure the addresses borrowed
+ * for apic/ioapic never overlap! We need a global
+ * tracker of system resources (including io, mmio,
+ * and friends).
+ */
+
+static unsigned int mpf_checksum(unsigned char *mp, int len)
+{
+	unsigned int sum = 0;
+
+	while (len--)
+		sum += *mp++;
+
+	return sum & 0xFF;
+}
+
+static unsigned int gen_cpu_flag(unsigned int cpu, unsigned int ncpu)
+{
+	/* sets enabled/disabled | BSP/AP processor */
+	return ( (cpu < ncpu) ? CPU_ENABLED       : 0) |
+		((cpu == 0)   ? CPU_BOOTPROCESSOR : 0x00);
+}
+
+#define MPTABLE_SIG_FLOATING	"_MP_"
+#define MPTABLE_OEM		"KVMCPU00"
+#define MPTABLE_PRODUCTID	"0.1         "
+#define MPTABLE_PCIBUSTYPE	"PCI   "
+#define MPTABLE_ISABUSTYPE	"ISA   "
+
+#define MPTABLE_STRNCPY(d, s)	memcpy(d, s, sizeof(d))
+
+/* It should be more than enough */
+#define MPTABLE_MAX_SIZE	(32 << 20)
+
+/*
+ * Too many cpus will require x2apic mode
+ * and rather ACPI support so we limit it
+ * here for a while.
+ */
+#define MPTABLE_MAX_CPUS	255
+
+static void mptable_add_irq_src(struct mpc_intsrc *mpc_intsrc,
+				u16 srcbusid,	u16 srcbusirq,
+				u16 dstapic,	u16 dstirq)
+{
+	*mpc_intsrc = (struct mpc_intsrc) {
+		.type		= MP_INTSRC,
+		.irqtype	= mp_INT,
+		.irqflag	= MP_IRQDIR_DEFAULT,
+		.srcbus		= srcbusid,
+		.srcbusirq	= srcbusirq,
+		.dstapic	= dstapic,
+		.dstirq		= dstirq
+	};
+}
+
+/**
+ * mptable__init - create the MP table and fill guest memory with it
+ */
+int mptable__init(struct kvm *kvm)
+{
+	unsigned long real_mpc_table, real_mpf_intel, size;
+	struct mpf_intel *mpf_intel;
+	struct mpc_table *mpc_table;
+	struct mpc_cpu *mpc_cpu;
+	struct mpc_bus *mpc_bus;
+	struct mpc_ioapic *mpc_ioapic;
+	struct mpc_intsrc *mpc_intsrc;
+	struct device_header *dev_hdr;
+
+	const int pcibusid = 0;
+	const int isabusid = 1;
+
+	unsigned int i, nentries = 0, ncpus = kvm->nrcpus;
+	unsigned int ioapicid;
+	void *last_addr;
+
+	/* That is where MP table will be in guest memory */
+	real_mpc_table = ALIGN(MB_BIOS_BEGIN + bios_rom_size, 16);
+
+	if (ncpus > MPTABLE_MAX_CPUS) {
+		pr_warning("Too many cpus: %d limited to %d",
+			ncpus, MPTABLE_MAX_CPUS);
+		ncpus = MPTABLE_MAX_CPUS;
+	}
+
+	mpc_table = calloc(1, MPTABLE_MAX_SIZE);
+	if (!mpc_table)
+		return -ENOMEM;
+
+	MPTABLE_STRNCPY(mpc_table->signature,	MPC_SIGNATURE);
+	MPTABLE_STRNCPY(mpc_table->oem,		MPTABLE_OEM);
+	MPTABLE_STRNCPY(mpc_table->productid,	MPTABLE_PRODUCTID);
+
+	mpc_table->spec		= 4;
+	mpc_table->lapic	= APIC_ADDR(0);
+	mpc_table->oemcount	= ncpus; /* will be updated again at end */
+
+	/*
+	 * CPUs enumeration. Technically speaking we should
+	 * ask either host or HV for apic version supported
+	 * but for a while we simply put some random value
+	 * here.
+	 */
+	mpc_cpu = (void *)&mpc_table[1];
+	for (i = 0; i < ncpus; i++) {
+		mpc_cpu->type		= MP_PROCESSOR;
+		mpc_cpu->apicid		= i;
+		mpc_cpu->apicver	= KVM_APIC_VERSION;
+		mpc_cpu->cpuflag	= gen_cpu_flag(i, ncpus);
+		mpc_cpu->cpufeature	= 0x600; /* some default value */
+		mpc_cpu->featureflag	= 0x201; /* some default value */
+		mpc_cpu++;
+	}
+
+	last_addr = (void *)mpc_cpu;
+	nentries += ncpus;
+
+	/*
+	 * PCI buses.
+	 * FIXME: Some callback here to obtain real number
+	 * of PCI buses present in system.
+	 */
+	mpc_bus		= last_addr;
+	mpc_bus->type	= MP_BUS;
+	mpc_bus->busid	= pcibusid;
+	MPTABLE_STRNCPY(mpc_bus->bustype, MPTABLE_PCIBUSTYPE);
+
+	last_addr = (void *)&mpc_bus[1];
+	nentries++;
+
+	/*
+	 * ISA bus.
+	 * FIXME: Same issue as for PCI bus.
+	 */
+	mpc_bus		= last_addr;
+	mpc_bus->type	= MP_BUS;
+	mpc_bus->busid	= isabusid;
+	MPTABLE_STRNCPY(mpc_bus->bustype, MPTABLE_ISABUSTYPE);
+
+	last_addr = (void *)&mpc_bus[1];
+	nentries++;
+
+	/*
+	 * IO-APIC chip.
+	 */
+	ioapicid		= ncpus + 1;
+	mpc_ioapic		= last_addr;
+	mpc_ioapic->type	= MP_IOAPIC;
+	mpc_ioapic->apicid	= ioapicid;
+	mpc_ioapic->apicver	= KVM_APIC_VERSION;
+	mpc_ioapic->flags	= MPC_APIC_USABLE;
+	mpc_ioapic->apicaddr	= IOAPIC_ADDR(0);
+
+	last_addr = (void *)&mpc_ioapic[1];
+	nentries++;
+
+	/*
+	 * IRQ sources.
+	 * Note we only wire up PCI IRQs here, none for the ISA bus yet.
+	 */
+
+	dev_hdr = device__first_dev(DEVICE_BUS_PCI);
+	while (dev_hdr) {
+		unsigned char srcbusirq;
+		struct pci_device_header *pci_hdr = dev_hdr->data;
+
+		srcbusirq = (pci_hdr->subsys_id << 2) | (pci_hdr->irq_pin - 1);
+		mpc_intsrc = last_addr;
+		mptable_add_irq_src(mpc_intsrc, pcibusid, srcbusirq, ioapicid, pci_hdr->irq_line);
+
+		last_addr = (void *)&mpc_intsrc[dev_hdr->dev_num];
+		nentries++;
+		dev_hdr = device__next_dev(dev_hdr);
+	}
+
+	/*
+	 * Local IRQs assignment (LINT0, LINT1)
+	 */
+	mpc_intsrc		= last_addr;
+	mpc_intsrc->type	= MP_LINTSRC;
+	mpc_intsrc->irqtype	= mp_INT;
+	mpc_intsrc->irqflag	= MP_IRQDIR_DEFAULT;
+	mpc_intsrc->srcbus	= isabusid;
+	mpc_intsrc->srcbusirq	= 0;
+	mpc_intsrc->dstapic	= 0; /* FIXME: BSP apic */
+	mpc_intsrc->dstirq	= 0; /* LINT0 */
+
+	last_addr = (void *)&mpc_intsrc[1];
+	nentries++;
+
+	mpc_intsrc		= last_addr;
+	mpc_intsrc->type	= MP_LINTSRC;
+	mpc_intsrc->irqtype	= mp_NMI;
+	mpc_intsrc->irqflag	= MP_IRQDIR_DEFAULT;
+	mpc_intsrc->srcbus	= isabusid;
+	mpc_intsrc->srcbusirq	= 0;
+	mpc_intsrc->dstapic	= 0; /* FIXME: BSP apic */
+	mpc_intsrc->dstirq	= 1; /* LINT1 */
+
+	last_addr = (void *)&mpc_intsrc[1];
+	nentries++;
+
+	/*
+	 * Floating MP table finally.
+	 */
+	real_mpf_intel	= ALIGN((unsigned long)last_addr - (unsigned long)mpc_table, 16);
+	mpf_intel	= (void *)((unsigned long)mpc_table + real_mpf_intel);
+
+	MPTABLE_STRNCPY(mpf_intel->signature, MPTABLE_SIG_FLOATING);
+	mpf_intel->length	= 1;
+	mpf_intel->specification= 4;
+	mpf_intel->physptr	= (unsigned int)real_mpc_table;
+	mpf_intel->checksum	= -mpf_checksum((unsigned char *)mpf_intel, sizeof(*mpf_intel));
+
+	/*
+	 * No last_addr increment here, please; we need the last
+	 * active position to compute the table size.
+	 */
+
+	/*
+	 * Don't forget to update the header in the fixed table.
+	 */
+	mpc_table->oemcount	= nentries;
+	mpc_table->length	= last_addr - (void *)mpc_table;
+	mpc_table->checksum	= -mpf_checksum((unsigned char *)mpc_table, mpc_table->length);
+
+
+	/*
+	 * We will copy the whole table, no need to separate
+	 * floating structure and table itself.
+	 */
+	size = (unsigned long)mpf_intel + sizeof(*mpf_intel) - (unsigned long)mpc_table;
+
+	/*
+	 * The final check -- never stray outside of the system BIOS
+	 * area. Let's also check for allocated memory overrun;
+	 * in reality it's late, but still useful.
+	 */
+
+	if (size > (unsigned long)(MB_BIOS_END - bios_rom_size) ||
+	    size > MPTABLE_MAX_SIZE) {
+		free(mpc_table);
+		pr_err("MP table is too big");
+
+		return -E2BIG;
+	}
+
+	/*
+	 * OK, it is time to move it to guest memory.
+	 */
+	memcpy(guest_flat_to_host(kvm, real_mpc_table), mpc_table, size);
+
+	free(mpc_table);
+
+	return 0;
+}
+firmware_init(mptable__init);
+
+int mptable__exit(struct kvm *kvm)
+{
+	return 0;
+}
+firmware_exit(mptable__exit);
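
Both checksum fields written above are defined so that the covered bytes,
checksum included, sum to zero modulo 256. A minimal sketch of the check a
test could run against the table just copied into guest memory
(mp_checksum_ok is an illustrative helper):

/* Sketch: verify the invariant mptable__init() establishes. */
static int mp_checksum_ok(const unsigned char *p, unsigned int len)
{
	unsigned int sum = 0;

	while (len--)
		sum += *p++;

	return (sum & 0xff) == 0;
}

/* After mptable__init(), both of these should hold:
 *   mp_checksum_ok((void *)mpf_intel, sizeof(*mpf_intel))
 *   mp_checksum_ok((void *)mpc_table, mpc_table->length)
 */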