diff --git a/MAINTAINERS b/MAINTAINERS
index 84d6a8277cbd..6c8b66d2adcb 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7640,17 +7640,6 @@ T:	git git://linuxtv.org/mkrufky/tuners.git
 S:	Maintained
 F:	drivers/media/dvb-frontends/lgdt3305.*
 
-LGUEST
-M:	Rusty Russell <rusty@rustcorp.com.au>
-L:	lguest@lists.ozlabs.org
-W:	http://lguest.ozlabs.org/
-S:	Odd Fixes
-F:	arch/x86/include/asm/lguest*.h
-F:	arch/x86/lguest/
-F:	drivers/lguest/
-F:	include/linux/lguest*.h
-F:	tools/lguest/
-
 LIBATA PATA ARASAN COMPACT FLASH CONTROLLER
 M:	Viresh Kumar <vireshk@kernel.org>
 L:	linux-ide@vger.kernel.org
diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild
index 586b786b3edf..f65a804b86f0 100644
--- a/arch/x86/Kbuild
+++ b/arch/x86/Kbuild
@@ -10,9 +10,6 @@ obj-$(CONFIG_XEN) += xen/
 # Hyper-V paravirtualization support
 obj-$(CONFIG_HYPERVISOR_GUEST) += hyperv/
 
-# lguest paravirtualization support
-obj-$(CONFIG_LGUEST_GUEST) += lguest/
-
 obj-y += realmode/
 obj-y += kernel/
 obj-y += mm/
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 9b302121584d..651021713385 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -777,8 +777,6 @@ config KVM_DEBUG_FS
 	  Statistics are displayed in debugfs filesystem. Enabling this option
 	  may incur significant overhead.
 
-source "arch/x86/lguest/Kconfig"
-
 config PARAVIRT_TIME_ACCOUNTING
 	bool "Paravirtual steal time accounting"
 	depends on PARAVIRT
diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h
deleted file mode 100644
index 73d0c9b92087..000000000000
--- a/arch/x86/include/asm/lguest.h
+++ /dev/null
@@ -1,91 +0,0 @@
-#ifndef _ASM_X86_LGUEST_H
-#define _ASM_X86_LGUEST_H
-
-#define GDT_ENTRY_LGUEST_CS	10
-#define GDT_ENTRY_LGUEST_DS	11
-#define LGUEST_CS		(GDT_ENTRY_LGUEST_CS * 8)
-#define LGUEST_DS		(GDT_ENTRY_LGUEST_DS * 8)
-
-#ifndef __ASSEMBLY__
-#include <asm/desc.h>
-
-#define GUEST_PL 1
-
-/* Page for Switcher text itself, then two pages per cpu */
-#define SWITCHER_TEXT_PAGES (1)
-#define SWITCHER_STACK_PAGES (2 * nr_cpu_ids)
-#define TOTAL_SWITCHER_PAGES (SWITCHER_TEXT_PAGES + SWITCHER_STACK_PAGES)
-
-/* Where we map the Switcher, in both Host and Guest. */
-extern unsigned long switcher_addr;
-
-/* Found in switcher.S */
-extern unsigned long default_idt_entries[];
-
-/* Declarations for definitions in arch/x86/lguest/head_32.S */
-extern char lguest_noirq_iret[];
-extern const char lgstart_cli[], lgend_cli[];
-extern const char lgstart_pushf[], lgend_pushf[];
-
-extern void lguest_iret(void);
-extern void lguest_init(void);
-
-struct lguest_regs {
-	/* Manually saved part. */
-	unsigned long eax, ebx, ecx, edx;
-	unsigned long esi, edi, ebp;
-	unsigned long gs;
-	unsigned long fs, ds, es;
-	unsigned long trapnum, errcode;
-	/* Trap pushed part */
-	unsigned long eip;
-	unsigned long cs;
-	unsigned long eflags;
-	unsigned long esp;
-	unsigned long ss;
-};
-
-/* This is a guest-specific page (mapped ro) into the guest. */
-struct lguest_ro_state {
-	/* Host information we need to restore when we switch back. */
-	u32 host_cr3;
-	struct desc_ptr host_idt_desc;
-	struct desc_ptr host_gdt_desc;
-	u32 host_sp;
-
-	/* Fields which are used when guest is running. */
-	struct desc_ptr guest_idt_desc;
-	struct desc_ptr guest_gdt_desc;
-	struct x86_hw_tss guest_tss;
-	struct desc_struct guest_idt[IDT_ENTRIES];
-	struct desc_struct guest_gdt[GDT_ENTRIES];
-};
-
-struct lg_cpu_arch {
-	/* The GDT entries copied into lguest_ro_state when running. */
-	struct desc_struct gdt[GDT_ENTRIES];
-
-	/* The IDT entries: some copied into lguest_ro_state when running. */
-	struct desc_struct idt[IDT_ENTRIES];
-
-	/* The address of the last guest-visible pagefault (ie. cr2). */
-	unsigned long last_pagefault;
-};
-
-static inline void lguest_set_ts(void)
-{
-	u32 cr0;
-
-	cr0 = read_cr0();
-	if (!(cr0 & 8))
-		write_cr0(cr0 | 8);
-}
-
-/* Full 4G segment descriptors, suitable for CS and DS. */
-#define FULL_EXEC_SEGMENT \
-	((struct desc_struct)GDT_ENTRY_INIT(0xc09b, 0, 0xfffff))
-#define FULL_SEGMENT ((struct desc_struct)GDT_ENTRY_INIT(0xc093, 0, 0xfffff))
-
-#endif /* __ASSEMBLY__ */
-
-#endif /* _ASM_X86_LGUEST_H */
diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h
deleted file mode 100644
index 6c119cfae218..000000000000
--- a/arch/x86/include/asm/lguest_hcall.h
+++ /dev/null
@@ -1,74 +0,0 @@
-/* Architecture specific portion of the lguest hypercalls */
-#ifndef _ASM_X86_LGUEST_HCALL_H
-#define _ASM_X86_LGUEST_HCALL_H
-
-#define LHCALL_FLUSH_ASYNC	0
-#define LHCALL_LGUEST_INIT	1
-#define LHCALL_SHUTDOWN		2
-#define LHCALL_NEW_PGTABLE	4
-#define LHCALL_FLUSH_TLB	5
-#define LHCALL_LOAD_IDT_ENTRY	6
-#define LHCALL_SET_STACK	7
-#define LHCALL_SET_CLOCKEVENT	9
-#define LHCALL_HALT		10
-#define LHCALL_SET_PMD		13
-#define LHCALL_SET_PTE		14
-#define LHCALL_SET_PGD		15
-#define LHCALL_LOAD_TLS		16
-#define LHCALL_LOAD_GDT_ENTRY	18
-#define LHCALL_SEND_INTERRUPTS	19
-
-#define LGUEST_TRAP_ENTRY 0x1F
-
-/* Argument number 3 to LHCALL_LGUEST_SHUTDOWN */
-#define LGUEST_SHUTDOWN_POWEROFF	1
-#define LGUEST_SHUTDOWN_RESTART		2
-
-#ifndef __ASSEMBLY__
-#include <asm/hw_irq.h>
-
-/*G:030
- * But first, how does our Guest contact the Host to ask for privileged
- * operations?  There are two ways: the direct way is to make a "hypercall",
- * to make requests of the Host Itself.
- *
- * Our hypercall mechanism uses the highest unused trap code (traps 32 and
- * above are used by real hardware interrupts).  Seventeen hypercalls are
- * available: the hypercall number is put in the %eax register, and the
- * arguments (when required) are placed in %ebx, %ecx, %edx and %esi.
- * If a return value makes sense, it's returned in %eax.
- *
- * Grossly invalid calls result in Sudden Death at the hands of the vengeful
- * Host, rather than returning failure.  This reflects Winston Churchill's
- * definition of a gentleman: "someone who is only rude intentionally".
- */
-static inline unsigned long
-hcall(unsigned long call,
-      unsigned long arg1, unsigned long arg2, unsigned long arg3,
-      unsigned long arg4)
-{
-	/* "int" is the Intel instruction to trigger a trap. */
-	asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY)
-		     /* The call in %eax (aka "a") might be overwritten */
-		     : "=a"(call)
-		       /* The arguments are in %eax, %ebx, %ecx, %edx & %esi */
-		     : "a"(call), "b"(arg1), "c"(arg2), "d"(arg3), "S"(arg4)
-		       /* "memory" means this might write somewhere in memory.
-			* This isn't true for all calls, but it's safe to tell
-			* gcc that it might happen so it doesn't get clever. */
-		     : "memory");
-	return call;
-}
-/*:*/
-
-/* Can't use our min() macro here: needs to be a constant */
-#define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32)
-
-#define LHCALL_RING_SIZE 64
-struct hcall_args {
-	/* These map directly onto eax/ebx/ecx/edx/esi in struct lguest_regs */
-	unsigned long arg0, arg1, arg2, arg3, arg4;
-};
-
-#endif /* !__ASSEMBLY__ */
-#endif /* _ASM_X86_LGUEST_HCALL_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 0b03d655db7c..abc99b9c7ffd 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -662,7 +662,7 @@ static inline void sync_core(void)
 	 * In case NMI unmasking or performance ever becomes a problem,
 	 * the next best option appears to be MOV-to-CR2 and an
 	 * unconditional jump.  That sequence also works on all CPUs,
-	 * but it will fault at CPL3 (i.e. Xen PV and lguest).
+	 * but it will fault at CPL3 (i.e. Xen PV).
 	 *
 	 * CPUID is the conventional way, but it's nasty: it doesn't
 	 * exist on some 486-like CPUs, and it usually exits to a
diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h
index ddef37b16af2..66b8f93333d1 100644
--- a/arch/x86/include/uapi/asm/bootparam.h
+++ b/arch/x86/include/uapi/asm/bootparam.h
@@ -201,7 +201,7 @@ struct boot_params {
  *
  * @X86_SUBARCH_PC: Should be used if the hardware is enumerable using standard
  *	PC mechanisms (PCI, ACPI) and doesn't need a special boot flow.
- * @X86_SUBARCH_LGUEST: Used for x86 hypervisor demo, lguest
+ * @X86_SUBARCH_LGUEST: Used for x86 hypervisor demo, lguest (deprecated)
  * @X86_SUBARCH_XEN: Used for Xen guest types which follow the PV boot path,
  * 	which start at asm startup_xen() entry point and later jump to the C
  * 	xen_start_kernel() entry point. Both domU and dom0 type of guests are
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 880aa093268d..710edab9e644 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -4,9 +4,6 @@
 
 #include <asm/ucontext.h>
 
-#include <linux/lguest.h>
-#include "../../../drivers/lguest/lg.h"
-
 #define __SYSCALL_I386(nr, sym, qual) [nr] = 1,
 static char syscalls[] = {
 #include <asm/syscalls_32.h>
@@ -62,23 +59,6 @@ void foo(void)
 	OFFSET(stack_canary_offset, stack_canary, canary);
 #endif
 
-#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE)
-	BLANK();
-	OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
-	OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending);
-
-	BLANK();
-	OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc);
-	OFFSET(LGUEST_PAGES_host_idt_desc, lguest_pages, state.host_idt_desc);
-	OFFSET(LGUEST_PAGES_host_cr3, lguest_pages, state.host_cr3);
-	OFFSET(LGUEST_PAGES_host_sp, lguest_pages, state.host_sp);
-	OFFSET(LGUEST_PAGES_guest_gdt_desc, lguest_pages,state.guest_gdt_desc);
-	OFFSET(LGUEST_PAGES_guest_idt_desc, lguest_pages,state.guest_idt_desc);
-	OFFSET(LGUEST_PAGES_guest_gdt, lguest_pages, state.guest_gdt);
-	OFFSET(LGUEST_PAGES_regs_trapnum, lguest_pages, regs.trapnum);
-	OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode);
-	OFFSET(LGUEST_PAGES_regs, lguest_pages, regs);
-#endif
 	BLANK();
 	DEFINE(__NR_syscall_max, sizeof(syscalls) - 1);
 	DEFINE(NR_syscalls, sizeof(syscalls));
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 0332664eb158..29da9599fec0 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -155,7 +155,6 @@ ENTRY(startup_32)
 	jmp *%eax
 
 .Lbad_subarch:
-WEAK(lguest_entry)
 WEAK(xen_entry)
 	/* Unknown implementation; there's really
 	   nothing we can do at this point. */
@@ -165,7 +164,6 @@ WEAK(xen_entry)
 
 subarch_entries:
 	.long .Ldefault_entry		/* normal x86/PC */
-	.long lguest_entry		/* lguest hypervisor */
 	.long xen_entry			/* Xen hypervisor */
 	.long .Ldefault_entry		/* Moorestown MID */
 num_subarch_entries = (. - subarch_entries) / 4
diff --git a/arch/x86/kernel/platform-quirks.c b/arch/x86/kernel/platform-quirks.c
index 91271122f0df..502a77d0adb0 100644
--- a/arch/x86/kernel/platform-quirks.c
+++ b/arch/x86/kernel/platform-quirks.c
@@ -16,7 +16,6 @@ void __init x86_early_init_platform_quirks(void)
 		x86_platform.legacy.reserve_bios_regions = 1;
 		break;
 	case X86_SUBARCH_XEN:
-	case X86_SUBARCH_LGUEST:
 		x86_platform.legacy.devices.pnpbios = 0;
 		x86_platform.legacy.rtc = 0;
 		break;
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 2688c7dc5323..3ea624452f93 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -89,6 +89,5 @@ config KVM_MMU_AUDIT
 # OK, it's a little counter-intuitive to do this, but it puts it neatly under
 # the virtualization menu.
 source drivers/vhost/Kconfig
-source drivers/lguest/Kconfig
 
 endif # VIRTUALIZATION
diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig
deleted file mode 100644
index 08f41caada45..000000000000
--- a/arch/x86/lguest/Kconfig
+++ /dev/null
@@ -1,14 +0,0 @@
-config LGUEST_GUEST
-	bool "Lguest guest support"
-	depends on X86_32 && PARAVIRT && PCI
-	select TTY
-	select VIRTUALIZATION
-	select VIRTIO
-	select VIRTIO_CONSOLE
-	help
-	  Lguest is a tiny in-kernel hypervisor.  Selecting this will
-	  allow your kernel to boot under lguest.  This option will increase
-	  your kernel size by about 10k.  If in doubt, say N.
-
-	  If you say Y here, make sure you say Y (or M) to the virtio block
-	  and net drivers which lguest needs.
diff --git a/arch/x86/lguest/Makefile b/arch/x86/lguest/Makefile
deleted file mode 100644
index 8f38d577a2fa..000000000000
--- a/arch/x86/lguest/Makefile
+++ /dev/null
@@ -1,2 +0,0 @@
-obj-y		:= head_32.o boot.o
-CFLAGS_boot.o	:= $(call cc-option, -fno-stack-protector)
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
deleted file mode 100644
index 99472698c931..000000000000
--- a/arch/x86/lguest/boot.c
+++ /dev/null
@@ -1,1558 +0,0 @@
-/*P:010
- * A hypervisor allows multiple Operating Systems to run on a single machine.
- * To quote David Wheeler: "Any problem in computer science can be solved with
- * another layer of indirection."
- *
- * We keep things simple in two ways.  First, we start with a normal Linux
- * kernel and insert a module (lg.ko) which allows us to run other Linux
- * kernels the same way we'd run processes.  We call the first kernel the Host,
- * and the others the Guests.  The program which sets up and configures Guests
- * (such as the example in tools/lguest/lguest.c) is called the Launcher.
- *
- * Secondly, we only run specially modified Guests, not normal kernels: setting
- * CONFIG_LGUEST_GUEST to "y" compiles this file into the kernel so it knows
- * how to be a Guest at boot time.  This means that you can use the same kernel
- * you boot normally (ie. as a Host) as a Guest.
- *
- * These Guests know that they cannot do privileged operations, such as disable
- * interrupts, and that they have to ask the Host to do such things explicitly.
- * This file consists of all the replacements for such low-level native
- * hardware operations: these special Guest versions call the Host.
- *
- * So how does the kernel know it's a Guest?  We'll see that later, but let's
- * just say that we end up here where we replace the native functions various
- * "paravirt" structures with our Guest versions, then boot like normal.
-:*/
-
-/*
- * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT.  See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-#include <linux/kernel.h>
-#include <linux/start_kernel.h>
-#include <linux/string.h>
-#include <linux/console.h>
-#include <linux/screen_info.h>
-#include <linux/irq.h>
-#include <linux/interrupt.h>
-#include <linux/clocksource.h>
-#include <linux/clockchips.h>
-#include <linux/lguest.h>
-#include <linux/lguest_launcher.h>
-#include <linux/virtio_console.h>
-#include <linux/pm.h>
-#include <linux/export.h>
-#include <linux/pci.h>
-#include <linux/virtio_pci.h>
-#include <asm/acpi.h>
-#include <asm/apic.h>
-#include <asm/lguest.h>
-#include <asm/paravirt.h>
-#include <asm/param.h>
-#include <asm/page.h>
-#include <asm/pgtable.h>
-#include <asm/desc.h>
-#include <asm/setup.h>
-#include <asm/e820/api.h>
-#include <asm/mce.h>
-#include <asm/io.h>
-#include <asm/fpu/api.h>
-#include <asm/stackprotector.h>
-#include <asm/reboot.h>		/* for struct machine_ops */
-#include <asm/kvm_para.h>
-#include <asm/pci_x86.h>
-#include <asm/pci-direct.h>
-
-/*G:010
- * Welcome to the Guest!
- *
- * The Guest in our tale is a simple creature: identical to the Host but
- * behaving in simplified but equivalent ways.  In particular, the Guest is the
- * same kernel as the Host (or at least, built from the same source code).
-:*/
-
-struct lguest_data lguest_data = {
-	.hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF },
-	.noirq_iret = (u32)lguest_noirq_iret,
-	.kernel_address = PAGE_OFFSET,
-	.blocked_interrupts = { 1 }, /* Block timer interrupts */
-	.syscall_vec = IA32_SYSCALL_VECTOR,
-};
-
-/*G:037
- * async_hcall() is pretty simple: I'm quite proud of it really.  We have a
- * ring buffer of stored hypercalls which the Host will run though next time we
- * do a normal hypercall.  Each entry in the ring has 5 slots for the hypercall
- * arguments, and a "hcall_status" word which is 0 if the call is ready to go,
- * and 255 once the Host has finished with it.
- *
- * If we come around to a slot which hasn't been finished, then the table is
- * full and we just make the hypercall directly.  This has the nice side
- * effect of causing the Host to run all the stored calls in the ring buffer
- * which empties it for next time!
- */
-static void async_hcall(unsigned long call, unsigned long arg1,
-			unsigned long arg2, unsigned long arg3,
-			unsigned long arg4)
-{
-	/* Note: This code assumes we're uniprocessor. */
-	static unsigned int next_call;
-	unsigned long flags;
-
-	/*
-	 * Disable interrupts if not already disabled: we don't want an
-	 * interrupt handler making a hypercall while we're already doing
-	 * one!
-	 */
-	local_irq_save(flags);
-	if (lguest_data.hcall_status[next_call] != 0xFF) {
-		/* Table full, so do normal hcall which will flush table. */
-		hcall(call, arg1, arg2, arg3, arg4);
-	} else {
-		lguest_data.hcalls[next_call].arg0 = call;
-		lguest_data.hcalls[next_call].arg1 = arg1;
-		lguest_data.hcalls[next_call].arg2 = arg2;
-		lguest_data.hcalls[next_call].arg3 = arg3;
-		lguest_data.hcalls[next_call].arg4 = arg4;
-		/* Arguments must all be written before we mark it to go */
-		wmb();
-		lguest_data.hcall_status[next_call] = 0;
-		if (++next_call == LHCALL_RING_SIZE)
-			next_call = 0;
-	}
-	local_irq_restore(flags);
-}
-
-/*G:035
- * Notice the lazy_hcall() above, rather than hcall().  This is our first real
- * optimization trick!
- *
- * When lazy_mode is set, it means we're allowed to defer all hypercalls and do
- * them as a batch when lazy_mode is eventually turned off.  Because hypercalls
- * are reasonably expensive, batching them up makes sense.  For example, a
- * large munmap might update dozens of page table entries: that code calls
- * paravirt_enter_lazy_mmu(), does the dozen updates, then calls
- * lguest_leave_lazy_mode().
- *
- * So, when we're in lazy mode, we call async_hcall() to store the call for
- * future processing:
- */
-static void lazy_hcall1(unsigned long call, unsigned long arg1)
-{
-	if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
-		hcall(call, arg1, 0, 0, 0);
-	else
-		async_hcall(call, arg1, 0, 0, 0);
-}
-
-/* You can imagine what lazy_hcall2, 3 and 4 look like. :*/
-static void lazy_hcall2(unsigned long call,
-			unsigned long arg1,
-			unsigned long arg2)
-{
-	if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
-		hcall(call, arg1, arg2, 0, 0);
-	else
-		async_hcall(call, arg1, arg2, 0, 0);
-}
-
-static void lazy_hcall3(unsigned long call,
-			unsigned long arg1,
-			unsigned long arg2,
-			unsigned long arg3)
-{
-	if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
-		hcall(call, arg1, arg2, arg3, 0);
-	else
-		async_hcall(call, arg1, arg2, arg3, 0);
-}
-
-#ifdef CONFIG_X86_PAE
-static void lazy_hcall4(unsigned long call,
-			unsigned long arg1,
-			unsigned long arg2,
-			unsigned long arg3,
-			unsigned long arg4)
-{
-	if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
-		hcall(call, arg1, arg2, arg3, arg4);
-	else
-		async_hcall(call, arg1, arg2, arg3, arg4);
-}
-#endif
-
-/*G:036
- * When lazy mode is turned off, we issue the do-nothing hypercall to
- * flush any stored calls, and call the generic helper to reset the
- * per-cpu lazy mode variable.
- */
-static void lguest_leave_lazy_mmu_mode(void)
-{
-	hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0);
-	paravirt_leave_lazy_mmu();
-}
-
-/*
- * We also catch the end of context switch; we enter lazy mode for much of
- * that too, so again we need to flush here.
- *
- * (Technically, this is lazy CPU mode, and normally we're in lazy MMU
- * mode, but unlike Xen, lguest doesn't care about the difference).
- */
-static void lguest_end_context_switch(struct task_struct *next)
-{
-	hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0);
-	paravirt_end_context_switch(next);
-}
-
-/*G:032
- * After that diversion we return to our first native-instruction
- * replacements: four functions for interrupt control.
- *
- * The simplest way of implementing these would be to have "turn interrupts
- * off" and "turn interrupts on" hypercalls.  Unfortunately, this is too slow:
- * these are by far the most commonly called functions of those we override.
- *
- * So instead we keep an "irq_enabled" field inside our "struct lguest_data",
- * which the Guest can update with a single instruction.  The Host knows to
- * check there before it tries to deliver an interrupt.
- */
-
-/*
- * save_flags() is expected to return the processor state (ie. "flags").  The
- * flags word contains all kind of stuff, but in practice Linux only cares
- * about the interrupt flag.  Our "save_flags()" just returns that.
- */
-asmlinkage __visible unsigned long lguest_save_fl(void)
-{
-	return lguest_data.irq_enabled;
-}
-
-/* Interrupts go off... */
-asmlinkage __visible void lguest_irq_disable(void)
-{
-	lguest_data.irq_enabled = 0;
-}
-
-/*
- * Let's pause a moment.  Remember how I said these are called so often?
- * Jeremy Fitzhardinge optimized them so hard early in 2009 that he had to
- * break some rules.  In particular, these functions are assumed to save their
- * own registers if they need to: normal C functions assume they can trash the
- * eax register.  To use normal C functions, we use
- * PV_CALLEE_SAVE_REGS_THUNK(), which pushes %eax onto the stack, calls the
- * C function, then restores it.
- */
-PV_CALLEE_SAVE_REGS_THUNK(lguest_save_fl);
-PV_CALLEE_SAVE_REGS_THUNK(lguest_irq_disable);
-/*:*/
-
-/* These are in head_32.S */
-extern void lg_irq_enable(void);
-extern void lg_restore_fl(unsigned long flags);
-
-/*M:003
- * We could be more efficient in our checking of outstanding interrupts, rather
- * than using a branch.  One way would be to put the "irq_enabled" field in a
- * page by itself, and have the Host write-protect it when an interrupt comes
- * in when irqs are disabled.  There will then be a page fault as soon as
- * interrupts are re-enabled.
- *
- * A better method is to implement soft interrupt disable generally for x86:
- * instead of disabling interrupts, we set a flag.  If an interrupt does come
- * in, we then disable them for real.  This is uncommon, so we could simply use
- * a hypercall for interrupt control and not worry about efficiency.
-:*/
-
-/*G:034
- * The Interrupt Descriptor Table (IDT).
- *
- * The IDT tells the processor what to do when an interrupt comes in.  Each
- * entry in the table is a 64-bit descriptor: this holds the privilege level,
- * address of the handler, and... well, who cares?  The Guest just asks the
- * Host to make the change anyway, because the Host controls the real IDT.
- */
-static void lguest_write_idt_entry(gate_desc *dt,
-				   int entrynum, const gate_desc *g)
-{
-	/*
-	 * The gate_desc structure is 8 bytes long: we hand it to the Host in
-	 * two 32-bit chunks.  The whole 32-bit kernel used to hand descriptors
-	 * around like this; typesafety wasn't a big concern in Linux's early
-	 * years.
-	 */
-	u32 *desc = (u32 *)g;
-	/* Keep the local copy up to date. */
-	native_write_idt_entry(dt, entrynum, g);
-	/* Tell Host about this new entry. */
-	hcall(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1], 0);
-}
-
-/*
- * Changing to a different IDT is very rare: we keep the IDT up-to-date every
- * time it is written, so we can simply loop through all entries and tell the
- * Host about them.
- */
-static void lguest_load_idt(const struct desc_ptr *desc)
-{
-	unsigned int i;
-	struct desc_struct *idt = (void *)desc->address;
-
-	for (i = 0; i < (desc->size+1)/8; i++)
-		hcall(LHCALL_LOAD_IDT_ENTRY, i, idt[i].a, idt[i].b, 0);
-}
-
-/*
- * The Global Descriptor Table.
- *
- * The Intel architecture defines another table, called the Global Descriptor
- * Table (GDT).  You tell the CPU where it is (and its size) using the "lgdt"
- * instruction, and then several other instructions refer to entries in the
- * table.  There are three entries which the Switcher needs, so the Host simply
- * controls the entire thing and the Guest asks it to make changes using the
- * LOAD_GDT hypercall.
- *
- * This is the exactly like the IDT code.
- */
-static void lguest_load_gdt(const struct desc_ptr *desc)
-{
-	unsigned int i;
-	struct desc_struct *gdt = (void *)desc->address;
-
-	for (i = 0; i < (desc->size+1)/8; i++)
-		hcall(LHCALL_LOAD_GDT_ENTRY, i, gdt[i].a, gdt[i].b, 0);
-}
-
-/*
- * For a single GDT entry which changes, we simply change our copy and
- * then tell the host about it.
- */
-static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum,
-				   const void *desc, int type)
-{
-	native_write_gdt_entry(dt, entrynum, desc, type);
-	/* Tell Host about this new entry. */
-	hcall(LHCALL_LOAD_GDT_ENTRY, entrynum,
-	      dt[entrynum].a, dt[entrynum].b, 0);
-}
-
-/*
- * There are three "thread local storage" GDT entries which change
- * on every context switch (these three entries are how glibc implements
- * __thread variables).  As an optimization, we have a hypercall
- * specifically for this case.
- *
- * Wouldn't it be nicer to have a general LOAD_GDT_ENTRIES hypercall
- * which took a range of entries?
- */
-static void lguest_load_tls(struct thread_struct *t, unsigned int cpu)
-{
-	/*
-	 * There's one problem which normal hardware doesn't have: the Host
-	 * can't handle us removing entries we're currently using.  So we clear
-	 * the GS register here: if it's needed it'll be reloaded anyway.
-	 */
-	lazy_load_gs(0);
-	lazy_hcall2(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu);
-}
-
-/*G:038
- * That's enough excitement for now, back to ploughing through each of the
- * different pv_ops structures (we're about 1/3 of the way through).
- *
- * This is the Local Descriptor Table, another weird Intel thingy.  Linux only
- * uses this for some strange applications like Wine.  We don't do anything
- * here, so they'll get an informative and friendly Segmentation Fault.
- */
-static void lguest_set_ldt(const void *addr, unsigned entries)
-{
-}
-
-/*
- * This loads a GDT entry into the "Task Register": that entry points to a
- * structure called the Task State Segment.  Some comments scattered though the
- * kernel code indicate that this used for task switching in ages past, along
- * with blood sacrifice and astrology.
- *
- * Now there's nothing interesting in here that we don't get told elsewhere.
- * But the native version uses the "ltr" instruction, which makes the Host
- * complain to the Guest about a Segmentation Fault and it'll oops.  So we
- * override the native version with a do-nothing version.
- */
-static void lguest_load_tr_desc(void)
-{
-}
-
-/*
- * The "cpuid" instruction is a way of querying both the CPU identity
- * (manufacturer, model, etc) and its features.  It was introduced before the
- * Pentium in 1993 and keeps getting extended by both Intel, AMD and others.
- * As you might imagine, after a decade and a half this treatment, it is now a
- * giant ball of hair.  Its entry in the current Intel manual runs to 28 pages.
- *
- * This instruction even it has its own Wikipedia entry.  The Wikipedia entry
- * has been translated into 6 languages.  I am not making this up!
- *
- * We could get funky here and identify ourselves as "GenuineLguest", but
- * instead we just use the real "cpuid" instruction.  Then I pretty much turned
- * off feature bits until the Guest booted.  (Don't say that: you'll damage
- * lguest sales!)  Shut up, inner voice!  (Hey, just pointing out that this is
- * hardly future proof.)  No one's listening!  They don't like you anyway,
- * parenthetic weirdo!
- *
- * Replacing the cpuid so we can turn features off is great for the kernel, but
- * anyone (including userspace) can just use the raw "cpuid" instruction and
- * the Host won't even notice since it isn't privileged.  So we try not to get
- * too worked up about it.
- */
-static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
-			 unsigned int *cx, unsigned int *dx)
-{
-	int function = *ax;
-
-	native_cpuid(ax, bx, cx, dx);
-	switch (function) {
-	/*
-	 * CPUID 0 gives the highest legal CPUID number (and the ID string).
-	 * We futureproof our code a little by sticking to known CPUID values.
-	 */
-	case 0:
-		if (*ax > 5)
-			*ax = 5;
-		break;
-
-	/*
-	 * CPUID 1 is a basic feature request.
-	 *
-	 * CX: we only allow kernel to see SSE3, CMPXCHG16B and SSSE3
-	 * DX: SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU and PAE.
-	 */
-	case 1:
-		*cx &= 0x00002201;
-		*dx &= 0x07808151;
-		/*
-		 * The Host can do a nice optimization if it knows that the
-		 * kernel mappings (addresses above 0xC0000000 or whatever
-		 * PAGE_OFFSET is set to) haven't changed.  But Linux calls
-		 * flush_tlb_user() for both user and kernel mappings unless
-		 * the Page Global Enable (PGE) feature bit is set.
-		 */
-		*dx |= 0x00002000;
-		/*
-		 * We also lie, and say we're family id 5.  6 or greater
-		 * leads to a rdmsr in early_init_intel which we can't handle.
-		 * Family ID is returned as bits 8-12 in ax.
-		 */
-		*ax &= 0xFFFFF0FF;
-		*ax |= 0x00000500;
-		break;
-
-	/*
-	 * This is used to detect if we're running under KVM.  We might be,
-	 * but that's a Host matter, not us.  So say we're not.
-	 */
-	case KVM_CPUID_SIGNATURE:
-		*bx = *cx = *dx = 0;
-		break;
-
-	/*
-	 * 0x80000000 returns the highest Extended Function, so we futureproof
-	 * like we do above by limiting it to known fields.
-	 */
-	case 0x80000000:
-		if (*ax > 0x80000008)
-			*ax = 0x80000008;
-		break;
-
-	/*
-	 * PAE systems can mark pages as non-executable.  Linux calls this the
-	 * NX bit.  Intel calls it XD (eXecute Disable), AMD EVP (Enhanced
-	 * Virus Protection).  We just switch it off here, since we don't
-	 * support it.
-	 */
-	case 0x80000001:
-		*dx &= ~(1 << 20);
-		break;
-	}
-}
-
-/*
- * Intel has four control registers, imaginatively named cr0, cr2, cr3 and cr4.
- * I assume there's a cr1, but it hasn't bothered us yet, so we'll not bother
- * it.  The Host needs to know when the Guest wants to change them, so we have
- * a whole series of functions like read_cr0() and write_cr0().
- *
- * We start with cr0.  cr0 allows you to turn on and off all kinds of basic
- * features, but the only cr0 bit that Linux ever used at runtime was the
- * horrifically-named Task Switched (TS) bit at bit 3 (ie. 8)
- *
- * What does the TS bit do?  Well, it causes the CPU to trap (interrupt 7) if
- * the floating point unit is used.  Which allows us to restore FPU state
- * lazily after a task switch if we wanted to, but wouldn't a name like
- * "FPUTRAP bit" be a little less cryptic?
- *
- * Fortunately, Linux keeps it simple and doesn't use TS, so we can ignore
- * cr0.
- */
-static void lguest_write_cr0(unsigned long val)
-{
-}
-
-static unsigned long lguest_read_cr0(void)
-{
-	return 0;
-}
-
-/*
- * cr2 is the virtual address of the last page fault, which the Guest only ever
- * reads.  The Host kindly writes this into our "struct lguest_data", so we
- * just read it out of there.
- */
-static unsigned long lguest_read_cr2(void)
-{
-	return lguest_data.cr2;
-}
-
-/* See lguest_set_pte() below. */
-static bool cr3_changed = false;
-static unsigned long current_cr3;
-
-/*
- * cr3 is the current toplevel pagetable page: the principle is the same as
- * cr0.  Keep a local copy, and tell the Host when it changes.
- */
-static void lguest_write_cr3(unsigned long cr3)
-{
-	lazy_hcall1(LHCALL_NEW_PGTABLE, cr3);
-	current_cr3 = cr3;
-
-	/* These two page tables are simple, linear, and used during boot */
-	if (cr3 != __pa_symbol(swapper_pg_dir) &&
-	    cr3 != __pa_symbol(initial_page_table))
-		cr3_changed = true;
-}
-
-static unsigned long lguest_read_cr3(void)
-{
-	return current_cr3;
-}
-
-/* cr4 is used to enable and disable PGE, but we don't care. */
-static unsigned long lguest_read_cr4(void)
-{
-	return 0;
-}
-
-static void lguest_write_cr4(unsigned long val)
-{
-}
-
-/*
- * Page Table Handling.
- *
- * Now would be a good time to take a rest and grab a coffee or similarly
- * relaxing stimulant.  The easy parts are behind us, and the trek gradually
- * winds uphill from here.
- *
- * Quick refresher: memory is divided into "pages" of 4096 bytes each.  The CPU
- * maps virtual addresses to physical addresses using "page tables".  We could
- * use one huge index of 1 million entries: each address is 4 bytes, so that's
- * 1024 pages just to hold the page tables.   But since most virtual addresses
- * are unused, we use a two level index which saves space.  The cr3 register
- * contains the physical address of the top level "page directory" page, which
- * contains physical addresses of up to 1024 second-level pages.  Each of these
- * second level pages contains up to 1024 physical addresses of actual pages,
- * or Page Table Entries (PTEs).
- *
- * Here's a diagram, where arrows indicate physical addresses:
- *
- * cr3 ---> +---------+
- *	    |  	   --------->+---------+
- *	    |	      |	     | PADDR1  |
- *	  Mid-level   |	     | PADDR2  |
- *	  (PMD) page  |	     | 	       |
- *	    |	      |	   Lower-level |
- *	    |	      |	   (PTE) page  |
- *	    |	      |	     |	       |
- *	      ....    	     	 ....
- *
- * So to convert a virtual address to a physical address, we look up the top
- * level, which points us to the second level, which gives us the physical
- * address of that page.  If the top level entry was not present, or the second
- * level entry was not present, then the virtual address is invalid (we
- * say "the page was not mapped").
- *
- * Put another way, a 32-bit virtual address is divided up like so:
- *
- *  1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
- * |<---- 10 bits ---->|<---- 10 bits ---->|<------ 12 bits ------>|
- *    Index into top     Index into second      Offset within page
- *  page directory page    pagetable page
- *
- * Now, unfortunately, this isn't the whole story: Intel added Physical Address
- * Extension (PAE) to allow 32 bit systems to use 64GB of memory (ie. 36 bits).
- * These are held in 64-bit page table entries, so we can now only fit 512
- * entries in a page, and the neat three-level tree breaks down.
- *
- * The result is a four level page table:
- *
- * cr3 --> [ 4 Upper  ]
- *	   [   Level  ]
- *	   [  Entries ]
- *	   [(PUD Page)]---> +---------+
- *	 		    |  	   --------->+---------+
- *	 		    |	      |	     | PADDR1  |
- *	 		  Mid-level   |	     | PADDR2  |
- *	 		  (PMD) page  |	     | 	       |
- *	 		    |	      |	   Lower-level |
- *	 		    |	      |	   (PTE) page  |
- *	 		    |	      |	     |	       |
- *	 		      ....    	     	 ....
- *
- *
- * And the virtual address is decoded as:
- *
- *         1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
- *      |<-2->|<--- 9 bits ---->|<---- 9 bits --->|<------ 12 bits ------>|
- * Index into    Index into mid    Index into lower    Offset within page
- * top entries   directory page     pagetable page
- *
- * It's too hard to switch between these two formats at runtime, so Linux only
- * supports one or the other depending on whether CONFIG_X86_PAE is set.  Many
- * distributions turn it on, and not just for people with silly amounts of
- * memory: the larger PTE entries allow room for the NX bit, which lets the
- * kernel disable execution of pages and increase security.
- *
- * This was a problem for lguest, which couldn't run on these distributions;
- * then Matias Zabaljauregui figured it all out and implemented it, and only a
- * handful of puppies were crushed in the process!
- *
- * Back to our point: the kernel spends a lot of time changing both the
- * top-level page directory and lower-level pagetable pages.  The Guest doesn't
- * know physical addresses, so while it maintains these page tables exactly
- * like normal, it also needs to keep the Host informed whenever it makes a
- * change: the Host will create the real page tables based on the Guests'.
- */
-
-/*
- * The Guest calls this after it has set a second-level entry (pte), ie. to map
- * a page into a process' address space.  We tell the Host the toplevel and
- * address this corresponds to.  The Guest uses one pagetable per process, so
- * we need to tell the Host which one we're changing (mm->pgd).
- */
-static void lguest_pte_update(struct mm_struct *mm, unsigned long addr,
-			       pte_t *ptep)
-{
-#ifdef CONFIG_X86_PAE
-	/* PAE needs to hand a 64 bit page table entry, so it uses two args. */
-	lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr,
-		    ptep->pte_low, ptep->pte_high);
-#else
-	lazy_hcall3(LHCALL_SET_PTE, __pa(mm->pgd), addr, ptep->pte_low);
-#endif
-}
-
-/* This is the "set and update" combo-meal-deal version. */
-static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr,
-			      pte_t *ptep, pte_t pteval)
-{
-	native_set_pte(ptep, pteval);
-	lguest_pte_update(mm, addr, ptep);
-}
-
-/*
- * The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd
- * to set a middle-level entry when PAE is activated.
- *
- * Again, we set the entry then tell the Host which page we changed,
- * and the index of the entry we changed.
- */
-#ifdef CONFIG_X86_PAE
-static void lguest_set_pud(pud_t *pudp, pud_t pudval)
-{
-	native_set_pud(pudp, pudval);
-
-	/* 32 bytes aligned pdpt address and the index. */
-	lazy_hcall2(LHCALL_SET_PGD, __pa(pudp) & 0xFFFFFFE0,
-		   (__pa(pudp) & 0x1F) / sizeof(pud_t));
-}
-
-static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
-{
-	native_set_pmd(pmdp, pmdval);
-	lazy_hcall2(LHCALL_SET_PMD, __pa(pmdp) & PAGE_MASK,
-		   (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t));
-}
-#else
-
-/* The Guest calls lguest_set_pmd to set a top-level entry when !PAE. */
-static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
-{
-	native_set_pmd(pmdp, pmdval);
-	lazy_hcall2(LHCALL_SET_PGD, __pa(pmdp) & PAGE_MASK,
-		   (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t));
-}
-#endif
-
-/*
- * There are a couple of legacy places where the kernel sets a PTE, but we
- * don't know the top level any more.  This is useless for us, since we don't
- * know which pagetable is changing or what address, so we just tell the Host
- * to forget all of them.  Fortunately, this is very rare.
- *
- * ... except in early boot when the kernel sets up the initial pagetables,
- * which makes booting astonishingly slow: 48 seconds!  So we don't even tell
- * the Host anything changed until we've done the first real page table switch,
- * which brings boot back to 4.3 seconds.
- */
-static void lguest_set_pte(pte_t *ptep, pte_t pteval)
-{
-	native_set_pte(ptep, pteval);
-	if (cr3_changed)
-		lazy_hcall1(LHCALL_FLUSH_TLB, 1);
-}
-
-#ifdef CONFIG_X86_PAE
-/*
- * With 64-bit PTE values, we need to be careful setting them: if we set 32
- * bits at a time, the hardware could see a weird half-set entry.  These
- * versions ensure we update all 64 bits at once.
- */
-static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte)
-{
-	native_set_pte_atomic(ptep, pte);
-	if (cr3_changed)
-		lazy_hcall1(LHCALL_FLUSH_TLB, 1);
-}
-
-static void lguest_pte_clear(struct mm_struct *mm, unsigned long addr,
-			     pte_t *ptep)
-{
-	native_pte_clear(mm, addr, ptep);
-	lguest_pte_update(mm, addr, ptep);
-}
-
-static void lguest_pmd_clear(pmd_t *pmdp)
-{
-	lguest_set_pmd(pmdp, __pmd(0));
-}
-#endif
-
-/*
- * Unfortunately for Lguest, the pv_mmu_ops for page tables were based on
- * native page table operations.  On native hardware you can set a new page
- * table entry whenever you want, but if you want to remove one you have to do
- * a TLB flush (a TLB is a little cache of page table entries kept by the CPU).
- *
- * So the lguest_set_pte_at() and lguest_set_pmd() functions above are only
- * called when a valid entry is written, not when it's removed (ie. marked not
- * present).  Instead, this is where we come when the Guest wants to remove a
- * page table entry: we tell the Host to set that entry to 0 (ie. the present
- * bit is zero).
- */
-static void lguest_flush_tlb_single(unsigned long addr)
-{
-	/* Simply set it to zero: if it was not, it will fault back in. */
-	lazy_hcall3(LHCALL_SET_PTE, current_cr3, addr, 0);
-}
-
-/*
- * This is what happens after the Guest has removed a large number of entries.
- * This tells the Host that any of the page table entries for userspace might
- * have changed, ie. virtual addresses below PAGE_OFFSET.
- */
-static void lguest_flush_tlb_user(void)
-{
-	lazy_hcall1(LHCALL_FLUSH_TLB, 0);
-}
-
-/*
- * This is called when the kernel page tables have changed.  That's not very
- * common (unless the Guest is using highmem, which makes the Guest extremely
- * slow), so it's worth separating this from the user flushing above.
- */
-static void lguest_flush_tlb_kernel(void)
-{
-	lazy_hcall1(LHCALL_FLUSH_TLB, 1);
-}
-
-/*
- * The Unadvanced Programmable Interrupt Controller.
- *
- * This is an attempt to implement the simplest possible interrupt controller.
- * I spent some time looking though routines like set_irq_chip_and_handler,
- * set_irq_chip_and_handler_name, set_irq_chip_data and set_phasers_to_stun and
- * I *think* this is as simple as it gets.
- *
- * We can tell the Host what interrupts we want blocked ready for using the
- * lguest_data.interrupts bitmap, so disabling (aka "masking") them is as
- * simple as setting a bit.  We don't actually "ack" interrupts as such, we
- * just mask and unmask them.  I wonder if we should be cleverer?
- */
-static void disable_lguest_irq(struct irq_data *data)
-{
-	set_bit(data->irq, lguest_data.blocked_interrupts);
-}
-
-static void enable_lguest_irq(struct irq_data *data)
-{
-	clear_bit(data->irq, lguest_data.blocked_interrupts);
-}
-
-/* This structure describes the lguest IRQ controller. */
-static struct irq_chip lguest_irq_controller = {
-	.name		= "lguest",
-	.irq_mask	= disable_lguest_irq,
-	.irq_mask_ack	= disable_lguest_irq,
-	.irq_unmask	= enable_lguest_irq,
-};
-
-/*
- * Interrupt descriptors are allocated as-needed, but low-numbered ones are
- * reserved by the generic x86 code.  So we ignore irq_alloc_desc_at if it
- * tells us the irq is already used: other errors (ie. ENOMEM) we take
- * seriously.
- */
-static int lguest_setup_irq(unsigned int irq)
-{
-	struct irq_desc *desc;
-	int err;
-
-	/* Returns -ve error or vector number. */
-	err = irq_alloc_desc_at(irq, 0);
-	if (err < 0 && err != -EEXIST)
-		return err;
-
-	/*
-	 * Tell the Linux infrastructure that the interrupt is
-	 * controlled by our level-based lguest interrupt controller.
-	 */
-	irq_set_chip_and_handler_name(irq, &lguest_irq_controller,
-				      handle_level_irq, "level");
-
-	/* Some systems map "vectors" to interrupts weirdly.  Not us! */
-	desc = irq_to_desc(irq);
-	__this_cpu_write(vector_irq[FIRST_EXTERNAL_VECTOR + irq], desc);
-	return 0;
-}
-
-static int lguest_enable_irq(struct pci_dev *dev)
-{
-	int err;
-	u8 line = 0;
-
-	/* We literally use the PCI interrupt line as the irq number. */
-	pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &line);
-	err = lguest_setup_irq(line);
-	if (!err)
-		dev->irq = line;
-	return err;
-}
-
-/* We don't do hotplug PCI, so this shouldn't be called. */
-static void lguest_disable_irq(struct pci_dev *dev)
-{
-	WARN_ON(1);
-}
-
-/*
- * This sets up the Interrupt Descriptor Table (IDT) entry for each hardware
- * interrupt (except 128, which is used for system calls).
- */
-static void __init lguest_init_IRQ(void)
-{
-	unsigned int i;
-
-	for (i = FIRST_EXTERNAL_VECTOR; i < FIRST_SYSTEM_VECTOR; i++) {
-		if (i != IA32_SYSCALL_VECTOR)
-			set_intr_gate(i, irq_entries_start +
-					8 * (i - FIRST_EXTERNAL_VECTOR));
-	}
-
-	/*
-	 * This call is required to set up for 4k stacks, where we have
-	 * separate stacks for hard and soft interrupts.
-	 */
-	irq_ctx_init(smp_processor_id());
-}
-
-/*
- * Time.
- *
- * It would be far better for everyone if the Guest had its own clock, but
- * until then the Host gives us the time on every interrupt.
- */
-static void lguest_get_wallclock(struct timespec *now)
-{
-	*now = lguest_data.time;
-}
-
-/*
- * The TSC is an Intel thing called the Time Stamp Counter.  The Host tells us
- * what speed it runs at, or 0 if it's unusable as a reliable clock source.
- * This matches what we want here: if we return 0 from this function, the x86
- * TSC clock will give up and not register itself.
- */
-static unsigned long lguest_tsc_khz(void)
-{
-	return lguest_data.tsc_khz;
-}
-
-/*
- * If we can't use the TSC, the kernel falls back to our lower-priority
- * "lguest_clock", where we read the time value given to us by the Host.
- */
-static u64 lguest_clock_read(struct clocksource *cs)
-{
-	unsigned long sec, nsec;
-
-	/*
-	 * Since the time is in two parts (seconds and nanoseconds), we risk
-	 * reading it just as it's changing from 99 & 0.999999999 to 100 and 0,
-	 * and getting 99 and 0.  As Linux tends to come apart under the stress
-	 * of time travel, we must be careful:
-	 */
-	do {
-		/* First we read the seconds part. */
-		sec = lguest_data.time.tv_sec;
-		/*
-		 * This read memory barrier tells the compiler and the CPU that
-		 * this can't be reordered: we have to complete the above
-		 * before going on.
-		 */
-		rmb();
-		/* Now we read the nanoseconds part. */
-		nsec = lguest_data.time.tv_nsec;
-		/* Make sure we've done that. */
-		rmb();
-		/* Now if the seconds part has changed, try again. */
-	} while (unlikely(lguest_data.time.tv_sec != sec));
-
-	/* Our lguest clock is in real nanoseconds. */
-	return sec*1000000000ULL + nsec;
-}
-
-/* This is the fallback clocksource: lower priority than the TSC clocksource. */
-static struct clocksource lguest_clock = {
-	.name		= "lguest",
-	.rating		= 200,
-	.read		= lguest_clock_read,
-	.mask		= CLOCKSOURCE_MASK(64),
-	.flags		= CLOCK_SOURCE_IS_CONTINUOUS,
-};
-
-/*
- * We also need a "struct clock_event_device": Linux asks us to set it to go
- * off some time in the future.  Actually, James Morris figured all this out, I
- * just applied the patch.
- */
-static int lguest_clockevent_set_next_event(unsigned long delta,
-                                           struct clock_event_device *evt)
-{
-	/* FIXME: I don't think this can ever happen, but James tells me he had
-	 * to put this code in.  Maybe we should remove it now.  Anyone? */
-	if (delta < LG_CLOCK_MIN_DELTA) {
-		if (printk_ratelimit())
-			printk(KERN_DEBUG "%s: small delta %lu ns\n",
-			       __func__, delta);
-		return -ETIME;
-	}
-
-	/* Please wake us this far in the future. */
-	hcall(LHCALL_SET_CLOCKEVENT, delta, 0, 0, 0);
-	return 0;
-}
-
-static int lguest_clockevent_shutdown(struct clock_event_device *evt)
-{
-	/* A 0 argument shuts the clock down. */
-	hcall(LHCALL_SET_CLOCKEVENT, 0, 0, 0, 0);
-	return 0;
-}
-
-/* This describes our primitive timer chip. */
-static struct clock_event_device lguest_clockevent = {
-	.name                   = "lguest",
-	.features               = CLOCK_EVT_FEAT_ONESHOT,
-	.set_next_event         = lguest_clockevent_set_next_event,
-	.set_state_shutdown	= lguest_clockevent_shutdown,
-	.rating                 = INT_MAX,
-	.mult                   = 1,
-	.shift                  = 0,
-	.min_delta_ns           = LG_CLOCK_MIN_DELTA,
-	.min_delta_ticks        = LG_CLOCK_MIN_DELTA,
-	.max_delta_ns           = LG_CLOCK_MAX_DELTA,
-	.max_delta_ticks        = LG_CLOCK_MAX_DELTA,
-};
-
-/*
- * This is the Guest timer interrupt handler (hardware interrupt 0).  We just
- * call the clockevent infrastructure and it does whatever needs doing.
- */
-static void lguest_time_irq(struct irq_desc *desc)
-{
-	unsigned long flags;
-
-	/* Don't interrupt us while this is running. */
-	local_irq_save(flags);
-	lguest_clockevent.event_handler(&lguest_clockevent);
-	local_irq_restore(flags);
-}
-
-/*
- * At some point in the boot process, we get asked to set up our timing
- * infrastructure.  The kernel doesn't expect timer interrupts before this, but
- * we cleverly initialized the "blocked_interrupts" field of "struct
- * lguest_data" so that timer interrupts were blocked until now.
- */
-static void lguest_time_init(void)
-{
-	/* Set up the timer interrupt (0) to go to our simple timer routine */
-	if (lguest_setup_irq(0) != 0)
-		panic("Could not set up timer irq");
-	irq_set_handler(0, lguest_time_irq);
-
-	clocksource_register_hz(&lguest_clock, NSEC_PER_SEC);
-
-	/* We can't set cpumask in the initializer: damn C limitations!  Set it
-	 * here and register our timer device. */
-	lguest_clockevent.cpumask = cpumask_of(0);
-	clockevents_register_device(&lguest_clockevent);
-
-	/* Finally, we unblock the timer interrupt. */
-	clear_bit(0, lguest_data.blocked_interrupts);
-}
-
-/*
- * Miscellaneous bits and pieces.
- *
- * Here is an oddball collection of functions which the Guest needs for things
- * to work.  They're pretty simple.
- */
-
-/*
- * The Guest needs to tell the Host what stack it expects traps to use.  For
- * native hardware, this is part of the Task State Segment mentioned above in
- * lguest_load_tr_desc(), but to help hypervisors there's this special call.
- *
- * We tell the Host the segment we want to use (__KERNEL_DS is the kernel data
- * segment), the privilege level (we're privilege level 1, the Host is 0 and
- * will not tolerate us trying to use that), the stack pointer, and the number
- * of pages in the stack.
- */
-static void lguest_load_sp0(struct tss_struct *tss,
-			    struct thread_struct *thread)
-{
-	lazy_hcall3(LHCALL_SET_STACK, __KERNEL_DS | 0x1, thread->sp0,
-		   THREAD_SIZE / PAGE_SIZE);
-	tss->x86_tss.sp0 = thread->sp0;
-}
-
-/* Let's just say, I wouldn't do debugging under a Guest. */
-static unsigned long lguest_get_debugreg(int regno)
-{
-	/* FIXME: Implement */
-	return 0;
-}
-
-static void lguest_set_debugreg(int regno, unsigned long value)
-{
-	/* FIXME: Implement */
-}
-
-/*
- * There are times when the kernel wants to make sure that no memory writes are
- * caught in the cache (that they've all reached real hardware devices).  This
- * doesn't matter for the Guest which has virtual hardware.
- *
- * On the Pentium 4 and above, cpuid() indicates that the Cache Line Flush
- * (clflush) instruction is available and the kernel uses that.  Otherwise, it
- * uses the older "Write Back and Invalidate Cache" (wbinvd) instruction.
- * Unlike clflush, wbinvd can only be run at privilege level 0.  So we can
- * ignore clflush, but replace wbinvd.
- */
-static void lguest_wbinvd(void)
-{
-}
-
-/*
- * If the Guest expects to have an Advanced Programmable Interrupt Controller,
- * we play dumb by ignoring writes and returning 0 for reads.  So it's no
- * longer Programmable nor Controlling anything, and I don't think 8 lines of
- * code qualifies for Advanced.  It will also never interrupt anything.  It
- * does, however, allow us to get through the Linux boot code.
- */
-#ifdef CONFIG_X86_LOCAL_APIC
-static void lguest_apic_write(u32 reg, u32 v)
-{
-}
-
-static u32 lguest_apic_read(u32 reg)
-{
-	return 0;
-}
-
-static u64 lguest_apic_icr_read(void)
-{
-	return 0;
-}
-
-static void lguest_apic_icr_write(u32 low, u32 id)
-{
-	/* Warn to see if there's any stray references */
-	WARN_ON(1);
-}
-
-static void lguest_apic_wait_icr_idle(void)
-{
-	return;
-}
-
-static u32 lguest_apic_safe_wait_icr_idle(void)
-{
-	return 0;
-}
-
-static void set_lguest_basic_apic_ops(void)
-{
-	apic->read = lguest_apic_read;
-	apic->write = lguest_apic_write;
-	apic->icr_read = lguest_apic_icr_read;
-	apic->icr_write = lguest_apic_icr_write;
-	apic->wait_icr_idle = lguest_apic_wait_icr_idle;
-	apic->safe_wait_icr_idle = lguest_apic_safe_wait_icr_idle;
-};
-#endif
-
-/* STOP!  Until an interrupt comes in. */
-static void lguest_safe_halt(void)
-{
-	hcall(LHCALL_HALT, 0, 0, 0, 0);
-}
-
-/*
- * The SHUTDOWN hypercall takes a string to describe what's happening, and
- * an argument which says whether this to restart (reboot) the Guest or not.
- *
- * Note that the Host always prefers that the Guest speak in physical addresses
- * rather than virtual addresses, so we use __pa() here.
- */
-static void lguest_power_off(void)
-{
-	hcall(LHCALL_SHUTDOWN, __pa("Power down"),
-	      LGUEST_SHUTDOWN_POWEROFF, 0, 0);
-}
-
-/*
- * Panicing.
- *
- * Don't.  But if you did, this is what happens.
- */
-static int lguest_panic(struct notifier_block *nb, unsigned long l, void *p)
-{
-	hcall(LHCALL_SHUTDOWN, __pa(p), LGUEST_SHUTDOWN_POWEROFF, 0, 0);
-	/* The hcall won't return, but to keep gcc happy, we're "done". */
-	return NOTIFY_DONE;
-}
-
-static struct notifier_block paniced = {
-	.notifier_call = lguest_panic
-};
-
-/* Setting up memory is fairly easy. */
-static __init char *lguest_memory_setup(void)
-{
-	/*
-	 * The Linux bootloader header contains an "e820" memory map: the
-	 * Launcher populated the first entry with our memory limit.
-	 */
-	e820__range_add(boot_params.e820_table[0].addr,
-			  boot_params.e820_table[0].size,
-			  boot_params.e820_table[0].type);
-
-	/* This string is for the boot messages. */
-	return "LGUEST";
-}
-
-/* Offset within PCI config space of BAR access capability. */
-static int console_cfg_offset = 0;
-static int console_access_cap;
-
-/* Set up so that we access off in bar0 (on bus 0, device 1, function 0) */
-static void set_cfg_window(u32 cfg_offset, u32 off)
-{
-	write_pci_config_byte(0, 1, 0,
-			      cfg_offset + offsetof(struct virtio_pci_cap, bar),
-			      0);
-	write_pci_config(0, 1, 0,
-			 cfg_offset + offsetof(struct virtio_pci_cap, length),
-			 4);
-	write_pci_config(0, 1, 0,
-			 cfg_offset + offsetof(struct virtio_pci_cap, offset),
-			 off);
-}
-
-static void write_bar_via_cfg(u32 cfg_offset, u32 off, u32 val)
-{
-	/*
-	 * We could set this up once, then leave it; nothing else in the *
-	 * kernel should touch these registers.  But if it went wrong, that
-	 * would be a horrible bug to find.
-	 */
-	set_cfg_window(cfg_offset, off);
-	write_pci_config(0, 1, 0,
-			 cfg_offset + sizeof(struct virtio_pci_cap), val);
-}
-
-static void probe_pci_console(void)
-{
-	u8 cap, common_cap = 0, device_cap = 0;
-	u32 device_len;
-
-	/* Avoid recursive printk into here. */
-	console_cfg_offset = -1;
-
-	if (!early_pci_allowed()) {
-		printk(KERN_ERR "lguest: early PCI access not allowed!\n");
-		return;
-	}
-
-	/* We expect a console PCI device at BUS0, slot 1. */
-	if (read_pci_config(0, 1, 0, 0) != 0x10431AF4) {
-		printk(KERN_ERR "lguest: PCI device is %#x!\n",
-		       read_pci_config(0, 1, 0, 0));
-		return;
-	}
-
-	/* Find the capabilities we need (must be in bar0) */
-	cap = read_pci_config_byte(0, 1, 0, PCI_CAPABILITY_LIST);
-	while (cap) {
-		u8 vndr = read_pci_config_byte(0, 1, 0, cap);
-		if (vndr == PCI_CAP_ID_VNDR) {
-			u8 type, bar;
-
-			type = read_pci_config_byte(0, 1, 0,
-			    cap + offsetof(struct virtio_pci_cap, cfg_type));
-			bar = read_pci_config_byte(0, 1, 0,
-			    cap + offsetof(struct virtio_pci_cap, bar));
-
-			switch (type) {
-			case VIRTIO_PCI_CAP_DEVICE_CFG:
-				if (bar == 0)
-					device_cap = cap;
-				break;
-			case VIRTIO_PCI_CAP_PCI_CFG:
-				console_access_cap = cap;
-				break;
-			}
-		}
-		cap = read_pci_config_byte(0, 1, 0, cap + PCI_CAP_LIST_NEXT);
-	}
-	if (!device_cap || !console_access_cap) {
-		printk(KERN_ERR "lguest: No caps (%u/%u/%u) in console!\n",
-		       common_cap, device_cap, console_access_cap);
-		return;
-	}
-
-	/*
-	 * Note that we can't check features, until we've set the DRIVER
-	 * status bit.  We don't want to do that until we have a real driver,
-	 * so we just check that the device-specific config has room for
-	 * emerg_wr.  If it doesn't support VIRTIO_CONSOLE_F_EMERG_WRITE
-	 * it should ignore the access.
-	 */
-	device_len = read_pci_config(0, 1, 0,
-			device_cap + offsetof(struct virtio_pci_cap, length));
-	if (device_len < (offsetof(struct virtio_console_config, emerg_wr)
-			  + sizeof(u32))) {
-		printk(KERN_ERR "lguest: console missing emerg_wr field\n");
-		return;
-	}
-
-	console_cfg_offset = read_pci_config(0, 1, 0,
-			device_cap + offsetof(struct virtio_pci_cap, offset));
-	printk(KERN_INFO "lguest: Console via virtio-pci emerg_wr\n");
-}
-
-/*
- * We will eventually use the virtio console device to produce console output,
- * but before that is set up we use the virtio PCI console's backdoor mmio
- * access and the "emergency" write facility (which is legal even before the
- * device is configured).
- */
-static __init int early_put_chars(u32 vtermno, const char *buf, int count)
-{
-	/* If we couldn't find PCI console, forget it. */
-	if (console_cfg_offset < 0)
-		return count;
-
-	if (unlikely(!console_cfg_offset)) {
-		probe_pci_console();
-		if (console_cfg_offset < 0)
-			return count;
-	}
-
-	write_bar_via_cfg(console_access_cap,
-			  console_cfg_offset
-			  + offsetof(struct virtio_console_config, emerg_wr),
-			  buf[0]);
-	return 1;
-}
-
-/*
- * Rebooting also tells the Host we're finished, but the RESTART flag tells the
- * Launcher to reboot us.
- */
-static void lguest_restart(char *reason)
-{
-	hcall(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART, 0, 0);
-}
-
-/*G:050
- * Patching (Powerfully Placating Performance Pedants)
- *
- * We have already seen that pv_ops structures let us replace simple native
- * instructions with calls to the appropriate back end all throughout the
- * kernel.  This allows the same kernel to run as a Guest and as a native
- * kernel, but it's slow because of all the indirect branches.
- *
- * Remember that David Wheeler quote about "Any problem in computer science can
- * be solved with another layer of indirection"?  The rest of that quote is
- * "... But that usually will create another problem."  This is the first of
- * those problems.
- *
- * Our current solution is to allow the paravirt back end to optionally patch
- * over the indirect calls to replace them with something more efficient.  We
- * patch two of the simplest of the most commonly called functions: disable
- * interrupts and save interrupts.  We usually have 6 or 10 bytes to patch
- * into: the Guest versions of these operations are small enough that we can
- * fit comfortably.
- *
- * First we need assembly templates of each of the patchable Guest operations,
- * and these are in head_32.S.
- */
-
-/*G:060 We construct a table from the assembler templates: */
-static const struct lguest_insns
-{
-	const char *start, *end;
-} lguest_insns[] = {
-	[PARAVIRT_PATCH(pv_irq_ops.irq_disable)] = { lgstart_cli, lgend_cli },
-	[PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf },
-};
-
-/*
- * Now our patch routine is fairly simple (based on the native one in
- * paravirt.c).  If we have a replacement, we copy it in and return how much of
- * the available space we used.
- */
-static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf,
-			     unsigned long addr, unsigned len)
-{
-	unsigned int insn_len;
-
-	/* Don't do anything special if we don't have a replacement */
-	if (type >= ARRAY_SIZE(lguest_insns) || !lguest_insns[type].start)
-		return paravirt_patch_default(type, clobber, ibuf, addr, len);
-
-	insn_len = lguest_insns[type].end - lguest_insns[type].start;
-
-	/* Similarly if it can't fit (doesn't happen, but let's be thorough). */
-	if (len < insn_len)
-		return paravirt_patch_default(type, clobber, ibuf, addr, len);
-
-	/* Copy in our instructions. */
-	memcpy(ibuf, lguest_insns[type].start, insn_len);
-	return insn_len;
-}
-
-/*G:029
- * Once we get to lguest_init(), we know we're a Guest.  The various
- * pv_ops structures in the kernel provide points for (almost) every routine we
- * have to override to avoid privileged instructions.
- */
-__init void lguest_init(void)
-{
-	/* We're under lguest. */
-	pv_info.name = "lguest";
-	/* We're running at privilege level 1, not 0 as normal. */
-	pv_info.kernel_rpl = 1;
-	/* Everyone except Xen runs with this set. */
-	pv_info.shared_kernel_pmd = 1;
-
-	/*
-	 * We set up all the lguest overrides for sensitive operations.  These
-	 * are detailed with the operations themselves.
-	 */
-
-	/* Interrupt-related operations */
-	pv_irq_ops.save_fl = PV_CALLEE_SAVE(lguest_save_fl);
-	pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl);
-	pv_irq_ops.irq_disable = PV_CALLEE_SAVE(lguest_irq_disable);
-	pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(lg_irq_enable);
-	pv_irq_ops.safe_halt = lguest_safe_halt;
-
-	/* Setup operations */
-	pv_init_ops.patch = lguest_patch;
-
-	/* Intercepts of various CPU instructions */
-	pv_cpu_ops.load_gdt = lguest_load_gdt;
-	pv_cpu_ops.cpuid = lguest_cpuid;
-	pv_cpu_ops.load_idt = lguest_load_idt;
-	pv_cpu_ops.iret = lguest_iret;
-	pv_cpu_ops.load_sp0 = lguest_load_sp0;
-	pv_cpu_ops.load_tr_desc = lguest_load_tr_desc;
-	pv_cpu_ops.set_ldt = lguest_set_ldt;
-	pv_cpu_ops.load_tls = lguest_load_tls;
-	pv_cpu_ops.get_debugreg = lguest_get_debugreg;
-	pv_cpu_ops.set_debugreg = lguest_set_debugreg;
-	pv_cpu_ops.read_cr0 = lguest_read_cr0;
-	pv_cpu_ops.write_cr0 = lguest_write_cr0;
-	pv_cpu_ops.read_cr4 = lguest_read_cr4;
-	pv_cpu_ops.write_cr4 = lguest_write_cr4;
-	pv_cpu_ops.write_gdt_entry = lguest_write_gdt_entry;
-	pv_cpu_ops.write_idt_entry = lguest_write_idt_entry;
-	pv_cpu_ops.wbinvd = lguest_wbinvd;
-	pv_cpu_ops.start_context_switch = paravirt_start_context_switch;
-	pv_cpu_ops.end_context_switch = lguest_end_context_switch;
-
-	/* Pagetable management */
-	pv_mmu_ops.write_cr3 = lguest_write_cr3;
-	pv_mmu_ops.flush_tlb_user = lguest_flush_tlb_user;
-	pv_mmu_ops.flush_tlb_single = lguest_flush_tlb_single;
-	pv_mmu_ops.flush_tlb_kernel = lguest_flush_tlb_kernel;
-	pv_mmu_ops.set_pte = lguest_set_pte;
-	pv_mmu_ops.set_pte_at = lguest_set_pte_at;
-	pv_mmu_ops.set_pmd = lguest_set_pmd;
-#ifdef CONFIG_X86_PAE
-	pv_mmu_ops.set_pte_atomic = lguest_set_pte_atomic;
-	pv_mmu_ops.pte_clear = lguest_pte_clear;
-	pv_mmu_ops.pmd_clear = lguest_pmd_clear;
-	pv_mmu_ops.set_pud = lguest_set_pud;
-#endif
-	pv_mmu_ops.read_cr2 = lguest_read_cr2;
-	pv_mmu_ops.read_cr3 = lguest_read_cr3;
-	pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu;
-	pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mmu_mode;
-	pv_mmu_ops.lazy_mode.flush = paravirt_flush_lazy_mmu;
-	pv_mmu_ops.pte_update = lguest_pte_update;
-
-#ifdef CONFIG_X86_LOCAL_APIC
-	/* APIC read/write intercepts */
-	set_lguest_basic_apic_ops();
-#endif
-
-	x86_init.resources.memory_setup = lguest_memory_setup;
-	x86_init.irqs.intr_init = lguest_init_IRQ;
-	x86_init.timers.timer_init = lguest_time_init;
-	x86_platform.calibrate_tsc = lguest_tsc_khz;
-	x86_platform.get_wallclock =  lguest_get_wallclock;
-
-	/*
-	 * Now is a good time to look at the implementations of these functions
-	 * before returning to the rest of lguest_init().
-	 */
-
-	/*G:070
-	 * Now we've seen all the paravirt_ops, we return to
-	 * lguest_init() where the rest of the fairly chaotic boot setup
-	 * occurs.
-	 */
-
-	/*
-	 * The stack protector is a weird thing where gcc places a canary
-	 * value on the stack and then checks it on return.  This file is
-	 * compiled with -fno-stack-protector it, so we got this far without
-	 * problems.  The value of the canary is kept at offset 20 from the
-	 * %gs register, so we need to set that up before calling C functions
-	 * in other files.
-	 */
-	setup_stack_canary_segment(0);
-
-	/*
-	 * We could just call load_stack_canary_segment(), but we might as well
-	 * call switch_to_new_gdt() which loads the whole table and sets up the
-	 * per-cpu segment descriptor register %fs as well.
-	 */
-	switch_to_new_gdt(0);
-
-	/*
-	 * The Host<->Guest Switcher lives at the top of our address space, and
-	 * the Host told us how big it is when we made LGUEST_INIT hypercall:
-	 * it put the answer in lguest_data.reserve_mem
-	 */
-	reserve_top_address(lguest_data.reserve_mem);
-
-	/* Hook in our special panic hypercall code. */
-	atomic_notifier_chain_register(&panic_notifier_list, &paniced);
-
-	/*
-	 * This is messy CPU setup stuff which the native boot code does before
-	 * start_kernel, so we have to do, too:
-	 */
-	cpu_detect(&new_cpu_data);
-	/* head.S usually sets up the first capability word, so do it here. */
-	new_cpu_data.x86_capability[CPUID_1_EDX] = cpuid_edx(1);
-
-	/* Math is always hard! */
-	set_cpu_cap(&new_cpu_data, X86_FEATURE_FPU);
-
-	/* We don't have features.  We have puppies!  Puppies! */
-#ifdef CONFIG_X86_MCE
-	mca_cfg.disabled = true;
-#endif
-#ifdef CONFIG_ACPI
-	acpi_disabled = 1;
-#endif
-
-	/*
-	 * We set the preferred console to "hvc".  This is the "hypervisor
-	 * virtual console" driver written by the PowerPC people, which we also
-	 * adapted for lguest's use.
-	 */
-	add_preferred_console("hvc", 0, NULL);
-
-	/* Register our very early console. */
-	virtio_cons_early_init(early_put_chars);
-
-	/* Don't let ACPI try to control our PCI interrupts. */
-	disable_acpi();
-
-	/* We control them ourselves, by overriding these two hooks. */
-	pcibios_enable_irq = lguest_enable_irq;
-	pcibios_disable_irq = lguest_disable_irq;
-
-	/*
-	 * Last of all, we set the power management poweroff hook to point to
-	 * the Guest routine to power off, and the reboot hook to our restart
-	 * routine.
-	 */
-	pm_power_off = lguest_power_off;
-	machine_ops.restart = lguest_restart;
-
-	/*
-	 * Now we're set up, call i386_start_kernel() in head32.c and we proceed
-	 * to boot as normal.  It never returns.
-	 */
-	i386_start_kernel();
-}
-/*
- * This marks the end of stage II of our journey, The Guest.
- *
- * It is now time for us to explore the layer of virtual drivers and complete
- * our understanding of the Guest in "make Drivers".
- */
diff --git a/arch/x86/lguest/head_32.S b/arch/x86/lguest/head_32.S
deleted file mode 100644
index d5ae63f5ec5d..000000000000
--- a/arch/x86/lguest/head_32.S
+++ /dev/null
@@ -1,192 +0,0 @@
-#include <linux/linkage.h>
-#include <linux/lguest.h>
-#include <asm/lguest_hcall.h>
-#include <asm/asm-offsets.h>
-#include <asm/thread_info.h>
-#include <asm/processor-flags.h>
-
-/*G:020
-
- * Our story starts with the bzImage: booting starts at startup_32 in
- * arch/x86/boot/compressed/head_32.S.  This merely uncompresses the real
- * kernel in place and then jumps into it: startup_32 in
- * arch/x86/kernel/head_32.S.  Both routines expects a boot header in the %esi
- * register, which is created by the bootloader (the Launcher in our case).
- *
- * The startup_32 function does very little: it clears the uninitialized global
- * C variables which we expect to be zero (ie. BSS) and then copies the boot
- * header and kernel command line somewhere safe, and populates some initial
- * page tables.  Finally it checks the 'hardware_subarch' field.  This was
- * introduced in 2.6.24 for lguest and Xen: if it's set to '1' (lguest's
- * assigned number), then it calls us here.
- *
- * WARNING: be very careful here!  We're running at addresses equal to physical
- * addresses (around 0), not above PAGE_OFFSET as most code expects
- * (eg. 0xC0000000).  Jumps are relative, so they're OK, but we can't touch any
- * data without remembering to subtract __PAGE_OFFSET!
- *
- * The .section line puts this code in .init.text so it will be discarded after
- * boot.
- */
-.section .init.text, "ax", @progbits
-ENTRY(lguest_entry)
-	/*
-	 * We make the "initialization" hypercall now to tell the Host where
-	 * our lguest_data struct is.
-	 */
-	movl $LHCALL_LGUEST_INIT, %eax
-	movl $lguest_data - __PAGE_OFFSET, %ebx
-	int $LGUEST_TRAP_ENTRY
-
-	/* Now turn our pagetables on; setup by arch/x86/kernel/head_32.S. */
-	movl $LHCALL_NEW_PGTABLE, %eax
-	movl $(initial_page_table - __PAGE_OFFSET), %ebx
-	int $LGUEST_TRAP_ENTRY
-
-	/* Set up the initial stack so we can run C code. */
-	movl $(init_thread_union+THREAD_SIZE),%esp
-
-	/* Jumps are relative: we're running __PAGE_OFFSET too low. */
-	jmp lguest_init+__PAGE_OFFSET
-
-/*G:055
- * We create a macro which puts the assembler code between lgstart_ and lgend_
- * markers.  These templates are put in the .text section: they can't be
- * discarded after boot as we may need to patch modules, too.
- */
-.text
-#define LGUEST_PATCH(name, insns...)			\
-	lgstart_##name:	insns; lgend_##name:;		\
-	.globl lgstart_##name; .globl lgend_##name
-
-LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled)
-LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax)
-
-/*G:033
- * But using those wrappers is inefficient (we'll see why that doesn't matter
- * for save_fl and irq_disable later).  If we write our routines carefully in
- * assembler, we can avoid clobbering any registers and avoid jumping through
- * the wrapper functions.
- *
- * I skipped over our first piece of assembler, but this one is worth studying
- * in a bit more detail so I'll describe in easy stages.  First, the routine to
- * enable interrupts:
- */
-ENTRY(lg_irq_enable)
-	/*
-	 * The reverse of irq_disable, this sets lguest_data.irq_enabled to
-	 * X86_EFLAGS_IF (ie. "Interrupts enabled").
-	 */
-	movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled
-	/*
-	 * But now we need to check if the Host wants to know: there might have
-	 * been interrupts waiting to be delivered, in which case it will have
-	 * set lguest_data.irq_pending to X86_EFLAGS_IF.  If it's not zero, we
-	 * jump to send_interrupts, otherwise we're done.
-	 */
-	cmpl $0, lguest_data+LGUEST_DATA_irq_pending
-	jnz send_interrupts
-	/*
-	 * One cool thing about x86 is that you can do many things without using
-	 * a register.  In this case, the normal path hasn't needed to save or
-	 * restore any registers at all!
-	 */
-	ret
-send_interrupts:
-	/*
-	 * OK, now we need a register: eax is used for the hypercall number,
-	 * which is LHCALL_SEND_INTERRUPTS.
-	 *
-	 * We used not to bother with this pending detection at all, which was
-	 * much simpler.  Sooner or later the Host would realize it had to
-	 * send us an interrupt.  But that turns out to make performance 7
-	 * times worse on a simple tcp benchmark.  So now we do this the hard
-	 * way.
-	 */
-	pushl %eax
-	movl $LHCALL_SEND_INTERRUPTS, %eax
-	/* This is the actual hypercall trap. */
-	int  $LGUEST_TRAP_ENTRY
-	/* Put eax back the way we found it. */
-	popl %eax
-	ret
-
-/*
- * Finally, the "popf" or "restore flags" routine.  The %eax register holds the
- * flags (in practice, either X86_EFLAGS_IF or 0): if it's X86_EFLAGS_IF we're
- * enabling interrupts again, if it's 0 we're leaving them off.
- */
-ENTRY(lg_restore_fl)
-	/* This is just "lguest_data.irq_enabled = flags;" */
-	movl %eax, lguest_data+LGUEST_DATA_irq_enabled
-	/*
-	 * Now, if the %eax value has enabled interrupts and
-	 * lguest_data.irq_pending is set, we want to tell the Host so it can
-	 * deliver any outstanding interrupts.  Fortunately, both values will
-	 * be X86_EFLAGS_IF (ie. 512) in that case, and the "testl"
-	 * instruction will AND them together for us.  If both are set, we
-	 * jump to send_interrupts.
-	 */
-	testl lguest_data+LGUEST_DATA_irq_pending, %eax
-	jnz send_interrupts
-	/* Again, the normal path has used no extra registers.  Clever, huh? */
-	ret
-/*:*/
-
-/* These demark the EIP where host should never deliver interrupts. */
-.global lguest_noirq_iret
-
-/*M:004
- * When the Host reflects a trap or injects an interrupt into the Guest, it
- * sets the eflags interrupt bit on the stack based on lguest_data.irq_enabled,
- * so the Guest iret logic does the right thing when restoring it.  However,
- * when the Host sets the Guest up for direct traps, such as system calls, the
- * processor is the one to push eflags onto the stack, and the interrupt bit
- * will be 1 (in reality, interrupts are always enabled in the Guest).
- *
- * This turns out to be harmless: the only trap which should happen under Linux
- * with interrupts disabled is Page Fault (due to our lazy mapping of vmalloc
- * regions), which has to be reflected through the Host anyway.  If another
- * trap *does* go off when interrupts are disabled, the Guest will panic, and
- * we'll never get to this iret!
-:*/
-
-/*G:045
- * There is one final paravirt_op that the Guest implements, and glancing at it
- * you can see why I left it to last.  It's *cool*!  It's in *assembler*!
- *
- * The "iret" instruction is used to return from an interrupt or trap.  The
- * stack looks like this:
- *   old address
- *   old code segment & privilege level
- *   old processor flags ("eflags")
- *
- * The "iret" instruction pops those values off the stack and restores them all
- * at once.  The only problem is that eflags includes the Interrupt Flag which
- * the Guest can't change: the CPU will simply ignore it when we do an "iret".
- * So we have to copy eflags from the stack to lguest_data.irq_enabled before
- * we do the "iret".
- *
- * There are two problems with this: firstly, we can't clobber any registers
- * and secondly, the whole thing needs to be atomic.  The first problem
- * is solved by using "push memory"/"pop memory" instruction pair for copying.
- *
- * The second is harder: copying eflags to lguest_data.irq_enabled will turn
- * interrupts on before we're finished, so we could be interrupted before we
- * return to userspace or wherever.  Our solution to this is to tell the
- * Host that it is *never* to interrupt us there, even if interrupts seem to be
- * enabled. (It's not necessary to protect pop instruction, since
- * data gets updated only after it completes, so we only need to protect
- * one instruction, iret).
- */
-ENTRY(lguest_iret)
-	pushl	2*4(%esp)
-	/*
-	 * Note the %ss: segment prefix here.  Normal data accesses use the
-	 * "ds" segment, but that will have already been restored for whatever
-	 * we're returning to (such as userspace): we can't trust it.  The %ss:
-	 * prefix makes sure we use the stack segment, which is still valid.
-	 */
-	popl	%ss:lguest_data+LGUEST_DATA_irq_enabled
-lguest_noirq_iret:
-	iret
diff --git a/drivers/Makefile b/drivers/Makefile
index dfdcda00bfe3..d90fdc413648 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -125,7 +125,6 @@ obj-$(CONFIG_ACCESSIBILITY)	+= accessibility/
 obj-$(CONFIG_ISDN)		+= isdn/
 obj-$(CONFIG_EDAC)		+= edac/
 obj-$(CONFIG_EISA)		+= eisa/
-obj-y				+= lguest/
 obj-$(CONFIG_CPU_FREQ)		+= cpufreq/
 obj-$(CONFIG_CPU_IDLE)		+= cpuidle/
 obj-y				+= mmc/
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 8ddc98279c8f..80aaf3420e12 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -470,7 +470,7 @@ config VIRTIO_BLK
 	depends on VIRTIO
 	---help---
 	  This is the virtual block driver for virtio.  It can be used with
-          lguest or QEMU based VMMs (like KVM or Xen).  Say Y or M.
+	  QEMU-based VMMs (like KVM or Xen).  Say Y or M.
 
 config VIRTIO_BLK_SCSI
 	bool "SCSI passthrough request for the Virtio block driver"
diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig
index ccd239ab879f..623714344600 100644
--- a/drivers/char/Kconfig
+++ b/drivers/char/Kconfig
@@ -161,7 +161,7 @@ config VIRTIO_CONSOLE
 	depends on VIRTIO && TTY
 	select HVC_DRIVER
 	help
-	  Virtio console for use with lguest and other hypervisors.
+	  Virtio console for use with hypervisors.
 
 	  Also serves as a general-purpose serial device for data
 	  transfer between the guest and host.  Character devices at
diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c
index ad843eb02ae7..4d229dde6522 100644
--- a/drivers/char/virtio_console.c
+++ b/drivers/char/virtio_console.c
@@ -1130,7 +1130,7 @@ static const struct file_operations port_fops = {
  * We turn the characters into a scatter-gather list, add it to the
  * output queue and then kick the Host.  Then we sit here waiting for
  * it to finish: inefficient in theory, but in practice
- * implementations will do it immediately (lguest's Launcher does).
+ * implementations will do it immediately.
  */
 static int put_chars(u32 vtermno, const char *buf, int count)
 {
diff --git a/drivers/lguest/Kconfig b/drivers/lguest/Kconfig
deleted file mode 100644
index 169172d2ba05..000000000000
--- a/drivers/lguest/Kconfig
+++ /dev/null
@@ -1,13 +0,0 @@
-config LGUEST
-	tristate "Linux hypervisor example code"
-	depends on X86_32 && EVENTFD && TTY && PCI_DIRECT
-	select HVC_DRIVER
-	---help---
-	  This is a very simple module which allows you to run
-	  multiple instances of the same Linux kernel, using the
-	  "lguest" command found in the tools/lguest directory.
-
-	  Note that "lguest" is pronounced to rhyme with "fell quest",
-	  not "rustyvisor". See tools/lguest/lguest.txt.
-
-	  If unsure, say N.  If curious, say M.  If masochistic, say Y.
diff --git a/drivers/lguest/Makefile b/drivers/lguest/Makefile
deleted file mode 100644
index 16f52ee73994..000000000000
--- a/drivers/lguest/Makefile
+++ /dev/null
@@ -1,26 +0,0 @@
-# Host requires the other files, which can be a module.
-obj-$(CONFIG_LGUEST)	+= lg.o
-lg-y = core.o hypercalls.o page_tables.o interrupts_and_traps.o \
-	segments.o lguest_user.o
-
-lg-$(CONFIG_X86_32) += x86/switcher_32.o x86/core.o
-
-Preparation Preparation!: PREFIX=P
-Guest: PREFIX=G
-Drivers: PREFIX=D
-Launcher: PREFIX=L
-Host: PREFIX=H
-Switcher: PREFIX=S
-Mastery: PREFIX=M
-Beer:
-	@for f in Preparation Guest Drivers Launcher Host Switcher Mastery; do echo "{==- $$f -==}"; make -s $$f; done; echo "{==-==}"
-Preparation Preparation! Guest Drivers Launcher Host Switcher Mastery:
-	@sh ../../tools/lguest/extract $(PREFIX) `find ../../* -name '*.[chS]' -wholename '*lguest*'`
-Puppy:
-	@clear
-	@printf "      __  \n (___()'\`;\n /,    /\`\n \\\\\\\"--\\\\\\   \n"
-	@sleep 2; clear; printf "\n\n   Sit!\n\n"; sleep 1; clear
-	@printf "    __    \n   ()'\`;  \n   /\\|\` \n  /  |  \n(/_)_|_   \n"
-	@sleep 2; clear; printf "\n\n  Stand!\n\n"; sleep 1; clear
-	@printf "    __    \n   ()'\`;  \n   /\\|\` \n  /._.= \n /| /     \n(_\_)_    \n"
-	@sleep 2; clear; printf "\n\n  Good puppy!\n\n"; sleep 1; clear
diff --git a/drivers/lguest/README b/drivers/lguest/README
deleted file mode 100644
index b7db39a64c66..000000000000
--- a/drivers/lguest/README
+++ /dev/null
@@ -1,47 +0,0 @@
-Welcome, friend reader, to lguest.
-
-Lguest is an adventure, with you, the reader, as Hero.  I can't think of many
-5000-line projects which offer both such capability and glimpses of future
-potential; it is an exciting time to be delving into the source!
-
-But be warned; this is an arduous journey of several hours or more!  And as we
-know, all true Heroes are driven by a Noble Goal.  Thus I offer a Beer (or
-equivalent) to anyone I meet who has completed this documentation.
-
-So get comfortable and keep your wits about you (both quick and humorous).
-Along your way to the Noble Goal, you will also gain masterly insight into
-lguest, and hypervisors and x86 virtualization in general.
-
-Our Quest is in seven parts: (best read with C highlighting turned on)
-
-I) Preparation
-	- In which our potential hero is flown quickly over the landscape for a
-	  taste of its scope.  Suitable for the armchair coders and other such
-	  persons of faint constitution.
-
-II) Guest
-	- Where we encounter the first tantalising wisps of code, and come to
-	  understand the details of the life of a Guest kernel.
-
-III) Drivers
-	- Whereby the Guest finds its voice and become useful, and our
-	  understanding of the Guest is completed.
-
-IV) Launcher
-	- Where we trace back to the creation of the Guest, and thus begin our
-	  understanding of the Host.
-
-V) Host
-	- Where we master the Host code, through a long and tortuous journey.
-	  Indeed, it is here that our hero is tested in the Bit of Despair.
-
-VI) Switcher
-	- Where our understanding of the intertwined nature of Guests and Hosts
-	  is completed.
-
-VII) Mastery
-	- Where our fully fledged hero grapples with the Great Question:
-	  "What next?"
-
-make Preparation!
-Rusty Russell.
diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
deleted file mode 100644
index 395ed1961dbf..000000000000
--- a/drivers/lguest/core.c
+++ /dev/null
@@ -1,398 +0,0 @@
-/*P:400
- * This contains run_guest() which actually calls into the Host<->Guest
- * Switcher and analyzes the return, such as determining if the Guest wants the
- * Host to do something.  This file also contains useful helper routines.
-:*/
-#include <linux/module.h>
-#include <linux/stringify.h>
-#include <linux/stddef.h>
-#include <linux/io.h>
-#include <linux/mm.h>
-#include <linux/sched/signal.h>
-#include <linux/vmalloc.h>
-#include <linux/cpu.h>
-#include <linux/freezer.h>
-#include <linux/highmem.h>
-#include <linux/slab.h>
-#include <asm/paravirt.h>
-#include <asm/pgtable.h>
-#include <linux/uaccess.h>
-#include <asm/poll.h>
-#include <asm/asm-offsets.h>
-#include "lg.h"
-
-unsigned long switcher_addr;
-struct page **lg_switcher_pages;
-static struct vm_struct *switcher_text_vma;
-static struct vm_struct *switcher_stacks_vma;
-
-/* This One Big lock protects all inter-guest data structures. */
-DEFINE_MUTEX(lguest_lock);
-
-/*H:010
- * We need to set up the Switcher at a high virtual address.  Remember the
- * Switcher is a few hundred bytes of assembler code which actually changes the
- * CPU to run the Guest, and then changes back to the Host when a trap or
- * interrupt happens.
- *
- * The Switcher code must be at the same virtual address in the Guest as the
- * Host since it will be running as the switchover occurs.
- *
- * Trying to map memory at a particular address is an unusual thing to do, so
- * it's not a simple one-liner.
- */
-static __init int map_switcher(void)
-{
-	int i, err;
-
-	/*
-	 * Map the Switcher in to high memory.
-	 *
-	 * It turns out that if we choose the address 0xFFC00000 (4MB under the
-	 * top virtual address), it makes setting up the page tables really
-	 * easy.
-	 */
-
-	/* We assume Switcher text fits into a single page. */
-	if (end_switcher_text - start_switcher_text > PAGE_SIZE) {
-		printk(KERN_ERR "lguest: switcher text too large (%zu)\n",
-		       end_switcher_text - start_switcher_text);
-		return -EINVAL;
-	}
-
-	/*
-	 * We allocate an array of struct page pointers.  map_vm_area() wants
-	 * this, rather than just an array of pages.
-	 */
-	lg_switcher_pages = kmalloc(sizeof(lg_switcher_pages[0])
-				    * TOTAL_SWITCHER_PAGES,
-				    GFP_KERNEL);
-	if (!lg_switcher_pages) {
-		err = -ENOMEM;
-		goto out;
-	}
-
-	/*
-	 * Now we actually allocate the pages.  The Guest will see these pages,
-	 * so we make sure they're zeroed.
-	 */
-	for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) {
-		lg_switcher_pages[i] = alloc_page(GFP_KERNEL|__GFP_ZERO);
-		if (!lg_switcher_pages[i]) {
-			err = -ENOMEM;
-			goto free_some_pages;
-		}
-	}
-
-	/*
-	 * Copy in the compiled-in Switcher code (from x86/switcher_32.S).
-	 * It goes in the first page, which we map in momentarily.
-	 */
-	memcpy(kmap(lg_switcher_pages[0]), start_switcher_text,
-	       end_switcher_text - start_switcher_text);
-	kunmap(lg_switcher_pages[0]);
-
-	/*
-	 * We place the Switcher underneath the fixmap area, which is the
-	 * highest virtual address we can get.  This is important, since we
-	 * tell the Guest it can't access this memory, so we want its ceiling
-	 * as high as possible.
-	 */
-	switcher_addr = FIXADDR_START - TOTAL_SWITCHER_PAGES*PAGE_SIZE;
-
-	/*
-	 * Now we reserve the "virtual memory area"s we want.  We might
-	 * not get them in theory, but in practice it's worked so far.
-	 *
-	 * We want the switcher text to be read-only and executable, and
-	 * the stacks to be read-write and non-executable.
-	 */
-	switcher_text_vma = __get_vm_area(PAGE_SIZE, VM_ALLOC|VM_NO_GUARD,
-					  switcher_addr,
-					  switcher_addr + PAGE_SIZE);
-
-	if (!switcher_text_vma) {
-		err = -ENOMEM;
-		printk("lguest: could not map switcher pages high\n");
-		goto free_pages;
-	}
-
-	switcher_stacks_vma = __get_vm_area(SWITCHER_STACK_PAGES * PAGE_SIZE,
-					    VM_ALLOC|VM_NO_GUARD,
-					    switcher_addr + PAGE_SIZE,
-					    switcher_addr + TOTAL_SWITCHER_PAGES * PAGE_SIZE);
-	if (!switcher_stacks_vma) {
-		err = -ENOMEM;
-		printk("lguest: could not map switcher pages high\n");
-		goto free_text_vma;
-	}
-
-	/*
-	 * This code actually sets up the pages we've allocated to appear at
-	 * switcher_addr.  map_vm_area() takes the vma we allocated above, the
-	 * kind of pages we're mapping (kernel text pages and kernel writable
-	 * pages respectively), and a pointer to our array of struct pages.
-	 */
-	err = map_vm_area(switcher_text_vma, PAGE_KERNEL_RX, lg_switcher_pages);
-	if (err) {
-		printk("lguest: text map_vm_area failed: %i\n", err);
-		goto free_vmas;
-	}
-
-	err = map_vm_area(switcher_stacks_vma, PAGE_KERNEL,
-			  lg_switcher_pages + SWITCHER_TEXT_PAGES);
-	if (err) {
-		printk("lguest: stacks map_vm_area failed: %i\n", err);
-		goto free_vmas;
-	}
-
-	/*
-	 * Now the Switcher is mapped at the right address, we can't fail!
-	 */
-	printk(KERN_INFO "lguest: mapped switcher at %p\n",
-	       switcher_text_vma->addr);
-	/* And we succeeded... */
-	return 0;
-
-free_vmas:
-	/* Undoes map_vm_area and __get_vm_area */
-	vunmap(switcher_stacks_vma->addr);
-free_text_vma:
-	vunmap(switcher_text_vma->addr);
-free_pages:
-	i = TOTAL_SWITCHER_PAGES;
-free_some_pages:
-	for (--i; i >= 0; i--)
-		__free_pages(lg_switcher_pages[i], 0);
-	kfree(lg_switcher_pages);
-out:
-	return err;
-}
-/*:*/
-
-/* Cleaning up the mapping when the module is unloaded is almost... too easy. */
-static void unmap_switcher(void)
-{
-	unsigned int i;
-
-	/* vunmap() undoes *both* map_vm_area() and __get_vm_area(). */
-	vunmap(switcher_text_vma->addr);
-	vunmap(switcher_stacks_vma->addr);
-	/* Now we just need to free the pages we copied the switcher into */
-	for (i = 0; i < TOTAL_SWITCHER_PAGES; i++)
-		__free_pages(lg_switcher_pages[i], 0);
-	kfree(lg_switcher_pages);
-}
-
-/*H:032
- * Dealing With Guest Memory.
- *
- * Before we go too much further into the Host, we need to grok the routines
- * we use to deal with Guest memory.
- *
- * When the Guest gives us (what it thinks is) a physical address, we can use
- * the normal copy_from_user() & copy_to_user() on the corresponding place in
- * the memory region allocated by the Launcher.
- *
- * But we can't trust the Guest: it might be trying to access the Launcher
- * code.  We have to check that the range is below the pfn_limit the Launcher
- * gave us.  We have to make sure that addr + len doesn't give us a false
- * positive by overflowing, too.
- */
-bool lguest_address_ok(const struct lguest *lg,
-		       unsigned long addr, unsigned long len)
-{
-	return addr+len <= lg->pfn_limit * PAGE_SIZE && (addr+len >= addr);
-}
-
-/*
- * This routine copies memory from the Guest.  Here we can see how useful the
- * kill_lguest() routine we met in the Launcher can be: we return a random
- * value (all zeroes) instead of needing to return an error.
- */
-void __lgread(struct lg_cpu *cpu, void *b, unsigned long addr, unsigned bytes)
-{
-	if (!lguest_address_ok(cpu->lg, addr, bytes)
-	    || copy_from_user(b, cpu->lg->mem_base + addr, bytes) != 0) {
-		/* copy_from_user should do this, but as we rely on it... */
-		memset(b, 0, bytes);
-		kill_guest(cpu, "bad read address %#lx len %u", addr, bytes);
-	}
-}
-
-/* This is the write (copy into Guest) version. */
-void __lgwrite(struct lg_cpu *cpu, unsigned long addr, const void *b,
-	       unsigned bytes)
-{
-	if (!lguest_address_ok(cpu->lg, addr, bytes)
-	    || copy_to_user(cpu->lg->mem_base + addr, b, bytes) != 0)
-		kill_guest(cpu, "bad write address %#lx len %u", addr, bytes);
-}
-/*:*/
-
-/*H:030
- * Let's jump straight to the the main loop which runs the Guest.
- * Remember, this is called by the Launcher reading /dev/lguest, and we keep
- * going around and around until something interesting happens.
- */
-int run_guest(struct lg_cpu *cpu, unsigned long __user *user)
-{
-	/* If the launcher asked for a register with LHREQ_GETREG */
-	if (cpu->reg_read) {
-		if (put_user(*cpu->reg_read, user))
-			return -EFAULT;
-		cpu->reg_read = NULL;
-		return sizeof(*cpu->reg_read);
-	}
-
-	/* We stop running once the Guest is dead. */
-	while (!cpu->lg->dead) {
-		unsigned int irq;
-		bool more;
-
-		/* First we run any hypercalls the Guest wants done. */
-		if (cpu->hcall)
-			do_hypercalls(cpu);
-
-		/* Do we have to tell the Launcher about a trap? */
-		if (cpu->pending.trap) {
-			if (copy_to_user(user, &cpu->pending,
-					 sizeof(cpu->pending)))
-				return -EFAULT;
-			return sizeof(cpu->pending);
-		}
-
-		/*
-		 * All long-lived kernel loops need to check with this horrible
-		 * thing called the freezer.  If the Host is trying to suspend,
-		 * it stops us.
-		 */
-		try_to_freeze();
-
-		/* Check for signals */
-		if (signal_pending(current))
-			return -ERESTARTSYS;
-
-		/*
-		 * Check if there are any interrupts which can be delivered now:
-		 * if so, this sets up the hander to be executed when we next
-		 * run the Guest.
-		 */
-		irq = interrupt_pending(cpu, &more);
-		if (irq < LGUEST_IRQS)
-			try_deliver_interrupt(cpu, irq, more);
-
-		/*
-		 * Just make absolutely sure the Guest is still alive.  One of
-		 * those hypercalls could have been fatal, for example.
-		 */
-		if (cpu->lg->dead)
-			break;
-
-		/*
-		 * If the Guest asked to be stopped, we sleep.  The Guest's
-		 * clock timer will wake us.
-		 */
-		if (cpu->halted) {
-			set_current_state(TASK_INTERRUPTIBLE);
-			/*
-			 * Just before we sleep, make sure no interrupt snuck in
-			 * which we should be doing.
-			 */
-			if (interrupt_pending(cpu, &more) < LGUEST_IRQS)
-				set_current_state(TASK_RUNNING);
-			else
-				schedule();
-			continue;
-		}
-
-		/*
-		 * OK, now we're ready to jump into the Guest.  First we put up
-		 * the "Do Not Disturb" sign:
-		 */
-		local_irq_disable();
-
-		/* Actually run the Guest until something happens. */
-		lguest_arch_run_guest(cpu);
-
-		/* Now we're ready to be interrupted or moved to other CPUs */
-		local_irq_enable();
-
-		/* Now we deal with whatever happened to the Guest. */
-		lguest_arch_handle_trap(cpu);
-	}
-
-	/* Special case: Guest is 'dead' but wants a reboot. */
-	if (cpu->lg->dead == ERR_PTR(-ERESTART))
-		return -ERESTART;
-
-	/* The Guest is dead => "No such file or directory" */
-	return -ENOENT;
-}
-
-/*H:000
- * Welcome to the Host!
- *
- * By this point your brain has been tickled by the Guest code and numbed by
- * the Launcher code; prepare for it to be stretched by the Host code.  This is
- * the heart.  Let's begin at the initialization routine for the Host's lg
- * module.
- */
-static int __init init(void)
-{
-	int err;
-
-	/* Lguest can't run under Xen, VMI or itself.  It does Tricky Stuff. */
-	if (get_kernel_rpl() != 0) {
-		printk("lguest is afraid of being a guest\n");
-		return -EPERM;
-	}
-
-	/* First we put the Switcher up in very high virtual memory. */
-	err = map_switcher();
-	if (err)
-		goto out;
-
-	/* We might need to reserve an interrupt vector. */
-	err = init_interrupts();
-	if (err)
-		goto unmap;
-
-	/* /dev/lguest needs to be registered. */
-	err = lguest_device_init();
-	if (err)
-		goto free_interrupts;
-
-	/* Finally we do some architecture-specific setup. */
-	lguest_arch_host_init();
-
-	/* All good! */
-	return 0;
-
-free_interrupts:
-	free_interrupts();
-unmap:
-	unmap_switcher();
-out:
-	return err;
-}
-
-/* Cleaning up is just the same code, backwards.  With a little French. */
-static void __exit fini(void)
-{
-	lguest_device_remove();
-	free_interrupts();
-	unmap_switcher();
-
-	lguest_arch_host_fini();
-}
-/*:*/
-
-/*
- * The Host side of lguest can be a module.  This is a nice way for people to
- * play with it.
- */
-module_init(init);
-module_exit(fini);
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c
deleted file mode 100644
index 601f81c04873..000000000000
--- a/drivers/lguest/hypercalls.c
+++ /dev/null
@@ -1,304 +0,0 @@
-/*P:500
- * Just as userspace programs request kernel operations through a system
- * call, the Guest requests Host operations through a "hypercall".  You might
- * notice this nomenclature doesn't really follow any logic, but the name has
- * been around for long enough that we're stuck with it.  As you'd expect, this
- * code is basically a one big switch statement.
-:*/
-
-/*  Copyright (C) 2006 Rusty Russell IBM Corporation
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
-*/
-#include <linux/uaccess.h>
-#include <linux/syscalls.h>
-#include <linux/mm.h>
-#include <linux/ktime.h>
-#include <asm/page.h>
-#include <asm/pgtable.h>
-#include "lg.h"
-
-/*H:120
- * This is the core hypercall routine: where the Guest gets what it wants.
- * Or gets killed.  Or, in the case of LHCALL_SHUTDOWN, both.
- */
-static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args)
-{
-	switch (args->arg0) {
-	case LHCALL_FLUSH_ASYNC:
-		/*
-		 * This call does nothing, except by breaking out of the Guest
-		 * it makes us process all the asynchronous hypercalls.
-		 */
-		break;
-	case LHCALL_SEND_INTERRUPTS:
-		/*
-		 * This call does nothing too, but by breaking out of the Guest
-		 * it makes us process any pending interrupts.
-		 */
-		break;
-	case LHCALL_LGUEST_INIT:
-		/*
-		 * You can't get here unless you're already initialized.  Don't
-		 * do that.
-		 */
-		kill_guest(cpu, "already have lguest_data");
-		break;
-	case LHCALL_SHUTDOWN: {
-		char msg[128];
-		/*
-		 * Shutdown is such a trivial hypercall that we do it in five
-		 * lines right here.
-		 *
-		 * If the lgread fails, it will call kill_guest() itself; the
-		 * kill_guest() with the message will be ignored.
-		 */
-		__lgread(cpu, msg, args->arg1, sizeof(msg));
-		msg[sizeof(msg)-1] = '\0';
-		kill_guest(cpu, "CRASH: %s", msg);
-		if (args->arg2 == LGUEST_SHUTDOWN_RESTART)
-			cpu->lg->dead = ERR_PTR(-ERESTART);
-		break;
-	}
-	case LHCALL_FLUSH_TLB:
-		/* FLUSH_TLB comes in two flavors, depending on the argument: */
-		if (args->arg1)
-			guest_pagetable_clear_all(cpu);
-		else
-			guest_pagetable_flush_user(cpu);
-		break;
-
-	/*
-	 * All these calls simply pass the arguments through to the right
-	 * routines.
-	 */
-	case LHCALL_NEW_PGTABLE:
-		guest_new_pagetable(cpu, args->arg1);
-		break;
-	case LHCALL_SET_STACK:
-		guest_set_stack(cpu, args->arg1, args->arg2, args->arg3);
-		break;
-	case LHCALL_SET_PTE:
-#ifdef CONFIG_X86_PAE
-		guest_set_pte(cpu, args->arg1, args->arg2,
-				__pte(args->arg3 | (u64)args->arg4 << 32));
-#else
-		guest_set_pte(cpu, args->arg1, args->arg2, __pte(args->arg3));
-#endif
-		break;
-	case LHCALL_SET_PGD:
-		guest_set_pgd(cpu->lg, args->arg1, args->arg2);
-		break;
-#ifdef CONFIG_X86_PAE
-	case LHCALL_SET_PMD:
-		guest_set_pmd(cpu->lg, args->arg1, args->arg2);
-		break;
-#endif
-	case LHCALL_SET_CLOCKEVENT:
-		guest_set_clockevent(cpu, args->arg1);
-		break;
-	case LHCALL_HALT:
-		/* Similarly, this sets the halted flag for run_guest(). */
-		cpu->halted = 1;
-		break;
-	default:
-		/* It should be an architecture-specific hypercall. */
-		if (lguest_arch_do_hcall(cpu, args))
-			kill_guest(cpu, "Bad hypercall %li\n", args->arg0);
-	}
-}
-
-/*H:124
- * Asynchronous hypercalls are easy: we just look in the array in the
- * Guest's "struct lguest_data" to see if any new ones are marked "ready".
- *
- * We are careful to do these in order: obviously we respect the order the
- * Guest put them in the ring, but we also promise the Guest that they will
- * happen before any normal hypercall (which is why we check this before
- * checking for a normal hcall).
- */
-static void do_async_hcalls(struct lg_cpu *cpu)
-{
-	unsigned int i;
-	u8 st[LHCALL_RING_SIZE];
-
-	/* For simplicity, we copy the entire call status array in at once. */
-	if (copy_from_user(&st, &cpu->lg->lguest_data->hcall_status, sizeof(st)))
-		return;
-
-	/* We process "struct lguest_data"s hcalls[] ring once. */
-	for (i = 0; i < ARRAY_SIZE(st); i++) {
-		struct hcall_args args;
-		/*
-		 * We remember where we were up to from last time.  This makes
-		 * sure that the hypercalls are done in the order the Guest
-		 * places them in the ring.
-		 */
-		unsigned int n = cpu->next_hcall;
-
-		/* 0xFF means there's no call here (yet). */
-		if (st[n] == 0xFF)
-			break;
-
-		/*
-		 * OK, we have hypercall.  Increment the "next_hcall" cursor,
-		 * and wrap back to 0 if we reach the end.
-		 */
-		if (++cpu->next_hcall == LHCALL_RING_SIZE)
-			cpu->next_hcall = 0;
-
-		/*
-		 * Copy the hypercall arguments into a local copy of the
-		 * hcall_args struct.
-		 */
-		if (copy_from_user(&args, &cpu->lg->lguest_data->hcalls[n],
-				   sizeof(struct hcall_args))) {
-			kill_guest(cpu, "Fetching async hypercalls");
-			break;
-		}
-
-		/* Do the hypercall, same as a normal one. */
-		do_hcall(cpu, &args);
-
-		/* Mark the hypercall done. */
-		if (put_user(0xFF, &cpu->lg->lguest_data->hcall_status[n])) {
-			kill_guest(cpu, "Writing result for async hypercall");
-			break;
-		}
-
-		/*
-		 * Stop doing hypercalls if they want to notify the Launcher:
-		 * it needs to service this first.
-		 */
-		if (cpu->pending.trap)
-			break;
-	}
-}
-
-/*
- * Last of all, we look at what happens first of all.  The very first time the
- * Guest makes a hypercall, we end up here to set things up:
- */
-static void initialize(struct lg_cpu *cpu)
-{
-	/*
-	 * You can't do anything until you're initialized.  The Guest knows the
-	 * rules, so we're unforgiving here.
-	 */
-	if (cpu->hcall->arg0 != LHCALL_LGUEST_INIT) {
-		kill_guest(cpu, "hypercall %li before INIT", cpu->hcall->arg0);
-		return;
-	}
-
-	if (lguest_arch_init_hypercalls(cpu))
-		kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data);
-
-	/*
-	 * The Guest tells us where we're not to deliver interrupts by putting
-	 * the instruction address into "struct lguest_data".
-	 */
-	if (get_user(cpu->lg->noirq_iret, &cpu->lg->lguest_data->noirq_iret))
-		kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data);
-
-	/*
-	 * We write the current time into the Guest's data page once so it can
-	 * set its clock.
-	 */
-	write_timestamp(cpu);
-
-	/* page_tables.c will also do some setup. */
-	page_table_guest_data_init(cpu);
-
-	/*
-	 * This is the one case where the above accesses might have been the
-	 * first write to a Guest page.  This may have caused a copy-on-write
-	 * fault, but the old page might be (read-only) in the Guest
-	 * pagetable.
-	 */
-	guest_pagetable_clear_all(cpu);
-}
-/*:*/
-
-/*M:013
- * If a Guest reads from a page (so creates a mapping) that it has never
- * written to, and then the Launcher writes to it (ie. the output of a virtual
- * device), the Guest will still see the old page.  In practice, this never
- * happens: why would the Guest read a page which it has never written to?  But
- * a similar scenario might one day bite us, so it's worth mentioning.
- *
- * Note that if we used a shared anonymous mapping in the Launcher instead of
- * mapping /dev/zero private, we wouldn't worry about cop-on-write.  And we
- * need that to switch the Launcher to processes (away from threads) anyway.
-:*/
-
-/*H:100
- * Hypercalls
- *
- * Remember from the Guest, hypercalls come in two flavors: normal and
- * asynchronous.  This file handles both of types.
- */
-void do_hypercalls(struct lg_cpu *cpu)
-{
-	/* Not initialized yet?  This hypercall must do it. */
-	if (unlikely(!cpu->lg->lguest_data)) {
-		/* Set up the "struct lguest_data" */
-		initialize(cpu);
-		/* Hcall is done. */
-		cpu->hcall = NULL;
-		return;
-	}
-
-	/*
-	 * The Guest has initialized.
-	 *
-	 * Look in the hypercall ring for the async hypercalls:
-	 */
-	do_async_hcalls(cpu);
-
-	/*
-	 * If we stopped reading the hypercall ring because the Guest did a
-	 * NOTIFY to the Launcher, we want to return now.  Otherwise we do
-	 * the hypercall.
-	 */
-	if (!cpu->pending.trap) {
-		do_hcall(cpu, cpu->hcall);
-		/*
-		 * Tricky point: we reset the hcall pointer to mark the
-		 * hypercall as "done".  We use the hcall pointer rather than
-		 * the trap number to indicate a hypercall is pending.
-		 * Normally it doesn't matter: the Guest will run again and
-		 * update the trap number before we come back here.
-		 *
-		 * However, if we are signalled or the Guest sends I/O to the
-		 * Launcher, the run_guest() loop will exit without running the
-		 * Guest.  When it comes back it would try to re-run the
-		 * hypercall.  Finding that bug sucked.
-		 */
-		cpu->hcall = NULL;
-	}
-}
-
-/*
- * This routine supplies the Guest with time: it's used for wallclock time at
- * initial boot and as a rough time source if the TSC isn't available.
- */
-void write_timestamp(struct lg_cpu *cpu)
-{
-	struct timespec now;
-	ktime_get_real_ts(&now);
-	if (copy_to_user(&cpu->lg->lguest_data->time,
-			 &now, sizeof(struct timespec)))
-		kill_guest(cpu, "Writing timestamp");
-}
diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c
deleted file mode 100644
index 67392b6ab845..000000000000
--- a/drivers/lguest/interrupts_and_traps.c
+++ /dev/null
@@ -1,706 +0,0 @@
-/*P:800
- * Interrupts (traps) are complicated enough to earn their own file.
- * There are three classes of interrupts:
- *
- * 1) Real hardware interrupts which occur while we're running the Guest,
- * 2) Interrupts for virtual devices attached to the Guest, and
- * 3) Traps and faults from the Guest.
- *
- * Real hardware interrupts must be delivered to the Host, not the Guest.
- * Virtual interrupts must be delivered to the Guest, but we make them look
- * just like real hardware would deliver them.  Traps from the Guest can be set
- * up to go directly back into the Guest, but sometimes the Host wants to see
- * them first, so we also have a way of "reflecting" them into the Guest as if
- * they had been delivered to it directly.
-:*/
-#include <linux/uaccess.h>
-#include <linux/interrupt.h>
-#include <linux/module.h>
-#include <linux/sched.h>
-#include "lg.h"
-
-/* Allow Guests to use a non-128 (ie. non-Linux) syscall trap. */
-static unsigned int syscall_vector = IA32_SYSCALL_VECTOR;
-module_param(syscall_vector, uint, 0444);
-
-/* The address of the interrupt handler is split into two bits: */
-static unsigned long idt_address(u32 lo, u32 hi)
-{
-	return (lo & 0x0000FFFF) | (hi & 0xFFFF0000);
-}
-
-/*
- * The "type" of the interrupt handler is a 4 bit field: we only support a
- * couple of types.
- */
-static int idt_type(u32 lo, u32 hi)
-{
-	return (hi >> 8) & 0xF;
-}
-
-/* An IDT entry can't be used unless the "present" bit is set. */
-static bool idt_present(u32 lo, u32 hi)
-{
-	return (hi & 0x8000);
-}
-
-/*
- * We need a helper to "push" a value onto the Guest's stack, since that's a
- * big part of what delivering an interrupt does.
- */
-static void push_guest_stack(struct lg_cpu *cpu, unsigned long *gstack, u32 val)
-{
-	/* Stack grows upwards: move stack then write value. */
-	*gstack -= 4;
-	lgwrite(cpu, *gstack, u32, val);
-}
-
-/*H:210
- * The push_guest_interrupt_stack() routine saves Guest state on the stack for
- * an interrupt or trap.  The mechanics of delivering traps and interrupts to
- * the Guest are the same, except some traps have an "error code" which gets
- * pushed onto the stack as well: the caller tells us if this is one.
- *
- * We set up the stack just like the CPU does for a real interrupt, so it's
- * identical for the Guest (and the standard "iret" instruction will undo
- * it).
- */
-static void push_guest_interrupt_stack(struct lg_cpu *cpu, bool has_err)
-{
-	unsigned long gstack, origstack;
-	u32 eflags, ss, irq_enable;
-	unsigned long virtstack;
-
-	/*
-	 * There are two cases for interrupts: one where the Guest is already
-	 * in the kernel, and a more complex one where the Guest is in
-	 * userspace.  We check the privilege level to find out.
-	 */
-	if ((cpu->regs->ss&0x3) != GUEST_PL) {
-		/*
-		 * The Guest told us their kernel stack with the SET_STACK
-		 * hypercall: both the virtual address and the segment.
-		 */
-		virtstack = cpu->esp1;
-		ss = cpu->ss1;
-
-		origstack = gstack = guest_pa(cpu, virtstack);
-		/*
-		 * We push the old stack segment and pointer onto the new
-		 * stack: when the Guest does an "iret" back from the interrupt
-		 * handler the CPU will notice they're dropping privilege
-		 * levels and expect these here.
-		 */
-		push_guest_stack(cpu, &gstack, cpu->regs->ss);
-		push_guest_stack(cpu, &gstack, cpu->regs->esp);
-	} else {
-		/* We're staying on the same Guest (kernel) stack. */
-		virtstack = cpu->regs->esp;
-		ss = cpu->regs->ss;
-
-		origstack = gstack = guest_pa(cpu, virtstack);
-	}
-
-	/*
-	 * Remember that we never let the Guest actually disable interrupts, so
-	 * the "Interrupt Flag" bit is always set.  We copy that bit from the
-	 * Guest's "irq_enabled" field into the eflags word: we saw the Guest
-	 * copy it back in "lguest_iret".
-	 */
-	eflags = cpu->regs->eflags;
-	if (get_user(irq_enable, &cpu->lg->lguest_data->irq_enabled) == 0
-	    && !(irq_enable & X86_EFLAGS_IF))
-		eflags &= ~X86_EFLAGS_IF;
-
-	/*
-	 * An interrupt is expected to push three things on the stack: the old
-	 * "eflags" word, the old code segment, and the old instruction
-	 * pointer.
-	 */
-	push_guest_stack(cpu, &gstack, eflags);
-	push_guest_stack(cpu, &gstack, cpu->regs->cs);
-	push_guest_stack(cpu, &gstack, cpu->regs->eip);
-
-	/* For the six traps which supply an error code, we push that, too. */
-	if (has_err)
-		push_guest_stack(cpu, &gstack, cpu->regs->errcode);
-
-	/* Adjust the stack pointer and stack segment. */
-	cpu->regs->ss = ss;
-	cpu->regs->esp = virtstack + (gstack - origstack);
-}
-
-/*
- * This actually makes the Guest start executing the given interrupt/trap
- * handler.
- *
- * "lo" and "hi" are the two parts of the Interrupt Descriptor Table for this
- * interrupt or trap.  It's split into two parts for traditional reasons: gcc
- * on i386 used to be frightened by 64 bit numbers.
- */
-static void guest_run_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi)
-{
-	/* If we're already in the kernel, we don't change stacks. */
-	if ((cpu->regs->ss&0x3) != GUEST_PL)
-		cpu->regs->ss = cpu->esp1;
-
-	/*
-	 * Set the code segment and the address to execute.
-	 */
-	cpu->regs->cs = (__KERNEL_CS|GUEST_PL);
-	cpu->regs->eip = idt_address(lo, hi);
-
-	/*
-	 * Trapping always clears these flags:
-	 * TF: Trap flag
-	 * VM: Virtual 8086 mode
-	 * RF: Resume
-	 * NT: Nested task.
-	 */
-	cpu->regs->eflags &=
-		~(X86_EFLAGS_TF|X86_EFLAGS_VM|X86_EFLAGS_RF|X86_EFLAGS_NT);
-
-	/*
-	 * There are two kinds of interrupt handlers: 0xE is an "interrupt
-	 * gate" which expects interrupts to be disabled on entry.
-	 */
-	if (idt_type(lo, hi) == 0xE)
-		if (put_user(0, &cpu->lg->lguest_data->irq_enabled))
-			kill_guest(cpu, "Disabling interrupts");
-}
-
-/* This restores the eflags word which was pushed on the stack by a trap */
-static void restore_eflags(struct lg_cpu *cpu)
-{
-	/* This is the physical address of the stack. */
-	unsigned long stack_pa = guest_pa(cpu, cpu->regs->esp);
-
-	/*
-	 * Stack looks like this:
-	 * Address	Contents
-	 * esp		EIP
-	 * esp + 4	CS
-	 * esp + 8	EFLAGS
-	 */
-	cpu->regs->eflags = lgread(cpu, stack_pa + 8, u32);
-	cpu->regs->eflags &=
-		~(X86_EFLAGS_TF|X86_EFLAGS_VM|X86_EFLAGS_RF|X86_EFLAGS_NT);
-}
-
-/*H:205
- * Virtual Interrupts.
- *
- * interrupt_pending() returns the first pending interrupt which isn't blocked
- * by the Guest.  It is called before every entry to the Guest, and just before
- * we go to sleep when the Guest has halted itself.
- */
-unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more)
-{
-	unsigned int irq;
-	DECLARE_BITMAP(blk, LGUEST_IRQS);
-
-	/* If the Guest hasn't even initialized yet, we can do nothing. */
-	if (!cpu->lg->lguest_data)
-		return LGUEST_IRQS;
-
-	/*
-	 * Take our "irqs_pending" array and remove any interrupts the Guest
-	 * wants blocked: the result ends up in "blk".
-	 */
-	if (copy_from_user(&blk, cpu->lg->lguest_data->blocked_interrupts,
-			   sizeof(blk)))
-		return LGUEST_IRQS;
-	bitmap_andnot(blk, cpu->irqs_pending, blk, LGUEST_IRQS);
-
-	/* Find the first interrupt. */
-	irq = find_first_bit(blk, LGUEST_IRQS);
-	*more = find_next_bit(blk, LGUEST_IRQS, irq+1);
-
-	return irq;
-}
-
-/*
- * This actually diverts the Guest to running an interrupt handler, once an
- * interrupt has been identified by interrupt_pending().
- */
-void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more)
-{
-	struct desc_struct *idt;
-
-	BUG_ON(irq >= LGUEST_IRQS);
-
-	/* If they're halted, interrupts restart them. */
-	if (cpu->halted) {
-		/* Re-enable interrupts. */
-		if (put_user(X86_EFLAGS_IF, &cpu->lg->lguest_data->irq_enabled))
-			kill_guest(cpu, "Re-enabling interrupts");
-		cpu->halted = 0;
-	} else {
-		/* Otherwise we check if they have interrupts disabled. */
-		u32 irq_enabled;
-		if (get_user(irq_enabled, &cpu->lg->lguest_data->irq_enabled))
-			irq_enabled = 0;
-		if (!irq_enabled) {
-			/* Make sure they know an IRQ is pending. */
-			put_user(X86_EFLAGS_IF,
-				 &cpu->lg->lguest_data->irq_pending);
-			return;
-		}
-	}
-
-	/*
-	 * Look at the IDT entry the Guest gave us for this interrupt.  The
-	 * first 32 (FIRST_EXTERNAL_VECTOR) entries are for traps, so we skip
-	 * over them.
-	 */
-	idt = &cpu->arch.idt[FIRST_EXTERNAL_VECTOR+irq];
-	/* If they don't have a handler (yet?), we just ignore it */
-	if (idt_present(idt->a, idt->b)) {
-		/* OK, mark it no longer pending and deliver it. */
-		clear_bit(irq, cpu->irqs_pending);
-
-		/*
-		 * They may be about to iret, where they asked us never to
-		 * deliver interrupts.  In this case, we can emulate that iret
-		 * then immediately deliver the interrupt.  This is basically
-		 * a noop: the iret would pop the interrupt frame and restore
-		 * eflags, and then we'd set it up again.  So just restore the
-		 * eflags word and jump straight to the handler in this case.
-		 *
-		 * Denys Vlasenko points out that this isn't quite right: if
-		 * the iret was returning to userspace, then that interrupt
-		 * would reset the stack pointer (which the Guest told us
-		 * about via LHCALL_SET_STACK).  But unless the Guest is being
-		 * *really* weird, that will be the same as the current stack
-		 * anyway.
-		 */
-		if (cpu->regs->eip == cpu->lg->noirq_iret) {
-			restore_eflags(cpu);
-		} else {
-			/*
-			 * set_guest_interrupt() takes a flag to say whether
-			 * this interrupt pushes an error code onto the stack
-			 * as well: virtual interrupts never do.
-			 */
-			push_guest_interrupt_stack(cpu, false);
-		}
-		/* Actually make Guest cpu jump to handler. */
-		guest_run_interrupt(cpu, idt->a, idt->b);
-	}
-
-	/*
-	 * Every time we deliver an interrupt, we update the timestamp in the
-	 * Guest's lguest_data struct.  It would be better for the Guest if we
-	 * did this more often, but it can actually be quite slow: doing it
-	 * here is a compromise which means at least it gets updated every
-	 * timer interrupt.
-	 */
-	write_timestamp(cpu);
-
-	/*
-	 * If there are no other interrupts we want to deliver, clear
-	 * the pending flag.
-	 */
-	if (!more)
-		put_user(0, &cpu->lg->lguest_data->irq_pending);
-}
-
-/* And this is the routine when we want to set an interrupt for the Guest. */
-void set_interrupt(struct lg_cpu *cpu, unsigned int irq)
-{
-	/*
-	 * Next time the Guest runs, the core code will see if it can deliver
-	 * this interrupt.
-	 */
-	set_bit(irq, cpu->irqs_pending);
-
-	/*
-	 * Make sure it sees it; it might be asleep (eg. halted), or running
-	 * the Guest right now, in which case kick_process() will knock it out.
-	 */
-	if (!wake_up_process(cpu->tsk))
-		kick_process(cpu->tsk);
-}
-/*:*/
-
-/*
- * Linux uses trap 128 for system calls.  Plan9 uses 64, and Ron Minnich sent
- * me a patch, so we support that too.  It'd be a big step for lguest if half
- * the Plan 9 user base were to start using it.
- *
- * Actually now I think of it, it's possible that Ron *is* half the Plan 9
- * userbase.  Oh well.
- */
-bool could_be_syscall(unsigned int num)
-{
-	/* Normal Linux IA32_SYSCALL_VECTOR or reserved vector? */
-	return num == IA32_SYSCALL_VECTOR || num == syscall_vector;
-}
-
-/* The syscall vector it wants must be unused by Host. */
-bool check_syscall_vector(struct lguest *lg)
-{
-	u32 vector;
-
-	if (get_user(vector, &lg->lguest_data->syscall_vec))
-		return false;
-
-	return could_be_syscall(vector);
-}
-
-int init_interrupts(void)
-{
-	/* If they want some strange system call vector, reserve it now */
-	if (syscall_vector != IA32_SYSCALL_VECTOR) {
-		if (test_bit(syscall_vector, used_vectors) ||
-		    vector_used_by_percpu_irq(syscall_vector)) {
-			printk(KERN_ERR "lg: couldn't reserve syscall %u\n",
-				 syscall_vector);
-			return -EBUSY;
-		}
-		set_bit(syscall_vector, used_vectors);
-	}
-
-	return 0;
-}
-
-void free_interrupts(void)
-{
-	if (syscall_vector != IA32_SYSCALL_VECTOR)
-		clear_bit(syscall_vector, used_vectors);
-}
-
-/*H:220
- * Now we've got the routines to deliver interrupts, delivering traps like
- * page fault is easy.  The only trick is that Intel decided that some traps
- * should have error codes:
- */
-static bool has_err(unsigned int trap)
-{
-	return (trap == 8 || (trap >= 10 && trap <= 14) || trap == 17);
-}
-
-/* deliver_trap() returns true if it could deliver the trap. */
-bool deliver_trap(struct lg_cpu *cpu, unsigned int num)
-{
-	/*
-	 * Trap numbers are always 8 bit, but we set an impossible trap number
-	 * for traps inside the Switcher, so check that here.
-	 */
-	if (num >= ARRAY_SIZE(cpu->arch.idt))
-		return false;
-
-	/*
-	 * Early on the Guest hasn't set the IDT entries (or maybe it put a
-	 * bogus one in): if we fail here, the Guest will be killed.
-	 */
-	if (!idt_present(cpu->arch.idt[num].a, cpu->arch.idt[num].b))
-		return false;
-	push_guest_interrupt_stack(cpu, has_err(num));
-	guest_run_interrupt(cpu, cpu->arch.idt[num].a,
-			    cpu->arch.idt[num].b);
-	return true;
-}
-
-/*H:250
- * Here's the hard part: returning to the Host every time a trap happens
- * and then calling deliver_trap() and re-entering the Guest is slow.
- * Particularly because Guest userspace system calls are traps (usually trap
- * 128).
- *
- * So we'd like to set up the IDT to tell the CPU to deliver traps directly
- * into the Guest.  This is possible, but the complexities cause the size of
- * this file to double!  However, 150 lines of code is worth writing for taking
- * system calls down from 1750ns to 270ns.  Plus, if lguest didn't do it, all
- * the other hypervisors would beat it up at lunchtime.
- *
- * This routine indicates if a particular trap number could be delivered
- * directly.
- *
- * Unfortunately, Linux 4.6 started using an interrupt gate instead of a
- * trap gate for syscalls, so this trick is ineffective.  See Mastery for
- * how we could do this anyway...
- */
-static bool direct_trap(unsigned int num)
-{
-	/*
-	 * Hardware interrupts don't go to the Guest at all (except system
-	 * call).
-	 */
-	if (num >= FIRST_EXTERNAL_VECTOR && !could_be_syscall(num))
-		return false;
-
-	/*
-	 * The Host needs to see page faults (for shadow paging and to save the
-	 * fault address), general protection faults (in/out emulation) and
-	 * device not available (TS handling) and of course, the hypercall trap.
-	 */
-	return num != 14 && num != 13 && num != 7 && num != LGUEST_TRAP_ENTRY;
-}
-/*:*/
-
-/*M:005
- * The Guest has the ability to turn its interrupt gates into trap gates,
- * if it is careful.  The Host will let trap gates can go directly to the
- * Guest, but the Guest needs the interrupts atomically disabled for an
- * interrupt gate.  The Host could provide a mechanism to register more
- * "no-interrupt" regions, and the Guest could point the trap gate at
- * instructions within that region, where it can safely disable interrupts.
- */
-
-/*M:006
- * The Guests do not use the sysenter (fast system call) instruction,
- * because it's hardcoded to enter privilege level 0 and so can't go direct.
- * It's about twice as fast as the older "int 0x80" system call, so it might
- * still be worthwhile to handle it in the Switcher and lcall down to the
- * Guest.  The sysenter semantics are hairy tho: search for that keyword in
- * entry.S
-:*/
-
-/*H:260
- * When we make traps go directly into the Guest, we need to make sure
- * the kernel stack is valid (ie. mapped in the page tables).  Otherwise, the
- * CPU trying to deliver the trap will fault while trying to push the interrupt
- * words on the stack: this is called a double fault, and it forces us to kill
- * the Guest.
- *
- * Which is deeply unfair, because (literally!) it wasn't the Guests' fault.
- */
-void pin_stack_pages(struct lg_cpu *cpu)
-{
-	unsigned int i;
-
-	/*
-	 * Depending on the CONFIG_4KSTACKS option, the Guest can have one or
-	 * two pages of stack space.
-	 */
-	for (i = 0; i < cpu->lg->stack_pages; i++)
-		/*
-		 * The stack grows *upwards*, so the address we're given is the
-		 * start of the page after the kernel stack.  Subtract one to
-		 * get back onto the first stack page, and keep subtracting to
-		 * get to the rest of the stack pages.
-		 */
-		pin_page(cpu, cpu->esp1 - 1 - i * PAGE_SIZE);
-}
-
-/*
- * Direct traps also mean that we need to know whenever the Guest wants to use
- * a different kernel stack, so we can change the guest TSS to use that
- * stack.  The TSS entries expect a virtual address, so unlike most addresses
- * the Guest gives us, the "esp" (stack pointer) value here is virtual, not
- * physical.
- *
- * In Linux each process has its own kernel stack, so this happens a lot: we
- * change stacks on each context switch.
- */
-void guest_set_stack(struct lg_cpu *cpu, u32 seg, u32 esp, unsigned int pages)
-{
-	/*
-	 * You're not allowed a stack segment with privilege level 0: bad Guest!
-	 */
-	if ((seg & 0x3) != GUEST_PL)
-		kill_guest(cpu, "bad stack segment %i", seg);
-	/* We only expect one or two stack pages. */
-	if (pages > 2)
-		kill_guest(cpu, "bad stack pages %u", pages);
-	/* Save where the stack is, and how many pages */
-	cpu->ss1 = seg;
-	cpu->esp1 = esp;
-	cpu->lg->stack_pages = pages;
-	/* Make sure the new stack pages are mapped */
-	pin_stack_pages(cpu);
-}
-
-/*
- * All this reference to mapping stacks leads us neatly into the other complex
- * part of the Host: page table handling.
- */
-
-/*H:235
- * This is the routine which actually checks the Guest's IDT entry and
- * transfers it into the entry in "struct lguest":
- */
-static void set_trap(struct lg_cpu *cpu, struct desc_struct *trap,
-		     unsigned int num, u32 lo, u32 hi)
-{
-	u8 type = idt_type(lo, hi);
-
-	/* We zero-out a not-present entry */
-	if (!idt_present(lo, hi)) {
-		trap->a = trap->b = 0;
-		return;
-	}
-
-	/* We only support interrupt and trap gates. */
-	if (type != 0xE && type != 0xF)
-		kill_guest(cpu, "bad IDT type %i", type);
-
-	/*
-	 * We only copy the handler address, present bit, privilege level and
-	 * type.  The privilege level controls where the trap can be triggered
-	 * manually with an "int" instruction.  This is usually GUEST_PL,
-	 * except for system calls which userspace can use.
-	 */
-	trap->a = ((__KERNEL_CS|GUEST_PL)<<16) | (lo&0x0000FFFF);
-	trap->b = (hi&0xFFFFEF00);
-}
-
-/*H:230
- * While we're here, dealing with delivering traps and interrupts to the
- * Guest, we might as well complete the picture: how the Guest tells us where
- * it wants them to go.  This would be simple, except making traps fast
- * requires some tricks.
- *
- * We saw the Guest setting Interrupt Descriptor Table (IDT) entries with the
- * LHCALL_LOAD_IDT_ENTRY hypercall before: that comes here.
- */
-void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int num, u32 lo, u32 hi)
-{
-	/*
-	 * Guest never handles: NMI, doublefault, spurious interrupt or
-	 * hypercall.  We ignore when it tries to set them.
-	 */
-	if (num == 2 || num == 8 || num == 15 || num == LGUEST_TRAP_ENTRY)
-		return;
-
-	/*
-	 * Mark the IDT as changed: next time the Guest runs we'll know we have
-	 * to copy this again.
-	 */
-	cpu->changed |= CHANGED_IDT;
-
-	/* Check that the Guest doesn't try to step outside the bounds. */
-	if (num >= ARRAY_SIZE(cpu->arch.idt))
-		kill_guest(cpu, "Setting idt entry %u", num);
-	else
-		set_trap(cpu, &cpu->arch.idt[num], num, lo, hi);
-}
-
-/*
- * The default entry for each interrupt points into the Switcher routines which
- * simply return to the Host.  The run_guest() loop will then call
- * deliver_trap() to bounce it back into the Guest.
- */
-static void default_idt_entry(struct desc_struct *idt,
-			      int trap,
-			      const unsigned long handler,
-			      const struct desc_struct *base)
-{
-	/* A present interrupt gate. */
-	u32 flags = 0x8e00;
-
-	/*
-	 * Set the privilege level on the entry for the hypercall: this allows
-	 * the Guest to use the "int" instruction to trigger it.
-	 */
-	if (trap == LGUEST_TRAP_ENTRY)
-		flags |= (GUEST_PL << 13);
-	else if (base)
-		/*
-		 * Copy privilege level from what Guest asked for.  This allows
-		 * debug (int 3) traps from Guest userspace, for example.
-		 */
-		flags |= (base->b & 0x6000);
-
-	/* Now pack it into the IDT entry in its weird format. */
-	idt->a = (LGUEST_CS<<16) | (handler&0x0000FFFF);
-	idt->b = (handler&0xFFFF0000) | flags;
-}
-
-/* When the Guest first starts, we put default entries into the IDT. */
-void setup_default_idt_entries(struct lguest_ro_state *state,
-			       const unsigned long *def)
-{
-	unsigned int i;
-
-	for (i = 0; i < ARRAY_SIZE(state->guest_idt); i++)
-		default_idt_entry(&state->guest_idt[i], i, def[i], NULL);
-}
-
-/*H:240
- * We don't use the IDT entries in the "struct lguest" directly, instead
- * we copy them into the IDT which we've set up for Guests on this CPU, just
- * before we run the Guest.  This routine does that copy.
- */
-void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt,
-		const unsigned long *def)
-{
-	unsigned int i;
-
-	/*
-	 * We can simply copy the direct traps, otherwise we use the default
-	 * ones in the Switcher: they will return to the Host.
-	 */
-	for (i = 0; i < ARRAY_SIZE(cpu->arch.idt); i++) {
-		const struct desc_struct *gidt = &cpu->arch.idt[i];
-
-		/* If no Guest can ever override this trap, leave it alone. */
-		if (!direct_trap(i))
-			continue;
-
-		/*
-		 * Only trap gates (type 15) can go direct to the Guest.
-		 * Interrupt gates (type 14) disable interrupts as they are
-		 * entered, which we never let the Guest do.  Not present
-		 * entries (type 0x0) also can't go direct, of course.
-		 *
-		 * If it can't go direct, we still need to copy the priv. level:
-		 * they might want to give userspace access to a software
-		 * interrupt.
-		 */
-		if (idt_type(gidt->a, gidt->b) == 0xF)
-			idt[i] = *gidt;
-		else
-			default_idt_entry(&idt[i], i, def[i], gidt);
-	}
-}
-
-/*H:200
- * The Guest Clock.
- *
- * There are two sources of virtual interrupts.  We saw one in lguest_user.c:
- * the Launcher sending interrupts for virtual devices.  The other is the Guest
- * timer interrupt.
- *
- * The Guest uses the LHCALL_SET_CLOCKEVENT hypercall to tell us how long to
- * the next timer interrupt (in nanoseconds).  We use the high-resolution timer
- * infrastructure to set a callback at that time.
- *
- * 0 means "turn off the clock".
- */
-void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta)
-{
-	ktime_t expires;
-
-	if (unlikely(delta == 0)) {
-		/* Clock event device is shutting down. */
-		hrtimer_cancel(&cpu->hrt);
-		return;
-	}
-
-	/*
-	 * We use wallclock time here, so the Guest might not be running for
-	 * all the time between now and the timer interrupt it asked for.  This
-	 * is almost always the right thing to do.
-	 */
-	expires = ktime_add_ns(ktime_get_real(), delta);
-	hrtimer_start(&cpu->hrt, expires, HRTIMER_MODE_ABS);
-}
-
-/* This is the function called when the Guest's timer expires. */
-static enum hrtimer_restart clockdev_fn(struct hrtimer *timer)
-{
-	struct lg_cpu *cpu = container_of(timer, struct lg_cpu, hrt);
-
-	/* Remember the first interrupt is the timer interrupt. */
-	set_interrupt(cpu, 0);
-	return HRTIMER_NORESTART;
-}
-
-/* This sets up the timer for this Guest. */
-void init_clockdev(struct lg_cpu *cpu)
-{
-	hrtimer_init(&cpu->hrt, CLOCK_REALTIME, HRTIMER_MODE_ABS);
-	cpu->hrt.function = clockdev_fn;
-}
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
deleted file mode 100644
index 2356a2318034..000000000000
--- a/drivers/lguest/lg.h
+++ /dev/null
@@ -1,258 +0,0 @@
-#ifndef _LGUEST_H
-#define _LGUEST_H
-
-#ifndef __ASSEMBLY__
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/stringify.h>
-#include <linux/lguest.h>
-#include <linux/lguest_launcher.h>
-#include <linux/wait.h>
-#include <linux/hrtimer.h>
-#include <linux/err.h>
-#include <linux/slab.h>
-
-#include <asm/lguest.h>
-
-struct pgdir {
-	unsigned long gpgdir;
-	bool switcher_mapped;
-	int last_host_cpu;
-	pgd_t *pgdir;
-};
-
-/* We have two pages shared with guests, per cpu.  */
-struct lguest_pages {
-	/* This is the stack page mapped rw in guest */
-	char spare[PAGE_SIZE - sizeof(struct lguest_regs)];
-	struct lguest_regs regs;
-
-	/* This is the host state & guest descriptor page, ro in guest */
-	struct lguest_ro_state state;
-} __attribute__((aligned(PAGE_SIZE)));
-
-#define CHANGED_IDT		1
-#define CHANGED_GDT		2
-#define CHANGED_GDT_TLS		4 /* Actually a subset of CHANGED_GDT */
-#define CHANGED_ALL	        3
-
-struct lg_cpu {
-	unsigned int id;
-	struct lguest *lg;
-	struct task_struct *tsk;
-	struct mm_struct *mm; 	/* == tsk->mm, but that becomes NULL on exit */
-
-	u32 cr2;
-	u32 esp1;
-	u16 ss1;
-
-	/* Bitmap of what has changed: see CHANGED_* above. */
-	int changed;
-
-	/* Pending operation. */
-	struct lguest_pending pending;
-
-	unsigned long *reg_read; /* register from LHREQ_GETREG */
-
-	/* At end of a page shared mapped over lguest_pages in guest. */
-	unsigned long regs_page;
-	struct lguest_regs *regs;
-
-	struct lguest_pages *last_pages;
-
-	/* Initialization mode: linear map everything. */
-	bool linear_pages;
-	int cpu_pgd; /* Which pgd this cpu is currently using */
-
-	/* If a hypercall was asked for, this points to the arguments. */
-	struct hcall_args *hcall;
-	u32 next_hcall;
-
-	/* Virtual clock device */
-	struct hrtimer hrt;
-
-	/* Did the Guest tell us to halt? */
-	int halted;
-
-	/* Pending virtual interrupts */
-	DECLARE_BITMAP(irqs_pending, LGUEST_IRQS);
-
-	struct lg_cpu_arch arch;
-};
-
-/* The private info the thread maintains about the guest. */
-struct lguest {
-	struct lguest_data __user *lguest_data;
-	struct lg_cpu cpus[NR_CPUS];
-	unsigned int nr_cpus;
-
-	/* Valid guest memory pages must be < this. */
-	u32 pfn_limit;
-
-	/* Device memory is >= pfn_limit and < device_limit. */
-	u32 device_limit;
-
-	/*
-	 * This provides the offset to the base of guest-physical memory in the
-	 * Launcher.
-	 */
-	void __user *mem_base;
-	unsigned long kernel_address;
-
-	struct pgdir pgdirs[4];
-
-	unsigned long noirq_iret;
-
-	unsigned int stack_pages;
-	u32 tsc_khz;
-
-	/* Dead? */
-	const char *dead;
-};
-
-extern struct mutex lguest_lock;
-
-/* core.c: */
-bool lguest_address_ok(const struct lguest *lg,
-		       unsigned long addr, unsigned long len);
-void __lgread(struct lg_cpu *, void *, unsigned long, unsigned);
-void __lgwrite(struct lg_cpu *, unsigned long, const void *, unsigned);
-extern struct page **lg_switcher_pages;
-
-/*H:035
- * Using memory-copy operations like that is usually inconvient, so we
- * have the following helper macros which read and write a specific type (often
- * an unsigned long).
- *
- * This reads into a variable of the given type then returns that.
- */
-#define lgread(cpu, addr, type)						\
-	({ type _v; __lgread((cpu), &_v, (addr), sizeof(_v)); _v; })
-
-/* This checks that the variable is of the given type, then writes it out. */
-#define lgwrite(cpu, addr, type, val)				\
-	do {							\
-		typecheck(type, val);				\
-		__lgwrite((cpu), (addr), &(val), sizeof(val));	\
-	} while(0)
-/* (end of memory access helper routines) :*/
-
-int run_guest(struct lg_cpu *cpu, unsigned long __user *user);
-
-/*
- * Helper macros to obtain the first 12 or the last 20 bits, this is only the
- * first step in the migration to the kernel types.  pte_pfn is already defined
- * in the kernel.
- */
-#define pgd_flags(x)	(pgd_val(x) & ~PAGE_MASK)
-#define pgd_pfn(x)	(pgd_val(x) >> PAGE_SHIFT)
-#define pmd_flags(x)    (pmd_val(x) & ~PAGE_MASK)
-#define pmd_pfn(x)	(pmd_val(x) >> PAGE_SHIFT)
-
-/* interrupts_and_traps.c: */
-unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more);
-void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more);
-void set_interrupt(struct lg_cpu *cpu, unsigned int irq);
-bool deliver_trap(struct lg_cpu *cpu, unsigned int num);
-void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int i,
-			  u32 low, u32 hi);
-void guest_set_stack(struct lg_cpu *cpu, u32 seg, u32 esp, unsigned int pages);
-void pin_stack_pages(struct lg_cpu *cpu);
-void setup_default_idt_entries(struct lguest_ro_state *state,
-			       const unsigned long *def);
-void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt,
-		const unsigned long *def);
-void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta);
-bool send_notify_to_eventfd(struct lg_cpu *cpu);
-void init_clockdev(struct lg_cpu *cpu);
-bool check_syscall_vector(struct lguest *lg);
-bool could_be_syscall(unsigned int num);
-int init_interrupts(void);
-void free_interrupts(void);
-
-/* segments.c: */
-void setup_default_gdt_entries(struct lguest_ro_state *state);
-void setup_guest_gdt(struct lg_cpu *cpu);
-void load_guest_gdt_entry(struct lg_cpu *cpu, unsigned int i,
-			  u32 low, u32 hi);
-void guest_load_tls(struct lg_cpu *cpu, unsigned long tls_array);
-void copy_gdt(const struct lg_cpu *cpu, struct desc_struct *gdt);
-void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt);
-
-/* page_tables.c: */
-int init_guest_pagetable(struct lguest *lg);
-void free_guest_pagetable(struct lguest *lg);
-void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable);
-void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 i);
-#ifdef CONFIG_X86_PAE
-void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 i);
-#endif
-void guest_pagetable_clear_all(struct lg_cpu *cpu);
-void guest_pagetable_flush_user(struct lg_cpu *cpu);
-void guest_set_pte(struct lg_cpu *cpu, unsigned long gpgdir,
-		   unsigned long vaddr, pte_t val);
-void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages);
-bool demand_page(struct lg_cpu *cpu, unsigned long cr2, int errcode,
-		 unsigned long *iomem);
-void pin_page(struct lg_cpu *cpu, unsigned long vaddr);
-bool __guest_pa(struct lg_cpu *cpu, unsigned long vaddr, unsigned long *paddr);
-unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr);
-void page_table_guest_data_init(struct lg_cpu *cpu);
-
-/* <arch>/core.c: */
-void lguest_arch_host_init(void);
-void lguest_arch_host_fini(void);
-void lguest_arch_run_guest(struct lg_cpu *cpu);
-void lguest_arch_handle_trap(struct lg_cpu *cpu);
-int lguest_arch_init_hypercalls(struct lg_cpu *cpu);
-int lguest_arch_do_hcall(struct lg_cpu *cpu, struct hcall_args *args);
-void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start);
-unsigned long *lguest_arch_regptr(struct lg_cpu *cpu, size_t reg_off, bool any);
-
-/* <arch>/switcher.S: */
-extern char start_switcher_text[], end_switcher_text[], switch_to_guest[];
-
-/* lguest_user.c: */
-int lguest_device_init(void);
-void lguest_device_remove(void);
-
-/* hypercalls.c: */
-void do_hypercalls(struct lg_cpu *cpu);
-void write_timestamp(struct lg_cpu *cpu);
-
-/*L:035
- * Let's step aside for the moment, to study one important routine that's used
- * widely in the Host code.
- *
- * There are many cases where the Guest can do something invalid, like pass crap
- * to a hypercall.  Since only the Guest kernel can make hypercalls, it's quite
- * acceptable to simply terminate the Guest and give the Launcher a nicely
- * formatted reason.  It's also simpler for the Guest itself, which doesn't
- * need to check most hypercalls for "success"; if you're still running, it
- * succeeded.
- *
- * Once this is called, the Guest will never run again, so most Host code can
- * call this then continue as if nothing had happened.  This means many
- * functions don't have to explicitly return an error code, which keeps the
- * code simple.
- *
- * It also means that this can be called more than once: only the first one is
- * remembered.  The only trick is that we still need to kill the Guest even if
- * we can't allocate memory to store the reason.  Linux has a neat way of
- * packing error codes into invalid pointers, so we use that here.
- *
- * Like any macro which uses an "if", it is safely wrapped in a run-once "do {
- * } while(0)".
- */
-#define kill_guest(cpu, fmt...)					\
-do {								\
-	if (!(cpu)->lg->dead) {					\
-		(cpu)->lg->dead = kasprintf(GFP_ATOMIC, fmt);	\
-		if (!(cpu)->lg->dead)				\
-			(cpu)->lg->dead = ERR_PTR(-ENOMEM);	\
-	}							\
-} while(0)
-/* (End of aside) :*/
-
-#endif	/* __ASSEMBLY__ */
-#endif	/* _LGUEST_H */
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c
deleted file mode 100644
index 1a6787bc9386..000000000000
--- a/drivers/lguest/lguest_user.c
+++ /dev/null
@@ -1,446 +0,0 @@
-/*P:200 This contains all the /dev/lguest code, whereby the userspace
- * launcher controls and communicates with the Guest.  For example,
- * the first write will tell us the Guest's memory layout and entry
- * point.  A read will run the Guest until something happens, such as
- * a signal or the Guest accessing a device.
-:*/
-#include <linux/uaccess.h>
-#include <linux/miscdevice.h>
-#include <linux/fs.h>
-#include <linux/sched.h>
-#include <linux/sched/mm.h>
-#include <linux/file.h>
-#include <linux/slab.h>
-#include <linux/export.h>
-#include "lg.h"
-
-/*L:052
-  The Launcher can get the registers, and also set some of them.
-*/
-static int getreg_setup(struct lg_cpu *cpu, const unsigned long __user *input)
-{
-	unsigned long which;
-
-	/* We re-use the ptrace structure to specify which register to read. */
-	if (get_user(which, input) != 0)
-		return -EFAULT;
-
-	/*
-	 * We set up the cpu register pointer, and their next read will
-	 * actually get the value (instead of running the guest).
-	 *
-	 * The last argument 'true' says we can access any register.
-	 */
-	cpu->reg_read = lguest_arch_regptr(cpu, which, true);
-	if (!cpu->reg_read)
-		return -ENOENT;
-
-	/* And because this is a write() call, we return the length used. */
-	return sizeof(unsigned long) * 2;
-}
-
-static int setreg(struct lg_cpu *cpu, const unsigned long __user *input)
-{
-	unsigned long which, value, *reg;
-
-	/* We re-use the ptrace structure to specify which register to read. */
-	if (get_user(which, input) != 0)
-		return -EFAULT;
-	input++;
-	if (get_user(value, input) != 0)
-		return -EFAULT;
-
-	/* The last argument 'false' means we can't access all registers. */
-	reg = lguest_arch_regptr(cpu, which, false);
-	if (!reg)
-		return -ENOENT;
-
-	*reg = value;
-
-	/* And because this is a write() call, we return the length used. */
-	return sizeof(unsigned long) * 3;
-}
-
-/*L:050
- * Sending an interrupt is done by writing LHREQ_IRQ and an interrupt
- * number to /dev/lguest.
- */
-static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input)
-{
-	unsigned long irq;
-
-	if (get_user(irq, input) != 0)
-		return -EFAULT;
-	if (irq >= LGUEST_IRQS)
-		return -EINVAL;
-
-	/*
-	 * Next time the Guest runs, the core code will see if it can deliver
-	 * this interrupt.
-	 */
-	set_interrupt(cpu, irq);
-	return 0;
-}
-
-/*L:053
- * Deliver a trap: this is used by the Launcher if it can't emulate
- * an instruction.
- */
-static int trap(struct lg_cpu *cpu, const unsigned long __user *input)
-{
-	unsigned long trapnum;
-
-	if (get_user(trapnum, input) != 0)
-		return -EFAULT;
-
-	if (!deliver_trap(cpu, trapnum))
-		return -EINVAL;
-
-	return 0;
-}
-
-/*L:040
- * Once our Guest is initialized, the Launcher makes it run by reading
- * from /dev/lguest.
- */
-static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o)
-{
-	struct lguest *lg = file->private_data;
-	struct lg_cpu *cpu;
-	unsigned int cpu_id = *o;
-
-	/* You must write LHREQ_INITIALIZE first! */
-	if (!lg)
-		return -EINVAL;
-
-	/* Watch out for arbitrary vcpu indexes! */
-	if (cpu_id >= lg->nr_cpus)
-		return -EINVAL;
-
-	cpu = &lg->cpus[cpu_id];
-
-	/* If you're not the task which owns the Guest, go away. */
-	if (current != cpu->tsk)
-		return -EPERM;
-
-	/* If the Guest is already dead, we indicate why */
-	if (lg->dead) {
-		size_t len;
-
-		/* lg->dead either contains an error code, or a string. */
-		if (IS_ERR(lg->dead))
-			return PTR_ERR(lg->dead);
-
-		/* We can only return as much as the buffer they read with. */
-		len = min(size, strlen(lg->dead)+1);
-		if (copy_to_user(user, lg->dead, len) != 0)
-			return -EFAULT;
-		return len;
-	}
-
-	/*
-	 * If we returned from read() last time because the Guest sent I/O,
-	 * clear the flag.
-	 */
-	if (cpu->pending.trap)
-		cpu->pending.trap = 0;
-
-	/* Run the Guest until something interesting happens. */
-	return run_guest(cpu, (unsigned long __user *)user);
-}
-
-/*L:025
- * This actually initializes a CPU.  For the moment, a Guest is only
- * uniprocessor, so "id" is always 0.
- */
-static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip)
-{
-	/* We have a limited number of CPUs in the lguest struct. */
-	if (id >= ARRAY_SIZE(cpu->lg->cpus))
-		return -EINVAL;
-
-	/* Set up this CPU's id, and pointer back to the lguest struct. */
-	cpu->id = id;
-	cpu->lg = container_of(cpu, struct lguest, cpus[id]);
-	cpu->lg->nr_cpus++;
-
-	/* Each CPU has a timer it can set. */
-	init_clockdev(cpu);
-
-	/*
-	 * We need a complete page for the Guest registers: they are accessible
-	 * to the Guest and we can only grant it access to whole pages.
-	 */
-	cpu->regs_page = get_zeroed_page(GFP_KERNEL);
-	if (!cpu->regs_page)
-		return -ENOMEM;
-
-	/* We actually put the registers at the end of the page. */
-	cpu->regs = (void *)cpu->regs_page + PAGE_SIZE - sizeof(*cpu->regs);
-
-	/*
-	 * Now we initialize the Guest's registers, handing it the start
-	 * address.
-	 */
-	lguest_arch_setup_regs(cpu, start_ip);
-
-	/*
-	 * We keep a pointer to the Launcher task (ie. current task) for when
-	 * other Guests want to wake this one (eg. console input).
-	 */
-	cpu->tsk = current;
-
-	/*
-	 * We need to keep a pointer to the Launcher's memory map, because if
-	 * the Launcher dies we need to clean it up.  If we don't keep a
-	 * reference, it is destroyed before close() is called.
-	 */
-	cpu->mm = get_task_mm(cpu->tsk);
-
-	/*
-	 * We remember which CPU's pages this Guest used last, for optimization
-	 * when the same Guest runs on the same CPU twice.
-	 */
-	cpu->last_pages = NULL;
-
-	/* No error == success. */
-	return 0;
-}
-
-/*L:020
- * The initialization write supplies 3 pointer sized (32 or 64 bit) values (in
- * addition to the LHREQ_INITIALIZE value).  These are:
- *
- * base: The start of the Guest-physical memory inside the Launcher memory.
- *
- * pfnlimit: The highest (Guest-physical) page number the Guest should be
- * allowed to access.  The Guest memory lives inside the Launcher, so it sets
- * this to ensure the Guest can only reach its own memory.
- *
- * start: The first instruction to execute ("eip" in x86-speak).
- */
-static int initialize(struct file *file, const unsigned long __user *input)
-{
-	/* "struct lguest" contains all we (the Host) know about a Guest. */
-	struct lguest *lg;
-	int err;
-	unsigned long args[4];
-
-	/*
-	 * We grab the Big Lguest lock, which protects against multiple
-	 * simultaneous initializations.
-	 */
-	mutex_lock(&lguest_lock);
-	/* You can't initialize twice!  Close the device and start again... */
-	if (file->private_data) {
-		err = -EBUSY;
-		goto unlock;
-	}
-
-	if (copy_from_user(args, input, sizeof(args)) != 0) {
-		err = -EFAULT;
-		goto unlock;
-	}
-
-	lg = kzalloc(sizeof(*lg), GFP_KERNEL);
-	if (!lg) {
-		err = -ENOMEM;
-		goto unlock;
-	}
-
-	/* Populate the easy fields of our "struct lguest" */
-	lg->mem_base = (void __user *)args[0];
-	lg->pfn_limit = args[1];
-	lg->device_limit = args[3];
-
-	/* This is the first cpu (cpu 0) and it will start booting at args[2] */
-	err = lg_cpu_start(&lg->cpus[0], 0, args[2]);
-	if (err)
-		goto free_lg;
-
-	/*
-	 * Initialize the Guest's shadow page tables.  This allocates
-	 * memory, so can fail.
-	 */
-	err = init_guest_pagetable(lg);
-	if (err)
-		goto free_regs;
-
-	/* We keep our "struct lguest" in the file's private_data. */
-	file->private_data = lg;
-
-	mutex_unlock(&lguest_lock);
-
-	/* And because this is a write() call, we return the length used. */
-	return sizeof(args);
-
-free_regs:
-	/* FIXME: This should be in free_vcpu */
-	free_page(lg->cpus[0].regs_page);
-free_lg:
-	kfree(lg);
-unlock:
-	mutex_unlock(&lguest_lock);
-	return err;
-}
-
-/*L:010
- * The first operation the Launcher does must be a write.  All writes
- * start with an unsigned long number: for the first write this must be
- * LHREQ_INITIALIZE to set up the Guest.  After that the Launcher can use
- * writes of other values to send interrupts or set up receipt of notifications.
- *
- * Note that we overload the "offset" in the /dev/lguest file to indicate what
- * CPU number we're dealing with.  Currently this is always 0 since we only
- * support uniprocessor Guests, but you can see the beginnings of SMP support
- * here.
- */
-static ssize_t write(struct file *file, const char __user *in,
-		     size_t size, loff_t *off)
-{
-	/*
-	 * Once the Guest is initialized, we hold the "struct lguest" in the
-	 * file private data.
-	 */
-	struct lguest *lg = file->private_data;
-	const unsigned long __user *input = (const unsigned long __user *)in;
-	unsigned long req;
-	struct lg_cpu *uninitialized_var(cpu);
-	unsigned int cpu_id = *off;
-
-	/* The first value tells us what this request is. */
-	if (get_user(req, input) != 0)
-		return -EFAULT;
-	input++;
-
-	/* If you haven't initialized, you must do that first. */
-	if (req != LHREQ_INITIALIZE) {
-		if (!lg || (cpu_id >= lg->nr_cpus))
-			return -EINVAL;
-		cpu = &lg->cpus[cpu_id];
-
-		/* Once the Guest is dead, you can only read() why it died. */
-		if (lg->dead)
-			return -ENOENT;
-	}
-
-	switch (req) {
-	case LHREQ_INITIALIZE:
-		return initialize(file, input);
-	case LHREQ_IRQ:
-		return user_send_irq(cpu, input);
-	case LHREQ_GETREG:
-		return getreg_setup(cpu, input);
-	case LHREQ_SETREG:
-		return setreg(cpu, input);
-	case LHREQ_TRAP:
-		return trap(cpu, input);
-	default:
-		return -EINVAL;
-	}
-}
-
-static int open(struct inode *inode, struct file *file)
-{
-	file->private_data = NULL;
-
-	return 0;
-}
-
-/*L:060
- * The final piece of interface code is the close() routine.  It reverses
- * everything done in initialize().  This is usually called because the
- * Launcher exited.
- *
- * Note that the close routine returns 0 or a negative error number: it can't
- * really fail, but it can whine.  I blame Sun for this wart, and K&R C for
- * letting them do it.
-:*/
-static int close(struct inode *inode, struct file *file)
-{
-	struct lguest *lg = file->private_data;
-	unsigned int i;
-
-	/* If we never successfully initialized, there's nothing to clean up */
-	if (!lg)
-		return 0;
-
-	/*
-	 * We need the big lock, to protect from inter-guest I/O and other
-	 * Launchers initializing guests.
-	 */
-	mutex_lock(&lguest_lock);
-
-	/* Free up the shadow page tables for the Guest. */
-	free_guest_pagetable(lg);
-
-	for (i = 0; i < lg->nr_cpus; i++) {
-		/* Cancels the hrtimer set via LHCALL_SET_CLOCKEVENT. */
-		hrtimer_cancel(&lg->cpus[i].hrt);
-		/* We can free up the register page we allocated. */
-		free_page(lg->cpus[i].regs_page);
-		/*
-		 * Now all the memory cleanups are done, it's safe to release
-		 * the Launcher's memory management structure.
-		 */
-		mmput(lg->cpus[i].mm);
-	}
-
-	/*
-	 * If lg->dead doesn't contain an error code it will be NULL or a
-	 * kmalloc()ed string, either of which is ok to hand to kfree().
-	 */
-	if (!IS_ERR(lg->dead))
-		kfree(lg->dead);
-	/* Free the memory allocated to the lguest_struct */
-	kfree(lg);
-	/* Release lock and exit. */
-	mutex_unlock(&lguest_lock);
-
-	return 0;
-}
-
-/*L:000
- * Welcome to our journey through the Launcher!
- *
- * The Launcher is the Host userspace program which sets up, runs and services
- * the Guest.  In fact, many comments in the Drivers which refer to "the Host"
- * doing things are inaccurate: the Launcher does all the device handling for
- * the Guest, but the Guest can't know that.
- *
- * Just to confuse you: to the Host kernel, the Launcher *is* the Guest and we
- * shall see more of that later.
- *
- * We begin our understanding with the Host kernel interface which the Launcher
- * uses: reading and writing a character device called /dev/lguest.  All the
- * work happens in the read(), write() and close() routines:
- */
-static const struct file_operations lguest_fops = {
-	.owner	 = THIS_MODULE,
-	.open	 = open,
-	.release = close,
-	.write	 = write,
-	.read	 = read,
-	.llseek  = default_llseek,
-};
-/*:*/
-
-/*
- * This is a textbook example of a "misc" character device.  Populate a "struct
- * miscdevice" and register it with misc_register().
- */
-static struct miscdevice lguest_dev = {
-	.minor	= MISC_DYNAMIC_MINOR,
-	.name	= "lguest",
-	.fops	= &lguest_fops,
-};
-
-int __init lguest_device_init(void)
-{
-	return misc_register(&lguest_dev);
-}
-
-void __exit lguest_device_remove(void)
-{
-	misc_deregister(&lguest_dev);
-}
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
deleted file mode 100644
index 0bc127e9f16a..000000000000
--- a/drivers/lguest/page_tables.c
+++ /dev/null
@@ -1,1239 +0,0 @@
-/*P:700
- * The pagetable code, on the other hand, still shows the scars of
- * previous encounters.  It's functional, and as neat as it can be in the
- * circumstances, but be wary, for these things are subtle and break easily.
- * The Guest provides a virtual to physical mapping, but we can neither trust
- * it nor use it: we verify and convert it here then point the CPU to the
- * converted Guest pages when running the Guest.
-:*/
-
-/* Copyright (C) Rusty Russell IBM Corporation 2013.
- * GPL v2 and any later version */
-#include <linux/mm.h>
-#include <linux/gfp.h>
-#include <linux/types.h>
-#include <linux/spinlock.h>
-#include <linux/random.h>
-#include <linux/percpu.h>
-#include <asm/tlbflush.h>
-#include <linux/uaccess.h>
-#include "lg.h"
-
-/*M:008
- * We hold reference to pages, which prevents them from being swapped.
- * It'd be nice to have a callback in the "struct mm_struct" when Linux wants
- * to swap out.  If we had this, and a shrinker callback to trim PTE pages, we
- * could probably consider launching Guests as non-root.
-:*/
-
-/*H:300
- * The Page Table Code
- *
- * We use two-level page tables for the Guest, or three-level with PAE.  If
- * you're not entirely comfortable with virtual addresses, physical addresses
- * and page tables then I recommend you review arch/x86/lguest/boot.c's "Page
- * Table Handling" (with diagrams!).
- *
- * The Guest keeps page tables, but we maintain the actual ones here: these are
- * called "shadow" page tables.  Which is a very Guest-centric name: these are
- * the real page tables the CPU uses, although we keep them up to date to
- * reflect the Guest's.  (See what I mean about weird naming?  Since when do
- * shadows reflect anything?)
- *
- * Anyway, this is the most complicated part of the Host code.  There are seven
- * parts to this:
- *  (i) Looking up a page table entry when the Guest faults,
- *  (ii) Making sure the Guest stack is mapped,
- *  (iii) Setting up a page table entry when the Guest tells us one has changed,
- *  (iv) Switching page tables,
- *  (v) Flushing (throwing away) page tables,
- *  (vi) Mapping the Switcher when the Guest is about to run,
- *  (vii) Setting up the page tables initially.
-:*/
-
-/*
- * The Switcher uses the complete top PTE page.  That's 1024 PTE entries (4MB)
- * or 512 PTE entries with PAE (2MB).
- */
-#define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1)
-
-/*
- * For PAE we need the PMD index as well. We use the last 2MB, so we
- * will need the last pmd entry of the last pmd page.
- */
-#ifdef CONFIG_X86_PAE
-#define CHECK_GPGD_MASK		_PAGE_PRESENT
-#else
-#define CHECK_GPGD_MASK		_PAGE_TABLE
-#endif
-
-/*H:320
- * The page table code is curly enough to need helper functions to keep it
- * clear and clean.  The kernel itself provides many of them; one advantage
- * of insisting that the Guest and Host use the same CONFIG_X86_PAE setting.
- *
- * There are two functions which return pointers to the shadow (aka "real")
- * page tables.
- *
- * spgd_addr() takes the virtual address and returns a pointer to the top-level
- * page directory entry (PGD) for that address.  Since we keep track of several
- * page tables, the "i" argument tells us which one we're interested in (it's
- * usually the current one).
- */
-static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr)
-{
-	unsigned int index = pgd_index(vaddr);
-
-	/* Return a pointer index'th pgd entry for the i'th page table. */
-	return &cpu->lg->pgdirs[i].pgdir[index];
-}
-
-#ifdef CONFIG_X86_PAE
-/*
- * This routine then takes the PGD entry given above, which contains the
- * address of the PMD page.  It then returns a pointer to the PMD entry for the
- * given address.
- */
-static pmd_t *spmd_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr)
-{
-	unsigned int index = pmd_index(vaddr);
-	pmd_t *page;
-
-	/* You should never call this if the PGD entry wasn't valid */
-	BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT));
-	page = __va(pgd_pfn(spgd) << PAGE_SHIFT);
-
-	return &page[index];
-}
-#endif
-
-/*
- * This routine then takes the page directory entry returned above, which
- * contains the address of the page table entry (PTE) page.  It then returns a
- * pointer to the PTE entry for the given address.
- */
-static pte_t *spte_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr)
-{
-#ifdef CONFIG_X86_PAE
-	pmd_t *pmd = spmd_addr(cpu, spgd, vaddr);
-	pte_t *page = __va(pmd_pfn(*pmd) << PAGE_SHIFT);
-
-	/* You should never call this if the PMD entry wasn't valid */
-	BUG_ON(!(pmd_flags(*pmd) & _PAGE_PRESENT));
-#else
-	pte_t *page = __va(pgd_pfn(spgd) << PAGE_SHIFT);
-	/* You should never call this if the PGD entry wasn't valid */
-	BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT));
-#endif
-
-	return &page[pte_index(vaddr)];
-}
-
-/*
- * These functions are just like the above, except they access the Guest
- * page tables.  Hence they return a Guest address.
- */
-static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr)
-{
-	unsigned int index = vaddr >> (PGDIR_SHIFT);
-	return cpu->lg->pgdirs[cpu->cpu_pgd].gpgdir + index * sizeof(pgd_t);
-}
-
-#ifdef CONFIG_X86_PAE
-/* Follow the PGD to the PMD. */
-static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr)
-{
-	unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT;
-	BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT));
-	return gpage + pmd_index(vaddr) * sizeof(pmd_t);
-}
-
-/* Follow the PMD to the PTE. */
-static unsigned long gpte_addr(struct lg_cpu *cpu,
-			       pmd_t gpmd, unsigned long vaddr)
-{
-	unsigned long gpage = pmd_pfn(gpmd) << PAGE_SHIFT;
-
-	BUG_ON(!(pmd_flags(gpmd) & _PAGE_PRESENT));
-	return gpage + pte_index(vaddr) * sizeof(pte_t);
-}
-#else
-/* Follow the PGD to the PTE (no mid-level for !PAE). */
-static unsigned long gpte_addr(struct lg_cpu *cpu,
-				pgd_t gpgd, unsigned long vaddr)
-{
-	unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT;
-
-	BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT));
-	return gpage + pte_index(vaddr) * sizeof(pte_t);
-}
-#endif
-/*:*/
-
-/*M:007
- * get_pfn is slow: we could probably try to grab batches of pages here as
- * an optimization (ie. pre-faulting).
-:*/
-
-/*H:350
- * This routine takes a page number given by the Guest and converts it to
- * an actual, physical page number.  It can fail for several reasons: the
- * virtual address might not be mapped by the Launcher, the write flag is set
- * and the page is read-only, or the write flag was set and the page was
- * shared so had to be copied, but we ran out of memory.
- *
- * This holds a reference to the page, so release_pte() is careful to put that
- * back.
- */
-static unsigned long get_pfn(unsigned long virtpfn, int write)
-{
-	struct page *page;
-
-	/* gup me one page at this address please! */
-	if (get_user_pages_fast(virtpfn << PAGE_SHIFT, 1, write, &page) == 1)
-		return page_to_pfn(page);
-
-	/* This value indicates failure. */
-	return -1UL;
-}
-
-/*H:340
- * Converting a Guest page table entry to a shadow (ie. real) page table
- * entry can be a little tricky.  The flags are (almost) the same, but the
- * Guest PTE contains a virtual page number: the CPU needs the real page
- * number.
- */
-static pte_t gpte_to_spte(struct lg_cpu *cpu, pte_t gpte, int write)
-{
-	unsigned long pfn, base, flags;
-
-	/*
-	 * The Guest sets the global flag, because it thinks that it is using
-	 * PGE.  We only told it to use PGE so it would tell us whether it was
-	 * flushing a kernel mapping or a userspace mapping.  We don't actually
-	 * use the global bit, so throw it away.
-	 */
-	flags = (pte_flags(gpte) & ~_PAGE_GLOBAL);
-
-	/* The Guest's pages are offset inside the Launcher. */
-	base = (unsigned long)cpu->lg->mem_base / PAGE_SIZE;
-
-	/*
-	 * We need a temporary "unsigned long" variable to hold the answer from
-	 * get_pfn(), because it returns 0xFFFFFFFF on failure, which wouldn't
-	 * fit in spte.pfn.  get_pfn() finds the real physical number of the
-	 * page, given the virtual number.
-	 */
-	pfn = get_pfn(base + pte_pfn(gpte), write);
-	if (pfn == -1UL) {
-		kill_guest(cpu, "failed to get page %lu", pte_pfn(gpte));
-		/*
-		 * When we destroy the Guest, we'll go through the shadow page
-		 * tables and release_pte() them.  Make sure we don't think
-		 * this one is valid!
-		 */
-		flags = 0;
-	}
-	/* Now we assemble our shadow PTE from the page number and flags. */
-	return pfn_pte(pfn, __pgprot(flags));
-}
-
-/*H:460 And to complete the chain, release_pte() looks like this: */
-static void release_pte(pte_t pte)
-{
-	/*
-	 * Remember that get_user_pages_fast() took a reference to the page, in
-	 * get_pfn()?  We have to put it back now.
-	 */
-	if (pte_flags(pte) & _PAGE_PRESENT)
-		put_page(pte_page(pte));
-}
-/*:*/
-
-static bool gpte_in_iomem(struct lg_cpu *cpu, pte_t gpte)
-{
-	/* We don't handle large pages. */
-	if (pte_flags(gpte) & _PAGE_PSE)
-		return false;
-
-	return (pte_pfn(gpte) >= cpu->lg->pfn_limit
-		&& pte_pfn(gpte) < cpu->lg->device_limit);
-}
-
-static bool check_gpte(struct lg_cpu *cpu, pte_t gpte)
-{
-	if ((pte_flags(gpte) & _PAGE_PSE) ||
-	    pte_pfn(gpte) >= cpu->lg->pfn_limit) {
-		kill_guest(cpu, "bad page table entry");
-		return false;
-	}
-	return true;
-}
-
-static bool check_gpgd(struct lg_cpu *cpu, pgd_t gpgd)
-{
-	if ((pgd_flags(gpgd) & ~CHECK_GPGD_MASK) ||
-	    (pgd_pfn(gpgd) >= cpu->lg->pfn_limit)) {
-		kill_guest(cpu, "bad page directory entry");
-		return false;
-	}
-	return true;
-}
-
-#ifdef CONFIG_X86_PAE
-static bool check_gpmd(struct lg_cpu *cpu, pmd_t gpmd)
-{
-	if ((pmd_flags(gpmd) & ~_PAGE_TABLE) ||
-	    (pmd_pfn(gpmd) >= cpu->lg->pfn_limit)) {
-		kill_guest(cpu, "bad page middle directory entry");
-		return false;
-	}
-	return true;
-}
-#endif
-
-/*H:331
- * This is the core routine to walk the shadow page tables and find the page
- * table entry for a specific address.
- *
- * If allocate is set, then we allocate any missing levels, setting the flags
- * on the new page directory and mid-level directories using the arguments
- * (which are copied from the Guest's page table entries).
- */
-static pte_t *find_spte(struct lg_cpu *cpu, unsigned long vaddr, bool allocate,
-			int pgd_flags, int pmd_flags)
-{
-	pgd_t *spgd;
-	/* Mid level for PAE. */
-#ifdef CONFIG_X86_PAE
-	pmd_t *spmd;
-#endif
-
-	/* Get top level entry. */
-	spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr);
-	if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) {
-		/* No shadow entry: allocate a new shadow PTE page. */
-		unsigned long ptepage;
-
-		/* If they didn't want us to allocate anything, stop. */
-		if (!allocate)
-			return NULL;
-
-		ptepage = get_zeroed_page(GFP_KERNEL);
-		/*
-		 * This is not really the Guest's fault, but killing it is
-		 * simple for this corner case.
-		 */
-		if (!ptepage) {
-			kill_guest(cpu, "out of memory allocating pte page");
-			return NULL;
-		}
-		/*
-		 * And we copy the flags to the shadow PGD entry.  The page
-		 * number in the shadow PGD is the page we just allocated.
-		 */
-		set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags));
-	}
-
-	/*
-	 * Intel's Physical Address Extension actually uses three levels of
-	 * page tables, so we need to look in the mid-level.
-	 */
-#ifdef CONFIG_X86_PAE
-	/* Now look at the mid-level shadow entry. */
-	spmd = spmd_addr(cpu, *spgd, vaddr);
-
-	if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) {
-		/* No shadow entry: allocate a new shadow PTE page. */
-		unsigned long ptepage;
-
-		/* If they didn't want us to allocate anything, stop. */
-		if (!allocate)
-			return NULL;
-
-		ptepage = get_zeroed_page(GFP_KERNEL);
-
-		/*
-		 * This is not really the Guest's fault, but killing it is
-		 * simple for this corner case.
-		 */
-		if (!ptepage) {
-			kill_guest(cpu, "out of memory allocating pmd page");
-			return NULL;
-		}
-
-		/*
-		 * And we copy the flags to the shadow PMD entry.  The page
-		 * number in the shadow PMD is the page we just allocated.
-		 */
-		set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags));
-	}
-#endif
-
-	/* Get the pointer to the shadow PTE entry we're going to set. */
-	return spte_addr(cpu, *spgd, vaddr);
-}
-
-/*H:330
- * (i) Looking up a page table entry when the Guest faults.
- *
- * We saw this call in run_guest(): when we see a page fault in the Guest, we
- * come here.  That's because we only set up the shadow page tables lazily as
- * they're needed, so we get page faults all the time and quietly fix them up
- * and return to the Guest without it knowing.
- *
- * If we fixed up the fault (ie. we mapped the address), this routine returns
- * true.  Otherwise, it was a real fault and we need to tell the Guest.
- *
- * There's a corner case: they're trying to access memory between
- * pfn_limit and device_limit, which is I/O memory.  In this case, we
- * return false and set @iomem to the physical address, so the the
- * Launcher can handle the instruction manually.
- */
-bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode,
-		 unsigned long *iomem)
-{
-	unsigned long gpte_ptr;
-	pte_t gpte;
-	pte_t *spte;
-	pmd_t gpmd;
-	pgd_t gpgd;
-
-	*iomem = 0;
-
-	/* We never demand page the Switcher, so trying is a mistake. */
-	if (vaddr >= switcher_addr)
-		return false;
-
-	/* First step: get the top-level Guest page table entry. */
-	if (unlikely(cpu->linear_pages)) {
-		/* Faking up a linear mapping. */
-		gpgd = __pgd(CHECK_GPGD_MASK);
-	} else {
-		gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
-		/* Toplevel not present?  We can't map it in. */
-		if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
-			return false;
-
-		/* 
-		 * This kills the Guest if it has weird flags or tries to
-		 * refer to a "physical" address outside the bounds.
-		 */
-		if (!check_gpgd(cpu, gpgd))
-			return false;
-	}
-
-	/* This "mid-level" entry is only used for non-linear, PAE mode. */
-	gpmd = __pmd(_PAGE_TABLE);
-
-#ifdef CONFIG_X86_PAE
-	if (likely(!cpu->linear_pages)) {
-		gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
-		/* Middle level not present?  We can't map it in. */
-		if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
-			return false;
-
-		/* 
-		 * This kills the Guest if it has weird flags or tries to
-		 * refer to a "physical" address outside the bounds.
-		 */
-		if (!check_gpmd(cpu, gpmd))
-			return false;
-	}
-
-	/*
-	 * OK, now we look at the lower level in the Guest page table: keep its
-	 * address, because we might update it later.
-	 */
-	gpte_ptr = gpte_addr(cpu, gpmd, vaddr);
-#else
-	/*
-	 * OK, now we look at the lower level in the Guest page table: keep its
-	 * address, because we might update it later.
-	 */
-	gpte_ptr = gpte_addr(cpu, gpgd, vaddr);
-#endif
-
-	if (unlikely(cpu->linear_pages)) {
-		/* Linear?  Make up a PTE which points to same page. */
-		gpte = __pte((vaddr & PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT);
-	} else {
-		/* Read the actual PTE value. */
-		gpte = lgread(cpu, gpte_ptr, pte_t);
-	}
-
-	/* If this page isn't in the Guest page tables, we can't page it in. */
-	if (!(pte_flags(gpte) & _PAGE_PRESENT))
-		return false;
-
-	/*
-	 * Check they're not trying to write to a page the Guest wants
-	 * read-only (bit 2 of errcode == write).
-	 */
-	if ((errcode & 2) && !(pte_flags(gpte) & _PAGE_RW))
-		return false;
-
-	/* User access to a kernel-only page? (bit 3 == user access) */
-	if ((errcode & 4) && !(pte_flags(gpte) & _PAGE_USER))
-		return false;
-
-	/* If they're accessing io memory, we expect a fault. */
-	if (gpte_in_iomem(cpu, gpte)) {
-		*iomem = (pte_pfn(gpte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK);
-		return false;
-	}
-
-	/*
-	 * Check that the Guest PTE flags are OK, and the page number is below
-	 * the pfn_limit (ie. not mapping the Launcher binary).
-	 */
-	if (!check_gpte(cpu, gpte))
-		return false;
-
-	/* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */
-	gpte = pte_mkyoung(gpte);
-	if (errcode & 2)
-		gpte = pte_mkdirty(gpte);
-
-	/* Get the pointer to the shadow PTE entry we're going to set. */
-	spte = find_spte(cpu, vaddr, true, pgd_flags(gpgd), pmd_flags(gpmd));
-	if (!spte)
-		return false;
-
-	/*
-	 * If there was a valid shadow PTE entry here before, we release it.
-	 * This can happen with a write to a previously read-only entry.
-	 */
-	release_pte(*spte);
-
-	/*
-	 * If this is a write, we insist that the Guest page is writable (the
-	 * final arg to gpte_to_spte()).
-	 */
-	if (pte_dirty(gpte))
-		*spte = gpte_to_spte(cpu, gpte, 1);
-	else
-		/*
-		 * If this is a read, don't set the "writable" bit in the page
-		 * table entry, even if the Guest says it's writable.  That way
-		 * we will come back here when a write does actually occur, so
-		 * we can update the Guest's _PAGE_DIRTY flag.
-		 */
-		set_pte(spte, gpte_to_spte(cpu, pte_wrprotect(gpte), 0));
-
-	/*
-	 * Finally, we write the Guest PTE entry back: we've set the
-	 * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags.
-	 */
-	if (likely(!cpu->linear_pages))
-		lgwrite(cpu, gpte_ptr, pte_t, gpte);
-
-	/*
-	 * The fault is fixed, the page table is populated, the mapping
-	 * manipulated, the result returned and the code complete.  A small
-	 * delay and a trace of alliteration are the only indications the Guest
-	 * has that a page fault occurred at all.
-	 */
-	return true;
-}
-
-/*H:360
- * (ii) Making sure the Guest stack is mapped.
- *
- * Remember that direct traps into the Guest need a mapped Guest kernel stack.
- * pin_stack_pages() calls us here: we could simply call demand_page(), but as
- * we've seen that logic is quite long, and usually the stack pages are already
- * mapped, so it's overkill.
- *
- * This is a quick version which answers the question: is this virtual address
- * mapped by the shadow page tables, and is it writable?
- */
-static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr)
-{
-	pte_t *spte;
-	unsigned long flags;
-
-	/* You can't put your stack in the Switcher! */
-	if (vaddr >= switcher_addr)
-		return false;
-
-	/* If there's no shadow PTE, it's not writable. */
-	spte = find_spte(cpu, vaddr, false, 0, 0);
-	if (!spte)
-		return false;
-
-	/*
-	 * Check the flags on the pte entry itself: it must be present and
-	 * writable.
-	 */
-	flags = pte_flags(*spte);
-	return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW);
-}
-
-/*
- * So, when pin_stack_pages() asks us to pin a page, we check if it's already
- * in the page tables, and if not, we call demand_page() with error code 2
- * (meaning "write").
- */
-void pin_page(struct lg_cpu *cpu, unsigned long vaddr)
-{
-	unsigned long iomem;
-
-	if (!page_writable(cpu, vaddr) && !demand_page(cpu, vaddr, 2, &iomem))
-		kill_guest(cpu, "bad stack page %#lx", vaddr);
-}
-/*:*/
-
-#ifdef CONFIG_X86_PAE
-static void release_pmd(pmd_t *spmd)
-{
-	/* If the entry's not present, there's nothing to release. */
-	if (pmd_flags(*spmd) & _PAGE_PRESENT) {
-		unsigned int i;
-		pte_t *ptepage = __va(pmd_pfn(*spmd) << PAGE_SHIFT);
-		/* For each entry in the page, we might need to release it. */
-		for (i = 0; i < PTRS_PER_PTE; i++)
-			release_pte(ptepage[i]);
-		/* Now we can free the page of PTEs */
-		free_page((long)ptepage);
-		/* And zero out the PMD entry so we never release it twice. */
-		set_pmd(spmd, __pmd(0));
-	}
-}
-
-static void release_pgd(pgd_t *spgd)
-{
-	/* If the entry's not present, there's nothing to release. */
-	if (pgd_flags(*spgd) & _PAGE_PRESENT) {
-		unsigned int i;
-		pmd_t *pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT);
-
-		for (i = 0; i < PTRS_PER_PMD; i++)
-			release_pmd(&pmdpage[i]);
-
-		/* Now we can free the page of PMDs */
-		free_page((long)pmdpage);
-		/* And zero out the PGD entry so we never release it twice. */
-		set_pgd(spgd, __pgd(0));
-	}
-}
-
-#else /* !CONFIG_X86_PAE */
-/*H:450
- * If we chase down the release_pgd() code, the non-PAE version looks like
- * this.  The PAE version is almost identical, but instead of calling
- * release_pte it calls release_pmd(), which looks much like this.
- */
-static void release_pgd(pgd_t *spgd)
-{
-	/* If the entry's not present, there's nothing to release. */
-	if (pgd_flags(*spgd) & _PAGE_PRESENT) {
-		unsigned int i;
-		/*
-		 * Converting the pfn to find the actual PTE page is easy: turn
-		 * the page number into a physical address, then convert to a
-		 * virtual address (easy for kernel pages like this one).
-		 */
-		pte_t *ptepage = __va(pgd_pfn(*spgd) << PAGE_SHIFT);
-		/* For each entry in the page, we might need to release it. */
-		for (i = 0; i < PTRS_PER_PTE; i++)
-			release_pte(ptepage[i]);
-		/* Now we can free the page of PTEs */
-		free_page((long)ptepage);
-		/* And zero out the PGD entry so we never release it twice. */
-		*spgd = __pgd(0);
-	}
-}
-#endif
-
-/*H:445
- * We saw flush_user_mappings() twice: once from the flush_user_mappings()
- * hypercall and once in new_pgdir() when we re-used a top-level pgdir page.
- * It simply releases every PTE page from 0 up to the Guest's kernel address.
- */
-static void flush_user_mappings(struct lguest *lg, int idx)
-{
-	unsigned int i;
-	/* Release every pgd entry up to the kernel's address. */
-	for (i = 0; i < pgd_index(lg->kernel_address); i++)
-		release_pgd(lg->pgdirs[idx].pgdir + i);
-}
-
-/*H:440
- * (v) Flushing (throwing away) page tables,
- *
- * The Guest has a hypercall to throw away the page tables: it's used when a
- * large number of mappings have been changed.
- */
-void guest_pagetable_flush_user(struct lg_cpu *cpu)
-{
-	/* Drop the userspace part of the current page table. */
-	flush_user_mappings(cpu->lg, cpu->cpu_pgd);
-}
-/*:*/
-
-/* We walk down the guest page tables to get a guest-physical address */
-bool __guest_pa(struct lg_cpu *cpu, unsigned long vaddr, unsigned long *paddr)
-{
-	pgd_t gpgd;
-	pte_t gpte;
-#ifdef CONFIG_X86_PAE
-	pmd_t gpmd;
-#endif
-
-	/* Still not set up?  Just map 1:1. */
-	if (unlikely(cpu->linear_pages)) {
-		*paddr = vaddr;
-		return true;
-	}
-
-	/* First step: get the top-level Guest page table entry. */
-	gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
-	/* Toplevel not present?  We can't map it in. */
-	if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
-		goto fail;
-
-#ifdef CONFIG_X86_PAE
-	gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
-	if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
-		goto fail;
-	gpte = lgread(cpu, gpte_addr(cpu, gpmd, vaddr), pte_t);
-#else
-	gpte = lgread(cpu, gpte_addr(cpu, gpgd, vaddr), pte_t);
-#endif
-	if (!(pte_flags(gpte) & _PAGE_PRESENT))
-		goto fail;
-
-	*paddr = pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK);
-	return true;
-
-fail:
-	*paddr = -1UL;
-	return false;
-}
-
-/*
- * This is the version we normally use: kills the Guest if it uses a
- * bad address
- */
-unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr)
-{
-	unsigned long paddr;
-
-	if (!__guest_pa(cpu, vaddr, &paddr))
-		kill_guest(cpu, "Bad address %#lx", vaddr);
-	return paddr;
-}
-
-/*
- * We keep several page tables.  This is a simple routine to find the page
- * table (if any) corresponding to this top-level address the Guest has given
- * us.
- */
-static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable)
-{
-	unsigned int i;
-	for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
-		if (lg->pgdirs[i].pgdir && lg->pgdirs[i].gpgdir == pgtable)
-			break;
-	return i;
-}
-
-/*H:435
- * And this is us, creating the new page directory.  If we really do
- * allocate a new one (and so the kernel parts are not there), we set
- * blank_pgdir.
- */
-static unsigned int new_pgdir(struct lg_cpu *cpu,
-			      unsigned long gpgdir,
-			      int *blank_pgdir)
-{
-	unsigned int next;
-
-	/*
-	 * We pick one entry at random to throw out.  Choosing the Least
-	 * Recently Used might be better, but this is easy.
-	 */
-	next = prandom_u32() % ARRAY_SIZE(cpu->lg->pgdirs);
-	/* If it's never been allocated at all before, try now. */
-	if (!cpu->lg->pgdirs[next].pgdir) {
-		cpu->lg->pgdirs[next].pgdir =
-					(pgd_t *)get_zeroed_page(GFP_KERNEL);
-		/* If the allocation fails, just keep using the one we have */
-		if (!cpu->lg->pgdirs[next].pgdir)
-			next = cpu->cpu_pgd;
-		else {
-			/*
-			 * This is a blank page, so there are no kernel
-			 * mappings: caller must map the stack!
-			 */
-			*blank_pgdir = 1;
-		}
-	}
-	/* Record which Guest toplevel this shadows. */
-	cpu->lg->pgdirs[next].gpgdir = gpgdir;
-	/* Release all the non-kernel mappings. */
-	flush_user_mappings(cpu->lg, next);
-
-	/* This hasn't run on any CPU at all. */
-	cpu->lg->pgdirs[next].last_host_cpu = -1;
-
-	return next;
-}
-
-/*H:501
- * We do need the Switcher code mapped at all times, so we allocate that
- * part of the Guest page table here.  We map the Switcher code immediately,
- * but defer mapping of the guest register page and IDT/LDT etc page until
- * just before we run the guest in map_switcher_in_guest().
- *
- * We *could* do this setup in map_switcher_in_guest(), but at that point
- * we've interrupts disabled, and allocating pages like that is fraught: we
- * can't sleep if we need to free up some memory.
- */
-static bool allocate_switcher_mapping(struct lg_cpu *cpu)
-{
-	int i;
-
-	for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) {
-		pte_t *pte = find_spte(cpu, switcher_addr + i * PAGE_SIZE, true,
-				       CHECK_GPGD_MASK, _PAGE_TABLE);
-		if (!pte)
-			return false;
-
-		/*
-		 * Map the switcher page if not already there.  It might
-		 * already be there because we call allocate_switcher_mapping()
-		 * in guest_set_pgd() just in case it did discard our Switcher
-		 * mapping, but it probably didn't.
-		 */
-		if (i == 0 && !(pte_flags(*pte) & _PAGE_PRESENT)) {
-			/* Get a reference to the Switcher page. */
-			get_page(lg_switcher_pages[0]);
-			/* Create a read-only, exectuable, kernel-style PTE */
-			set_pte(pte,
-				mk_pte(lg_switcher_pages[0], PAGE_KERNEL_RX));
-		}
-	}
-	cpu->lg->pgdirs[cpu->cpu_pgd].switcher_mapped = true;
-	return true;
-}
-
-/*H:470
- * Finally, a routine which throws away everything: all PGD entries in all
- * the shadow page tables, including the Guest's kernel mappings.  This is used
- * when we destroy the Guest.
- */
-static void release_all_pagetables(struct lguest *lg)
-{
-	unsigned int i, j;
-
-	/* Every shadow pagetable this Guest has */
-	for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) {
-		if (!lg->pgdirs[i].pgdir)
-			continue;
-
-		/* Every PGD entry. */
-		for (j = 0; j < PTRS_PER_PGD; j++)
-			release_pgd(lg->pgdirs[i].pgdir + j);
-		lg->pgdirs[i].switcher_mapped = false;
-		lg->pgdirs[i].last_host_cpu = -1;
-	}
-}
-
-/*
- * We also throw away everything when a Guest tells us it's changed a kernel
- * mapping.  Since kernel mappings are in every page table, it's easiest to
- * throw them all away.  This traps the Guest in amber for a while as
- * everything faults back in, but it's rare.
- */
-void guest_pagetable_clear_all(struct lg_cpu *cpu)
-{
-	release_all_pagetables(cpu->lg);
-	/* We need the Guest kernel stack mapped again. */
-	pin_stack_pages(cpu);
-	/* And we need Switcher allocated. */
-	if (!allocate_switcher_mapping(cpu))
-		kill_guest(cpu, "Cannot populate switcher mapping");
-}
-
-/*H:430
- * (iv) Switching page tables
- *
- * Now we've seen all the page table setting and manipulation, let's see
- * what happens when the Guest changes page tables (ie. changes the top-level
- * pgdir).  This occurs on almost every context switch.
- */
-void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable)
-{
-	int newpgdir, repin = 0;
-
-	/*
-	 * The very first time they call this, we're actually running without
-	 * any page tables; we've been making it up.  Throw them away now.
-	 */
-	if (unlikely(cpu->linear_pages)) {
-		release_all_pagetables(cpu->lg);
-		cpu->linear_pages = false;
-		/* Force allocation of a new pgdir. */
-		newpgdir = ARRAY_SIZE(cpu->lg->pgdirs);
-	} else {
-		/* Look to see if we have this one already. */
-		newpgdir = find_pgdir(cpu->lg, pgtable);
-	}
-
-	/*
-	 * If not, we allocate or mug an existing one: if it's a fresh one,
-	 * repin gets set to 1.
-	 */
-	if (newpgdir == ARRAY_SIZE(cpu->lg->pgdirs))
-		newpgdir = new_pgdir(cpu, pgtable, &repin);
-	/* Change the current pgd index to the new one. */
-	cpu->cpu_pgd = newpgdir;
-	/*
-	 * If it was completely blank, we map in the Guest kernel stack and
-	 * the Switcher.
-	 */
-	if (repin)
-		pin_stack_pages(cpu);
-
-	if (!cpu->lg->pgdirs[cpu->cpu_pgd].switcher_mapped) {
-		if (!allocate_switcher_mapping(cpu))
-			kill_guest(cpu, "Cannot populate switcher mapping");
-	}
-}
-/*:*/
-
-/*M:009
- * Since we throw away all mappings when a kernel mapping changes, our
- * performance sucks for guests using highmem.  In fact, a guest with
- * PAGE_OFFSET 0xc0000000 (the default) and more than about 700MB of RAM is
- * usually slower than a Guest with less memory.
- *
- * This, of course, cannot be fixed.  It would take some kind of... well, I
- * don't know, but the term "puissant code-fu" comes to mind.
-:*/
-
-/*H:420
- * This is the routine which actually sets the page table entry for then
- * "idx"'th shadow page table.
- *
- * Normally, we can just throw out the old entry and replace it with 0: if they
- * use it demand_page() will put the new entry in.  We need to do this anyway:
- * The Guest expects _PAGE_ACCESSED to be set on its PTE the first time a page
- * is read from, and _PAGE_DIRTY when it's written to.
- *
- * But Avi Kivity pointed out that most Operating Systems (Linux included) set
- * these bits on PTEs immediately anyway.  This is done to save the CPU from
- * having to update them, but it helps us the same way: if they set
- * _PAGE_ACCESSED then we can put a read-only PTE entry in immediately, and if
- * they set _PAGE_DIRTY then we can put a writable PTE entry in immediately.
- */
-static void __guest_set_pte(struct lg_cpu *cpu, int idx,
-		       unsigned long vaddr, pte_t gpte)
-{
-	/* Look up the matching shadow page directory entry. */
-	pgd_t *spgd = spgd_addr(cpu, idx, vaddr);
-#ifdef CONFIG_X86_PAE
-	pmd_t *spmd;
-#endif
-
-	/* If the top level isn't present, there's no entry to update. */
-	if (pgd_flags(*spgd) & _PAGE_PRESENT) {
-#ifdef CONFIG_X86_PAE
-		spmd = spmd_addr(cpu, *spgd, vaddr);
-		if (pmd_flags(*spmd) & _PAGE_PRESENT) {
-#endif
-			/* Otherwise, start by releasing the existing entry. */
-			pte_t *spte = spte_addr(cpu, *spgd, vaddr);
-			release_pte(*spte);
-
-			/*
-			 * If they're setting this entry as dirty or accessed,
-			 * we might as well put that entry they've given us in
-			 * now.  This shaves 10% off a copy-on-write
-			 * micro-benchmark.
-			 */
-			if ((pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED))
-			    && !gpte_in_iomem(cpu, gpte)) {
-				if (!check_gpte(cpu, gpte))
-					return;
-				set_pte(spte,
-					gpte_to_spte(cpu, gpte,
-						pte_flags(gpte) & _PAGE_DIRTY));
-			} else {
-				/*
-				 * Otherwise kill it and we can demand_page()
-				 * it in later.
-				 */
-				set_pte(spte, __pte(0));
-			}
-#ifdef CONFIG_X86_PAE
-		}
-#endif
-	}
-}
-
-/*H:410
- * Updating a PTE entry is a little trickier.
- *
- * We keep track of several different page tables (the Guest uses one for each
- * process, so it makes sense to cache at least a few).  Each of these have
- * identical kernel parts: ie. every mapping above PAGE_OFFSET is the same for
- * all processes.  So when the page table above that address changes, we update
- * all the page tables, not just the current one.  This is rare.
- *
- * The benefit is that when we have to track a new page table, we can keep all
- * the kernel mappings.  This speeds up context switch immensely.
- */
-void guest_set_pte(struct lg_cpu *cpu,
-		   unsigned long gpgdir, unsigned long vaddr, pte_t gpte)
-{
-	/* We don't let you remap the Switcher; we need it to get back! */
-	if (vaddr >= switcher_addr) {
-		kill_guest(cpu, "attempt to set pte into Switcher pages");
-		return;
-	}
-
-	/*
-	 * Kernel mappings must be changed on all top levels.  Slow, but doesn't
-	 * happen often.
-	 */
-	if (vaddr >= cpu->lg->kernel_address) {
-		unsigned int i;
-		for (i = 0; i < ARRAY_SIZE(cpu->lg->pgdirs); i++)
-			if (cpu->lg->pgdirs[i].pgdir)
-				__guest_set_pte(cpu, i, vaddr, gpte);
-	} else {
-		/* Is this page table one we have a shadow for? */
-		int pgdir = find_pgdir(cpu->lg, gpgdir);
-		if (pgdir != ARRAY_SIZE(cpu->lg->pgdirs))
-			/* If so, do the update. */
-			__guest_set_pte(cpu, pgdir, vaddr, gpte);
-	}
-}
-
-/*H:400
- * (iii) Setting up a page table entry when the Guest tells us one has changed.
- *
- * Just like we did in interrupts_and_traps.c, it makes sense for us to deal
- * with the other side of page tables while we're here: what happens when the
- * Guest asks for a page table to be updated?
- *
- * We already saw that demand_page() will fill in the shadow page tables when
- * needed, so we can simply remove shadow page table entries whenever the Guest
- * tells us they've changed.  When the Guest tries to use the new entry it will
- * fault and demand_page() will fix it up.
- *
- * So with that in mind here's our code to update a (top-level) PGD entry:
- */
-void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx)
-{
-	int pgdir;
-
-	if (idx > PTRS_PER_PGD) {
-		kill_guest(&lg->cpus[0], "Attempt to set pgd %u/%u",
-			   idx, PTRS_PER_PGD);
-		return;
-	}
-
-	/* If they're talking about a page table we have a shadow for... */
-	pgdir = find_pgdir(lg, gpgdir);
-	if (pgdir < ARRAY_SIZE(lg->pgdirs)) {
-		/* ... throw it away. */
-		release_pgd(lg->pgdirs[pgdir].pgdir + idx);
-		/* That might have been the Switcher mapping, remap it. */
-		if (!allocate_switcher_mapping(&lg->cpus[0])) {
-			kill_guest(&lg->cpus[0],
-				   "Cannot populate switcher mapping");
-		}
-		lg->pgdirs[pgdir].last_host_cpu = -1;
-	}
-}
-
-#ifdef CONFIG_X86_PAE
-/* For setting a mid-level, we just throw everything away.  It's easy. */
-void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx)
-{
-	guest_pagetable_clear_all(&lg->cpus[0]);
-}
-#endif
-
-/*H:500
- * (vii) Setting up the page tables initially.
- *
- * When a Guest is first created, set initialize a shadow page table which
- * we will populate on future faults.  The Guest doesn't have any actual
- * pagetables yet, so we set linear_pages to tell demand_page() to fake it
- * for the moment.
- *
- * We do need the Switcher to be mapped at all times, so we allocate that
- * part of the Guest page table here.
- */
-int init_guest_pagetable(struct lguest *lg)
-{
-	struct lg_cpu *cpu = &lg->cpus[0];
-	int allocated = 0;
-
-	/* lg (and lg->cpus[]) starts zeroed: this allocates a new pgdir */
-	cpu->cpu_pgd = new_pgdir(cpu, 0, &allocated);
-	if (!allocated)
-		return -ENOMEM;
-
-	/* We start with a linear mapping until the initialize. */
-	cpu->linear_pages = true;
-
-	/* Allocate the page tables for the Switcher. */
-	if (!allocate_switcher_mapping(cpu)) {
-		release_all_pagetables(lg);
-		return -ENOMEM;
-	}
-
-	return 0;
-}
-
-/*H:508 When the Guest calls LHCALL_LGUEST_INIT we do more setup. */
-void page_table_guest_data_init(struct lg_cpu *cpu)
-{
-	/*
-	 * We tell the Guest that it can't use the virtual addresses
-	 * used by the Switcher.  This trick is equivalent to 4GB -
-	 * switcher_addr.
-	 */
-	u32 top = ~switcher_addr + 1;
-
-	/* We get the kernel address: above this is all kernel memory. */
-	if (get_user(cpu->lg->kernel_address,
-		     &cpu->lg->lguest_data->kernel_address)
-		/*
-		 * We tell the Guest that it can't use the top virtual
-		 * addresses (used by the Switcher).
-		 */
-	    || put_user(top, &cpu->lg->lguest_data->reserve_mem)) {
-		kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data);
-		return;
-	}
-
-	/*
-	 * In flush_user_mappings() we loop from 0 to
-	 * "pgd_index(lg->kernel_address)".  This assumes it won't hit the
-	 * Switcher mappings, so check that now.
-	 */
-	if (cpu->lg->kernel_address >= switcher_addr)
-		kill_guest(cpu, "bad kernel address %#lx",
-				 cpu->lg->kernel_address);
-}
-
-/* When a Guest dies, our cleanup is fairly simple. */
-void free_guest_pagetable(struct lguest *lg)
-{
-	unsigned int i;
-
-	/* Throw away all page table pages. */
-	release_all_pagetables(lg);
-	/* Now free the top levels: free_page() can handle 0 just fine. */
-	for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
-		free_page((long)lg->pgdirs[i].pgdir);
-}
-
-/*H:481
- * This clears the Switcher mappings for cpu #i.
- */
-static void remove_switcher_percpu_map(struct lg_cpu *cpu, unsigned int i)
-{
-	unsigned long base = switcher_addr + PAGE_SIZE + i * PAGE_SIZE*2;
-	pte_t *pte;
-
-	/* Clear the mappings for both pages. */
-	pte = find_spte(cpu, base, false, 0, 0);
-	release_pte(*pte);
-	set_pte(pte, __pte(0));
-
-	pte = find_spte(cpu, base + PAGE_SIZE, false, 0, 0);
-	release_pte(*pte);
-	set_pte(pte, __pte(0));
-}
-
-/*H:480
- * (vi) Mapping the Switcher when the Guest is about to run.
- *
- * The Switcher and the two pages for this CPU need to be visible in the Guest
- * (and not the pages for other CPUs).
- *
- * The pages for the pagetables have all been allocated before: we just need
- * to make sure the actual PTEs are up-to-date for the CPU we're about to run
- * on.
- */
-void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages)
-{
-	unsigned long base;
-	struct page *percpu_switcher_page, *regs_page;
-	pte_t *pte;
-	struct pgdir *pgdir = &cpu->lg->pgdirs[cpu->cpu_pgd];
-
-	/* Switcher page should always be mapped by now! */
-	BUG_ON(!pgdir->switcher_mapped);
-
-	/* 
-	 * Remember that we have two pages for each Host CPU, so we can run a
-	 * Guest on each CPU without them interfering.  We need to make sure
-	 * those pages are mapped correctly in the Guest, but since we usually
-	 * run on the same CPU, we cache that, and only update the mappings
-	 * when we move.
-	 */
-	if (pgdir->last_host_cpu == raw_smp_processor_id())
-		return;
-
-	/* -1 means unknown so we remove everything. */
-	if (pgdir->last_host_cpu == -1) {
-		unsigned int i;
-		for_each_possible_cpu(i)
-			remove_switcher_percpu_map(cpu, i);
-	} else {
-		/* We know exactly what CPU mapping to remove. */
-		remove_switcher_percpu_map(cpu, pgdir->last_host_cpu);
-	}
-
-	/*
-	 * When we're running the Guest, we want the Guest's "regs" page to
-	 * appear where the first Switcher page for this CPU is.  This is an
-	 * optimization: when the Switcher saves the Guest registers, it saves
-	 * them into the first page of this CPU's "struct lguest_pages": if we
-	 * make sure the Guest's register page is already mapped there, we
-	 * don't have to copy them out again.
-	 */
-	/* Find the shadow PTE for this regs page. */
-	base = switcher_addr + PAGE_SIZE
-		+ raw_smp_processor_id() * sizeof(struct lguest_pages);
-	pte = find_spte(cpu, base, false, 0, 0);
-	regs_page = pfn_to_page(__pa(cpu->regs_page) >> PAGE_SHIFT);
-	get_page(regs_page);
-	set_pte(pte, mk_pte(regs_page, __pgprot(__PAGE_KERNEL & ~_PAGE_GLOBAL)));
-
-	/*
-	 * We map the second page of the struct lguest_pages read-only in
-	 * the Guest: the IDT, GDT and other things it's not supposed to
-	 * change.
-	 */
-	pte = find_spte(cpu, base + PAGE_SIZE, false, 0, 0);
-	percpu_switcher_page
-		= lg_switcher_pages[1 + raw_smp_processor_id()*2 + 1];
-	get_page(percpu_switcher_page);
-	set_pte(pte, mk_pte(percpu_switcher_page,
-			    __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL)));
-
-	pgdir->last_host_cpu = raw_smp_processor_id();
-}
-
-/*H:490
- * We've made it through the page table code.  Perhaps our tired brains are
- * still processing the details, or perhaps we're simply glad it's over.
- *
- * If nothing else, note that all this complexity in juggling shadow page tables
- * in sync with the Guest's page tables is for one reason: for most Guests this
- * page table dance determines how bad performance will be.  This is why Xen
- * uses exotic direct Guest pagetable manipulation, and why both Intel and AMD
- * have implemented shadow page table support directly into hardware.
- *
- * There is just one file remaining in the Host.
- */
diff --git a/drivers/lguest/segments.c b/drivers/lguest/segments.c
deleted file mode 100644
index c4fb424dfddb..000000000000
--- a/drivers/lguest/segments.c
+++ /dev/null
@@ -1,228 +0,0 @@
-/*P:600
- * The x86 architecture has segments, which involve a table of descriptors
- * which can be used to do funky things with virtual address interpretation.
- * We originally used to use segments so the Guest couldn't alter the
- * Guest<->Host Switcher, and then we had to trim Guest segments, and restore
- * for userspace per-thread segments, but trim again for on userspace->kernel
- * transitions...  This nightmarish creation was contained within this file,
- * where we knew not to tread without heavy armament and a change of underwear.
- *
- * In these modern times, the segment handling code consists of simple sanity
- * checks, and the worst you'll experience reading this code is butterfly-rash
- * from frolicking through its parklike serenity.
-:*/
-#include "lg.h"
-
-/*H:600
- * Segments & The Global Descriptor Table
- *
- * (That title sounds like a bad Nerdcore group.  Not to suggest that there are
- * any good Nerdcore groups, but in high school a friend of mine had a band
- * called Joe Fish and the Chips, so there are definitely worse band names).
- *
- * To refresh: the GDT is a table of 8-byte values describing segments.  Once
- * set up, these segments can be loaded into one of the 6 "segment registers".
- *
- * GDT entries are passed around as "struct desc_struct"s, which like IDT
- * entries are split into two 32-bit members, "a" and "b".  One day, someone
- * will clean that up, and be declared a Hero.  (No pressure, I'm just saying).
- *
- * Anyway, the GDT entry contains a base (the start address of the segment), a
- * limit (the size of the segment - 1), and some flags.  Sounds simple, and it
- * would be, except those zany Intel engineers decided that it was too boring
- * to put the base at one end, the limit at the other, and the flags in
- * between.  They decided to shotgun the bits at random throughout the 8 bytes,
- * like so:
- *
- * 0               16                     40       48  52  56     63
- * [ limit part 1 ][     base part 1     ][ flags ][li][fl][base ]
- *                                                  mit ags part 2
- *                                                part 2
- *
- * As a result, this file contains a certain amount of magic numeracy.  Let's
- * begin.
- */
-
-/*
- * There are several entries we don't let the Guest set.  The TSS entry is the
- * "Task State Segment" which controls all kinds of delicate things.  The
- * LGUEST_CS and LGUEST_DS entries are reserved for the Switcher, and the
- * the Guest can't be trusted to deal with double faults.
- */
-static bool ignored_gdt(unsigned int num)
-{
-	return (num == GDT_ENTRY_TSS
-		|| num == GDT_ENTRY_LGUEST_CS
-		|| num == GDT_ENTRY_LGUEST_DS
-		|| num == GDT_ENTRY_DOUBLEFAULT_TSS);
-}
-
-/*H:630
- * Once the Guest gave us new GDT entries, we fix them up a little.  We
- * don't care if they're invalid: the worst that can happen is a General
- * Protection Fault in the Switcher when it restores a Guest segment register
- * which tries to use that entry.  Then we kill the Guest for causing such a
- * mess: the message will be "unhandled trap 256".
- */
-static void fixup_gdt_table(struct lg_cpu *cpu, unsigned start, unsigned end)
-{
-	unsigned int i;
-
-	for (i = start; i < end; i++) {
-		/*
-		 * We never copy these ones to real GDT, so we don't care what
-		 * they say
-		 */
-		if (ignored_gdt(i))
-			continue;
-
-		/*
-		 * Segment descriptors contain a privilege level: the Guest is
-		 * sometimes careless and leaves this as 0, even though it's
-		 * running at privilege level 1.  If so, we fix it here.
-		 */
-		if (cpu->arch.gdt[i].dpl == 0)
-			cpu->arch.gdt[i].dpl |= GUEST_PL;
-
-		/*
-		 * Each descriptor has an "accessed" bit.  If we don't set it
-		 * now, the CPU will try to set it when the Guest first loads
-		 * that entry into a segment register.  But the GDT isn't
-		 * writable by the Guest, so bad things can happen.
-		 */
-		cpu->arch.gdt[i].type |= 0x1;
-	}
-}
-
-/*H:610
- * Like the IDT, we never simply use the GDT the Guest gives us.  We keep
- * a GDT for each CPU, and copy across the Guest's entries each time we want to
- * run the Guest on that CPU.
- *
- * This routine is called at boot or modprobe time for each CPU to set up the
- * constant GDT entries: the ones which are the same no matter what Guest we're
- * running.
- */
-void setup_default_gdt_entries(struct lguest_ro_state *state)
-{
-	struct desc_struct *gdt = state->guest_gdt;
-	unsigned long tss = (unsigned long)&state->guest_tss;
-
-	/* The Switcher segments are full 0-4G segments, privilege level 0 */
-	gdt[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT;
-	gdt[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT;
-
-	/*
-	 * The TSS segment refers to the TSS entry for this particular CPU.
-	 */
-	gdt[GDT_ENTRY_TSS].a = 0;
-	gdt[GDT_ENTRY_TSS].b = 0;
-
-	gdt[GDT_ENTRY_TSS].limit0 = 0x67;
-	gdt[GDT_ENTRY_TSS].base0  = tss & 0xFFFF;
-	gdt[GDT_ENTRY_TSS].base1  = (tss >> 16) & 0xFF;
-	gdt[GDT_ENTRY_TSS].base2  = tss >> 24;
-	gdt[GDT_ENTRY_TSS].type   = 0x9; /* 32-bit TSS (available) */
-	gdt[GDT_ENTRY_TSS].p      = 0x1; /* Entry is present */
-	gdt[GDT_ENTRY_TSS].dpl    = 0x0; /* Privilege level 0 */
-	gdt[GDT_ENTRY_TSS].s      = 0x0; /* system segment */
-
-}
-
-/*
- * This routine sets up the initial Guest GDT for booting.  All entries start
- * as 0 (unusable).
- */
-void setup_guest_gdt(struct lg_cpu *cpu)
-{
-	/*
-	 * Start with full 0-4G segments...except the Guest is allowed to use
-	 * them, so set the privilege level appropriately in the flags.
-	 */
-	cpu->arch.gdt[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT;
-	cpu->arch.gdt[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT;
-	cpu->arch.gdt[GDT_ENTRY_KERNEL_CS].dpl |= GUEST_PL;
-	cpu->arch.gdt[GDT_ENTRY_KERNEL_DS].dpl |= GUEST_PL;
-}
-
-/*H:650
- * An optimization of copy_gdt(), for just the three "thead-local storage"
- * entries.
- */
-void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt)
-{
-	unsigned int i;
-
-	for (i = GDT_ENTRY_TLS_MIN; i <= GDT_ENTRY_TLS_MAX; i++)
-		gdt[i] = cpu->arch.gdt[i];
-}
-
-/*H:640
- * When the Guest is run on a different CPU, or the GDT entries have changed,
- * copy_gdt() is called to copy the Guest's GDT entries across to this CPU's
- * GDT.
- */
-void copy_gdt(const struct lg_cpu *cpu, struct desc_struct *gdt)
-{
-	unsigned int i;
-
-	/*
-	 * The default entries from setup_default_gdt_entries() are not
-	 * replaced.  See ignored_gdt() above.
-	 */
-	for (i = 0; i < GDT_ENTRIES; i++)
-		if (!ignored_gdt(i))
-			gdt[i] = cpu->arch.gdt[i];
-}
-
-/*H:620
- * This is where the Guest asks us to load a new GDT entry
- * (LHCALL_LOAD_GDT_ENTRY).  We tweak the entry and copy it in.
- */
-void load_guest_gdt_entry(struct lg_cpu *cpu, u32 num, u32 lo, u32 hi)
-{
-	/*
-	 * We assume the Guest has the same number of GDT entries as the
-	 * Host, otherwise we'd have to dynamically allocate the Guest GDT.
-	 */
-	if (num >= ARRAY_SIZE(cpu->arch.gdt)) {
-		kill_guest(cpu, "too many gdt entries %i", num);
-		return;
-	}
-
-	/* Set it up, then fix it. */
-	cpu->arch.gdt[num].a = lo;
-	cpu->arch.gdt[num].b = hi;
-	fixup_gdt_table(cpu, num, num+1);
-	/*
-	 * Mark that the GDT changed so the core knows it has to copy it again,
-	 * even if the Guest is run on the same CPU.
-	 */
-	cpu->changed |= CHANGED_GDT;
-}
-
-/*
- * This is the fast-track version for just changing the three TLS entries.
- * Remember that this happens on every context switch, so it's worth
- * optimizing.  But wouldn't it be neater to have a single hypercall to cover
- * both cases?
- */
-void guest_load_tls(struct lg_cpu *cpu, unsigned long gtls)
-{
-	struct desc_struct *tls = &cpu->arch.gdt[GDT_ENTRY_TLS_MIN];
-
-	__lgread(cpu, tls, gtls, sizeof(*tls)*GDT_ENTRY_TLS_ENTRIES);
-	fixup_gdt_table(cpu, GDT_ENTRY_TLS_MIN, GDT_ENTRY_TLS_MAX+1);
-	/* Note that just the TLS entries have changed. */
-	cpu->changed |= CHANGED_GDT_TLS;
-}
-
-/*H:660
- * With this, we have finished the Host.
- *
- * Five of the seven parts of our task are complete.  You have made it through
- * the Bit of Despair (I think that's somewhere in the page table code,
- * myself).
- *
- * Next, we examine "make Switcher".  It's short, but intense.
- */
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c
deleted file mode 100644
index b4f79b923aea..000000000000
--- a/drivers/lguest/x86/core.c
+++ /dev/null
@@ -1,724 +0,0 @@
-/*
- * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation.
- * Copyright (C) 2007, Jes Sorensen <jes@sgi.com> SGI.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT.  See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-/*P:450
- * This file contains the x86-specific lguest code.  It used to be all
- * mixed in with drivers/lguest/core.c but several foolhardy code slashers
- * wrestled most of the dependencies out to here in preparation for porting
- * lguest to other architectures (see what I mean by foolhardy?).
- *
- * This also contains a couple of non-obvious setup and teardown pieces which
- * were implemented after days of debugging pain.
-:*/
-#include <linux/kernel.h>
-#include <linux/start_kernel.h>
-#include <linux/string.h>
-#include <linux/console.h>
-#include <linux/screen_info.h>
-#include <linux/irq.h>
-#include <linux/interrupt.h>
-#include <linux/clocksource.h>
-#include <linux/clockchips.h>
-#include <linux/cpu.h>
-#include <linux/lguest.h>
-#include <linux/lguest_launcher.h>
-#include <asm/paravirt.h>
-#include <asm/param.h>
-#include <asm/page.h>
-#include <asm/pgtable.h>
-#include <asm/desc.h>
-#include <asm/setup.h>
-#include <asm/lguest.h>
-#include <linux/uaccess.h>
-#include <asm/fpu/internal.h>
-#include <asm/tlbflush.h>
-#include "../lg.h"
-
-static int cpu_had_pge;
-
-static struct {
-	unsigned long offset;
-	unsigned short segment;
-} lguest_entry;
-
-/* Offset from where switcher.S was compiled to where we've copied it */
-static unsigned long switcher_offset(void)
-{
-	return switcher_addr - (unsigned long)start_switcher_text;
-}
-
-/* This cpu's struct lguest_pages (after the Switcher text page) */
-static struct lguest_pages *lguest_pages(unsigned int cpu)
-{
-	return &(((struct lguest_pages *)(switcher_addr + PAGE_SIZE))[cpu]);
-}
-
-static DEFINE_PER_CPU(struct lg_cpu *, lg_last_cpu);
-
-/*S:010
- * We approach the Switcher.
- *
- * Remember that each CPU has two pages which are visible to the Guest when it
- * runs on that CPU.  This has to contain the state for that Guest: we copy the
- * state in just before we run the Guest.
- *
- * Each Guest has "changed" flags which indicate what has changed in the Guest
- * since it last ran.  We saw this set in interrupts_and_traps.c and
- * segments.c.
- */
-static void copy_in_guest_info(struct lg_cpu *cpu, struct lguest_pages *pages)
-{
-	/*
-	 * Copying all this data can be quite expensive.  We usually run the
-	 * same Guest we ran last time (and that Guest hasn't run anywhere else
-	 * meanwhile).  If that's not the case, we pretend everything in the
-	 * Guest has changed.
-	 */
-	if (__this_cpu_read(lg_last_cpu) != cpu || cpu->last_pages != pages) {
-		__this_cpu_write(lg_last_cpu, cpu);
-		cpu->last_pages = pages;
-		cpu->changed = CHANGED_ALL;
-	}
-
-	/*
-	 * These copies are pretty cheap, so we do them unconditionally: */
-	/* Save the current Host top-level page directory.
-	 */
-	pages->state.host_cr3 = __pa(current->mm->pgd);
-	/*
-	 * Set up the Guest's page tables to see this CPU's pages (and no
-	 * other CPU's pages).
-	 */
-	map_switcher_in_guest(cpu, pages);
-	/*
-	 * Set up the two "TSS" members which tell the CPU what stack to use
-	 * for traps which do directly into the Guest (ie. traps at privilege
-	 * level 1).
-	 */
-	pages->state.guest_tss.sp1 = cpu->esp1;
-	pages->state.guest_tss.ss1 = cpu->ss1;
-
-	/* Copy direct-to-Guest trap entries. */
-	if (cpu->changed & CHANGED_IDT)
-		copy_traps(cpu, pages->state.guest_idt, default_idt_entries);
-
-	/* Copy all GDT entries which the Guest can change. */
-	if (cpu->changed & CHANGED_GDT)
-		copy_gdt(cpu, pages->state.guest_gdt);
-	/* If only the TLS entries have changed, copy them. */
-	else if (cpu->changed & CHANGED_GDT_TLS)
-		copy_gdt_tls(cpu, pages->state.guest_gdt);
-
-	/* Mark the Guest as unchanged for next time. */
-	cpu->changed = 0;
-}
-
-/* Finally: the code to actually call into the Switcher to run the Guest. */
-static void run_guest_once(struct lg_cpu *cpu, struct lguest_pages *pages)
-{
-	/* This is a dummy value we need for GCC's sake. */
-	unsigned int clobber;
-
-	/*
-	 * Copy the guest-specific information into this CPU's "struct
-	 * lguest_pages".
-	 */
-	copy_in_guest_info(cpu, pages);
-
-	/*
-	 * Set the trap number to 256 (impossible value).  If we fault while
-	 * switching to the Guest (bad segment registers or bug), this will
-	 * cause us to abort the Guest.
-	 */
-	cpu->regs->trapnum = 256;
-
-	/*
-	 * Now: we push the "eflags" register on the stack, then do an "lcall".
-	 * This is how we change from using the kernel code segment to using
-	 * the dedicated lguest code segment, as well as jumping into the
-	 * Switcher.
-	 *
-	 * The lcall also pushes the old code segment (KERNEL_CS) onto the
-	 * stack, then the address of this call.  This stack layout happens to
-	 * exactly match the stack layout created by an interrupt...
-	 */
-	asm volatile("pushf; lcall *%4"
-		     /*
-		      * This is how we tell GCC that %eax ("a") and %ebx ("b")
-		      * are changed by this routine.  The "=" means output.
-		      */
-		     : "=a"(clobber), "=b"(clobber)
-		     /*
-		      * %eax contains the pages pointer.  ("0" refers to the
-		      * 0-th argument above, ie "a").  %ebx contains the
-		      * physical address of the Guest's top-level page
-		      * directory.
-		      */
-		     : "0"(pages), 
-		       "1"(__pa(cpu->lg->pgdirs[cpu->cpu_pgd].pgdir)),
-		       "m"(lguest_entry)
-		     /*
-		      * We tell gcc that all these registers could change,
-		      * which means we don't have to save and restore them in
-		      * the Switcher.
-		      */
-		     : "memory", "%edx", "%ecx", "%edi", "%esi");
-}
-/*:*/
-
-unsigned long *lguest_arch_regptr(struct lg_cpu *cpu, size_t reg_off, bool any)
-{
-	switch (reg_off) {
-	case offsetof(struct pt_regs, bx):
-		return &cpu->regs->ebx;
-	case offsetof(struct pt_regs, cx):
-		return &cpu->regs->ecx;
-	case offsetof(struct pt_regs, dx):
-		return &cpu->regs->edx;
-	case offsetof(struct pt_regs, si):
-		return &cpu->regs->esi;
-	case offsetof(struct pt_regs, di):
-		return &cpu->regs->edi;
-	case offsetof(struct pt_regs, bp):
-		return &cpu->regs->ebp;
-	case offsetof(struct pt_regs, ax):
-		return &cpu->regs->eax;
-	case offsetof(struct pt_regs, ip):
-		return &cpu->regs->eip;
-	case offsetof(struct pt_regs, sp):
-		return &cpu->regs->esp;
-	}
-
-	/* Launcher can read these, but we don't allow any setting. */
-	if (any) {
-		switch (reg_off) {
-		case offsetof(struct pt_regs, ds):
-			return &cpu->regs->ds;
-		case offsetof(struct pt_regs, es):
-			return &cpu->regs->es;
-		case offsetof(struct pt_regs, fs):
-			return &cpu->regs->fs;
-		case offsetof(struct pt_regs, gs):
-			return &cpu->regs->gs;
-		case offsetof(struct pt_regs, cs):
-			return &cpu->regs->cs;
-		case offsetof(struct pt_regs, flags):
-			return &cpu->regs->eflags;
-		case offsetof(struct pt_regs, ss):
-			return &cpu->regs->ss;
-		}
-	}
-
-	return NULL;
-}
-
-/*M:002
- * There are hooks in the scheduler which we can register to tell when we
- * get kicked off the CPU (preempt_notifier_register()).  This would allow us
- * to lazily disable SYSENTER which would regain some performance, and should
- * also simplify copy_in_guest_info().  Note that we'd still need to restore
- * things when we exit to Launcher userspace, but that's fairly easy.
- *
- * We could also try using these hooks for PGE, but that might be too expensive.
- *
- * The hooks were designed for KVM, but we can also put them to good use.
-:*/
-
-/*H:040
- * This is the i386-specific code to setup and run the Guest.  Interrupts
- * are disabled: we own the CPU.
- */
-void lguest_arch_run_guest(struct lg_cpu *cpu)
-{
-	/*
-	 * SYSENTER is an optimized way of doing system calls.  We can't allow
-	 * it because it always jumps to privilege level 0.  A normal Guest
-	 * won't try it because we don't advertise it in CPUID, but a malicious
-	 * Guest (or malicious Guest userspace program) could, so we tell the
-	 * CPU to disable it before running the Guest.
-	 */
-	if (boot_cpu_has(X86_FEATURE_SEP))
-		wrmsr(MSR_IA32_SYSENTER_CS, 0, 0);
-
-	/*
-	 * Now we actually run the Guest.  It will return when something
-	 * interesting happens, and we can examine its registers to see what it
-	 * was doing.
-	 */
-	run_guest_once(cpu, lguest_pages(raw_smp_processor_id()));
-
-	/*
-	 * Note that the "regs" structure contains two extra entries which are
-	 * not really registers: a trap number which says what interrupt or
-	 * trap made the switcher code come back, and an error code which some
-	 * traps set.
-	 */
-
-	 /* Restore SYSENTER if it's supposed to be on. */
-	 if (boot_cpu_has(X86_FEATURE_SEP))
-		wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0);
-
-	/*
-	 * If the Guest page faulted, then the cr2 register will tell us the
-	 * bad virtual address.  We have to grab this now, because once we
-	 * re-enable interrupts an interrupt could fault and thus overwrite
-	 * cr2, or we could even move off to a different CPU.
-	 */
-	if (cpu->regs->trapnum == 14)
-		cpu->arch.last_pagefault = read_cr2();
-	/*
-	 * Similarly, if we took a trap because the Guest used the FPU,
-	 * we have to restore the FPU it expects to see.
-	 * fpu__restore() may sleep and we may even move off to
-	 * a different CPU. So all the critical stuff should be done
-	 * before this.
-	 */
-	else if (cpu->regs->trapnum == 7 && !fpregs_active())
-		fpu__restore(&current->thread.fpu);
-}
-
-/*H:130
- * Now we've examined the hypercall code; our Guest can make requests.
- * Our Guest is usually so well behaved; it never tries to do things it isn't
- * allowed to, and uses hypercalls instead.  Unfortunately, Linux's paravirtual
- * infrastructure isn't quite complete, because it doesn't contain replacements
- * for the Intel I/O instructions.  As a result, the Guest sometimes fumbles
- * across one during the boot process as it probes for various things which are
- * usually attached to a PC.
- *
- * When the Guest uses one of these instructions, we get a trap (General
- * Protection Fault) and come here.  We queue this to be sent out to the
- * Launcher to handle.
- */
-
-/*
- * The eip contains the *virtual* address of the Guest's instruction:
- * we copy the instruction here so the Launcher doesn't have to walk
- * the page tables to decode it.  We handle the case (eg. in a kernel
- * module) where the instruction is over two pages, and the pages are
- * virtually but not physically contiguous.
- *
- * The longest possible x86 instruction is 15 bytes, but we don't handle
- * anything that strange.
- */
-static void copy_from_guest(struct lg_cpu *cpu,
-			    void *dst, unsigned long vaddr, size_t len)
-{
-	size_t to_page_end = PAGE_SIZE - (vaddr % PAGE_SIZE);
-	unsigned long paddr;
-
-	BUG_ON(len > PAGE_SIZE);
-
-	/* If it goes over a page, copy in two parts. */
-	if (len > to_page_end) {
-		/* But make sure the next page is mapped! */
-		if (__guest_pa(cpu, vaddr + to_page_end, &paddr))
-			copy_from_guest(cpu, dst + to_page_end,
-					vaddr + to_page_end,
-					len - to_page_end);
-		else
-			/* Otherwise fill with zeroes. */
-			memset(dst + to_page_end, 0, len - to_page_end);
-		len = to_page_end;
-	}
-
-	/* This will kill the guest if it isn't mapped, but that
-	 * shouldn't happen. */
-	__lgread(cpu, dst, guest_pa(cpu, vaddr), len);
-}
-
-
-static void setup_emulate_insn(struct lg_cpu *cpu)
-{
-	cpu->pending.trap = 13;
-	copy_from_guest(cpu, cpu->pending.insn, cpu->regs->eip,
-			sizeof(cpu->pending.insn));
-}
-
-static void setup_iomem_insn(struct lg_cpu *cpu, unsigned long iomem_addr)
-{
-	cpu->pending.trap = 14;
-	cpu->pending.addr = iomem_addr;
-	copy_from_guest(cpu, cpu->pending.insn, cpu->regs->eip,
-			sizeof(cpu->pending.insn));
-}
-
-/*H:050 Once we've re-enabled interrupts, we look at why the Guest exited. */
-void lguest_arch_handle_trap(struct lg_cpu *cpu)
-{
-	unsigned long iomem_addr;
-
-	switch (cpu->regs->trapnum) {
-	case 13: /* We've intercepted a General Protection Fault. */
-		/* Hand to Launcher to emulate those pesky IN and OUT insns */
-		if (cpu->regs->errcode == 0) {
-			setup_emulate_insn(cpu);
-			return;
-		}
-		break;
-	case 14: /* We've intercepted a Page Fault. */
-		/*
-		 * The Guest accessed a virtual address that wasn't mapped.
-		 * This happens a lot: we don't actually set up most of the page
-		 * tables for the Guest at all when we start: as it runs it asks
-		 * for more and more, and we set them up as required. In this
-		 * case, we don't even tell the Guest that the fault happened.
-		 *
-		 * The errcode tells whether this was a read or a write, and
-		 * whether kernel or userspace code.
-		 */
-		if (demand_page(cpu, cpu->arch.last_pagefault,
-				cpu->regs->errcode, &iomem_addr))
-			return;
-
-		/* Was this an access to memory mapped IO? */
-		if (iomem_addr) {
-			/* Tell Launcher, let it handle it. */
-			setup_iomem_insn(cpu, iomem_addr);
-			return;
-		}
-
-		/*
-		 * OK, it's really not there (or not OK): the Guest needs to
-		 * know.  We write out the cr2 value so it knows where the
-		 * fault occurred.
-		 *
-		 * Note that if the Guest were really messed up, this could
-		 * happen before it's done the LHCALL_LGUEST_INIT hypercall, so
-		 * lg->lguest_data could be NULL
-		 */
-		if (cpu->lg->lguest_data &&
-		    put_user(cpu->arch.last_pagefault,
-			     &cpu->lg->lguest_data->cr2))
-			kill_guest(cpu, "Writing cr2");
-		break;
-	case 7: /* We've intercepted a Device Not Available fault. */
-		/* No special handling is needed here. */
-		break;
-	case 32 ... 255:
-		/* This might be a syscall. */
-		if (could_be_syscall(cpu->regs->trapnum))
-			break;
-
-		/*
-		 * Other values mean a real interrupt occurred, in which case
-		 * the Host handler has already been run. We just do a
-		 * friendly check if another process should now be run, then
-		 * return to run the Guest again.
-		 */
-		cond_resched();
-		return;
-	case LGUEST_TRAP_ENTRY:
-		/*
-		 * Our 'struct hcall_args' maps directly over our regs: we set
-		 * up the pointer now to indicate a hypercall is pending.
-		 */
-		cpu->hcall = (struct hcall_args *)cpu->regs;
-		return;
-	}
-
-	/* We didn't handle the trap, so it needs to go to the Guest. */
-	if (!deliver_trap(cpu, cpu->regs->trapnum))
-		/*
-		 * If the Guest doesn't have a handler (either it hasn't
-		 * registered any yet, or it's one of the faults we don't let
-		 * it handle), it dies with this cryptic error message.
-		 */
-		kill_guest(cpu, "unhandled trap %li at %#lx (%#lx)",
-			   cpu->regs->trapnum, cpu->regs->eip,
-			   cpu->regs->trapnum == 14 ? cpu->arch.last_pagefault
-			   : cpu->regs->errcode);
-}
-
-/*
- * Now we can look at each of the routines this calls, in increasing order of
- * complexity: do_hypercalls(), emulate_insn(), maybe_do_interrupt(),
- * deliver_trap() and demand_page().  After all those, we'll be ready to
- * examine the Switcher, and our philosophical understanding of the Host/Guest
- * duality will be complete.
-:*/
-static void adjust_pge(void *on)
-{
-	if (on)
-		cr4_set_bits(X86_CR4_PGE);
-	else
-		cr4_clear_bits(X86_CR4_PGE);
-}
-
-/*H:020
- * Now the Switcher is mapped and every thing else is ready, we need to do
- * some more i386-specific initialization.
- */
-void __init lguest_arch_host_init(void)
-{
-	int i;
-
-	/*
-	 * Most of the x86/switcher_32.S doesn't care that it's been moved; on
-	 * Intel, jumps are relative, and it doesn't access any references to
-	 * external code or data.
-	 *
-	 * The only exception is the interrupt handlers in switcher.S: their
-	 * addresses are placed in a table (default_idt_entries), so we need to
-	 * update the table with the new addresses.  switcher_offset() is a
-	 * convenience function which returns the distance between the
-	 * compiled-in switcher code and the high-mapped copy we just made.
-	 */
-	for (i = 0; i < IDT_ENTRIES; i++)
-		default_idt_entries[i] += switcher_offset();
-
-	/*
-	 * Set up the Switcher's per-cpu areas.
-	 *
-	 * Each CPU gets two pages of its own within the high-mapped region
-	 * (aka. "struct lguest_pages").  Much of this can be initialized now,
-	 * but some depends on what Guest we are running (which is set up in
-	 * copy_in_guest_info()).
-	 */
-	for_each_possible_cpu(i) {
-		/* lguest_pages() returns this CPU's two pages. */
-		struct lguest_pages *pages = lguest_pages(i);
-		/* This is a convenience pointer to make the code neater. */
-		struct lguest_ro_state *state = &pages->state;
-
-		/*
-		 * The Global Descriptor Table: the Host has a different one
-		 * for each CPU.  We keep a descriptor for the GDT which says
-		 * where it is and how big it is (the size is actually the last
-		 * byte, not the size, hence the "-1").
-		 */
-		state->host_gdt_desc.size = GDT_SIZE-1;
-		state->host_gdt_desc.address = (long)get_cpu_gdt_rw(i);
-
-		/*
-		 * All CPUs on the Host use the same Interrupt Descriptor
-		 * Table, so we just use store_idt(), which gets this CPU's IDT
-		 * descriptor.
-		 */
-		store_idt(&state->host_idt_desc);
-
-		/*
-		 * The descriptors for the Guest's GDT and IDT can be filled
-		 * out now, too.  We copy the GDT & IDT into ->guest_gdt and
-		 * ->guest_idt before actually running the Guest.
-		 */
-		state->guest_idt_desc.size = sizeof(state->guest_idt)-1;
-		state->guest_idt_desc.address = (long)&state->guest_idt;
-		state->guest_gdt_desc.size = sizeof(state->guest_gdt)-1;
-		state->guest_gdt_desc.address = (long)&state->guest_gdt;
-
-		/*
-		 * We know where we want the stack to be when the Guest enters
-		 * the Switcher: in pages->regs.  The stack grows upwards, so
-		 * we start it at the end of that structure.
-		 */
-		state->guest_tss.sp0 = (long)(&pages->regs + 1);
-		/*
-		 * And this is the GDT entry to use for the stack: we keep a
-		 * couple of special LGUEST entries.
-		 */
-		state->guest_tss.ss0 = LGUEST_DS;
-
-		/*
-		 * x86 can have a finegrained bitmap which indicates what I/O
-		 * ports the process can use.  We set it to the end of our
-		 * structure, meaning "none".
-		 */
-		state->guest_tss.io_bitmap_base = sizeof(state->guest_tss);
-
-		/*
-		 * Some GDT entries are the same across all Guests, so we can
-		 * set them up now.
-		 */
-		setup_default_gdt_entries(state);
-		/* Most IDT entries are the same for all Guests, too.*/
-		setup_default_idt_entries(state, default_idt_entries);
-
-		/*
-		 * The Host needs to be able to use the LGUEST segments on this
-		 * CPU, too, so put them in the Host GDT.
-		 */
-		get_cpu_gdt_rw(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT;
-		get_cpu_gdt_rw(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT;
-	}
-
-	/*
-	 * In the Switcher, we want the %cs segment register to use the
-	 * LGUEST_CS GDT entry: we've put that in the Host and Guest GDTs, so
-	 * it will be undisturbed when we switch.  To change %cs and jump we
-	 * need this structure to feed to Intel's "lcall" instruction.
-	 */
-	lguest_entry.offset = (long)switch_to_guest + switcher_offset();
-	lguest_entry.segment = LGUEST_CS;
-
-	/*
-	 * Finally, we need to turn off "Page Global Enable".  PGE is an
-	 * optimization where page table entries are specially marked to show
-	 * they never change.  The Host kernel marks all the kernel pages this
-	 * way because it's always present, even when userspace is running.
-	 *
-	 * Lguest breaks this: unbeknownst to the rest of the Host kernel, we
-	 * switch to the Guest kernel.  If you don't disable this on all CPUs,
-	 * you'll get really weird bugs that you'll chase for two days.
-	 *
-	 * I used to turn PGE off every time we switched to the Guest and back
-	 * on when we return, but that slowed the Switcher down noticibly.
-	 */
-
-	/*
-	 * We don't need the complexity of CPUs coming and going while we're
-	 * doing this.
-	 */
-	get_online_cpus();
-	if (boot_cpu_has(X86_FEATURE_PGE)) { /* We have a broader idea of "global". */
-		/* Remember that this was originally set (for cleanup). */
-		cpu_had_pge = 1;
-		/*
-		 * adjust_pge is a helper function which sets or unsets the PGE
-		 * bit on its CPU, depending on the argument (0 == unset).
-		 */
-		on_each_cpu(adjust_pge, (void *)0, 1);
-		/* Turn off the feature in the global feature set. */
-		clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE);
-	}
-	put_online_cpus();
-}
-/*:*/
-
-void __exit lguest_arch_host_fini(void)
-{
-	/* If we had PGE before we started, turn it back on now. */
-	get_online_cpus();
-	if (cpu_had_pge) {
-		set_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE);
-		/* adjust_pge's argument "1" means set PGE. */
-		on_each_cpu(adjust_pge, (void *)1, 1);
-	}
-	put_online_cpus();
-}
-
-
-/*H:122 The i386-specific hypercalls simply farm out to the right functions. */
-int lguest_arch_do_hcall(struct lg_cpu *cpu, struct hcall_args *args)
-{
-	switch (args->arg0) {
-	case LHCALL_LOAD_GDT_ENTRY:
-		load_guest_gdt_entry(cpu, args->arg1, args->arg2, args->arg3);
-		break;
-	case LHCALL_LOAD_IDT_ENTRY:
-		load_guest_idt_entry(cpu, args->arg1, args->arg2, args->arg3);
-		break;
-	case LHCALL_LOAD_TLS:
-		guest_load_tls(cpu, args->arg1);
-		break;
-	default:
-		/* Bad Guest.  Bad! */
-		return -EIO;
-	}
-	return 0;
-}
-
-/*H:126 i386-specific hypercall initialization: */
-int lguest_arch_init_hypercalls(struct lg_cpu *cpu)
-{
-	u32 tsc_speed;
-
-	/*
-	 * The pointer to the Guest's "struct lguest_data" is the only argument.
-	 * We check that address now.
-	 */
-	if (!lguest_address_ok(cpu->lg, cpu->hcall->arg1,
-			       sizeof(*cpu->lg->lguest_data)))
-		return -EFAULT;
-
-	/*
-	 * Having checked it, we simply set lg->lguest_data to point straight
-	 * into the Launcher's memory at the right place and then use
-	 * copy_to_user/from_user from now on, instead of lgread/write.  I put
-	 * this in to show that I'm not immune to writing stupid
-	 * optimizations.
-	 */
-	cpu->lg->lguest_data = cpu->lg->mem_base + cpu->hcall->arg1;
-
-	/*
-	 * We insist that the Time Stamp Counter exist and doesn't change with
-	 * cpu frequency.  Some devious chip manufacturers decided that TSC
-	 * changes could be handled in software.  I decided that time going
-	 * backwards might be good for benchmarks, but it's bad for users.
-	 *
-	 * We also insist that the TSC be stable: the kernel detects unreliable
-	 * TSCs for its own purposes, and we use that here.
-	 */
-	if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && !check_tsc_unstable())
-		tsc_speed = tsc_khz;
-	else
-		tsc_speed = 0;
-	if (put_user(tsc_speed, &cpu->lg->lguest_data->tsc_khz))
-		return -EFAULT;
-
-	/* The interrupt code might not like the system call vector. */
-	if (!check_syscall_vector(cpu->lg))
-		kill_guest(cpu, "bad syscall vector");
-
-	return 0;
-}
-/*:*/
-
-/*L:030
- * Most of the Guest's registers are left alone: we used get_zeroed_page() to
- * allocate the structure, so they will be 0.
- */
-void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start)
-{
-	struct lguest_regs *regs = cpu->regs;
-
-	/*
-	 * There are four "segment" registers which the Guest needs to boot:
-	 * The "code segment" register (cs) refers to the kernel code segment
-	 * __KERNEL_CS, and the "data", "extra" and "stack" segment registers
-	 * refer to the kernel data segment __KERNEL_DS.
-	 *
-	 * The privilege level is packed into the lower bits.  The Guest runs
-	 * at privilege level 1 (GUEST_PL).
-	 */
-	regs->ds = regs->es = regs->ss = __KERNEL_DS|GUEST_PL;
-	regs->cs = __KERNEL_CS|GUEST_PL;
-
-	/*
-	 * The "eflags" register contains miscellaneous flags.  Bit 1 (0x002)
-	 * is supposed to always be "1".  Bit 9 (0x200) controls whether
-	 * interrupts are enabled.  We always leave interrupts enabled while
-	 * running the Guest.
-	 */
-	regs->eflags = X86_EFLAGS_IF | X86_EFLAGS_FIXED;
-
-	/*
-	 * The "Extended Instruction Pointer" register says where the Guest is
-	 * running.
-	 */
-	regs->eip = start;
-
-	/*
-	 * %esi points to our boot information, at physical address 0, so don't
-	 * touch it.
-	 */
-
-	/* There are a couple of GDT entries the Guest expects at boot. */
-	setup_guest_gdt(cpu);
-}
diff --git a/drivers/lguest/x86/switcher_32.S b/drivers/lguest/x86/switcher_32.S
deleted file mode 100644
index 40634b0db9f7..000000000000
--- a/drivers/lguest/x86/switcher_32.S
+++ /dev/null
@@ -1,388 +0,0 @@
-/*P:900
- * This is the Switcher: code which sits at 0xFFC00000 (or 0xFFE00000) astride
- * both the Host and Guest to do the low-level Guest<->Host switch.  It is as
- * simple as it can be made, but it's naturally very specific to x86.
- *
- * You have now completed Preparation.  If this has whet your appetite; if you
- * are feeling invigorated and refreshed then the next, more challenging stage
- * can be found in "make Guest".
- :*/
-
-/*M:012
- * Lguest is meant to be simple: my rule of thumb is that 1% more LOC must
- * gain at least 1% more performance.  Since neither LOC nor performance can be
- * measured beforehand, it generally means implementing a feature then deciding
- * if it's worth it.  And once it's implemented, who can say no?
- *
- * This is why I haven't implemented this idea myself.  I want to, but I
- * haven't.  You could, though.
- *
- * The main place where lguest performance sucks is Guest page faulting.  When
- * a Guest userspace process hits an unmapped page we switch back to the Host,
- * walk the page tables, find it's not mapped, switch back to the Guest page
- * fault handler, which calls a hypercall to set the page table entry, then
- * finally returns to userspace.  That's two round-trips.
- *
- * If we had a small walker in the Switcher, we could quickly check the Guest
- * page table and if the page isn't mapped, immediately reflect the fault back
- * into the Guest.  This means the Switcher would have to know the top of the
- * Guest page table and the page fault handler address.
- *
- * For simplicity, the Guest should only handle the case where the privilege
- * level of the fault is 3 and probably only not present or write faults.  It
- * should also detect recursive faults, and hand the original fault to the
- * Host (which is actually really easy).
- *
- * Two questions remain.  Would the performance gain outweigh the complexity?
- * And who would write the verse documenting it?
-:*/
-
-/*M:011
- * Lguest64 handles NMI.  This gave me NMI envy (until I looked at their
- * code).  It's worth doing though, since it would let us use oprofile in the
- * Host when a Guest is running.
-:*/
-
-/*S:100
- * Welcome to the Switcher itself!
- *
- * This file contains the low-level code which changes the CPU to run the Guest
- * code, and returns to the Host when something happens.  Understand this, and
- * you understand the heart of our journey.
- *
- * Because this is in assembler rather than C, our tale switches from prose to
- * verse.  First I tried limericks:
- *
- *	There once was an eax reg,
- *	To which our pointer was fed,
- *	It needed an add,
- *	Which asm-offsets.h had
- *	But this limerick is hurting my head.
- *
- * Next I tried haikus, but fitting the required reference to the seasons in
- * every stanza was quickly becoming tiresome:
- *
- *	The %eax reg
- *	Holds "struct lguest_pages" now:
- *	Cherry blossoms fall.
- *
- * Then I started with Heroic Verse, but the rhyming requirement leeched away
- * the content density and led to some uniquely awful oblique rhymes:
- *
- *	These constants are coming from struct offsets
- *	For use within the asm switcher text.
- *
- * Finally, I settled for something between heroic hexameter, and normal prose
- * with inappropriate linebreaks.  Anyway, it aint no Shakespeare.
- */
-
-// Not all kernel headers work from assembler
-// But these ones are needed: the ENTRY() define
-// And constants extracted from struct offsets
-// To avoid magic numbers and breakage:
-// Should they change the compiler can't save us
-// Down here in the depths of assembler code.
-#include <linux/linkage.h>
-#include <asm/asm-offsets.h>
-#include <asm/page.h>
-#include <asm/segment.h>
-#include <asm/lguest.h>
-
-// We mark the start of the code to copy
-// It's placed in .text tho it's never run here
-// You'll see the trick macro at the end
-// Which interleaves data and text to effect.
-.text
-ENTRY(start_switcher_text)
-
-// When we reach switch_to_guest we have just left
-// The safe and comforting shores of C code
-// %eax has the "struct lguest_pages" to use
-// Where we save state and still see it from the Guest
-// And %ebx holds the Guest shadow pagetable:
-// Once set we have truly left Host behind.
-ENTRY(switch_to_guest)
-	// We told gcc all its regs could fade,
-	// Clobbered by our journey into the Guest
-	// We could have saved them, if we tried
-	// But time is our master and cycles count.
-
-	// Segment registers must be saved for the Host
-	// We push them on the Host stack for later
-	pushl	%es
-	pushl	%ds
-	pushl	%gs
-	pushl	%fs
-	// But the compiler is fickle, and heeds
-	// No warning of %ebp clobbers
-	// When frame pointers are used.  That register
-	// Must be saved and restored or chaos strikes.
-	pushl	%ebp
-	// The Host's stack is done, now save it away
-	// In our "struct lguest_pages" at offset
-	// Distilled into asm-offsets.h
-	movl	%esp, LGUEST_PAGES_host_sp(%eax)
-
-	// All saved and there's now five steps before us:
-	// Stack, GDT, IDT, TSS
-	// Then last of all the page tables are flipped.
-
-	// Yet beware that our stack pointer must be
-	// Always valid lest an NMI hits
-	// %edx does the duty here as we juggle
-	// %eax is lguest_pages: our stack lies within.
-	movl	%eax, %edx
-	addl	$LGUEST_PAGES_regs, %edx
-	movl	%edx, %esp
-
-	// The Guest's GDT we so carefully
-	// Placed in the "struct lguest_pages" before
-	lgdt	LGUEST_PAGES_guest_gdt_desc(%eax)
-
-	// The Guest's IDT we did partially
-	// Copy to "struct lguest_pages" as well.
-	lidt	LGUEST_PAGES_guest_idt_desc(%eax)
-
-	// The TSS entry which controls traps
-	// Must be loaded up with "ltr" now:
-	// The GDT entry that TSS uses 
-	// Changes type when we load it: damn Intel!
-	// For after we switch over our page tables
-	// That entry will be read-only: we'd crash.
-	movl	$(GDT_ENTRY_TSS*8), %edx
-	ltr	%dx
-
-	// Look back now, before we take this last step!
-	// The Host's TSS entry was also marked used;
-	// Let's clear it again for our return.
-	// The GDT descriptor of the Host
-	// Points to the table after two "size" bytes
-	movl	(LGUEST_PAGES_host_gdt_desc+2)(%eax), %edx
-	// Clear "used" from type field (byte 5, bit 2)
-	andb	$0xFD, (GDT_ENTRY_TSS*8 + 5)(%edx)
-
-	// Once our page table's switched, the Guest is live!
-	// The Host fades as we run this final step.
-	// Our "struct lguest_pages" is now read-only.
-	movl	%ebx, %cr3
-
-	// The page table change did one tricky thing:
-	// The Guest's register page has been mapped
-	// Writable under our %esp (stack) --
-	// We can simply pop off all Guest regs.
-	popl	%eax
-	popl	%ebx
-	popl	%ecx
-	popl	%edx
-	popl	%esi
-	popl	%edi
-	popl	%ebp
-	popl	%gs
-	popl	%fs
-	popl	%ds
-	popl	%es
-
-	// Near the base of the stack lurk two strange fields
-	// Which we fill as we exit the Guest
-	// These are the trap number and its error
-	// We can simply step past them on our way.
-	addl	$8, %esp
-
-	// The last five stack slots hold return address
-	// And everything needed to switch privilege
-	// From Switcher's level 0 to Guest's 1,
-	// And the stack where the Guest had last left it.
-	// Interrupts are turned back on: we are Guest.
-	iret
-
-// We tread two paths to switch back to the Host
-// Yet both must save Guest state and restore Host
-// So we put the routine in a macro.
-#define SWITCH_TO_HOST							\
-	/* We save the Guest state: all registers first			\
-	 * Laid out just as "struct lguest_regs" defines */		\
-	pushl	%es;							\
-	pushl	%ds;							\
-	pushl	%fs;							\
-	pushl	%gs;							\
-	pushl	%ebp;							\
-	pushl	%edi;							\
-	pushl	%esi;							\
-	pushl	%edx;							\
-	pushl	%ecx;							\
-	pushl	%ebx;							\
-	pushl	%eax;							\
-	/* Our stack and our code are using segments			\
-	 * Set in the TSS and IDT					\
-	 * Yet if we were to touch data we'd use			\
-	 * Whatever data segment the Guest had.				\
-	 * Load the lguest ds segment for now. */			\
-	movl	$(LGUEST_DS), %eax;					\
-	movl	%eax, %ds;						\
-	/* So where are we?  Which CPU, which struct?			\
-	 * The stack is our clue: our TSS starts			\
-	 * It at the end of "struct lguest_pages".			\
-	 * Or we may have stumbled while restoring			\
-	 * Our Guest segment regs while in switch_to_guest,		\
-	 * The fault pushed atop that part-unwound stack.		\
-	 * If we round the stack down to the page start			\
-	 * We're at the start of "struct lguest_pages". */		\
-	movl	%esp, %eax;						\
-	andl	$(~(1 << PAGE_SHIFT - 1)), %eax;			\
-	/* Save our trap number: the switch will obscure it		\
-	 * (In the Host the Guest regs are not mapped here)		\
-	 * %ebx holds it safe for deliver_to_host */			\
-	movl	LGUEST_PAGES_regs_trapnum(%eax), %ebx;			\
-	/* The Host GDT, IDT and stack!					\
-	 * All these lie safely hidden from the Guest:			\
-	 * We must return to the Host page tables			\
-	 * (Hence that was saved in struct lguest_pages) */		\
-	movl	LGUEST_PAGES_host_cr3(%eax), %edx;			\
-	movl	%edx, %cr3;						\
-	/* As before, when we looked back at the Host			\
-	 * As we left and marked TSS unused				\
-	 * So must we now for the Guest left behind. */			\
-	andb	$0xFD, (LGUEST_PAGES_guest_gdt+GDT_ENTRY_TSS*8+5)(%eax); \
-	/* Switch to Host's GDT, IDT. */				\
-	lgdt	LGUEST_PAGES_host_gdt_desc(%eax);			\
-	lidt	LGUEST_PAGES_host_idt_desc(%eax);			\
-	/* Restore the Host's stack where its saved regs lie */		\
-	movl	LGUEST_PAGES_host_sp(%eax), %esp;			\
-	/* Last the TSS: our Host is returned */			\
-	movl	$(GDT_ENTRY_TSS*8), %edx;				\
-	ltr	%dx;							\
-	/* Restore now the regs saved right at the first. */		\
-	popl	%ebp;							\
-	popl	%fs;							\
-	popl	%gs;							\
-	popl	%ds;							\
-	popl	%es
-
-// The first path is trod when the Guest has trapped:
-// (Which trap it was has been pushed on the stack).
-// We need only switch back, and the Host will decode
-// Why we came home, and what needs to be done.
-return_to_host:
-	SWITCH_TO_HOST
-	iret
-
-// We are lead to the second path like so:
-// An interrupt, with some cause external
-// Has ajerked us rudely from the Guest's code
-// Again we must return home to the Host
-deliver_to_host:
-	SWITCH_TO_HOST
-	// But now we must go home via that place
-	// Where that interrupt was supposed to go
-	// Had we not been ensconced, running the Guest.
-	// Here we see the trickness of run_guest_once():
-	// The Host stack is formed like an interrupt
-	// With EIP, CS and EFLAGS layered.
-	// Interrupt handlers end with "iret"
-	// And that will take us home at long long last.
-
-	// But first we must find the handler to call!
-	// The IDT descriptor for the Host
-	// Has two bytes for size, and four for address:
-	// %edx will hold it for us for now.
-	movl	(LGUEST_PAGES_host_idt_desc+2)(%eax), %edx
-	// We now know the table address we need,
-	// And saved the trap's number inside %ebx.
-	// Yet the pointer to the handler is smeared
-	// Across the bits of the table entry.
-	// What oracle can tell us how to extract
-	// From such a convoluted encoding?
-	// I consulted gcc, and it gave
-	// These instructions, which I gladly credit:
-	leal	(%edx,%ebx,8), %eax
-	movzwl	(%eax),%edx
-	movl	4(%eax), %eax
-	xorw	%ax, %ax
-	orl	%eax, %edx
-	// Now the address of the handler's in %edx
-	// We call it now: its "iret" drops us home.
-	jmp	*%edx
-
-// Every interrupt can come to us here
-// But we must truly tell each apart.
-// They number two hundred and fifty six
-// And each must land in a different spot,
-// Push its number on stack, and join the stream.
-
-// And worse, a mere six of the traps stand apart
-// And push on their stack an addition:
-// An error number, thirty two bits long
-// So we punish the other two fifty
-// And make them push a zero so they match.
-
-// Yet two fifty six entries is long
-// And all will look most the same as the last
-// So we create a macro which can make
-// As many entries as we need to fill.
-
-// Note the change to .data then .text:
-// We plant the address of each entry
-// Into a (data) table for the Host
-// To know where each Guest interrupt should go.
-.macro IRQ_STUB N TARGET
-	.data; .long 1f; .text; 1:
- // Trap eight, ten through fourteen and seventeen
- // Supply an error number.  Else zero.
- .if (\N <> 8) && (\N < 10 || \N > 14) && (\N <> 17)
-	pushl	$0
- .endif
-	pushl	$\N
-	jmp	\TARGET
-	ALIGN
-.endm
-
-// This macro creates numerous entries
-// Using GAS macros which out-power C's.
-.macro IRQ_STUBS FIRST LAST TARGET
- irq=\FIRST
- .rept \LAST-\FIRST+1
-	IRQ_STUB irq \TARGET
-  irq=irq+1
- .endr
-.endm
-
-// Here's the marker for our pointer table
-// Laid in the data section just before
-// Each macro places the address of code
-// Forming an array: each one points to text
-// Which handles interrupt in its turn.
-.data
-.global default_idt_entries
-default_idt_entries:
-.text
-	// The first two traps go straight back to the Host
-	IRQ_STUBS 0 1 return_to_host
-	// We'll say nothing, yet, about NMI
-	IRQ_STUB 2 handle_nmi
-	// Other traps also return to the Host
-	IRQ_STUBS 3 31 return_to_host
-	// All interrupts go via their handlers
-	IRQ_STUBS 32 127 deliver_to_host
-	// 'Cept system calls coming from userspace
-	// Are to go to the Guest, never the Host.
-	IRQ_STUB 128 return_to_host
-	IRQ_STUBS 129 255 deliver_to_host
-
-// The NMI, what a fabulous beast
-// Which swoops in and stops us no matter that
-// We're suspended between heaven and hell,
-// (Or more likely between the Host and Guest)
-// When in it comes!  We are dazed and confused
-// So we do the simplest thing which one can.
-// Though we've pushed the trap number and zero
-// We discard them, return, and hope we live.
-handle_nmi:
-	addl	$8, %esp
-	iret
-
-// We are done; all that's left is Mastery
-// And "make Mastery" is a journey long
-// Designed to make your fingers itch to code.
-
-// Here ends the text, the file and poem.
-ENTRY(end_switcher_text)
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 83a1616903f8..aba0d652095b 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -333,7 +333,7 @@ config VIRTIO_NET
 	depends on VIRTIO
 	---help---
 	  This is the virtual network driver for virtio.  It can be used with
-	  lguest or QEMU based VMMs (like KVM or Xen).  Say Y or M.
+	  QEMU based VMMs (like KVM or Xen).  Say Y or M.
 
 config NLMON
 	tristate "Virtual netlink monitoring device"
diff --git a/drivers/tty/hvc/Kconfig b/drivers/tty/hvc/Kconfig
index b8d5ea0ae26b..fec457edad14 100644
--- a/drivers/tty/hvc/Kconfig
+++ b/drivers/tty/hvc/Kconfig
@@ -4,7 +4,7 @@ config HVC_DRIVER
 	bool
 	help
 	  Generic "hypervisor virtual console" infrastructure for various
-	  hypervisors (pSeries, iSeries, Xen, lguest).
+	  hypervisors (pSeries, iSeries, Xen).
 	  It will automatically be selected if one of the back-end console drivers
 	  is selected.
 
diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
index 623f72334fa5..cff773f15b7e 100644
--- a/drivers/virtio/Kconfig
+++ b/drivers/virtio/Kconfig
@@ -2,8 +2,8 @@ config VIRTIO
 	tristate
 	---help---
 	  This option is selected by any driver which implements the virtio
-	  bus, such as CONFIG_VIRTIO_PCI, CONFIG_VIRTIO_MMIO, CONFIG_LGUEST,
-	  CONFIG_RPMSG or CONFIG_S390_GUEST.
+	  bus, such as CONFIG_VIRTIO_PCI, CONFIG_VIRTIO_MMIO, CONFIG_RPMSG
+	  or CONFIG_S390_GUEST.
 
 menu "Virtio drivers"
 
diff --git a/include/linux/lguest.h b/include/linux/lguest.h
deleted file mode 100644
index 6db19f35f7c5..000000000000
--- a/include/linux/lguest.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Things the lguest guest needs to know.  Note: like all lguest interfaces,
- * this is subject to wild and random change between versions.
- */
-#ifndef _LINUX_LGUEST_H
-#define _LINUX_LGUEST_H
-
-#ifndef __ASSEMBLY__
-#include <linux/time.h>
-#include <asm/irq.h>
-#include <asm/lguest_hcall.h>
-
-#define LG_CLOCK_MIN_DELTA	100UL
-#define LG_CLOCK_MAX_DELTA	ULONG_MAX
-
-/*G:031
- * The second method of communicating with the Host is to via "struct
- * lguest_data".  Once the Guest's initialization hypercall tells the Host where
- * this is, the Guest and Host both publish information in it.
-:*/
-struct lguest_data {
-	/*
-	 * 512 == enabled (same as eflags in normal hardware).  The Guest
-	 * changes interrupts so often that a hypercall is too slow.
-	 */
-	unsigned int irq_enabled;
-	/* Fine-grained interrupt disabling by the Guest */
-	DECLARE_BITMAP(blocked_interrupts, LGUEST_IRQS);
-
-	/*
-	 * The Host writes the virtual address of the last page fault here,
-	 * which saves the Guest a hypercall.  CR2 is the native register where
-	 * this address would normally be found.
-	 */
-	unsigned long cr2;
-
-	/* Wallclock time set by the Host. */
-	struct timespec time;
-
-	/*
-	 * Interrupt pending set by the Host.  The Guest should do a hypercall
-	 * if it re-enables interrupts and sees this set (to X86_EFLAGS_IF).
-	 */
-	int irq_pending;
-
-	/*
-	 * Async hypercall ring.  Instead of directly making hypercalls, we can
-	 * place them in here for processing the next time the Host wants.
-	 * This batching can be quite efficient.
-	 */
-
-	/* 0xFF == done (set by Host), 0 == pending (set by Guest). */
-	u8 hcall_status[LHCALL_RING_SIZE];
-	/* The actual registers for the hypercalls. */
-	struct hcall_args hcalls[LHCALL_RING_SIZE];
-
-/* Fields initialized by the Host at boot: */
-	/* Memory not to try to access */
-	unsigned long reserve_mem;
-	/* KHz for the TSC clock. */
-	u32 tsc_khz;
-
-/* Fields initialized by the Guest at boot: */
-	/* Instruction to suppress interrupts even if enabled */
-	unsigned long noirq_iret;
-	/* Address above which page tables are all identical. */
-	unsigned long kernel_address;
-	/* The vector to try to use for system calls (0x40 or 0x80). */
-	unsigned int syscall_vec;
-};
-extern struct lguest_data lguest_data;
-#endif /* __ASSEMBLY__ */
-#endif	/* _LINUX_LGUEST_H */
diff --git a/include/linux/lguest_launcher.h b/include/linux/lguest_launcher.h
deleted file mode 100644
index acd5b12565cc..000000000000
--- a/include/linux/lguest_launcher.h
+++ /dev/null
@@ -1,44 +0,0 @@
-#ifndef _LINUX_LGUEST_LAUNCHER
-#define _LINUX_LGUEST_LAUNCHER
-/* Everything the "lguest" userspace program needs to know. */
-#include <linux/types.h>
-
-/*D:010
- * Drivers
- *
- * The Guest needs devices to do anything useful.  Since we don't let it touch
- * real devices (think of the damage it could do!) we provide virtual devices.
- * We emulate a PCI bus with virtio devices on it; we used to have our own
- * lguest bus which was far simpler, but this tests the virtio 1.0 standard.
- *
- * Virtio devices are also used by kvm, so we can simply reuse their optimized
- * device drivers.  And one day when everyone uses virtio, my plan will be
- * complete.  Bwahahahah!
- */
-
-/* Write command first word is a request. */
-enum lguest_req
-{
-	LHREQ_INITIALIZE, /* + base, pfnlimit, start */
-	LHREQ_GETDMA, /* No longer used */
-	LHREQ_IRQ, /* + irq */
-	LHREQ_BREAK, /* No longer used */
-	LHREQ_EVENTFD, /* No longer used. */
-	LHREQ_GETREG, /* + offset within struct pt_regs (then read value). */
-	LHREQ_SETREG, /* + offset within struct pt_regs, value. */
-	LHREQ_TRAP, /* + trap number to deliver to guest. */
-};
-
-/*
- * This is what read() of the lguest fd populates.  trap ==
- * LGUEST_TRAP_ENTRY for an LHCALL_NOTIFY (addr is the
- * argument), 14 for a page fault in the MMIO region (addr is
- * the trap address, insn is the instruction), or 13 for a GPF
- * (insn is the instruction).
- */
-struct lguest_pending {
-	__u8 trap;
-	__u8 insn[7];
-	__u32 addr;
-};
-#endif /* _LINUX_LGUEST_LAUNCHER */
diff --git a/include/uapi/linux/virtio_ring.h b/include/uapi/linux/virtio_ring.h
index c07295969b7e..6d5d5faa989b 100644
--- a/include/uapi/linux/virtio_ring.h
+++ b/include/uapi/linux/virtio_ring.h
@@ -1,7 +1,7 @@
 #ifndef _UAPI_LINUX_VIRTIO_RING_H
 #define _UAPI_LINUX_VIRTIO_RING_H
-/* An interface for efficient virtio implementation, currently for use by KVM
- * and lguest, but hopefully others soon.  Do NOT change this since it will
+/* An interface for efficient virtio implementation, currently for use by KVM,
+ * but hopefully others soon.  Do NOT change this since it will
  * break existing servers and clients.
  *
  * This header is BSD licensed so anyone can use the definitions to implement
diff --git a/tools/Makefile b/tools/Makefile
index 221e1ce78b06..a19b176b914b 100644
--- a/tools/Makefile
+++ b/tools/Makefile
@@ -18,7 +18,6 @@ help:
 	@echo '  iio                    - IIO tools'
 	@echo '  kvm_stat               - top-like utility for displaying kvm statistics'
 	@echo '  leds                   - LEDs  tools'
-	@echo '  lguest                 - a minimal 32-bit x86 hypervisor'
 	@echo '  liblockdep             - user-space wrapper for kernel locking-validator'
 	@echo '  net                    - misc networking tools'
 	@echo '  perf                   - Linux performance measurement and analysis tool'
@@ -90,7 +89,7 @@ freefall: FORCE
 kvm_stat: FORCE
 	$(call descend,kvm/$@)
 
-all: acpi cgroup cpupower gpio hv firewire lguest liblockdep \
+all: acpi cgroup cpupower gpio hv firewire liblockdep \
 		perf selftests turbostat usb \
 		virtio vm net x86_energy_perf_policy \
 		tmon freefall objtool kvm_stat
@@ -101,7 +100,7 @@ acpi_install:
 cpupower_install:
 	$(call descend,power/$(@:_install=),install)
 
-cgroup_install firewire_install gpio_install hv_install lguest_install perf_install usb_install virtio_install vm_install net_install objtool_install:
+cgroup_install firewire_install gpio_install hv_install perf_install usb_install virtio_install vm_install net_install objtool_install:
 	$(call descend,$(@:_install=),install)
 
 liblockdep_install:
@@ -123,7 +122,7 @@ kvm_stat_install:
 	$(call descend,kvm/$(@:_install=),install)
 
 install: acpi_install cgroup_install cpupower_install gpio_install \
-		hv_install firewire_install lguest_install liblockdep_install \
+		hv_install firewire_install liblockdep_install \
 		perf_install selftests_install turbostat_install usb_install \
 		virtio_install vm_install net_install x86_energy_perf_policy_install \
 		tmon_install freefall_install objtool_install kvm_stat_install
@@ -134,7 +133,7 @@ acpi_clean:
 cpupower_clean:
 	$(call descend,power/cpupower,clean)
 
-cgroup_clean hv_clean firewire_clean lguest_clean spi_clean usb_clean virtio_clean vm_clean net_clean iio_clean gpio_clean objtool_clean leds_clean:
+cgroup_clean hv_clean firewire_clean spi_clean usb_clean virtio_clean vm_clean net_clean iio_clean gpio_clean objtool_clean leds_clean:
 	$(call descend,$(@:_clean=),clean)
 
 liblockdep_clean:
@@ -168,7 +167,7 @@ freefall_clean:
 build_clean:
 	$(call descend,build,clean)
 
-clean: acpi_clean cgroup_clean cpupower_clean hv_clean firewire_clean lguest_clean \
+clean: acpi_clean cgroup_clean cpupower_clean hv_clean firewire_clean \
 		perf_clean selftests_clean turbostat_clean spi_clean usb_clean virtio_clean \
 		vm_clean net_clean iio_clean x86_energy_perf_policy_clean tmon_clean \
 		freefall_clean build_clean libbpf_clean libsubcmd_clean liblockdep_clean \
diff --git a/tools/lguest/.gitignore b/tools/lguest/.gitignore
deleted file mode 100644
index 8d9a8383a52e..000000000000
--- a/tools/lguest/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-lguest
-include
diff --git a/tools/lguest/Makefile b/tools/lguest/Makefile
deleted file mode 100644
index d04599a79802..000000000000
--- a/tools/lguest/Makefile
+++ /dev/null
@@ -1,14 +0,0 @@
-# This creates the demonstration utility "lguest" which runs a Linux guest.
-CFLAGS:=-m32 -Wall -Wmissing-declarations -Wmissing-prototypes -O3 -U_FORTIFY_SOURCE -Iinclude
-
-all: lguest
-
-include/linux/virtio_types.h: ../../include/uapi/linux/virtio_types.h
-	mkdir -p include/linux 2>&1 || true
-	ln -sf ../../../../include/uapi/linux/virtio_types.h $@
-
-lguest: include/linux/virtio_types.h
-
-clean:
-	rm -f lguest
-	rm -rf include
diff --git a/tools/lguest/extract b/tools/lguest/extract
deleted file mode 100644
index 7730bb6e4b94..000000000000
--- a/tools/lguest/extract
+++ /dev/null
@@ -1,58 +0,0 @@
-#! /bin/sh
-
-set -e
-
-PREFIX=$1
-shift
-
-trap 'rm -r $TMPDIR' 0
-TMPDIR=`mktemp -d`
-
-exec 3>/dev/null
-for f; do
-    while IFS="
-" read -r LINE; do
-	case "$LINE" in
-	    *$PREFIX:[0-9]*:\**)
-		NUM=`echo "$LINE" | sed "s/.*$PREFIX:\([0-9]*\).*/\1/"`
-		if [ -f $TMPDIR/$NUM ]; then
-		    echo "$TMPDIR/$NUM already exits prior to $f"
-		    exit 1
-		fi
-		exec 3>>$TMPDIR/$NUM
-		echo $f | sed 's,\.\./,,g' > $TMPDIR/.$NUM
-		/bin/echo "$LINE" | sed -e "s/$PREFIX:[0-9]*//" -e "s/:\*/*/" >&3
-		;;
-	    *$PREFIX:[0-9]*)
-		NUM=`echo "$LINE" | sed "s/.*$PREFIX:\([0-9]*\).*/\1/"`
-		if [ -f $TMPDIR/$NUM ]; then
-		    echo "$TMPDIR/$NUM already exits prior to $f"
-		    exit 1
-		fi
-		exec 3>>$TMPDIR/$NUM
-		echo $f | sed 's,\.\./,,g' > $TMPDIR/.$NUM
-		/bin/echo "$LINE" | sed "s/$PREFIX:[0-9]*//" >&3
-		;;
-	    *:\**)
-		/bin/echo "$LINE" | sed -e "s/:\*/*/" -e "s,/\*\*/,," >&3
-		echo >&3
-		exec 3>/dev/null
-		;;
-	    *)
-		/bin/echo "$LINE" >&3
-		;;
-	esac
-    done < $f
-    echo >&3
-    exec 3>/dev/null
-done
-
-LASTFILE=""
-for f in $TMPDIR/*; do
-    if [ "$LASTFILE" != $(cat $TMPDIR/.$(basename $f) ) ]; then
-	LASTFILE=$(cat $TMPDIR/.$(basename $f) )
-	echo "[ $LASTFILE ]"
-    fi
-    cat $f
-done
-
diff --git a/tools/lguest/lguest.c b/tools/lguest/lguest.c
deleted file mode 100644
index 897cd6f3f687..000000000000
--- a/tools/lguest/lguest.c
+++ /dev/null
@@ -1,3420 +0,0 @@
-/*P:100
- * This is the Launcher code, a simple program which lays out the "physical"
- * memory for the new Guest by mapping the kernel image and the virtual
- * devices, then opens /dev/lguest to tell the kernel about the Guest and
- * control it.
-:*/
-#define _LARGEFILE64_SOURCE
-#define _GNU_SOURCE
-#include <stdio.h>
-#include <string.h>
-#include <unistd.h>
-#include <err.h>
-#include <stdint.h>
-#include <stdlib.h>
-#include <elf.h>
-#include <sys/mman.h>
-#include <sys/param.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/wait.h>
-#include <sys/eventfd.h>
-#include <fcntl.h>
-#include <stdbool.h>
-#include <errno.h>
-#include <ctype.h>
-#include <sys/socket.h>
-#include <sys/ioctl.h>
-#include <sys/time.h>
-#include <time.h>
-#include <netinet/in.h>
-#include <net/if.h>
-#include <linux/sockios.h>
-#include <linux/if_tun.h>
-#include <sys/uio.h>
-#include <termios.h>
-#include <getopt.h>
-#include <assert.h>
-#include <sched.h>
-#include <limits.h>
-#include <stddef.h>
-#include <signal.h>
-#include <pwd.h>
-#include <grp.h>
-#include <sys/user.h>
-#include <linux/pci_regs.h>
-
-#ifndef VIRTIO_F_ANY_LAYOUT
-#define VIRTIO_F_ANY_LAYOUT		27
-#endif
-
-/*L:110
- * We can ignore the 43 include files we need for this program, but I do want
- * to draw attention to the use of kernel-style types.
- *
- * As Linus said, "C is a Spartan language, and so should your naming be."  I
- * like these abbreviations, so we define them here.  Note that u64 is always
- * unsigned long long, which works on all Linux systems: this means that we can
- * use %llu in printf for any u64.
- */
-typedef unsigned long long u64;
-typedef uint32_t u32;
-typedef uint16_t u16;
-typedef uint8_t u8;
-/*:*/
-
-#define VIRTIO_CONFIG_NO_LEGACY
-#define VIRTIO_PCI_NO_LEGACY
-#define VIRTIO_BLK_NO_LEGACY
-#define VIRTIO_NET_NO_LEGACY
-
-/* Use in-kernel ones, which defines VIRTIO_F_VERSION_1 */
-#include "../../include/uapi/linux/virtio_config.h"
-#include "../../include/uapi/linux/virtio_net.h"
-#include "../../include/uapi/linux/virtio_blk.h"
-#include "../../include/uapi/linux/virtio_console.h"
-#include "../../include/uapi/linux/virtio_rng.h"
-#include <linux/virtio_ring.h>
-#include "../../include/uapi/linux/virtio_pci.h"
-#include <asm/bootparam.h>
-#include "../../include/linux/lguest_launcher.h"
-
-#define BRIDGE_PFX "bridge:"
-#ifndef SIOCBRADDIF
-#define SIOCBRADDIF	0x89a2		/* add interface to bridge      */
-#endif
-/* We can have up to 256 pages for devices. */
-#define DEVICE_PAGES 256
-/* This will occupy 3 pages: it must be a power of 2. */
-#define VIRTQUEUE_NUM 256
-
-/*L:120
- * verbose is both a global flag and a macro.  The C preprocessor allows
- * this, and although I wouldn't recommend it, it works quite nicely here.
- */
-static bool verbose;
-#define verbose(args...) \
-	do { if (verbose) printf(args); } while(0)
-/*:*/
-
-/* The pointer to the start of guest memory. */
-static void *guest_base;
-/* The maximum guest physical address allowed, and maximum possible. */
-static unsigned long guest_limit, guest_max, guest_mmio;
-/* The /dev/lguest file descriptor. */
-static int lguest_fd;
-
-/* a per-cpu variable indicating whose vcpu is currently running */
-static unsigned int __thread cpu_id;
-
-/* 5 bit device number in the PCI_CONFIG_ADDR => 32 only */
-#define MAX_PCI_DEVICES 32
-
-/* This is our list of devices. */
-struct device_list {
-	/* Counter to assign interrupt numbers. */
-	unsigned int next_irq;
-
-	/* Counter to print out convenient device numbers. */
-	unsigned int device_num;
-
-	/* PCI devices. */
-	struct device *pci[MAX_PCI_DEVICES];
-};
-
-/* The list of Guest devices, based on command line arguments. */
-static struct device_list devices;
-
-/*
- * Just like struct virtio_pci_cfg_cap in uapi/linux/virtio_pci.h,
- * but uses a u32 explicitly for the data.
- */
-struct virtio_pci_cfg_cap_u32 {
-	struct virtio_pci_cap cap;
-	u32 pci_cfg_data; /* Data for BAR access. */
-};
-
-struct virtio_pci_mmio {
-	struct virtio_pci_common_cfg cfg;
-	u16 notify;
-	u8 isr;
-	u8 padding;
-	/* Device-specific configuration follows this. */
-};
-
-/* This is the layout (little-endian) of the PCI config space. */
-struct pci_config {
-	u16 vendor_id, device_id;
-	u16 command, status;
-	u8 revid, prog_if, subclass, class;
-	u8 cacheline_size, lat_timer, header_type, bist;
-	u32 bar[6];
-	u32 cardbus_cis_ptr;
-	u16 subsystem_vendor_id, subsystem_device_id;
-	u32 expansion_rom_addr;
-	u8 capabilities, reserved1[3];
-	u32 reserved2;
-	u8 irq_line, irq_pin, min_grant, max_latency;
-
-	/* Now, this is the linked capability list. */
-	struct virtio_pci_cap common;
-	struct virtio_pci_notify_cap notify;
-	struct virtio_pci_cap isr;
-	struct virtio_pci_cap device;
-	struct virtio_pci_cfg_cap_u32 cfg_access;
-};
-
-/* The device structure describes a single device. */
-struct device {
-	/* The name of this device, for --verbose. */
-	const char *name;
-
-	/* Any queues attached to this device */
-	struct virtqueue *vq;
-
-	/* Is it operational */
-	bool running;
-
-	/* Has it written FEATURES_OK but not re-checked it? */
-	bool wrote_features_ok;
-
-	/* PCI configuration */
-	union {
-		struct pci_config config;
-		u32 config_words[sizeof(struct pci_config) / sizeof(u32)];
-	};
-
-	/* Features we offer, and those accepted. */
-	u64 features, features_accepted;
-
-	/* Device-specific config hangs off the end of this. */
-	struct virtio_pci_mmio *mmio;
-
-	/* PCI MMIO resources (all in BAR0) */
-	size_t mmio_size;
-	u32 mmio_addr;
-
-	/* Device-specific data. */
-	void *priv;
-};
-
-/* The virtqueue structure describes a queue attached to a device. */
-struct virtqueue {
-	struct virtqueue *next;
-
-	/* Which device owns me. */
-	struct device *dev;
-
-	/* Name for printing errors. */
-	const char *name;
-
-	/* The actual ring of buffers. */
-	struct vring vring;
-
-	/* The information about this virtqueue (we only use queue_size on) */
-	struct virtio_pci_common_cfg pci_config;
-
-	/* Last available index we saw. */
-	u16 last_avail_idx;
-
-	/* How many are used since we sent last irq? */
-	unsigned int pending_used;
-
-	/* Eventfd where Guest notifications arrive. */
-	int eventfd;
-
-	/* Function for the thread which is servicing this virtqueue. */
-	void (*service)(struct virtqueue *vq);
-	pid_t thread;
-};
-
-/* Remember the arguments to the program so we can "reboot" */
-static char **main_args;
-
-/* The original tty settings to restore on exit. */
-static struct termios orig_term;
-
-/*
- * We have to be careful with barriers: our devices are all run in separate
- * threads and so we need to make sure that changes visible to the Guest happen
- * in precise order.
- */
-#define wmb() __asm__ __volatile__("" : : : "memory")
-#define rmb() __asm__ __volatile__("lock; addl $0,0(%%esp)" : : : "memory")
-#define mb() __asm__ __volatile__("lock; addl $0,0(%%esp)" : : : "memory")
-
-/* Wrapper for the last available index.  Makes it easier to change. */
-#define lg_last_avail(vq)	((vq)->last_avail_idx)
-
-/*
- * The virtio configuration space is defined to be little-endian.  x86 is
- * little-endian too, but it's nice to be explicit so we have these helpers.
- */
-#define cpu_to_le16(v16) (v16)
-#define cpu_to_le32(v32) (v32)
-#define cpu_to_le64(v64) (v64)
-#define le16_to_cpu(v16) (v16)
-#define le32_to_cpu(v32) (v32)
-#define le64_to_cpu(v64) (v64)
-
-/*
- * A real device would ignore weird/non-compliant driver behaviour.  We
- * stop and flag it, to help debugging Linux problems.
- */
-#define bad_driver(d, fmt, ...) \
-	errx(1, "%s: bad driver: " fmt, (d)->name, ## __VA_ARGS__)
-#define bad_driver_vq(vq, fmt, ...)			       \
-	errx(1, "%s vq %s: bad driver: " fmt, (vq)->dev->name, \
-	     vq->name, ## __VA_ARGS__)
-
-/* Is this iovec empty? */
-static bool iov_empty(const struct iovec iov[], unsigned int num_iov)
-{
-	unsigned int i;
-
-	for (i = 0; i < num_iov; i++)
-		if (iov[i].iov_len)
-			return false;
-	return true;
-}
-
-/* Take len bytes from the front of this iovec. */
-static void iov_consume(struct device *d,
-			struct iovec iov[], unsigned num_iov,
-			void *dest, unsigned len)
-{
-	unsigned int i;
-
-	for (i = 0; i < num_iov; i++) {
-		unsigned int used;
-
-		used = iov[i].iov_len < len ? iov[i].iov_len : len;
-		if (dest) {
-			memcpy(dest, iov[i].iov_base, used);
-			dest += used;
-		}
-		iov[i].iov_base += used;
-		iov[i].iov_len -= used;
-		len -= used;
-	}
-	if (len != 0)
-		bad_driver(d, "iovec too short!");
-}
-
-/*L:100
- * The Launcher code itself takes us out into userspace, that scary place where
- * pointers run wild and free!  Unfortunately, like most userspace programs,
- * it's quite boring (which is why everyone likes to hack on the kernel!).
- * Perhaps if you make up an Lguest Drinking Game at this point, it will get
- * you through this section.  Or, maybe not.
- *
- * The Launcher sets up a big chunk of memory to be the Guest's "physical"
- * memory and stores it in "guest_base".  In other words, Guest physical ==
- * Launcher virtual with an offset.
- *
- * This can be tough to get your head around, but usually it just means that we
- * use these trivial conversion functions when the Guest gives us its
- * "physical" addresses:
- */
-static void *from_guest_phys(unsigned long addr)
-{
-	return guest_base + addr;
-}
-
-static unsigned long to_guest_phys(const void *addr)
-{
-	return (addr - guest_base);
-}
-
-/*L:130
- * Loading the Kernel.
- *
- * We start with couple of simple helper routines.  open_or_die() avoids
- * error-checking code cluttering the callers:
- */
-static int open_or_die(const char *name, int flags)
-{
-	int fd = open(name, flags);
-	if (fd < 0)
-		err(1, "Failed to open %s", name);
-	return fd;
-}
-
-/* map_zeroed_pages() takes a number of pages. */
-static void *map_zeroed_pages(unsigned int num)
-{
-	int fd = open_or_die("/dev/zero", O_RDONLY);
-	void *addr;
-
-	/*
-	 * We use a private mapping (ie. if we write to the page, it will be
-	 * copied). We allocate an extra two pages PROT_NONE to act as guard
-	 * pages against read/write attempts that exceed allocated space.
-	 */
-	addr = mmap(NULL, getpagesize() * (num+2),
-		    PROT_NONE, MAP_PRIVATE, fd, 0);
-
-	if (addr == MAP_FAILED)
-		err(1, "Mmapping %u pages of /dev/zero", num);
-
-	if (mprotect(addr + getpagesize(), getpagesize() * num,
-		     PROT_READ|PROT_WRITE) == -1)
-		err(1, "mprotect rw %u pages failed", num);
-
-	/*
-	 * One neat mmap feature is that you can close the fd, and it
-	 * stays mapped.
-	 */
-	close(fd);
-
-	/* Return address after PROT_NONE page */
-	return addr + getpagesize();
-}
-
-/* Get some bytes which won't be mapped into the guest. */
-static unsigned long get_mmio_region(size_t size)
-{
-	unsigned long addr = guest_mmio;
-	size_t i;
-
-	if (!size)
-		return addr;
-
-	/* Size has to be a power of 2 (and multiple of 16) */
-	for (i = 1; i < size; i <<= 1);
-
-	guest_mmio += i;
-
-	return addr;
-}
-
-/*
- * This routine is used to load the kernel or initrd.  It tries mmap, but if
- * that fails (Plan 9's kernel file isn't nicely aligned on page boundaries),
- * it falls back to reading the memory in.
- */
-static void map_at(int fd, void *addr, unsigned long offset, unsigned long len)
-{
-	ssize_t r;
-
-	/*
-	 * We map writable even though for some segments are marked read-only.
-	 * The kernel really wants to be writable: it patches its own
-	 * instructions.
-	 *
-	 * MAP_PRIVATE means that the page won't be copied until a write is
-	 * done to it.  This allows us to share untouched memory between
-	 * Guests.
-	 */
-	if (mmap(addr, len, PROT_READ|PROT_WRITE,
-		 MAP_FIXED|MAP_PRIVATE, fd, offset) != MAP_FAILED)
-		return;
-
-	/* pread does a seek and a read in one shot: saves a few lines. */
-	r = pread(fd, addr, len, offset);
-	if (r != len)
-		err(1, "Reading offset %lu len %lu gave %zi", offset, len, r);
-}
-
-/*
- * This routine takes an open vmlinux image, which is in ELF, and maps it into
- * the Guest memory.  ELF = Embedded Linking Format, which is the format used
- * by all modern binaries on Linux including the kernel.
- *
- * The ELF headers give *two* addresses: a physical address, and a virtual
- * address.  We use the physical address; the Guest will map itself to the
- * virtual address.
- *
- * We return the starting address.
- */
-static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr)
-{
-	Elf32_Phdr phdr[ehdr->e_phnum];
-	unsigned int i;
-
-	/*
-	 * Sanity checks on the main ELF header: an x86 executable with a
-	 * reasonable number of correctly-sized program headers.
-	 */
-	if (ehdr->e_type != ET_EXEC
-	    || ehdr->e_machine != EM_386
-	    || ehdr->e_phentsize != sizeof(Elf32_Phdr)
-	    || ehdr->e_phnum < 1 || ehdr->e_phnum > 65536U/sizeof(Elf32_Phdr))
-		errx(1, "Malformed elf header");
-
-	/*
-	 * An ELF executable contains an ELF header and a number of "program"
-	 * headers which indicate which parts ("segments") of the program to
-	 * load where.
-	 */
-
-	/* We read in all the program headers at once: */
-	if (lseek(elf_fd, ehdr->e_phoff, SEEK_SET) < 0)
-		err(1, "Seeking to program headers");
-	if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr))
-		err(1, "Reading program headers");
-
-	/*
-	 * Try all the headers: there are usually only three.  A read-only one,
-	 * a read-write one, and a "note" section which we don't load.
-	 */
-	for (i = 0; i < ehdr->e_phnum; i++) {
-		/* If this isn't a loadable segment, we ignore it */
-		if (phdr[i].p_type != PT_LOAD)
-			continue;
-
-		verbose("Section %i: size %i addr %p\n",
-			i, phdr[i].p_memsz, (void *)phdr[i].p_paddr);
-
-		/* We map this section of the file at its physical address. */
-		map_at(elf_fd, from_guest_phys(phdr[i].p_paddr),
-		       phdr[i].p_offset, phdr[i].p_filesz);
-	}
-
-	/* The entry point is given in the ELF header. */
-	return ehdr->e_entry;
-}
-
-/*L:150
- * A bzImage, unlike an ELF file, is not meant to be loaded.  You're supposed
- * to jump into it and it will unpack itself.  We used to have to perform some
- * hairy magic because the unpacking code scared me.
- *
- * Fortunately, Jeremy Fitzhardinge convinced me it wasn't that hard and wrote
- * a small patch to jump over the tricky bits in the Guest, so now we just read
- * the funky header so we know where in the file to load, and away we go!
- */
-static unsigned long load_bzimage(int fd)
-{
-	struct boot_params boot;
-	int r;
-	/* Modern bzImages get loaded at 1M. */
-	void *p = from_guest_phys(0x100000);
-
-	/*
-	 * Go back to the start of the file and read the header.  It should be
-	 * a Linux boot header (see Documentation/x86/boot.txt)
-	 */
-	lseek(fd, 0, SEEK_SET);
-	read(fd, &boot, sizeof(boot));
-
-	/* Inside the setup_hdr, we expect the magic "HdrS" */
-	if (memcmp(&boot.hdr.header, "HdrS", 4) != 0)
-		errx(1, "This doesn't look like a bzImage to me");
-
-	/* Skip over the extra sectors of the header. */
-	lseek(fd, (boot.hdr.setup_sects+1) * 512, SEEK_SET);
-
-	/* Now read everything into memory. in nice big chunks. */
-	while ((r = read(fd, p, 65536)) > 0)
-		p += r;
-
-	/* Finally, code32_start tells us where to enter the kernel. */
-	return boot.hdr.code32_start;
-}
-
-/*L:140
- * Loading the kernel is easy when it's a "vmlinux", but most kernels
- * come wrapped up in the self-decompressing "bzImage" format.  With a little
- * work, we can load those, too.
- */
-static unsigned long load_kernel(int fd)
-{
-	Elf32_Ehdr hdr;
-
-	/* Read in the first few bytes. */
-	if (read(fd, &hdr, sizeof(hdr)) != sizeof(hdr))
-		err(1, "Reading kernel");
-
-	/* If it's an ELF file, it starts with "\177ELF" */
-	if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0)
-		return map_elf(fd, &hdr);
-
-	/* Otherwise we assume it's a bzImage, and try to load it. */
-	return load_bzimage(fd);
-}
-
-/*
- * This is a trivial little helper to align pages.  Andi Kleen hated it because
- * it calls getpagesize() twice: "it's dumb code."
- *
- * Kernel guys get really het up about optimization, even when it's not
- * necessary.  I leave this code as a reaction against that.
- */
-static inline unsigned long page_align(unsigned long addr)
-{
-	/* Add upwards and truncate downwards. */
-	return ((addr + getpagesize()-1) & ~(getpagesize()-1));
-}
-
-/*L:180
- * An "initial ram disk" is a disk image loaded into memory along with the
- * kernel which the kernel can use to boot from without needing any drivers.
- * Most distributions now use this as standard: the initrd contains the code to
- * load the appropriate driver modules for the current machine.
- *
- * Importantly, James Morris works for RedHat, and Fedora uses initrds for its
- * kernels.  He sent me this (and tells me when I break it).
- */
-static unsigned long load_initrd(const char *name, unsigned long mem)
-{
-	int ifd;
-	struct stat st;
-	unsigned long len;
-
-	ifd = open_or_die(name, O_RDONLY);
-	/* fstat() is needed to get the file size. */
-	if (fstat(ifd, &st) < 0)
-		err(1, "fstat() on initrd '%s'", name);
-
-	/*
-	 * We map the initrd at the top of memory, but mmap wants it to be
-	 * page-aligned, so we round the size up for that.
-	 */
-	len = page_align(st.st_size);
-	map_at(ifd, from_guest_phys(mem - len), 0, st.st_size);
-	/*
-	 * Once a file is mapped, you can close the file descriptor.  It's a
-	 * little odd, but quite useful.
-	 */
-	close(ifd);
-	verbose("mapped initrd %s size=%lu @ %p\n", name, len, (void*)mem-len);
-
-	/* We return the initrd size. */
-	return len;
-}
-/*:*/
-
-/*
- * Simple routine to roll all the commandline arguments together with spaces
- * between them.
- */
-static void concat(char *dst, char *args[])
-{
-	unsigned int i, len = 0;
-
-	for (i = 0; args[i]; i++) {
-		if (i) {
-			strcat(dst+len, " ");
-			len++;
-		}
-		strcpy(dst+len, args[i]);
-		len += strlen(args[i]);
-	}
-	/* In case it's empty. */
-	dst[len] = '\0';
-}
-
-/*L:185
- * This is where we actually tell the kernel to initialize the Guest.  We
- * saw the arguments it expects when we looked at initialize() in lguest_user.c:
- * the base of Guest "physical" memory, the top physical page to allow and the
- * entry point for the Guest.
- */
-static void tell_kernel(unsigned long start)
-{
-	unsigned long args[] = { LHREQ_INITIALIZE,
-				 (unsigned long)guest_base,
-				 guest_limit / getpagesize(), start,
-				 (guest_mmio+getpagesize()-1) / getpagesize() };
-	verbose("Guest: %p - %p (%#lx, MMIO %#lx)\n",
-		guest_base, guest_base + guest_limit,
-		guest_limit, guest_mmio);
-	lguest_fd = open_or_die("/dev/lguest", O_RDWR);
-	if (write(lguest_fd, args, sizeof(args)) < 0)
-		err(1, "Writing to /dev/lguest");
-}
-/*:*/
-
-/*L:200
- * Device Handling.
- *
- * When the Guest gives us a buffer, it sends an array of addresses and sizes.
- * We need to make sure it's not trying to reach into the Launcher itself, so
- * we have a convenient routine which checks it and exits with an error message
- * if something funny is going on:
- */
-static void *_check_pointer(struct device *d,
-			    unsigned long addr, unsigned int size,
-			    unsigned int line)
-{
-	/*
-	 * Check if the requested address and size exceeds the allocated memory,
-	 * or addr + size wraps around.
-	 */
-	if ((addr + size) > guest_limit || (addr + size) < addr)
-		bad_driver(d, "%s:%i: Invalid address %#lx",
-			   __FILE__, line, addr);
-	/*
-	 * We return a pointer for the caller's convenience, now we know it's
-	 * safe to use.
-	 */
-	return from_guest_phys(addr);
-}
-/* A macro which transparently hands the line number to the real function. */
-#define check_pointer(d,addr,size) _check_pointer(d, addr, size, __LINE__)
-
-/*
- * Each buffer in the virtqueues is actually a chain of descriptors.  This
- * function returns the next descriptor in the chain, or vq->vring.num if we're
- * at the end.
- */
-static unsigned next_desc(struct device *d, struct vring_desc *desc,
-			  unsigned int i, unsigned int max)
-{
-	unsigned int next;
-
-	/* If this descriptor says it doesn't chain, we're done. */
-	if (!(desc[i].flags & VRING_DESC_F_NEXT))
-		return max;
-
-	/* Check they're not leading us off end of descriptors. */
-	next = desc[i].next;
-	/* Make sure compiler knows to grab that: we don't want it changing! */
-	wmb();
-
-	if (next >= max)
-		bad_driver(d, "Desc next is %u", next);
-
-	return next;
-}
-
-/*
- * This actually sends the interrupt for this virtqueue, if we've used a
- * buffer.
- */
-static void trigger_irq(struct virtqueue *vq)
-{
-	unsigned long buf[] = { LHREQ_IRQ, vq->dev->config.irq_line };
-
-	/* Don't inform them if nothing used. */
-	if (!vq->pending_used)
-		return;
-	vq->pending_used = 0;
-
-	/*
-	 * 2.4.7.1:
-	 *
-	 *  If the VIRTIO_F_EVENT_IDX feature bit is not negotiated:
-	 *    The driver MUST set flags to 0 or 1. 
-	 */
-	if (vq->vring.avail->flags > 1)
-		bad_driver_vq(vq, "avail->flags = %u\n", vq->vring.avail->flags);
-
-	/*
-	 * 2.4.7.2:
-	 *
-	 *  If the VIRTIO_F_EVENT_IDX feature bit is not negotiated:
-	 *
-	 *     - The device MUST ignore the used_event value.
-	 *     - After the device writes a descriptor index into the used ring:
-	 *         - If flags is 1, the device SHOULD NOT send an interrupt.
-	 *         - If flags is 0, the device MUST send an interrupt.
-	 */
-	if (vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT) {
-		return;
-	}
-
-	/*
-	 * 4.1.4.5.1:
-	 *
-	 *  If MSI-X capability is disabled, the device MUST set the Queue
-	 *  Interrupt bit in ISR status before sending a virtqueue notification
-	 *  to the driver.
-	 */
-	vq->dev->mmio->isr = 0x1;
-
-	/* Send the Guest an interrupt tell them we used something up. */
-	if (write(lguest_fd, buf, sizeof(buf)) != 0)
-		err(1, "Triggering irq %i", vq->dev->config.irq_line);
-}
-
-/*
- * This looks in the virtqueue for the first available buffer, and converts
- * it to an iovec for convenient access.  Since descriptors consist of some
- * number of output then some number of input descriptors, it's actually two
- * iovecs, but we pack them into one and note how many of each there were.
- *
- * This function waits if necessary, and returns the descriptor number found.
- */
-static unsigned wait_for_vq_desc(struct virtqueue *vq,
-				 struct iovec iov[],
-				 unsigned int *out_num, unsigned int *in_num)
-{
-	unsigned int i, head, max;
-	struct vring_desc *desc;
-	u16 last_avail = lg_last_avail(vq);
-
-	/*
-	 * 2.4.7.1:
-	 *
-	 *   The driver MUST handle spurious interrupts from the device.
-	 *
-	 * That's why this is a while loop.
-	 */
-
-	/* There's nothing available? */
-	while (last_avail == vq->vring.avail->idx) {
-		u64 event;
-
-		/*
-		 * Since we're about to sleep, now is a good time to tell the
-		 * Guest about what we've used up to now.
-		 */
-		trigger_irq(vq);
-
-		/* OK, now we need to know about added descriptors. */
-		vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY;
-
-		/*
-		 * They could have slipped one in as we were doing that: make
-		 * sure it's written, then check again.
-		 */
-		mb();
-		if (last_avail != vq->vring.avail->idx) {
-			vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY;
-			break;
-		}
-
-		/* Nothing new?  Wait for eventfd to tell us they refilled. */
-		if (read(vq->eventfd, &event, sizeof(event)) != sizeof(event))
-			errx(1, "Event read failed?");
-
-		/* We don't need to be notified again. */
-		vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY;
-	}
-
-	/* Check it isn't doing very strange things with descriptor numbers. */
-	if ((u16)(vq->vring.avail->idx - last_avail) > vq->vring.num)
-		bad_driver_vq(vq, "Guest moved used index from %u to %u",
-			      last_avail, vq->vring.avail->idx);
-
-	/* 
-	 * Make sure we read the descriptor number *after* we read the ring
-	 * update; don't let the cpu or compiler change the order.
-	 */
-	rmb();
-
-	/*
-	 * Grab the next descriptor number they're advertising, and increment
-	 * the index we've seen.
-	 */
-	head = vq->vring.avail->ring[last_avail % vq->vring.num];
-	lg_last_avail(vq)++;
-
-	/* If their number is silly, that's a fatal mistake. */
-	if (head >= vq->vring.num)
-		bad_driver_vq(vq, "Guest says index %u is available", head);
-
-	/* When we start there are none of either input nor output. */
-	*out_num = *in_num = 0;
-
-	max = vq->vring.num;
-	desc = vq->vring.desc;
-	i = head;
-
-	/*
-	 * We have to read the descriptor after we read the descriptor number,
-	 * but there's a data dependency there so the CPU shouldn't reorder
-	 * that: no rmb() required.
-	 */
-
-	do {
-		/*
-		 * If this is an indirect entry, then this buffer contains a
-		 * descriptor table which we handle as if it's any normal
-		 * descriptor chain.
-		 */
-		if (desc[i].flags & VRING_DESC_F_INDIRECT) {
-			/* 2.4.5.3.1:
-			 *
-			 *  The driver MUST NOT set the VIRTQ_DESC_F_INDIRECT
-			 *  flag unless the VIRTIO_F_INDIRECT_DESC feature was
-			 *  negotiated.
-			 */
-			if (!(vq->dev->features_accepted &
-			      (1<<VIRTIO_RING_F_INDIRECT_DESC)))
-				bad_driver_vq(vq, "vq indirect not negotiated");
-
-			/*
-			 * 2.4.5.3.1:
-			 *
-			 *   The driver MUST NOT set the VIRTQ_DESC_F_INDIRECT
-			 *   flag within an indirect descriptor (ie. only one
-			 *   table per descriptor).
-			 */
-			if (desc != vq->vring.desc)
-				bad_driver_vq(vq, "Indirect within indirect");
-
-			/*
-			 * Proposed update VIRTIO-134 spells this out:
-			 *
-			 *   A driver MUST NOT set both VIRTQ_DESC_F_INDIRECT
-			 *   and VIRTQ_DESC_F_NEXT in flags.
-			 */
-			if (desc[i].flags & VRING_DESC_F_NEXT)
-				bad_driver_vq(vq, "indirect and next together");
-
-			if (desc[i].len % sizeof(struct vring_desc))
-				bad_driver_vq(vq,
-					      "Invalid size for indirect table");
-			/*
-			 * 2.4.5.3.2:
-			 *
-			 *  The device MUST ignore the write-only flag
-			 *  (flags&VIRTQ_DESC_F_WRITE) in the descriptor that
-			 *  refers to an indirect table.
-			 *
-			 * We ignore it here: :)
-			 */
-
-			max = desc[i].len / sizeof(struct vring_desc);
-			desc = check_pointer(vq->dev, desc[i].addr, desc[i].len);
-			i = 0;
-
-			/* 2.4.5.3.1:
-			 *
-			 *  A driver MUST NOT create a descriptor chain longer
-			 *  than the Queue Size of the device.
-			 */
-			if (max > vq->pci_config.queue_size)
-				bad_driver_vq(vq,
-					      "indirect has too many entries");
-		}
-
-		/* Grab the first descriptor, and check it's OK. */
-		iov[*out_num + *in_num].iov_len = desc[i].len;
-		iov[*out_num + *in_num].iov_base
-			= check_pointer(vq->dev, desc[i].addr, desc[i].len);
-		/* If this is an input descriptor, increment that count. */
-		if (desc[i].flags & VRING_DESC_F_WRITE)
-			(*in_num)++;
-		else {
-			/*
-			 * If it's an output descriptor, they're all supposed
-			 * to come before any input descriptors.
-			 */
-			if (*in_num)
-				bad_driver_vq(vq,
-					      "Descriptor has out after in");
-			(*out_num)++;
-		}
-
-		/* If we've got too many, that implies a descriptor loop. */
-		if (*out_num + *in_num > max)
-			bad_driver_vq(vq, "Looped descriptor");
-	} while ((i = next_desc(vq->dev, desc, i, max)) != max);
-
-	return head;
-}
-
-/*
- * After we've used one of their buffers, we tell the Guest about it.  Sometime
- * later we'll want to send them an interrupt using trigger_irq(); note that
- * wait_for_vq_desc() does that for us if it has to wait.
- */
-static void add_used(struct virtqueue *vq, unsigned int head, int len)
-{
-	struct vring_used_elem *used;
-
-	/*
-	 * The virtqueue contains a ring of used buffers.  Get a pointer to the
-	 * next entry in that used ring.
-	 */
-	used = &vq->vring.used->ring[vq->vring.used->idx % vq->vring.num];
-	used->id = head;
-	used->len = len;
-	/* Make sure buffer is written before we update index. */
-	wmb();
-	vq->vring.used->idx++;
-	vq->pending_used++;
-}
-
-/* And here's the combo meal deal.  Supersize me! */
-static void add_used_and_trigger(struct virtqueue *vq, unsigned head, int len)
-{
-	add_used(vq, head, len);
-	trigger_irq(vq);
-}
-
-/*
- * The Console
- *
- * We associate some data with the console for our exit hack.
- */
-struct console_abort {
-	/* How many times have they hit ^C? */
-	int count;
-	/* When did they start? */
-	struct timeval start;
-};
-
-/* This is the routine which handles console input (ie. stdin). */
-static void console_input(struct virtqueue *vq)
-{
-	int len;
-	unsigned int head, in_num, out_num;
-	struct console_abort *abort = vq->dev->priv;
-	struct iovec iov[vq->vring.num];
-
-	/* Make sure there's a descriptor available. */
-	head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
-	if (out_num)
-		bad_driver_vq(vq, "Output buffers in console in queue?");
-
-	/* Read into it.  This is where we usually wait. */
-	len = readv(STDIN_FILENO, iov, in_num);
-	if (len <= 0) {
-		/* Ran out of input? */
-		warnx("Failed to get console input, ignoring console.");
-		/*
-		 * For simplicity, dying threads kill the whole Launcher.  So
-		 * just nap here.
-		 */
-		for (;;)
-			pause();
-	}
-
-	/* Tell the Guest we used a buffer. */
-	add_used_and_trigger(vq, head, len);
-
-	/*
-	 * Three ^C within one second?  Exit.
-	 *
-	 * This is such a hack, but works surprisingly well.  Each ^C has to
-	 * be in a buffer by itself, so they can't be too fast.  But we check
-	 * that we get three within about a second, so they can't be too
-	 * slow.
-	 */
-	if (len != 1 || ((char *)iov[0].iov_base)[0] != 3) {
-		abort->count = 0;
-		return;
-	}
-
-	abort->count++;
-	if (abort->count == 1)
-		gettimeofday(&abort->start, NULL);
-	else if (abort->count == 3) {
-		struct timeval now;
-		gettimeofday(&now, NULL);
-		/* Kill all Launcher processes with SIGINT, like normal ^C */
-		if (now.tv_sec <= abort->start.tv_sec+1)
-			kill(0, SIGINT);
-		abort->count = 0;
-	}
-}
-
-/* This is the routine which handles console output (ie. stdout). */
-static void console_output(struct virtqueue *vq)
-{
-	unsigned int head, out, in;
-	struct iovec iov[vq->vring.num];
-
-	/* We usually wait in here, for the Guest to give us something. */
-	head = wait_for_vq_desc(vq, iov, &out, &in);
-	if (in)
-		bad_driver_vq(vq, "Input buffers in console output queue?");
-
-	/* writev can return a partial write, so we loop here. */
-	while (!iov_empty(iov, out)) {
-		int len = writev(STDOUT_FILENO, iov, out);
-		if (len <= 0) {
-			warn("Write to stdout gave %i (%d)", len, errno);
-			break;
-		}
-		iov_consume(vq->dev, iov, out, NULL, len);
-	}
-
-	/*
-	 * We're finished with that buffer: if we're going to sleep,
-	 * wait_for_vq_desc() will prod the Guest with an interrupt.
-	 */
-	add_used(vq, head, 0);
-}
-
-/*
- * The Network
- *
- * Handling output for network is also simple: we get all the output buffers
- * and write them to /dev/net/tun.
- */
-struct net_info {
-	int tunfd;
-};
-
-static void net_output(struct virtqueue *vq)
-{
-	struct net_info *net_info = vq->dev->priv;
-	unsigned int head, out, in;
-	struct iovec iov[vq->vring.num];
-
-	/* We usually wait in here for the Guest to give us a packet. */
-	head = wait_for_vq_desc(vq, iov, &out, &in);
-	if (in)
-		bad_driver_vq(vq, "Input buffers in net output queue?");
-	/*
-	 * Send the whole thing through to /dev/net/tun.  It expects the exact
-	 * same format: what a coincidence!
-	 */
-	if (writev(net_info->tunfd, iov, out) < 0)
-		warnx("Write to tun failed (%d)?", errno);
-
-	/*
-	 * Done with that one; wait_for_vq_desc() will send the interrupt if
-	 * all packets are processed.
-	 */
-	add_used(vq, head, 0);
-}
-
-/*
- * Handling network input is a bit trickier, because I've tried to optimize it.
- *
- * First we have a helper routine which tells is if from this file descriptor
- * (ie. the /dev/net/tun device) will block:
- */
-static bool will_block(int fd)
-{
-	fd_set fdset;
-	struct timeval zero = { 0, 0 };
-	FD_ZERO(&fdset);
-	FD_SET(fd, &fdset);
-	return select(fd+1, &fdset, NULL, NULL, &zero) != 1;
-}
-
-/*
- * This handles packets coming in from the tun device to our Guest.  Like all
- * service routines, it gets called again as soon as it returns, so you don't
- * see a while(1) loop here.
- */
-static void net_input(struct virtqueue *vq)
-{
-	int len;
-	unsigned int head, out, in;
-	struct iovec iov[vq->vring.num];
-	struct net_info *net_info = vq->dev->priv;
-
-	/*
-	 * Get a descriptor to write an incoming packet into.  This will also
-	 * send an interrupt if they're out of descriptors.
-	 */
-	head = wait_for_vq_desc(vq, iov, &out, &in);
-	if (out)
-		bad_driver_vq(vq, "Output buffers in net input queue?");
-
-	/*
-	 * If it looks like we'll block reading from the tun device, send them
-	 * an interrupt.
-	 */
-	if (vq->pending_used && will_block(net_info->tunfd))
-		trigger_irq(vq);
-
-	/*
-	 * Read in the packet.  This is where we normally wait (when there's no
-	 * incoming network traffic).
-	 */
-	len = readv(net_info->tunfd, iov, in);
-	if (len <= 0)
-		warn("Failed to read from tun (%d).", errno);
-
-	/*
-	 * Mark that packet buffer as used, but don't interrupt here.  We want
-	 * to wait until we've done as much work as we can.
-	 */
-	add_used(vq, head, len);
-}
-/*:*/
-
-/* This is the helper to create threads: run the service routine in a loop. */
-static int do_thread(void *_vq)
-{
-	struct virtqueue *vq = _vq;
-
-	for (;;)
-		vq->service(vq);
-	return 0;
-}
-
-/*
- * When a child dies, we kill our entire process group with SIGTERM.  This
- * also has the side effect that the shell restores the console for us!
- */
-static void kill_launcher(int signal)
-{
-	kill(0, SIGTERM);
-}
-
-static void reset_vq_pci_config(struct virtqueue *vq)
-{
-	vq->pci_config.queue_size = VIRTQUEUE_NUM;
-	vq->pci_config.queue_enable = 0;
-}
-
-static void reset_device(struct device *dev)
-{
-	struct virtqueue *vq;
-
-	verbose("Resetting device %s\n", dev->name);
-
-	/* Clear any features they've acked. */
-	dev->features_accepted = 0;
-
-	/* We're going to be explicitly killing threads, so ignore them. */
-	signal(SIGCHLD, SIG_IGN);
-
-	/*
-	 * 4.1.4.3.1:
-	 *
-	 *   The device MUST present a 0 in queue_enable on reset. 
-	 *
-	 * This means we set it here, and reset the saved ones in every vq.
-	 */
-	dev->mmio->cfg.queue_enable = 0;
-
-	/* Get rid of the virtqueue threads */
-	for (vq = dev->vq; vq; vq = vq->next) {
-		vq->last_avail_idx = 0;
-		reset_vq_pci_config(vq);
-		if (vq->thread != (pid_t)-1) {
-			kill(vq->thread, SIGTERM);
-			waitpid(vq->thread, NULL, 0);
-			vq->thread = (pid_t)-1;
-		}
-	}
-	dev->running = false;
-	dev->wrote_features_ok = false;
-
-	/* Now we care if threads die. */
-	signal(SIGCHLD, (void *)kill_launcher);
-}
-
-static void cleanup_devices(void)
-{
-	unsigned int i;
-
-	for (i = 1; i < MAX_PCI_DEVICES; i++) {
-		struct device *d = devices.pci[i];
-		if (!d)
-			continue;
-		reset_device(d);
-	}
-
-	/* If we saved off the original terminal settings, restore them now. */
-	if (orig_term.c_lflag & (ISIG|ICANON|ECHO))
-		tcsetattr(STDIN_FILENO, TCSANOW, &orig_term);
-}
-
-/*L:217
- * We do PCI.  This is mainly done to let us test the kernel virtio PCI
- * code.
- */
-
-/* Linux expects a PCI host bridge: ours is a dummy, and first on the bus. */
-static struct device pci_host_bridge;
-
-static void init_pci_host_bridge(void)
-{
-	pci_host_bridge.name = "PCI Host Bridge";
-	pci_host_bridge.config.class = 0x06; /* bridge */
-	pci_host_bridge.config.subclass = 0; /* host bridge */
-	devices.pci[0] = &pci_host_bridge;
-}
-
-/* The IO ports used to read the PCI config space. */
-#define PCI_CONFIG_ADDR 0xCF8
-#define PCI_CONFIG_DATA 0xCFC
-
-/*
- * Not really portable, but does help readability: this is what the Guest
- * writes to the PCI_CONFIG_ADDR IO port.
- */
-union pci_config_addr {
-	struct {
-		unsigned mbz: 2;
-		unsigned offset: 6;
-		unsigned funcnum: 3;
-		unsigned devnum: 5;
-		unsigned busnum: 8;
-		unsigned reserved: 7;
-		unsigned enabled : 1;
-	} bits;
-	u32 val;
-};
-
-/*
- * We cache what they wrote to the address port, so we know what they're
- * talking about when they access the data port.
- */
-static union pci_config_addr pci_config_addr;
-
-static struct device *find_pci_device(unsigned int index)
-{
-	return devices.pci[index];
-}
-
-/* PCI can do 1, 2 and 4 byte reads; we handle that here. */
-static void ioread(u16 off, u32 v, u32 mask, u32 *val)
-{
-	assert(off < 4);
-	assert(mask == 0xFF || mask == 0xFFFF || mask == 0xFFFFFFFF);
-	*val = (v >> (off * 8)) & mask;
-}
-
-/* PCI can do 1, 2 and 4 byte writes; we handle that here. */
-static void iowrite(u16 off, u32 v, u32 mask, u32 *dst)
-{
-	assert(off < 4);
-	assert(mask == 0xFF || mask == 0xFFFF || mask == 0xFFFFFFFF);
-	*dst &= ~(mask << (off * 8));
-	*dst |= (v & mask) << (off * 8);
-}
-
-/*
- * Where PCI_CONFIG_DATA accesses depends on the previous write to
- * PCI_CONFIG_ADDR.
- */
-static struct device *dev_and_reg(u32 *reg)
-{
-	if (!pci_config_addr.bits.enabled)
-		return NULL;
-
-	if (pci_config_addr.bits.funcnum != 0)
-		return NULL;
-
-	if (pci_config_addr.bits.busnum != 0)
-		return NULL;
-
-	if (pci_config_addr.bits.offset * 4 >= sizeof(struct pci_config))
-		return NULL;
-
-	*reg = pci_config_addr.bits.offset;
-	return find_pci_device(pci_config_addr.bits.devnum);
-}
-
-/*
- * We can get invalid combinations of values while they're writing, so we
- * only fault if they try to write with some invalid bar/offset/length.
- */
-static bool valid_bar_access(struct device *d,
-			     struct virtio_pci_cfg_cap_u32 *cfg_access)
-{
-	/* We only have 1 bar (BAR0) */
-	if (cfg_access->cap.bar != 0)
-		return false;
-
-	/* Check it's within BAR0. */
-	if (cfg_access->cap.offset >= d->mmio_size
-	    || cfg_access->cap.offset + cfg_access->cap.length > d->mmio_size)
-		return false;
-
-	/* Check length is 1, 2 or 4. */
-	if (cfg_access->cap.length != 1
-	    && cfg_access->cap.length != 2
-	    && cfg_access->cap.length != 4)
-		return false;
-
-	/*
-	 * 4.1.4.7.2:
-	 *
-	 *  The driver MUST NOT write a cap.offset which is not a multiple of
-	 *  cap.length (ie. all accesses MUST be aligned).
-	 */
-	if (cfg_access->cap.offset % cfg_access->cap.length != 0)
-		return false;
-
-	/* Return pointer into word in BAR0. */
-	return true;
-}
-
-/* Is this accessing the PCI config address port?. */
-static bool is_pci_addr_port(u16 port)
-{
-	return port >= PCI_CONFIG_ADDR && port < PCI_CONFIG_ADDR + 4;
-}
-
-static bool pci_addr_iowrite(u16 port, u32 mask, u32 val)
-{
-	iowrite(port - PCI_CONFIG_ADDR, val, mask,
-		&pci_config_addr.val);
-	verbose("PCI%s: %#x/%x: bus %u dev %u func %u reg %u\n",
-		pci_config_addr.bits.enabled ? "" : " DISABLED",
-		val, mask,
-		pci_config_addr.bits.busnum,
-		pci_config_addr.bits.devnum,
-		pci_config_addr.bits.funcnum,
-		pci_config_addr.bits.offset);
-	return true;
-}
-
-static void pci_addr_ioread(u16 port, u32 mask, u32 *val)
-{
-	ioread(port - PCI_CONFIG_ADDR, pci_config_addr.val, mask, val);
-}
-
-/* Is this accessing the PCI config data port?. */
-static bool is_pci_data_port(u16 port)
-{
-	return port >= PCI_CONFIG_DATA && port < PCI_CONFIG_DATA + 4;
-}
-
-static void emulate_mmio_write(struct device *d, u32 off, u32 val, u32 mask);
-
-static bool pci_data_iowrite(u16 port, u32 mask, u32 val)
-{
-	u32 reg, portoff;
-	struct device *d = dev_and_reg(&reg);
-
-	/* Complain if they don't belong to a device. */
-	if (!d)
-		return false;
-
-	/* They can do 1 byte writes, etc. */
-	portoff = port - PCI_CONFIG_DATA;
-
-	/*
-	 * PCI uses a weird way to determine the BAR size: the OS
-	 * writes all 1's, and sees which ones stick.
-	 */
-	if (&d->config_words[reg] == &d->config.bar[0]) {
-		int i;
-
-		iowrite(portoff, val, mask, &d->config.bar[0]);
-		for (i = 0; (1 << i) < d->mmio_size; i++)
-			d->config.bar[0] &= ~(1 << i);
-		return true;
-	} else if ((&d->config_words[reg] > &d->config.bar[0]
-		    && &d->config_words[reg] <= &d->config.bar[6])
-		   || &d->config_words[reg] == &d->config.expansion_rom_addr) {
-		/* Allow writing to any other BAR, or expansion ROM */
-		iowrite(portoff, val, mask, &d->config_words[reg]);
-		return true;
-		/* We let them override latency timer and cacheline size */
-	} else if (&d->config_words[reg] == (void *)&d->config.cacheline_size) {
-		/* Only let them change the first two fields. */
-		if (mask == 0xFFFFFFFF)
-			mask = 0xFFFF;
-		iowrite(portoff, val, mask, &d->config_words[reg]);
-		return true;
-	} else if (&d->config_words[reg] == (void *)&d->config.command
-		   && mask == 0xFFFF) {
-		/* Ignore command writes. */
-		return true;
-	} else if (&d->config_words[reg]
-		   == (void *)&d->config.cfg_access.cap.bar
-		   || &d->config_words[reg]
-		   == &d->config.cfg_access.cap.length
-		   || &d->config_words[reg]
-		   == &d->config.cfg_access.cap.offset) {
-
-		/*
-		 * The VIRTIO_PCI_CAP_PCI_CFG capability
-		 * provides a backdoor to access the MMIO
-		 * regions without mapping them.  Weird, but
-		 * useful.
-		 */
-		iowrite(portoff, val, mask, &d->config_words[reg]);
-		return true;
-	} else if (&d->config_words[reg] == &d->config.cfg_access.pci_cfg_data) {
-		u32 write_mask;
-
-		/*
-		 * 4.1.4.7.1:
-		 *
-		 *  Upon detecting driver write access to pci_cfg_data, the
-		 *  device MUST execute a write access at offset cap.offset at
-		 *  BAR selected by cap.bar using the first cap.length bytes
-		 *  from pci_cfg_data.
-		 */
-
-		/* Must be bar 0 */
-		if (!valid_bar_access(d, &d->config.cfg_access))
-			return false;
-
-		iowrite(portoff, val, mask, &d->config.cfg_access.pci_cfg_data);
-
-		/*
-		 * Now emulate a write.  The mask we use is set by
-		 * len, *not* this write!
-		 */
-		write_mask = (1ULL<<(8*d->config.cfg_access.cap.length)) - 1;
-		verbose("Window writing %#x/%#x to bar %u, offset %u len %u\n",
-			d->config.cfg_access.pci_cfg_data, write_mask,
-			d->config.cfg_access.cap.bar,
-			d->config.cfg_access.cap.offset,
-			d->config.cfg_access.cap.length);
-
-		emulate_mmio_write(d, d->config.cfg_access.cap.offset,
-				   d->config.cfg_access.pci_cfg_data,
-				   write_mask);
-		return true;
-	}
-
-	/*
-	 * 4.1.4.1:
-	 *
-	 *  The driver MUST NOT write into any field of the capability
-	 *  structure, with the exception of those with cap_type
-	 *  VIRTIO_PCI_CAP_PCI_CFG...
-	 */
-	return false;
-}
-
-static u32 emulate_mmio_read(struct device *d, u32 off, u32 mask);
-
-static void pci_data_ioread(u16 port, u32 mask, u32 *val)
-{
-	u32 reg;
-	struct device *d = dev_and_reg(&reg);
-
-	if (!d)
-		return;
-
-	/* Read through the PCI MMIO access window is special */
-	if (&d->config_words[reg] == &d->config.cfg_access.pci_cfg_data) {
-		u32 read_mask;
-
-		/*
-		 * 4.1.4.7.1:
-		 *
-		 *  Upon detecting driver read access to pci_cfg_data, the
-		 *  device MUST execute a read access of length cap.length at
-		 *  offset cap.offset at BAR selected by cap.bar and store the
-		 *  first cap.length bytes in pci_cfg_data.
-		 */
-		/* Must be bar 0 */
-		if (!valid_bar_access(d, &d->config.cfg_access))
-			bad_driver(d,
-			     "Invalid cfg_access to bar%u, offset %u len %u",
-			     d->config.cfg_access.cap.bar,
-			     d->config.cfg_access.cap.offset,
-			     d->config.cfg_access.cap.length);
-
-		/*
-		 * Read into the window.  The mask we use is set by
-		 * len, *not* this read!
-		 */
-		read_mask = (1ULL<<(8*d->config.cfg_access.cap.length))-1;
-		d->config.cfg_access.pci_cfg_data
-			= emulate_mmio_read(d,
-					    d->config.cfg_access.cap.offset,
-					    read_mask);
-		verbose("Window read %#x/%#x from bar %u, offset %u len %u\n",
-			d->config.cfg_access.pci_cfg_data, read_mask,
-			d->config.cfg_access.cap.bar,
-			d->config.cfg_access.cap.offset,
-			d->config.cfg_access.cap.length);
-	}
-	ioread(port - PCI_CONFIG_DATA, d->config_words[reg], mask, val);
-}
-
-/*L:216
- * This is where we emulate a handful of Guest instructions.  It's ugly
- * and we used to do it in the kernel but it grew over time.
- */
-
-/*
- * We use the ptrace syscall's pt_regs struct to talk about registers
- * to lguest: these macros convert the names to the offsets.
- */
-#define getreg(name) getreg_off(offsetof(struct user_regs_struct, name))
-#define setreg(name, val) \
-	setreg_off(offsetof(struct user_regs_struct, name), (val))
-
-static u32 getreg_off(size_t offset)
-{
-	u32 r;
-	unsigned long args[] = { LHREQ_GETREG, offset };
-
-	if (pwrite(lguest_fd, args, sizeof(args), cpu_id) < 0)
-		err(1, "Getting register %u", offset);
-	if (pread(lguest_fd, &r, sizeof(r), cpu_id) != sizeof(r))
-		err(1, "Reading register %u", offset);
-
-	return r;
-}
-
-static void setreg_off(size_t offset, u32 val)
-{
-	unsigned long args[] = { LHREQ_SETREG, offset, val };
-
-	if (pwrite(lguest_fd, args, sizeof(args), cpu_id) < 0)
-		err(1, "Setting register %u", offset);
-}
-
-/* Get register by instruction encoding */
-static u32 getreg_num(unsigned regnum, u32 mask)
-{
-	/* 8 bit ops use regnums 4-7 for high parts of word */
-	if (mask == 0xFF && (regnum & 0x4))
-		return getreg_num(regnum & 0x3, 0xFFFF) >> 8;
-
-	switch (regnum) {
-	case 0: return getreg(eax) & mask;
-	case 1: return getreg(ecx) & mask;
-	case 2: return getreg(edx) & mask;
-	case 3: return getreg(ebx) & mask;
-	case 4: return getreg(esp) & mask;
-	case 5: return getreg(ebp) & mask;
-	case 6: return getreg(esi) & mask;
-	case 7: return getreg(edi) & mask;
-	}
-	abort();
-}
-
-/* Set register by instruction encoding */
-static void setreg_num(unsigned regnum, u32 val, u32 mask)
-{
-	/* Don't try to set bits out of range */
-	assert(~(val & ~mask));
-
-	/* 8 bit ops use regnums 4-7 for high parts of word */
-	if (mask == 0xFF && (regnum & 0x4)) {
-		/* Construct the 16 bits we want. */
-		val = (val << 8) | getreg_num(regnum & 0x3, 0xFF);
-		setreg_num(regnum & 0x3, val, 0xFFFF);
-		return;
-	}
-
-	switch (regnum) {
-	case 0: setreg(eax, val | (getreg(eax) & ~mask)); return;
-	case 1: setreg(ecx, val | (getreg(ecx) & ~mask)); return;
-	case 2: setreg(edx, val | (getreg(edx) & ~mask)); return;
-	case 3: setreg(ebx, val | (getreg(ebx) & ~mask)); return;
-	case 4: setreg(esp, val | (getreg(esp) & ~mask)); return;
-	case 5: setreg(ebp, val | (getreg(ebp) & ~mask)); return;
-	case 6: setreg(esi, val | (getreg(esi) & ~mask)); return;
-	case 7: setreg(edi, val | (getreg(edi) & ~mask)); return;
-	}
-	abort();
-}
-
-/* Get bytes of displacement appended to instruction, from r/m encoding */
-static u32 insn_displacement_len(u8 mod_reg_rm)
-{
-	/* Switch on the mod bits */
-	switch (mod_reg_rm >> 6) {
-	case 0:
-		/* If mod == 0, and r/m == 101, 16-bit displacement follows */
-		if ((mod_reg_rm & 0x7) == 0x5)
-			return 2;
-		/* Normally, mod == 0 means no literal displacement */
-		return 0;
-	case 1:
-		/* One byte displacement */
-		return 1;
-	case 2:
-		/* Four byte displacement */
-		return 4;
-	case 3:
-		/* Register mode */
-		return 0;
-	}
-	abort();
-}
-
-static void emulate_insn(const u8 insn[])
-{
-	unsigned long args[] = { LHREQ_TRAP, 13 };
-	unsigned int insnlen = 0, in = 0, small_operand = 0, byte_access;
-	unsigned int eax, port, mask;
-	/*
-	 * Default is to return all-ones on IO port reads, which traditionally
-	 * means "there's nothing there".
-	 */
-	u32 val = 0xFFFFFFFF;
-
-	/*
-	 * This must be the Guest kernel trying to do something, not userspace!
-	 * The bottom two bits of the CS segment register are the privilege
-	 * level.
-	 */
-	if ((getreg(xcs) & 3) != 0x1)
-		goto no_emulate;
-
-	/* Decoding x86 instructions is icky. */
-
-	/*
-	 * Around 2.6.33, the kernel started using an emulation for the
-	 * cmpxchg8b instruction in early boot on many configurations.  This
-	 * code isn't paravirtualized, and it tries to disable interrupts.
-	 * Ignore it, which will Mostly Work.
-	 */
-	if (insn[insnlen] == 0xfa) {
-		/* "cli", or Clear Interrupt Enable instruction.  Skip it. */
-		insnlen = 1;
-		goto skip_insn;
-	}
-
-	/*
-	 * 0x66 is an "operand prefix".  It means a 16, not 32 bit in/out.
-	 */
-	if (insn[insnlen] == 0x66) {
-		small_operand = 1;
-		/* The instruction is 1 byte so far, read the next byte. */
-		insnlen = 1;
-	}
-
-	/* If the lower bit isn't set, it's a single byte access */
-	byte_access = !(insn[insnlen] & 1);
-
-	/*
-	 * Now we can ignore the lower bit and decode the 4 opcodes
-	 * we need to emulate.
-	 */
-	switch (insn[insnlen] & 0xFE) {
-	case 0xE4: /* in     <next byte>,%al */
-		port = insn[insnlen+1];
-		insnlen += 2;
-		in = 1;
-		break;
-	case 0xEC: /* in     (%dx),%al */
-		port = getreg(edx) & 0xFFFF;
-		insnlen += 1;
-		in = 1;
-		break;
-	case 0xE6: /* out    %al,<next byte> */
-		port = insn[insnlen+1];
-		insnlen += 2;
-		break;
-	case 0xEE: /* out    %al,(%dx) */
-		port = getreg(edx) & 0xFFFF;
-		insnlen += 1;
-		break;
-	default:
-		/* OK, we don't know what this is, can't emulate. */
-		goto no_emulate;
-	}
-
-	/* Set a mask of the 1, 2 or 4 bytes, depending on size of IO */
-	if (byte_access)
-		mask = 0xFF;
-	else if (small_operand)
-		mask = 0xFFFF;
-	else
-		mask = 0xFFFFFFFF;
-
-	/*
-	 * If it was an "IN" instruction, they expect the result to be read
-	 * into %eax, so we change %eax.
-	 */
-	eax = getreg(eax);
-
-	if (in) {
-		/* This is the PS/2 keyboard status; 1 means ready for output */
-		if (port == 0x64)
-			val = 1;
-		else if (is_pci_addr_port(port))
-			pci_addr_ioread(port, mask, &val);
-		else if (is_pci_data_port(port))
-			pci_data_ioread(port, mask, &val);
-
-		/* Clear the bits we're about to read */
-		eax &= ~mask;
-		/* Copy bits in from val. */
-		eax |= val & mask;
-		/* Now update the register. */
-		setreg(eax, eax);
-	} else {
-		if (is_pci_addr_port(port)) {
-			if (!pci_addr_iowrite(port, mask, eax))
-				goto bad_io;
-		} else if (is_pci_data_port(port)) {
-			if (!pci_data_iowrite(port, mask, eax))
-				goto bad_io;
-		}
-		/* There are many other ports, eg. CMOS clock, serial
-		 * and parallel ports, so we ignore them all. */
-	}
-
-	verbose("IO %s of %x to %u: %#08x\n",
-		in ? "IN" : "OUT", mask, port, eax);
-skip_insn:
-	/* Finally, we've "done" the instruction, so move past it. */
-	setreg(eip, getreg(eip) + insnlen);
-	return;
-
-bad_io:
-	warnx("Attempt to %s port %u (%#x mask)",
-	      in ? "read from" : "write to", port, mask);
-
-no_emulate:
-	/* Inject trap into Guest. */
-	if (write(lguest_fd, args, sizeof(args)) < 0)
-		err(1, "Reinjecting trap 13 for fault at %#x", getreg(eip));
-}
-
-static struct device *find_mmio_region(unsigned long paddr, u32 *off)
-{
-	unsigned int i;
-
-	for (i = 1; i < MAX_PCI_DEVICES; i++) {
-		struct device *d = devices.pci[i];
-
-		if (!d)
-			continue;
-		if (paddr < d->mmio_addr)
-			continue;
-		if (paddr >= d->mmio_addr + d->mmio_size)
-			continue;
-		*off = paddr - d->mmio_addr;
-		return d;
-	}
-	return NULL;
-}
-
-/* FIXME: Use vq array. */
-static struct virtqueue *vq_by_num(struct device *d, u32 num)
-{
-	struct virtqueue *vq = d->vq;
-
-	while (num-- && vq)
-		vq = vq->next;
-
-	return vq;
-}
-
-static void save_vq_config(const struct virtio_pci_common_cfg *cfg,
-			   struct virtqueue *vq)
-{
-	vq->pci_config = *cfg;
-}
-
-static void restore_vq_config(struct virtio_pci_common_cfg *cfg,
-			      struct virtqueue *vq)
-{
-	/* Only restore the per-vq part */
-	size_t off = offsetof(struct virtio_pci_common_cfg, queue_size);
-
-	memcpy((void *)cfg + off, (void *)&vq->pci_config + off,
-	       sizeof(*cfg) - off);
-}
-
-/*
- * 4.1.4.3.2:
- *
- *  The driver MUST configure the other virtqueue fields before
- *  enabling the virtqueue with queue_enable.
- *
- * When they enable the virtqueue, we check that their setup is valid.
- */
-static void check_virtqueue(struct device *d, struct virtqueue *vq)
-{
-	/* Because lguest is 32 bit, all the descriptor high bits must be 0 */
-	if (vq->pci_config.queue_desc_hi
-	    || vq->pci_config.queue_avail_hi
-	    || vq->pci_config.queue_used_hi)
-		bad_driver_vq(vq, "invalid 64-bit queue address");
-
-	/*
-	 * 2.4.1:
-	 *
-	 *  The driver MUST ensure that the physical address of the first byte
-	 *  of each virtqueue part is a multiple of the specified alignment
-	 *  value in the above table.
-	 */
-	if (vq->pci_config.queue_desc_lo % 16
-	    || vq->pci_config.queue_avail_lo % 2
-	    || vq->pci_config.queue_used_lo % 4)
-		bad_driver_vq(vq, "invalid alignment in queue addresses");
-
-	/* Initialize the virtqueue and check they're all in range. */
-	vq->vring.num = vq->pci_config.queue_size;
-	vq->vring.desc = check_pointer(vq->dev,
-				       vq->pci_config.queue_desc_lo,
-				       sizeof(*vq->vring.desc) * vq->vring.num);
-	vq->vring.avail = check_pointer(vq->dev,
-					vq->pci_config.queue_avail_lo,
-					sizeof(*vq->vring.avail)
-					+ (sizeof(vq->vring.avail->ring[0])
-					   * vq->vring.num));
-	vq->vring.used = check_pointer(vq->dev,
-				       vq->pci_config.queue_used_lo,
-				       sizeof(*vq->vring.used)
-				       + (sizeof(vq->vring.used->ring[0])
-					  * vq->vring.num));
-
-	/*
-	 * 2.4.9.1:
-	 *
-	 *   The driver MUST initialize flags in the used ring to 0
-	 *   when allocating the used ring.
-	 */
-	if (vq->vring.used->flags != 0)
-		bad_driver_vq(vq, "invalid initial used.flags %#x",
-			      vq->vring.used->flags);
-}
-
-static void start_virtqueue(struct virtqueue *vq)
-{
-	/*
-	 * Create stack for thread.  Since the stack grows upwards, we point
-	 * the stack pointer to the end of this region.
-	 */
-	char *stack = malloc(32768);
-
-	/* Create a zero-initialized eventfd. */
-	vq->eventfd = eventfd(0, 0);
-	if (vq->eventfd < 0)
-		err(1, "Creating eventfd");
-
-	/*
-	 * CLONE_VM: because it has to access the Guest memory, and SIGCHLD so
-	 * we get a signal if it dies.
-	 */
-	vq->thread = clone(do_thread, stack + 32768, CLONE_VM | SIGCHLD, vq);
-	if (vq->thread == (pid_t)-1)
-		err(1, "Creating clone");
-}
-
-static void start_virtqueues(struct device *d)
-{
-	struct virtqueue *vq;
-
-	for (vq = d->vq; vq; vq = vq->next) {
-		if (vq->pci_config.queue_enable)
-			start_virtqueue(vq);
-	}
-}
-
-static void emulate_mmio_write(struct device *d, u32 off, u32 val, u32 mask)
-{
-	struct virtqueue *vq;
-
-	switch (off) {
-	case offsetof(struct virtio_pci_mmio, cfg.device_feature_select):
-		/*
-		 * 4.1.4.3.1:
-		 *
-		 * The device MUST present the feature bits it is offering in
-		 * device_feature, starting at bit device_feature_select ∗ 32
-		 * for any device_feature_select written by the driver
-		 */
-		if (val == 0)
-			d->mmio->cfg.device_feature = d->features;
-		else if (val == 1)
-			d->mmio->cfg.device_feature = (d->features >> 32);
-		else
-			d->mmio->cfg.device_feature = 0;
-		goto feature_write_through32;
-	case offsetof(struct virtio_pci_mmio, cfg.guest_feature_select):
-		if (val > 1)
-			bad_driver(d, "Unexpected driver select %u", val);
-		goto feature_write_through32;
-	case offsetof(struct virtio_pci_mmio, cfg.guest_feature):
-		if (d->mmio->cfg.guest_feature_select == 0) {
-			d->features_accepted &= ~((u64)0xFFFFFFFF);
-			d->features_accepted |= val;
-		} else {
-			assert(d->mmio->cfg.guest_feature_select == 1);
-			d->features_accepted &= 0xFFFFFFFF;
-			d->features_accepted |= ((u64)val) << 32;
-		}
-		/*
-		 * 2.2.1:
-		 *
-		 *   The driver MUST NOT accept a feature which the device did
-		 *   not offer
-		 */
-		if (d->features_accepted & ~d->features)
-			bad_driver(d, "over-accepted features %#llx of %#llx",
-				   d->features_accepted, d->features);
-		goto feature_write_through32;
-	case offsetof(struct virtio_pci_mmio, cfg.device_status): {
-		u8 prev;
-
-		verbose("%s: device status -> %#x\n", d->name, val);
-		/*
-		 * 4.1.4.3.1:
-		 * 
-		 *  The device MUST reset when 0 is written to device_status,
-		 *  and present a 0 in device_status once that is done.
-		 */
-		if (val == 0) {
-			reset_device(d);
-			goto write_through8;
-		}
-
-		/* 2.1.1: The driver MUST NOT clear a device status bit. */
-		if (d->mmio->cfg.device_status & ~val)
-			bad_driver(d, "unset of device status bit %#x -> %#x",
-				   d->mmio->cfg.device_status, val);
-
-		/*
-		 * 2.1.2:
-		 *
-		 *  The device MUST NOT consume buffers or notify the driver
-		 *  before DRIVER_OK.
-		 */
-		if (val & VIRTIO_CONFIG_S_DRIVER_OK
-		    && !(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER_OK))
-			start_virtqueues(d);
-
-		/*
-		 * 3.1.1:
-		 *
-		 *   The driver MUST follow this sequence to initialize a device:
-		 *   - Reset the device.
-		 *   - Set the ACKNOWLEDGE status bit: the guest OS has
-                 *     notice the device.
-		 *   - Set the DRIVER status bit: the guest OS knows how
-                 *     to drive the device.
-		 *   - Read device feature bits, and write the subset
-		 *     of feature bits understood by the OS and driver
-		 *     to the device. During this step the driver MAY
-		 *     read (but MUST NOT write) the device-specific
-		 *     configuration fields to check that it can
-		 *     support the device before accepting it.
-		 *   - Set the FEATURES_OK status bit.  The driver
-		 *     MUST not accept new feature bits after this
-		 *     step.
-		 *   - Re-read device status to ensure the FEATURES_OK
-		 *     bit is still set: otherwise, the device does
-		 *     not support our subset of features and the
-		 *     device is unusable.
-		 *   - Perform device-specific setup, including
-		 *     discovery of virtqueues for the device,
-		 *     optional per-bus setup, reading and possibly
-		 *     writing the device’s virtio configuration
-		 *     space, and population of virtqueues.
-		 *   - Set the DRIVER_OK status bit. At this point the
-                 *     device is “live”.
-		 */
-		prev = 0;
-		switch (val & ~d->mmio->cfg.device_status) {
-		case VIRTIO_CONFIG_S_DRIVER_OK:
-			prev |= VIRTIO_CONFIG_S_FEATURES_OK; /* fall thru */
-		case VIRTIO_CONFIG_S_FEATURES_OK:
-			prev |= VIRTIO_CONFIG_S_DRIVER; /* fall thru */
-		case VIRTIO_CONFIG_S_DRIVER:
-			prev |= VIRTIO_CONFIG_S_ACKNOWLEDGE; /* fall thru */
-		case VIRTIO_CONFIG_S_ACKNOWLEDGE:
-			break;
-		default:
-			bad_driver(d, "unknown device status bit %#x -> %#x",
-				   d->mmio->cfg.device_status, val);
-		}
-		if (d->mmio->cfg.device_status != prev)
-			bad_driver(d, "unexpected status transition %#x -> %#x",
-				   d->mmio->cfg.device_status, val);
-
-		/* If they just wrote FEATURES_OK, we make sure they read */
-		switch (val & ~d->mmio->cfg.device_status) {
-		case VIRTIO_CONFIG_S_FEATURES_OK:
-			d->wrote_features_ok = true;
-			break;
-		case VIRTIO_CONFIG_S_DRIVER_OK:
-			if (d->wrote_features_ok)
-				bad_driver(d, "did not re-read FEATURES_OK");
-			break;
-		}
-		goto write_through8;
-	}
-	case offsetof(struct virtio_pci_mmio, cfg.queue_select):
-		vq = vq_by_num(d, val);
-		/*
-		 * 4.1.4.3.1:
-		 *
-		 *  The device MUST present a 0 in queue_size if the virtqueue
-		 *  corresponding to the current queue_select is unavailable.
-		 */
-		if (!vq) {
-			d->mmio->cfg.queue_size = 0;
-			goto write_through16;
-		}
-		/* Save registers for old vq, if it was a valid vq */
-		if (d->mmio->cfg.queue_size)
-			save_vq_config(&d->mmio->cfg,
-				       vq_by_num(d, d->mmio->cfg.queue_select));
-		/* Restore the registers for the queue they asked for */
-		restore_vq_config(&d->mmio->cfg, vq);
-		goto write_through16;
-	case offsetof(struct virtio_pci_mmio, cfg.queue_size):
-		/*
-		 * 4.1.4.3.2:
-		 *
-		 *  The driver MUST NOT write a value which is not a power of 2
-		 *  to queue_size.
-		 */
-		if (val & (val-1))
-			bad_driver(d, "invalid queue size %u", val);
-		if (d->mmio->cfg.queue_enable)
-			bad_driver(d, "changing queue size on live device");
-		goto write_through16;
-	case offsetof(struct virtio_pci_mmio, cfg.queue_msix_vector):
-		bad_driver(d, "attempt to set MSIX vector to %u", val);
-	case offsetof(struct virtio_pci_mmio, cfg.queue_enable): {
-		struct virtqueue *vq = vq_by_num(d, d->mmio->cfg.queue_select);
-
-		/*
-		 * 4.1.4.3.2:
-		 *
-		 *  The driver MUST NOT write a 0 to queue_enable.
-		 */
-		if (val != 1)
-			bad_driver(d, "setting queue_enable to %u", val);
-
-		/*
-		 * 3.1.1:
-		 *
-		 *  7. Perform device-specific setup, including discovery of
-		 *     virtqueues for the device, optional per-bus setup,
-		 *     reading and possibly writing the device’s virtio
-		 *     configuration space, and population of virtqueues.
-		 *  8. Set the DRIVER_OK status bit.
-		 *
-		 * All our devices require all virtqueues to be enabled, so
-		 * they should have done that before setting DRIVER_OK.
-		 */
-		if (d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER_OK)
-			bad_driver(d, "enabling vq after DRIVER_OK");
-
-		d->mmio->cfg.queue_enable = val;
-		save_vq_config(&d->mmio->cfg, vq);
-		check_virtqueue(d, vq);
-		goto write_through16;
-	}
-	case offsetof(struct virtio_pci_mmio, cfg.queue_notify_off):
-		bad_driver(d, "attempt to write to queue_notify_off");
-	case offsetof(struct virtio_pci_mmio, cfg.queue_desc_lo):
-	case offsetof(struct virtio_pci_mmio, cfg.queue_desc_hi):
-	case offsetof(struct virtio_pci_mmio, cfg.queue_avail_lo):
-	case offsetof(struct virtio_pci_mmio, cfg.queue_avail_hi):
-	case offsetof(struct virtio_pci_mmio, cfg.queue_used_lo):
-	case offsetof(struct virtio_pci_mmio, cfg.queue_used_hi):
-		/*
-		 * 4.1.4.3.2:
-		 *
-		 *  The driver MUST configure the other virtqueue fields before
-		 *  enabling the virtqueue with queue_enable.
-		 */
-		if (d->mmio->cfg.queue_enable)
-			bad_driver(d, "changing queue on live device");
-
-		/*
-		 * 3.1.1:
-		 *
-		 *  The driver MUST follow this sequence to initialize a device:
-		 *...
-		 *  5. Set the FEATURES_OK status bit. The driver MUST not
-		 *  accept new feature bits after this step.
-		 */
-		if (!(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_FEATURES_OK))
-			bad_driver(d, "setting up vq before FEATURES_OK");
-
-		/*
-		 *  6. Re-read device status to ensure the FEATURES_OK bit is
-		 *     still set...
-		 */
-		if (d->wrote_features_ok)
-			bad_driver(d, "didn't re-read FEATURES_OK before setup");
-
-		goto write_through32;
-	case offsetof(struct virtio_pci_mmio, notify):
-		vq = vq_by_num(d, val);
-		if (!vq)
-			bad_driver(d, "Invalid vq notification on %u", val);
-		/* Notify the process handling this vq by adding 1 to eventfd */
-		write(vq->eventfd, "\1\0\0\0\0\0\0\0", 8);
-		goto write_through16;
-	case offsetof(struct virtio_pci_mmio, isr):
-		bad_driver(d, "Unexpected write to isr");
-	/* Weird corner case: write to emerg_wr of console */
-	case sizeof(struct virtio_pci_mmio)
-		+ offsetof(struct virtio_console_config, emerg_wr):
-		if (strcmp(d->name, "console") == 0) {
-			char c = val;
-			write(STDOUT_FILENO, &c, 1);
-			goto write_through32;
-		}
-		/* Fall through... */
-	default:
-		/*
-		 * 4.1.4.3.2:
-		 *
-		 *   The driver MUST NOT write to device_feature, num_queues,
-		 *   config_generation or queue_notify_off.
-		 */
-		bad_driver(d, "Unexpected write to offset %u", off);
-	}
-
-feature_write_through32:
-	/*
-	 * 3.1.1:
-	 *
-	 *   The driver MUST follow this sequence to initialize a device:
-	 *...
-	 *   - Set the DRIVER status bit: the guest OS knows how
-	 *     to drive the device.
-	 *   - Read device feature bits, and write the subset
-	 *     of feature bits understood by the OS and driver
-	 *     to the device.
-	 *...
-	 *   - Set the FEATURES_OK status bit. The driver MUST not
-	 *     accept new feature bits after this step.
-	 */
-	if (!(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER))
-		bad_driver(d, "feature write before VIRTIO_CONFIG_S_DRIVER");
-	if (d->mmio->cfg.device_status & VIRTIO_CONFIG_S_FEATURES_OK)
-		bad_driver(d, "feature write after VIRTIO_CONFIG_S_FEATURES_OK");
-
-	/*
-	 * 4.1.3.1:
-	 *
-	 *  The driver MUST access each field using the “natural” access
-	 *  method, i.e. 32-bit accesses for 32-bit fields, 16-bit accesses for
-	 *  16-bit fields and 8-bit accesses for 8-bit fields.
-	 */
-write_through32:
-	if (mask != 0xFFFFFFFF) {
-		bad_driver(d, "non-32-bit write to offset %u (%#x)",
-			   off, getreg(eip));
-		return;
-	}
-	memcpy((char *)d->mmio + off, &val, 4);
-	return;
-
-write_through16:
-	if (mask != 0xFFFF)
-		bad_driver(d, "non-16-bit write to offset %u (%#x)",
-			   off, getreg(eip));
-	memcpy((char *)d->mmio + off, &val, 2);
-	return;
-
-write_through8:
-	if (mask != 0xFF)
-		bad_driver(d, "non-8-bit write to offset %u (%#x)",
-			   off, getreg(eip));
-	memcpy((char *)d->mmio + off, &val, 1);
-	return;
-}
-
-static u32 emulate_mmio_read(struct device *d, u32 off, u32 mask)
-{
-	u8 isr;
-	u32 val = 0;
-
-	switch (off) {
-	case offsetof(struct virtio_pci_mmio, cfg.device_feature_select):
-	case offsetof(struct virtio_pci_mmio, cfg.device_feature):
-	case offsetof(struct virtio_pci_mmio, cfg.guest_feature_select):
-	case offsetof(struct virtio_pci_mmio, cfg.guest_feature):
-		/*
-		 * 3.1.1:
-		 *
-		 *   The driver MUST follow this sequence to initialize a device:
-		 *...
-		 *   - Set the DRIVER status bit: the guest OS knows how
-		 *     to drive the device.
-		 *   - Read device feature bits, and write the subset
-		 *     of feature bits understood by the OS and driver
-		 *     to the device.
-		 */
-		if (!(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER))
-			bad_driver(d,
-				   "feature read before VIRTIO_CONFIG_S_DRIVER");
-		goto read_through32;
-	case offsetof(struct virtio_pci_mmio, cfg.msix_config):
-		bad_driver(d, "read of msix_config");
-	case offsetof(struct virtio_pci_mmio, cfg.num_queues):
-		goto read_through16;
-	case offsetof(struct virtio_pci_mmio, cfg.device_status):
-		/* As they did read, any write of FEATURES_OK is now fine. */
-		d->wrote_features_ok = false;
-		goto read_through8;
-	case offsetof(struct virtio_pci_mmio, cfg.config_generation):
-		/*
-		 * 4.1.4.3.1:
-		 *
-		 *  The device MUST present a changed config_generation after
-		 *  the driver has read a device-specific configuration value
-		 *  which has changed since any part of the device-specific
-		 *  configuration was last read.
-		 *
-		 * This is simple: none of our devices change config, so this
-		 * is always 0.
-		 */
-		goto read_through8;
-	case offsetof(struct virtio_pci_mmio, notify):
-		/*
-		 * 3.1.1:
-		 *
-		 *   The driver MUST NOT notify the device before setting
-		 *   DRIVER_OK.
-		 */
-		if (!(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER_OK))
-			bad_driver(d, "notify before VIRTIO_CONFIG_S_DRIVER_OK");
-		goto read_through16;
-	case offsetof(struct virtio_pci_mmio, isr):
-		if (mask != 0xFF)
-			bad_driver(d, "non-8-bit read from offset %u (%#x)",
-				   off, getreg(eip));
-		isr = d->mmio->isr;
-		/*
-		 * 4.1.4.5.1:
-		 *
-		 *  The device MUST reset ISR status to 0 on driver read. 
-		 */
-		d->mmio->isr = 0;
-		return isr;
-	case offsetof(struct virtio_pci_mmio, padding):
-		bad_driver(d, "read from padding (%#x)", getreg(eip));
-	default:
-		/* Read from device config space, beware unaligned overflow */
-		if (off > d->mmio_size - 4)
-			bad_driver(d, "read past end (%#x)", getreg(eip));
-
-		/*
-		 * 3.1.1:
-		 *  The driver MUST follow this sequence to initialize a device:
-		 *...
-		 *  3. Set the DRIVER status bit: the guest OS knows how to
-		 *  drive the device.
-		 *  4. Read device feature bits, and write the subset of
-		 *  feature bits understood by the OS and driver to the
-		 *  device. During this step the driver MAY read (but MUST NOT
-		 *  write) the device-specific configuration fields to check
-		 *  that it can support the device before accepting it.
-		 */
-		if (!(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER))
-			bad_driver(d,
-				   "config read before VIRTIO_CONFIG_S_DRIVER");
-
-		if (mask == 0xFFFFFFFF)
-			goto read_through32;
-		else if (mask == 0xFFFF)
-			goto read_through16;
-		else
-			goto read_through8;
-	}
-
-	/*
-	 * 4.1.3.1:
-	 *
-	 *  The driver MUST access each field using the “natural” access
-	 *  method, i.e. 32-bit accesses for 32-bit fields, 16-bit accesses for
-	 *  16-bit fields and 8-bit accesses for 8-bit fields.
-	 */
-read_through32:
-	if (mask != 0xFFFFFFFF)
-		bad_driver(d, "non-32-bit read to offset %u (%#x)",
-			   off, getreg(eip));
-	memcpy(&val, (char *)d->mmio + off, 4);
-	return val;
-
-read_through16:
-	if (mask != 0xFFFF)
-		bad_driver(d, "non-16-bit read to offset %u (%#x)",
-			   off, getreg(eip));
-	memcpy(&val, (char *)d->mmio + off, 2);
-	return val;
-
-read_through8:
-	if (mask != 0xFF)
-		bad_driver(d, "non-8-bit read to offset %u (%#x)",
-			   off, getreg(eip));
-	memcpy(&val, (char *)d->mmio + off, 1);
-	return val;
-}
-
-static void emulate_mmio(unsigned long paddr, const u8 *insn)
-{
-	u32 val, off, mask = 0xFFFFFFFF, insnlen = 0;
-	struct device *d = find_mmio_region(paddr, &off);
-	unsigned long args[] = { LHREQ_TRAP, 14 };
-
-	if (!d) {
-		warnx("MMIO touching %#08lx (not a device)", paddr);
-		goto reinject;
-	}
-
-	/* Prefix makes it a 16 bit op */
-	if (insn[0] == 0x66) {
-		mask = 0xFFFF;
-		insnlen++;
-	}
-
-	/* iowrite */
-	if (insn[insnlen] == 0x89) {
-		/* Next byte is r/m byte: bits 3-5 are register. */
-		val = getreg_num((insn[insnlen+1] >> 3) & 0x7, mask);
-		emulate_mmio_write(d, off, val, mask);
-		insnlen += 2 + insn_displacement_len(insn[insnlen+1]);
-	} else if (insn[insnlen] == 0x8b) { /* ioread */
-		/* Next byte is r/m byte: bits 3-5 are register. */
-		val = emulate_mmio_read(d, off, mask);
-		setreg_num((insn[insnlen+1] >> 3) & 0x7, val, mask);
-		insnlen += 2 + insn_displacement_len(insn[insnlen+1]);
-	} else if (insn[0] == 0x88) { /* 8-bit iowrite */
-		mask = 0xff;
-		/* Next byte is r/m byte: bits 3-5 are register. */
-		val = getreg_num((insn[1] >> 3) & 0x7, mask);
-		emulate_mmio_write(d, off, val, mask);
-		insnlen = 2 + insn_displacement_len(insn[1]);
-	} else if (insn[0] == 0x8a) { /* 8-bit ioread */
-		mask = 0xff;
-		val = emulate_mmio_read(d, off, mask);
-		setreg_num((insn[1] >> 3) & 0x7, val, mask);
-		insnlen = 2 + insn_displacement_len(insn[1]);
-	} else {
-		warnx("Unknown MMIO instruction touching %#08lx:"
-		     " %02x %02x %02x %02x at %u",
-		     paddr, insn[0], insn[1], insn[2], insn[3], getreg(eip));
-	reinject:
-		/* Inject trap into Guest. */
-		if (write(lguest_fd, args, sizeof(args)) < 0)
-			err(1, "Reinjecting trap 14 for fault at %#x",
-			    getreg(eip));
-		return;
-	}
-
-	/* Finally, we've "done" the instruction, so move past it. */
-	setreg(eip, getreg(eip) + insnlen);
-}
-
-/*L:190
- * Device Setup
- *
- * All devices need a descriptor so the Guest knows it exists, and a "struct
- * device" so the Launcher can keep track of it.  We have common helper
- * routines to allocate and manage them.
- */
-static void add_pci_virtqueue(struct device *dev,
-			      void (*service)(struct virtqueue *),
-			      const char *name)
-{
-	struct virtqueue **i, *vq = malloc(sizeof(*vq));
-
-	/* Initialize the virtqueue */
-	vq->next = NULL;
-	vq->last_avail_idx = 0;
-	vq->dev = dev;
-	vq->name = name;
-
-	/*
-	 * This is the routine the service thread will run, and its Process ID
-	 * once it's running.
-	 */
-	vq->service = service;
-	vq->thread = (pid_t)-1;
-
-	/* Initialize the configuration. */
-	reset_vq_pci_config(vq);
-	vq->pci_config.queue_notify_off = 0;
-
-	/* Add one to the number of queues */
-	vq->dev->mmio->cfg.num_queues++;
-
-	/*
-	 * Add to tail of list, so dev->vq is first vq, dev->vq->next is
-	 * second.
-	 */
-	for (i = &dev->vq; *i; i = &(*i)->next);
-	*i = vq;
-}
-
-/* The Guest accesses the feature bits via the PCI common config MMIO region */
-static void add_pci_feature(struct device *dev, unsigned bit)
-{
-	dev->features |= (1ULL << bit);
-}
-
-/* For devices with no config. */
-static void no_device_config(struct device *dev)
-{
-	dev->mmio_addr = get_mmio_region(dev->mmio_size);
-
-	dev->config.bar[0] = dev->mmio_addr;
-	/* Bottom 4 bits must be zero */
-	assert(~(dev->config.bar[0] & 0xF));
-}
-
-/* This puts the device config into BAR0 */
-static void set_device_config(struct device *dev, const void *conf, size_t len)
-{
-	/* Set up BAR 0 */
-	dev->mmio_size += len;
-	dev->mmio = realloc(dev->mmio, dev->mmio_size);
-	memcpy(dev->mmio + 1, conf, len);
-
-	/*
-	 * 4.1.4.6:
-	 *
-	 *  The device MUST present at least one VIRTIO_PCI_CAP_DEVICE_CFG
-	 *  capability for any device type which has a device-specific
-	 *  configuration.
-	 */
-	/* Hook up device cfg */
-	dev->config.cfg_access.cap.cap_next
-		= offsetof(struct pci_config, device);
-
-	/*
-	 * 4.1.4.6.1:
-	 *
-	 *  The offset for the device-specific configuration MUST be 4-byte
-	 *  aligned.
-	 */
-	assert(dev->config.cfg_access.cap.cap_next % 4 == 0);
-
-	/* Fix up device cfg field length. */
-	dev->config.device.length = len;
-
-	/* The rest is the same as the no-config case */
-	no_device_config(dev);
-}
-
-static void init_cap(struct virtio_pci_cap *cap, size_t caplen, int type,
-		     size_t bar_offset, size_t bar_bytes, u8 next)
-{
-	cap->cap_vndr = PCI_CAP_ID_VNDR;
-	cap->cap_next = next;
-	cap->cap_len = caplen;
-	cap->cfg_type = type;
-	cap->bar = 0;
-	memset(cap->padding, 0, sizeof(cap->padding));
-	cap->offset = bar_offset;
-	cap->length = bar_bytes;
-}
-
-/*
- * This sets up the pci_config structure, as defined in the virtio 1.0
- * standard (and PCI standard).
- */
-static void init_pci_config(struct pci_config *pci, u16 type,
-			    u8 class, u8 subclass)
-{
-	size_t bar_offset, bar_len;
-
-	/*
-	 * 4.1.4.4.1:
-	 *
-	 *  The device MUST either present notify_off_multiplier as an even
-	 *  power of 2, or present notify_off_multiplier as 0.
-	 *
-	 * 2.1.2:
-	 *
-	 *   The device MUST initialize device status to 0 upon reset. 
-	 */
-	memset(pci, 0, sizeof(*pci));
-
-	/* 4.1.2.1: Devices MUST have the PCI Vendor ID 0x1AF4 */
-	pci->vendor_id = 0x1AF4;
-	/* 4.1.2.1: ... PCI Device ID calculated by adding 0x1040 ... */
-	pci->device_id = 0x1040 + type;
-
-	/*
-	 * PCI have specific codes for different types of devices.
-	 * Linux doesn't care, but it's a good clue for people looking
-	 * at the device.
-	 */
-	pci->class = class;
-	pci->subclass = subclass;
-
-	/*
-	 * 4.1.2.1:
-	 *
-	 *  Non-transitional devices SHOULD have a PCI Revision ID of 1 or
-	 *  higher
-	 */
-	pci->revid = 1;
-
-	/*
-	 * 4.1.2.1:
-	 *
-	 *  Non-transitional devices SHOULD have a PCI Subsystem Device ID of
-	 *  0x40 or higher.
-	 */
-	pci->subsystem_device_id = 0x40;
-
-	/* We use our dummy interrupt controller, and irq_line is the irq */
-	pci->irq_line = devices.next_irq++;
-	pci->irq_pin = 0;
-
-	/* Support for extended capabilities. */
-	pci->status = (1 << 4);
-
-	/* Link them in. */
-	/*
-	 * 4.1.4.3.1:
-	 *
-	 *  The device MUST present at least one common configuration
-	 *  capability.
-	 */
-	pci->capabilities = offsetof(struct pci_config, common);
-
-	/* 4.1.4.3.1 ... offset MUST be 4-byte aligned. */
-	assert(pci->capabilities % 4 == 0);
-
-	bar_offset = offsetof(struct virtio_pci_mmio, cfg);
-	bar_len = sizeof(((struct virtio_pci_mmio *)0)->cfg);
-	init_cap(&pci->common, sizeof(pci->common), VIRTIO_PCI_CAP_COMMON_CFG,
-		 bar_offset, bar_len,
-		 offsetof(struct pci_config, notify));
-
-	/*
-	 * 4.1.4.4.1:
-	 *
-	 *  The device MUST present at least one notification capability.
-	 */
-	bar_offset += bar_len;
-	bar_len = sizeof(((struct virtio_pci_mmio *)0)->notify);
-
-	/*
-	 * 4.1.4.4.1:
-	 *
-	 *  The cap.offset MUST be 2-byte aligned.
-	 */
-	assert(pci->common.cap_next % 2 == 0);
-
-	/* FIXME: Use a non-zero notify_off, for per-queue notification? */
-	/*
-	 * 4.1.4.4.1:
-	 *
-	 *  The value cap.length presented by the device MUST be at least 2 and
-	 *  MUST be large enough to support queue notification offsets for all
-	 *  supported queues in all possible configurations.
-	 */
-	assert(bar_len >= 2);
-
-	init_cap(&pci->notify.cap, sizeof(pci->notify),
-		 VIRTIO_PCI_CAP_NOTIFY_CFG,
-		 bar_offset, bar_len,
-		 offsetof(struct pci_config, isr));
-
-	bar_offset += bar_len;
-	bar_len = sizeof(((struct virtio_pci_mmio *)0)->isr);
-	/*
-	 * 4.1.4.5.1:
-	 *
-	 *  The device MUST present at least one VIRTIO_PCI_CAP_ISR_CFG
-	 *  capability.
-	 */
-	init_cap(&pci->isr, sizeof(pci->isr),
-		 VIRTIO_PCI_CAP_ISR_CFG,
-		 bar_offset, bar_len,
-		 offsetof(struct pci_config, cfg_access));
-
-	/*
-	 * 4.1.4.7.1:
-	 *
-	 * The device MUST present at least one VIRTIO_PCI_CAP_PCI_CFG
-	 * capability.
-	 */
-	/* This doesn't have any presence in the BAR */
-	init_cap(&pci->cfg_access.cap, sizeof(pci->cfg_access),
-		 VIRTIO_PCI_CAP_PCI_CFG,
-		 0, 0, 0);
-
-	bar_offset += bar_len + sizeof(((struct virtio_pci_mmio *)0)->padding);
-	assert(bar_offset == sizeof(struct virtio_pci_mmio));
-
-	/*
-	 * This gets sewn in and length set in set_device_config().
-	 * Some devices don't have a device configuration interface, so
-	 * we never expose this if we don't call set_device_config().
-	 */
-	init_cap(&pci->device, sizeof(pci->device), VIRTIO_PCI_CAP_DEVICE_CFG,
-		 bar_offset, 0, 0);
-}
-
-/*
- * This routine does all the creation and setup of a new device, but we don't
- * actually place the MMIO region until we know the size (if any) of the
- * device-specific config.  And we don't actually start the service threads
- * until later.
- *
- * See what I mean about userspace being boring?
- */
-static struct device *new_pci_device(const char *name, u16 type,
-				     u8 class, u8 subclass)
-{
-	struct device *dev = malloc(sizeof(*dev));
-
-	/* Now we populate the fields one at a time. */
-	dev->name = name;
-	dev->vq = NULL;
-	dev->running = false;
-	dev->wrote_features_ok = false;
-	dev->mmio_size = sizeof(struct virtio_pci_mmio);
-	dev->mmio = calloc(1, dev->mmio_size);
-	dev->features = (u64)1 << VIRTIO_F_VERSION_1;
-	dev->features_accepted = 0;
-
-	if (devices.device_num + 1 >= MAX_PCI_DEVICES)
-		errx(1, "Can only handle 31 PCI devices");
-
-	init_pci_config(&dev->config, type, class, subclass);
-	assert(!devices.pci[devices.device_num+1]);
-	devices.pci[++devices.device_num] = dev;
-
-	return dev;
-}
-
-/*
- * Our first setup routine is the console.  It's a fairly simple device, but
- * UNIX tty handling makes it uglier than it could be.
- */
-static void setup_console(void)
-{
-	struct device *dev;
-	struct virtio_console_config conf;
-
-	/* If we can save the initial standard input settings... */
-	if (tcgetattr(STDIN_FILENO, &orig_term) == 0) {
-		struct termios term = orig_term;
-		/*
-		 * Then we turn off echo, line buffering and ^C etc: We want a
-		 * raw input stream to the Guest.
-		 */
-		term.c_lflag &= ~(ISIG|ICANON|ECHO);
-		tcsetattr(STDIN_FILENO, TCSANOW, &term);
-	}
-
-	dev = new_pci_device("console", VIRTIO_ID_CONSOLE, 0x07, 0x00);
-
-	/* We store the console state in dev->priv, and initialize it. */
-	dev->priv = malloc(sizeof(struct console_abort));
-	((struct console_abort *)dev->priv)->count = 0;
-
-	/*
-	 * The console needs two virtqueues: the input then the output.  When
-	 * they put something the input queue, we make sure we're listening to
-	 * stdin.  When they put something in the output queue, we write it to
-	 * stdout.
-	 */
-	add_pci_virtqueue(dev, console_input, "input");
-	add_pci_virtqueue(dev, console_output, "output");
-
-	/* We need a configuration area for the emerg_wr early writes. */
-	add_pci_feature(dev, VIRTIO_CONSOLE_F_EMERG_WRITE);
-	set_device_config(dev, &conf, sizeof(conf));
-
-	verbose("device %u: console\n", devices.device_num);
-}
-/*:*/
-
-/*M:010
- * Inter-guest networking is an interesting area.  Simplest is to have a
- * --sharenet=<name> option which opens or creates a named pipe.  This can be
- * used to send packets to another guest in a 1:1 manner.
- *
- * More sophisticated is to use one of the tools developed for project like UML
- * to do networking.
- *
- * Faster is to do virtio bonding in kernel.  Doing this 1:1 would be
- * completely generic ("here's my vring, attach to your vring") and would work
- * for any traffic.  Of course, namespace and permissions issues need to be
- * dealt with.  A more sophisticated "multi-channel" virtio_net.c could hide
- * multiple inter-guest channels behind one interface, although it would
- * require some manner of hotplugging new virtio channels.
- *
- * Finally, we could use a virtio network switch in the kernel, ie. vhost.
-:*/
-
-static u32 str2ip(const char *ipaddr)
-{
-	unsigned int b[4];
-
-	if (sscanf(ipaddr, "%u.%u.%u.%u", &b[0], &b[1], &b[2], &b[3]) != 4)
-		errx(1, "Failed to parse IP address '%s'", ipaddr);
-	return (b[0] << 24) | (b[1] << 16) | (b[2] << 8) | b[3];
-}
-
-static void str2mac(const char *macaddr, unsigned char mac[6])
-{
-	unsigned int m[6];
-	if (sscanf(macaddr, "%02x:%02x:%02x:%02x:%02x:%02x",
-		   &m[0], &m[1], &m[2], &m[3], &m[4], &m[5]) != 6)
-		errx(1, "Failed to parse mac address '%s'", macaddr);
-	mac[0] = m[0];
-	mac[1] = m[1];
-	mac[2] = m[2];
-	mac[3] = m[3];
-	mac[4] = m[4];
-	mac[5] = m[5];
-}
-
-/*
- * This code is "adapted" from libbridge: it attaches the Host end of the
- * network device to the bridge device specified by the command line.
- *
- * This is yet another James Morris contribution (I'm an IP-level guy, so I
- * dislike bridging), and I just try not to break it.
- */
-static void add_to_bridge(int fd, const char *if_name, const char *br_name)
-{
-	int ifidx;
-	struct ifreq ifr;
-
-	if (!*br_name)
-		errx(1, "must specify bridge name");
-
-	ifidx = if_nametoindex(if_name);
-	if (!ifidx)
-		errx(1, "interface %s does not exist!", if_name);
-
-	strncpy(ifr.ifr_name, br_name, IFNAMSIZ);
-	ifr.ifr_name[IFNAMSIZ-1] = '\0';
-	ifr.ifr_ifindex = ifidx;
-	if (ioctl(fd, SIOCBRADDIF, &ifr) < 0)
-		err(1, "can't add %s to bridge %s", if_name, br_name);
-}
-
-/*
- * This sets up the Host end of the network device with an IP address, brings
- * it up so packets will flow, the copies the MAC address into the hwaddr
- * pointer.
- */
-static void configure_device(int fd, const char *tapif, u32 ipaddr)
-{
-	struct ifreq ifr;
-	struct sockaddr_in sin;
-
-	memset(&ifr, 0, sizeof(ifr));
-	strcpy(ifr.ifr_name, tapif);
-
-	/* Don't read these incantations.  Just cut & paste them like I did! */
-	sin.sin_family = AF_INET;
-	sin.sin_addr.s_addr = htonl(ipaddr);
-	memcpy(&ifr.ifr_addr, &sin, sizeof(sin));
-	if (ioctl(fd, SIOCSIFADDR, &ifr) != 0)
-		err(1, "Setting %s interface address", tapif);
-	ifr.ifr_flags = IFF_UP;
-	if (ioctl(fd, SIOCSIFFLAGS, &ifr) != 0)
-		err(1, "Bringing interface %s up", tapif);
-}
-
-static int get_tun_device(char tapif[IFNAMSIZ])
-{
-	struct ifreq ifr;
-	int vnet_hdr_sz;
-	int netfd;
-
-	/* Start with this zeroed.  Messy but sure. */
-	memset(&ifr, 0, sizeof(ifr));
-
-	/*
-	 * We open the /dev/net/tun device and tell it we want a tap device.  A
-	 * tap device is like a tun device, only somehow different.  To tell
-	 * the truth, I completely blundered my way through this code, but it
-	 * works now!
-	 */
-	netfd = open_or_die("/dev/net/tun", O_RDWR);
-	ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR;
-	strcpy(ifr.ifr_name, "tap%d");
-	if (ioctl(netfd, TUNSETIFF, &ifr) != 0)
-		err(1, "configuring /dev/net/tun");
-
-	if (ioctl(netfd, TUNSETOFFLOAD,
-		  TUN_F_CSUM|TUN_F_TSO4|TUN_F_TSO6|TUN_F_TSO_ECN) != 0)
-		err(1, "Could not set features for tun device");
-
-	/*
-	 * We don't need checksums calculated for packets coming in this
-	 * device: trust us!
-	 */
-	ioctl(netfd, TUNSETNOCSUM, 1);
-
-	/*
-	 * In virtio before 1.0 (aka legacy virtio), we added a 16-bit
-	 * field at the end of the network header iff
-	 * VIRTIO_NET_F_MRG_RXBUF was negotiated.  For virtio 1.0,
-	 * that became the norm, but we need to tell the tun device
-	 * about our expanded header (which is called
-	 * virtio_net_hdr_mrg_rxbuf in the legacy system).
-	 */
-	vnet_hdr_sz = sizeof(struct virtio_net_hdr_v1);
-	if (ioctl(netfd, TUNSETVNETHDRSZ, &vnet_hdr_sz) != 0)
-		err(1, "Setting tun header size to %u", vnet_hdr_sz);
-
-	memcpy(tapif, ifr.ifr_name, IFNAMSIZ);
-	return netfd;
-}
-
-/*L:195
- * Our network is a Host<->Guest network.  This can either use bridging or
- * routing, but the principle is the same: it uses the "tun" device to inject
- * packets into the Host as if they came in from a normal network card.  We
- * just shunt packets between the Guest and the tun device.
- */
-static void setup_tun_net(char *arg)
-{
-	struct device *dev;
-	struct net_info *net_info = malloc(sizeof(*net_info));
-	int ipfd;
-	u32 ip = INADDR_ANY;
-	bool bridging = false;
-	char tapif[IFNAMSIZ], *p;
-	struct virtio_net_config conf;
-
-	net_info->tunfd = get_tun_device(tapif);
-
-	/* First we create a new network device. */
-	dev = new_pci_device("net", VIRTIO_ID_NET, 0x02, 0x00);
-	dev->priv = net_info;
-
-	/* Network devices need a recv and a send queue, just like console. */
-	add_pci_virtqueue(dev, net_input, "rx");
-	add_pci_virtqueue(dev, net_output, "tx");
-
-	/*
-	 * We need a socket to perform the magic network ioctls to bring up the
-	 * tap interface, connect to the bridge etc.  Any socket will do!
-	 */
-	ipfd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
-	if (ipfd < 0)
-		err(1, "opening IP socket");
-
-	/* If the command line was --tunnet=bridge:<name> do bridging. */
-	if (!strncmp(BRIDGE_PFX, arg, strlen(BRIDGE_PFX))) {
-		arg += strlen(BRIDGE_PFX);
-		bridging = true;
-	}
-
-	/* A mac address may follow the bridge name or IP address */
-	p = strchr(arg, ':');
-	if (p) {
-		str2mac(p+1, conf.mac);
-		add_pci_feature(dev, VIRTIO_NET_F_MAC);
-		*p = '\0';
-	}
-
-	/* arg is now either an IP address or a bridge name */
-	if (bridging)
-		add_to_bridge(ipfd, tapif, arg);
-	else
-		ip = str2ip(arg);
-
-	/* Set up the tun device. */
-	configure_device(ipfd, tapif, ip);
-
-	/* Expect Guest to handle everything except UFO */
-	add_pci_feature(dev, VIRTIO_NET_F_CSUM);
-	add_pci_feature(dev, VIRTIO_NET_F_GUEST_CSUM);
-	add_pci_feature(dev, VIRTIO_NET_F_GUEST_TSO4);
-	add_pci_feature(dev, VIRTIO_NET_F_GUEST_TSO6);
-	add_pci_feature(dev, VIRTIO_NET_F_GUEST_ECN);
-	add_pci_feature(dev, VIRTIO_NET_F_HOST_TSO4);
-	add_pci_feature(dev, VIRTIO_NET_F_HOST_TSO6);
-	add_pci_feature(dev, VIRTIO_NET_F_HOST_ECN);
-	/* We handle indirect ring entries */
-	add_pci_feature(dev, VIRTIO_RING_F_INDIRECT_DESC);
-	set_device_config(dev, &conf, sizeof(conf));
-
-	/* We don't need the socket any more; setup is done. */
-	close(ipfd);
-
-	if (bridging)
-		verbose("device %u: tun %s attached to bridge: %s\n",
-			devices.device_num, tapif, arg);
-	else
-		verbose("device %u: tun %s: %s\n",
-			devices.device_num, tapif, arg);
-}
-/*:*/
-
-/* This hangs off device->priv. */
-struct vblk_info {
-	/* The size of the file. */
-	off64_t len;
-
-	/* The file descriptor for the file. */
-	int fd;
-
-};
-
-/*L:210
- * The Disk
- *
- * The disk only has one virtqueue, so it only has one thread.  It is really
- * simple: the Guest asks for a block number and we read or write that position
- * in the file.
- *
- * Before we serviced each virtqueue in a separate thread, that was unacceptably
- * slow: the Guest waits until the read is finished before running anything
- * else, even if it could have been doing useful work.
- *
- * We could have used async I/O, except it's reputed to suck so hard that
- * characters actually go missing from your code when you try to use it.
- */
-static void blk_request(struct virtqueue *vq)
-{
-	struct vblk_info *vblk = vq->dev->priv;
-	unsigned int head, out_num, in_num, wlen;
-	int ret, i;
-	u8 *in;
-	struct virtio_blk_outhdr out;
-	struct iovec iov[vq->vring.num];
-	off64_t off;
-
-	/*
-	 * Get the next request, where we normally wait.  It triggers the
-	 * interrupt to acknowledge previously serviced requests (if any).
-	 */
-	head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
-
-	/* Copy the output header from the front of the iov (adjusts iov) */
-	iov_consume(vq->dev, iov, out_num, &out, sizeof(out));
-
-	/* Find and trim end of iov input array, for our status byte. */
-	in = NULL;
-	for (i = out_num + in_num - 1; i >= out_num; i--) {
-		if (iov[i].iov_len > 0) {
-			in = iov[i].iov_base + iov[i].iov_len - 1;
-			iov[i].iov_len--;
-			break;
-		}
-	}
-	if (!in)
-		bad_driver_vq(vq, "Bad virtblk cmd with no room for status");
-
-	/*
-	 * For historical reasons, block operations are expressed in 512 byte
-	 * "sectors".
-	 */
-	off = out.sector * 512;
-
-	if (out.type & VIRTIO_BLK_T_OUT) {
-		/*
-		 * Write
-		 *
-		 * Move to the right location in the block file.  This can fail
-		 * if they try to write past end.
-		 */
-		if (lseek64(vblk->fd, off, SEEK_SET) != off)
-			err(1, "Bad seek to sector %llu", out.sector);
-
-		ret = writev(vblk->fd, iov, out_num);
-		verbose("WRITE to sector %llu: %i\n", out.sector, ret);
-
-		/*
-		 * Grr... Now we know how long the descriptor they sent was, we
-		 * make sure they didn't try to write over the end of the block
-		 * file (possibly extending it).
-		 */
-		if (ret > 0 && off + ret > vblk->len) {
-			/* Trim it back to the correct length */
-			ftruncate64(vblk->fd, vblk->len);
-			/* Die, bad Guest, die. */
-			bad_driver_vq(vq, "Write past end %llu+%u", off, ret);
-		}
-
-		wlen = sizeof(*in);
-		*in = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR);
-	} else if (out.type & VIRTIO_BLK_T_FLUSH) {
-		/* Flush */
-		ret = fdatasync(vblk->fd);
-		verbose("FLUSH fdatasync: %i\n", ret);
-		wlen = sizeof(*in);
-		*in = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR);
-	} else {
-		/*
-		 * Read
-		 *
-		 * Move to the right location in the block file.  This can fail
-		 * if they try to read past end.
-		 */
-		if (lseek64(vblk->fd, off, SEEK_SET) != off)
-			err(1, "Bad seek to sector %llu", out.sector);
-
-		ret = readv(vblk->fd, iov + out_num, in_num);
-		if (ret >= 0) {
-			wlen = sizeof(*in) + ret;
-			*in = VIRTIO_BLK_S_OK;
-		} else {
-			wlen = sizeof(*in);
-			*in = VIRTIO_BLK_S_IOERR;
-		}
-	}
-
-	/* Finished that request. */
-	add_used(vq, head, wlen);
-}
-
-/*L:198 This actually sets up a virtual block device. */
-static void setup_block_file(const char *filename)
-{
-	struct device *dev;
-	struct vblk_info *vblk;
-	struct virtio_blk_config conf;
-
-	/* Create the device. */
-	dev = new_pci_device("block", VIRTIO_ID_BLOCK, 0x01, 0x80);
-
-	/* The device has one virtqueue, where the Guest places requests. */
-	add_pci_virtqueue(dev, blk_request, "request");
-
-	/* Allocate the room for our own bookkeeping */
-	vblk = dev->priv = malloc(sizeof(*vblk));
-
-	/* First we open the file and store the length. */
-	vblk->fd = open_or_die(filename, O_RDWR|O_LARGEFILE);
-	vblk->len = lseek64(vblk->fd, 0, SEEK_END);
-
-	/* Tell Guest how many sectors this device has. */
-	conf.capacity = cpu_to_le64(vblk->len / 512);
-
-	/*
-	 * Tell Guest not to put in too many descriptors at once: two are used
-	 * for the in and out elements.
-	 */
-	add_pci_feature(dev, VIRTIO_BLK_F_SEG_MAX);
-	conf.seg_max = cpu_to_le32(VIRTQUEUE_NUM - 2);
-
-	set_device_config(dev, &conf, sizeof(struct virtio_blk_config));
-
-	verbose("device %u: virtblock %llu sectors\n",
-		devices.device_num, le64_to_cpu(conf.capacity));
-}
-
-/*L:211
- * Our random number generator device reads from /dev/urandom into the Guest's
- * input buffers.  The usual case is that the Guest doesn't want random numbers
- * and so has no buffers although /dev/urandom is still readable, whereas
- * console is the reverse.
- *
- * The same logic applies, however.
- */
-struct rng_info {
-	int rfd;
-};
-
-static void rng_input(struct virtqueue *vq)
-{
-	int len;
-	unsigned int head, in_num, out_num, totlen = 0;
-	struct rng_info *rng_info = vq->dev->priv;
-	struct iovec iov[vq->vring.num];
-
-	/* First we need a buffer from the Guests's virtqueue. */
-	head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
-	if (out_num)
-		bad_driver_vq(vq, "Output buffers in rng?");
-
-	/*
-	 * Just like the console write, we loop to cover the whole iovec.
-	 * In this case, short reads actually happen quite a bit.
-	 */
-	while (!iov_empty(iov, in_num)) {
-		len = readv(rng_info->rfd, iov, in_num);
-		if (len <= 0)
-			err(1, "Read from /dev/urandom gave %i", len);
-		iov_consume(vq->dev, iov, in_num, NULL, len);
-		totlen += len;
-	}
-
-	/* Tell the Guest about the new input. */
-	add_used(vq, head, totlen);
-}
-
-/*L:199
- * This creates a "hardware" random number device for the Guest.
- */
-static void setup_rng(void)
-{
-	struct device *dev;
-	struct rng_info *rng_info = malloc(sizeof(*rng_info));
-
-	/* Our device's private info simply contains the /dev/urandom fd. */
-	rng_info->rfd = open_or_die("/dev/urandom", O_RDONLY);
-
-	/* Create the new device. */
-	dev = new_pci_device("rng", VIRTIO_ID_RNG, 0xff, 0);
-	dev->priv = rng_info;
-
-	/* The device has one virtqueue, where the Guest places inbufs. */
-	add_pci_virtqueue(dev, rng_input, "input");
-
-	/* We don't have any configuration space */
-	no_device_config(dev);
-
-	verbose("device %u: rng\n", devices.device_num);
-}
-/* That's the end of device setup. */
-
-/*L:230 Reboot is pretty easy: clean up and exec() the Launcher afresh. */
-static void __attribute__((noreturn)) restart_guest(void)
-{
-	unsigned int i;
-
-	/*
-	 * Since we don't track all open fds, we simply close everything beyond
-	 * stderr.
-	 */
-	for (i = 3; i < FD_SETSIZE; i++)
-		close(i);
-
-	/* Reset all the devices (kills all threads). */
-	cleanup_devices();
-
-	execv(main_args[0], main_args);
-	err(1, "Could not exec %s", main_args[0]);
-}
-
-/*L:220
- * Finally we reach the core of the Launcher which runs the Guest, serves
- * its input and output, and finally, lays it to rest.
- */
-static void __attribute__((noreturn)) run_guest(void)
-{
-	for (;;) {
-		struct lguest_pending notify;
-		int readval;
-
-		/* We read from the /dev/lguest device to run the Guest. */
-		readval = pread(lguest_fd, &notify, sizeof(notify), cpu_id);
-		if (readval == sizeof(notify)) {
-			if (notify.trap == 13) {
-				verbose("Emulating instruction at %#x\n",
-					getreg(eip));
-				emulate_insn(notify.insn);
-			} else if (notify.trap == 14) {
-				verbose("Emulating MMIO at %#x\n",
-					getreg(eip));
-				emulate_mmio(notify.addr, notify.insn);
-			} else
-				errx(1, "Unknown trap %i addr %#08x\n",
-				     notify.trap, notify.addr);
-		/* ENOENT means the Guest died.  Reading tells us why. */
-		} else if (errno == ENOENT) {
-			char reason[1024] = { 0 };
-			pread(lguest_fd, reason, sizeof(reason)-1, cpu_id);
-			errx(1, "%s", reason);
-		/* ERESTART means that we need to reboot the guest */
-		} else if (errno == ERESTART) {
-			restart_guest();
-		/* Anything else means a bug or incompatible change. */
-		} else
-			err(1, "Running guest failed");
-	}
-}
-/*L:240
- * This is the end of the Launcher.  The good news: we are over halfway
- * through!  The bad news: the most fiendish part of the code still lies ahead
- * of us.
- *
- * Are you ready?  Take a deep breath and join me in the core of the Host, in
- * "make Host".
-:*/
-
-static struct option opts[] = {
-	{ "verbose", 0, NULL, 'v' },
-	{ "tunnet", 1, NULL, 't' },
-	{ "block", 1, NULL, 'b' },
-	{ "rng", 0, NULL, 'r' },
-	{ "initrd", 1, NULL, 'i' },
-	{ "username", 1, NULL, 'u' },
-	{ "chroot", 1, NULL, 'c' },
-	{ NULL },
-};
-static void usage(void)
-{
-	errx(1, "Usage: lguest [--verbose] "
-	     "[--tunnet=(<ipaddr>:<macaddr>|bridge:<bridgename>:<macaddr>)\n"
-	     "|--block=<filename>|--initrd=<filename>]...\n"
-	     "<mem-in-mb> vmlinux [args...]");
-}
-
-/*L:105 The main routine is where the real work begins: */
-int main(int argc, char *argv[])
-{
-	/* Memory, code startpoint and size of the (optional) initrd. */
-	unsigned long mem = 0, start, initrd_size = 0;
-	/* Two temporaries. */
-	int i, c;
-	/* The boot information for the Guest. */
-	struct boot_params *boot;
-	/* If they specify an initrd file to load. */
-	const char *initrd_name = NULL;
-
-	/* Password structure for initgroups/setres[gu]id */
-	struct passwd *user_details = NULL;
-
-	/* Directory to chroot to */
-	char *chroot_path = NULL;
-
-	/* Save the args: we "reboot" by execing ourselves again. */
-	main_args = argv;
-
-	/*
-	 * First we initialize the device list.  We remember next interrupt
-	 * number to use for devices (1: remember that 0 is used by the timer).
-	 */
-	devices.next_irq = 1;
-
-	/* We're CPU 0.  In fact, that's the only CPU possible right now. */
-	cpu_id = 0;
-
-	/*
-	 * We need to know how much memory so we can set up the device
-	 * descriptor and memory pages for the devices as we parse the command
-	 * line.  So we quickly look through the arguments to find the amount
-	 * of memory now.
-	 */
-	for (i = 1; i < argc; i++) {
-		if (argv[i][0] != '-') {
-			mem = atoi(argv[i]) * 1024 * 1024;
-			/*
-			 * We start by mapping anonymous pages over all of
-			 * guest-physical memory range.  This fills it with 0,
-			 * and ensures that the Guest won't be killed when it
-			 * tries to access it.
-			 */
-			guest_base = map_zeroed_pages(mem / getpagesize()
-						      + DEVICE_PAGES);
-			guest_limit = mem;
-			guest_max = guest_mmio = mem + DEVICE_PAGES*getpagesize();
-			break;
-		}
-	}
-
-	/* If we exit via err(), this kills all the threads, restores tty. */
-	atexit(cleanup_devices);
-
-	/* We always have a console device, and it's always device 1. */
-	setup_console();
-
-	/* The options are fairly straight-forward */
-	while ((c = getopt_long(argc, argv, "v", opts, NULL)) != EOF) {
-		switch (c) {
-		case 'v':
-			verbose = true;
-			break;
-		case 't':
-			setup_tun_net(optarg);
-			break;
-		case 'b':
-			setup_block_file(optarg);
-			break;
-		case 'r':
-			setup_rng();
-			break;
-		case 'i':
-			initrd_name = optarg;
-			break;
-		case 'u':
-			user_details = getpwnam(optarg);
-			if (!user_details)
-				err(1, "getpwnam failed, incorrect username?");
-			break;
-		case 'c':
-			chroot_path = optarg;
-			break;
-		default:
-			warnx("Unknown argument %s", argv[optind]);
-			usage();
-		}
-	}
-	/*
-	 * After the other arguments we expect memory and kernel image name,
-	 * followed by command line arguments for the kernel.
-	 */
-	if (optind + 2 > argc)
-		usage();
-
-	verbose("Guest base is at %p\n", guest_base);
-
-	/* Initialize the (fake) PCI host bridge device. */
-	init_pci_host_bridge();
-
-	/* Now we load the kernel */
-	start = load_kernel(open_or_die(argv[optind+1], O_RDONLY));
-
-	/* Boot information is stashed at physical address 0 */
-	boot = from_guest_phys(0);
-
-	/* Map the initrd image if requested (at top of physical memory) */
-	if (initrd_name) {
-		initrd_size = load_initrd(initrd_name, mem);
-		/*
-		 * These are the location in the Linux boot header where the
-		 * start and size of the initrd are expected to be found.
-		 */
-		boot->hdr.ramdisk_image = mem - initrd_size;
-		boot->hdr.ramdisk_size = initrd_size;
-		/* The bootloader type 0xFF means "unknown"; that's OK. */
-		boot->hdr.type_of_loader = 0xFF;
-	}
-
-	/*
-	 * The Linux boot header contains an "E820" memory map: ours is a
-	 * simple, single region.
-	 */
-	boot->e820_entries = 1;
-	boot->e820_table[0] = ((struct e820_entry) { 0, mem, E820_TYPE_RAM });
-	/*
-	 * The boot header contains a command line pointer: we put the command
-	 * line after the boot header.
-	 */
-	boot->hdr.cmd_line_ptr = to_guest_phys(boot + 1);
-	/* We use a simple helper to copy the arguments separated by spaces. */
-	concat((char *)(boot + 1), argv+optind+2);
-
-	/* Set kernel alignment to 16M (CONFIG_PHYSICAL_ALIGN) */
-	boot->hdr.kernel_alignment = 0x1000000;
-
-	/* Boot protocol version: 2.07 supports the fields for lguest. */
-	boot->hdr.version = 0x207;
-
-	/* X86_SUBARCH_LGUEST tells the Guest it's an lguest. */
-	boot->hdr.hardware_subarch = X86_SUBARCH_LGUEST;
-
-	/* Tell the entry path not to try to reload segment registers. */
-	boot->hdr.loadflags |= KEEP_SEGMENTS;
-
-	/* We don't support tboot: */
-	boot->tboot_addr = 0;
-
-	/* Ensure this is 0 to prevent APM from loading: */
-	boot->apm_bios_info.version = 0;
-
-	/* We tell the kernel to initialize the Guest. */
-	tell_kernel(start);
-
-	/* Ensure that we terminate if a device-servicing child dies. */
-	signal(SIGCHLD, kill_launcher);
-
-	/* If requested, chroot to a directory */
-	if (chroot_path) {
-		if (chroot(chroot_path) != 0)
-			err(1, "chroot(\"%s\") failed", chroot_path);
-
-		if (chdir("/") != 0)
-			err(1, "chdir(\"/\") failed");
-
-		verbose("chroot done\n");
-	}
-
-	/* If requested, drop privileges */
-	if (user_details) {
-		uid_t u;
-		gid_t g;
-
-		u = user_details->pw_uid;
-		g = user_details->pw_gid;
-
-		if (initgroups(user_details->pw_name, g) != 0)
-			err(1, "initgroups failed");
-
-		if (setresgid(g, g, g) != 0)
-			err(1, "setresgid failed");
-
-		if (setresuid(u, u, u) != 0)
-			err(1, "setresuid failed");
-
-		verbose("Dropping privileges completed\n");
-	}
-
-	/* Finally, run the Guest.  This doesn't return. */
-	run_guest();
-}
-/*:*/
-
-/*M:999
- * Mastery is done: you now know everything I do.
- *
- * But surely you have seen code, features and bugs in your wanderings which
- * you now yearn to attack?  That is the real game, and I look forward to you
- * patching and forking lguest into the Your-Name-Here-visor.
- *
- * Farewell, and good coding!
- * Rusty Russell.
- */
diff --git a/tools/lguest/lguest.txt b/tools/lguest/lguest.txt
deleted file mode 100644
index 06e1f4649511..000000000000
--- a/tools/lguest/lguest.txt
+++ /dev/null
@@ -1,125 +0,0 @@
-      __
- (___()'`;  Rusty's Remarkably Unreliable Guide to Lguest
- /,    /`      - or, A Young Coder's Illustrated Hypervisor
- \\"--\\    http://lguest.ozlabs.org
-
-Lguest is designed to be a minimal 32-bit x86 hypervisor for the Linux kernel,
-for Linux developers and users to experiment with virtualization with the
-minimum of complexity.  Nonetheless, it should have sufficient features to
-make it useful for specific tasks, and, of course, you are encouraged to fork
-and enhance it (see drivers/lguest/README).
-
-Features:
-
-- Kernel module which runs in a normal kernel.
-- Simple I/O model for communication.
-- Simple program to create new guests.
-- Logo contains cute puppies: http://lguest.ozlabs.org
-
-Developer features:
-
-- Fun to hack on.
-- No ABI: being tied to a specific kernel anyway, you can change anything.
-- Many opportunities for improvement or feature implementation.
-
-Running Lguest:
-
-- The easiest way to run lguest is to use same kernel as guest and host.
-  You can configure them differently, but usually it's easiest not to.
-
-  You will need to configure your kernel with the following options:
-
-  "Processor type and features":
-     "Paravirtualized guest support" = Y
-        "Lguest guest support" = Y
-     "High Memory Support" = off/4GB
-     "Alignment value to which kernel should be aligned" = 0x100000
-        (CONFIG_PARAVIRT=y, CONFIG_LGUEST_GUEST=y, CONFIG_HIGHMEM64G=n and
-         CONFIG_PHYSICAL_ALIGN=0x100000)
-
-  "Device Drivers":
-     "Block devices"
-        "Virtio block driver" = M/Y
-     "Network device support"
-        "Universal TUN/TAP device driver support" = M/Y
-        "Virtio network driver" = M/Y
-           (CONFIG_VIRTIO_BLK=m, CONFIG_VIRTIO_NET=m and CONFIG_TUN=m)
-
-  "Virtualization"
-     "Linux hypervisor example code" = M/Y
-        (CONFIG_LGUEST=m)
-
-- A tool called "lguest" is available in this directory: type "make"
-  to build it.  If you didn't build your kernel in-tree, use "make
-  O=<builddir>".
-
-- Create or find a root disk image.  There are several useful ones
-  around, such as the xm-test tiny root image at
-	  http://xm-test.xensource.com/ramdisks/initrd-1.1-i386.img
-
-  For more serious work, I usually use a distribution ISO image and
-  install it under qemu, then make multiple copies:
-
-	  dd if=/dev/zero of=rootfile bs=1M count=2048
-	  qemu -cdrom image.iso -hda rootfile -net user -net nic -boot d
-
-  Make sure that you install a getty on /dev/hvc0 if you want to log in on the
-  console!
-
-- "modprobe lg" if you built it as a module.
-
-- Run an lguest as root:
-
-      tools/lguest/lguest 64 vmlinux --tunnet=192.168.19.1 \
-        --block=rootfile root=/dev/vda
-
-   Explanation:
-    64: the amount of memory to use, in MB.
-
-    vmlinux: the kernel image found in the top of your build directory.  You
-       can also use a standard bzImage.
-
-    --tunnet=192.168.19.1: configures a "tap" device for networking with this
-       IP address.
-
-    --block=rootfile: a file or block device which becomes /dev/vda
-       inside the guest.
-
-    root=/dev/vda: this (and anything else on the command line) are
-       kernel boot parameters.
-
-- Configuring networking.  I usually have the host masquerade, using
-  "iptables -t nat -A POSTROUTING -o eth0 -j MASQUERADE" and "echo 1 >
-  /proc/sys/net/ipv4/ip_forward".  In this example, I would configure
-  eth0 inside the guest at 192.168.19.2.
-
-  Another method is to bridge the tap device to an external interface
-  using --tunnet=bridge:<bridgename>, and perhaps run dhcp on the guest
-  to obtain an IP address.  The bridge needs to be configured first:
-  this option simply adds the tap interface to it.
-
-  A simple example on my system:
-
-    ifconfig eth0 0.0.0.0
-    brctl addbr lg0
-    ifconfig lg0 up
-    brctl addif lg0 eth0
-    dhclient lg0
-
-  Then use --tunnet=bridge:lg0 when launching the guest.
-
-  See:
-  
-    http://www.linuxfoundation.org/collaborate/workgroups/networking/bridge
-    
-  for general information on how to get bridging to work.
-
-- Random number generation. Using the --rng option will provide a
-  /dev/hwrng in the guest that will read from the host's /dev/random.
-  Use this option in conjunction with rng-tools (see ../hw_random.txt)
-  to provide entropy to the guest kernel's /dev/random.
-
-There is a helpful mailing list at http://ozlabs.org/mailman/listinfo/lguest
-
-Good luck!
-Rusty Russell rusty@rustcorp.com.au.