diff --git a/MAINTAINERS b/MAINTAINERS index 84d6a8277cbd..6c8b66d2adcb 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7640,17 +7640,6 @@ T: git git://linuxtv.org/mkrufky/tuners.git S: Maintained F: drivers/media/dvb-frontends/lgdt3305.* -LGUEST -M: Rusty Russell -L: lguest@lists.ozlabs.org -W: http://lguest.ozlabs.org/ -S: Odd Fixes -F: arch/x86/include/asm/lguest*.h -F: arch/x86/lguest/ -F: drivers/lguest/ -F: include/linux/lguest*.h -F: tools/lguest/ - LIBATA PATA ARASAN COMPACT FLASH CONTROLLER M: Viresh Kumar L: linux-ide@vger.kernel.org diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild index 586b786b3edf..f65a804b86f0 100644 --- a/arch/x86/Kbuild +++ b/arch/x86/Kbuild @@ -10,9 +10,6 @@ obj-$(CONFIG_XEN) += xen/ # Hyper-V paravirtualization support obj-$(CONFIG_HYPERVISOR_GUEST) += hyperv/ -# lguest paravirtualization support -obj-$(CONFIG_LGUEST_GUEST) += lguest/ - obj-y += realmode/ obj-y += kernel/ obj-y += mm/ diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 9b302121584d..651021713385 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -777,8 +777,6 @@ config KVM_DEBUG_FS Statistics are displayed in debugfs filesystem. Enabling this option may incur significant overhead. -source "arch/x86/lguest/Kconfig" - config PARAVIRT_TIME_ACCOUNTING bool "Paravirtual steal time accounting" depends on PARAVIRT diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h deleted file mode 100644 index 73d0c9b92087..000000000000 --- a/arch/x86/include/asm/lguest.h +++ /dev/null @@ -1,91 +0,0 @@ -#ifndef _ASM_X86_LGUEST_H -#define _ASM_X86_LGUEST_H - -#define GDT_ENTRY_LGUEST_CS 10 -#define GDT_ENTRY_LGUEST_DS 11 -#define LGUEST_CS (GDT_ENTRY_LGUEST_CS * 8) -#define LGUEST_DS (GDT_ENTRY_LGUEST_DS * 8) - -#ifndef __ASSEMBLY__ -#include - -#define GUEST_PL 1 - -/* Page for Switcher text itself, then two pages per cpu */ -#define SWITCHER_TEXT_PAGES (1) -#define SWITCHER_STACK_PAGES (2 * nr_cpu_ids) -#define TOTAL_SWITCHER_PAGES (SWITCHER_TEXT_PAGES + SWITCHER_STACK_PAGES) - -/* Where we map the Switcher, in both Host and Guest. */ -extern unsigned long switcher_addr; - -/* Found in switcher.S */ -extern unsigned long default_idt_entries[]; - -/* Declarations for definitions in arch/x86/lguest/head_32.S */ -extern char lguest_noirq_iret[]; -extern const char lgstart_cli[], lgend_cli[]; -extern const char lgstart_pushf[], lgend_pushf[]; - -extern void lguest_iret(void); -extern void lguest_init(void); - -struct lguest_regs { - /* Manually saved part. */ - unsigned long eax, ebx, ecx, edx; - unsigned long esi, edi, ebp; - unsigned long gs; - unsigned long fs, ds, es; - unsigned long trapnum, errcode; - /* Trap pushed part */ - unsigned long eip; - unsigned long cs; - unsigned long eflags; - unsigned long esp; - unsigned long ss; -}; - -/* This is a guest-specific page (mapped ro) into the guest. */ -struct lguest_ro_state { - /* Host information we need to restore when we switch back. */ - u32 host_cr3; - struct desc_ptr host_idt_desc; - struct desc_ptr host_gdt_desc; - u32 host_sp; - - /* Fields which are used when guest is running. */ - struct desc_ptr guest_idt_desc; - struct desc_ptr guest_gdt_desc; - struct x86_hw_tss guest_tss; - struct desc_struct guest_idt[IDT_ENTRIES]; - struct desc_struct guest_gdt[GDT_ENTRIES]; -}; - -struct lg_cpu_arch { - /* The GDT entries copied into lguest_ro_state when running. */ - struct desc_struct gdt[GDT_ENTRIES]; - - /* The IDT entries: some copied into lguest_ro_state when running. */ - struct desc_struct idt[IDT_ENTRIES]; - - /* The address of the last guest-visible pagefault (ie. cr2). */ - unsigned long last_pagefault; -}; - -static inline void lguest_set_ts(void) -{ - u32 cr0; - - cr0 = read_cr0(); - if (!(cr0 & 8)) - write_cr0(cr0 | 8); -} - -/* Full 4G segment descriptors, suitable for CS and DS. */ -#define FULL_EXEC_SEGMENT \ - ((struct desc_struct)GDT_ENTRY_INIT(0xc09b, 0, 0xfffff)) -#define FULL_SEGMENT ((struct desc_struct)GDT_ENTRY_INIT(0xc093, 0, 0xfffff)) - -#endif /* __ASSEMBLY__ */ - -#endif /* _ASM_X86_LGUEST_H */ diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h deleted file mode 100644 index 6c119cfae218..000000000000 --- a/arch/x86/include/asm/lguest_hcall.h +++ /dev/null @@ -1,74 +0,0 @@ -/* Architecture specific portion of the lguest hypercalls */ -#ifndef _ASM_X86_LGUEST_HCALL_H -#define _ASM_X86_LGUEST_HCALL_H - -#define LHCALL_FLUSH_ASYNC 0 -#define LHCALL_LGUEST_INIT 1 -#define LHCALL_SHUTDOWN 2 -#define LHCALL_NEW_PGTABLE 4 -#define LHCALL_FLUSH_TLB 5 -#define LHCALL_LOAD_IDT_ENTRY 6 -#define LHCALL_SET_STACK 7 -#define LHCALL_SET_CLOCKEVENT 9 -#define LHCALL_HALT 10 -#define LHCALL_SET_PMD 13 -#define LHCALL_SET_PTE 14 -#define LHCALL_SET_PGD 15 -#define LHCALL_LOAD_TLS 16 -#define LHCALL_LOAD_GDT_ENTRY 18 -#define LHCALL_SEND_INTERRUPTS 19 - -#define LGUEST_TRAP_ENTRY 0x1F - -/* Argument number 3 to LHCALL_LGUEST_SHUTDOWN */ -#define LGUEST_SHUTDOWN_POWEROFF 1 -#define LGUEST_SHUTDOWN_RESTART 2 - -#ifndef __ASSEMBLY__ -#include - -/*G:030 - * But first, how does our Guest contact the Host to ask for privileged - * operations? There are two ways: the direct way is to make a "hypercall", - * to make requests of the Host Itself. - * - * Our hypercall mechanism uses the highest unused trap code (traps 32 and - * above are used by real hardware interrupts). Seventeen hypercalls are - * available: the hypercall number is put in the %eax register, and the - * arguments (when required) are placed in %ebx, %ecx, %edx and %esi. - * If a return value makes sense, it's returned in %eax. - * - * Grossly invalid calls result in Sudden Death at the hands of the vengeful - * Host, rather than returning failure. This reflects Winston Churchill's - * definition of a gentleman: "someone who is only rude intentionally". - */ -static inline unsigned long -hcall(unsigned long call, - unsigned long arg1, unsigned long arg2, unsigned long arg3, - unsigned long arg4) -{ - /* "int" is the Intel instruction to trigger a trap. */ - asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY) - /* The call in %eax (aka "a") might be overwritten */ - : "=a"(call) - /* The arguments are in %eax, %ebx, %ecx, %edx & %esi */ - : "a"(call), "b"(arg1), "c"(arg2), "d"(arg3), "S"(arg4) - /* "memory" means this might write somewhere in memory. - * This isn't true for all calls, but it's safe to tell - * gcc that it might happen so it doesn't get clever. */ - : "memory"); - return call; -} -/*:*/ - -/* Can't use our min() macro here: needs to be a constant */ -#define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32) - -#define LHCALL_RING_SIZE 64 -struct hcall_args { - /* These map directly onto eax/ebx/ecx/edx/esi in struct lguest_regs */ - unsigned long arg0, arg1, arg2, arg3, arg4; -}; - -#endif /* !__ASSEMBLY__ */ -#endif /* _ASM_X86_LGUEST_HCALL_H */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 0b03d655db7c..abc99b9c7ffd 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -662,7 +662,7 @@ static inline void sync_core(void) * In case NMI unmasking or performance ever becomes a problem, * the next best option appears to be MOV-to-CR2 and an * unconditional jump. That sequence also works on all CPUs, - * but it will fault at CPL3 (i.e. Xen PV and lguest). + * but it will fault at CPL3 (i.e. Xen PV). * * CPUID is the conventional way, but it's nasty: it doesn't * exist on some 486-like CPUs, and it usually exits to a diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index ddef37b16af2..66b8f93333d1 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h @@ -201,7 +201,7 @@ struct boot_params { * * @X86_SUBARCH_PC: Should be used if the hardware is enumerable using standard * PC mechanisms (PCI, ACPI) and doesn't need a special boot flow. - * @X86_SUBARCH_LGUEST: Used for x86 hypervisor demo, lguest + * @X86_SUBARCH_LGUEST: Used for x86 hypervisor demo, lguest, deprecated * @X86_SUBARCH_XEN: Used for Xen guest types which follow the PV boot path, * which start at asm startup_xen() entry point and later jump to the C * xen_start_kernel() entry point. Both domU and dom0 type of guests are diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 880aa093268d..710edab9e644 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -4,9 +4,6 @@ #include -#include -#include "../../../drivers/lguest/lg.h" - #define __SYSCALL_I386(nr, sym, qual) [nr] = 1, static char syscalls[] = { #include @@ -62,23 +59,6 @@ void foo(void) OFFSET(stack_canary_offset, stack_canary, canary); #endif -#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) - BLANK(); - OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); - OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending); - - BLANK(); - OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc); - OFFSET(LGUEST_PAGES_host_idt_desc, lguest_pages, state.host_idt_desc); - OFFSET(LGUEST_PAGES_host_cr3, lguest_pages, state.host_cr3); - OFFSET(LGUEST_PAGES_host_sp, lguest_pages, state.host_sp); - OFFSET(LGUEST_PAGES_guest_gdt_desc, lguest_pages,state.guest_gdt_desc); - OFFSET(LGUEST_PAGES_guest_idt_desc, lguest_pages,state.guest_idt_desc); - OFFSET(LGUEST_PAGES_guest_gdt, lguest_pages, state.guest_gdt); - OFFSET(LGUEST_PAGES_regs_trapnum, lguest_pages, regs.trapnum); - OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode); - OFFSET(LGUEST_PAGES_regs, lguest_pages, regs); -#endif BLANK(); DEFINE(__NR_syscall_max, sizeof(syscalls) - 1); DEFINE(NR_syscalls, sizeof(syscalls)); diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 0332664eb158..29da9599fec0 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -155,7 +155,6 @@ ENTRY(startup_32) jmp *%eax .Lbad_subarch: -WEAK(lguest_entry) WEAK(xen_entry) /* Unknown implementation; there's really nothing we can do at this point. */ @@ -165,7 +164,6 @@ WEAK(xen_entry) subarch_entries: .long .Ldefault_entry /* normal x86/PC */ - .long lguest_entry /* lguest hypervisor */ .long xen_entry /* Xen hypervisor */ .long .Ldefault_entry /* Moorestown MID */ num_subarch_entries = (. - subarch_entries) / 4 diff --git a/arch/x86/kernel/platform-quirks.c b/arch/x86/kernel/platform-quirks.c index 91271122f0df..502a77d0adb0 100644 --- a/arch/x86/kernel/platform-quirks.c +++ b/arch/x86/kernel/platform-quirks.c @@ -16,7 +16,6 @@ void __init x86_early_init_platform_quirks(void) x86_platform.legacy.reserve_bios_regions = 1; break; case X86_SUBARCH_XEN: - case X86_SUBARCH_LGUEST: x86_platform.legacy.devices.pnpbios = 0; x86_platform.legacy.rtc = 0; break; diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 2688c7dc5323..3ea624452f93 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -89,6 +89,5 @@ config KVM_MMU_AUDIT # OK, it's a little counter-intuitive to do this, but it puts it neatly under # the virtualization menu. source drivers/vhost/Kconfig -source drivers/lguest/Kconfig endif # VIRTUALIZATION diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig deleted file mode 100644 index 08f41caada45..000000000000 --- a/arch/x86/lguest/Kconfig +++ /dev/null @@ -1,14 +0,0 @@ -config LGUEST_GUEST - bool "Lguest guest support" - depends on X86_32 && PARAVIRT && PCI - select TTY - select VIRTUALIZATION - select VIRTIO - select VIRTIO_CONSOLE - help - Lguest is a tiny in-kernel hypervisor. Selecting this will - allow your kernel to boot under lguest. This option will increase - your kernel size by about 10k. If in doubt, say N. - - If you say Y here, make sure you say Y (or M) to the virtio block - and net drivers which lguest needs. diff --git a/arch/x86/lguest/Makefile b/arch/x86/lguest/Makefile deleted file mode 100644 index 8f38d577a2fa..000000000000 --- a/arch/x86/lguest/Makefile +++ /dev/null @@ -1,2 +0,0 @@ -obj-y := head_32.o boot.o -CFLAGS_boot.o := $(call cc-option, -fno-stack-protector) diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c deleted file mode 100644 index 99472698c931..000000000000 --- a/arch/x86/lguest/boot.c +++ /dev/null @@ -1,1558 +0,0 @@ -/*P:010 - * A hypervisor allows multiple Operating Systems to run on a single machine. - * To quote David Wheeler: "Any problem in computer science can be solved with - * another layer of indirection." - * - * We keep things simple in two ways. First, we start with a normal Linux - * kernel and insert a module (lg.ko) which allows us to run other Linux - * kernels the same way we'd run processes. We call the first kernel the Host, - * and the others the Guests. The program which sets up and configures Guests - * (such as the example in tools/lguest/lguest.c) is called the Launcher. - * - * Secondly, we only run specially modified Guests, not normal kernels: setting - * CONFIG_LGUEST_GUEST to "y" compiles this file into the kernel so it knows - * how to be a Guest at boot time. This means that you can use the same kernel - * you boot normally (ie. as a Host) as a Guest. - * - * These Guests know that they cannot do privileged operations, such as disable - * interrupts, and that they have to ask the Host to do such things explicitly. - * This file consists of all the replacements for such low-level native - * hardware operations: these special Guest versions call the Host. - * - * So how does the kernel know it's a Guest? We'll see that later, but let's - * just say that we end up here where we replace the native functions various - * "paravirt" structures with our Guest versions, then boot like normal. -:*/ - -/* - * Copyright (C) 2006, Rusty Russell IBM Corporation. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include /* for struct machine_ops */ -#include -#include -#include - -/*G:010 - * Welcome to the Guest! - * - * The Guest in our tale is a simple creature: identical to the Host but - * behaving in simplified but equivalent ways. In particular, the Guest is the - * same kernel as the Host (or at least, built from the same source code). -:*/ - -struct lguest_data lguest_data = { - .hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF }, - .noirq_iret = (u32)lguest_noirq_iret, - .kernel_address = PAGE_OFFSET, - .blocked_interrupts = { 1 }, /* Block timer interrupts */ - .syscall_vec = IA32_SYSCALL_VECTOR, -}; - -/*G:037 - * async_hcall() is pretty simple: I'm quite proud of it really. We have a - * ring buffer of stored hypercalls which the Host will run though next time we - * do a normal hypercall. Each entry in the ring has 5 slots for the hypercall - * arguments, and a "hcall_status" word which is 0 if the call is ready to go, - * and 255 once the Host has finished with it. - * - * If we come around to a slot which hasn't been finished, then the table is - * full and we just make the hypercall directly. This has the nice side - * effect of causing the Host to run all the stored calls in the ring buffer - * which empties it for next time! - */ -static void async_hcall(unsigned long call, unsigned long arg1, - unsigned long arg2, unsigned long arg3, - unsigned long arg4) -{ - /* Note: This code assumes we're uniprocessor. */ - static unsigned int next_call; - unsigned long flags; - - /* - * Disable interrupts if not already disabled: we don't want an - * interrupt handler making a hypercall while we're already doing - * one! - */ - local_irq_save(flags); - if (lguest_data.hcall_status[next_call] != 0xFF) { - /* Table full, so do normal hcall which will flush table. */ - hcall(call, arg1, arg2, arg3, arg4); - } else { - lguest_data.hcalls[next_call].arg0 = call; - lguest_data.hcalls[next_call].arg1 = arg1; - lguest_data.hcalls[next_call].arg2 = arg2; - lguest_data.hcalls[next_call].arg3 = arg3; - lguest_data.hcalls[next_call].arg4 = arg4; - /* Arguments must all be written before we mark it to go */ - wmb(); - lguest_data.hcall_status[next_call] = 0; - if (++next_call == LHCALL_RING_SIZE) - next_call = 0; - } - local_irq_restore(flags); -} - -/*G:035 - * Notice the lazy_hcall() above, rather than hcall(). This is our first real - * optimization trick! - * - * When lazy_mode is set, it means we're allowed to defer all hypercalls and do - * them as a batch when lazy_mode is eventually turned off. Because hypercalls - * are reasonably expensive, batching them up makes sense. For example, a - * large munmap might update dozens of page table entries: that code calls - * paravirt_enter_lazy_mmu(), does the dozen updates, then calls - * lguest_leave_lazy_mode(). - * - * So, when we're in lazy mode, we call async_hcall() to store the call for - * future processing: - */ -static void lazy_hcall1(unsigned long call, unsigned long arg1) -{ - if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) - hcall(call, arg1, 0, 0, 0); - else - async_hcall(call, arg1, 0, 0, 0); -} - -/* You can imagine what lazy_hcall2, 3 and 4 look like. :*/ -static void lazy_hcall2(unsigned long call, - unsigned long arg1, - unsigned long arg2) -{ - if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) - hcall(call, arg1, arg2, 0, 0); - else - async_hcall(call, arg1, arg2, 0, 0); -} - -static void lazy_hcall3(unsigned long call, - unsigned long arg1, - unsigned long arg2, - unsigned long arg3) -{ - if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) - hcall(call, arg1, arg2, arg3, 0); - else - async_hcall(call, arg1, arg2, arg3, 0); -} - -#ifdef CONFIG_X86_PAE -static void lazy_hcall4(unsigned long call, - unsigned long arg1, - unsigned long arg2, - unsigned long arg3, - unsigned long arg4) -{ - if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) - hcall(call, arg1, arg2, arg3, arg4); - else - async_hcall(call, arg1, arg2, arg3, arg4); -} -#endif - -/*G:036 - * When lazy mode is turned off, we issue the do-nothing hypercall to - * flush any stored calls, and call the generic helper to reset the - * per-cpu lazy mode variable. - */ -static void lguest_leave_lazy_mmu_mode(void) -{ - hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0); - paravirt_leave_lazy_mmu(); -} - -/* - * We also catch the end of context switch; we enter lazy mode for much of - * that too, so again we need to flush here. - * - * (Technically, this is lazy CPU mode, and normally we're in lazy MMU - * mode, but unlike Xen, lguest doesn't care about the difference). - */ -static void lguest_end_context_switch(struct task_struct *next) -{ - hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0); - paravirt_end_context_switch(next); -} - -/*G:032 - * After that diversion we return to our first native-instruction - * replacements: four functions for interrupt control. - * - * The simplest way of implementing these would be to have "turn interrupts - * off" and "turn interrupts on" hypercalls. Unfortunately, this is too slow: - * these are by far the most commonly called functions of those we override. - * - * So instead we keep an "irq_enabled" field inside our "struct lguest_data", - * which the Guest can update with a single instruction. The Host knows to - * check there before it tries to deliver an interrupt. - */ - -/* - * save_flags() is expected to return the processor state (ie. "flags"). The - * flags word contains all kind of stuff, but in practice Linux only cares - * about the interrupt flag. Our "save_flags()" just returns that. - */ -asmlinkage __visible unsigned long lguest_save_fl(void) -{ - return lguest_data.irq_enabled; -} - -/* Interrupts go off... */ -asmlinkage __visible void lguest_irq_disable(void) -{ - lguest_data.irq_enabled = 0; -} - -/* - * Let's pause a moment. Remember how I said these are called so often? - * Jeremy Fitzhardinge optimized them so hard early in 2009 that he had to - * break some rules. In particular, these functions are assumed to save their - * own registers if they need to: normal C functions assume they can trash the - * eax register. To use normal C functions, we use - * PV_CALLEE_SAVE_REGS_THUNK(), which pushes %eax onto the stack, calls the - * C function, then restores it. - */ -PV_CALLEE_SAVE_REGS_THUNK(lguest_save_fl); -PV_CALLEE_SAVE_REGS_THUNK(lguest_irq_disable); -/*:*/ - -/* These are in head_32.S */ -extern void lg_irq_enable(void); -extern void lg_restore_fl(unsigned long flags); - -/*M:003 - * We could be more efficient in our checking of outstanding interrupts, rather - * than using a branch. One way would be to put the "irq_enabled" field in a - * page by itself, and have the Host write-protect it when an interrupt comes - * in when irqs are disabled. There will then be a page fault as soon as - * interrupts are re-enabled. - * - * A better method is to implement soft interrupt disable generally for x86: - * instead of disabling interrupts, we set a flag. If an interrupt does come - * in, we then disable them for real. This is uncommon, so we could simply use - * a hypercall for interrupt control and not worry about efficiency. -:*/ - -/*G:034 - * The Interrupt Descriptor Table (IDT). - * - * The IDT tells the processor what to do when an interrupt comes in. Each - * entry in the table is a 64-bit descriptor: this holds the privilege level, - * address of the handler, and... well, who cares? The Guest just asks the - * Host to make the change anyway, because the Host controls the real IDT. - */ -static void lguest_write_idt_entry(gate_desc *dt, - int entrynum, const gate_desc *g) -{ - /* - * The gate_desc structure is 8 bytes long: we hand it to the Host in - * two 32-bit chunks. The whole 32-bit kernel used to hand descriptors - * around like this; typesafety wasn't a big concern in Linux's early - * years. - */ - u32 *desc = (u32 *)g; - /* Keep the local copy up to date. */ - native_write_idt_entry(dt, entrynum, g); - /* Tell Host about this new entry. */ - hcall(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1], 0); -} - -/* - * Changing to a different IDT is very rare: we keep the IDT up-to-date every - * time it is written, so we can simply loop through all entries and tell the - * Host about them. - */ -static void lguest_load_idt(const struct desc_ptr *desc) -{ - unsigned int i; - struct desc_struct *idt = (void *)desc->address; - - for (i = 0; i < (desc->size+1)/8; i++) - hcall(LHCALL_LOAD_IDT_ENTRY, i, idt[i].a, idt[i].b, 0); -} - -/* - * The Global Descriptor Table. - * - * The Intel architecture defines another table, called the Global Descriptor - * Table (GDT). You tell the CPU where it is (and its size) using the "lgdt" - * instruction, and then several other instructions refer to entries in the - * table. There are three entries which the Switcher needs, so the Host simply - * controls the entire thing and the Guest asks it to make changes using the - * LOAD_GDT hypercall. - * - * This is the exactly like the IDT code. - */ -static void lguest_load_gdt(const struct desc_ptr *desc) -{ - unsigned int i; - struct desc_struct *gdt = (void *)desc->address; - - for (i = 0; i < (desc->size+1)/8; i++) - hcall(LHCALL_LOAD_GDT_ENTRY, i, gdt[i].a, gdt[i].b, 0); -} - -/* - * For a single GDT entry which changes, we simply change our copy and - * then tell the host about it. - */ -static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum, - const void *desc, int type) -{ - native_write_gdt_entry(dt, entrynum, desc, type); - /* Tell Host about this new entry. */ - hcall(LHCALL_LOAD_GDT_ENTRY, entrynum, - dt[entrynum].a, dt[entrynum].b, 0); -} - -/* - * There are three "thread local storage" GDT entries which change - * on every context switch (these three entries are how glibc implements - * __thread variables). As an optimization, we have a hypercall - * specifically for this case. - * - * Wouldn't it be nicer to have a general LOAD_GDT_ENTRIES hypercall - * which took a range of entries? - */ -static void lguest_load_tls(struct thread_struct *t, unsigned int cpu) -{ - /* - * There's one problem which normal hardware doesn't have: the Host - * can't handle us removing entries we're currently using. So we clear - * the GS register here: if it's needed it'll be reloaded anyway. - */ - lazy_load_gs(0); - lazy_hcall2(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu); -} - -/*G:038 - * That's enough excitement for now, back to ploughing through each of the - * different pv_ops structures (we're about 1/3 of the way through). - * - * This is the Local Descriptor Table, another weird Intel thingy. Linux only - * uses this for some strange applications like Wine. We don't do anything - * here, so they'll get an informative and friendly Segmentation Fault. - */ -static void lguest_set_ldt(const void *addr, unsigned entries) -{ -} - -/* - * This loads a GDT entry into the "Task Register": that entry points to a - * structure called the Task State Segment. Some comments scattered though the - * kernel code indicate that this used for task switching in ages past, along - * with blood sacrifice and astrology. - * - * Now there's nothing interesting in here that we don't get told elsewhere. - * But the native version uses the "ltr" instruction, which makes the Host - * complain to the Guest about a Segmentation Fault and it'll oops. So we - * override the native version with a do-nothing version. - */ -static void lguest_load_tr_desc(void) -{ -} - -/* - * The "cpuid" instruction is a way of querying both the CPU identity - * (manufacturer, model, etc) and its features. It was introduced before the - * Pentium in 1993 and keeps getting extended by both Intel, AMD and others. - * As you might imagine, after a decade and a half this treatment, it is now a - * giant ball of hair. Its entry in the current Intel manual runs to 28 pages. - * - * This instruction even it has its own Wikipedia entry. The Wikipedia entry - * has been translated into 6 languages. I am not making this up! - * - * We could get funky here and identify ourselves as "GenuineLguest", but - * instead we just use the real "cpuid" instruction. Then I pretty much turned - * off feature bits until the Guest booted. (Don't say that: you'll damage - * lguest sales!) Shut up, inner voice! (Hey, just pointing out that this is - * hardly future proof.) No one's listening! They don't like you anyway, - * parenthetic weirdo! - * - * Replacing the cpuid so we can turn features off is great for the kernel, but - * anyone (including userspace) can just use the raw "cpuid" instruction and - * the Host won't even notice since it isn't privileged. So we try not to get - * too worked up about it. - */ -static void lguest_cpuid(unsigned int *ax, unsigned int *bx, - unsigned int *cx, unsigned int *dx) -{ - int function = *ax; - - native_cpuid(ax, bx, cx, dx); - switch (function) { - /* - * CPUID 0 gives the highest legal CPUID number (and the ID string). - * We futureproof our code a little by sticking to known CPUID values. - */ - case 0: - if (*ax > 5) - *ax = 5; - break; - - /* - * CPUID 1 is a basic feature request. - * - * CX: we only allow kernel to see SSE3, CMPXCHG16B and SSSE3 - * DX: SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU and PAE. - */ - case 1: - *cx &= 0x00002201; - *dx &= 0x07808151; - /* - * The Host can do a nice optimization if it knows that the - * kernel mappings (addresses above 0xC0000000 or whatever - * PAGE_OFFSET is set to) haven't changed. But Linux calls - * flush_tlb_user() for both user and kernel mappings unless - * the Page Global Enable (PGE) feature bit is set. - */ - *dx |= 0x00002000; - /* - * We also lie, and say we're family id 5. 6 or greater - * leads to a rdmsr in early_init_intel which we can't handle. - * Family ID is returned as bits 8-12 in ax. - */ - *ax &= 0xFFFFF0FF; - *ax |= 0x00000500; - break; - - /* - * This is used to detect if we're running under KVM. We might be, - * but that's a Host matter, not us. So say we're not. - */ - case KVM_CPUID_SIGNATURE: - *bx = *cx = *dx = 0; - break; - - /* - * 0x80000000 returns the highest Extended Function, so we futureproof - * like we do above by limiting it to known fields. - */ - case 0x80000000: - if (*ax > 0x80000008) - *ax = 0x80000008; - break; - - /* - * PAE systems can mark pages as non-executable. Linux calls this the - * NX bit. Intel calls it XD (eXecute Disable), AMD EVP (Enhanced - * Virus Protection). We just switch it off here, since we don't - * support it. - */ - case 0x80000001: - *dx &= ~(1 << 20); - break; - } -} - -/* - * Intel has four control registers, imaginatively named cr0, cr2, cr3 and cr4. - * I assume there's a cr1, but it hasn't bothered us yet, so we'll not bother - * it. The Host needs to know when the Guest wants to change them, so we have - * a whole series of functions like read_cr0() and write_cr0(). - * - * We start with cr0. cr0 allows you to turn on and off all kinds of basic - * features, but the only cr0 bit that Linux ever used at runtime was the - * horrifically-named Task Switched (TS) bit at bit 3 (ie. 8) - * - * What does the TS bit do? Well, it causes the CPU to trap (interrupt 7) if - * the floating point unit is used. Which allows us to restore FPU state - * lazily after a task switch if we wanted to, but wouldn't a name like - * "FPUTRAP bit" be a little less cryptic? - * - * Fortunately, Linux keeps it simple and doesn't use TS, so we can ignore - * cr0. - */ -static void lguest_write_cr0(unsigned long val) -{ -} - -static unsigned long lguest_read_cr0(void) -{ - return 0; -} - -/* - * cr2 is the virtual address of the last page fault, which the Guest only ever - * reads. The Host kindly writes this into our "struct lguest_data", so we - * just read it out of there. - */ -static unsigned long lguest_read_cr2(void) -{ - return lguest_data.cr2; -} - -/* See lguest_set_pte() below. */ -static bool cr3_changed = false; -static unsigned long current_cr3; - -/* - * cr3 is the current toplevel pagetable page: the principle is the same as - * cr0. Keep a local copy, and tell the Host when it changes. - */ -static void lguest_write_cr3(unsigned long cr3) -{ - lazy_hcall1(LHCALL_NEW_PGTABLE, cr3); - current_cr3 = cr3; - - /* These two page tables are simple, linear, and used during boot */ - if (cr3 != __pa_symbol(swapper_pg_dir) && - cr3 != __pa_symbol(initial_page_table)) - cr3_changed = true; -} - -static unsigned long lguest_read_cr3(void) -{ - return current_cr3; -} - -/* cr4 is used to enable and disable PGE, but we don't care. */ -static unsigned long lguest_read_cr4(void) -{ - return 0; -} - -static void lguest_write_cr4(unsigned long val) -{ -} - -/* - * Page Table Handling. - * - * Now would be a good time to take a rest and grab a coffee or similarly - * relaxing stimulant. The easy parts are behind us, and the trek gradually - * winds uphill from here. - * - * Quick refresher: memory is divided into "pages" of 4096 bytes each. The CPU - * maps virtual addresses to physical addresses using "page tables". We could - * use one huge index of 1 million entries: each address is 4 bytes, so that's - * 1024 pages just to hold the page tables. But since most virtual addresses - * are unused, we use a two level index which saves space. The cr3 register - * contains the physical address of the top level "page directory" page, which - * contains physical addresses of up to 1024 second-level pages. Each of these - * second level pages contains up to 1024 physical addresses of actual pages, - * or Page Table Entries (PTEs). - * - * Here's a diagram, where arrows indicate physical addresses: - * - * cr3 ---> +---------+ - * | --------->+---------+ - * | | | PADDR1 | - * Mid-level | | PADDR2 | - * (PMD) page | | | - * | | Lower-level | - * | | (PTE) page | - * | | | | - * .... .... - * - * So to convert a virtual address to a physical address, we look up the top - * level, which points us to the second level, which gives us the physical - * address of that page. If the top level entry was not present, or the second - * level entry was not present, then the virtual address is invalid (we - * say "the page was not mapped"). - * - * Put another way, a 32-bit virtual address is divided up like so: - * - * 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 - * |<---- 10 bits ---->|<---- 10 bits ---->|<------ 12 bits ------>| - * Index into top Index into second Offset within page - * page directory page pagetable page - * - * Now, unfortunately, this isn't the whole story: Intel added Physical Address - * Extension (PAE) to allow 32 bit systems to use 64GB of memory (ie. 36 bits). - * These are held in 64-bit page table entries, so we can now only fit 512 - * entries in a page, and the neat three-level tree breaks down. - * - * The result is a four level page table: - * - * cr3 --> [ 4 Upper ] - * [ Level ] - * [ Entries ] - * [(PUD Page)]---> +---------+ - * | --------->+---------+ - * | | | PADDR1 | - * Mid-level | | PADDR2 | - * (PMD) page | | | - * | | Lower-level | - * | | (PTE) page | - * | | | | - * .... .... - * - * - * And the virtual address is decoded as: - * - * 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 - * |<-2->|<--- 9 bits ---->|<---- 9 bits --->|<------ 12 bits ------>| - * Index into Index into mid Index into lower Offset within page - * top entries directory page pagetable page - * - * It's too hard to switch between these two formats at runtime, so Linux only - * supports one or the other depending on whether CONFIG_X86_PAE is set. Many - * distributions turn it on, and not just for people with silly amounts of - * memory: the larger PTE entries allow room for the NX bit, which lets the - * kernel disable execution of pages and increase security. - * - * This was a problem for lguest, which couldn't run on these distributions; - * then Matias Zabaljauregui figured it all out and implemented it, and only a - * handful of puppies were crushed in the process! - * - * Back to our point: the kernel spends a lot of time changing both the - * top-level page directory and lower-level pagetable pages. The Guest doesn't - * know physical addresses, so while it maintains these page tables exactly - * like normal, it also needs to keep the Host informed whenever it makes a - * change: the Host will create the real page tables based on the Guests'. - */ - -/* - * The Guest calls this after it has set a second-level entry (pte), ie. to map - * a page into a process' address space. We tell the Host the toplevel and - * address this corresponds to. The Guest uses one pagetable per process, so - * we need to tell the Host which one we're changing (mm->pgd). - */ -static void lguest_pte_update(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) -{ -#ifdef CONFIG_X86_PAE - /* PAE needs to hand a 64 bit page table entry, so it uses two args. */ - lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr, - ptep->pte_low, ptep->pte_high); -#else - lazy_hcall3(LHCALL_SET_PTE, __pa(mm->pgd), addr, ptep->pte_low); -#endif -} - -/* This is the "set and update" combo-meal-deal version. */ -static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pteval) -{ - native_set_pte(ptep, pteval); - lguest_pte_update(mm, addr, ptep); -} - -/* - * The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd - * to set a middle-level entry when PAE is activated. - * - * Again, we set the entry then tell the Host which page we changed, - * and the index of the entry we changed. - */ -#ifdef CONFIG_X86_PAE -static void lguest_set_pud(pud_t *pudp, pud_t pudval) -{ - native_set_pud(pudp, pudval); - - /* 32 bytes aligned pdpt address and the index. */ - lazy_hcall2(LHCALL_SET_PGD, __pa(pudp) & 0xFFFFFFE0, - (__pa(pudp) & 0x1F) / sizeof(pud_t)); -} - -static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) -{ - native_set_pmd(pmdp, pmdval); - lazy_hcall2(LHCALL_SET_PMD, __pa(pmdp) & PAGE_MASK, - (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t)); -} -#else - -/* The Guest calls lguest_set_pmd to set a top-level entry when !PAE. */ -static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) -{ - native_set_pmd(pmdp, pmdval); - lazy_hcall2(LHCALL_SET_PGD, __pa(pmdp) & PAGE_MASK, - (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t)); -} -#endif - -/* - * There are a couple of legacy places where the kernel sets a PTE, but we - * don't know the top level any more. This is useless for us, since we don't - * know which pagetable is changing or what address, so we just tell the Host - * to forget all of them. Fortunately, this is very rare. - * - * ... except in early boot when the kernel sets up the initial pagetables, - * which makes booting astonishingly slow: 48 seconds! So we don't even tell - * the Host anything changed until we've done the first real page table switch, - * which brings boot back to 4.3 seconds. - */ -static void lguest_set_pte(pte_t *ptep, pte_t pteval) -{ - native_set_pte(ptep, pteval); - if (cr3_changed) - lazy_hcall1(LHCALL_FLUSH_TLB, 1); -} - -#ifdef CONFIG_X86_PAE -/* - * With 64-bit PTE values, we need to be careful setting them: if we set 32 - * bits at a time, the hardware could see a weird half-set entry. These - * versions ensure we update all 64 bits at once. - */ -static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte) -{ - native_set_pte_atomic(ptep, pte); - if (cr3_changed) - lazy_hcall1(LHCALL_FLUSH_TLB, 1); -} - -static void lguest_pte_clear(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) -{ - native_pte_clear(mm, addr, ptep); - lguest_pte_update(mm, addr, ptep); -} - -static void lguest_pmd_clear(pmd_t *pmdp) -{ - lguest_set_pmd(pmdp, __pmd(0)); -} -#endif - -/* - * Unfortunately for Lguest, the pv_mmu_ops for page tables were based on - * native page table operations. On native hardware you can set a new page - * table entry whenever you want, but if you want to remove one you have to do - * a TLB flush (a TLB is a little cache of page table entries kept by the CPU). - * - * So the lguest_set_pte_at() and lguest_set_pmd() functions above are only - * called when a valid entry is written, not when it's removed (ie. marked not - * present). Instead, this is where we come when the Guest wants to remove a - * page table entry: we tell the Host to set that entry to 0 (ie. the present - * bit is zero). - */ -static void lguest_flush_tlb_single(unsigned long addr) -{ - /* Simply set it to zero: if it was not, it will fault back in. */ - lazy_hcall3(LHCALL_SET_PTE, current_cr3, addr, 0); -} - -/* - * This is what happens after the Guest has removed a large number of entries. - * This tells the Host that any of the page table entries for userspace might - * have changed, ie. virtual addresses below PAGE_OFFSET. - */ -static void lguest_flush_tlb_user(void) -{ - lazy_hcall1(LHCALL_FLUSH_TLB, 0); -} - -/* - * This is called when the kernel page tables have changed. That's not very - * common (unless the Guest is using highmem, which makes the Guest extremely - * slow), so it's worth separating this from the user flushing above. - */ -static void lguest_flush_tlb_kernel(void) -{ - lazy_hcall1(LHCALL_FLUSH_TLB, 1); -} - -/* - * The Unadvanced Programmable Interrupt Controller. - * - * This is an attempt to implement the simplest possible interrupt controller. - * I spent some time looking though routines like set_irq_chip_and_handler, - * set_irq_chip_and_handler_name, set_irq_chip_data and set_phasers_to_stun and - * I *think* this is as simple as it gets. - * - * We can tell the Host what interrupts we want blocked ready for using the - * lguest_data.interrupts bitmap, so disabling (aka "masking") them is as - * simple as setting a bit. We don't actually "ack" interrupts as such, we - * just mask and unmask them. I wonder if we should be cleverer? - */ -static void disable_lguest_irq(struct irq_data *data) -{ - set_bit(data->irq, lguest_data.blocked_interrupts); -} - -static void enable_lguest_irq(struct irq_data *data) -{ - clear_bit(data->irq, lguest_data.blocked_interrupts); -} - -/* This structure describes the lguest IRQ controller. */ -static struct irq_chip lguest_irq_controller = { - .name = "lguest", - .irq_mask = disable_lguest_irq, - .irq_mask_ack = disable_lguest_irq, - .irq_unmask = enable_lguest_irq, -}; - -/* - * Interrupt descriptors are allocated as-needed, but low-numbered ones are - * reserved by the generic x86 code. So we ignore irq_alloc_desc_at if it - * tells us the irq is already used: other errors (ie. ENOMEM) we take - * seriously. - */ -static int lguest_setup_irq(unsigned int irq) -{ - struct irq_desc *desc; - int err; - - /* Returns -ve error or vector number. */ - err = irq_alloc_desc_at(irq, 0); - if (err < 0 && err != -EEXIST) - return err; - - /* - * Tell the Linux infrastructure that the interrupt is - * controlled by our level-based lguest interrupt controller. - */ - irq_set_chip_and_handler_name(irq, &lguest_irq_controller, - handle_level_irq, "level"); - - /* Some systems map "vectors" to interrupts weirdly. Not us! */ - desc = irq_to_desc(irq); - __this_cpu_write(vector_irq[FIRST_EXTERNAL_VECTOR + irq], desc); - return 0; -} - -static int lguest_enable_irq(struct pci_dev *dev) -{ - int err; - u8 line = 0; - - /* We literally use the PCI interrupt line as the irq number. */ - pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &line); - err = lguest_setup_irq(line); - if (!err) - dev->irq = line; - return err; -} - -/* We don't do hotplug PCI, so this shouldn't be called. */ -static void lguest_disable_irq(struct pci_dev *dev) -{ - WARN_ON(1); -} - -/* - * This sets up the Interrupt Descriptor Table (IDT) entry for each hardware - * interrupt (except 128, which is used for system calls). - */ -static void __init lguest_init_IRQ(void) -{ - unsigned int i; - - for (i = FIRST_EXTERNAL_VECTOR; i < FIRST_SYSTEM_VECTOR; i++) { - if (i != IA32_SYSCALL_VECTOR) - set_intr_gate(i, irq_entries_start + - 8 * (i - FIRST_EXTERNAL_VECTOR)); - } - - /* - * This call is required to set up for 4k stacks, where we have - * separate stacks for hard and soft interrupts. - */ - irq_ctx_init(smp_processor_id()); -} - -/* - * Time. - * - * It would be far better for everyone if the Guest had its own clock, but - * until then the Host gives us the time on every interrupt. - */ -static void lguest_get_wallclock(struct timespec *now) -{ - *now = lguest_data.time; -} - -/* - * The TSC is an Intel thing called the Time Stamp Counter. The Host tells us - * what speed it runs at, or 0 if it's unusable as a reliable clock source. - * This matches what we want here: if we return 0 from this function, the x86 - * TSC clock will give up and not register itself. - */ -static unsigned long lguest_tsc_khz(void) -{ - return lguest_data.tsc_khz; -} - -/* - * If we can't use the TSC, the kernel falls back to our lower-priority - * "lguest_clock", where we read the time value given to us by the Host. - */ -static u64 lguest_clock_read(struct clocksource *cs) -{ - unsigned long sec, nsec; - - /* - * Since the time is in two parts (seconds and nanoseconds), we risk - * reading it just as it's changing from 99 & 0.999999999 to 100 and 0, - * and getting 99 and 0. As Linux tends to come apart under the stress - * of time travel, we must be careful: - */ - do { - /* First we read the seconds part. */ - sec = lguest_data.time.tv_sec; - /* - * This read memory barrier tells the compiler and the CPU that - * this can't be reordered: we have to complete the above - * before going on. - */ - rmb(); - /* Now we read the nanoseconds part. */ - nsec = lguest_data.time.tv_nsec; - /* Make sure we've done that. */ - rmb(); - /* Now if the seconds part has changed, try again. */ - } while (unlikely(lguest_data.time.tv_sec != sec)); - - /* Our lguest clock is in real nanoseconds. */ - return sec*1000000000ULL + nsec; -} - -/* This is the fallback clocksource: lower priority than the TSC clocksource. */ -static struct clocksource lguest_clock = { - .name = "lguest", - .rating = 200, - .read = lguest_clock_read, - .mask = CLOCKSOURCE_MASK(64), - .flags = CLOCK_SOURCE_IS_CONTINUOUS, -}; - -/* - * We also need a "struct clock_event_device": Linux asks us to set it to go - * off some time in the future. Actually, James Morris figured all this out, I - * just applied the patch. - */ -static int lguest_clockevent_set_next_event(unsigned long delta, - struct clock_event_device *evt) -{ - /* FIXME: I don't think this can ever happen, but James tells me he had - * to put this code in. Maybe we should remove it now. Anyone? */ - if (delta < LG_CLOCK_MIN_DELTA) { - if (printk_ratelimit()) - printk(KERN_DEBUG "%s: small delta %lu ns\n", - __func__, delta); - return -ETIME; - } - - /* Please wake us this far in the future. */ - hcall(LHCALL_SET_CLOCKEVENT, delta, 0, 0, 0); - return 0; -} - -static int lguest_clockevent_shutdown(struct clock_event_device *evt) -{ - /* A 0 argument shuts the clock down. */ - hcall(LHCALL_SET_CLOCKEVENT, 0, 0, 0, 0); - return 0; -} - -/* This describes our primitive timer chip. */ -static struct clock_event_device lguest_clockevent = { - .name = "lguest", - .features = CLOCK_EVT_FEAT_ONESHOT, - .set_next_event = lguest_clockevent_set_next_event, - .set_state_shutdown = lguest_clockevent_shutdown, - .rating = INT_MAX, - .mult = 1, - .shift = 0, - .min_delta_ns = LG_CLOCK_MIN_DELTA, - .min_delta_ticks = LG_CLOCK_MIN_DELTA, - .max_delta_ns = LG_CLOCK_MAX_DELTA, - .max_delta_ticks = LG_CLOCK_MAX_DELTA, -}; - -/* - * This is the Guest timer interrupt handler (hardware interrupt 0). We just - * call the clockevent infrastructure and it does whatever needs doing. - */ -static void lguest_time_irq(struct irq_desc *desc) -{ - unsigned long flags; - - /* Don't interrupt us while this is running. */ - local_irq_save(flags); - lguest_clockevent.event_handler(&lguest_clockevent); - local_irq_restore(flags); -} - -/* - * At some point in the boot process, we get asked to set up our timing - * infrastructure. The kernel doesn't expect timer interrupts before this, but - * we cleverly initialized the "blocked_interrupts" field of "struct - * lguest_data" so that timer interrupts were blocked until now. - */ -static void lguest_time_init(void) -{ - /* Set up the timer interrupt (0) to go to our simple timer routine */ - if (lguest_setup_irq(0) != 0) - panic("Could not set up timer irq"); - irq_set_handler(0, lguest_time_irq); - - clocksource_register_hz(&lguest_clock, NSEC_PER_SEC); - - /* We can't set cpumask in the initializer: damn C limitations! Set it - * here and register our timer device. */ - lguest_clockevent.cpumask = cpumask_of(0); - clockevents_register_device(&lguest_clockevent); - - /* Finally, we unblock the timer interrupt. */ - clear_bit(0, lguest_data.blocked_interrupts); -} - -/* - * Miscellaneous bits and pieces. - * - * Here is an oddball collection of functions which the Guest needs for things - * to work. They're pretty simple. - */ - -/* - * The Guest needs to tell the Host what stack it expects traps to use. For - * native hardware, this is part of the Task State Segment mentioned above in - * lguest_load_tr_desc(), but to help hypervisors there's this special call. - * - * We tell the Host the segment we want to use (__KERNEL_DS is the kernel data - * segment), the privilege level (we're privilege level 1, the Host is 0 and - * will not tolerate us trying to use that), the stack pointer, and the number - * of pages in the stack. - */ -static void lguest_load_sp0(struct tss_struct *tss, - struct thread_struct *thread) -{ - lazy_hcall3(LHCALL_SET_STACK, __KERNEL_DS | 0x1, thread->sp0, - THREAD_SIZE / PAGE_SIZE); - tss->x86_tss.sp0 = thread->sp0; -} - -/* Let's just say, I wouldn't do debugging under a Guest. */ -static unsigned long lguest_get_debugreg(int regno) -{ - /* FIXME: Implement */ - return 0; -} - -static void lguest_set_debugreg(int regno, unsigned long value) -{ - /* FIXME: Implement */ -} - -/* - * There are times when the kernel wants to make sure that no memory writes are - * caught in the cache (that they've all reached real hardware devices). This - * doesn't matter for the Guest which has virtual hardware. - * - * On the Pentium 4 and above, cpuid() indicates that the Cache Line Flush - * (clflush) instruction is available and the kernel uses that. Otherwise, it - * uses the older "Write Back and Invalidate Cache" (wbinvd) instruction. - * Unlike clflush, wbinvd can only be run at privilege level 0. So we can - * ignore clflush, but replace wbinvd. - */ -static void lguest_wbinvd(void) -{ -} - -/* - * If the Guest expects to have an Advanced Programmable Interrupt Controller, - * we play dumb by ignoring writes and returning 0 for reads. So it's no - * longer Programmable nor Controlling anything, and I don't think 8 lines of - * code qualifies for Advanced. It will also never interrupt anything. It - * does, however, allow us to get through the Linux boot code. - */ -#ifdef CONFIG_X86_LOCAL_APIC -static void lguest_apic_write(u32 reg, u32 v) -{ -} - -static u32 lguest_apic_read(u32 reg) -{ - return 0; -} - -static u64 lguest_apic_icr_read(void) -{ - return 0; -} - -static void lguest_apic_icr_write(u32 low, u32 id) -{ - /* Warn to see if there's any stray references */ - WARN_ON(1); -} - -static void lguest_apic_wait_icr_idle(void) -{ - return; -} - -static u32 lguest_apic_safe_wait_icr_idle(void) -{ - return 0; -} - -static void set_lguest_basic_apic_ops(void) -{ - apic->read = lguest_apic_read; - apic->write = lguest_apic_write; - apic->icr_read = lguest_apic_icr_read; - apic->icr_write = lguest_apic_icr_write; - apic->wait_icr_idle = lguest_apic_wait_icr_idle; - apic->safe_wait_icr_idle = lguest_apic_safe_wait_icr_idle; -}; -#endif - -/* STOP! Until an interrupt comes in. */ -static void lguest_safe_halt(void) -{ - hcall(LHCALL_HALT, 0, 0, 0, 0); -} - -/* - * The SHUTDOWN hypercall takes a string to describe what's happening, and - * an argument which says whether this to restart (reboot) the Guest or not. - * - * Note that the Host always prefers that the Guest speak in physical addresses - * rather than virtual addresses, so we use __pa() here. - */ -static void lguest_power_off(void) -{ - hcall(LHCALL_SHUTDOWN, __pa("Power down"), - LGUEST_SHUTDOWN_POWEROFF, 0, 0); -} - -/* - * Panicing. - * - * Don't. But if you did, this is what happens. - */ -static int lguest_panic(struct notifier_block *nb, unsigned long l, void *p) -{ - hcall(LHCALL_SHUTDOWN, __pa(p), LGUEST_SHUTDOWN_POWEROFF, 0, 0); - /* The hcall won't return, but to keep gcc happy, we're "done". */ - return NOTIFY_DONE; -} - -static struct notifier_block paniced = { - .notifier_call = lguest_panic -}; - -/* Setting up memory is fairly easy. */ -static __init char *lguest_memory_setup(void) -{ - /* - * The Linux bootloader header contains an "e820" memory map: the - * Launcher populated the first entry with our memory limit. - */ - e820__range_add(boot_params.e820_table[0].addr, - boot_params.e820_table[0].size, - boot_params.e820_table[0].type); - - /* This string is for the boot messages. */ - return "LGUEST"; -} - -/* Offset within PCI config space of BAR access capability. */ -static int console_cfg_offset = 0; -static int console_access_cap; - -/* Set up so that we access off in bar0 (on bus 0, device 1, function 0) */ -static void set_cfg_window(u32 cfg_offset, u32 off) -{ - write_pci_config_byte(0, 1, 0, - cfg_offset + offsetof(struct virtio_pci_cap, bar), - 0); - write_pci_config(0, 1, 0, - cfg_offset + offsetof(struct virtio_pci_cap, length), - 4); - write_pci_config(0, 1, 0, - cfg_offset + offsetof(struct virtio_pci_cap, offset), - off); -} - -static void write_bar_via_cfg(u32 cfg_offset, u32 off, u32 val) -{ - /* - * We could set this up once, then leave it; nothing else in the * - * kernel should touch these registers. But if it went wrong, that - * would be a horrible bug to find. - */ - set_cfg_window(cfg_offset, off); - write_pci_config(0, 1, 0, - cfg_offset + sizeof(struct virtio_pci_cap), val); -} - -static void probe_pci_console(void) -{ - u8 cap, common_cap = 0, device_cap = 0; - u32 device_len; - - /* Avoid recursive printk into here. */ - console_cfg_offset = -1; - - if (!early_pci_allowed()) { - printk(KERN_ERR "lguest: early PCI access not allowed!\n"); - return; - } - - /* We expect a console PCI device at BUS0, slot 1. */ - if (read_pci_config(0, 1, 0, 0) != 0x10431AF4) { - printk(KERN_ERR "lguest: PCI device is %#x!\n", - read_pci_config(0, 1, 0, 0)); - return; - } - - /* Find the capabilities we need (must be in bar0) */ - cap = read_pci_config_byte(0, 1, 0, PCI_CAPABILITY_LIST); - while (cap) { - u8 vndr = read_pci_config_byte(0, 1, 0, cap); - if (vndr == PCI_CAP_ID_VNDR) { - u8 type, bar; - - type = read_pci_config_byte(0, 1, 0, - cap + offsetof(struct virtio_pci_cap, cfg_type)); - bar = read_pci_config_byte(0, 1, 0, - cap + offsetof(struct virtio_pci_cap, bar)); - - switch (type) { - case VIRTIO_PCI_CAP_DEVICE_CFG: - if (bar == 0) - device_cap = cap; - break; - case VIRTIO_PCI_CAP_PCI_CFG: - console_access_cap = cap; - break; - } - } - cap = read_pci_config_byte(0, 1, 0, cap + PCI_CAP_LIST_NEXT); - } - if (!device_cap || !console_access_cap) { - printk(KERN_ERR "lguest: No caps (%u/%u/%u) in console!\n", - common_cap, device_cap, console_access_cap); - return; - } - - /* - * Note that we can't check features, until we've set the DRIVER - * status bit. We don't want to do that until we have a real driver, - * so we just check that the device-specific config has room for - * emerg_wr. If it doesn't support VIRTIO_CONSOLE_F_EMERG_WRITE - * it should ignore the access. - */ - device_len = read_pci_config(0, 1, 0, - device_cap + offsetof(struct virtio_pci_cap, length)); - if (device_len < (offsetof(struct virtio_console_config, emerg_wr) - + sizeof(u32))) { - printk(KERN_ERR "lguest: console missing emerg_wr field\n"); - return; - } - - console_cfg_offset = read_pci_config(0, 1, 0, - device_cap + offsetof(struct virtio_pci_cap, offset)); - printk(KERN_INFO "lguest: Console via virtio-pci emerg_wr\n"); -} - -/* - * We will eventually use the virtio console device to produce console output, - * but before that is set up we use the virtio PCI console's backdoor mmio - * access and the "emergency" write facility (which is legal even before the - * device is configured). - */ -static __init int early_put_chars(u32 vtermno, const char *buf, int count) -{ - /* If we couldn't find PCI console, forget it. */ - if (console_cfg_offset < 0) - return count; - - if (unlikely(!console_cfg_offset)) { - probe_pci_console(); - if (console_cfg_offset < 0) - return count; - } - - write_bar_via_cfg(console_access_cap, - console_cfg_offset - + offsetof(struct virtio_console_config, emerg_wr), - buf[0]); - return 1; -} - -/* - * Rebooting also tells the Host we're finished, but the RESTART flag tells the - * Launcher to reboot us. - */ -static void lguest_restart(char *reason) -{ - hcall(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART, 0, 0); -} - -/*G:050 - * Patching (Powerfully Placating Performance Pedants) - * - * We have already seen that pv_ops structures let us replace simple native - * instructions with calls to the appropriate back end all throughout the - * kernel. This allows the same kernel to run as a Guest and as a native - * kernel, but it's slow because of all the indirect branches. - * - * Remember that David Wheeler quote about "Any problem in computer science can - * be solved with another layer of indirection"? The rest of that quote is - * "... But that usually will create another problem." This is the first of - * those problems. - * - * Our current solution is to allow the paravirt back end to optionally patch - * over the indirect calls to replace them with something more efficient. We - * patch two of the simplest of the most commonly called functions: disable - * interrupts and save interrupts. We usually have 6 or 10 bytes to patch - * into: the Guest versions of these operations are small enough that we can - * fit comfortably. - * - * First we need assembly templates of each of the patchable Guest operations, - * and these are in head_32.S. - */ - -/*G:060 We construct a table from the assembler templates: */ -static const struct lguest_insns -{ - const char *start, *end; -} lguest_insns[] = { - [PARAVIRT_PATCH(pv_irq_ops.irq_disable)] = { lgstart_cli, lgend_cli }, - [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf }, -}; - -/* - * Now our patch routine is fairly simple (based on the native one in - * paravirt.c). If we have a replacement, we copy it in and return how much of - * the available space we used. - */ -static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf, - unsigned long addr, unsigned len) -{ - unsigned int insn_len; - - /* Don't do anything special if we don't have a replacement */ - if (type >= ARRAY_SIZE(lguest_insns) || !lguest_insns[type].start) - return paravirt_patch_default(type, clobber, ibuf, addr, len); - - insn_len = lguest_insns[type].end - lguest_insns[type].start; - - /* Similarly if it can't fit (doesn't happen, but let's be thorough). */ - if (len < insn_len) - return paravirt_patch_default(type, clobber, ibuf, addr, len); - - /* Copy in our instructions. */ - memcpy(ibuf, lguest_insns[type].start, insn_len); - return insn_len; -} - -/*G:029 - * Once we get to lguest_init(), we know we're a Guest. The various - * pv_ops structures in the kernel provide points for (almost) every routine we - * have to override to avoid privileged instructions. - */ -__init void lguest_init(void) -{ - /* We're under lguest. */ - pv_info.name = "lguest"; - /* We're running at privilege level 1, not 0 as normal. */ - pv_info.kernel_rpl = 1; - /* Everyone except Xen runs with this set. */ - pv_info.shared_kernel_pmd = 1; - - /* - * We set up all the lguest overrides for sensitive operations. These - * are detailed with the operations themselves. - */ - - /* Interrupt-related operations */ - pv_irq_ops.save_fl = PV_CALLEE_SAVE(lguest_save_fl); - pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl); - pv_irq_ops.irq_disable = PV_CALLEE_SAVE(lguest_irq_disable); - pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(lg_irq_enable); - pv_irq_ops.safe_halt = lguest_safe_halt; - - /* Setup operations */ - pv_init_ops.patch = lguest_patch; - - /* Intercepts of various CPU instructions */ - pv_cpu_ops.load_gdt = lguest_load_gdt; - pv_cpu_ops.cpuid = lguest_cpuid; - pv_cpu_ops.load_idt = lguest_load_idt; - pv_cpu_ops.iret = lguest_iret; - pv_cpu_ops.load_sp0 = lguest_load_sp0; - pv_cpu_ops.load_tr_desc = lguest_load_tr_desc; - pv_cpu_ops.set_ldt = lguest_set_ldt; - pv_cpu_ops.load_tls = lguest_load_tls; - pv_cpu_ops.get_debugreg = lguest_get_debugreg; - pv_cpu_ops.set_debugreg = lguest_set_debugreg; - pv_cpu_ops.read_cr0 = lguest_read_cr0; - pv_cpu_ops.write_cr0 = lguest_write_cr0; - pv_cpu_ops.read_cr4 = lguest_read_cr4; - pv_cpu_ops.write_cr4 = lguest_write_cr4; - pv_cpu_ops.write_gdt_entry = lguest_write_gdt_entry; - pv_cpu_ops.write_idt_entry = lguest_write_idt_entry; - pv_cpu_ops.wbinvd = lguest_wbinvd; - pv_cpu_ops.start_context_switch = paravirt_start_context_switch; - pv_cpu_ops.end_context_switch = lguest_end_context_switch; - - /* Pagetable management */ - pv_mmu_ops.write_cr3 = lguest_write_cr3; - pv_mmu_ops.flush_tlb_user = lguest_flush_tlb_user; - pv_mmu_ops.flush_tlb_single = lguest_flush_tlb_single; - pv_mmu_ops.flush_tlb_kernel = lguest_flush_tlb_kernel; - pv_mmu_ops.set_pte = lguest_set_pte; - pv_mmu_ops.set_pte_at = lguest_set_pte_at; - pv_mmu_ops.set_pmd = lguest_set_pmd; -#ifdef CONFIG_X86_PAE - pv_mmu_ops.set_pte_atomic = lguest_set_pte_atomic; - pv_mmu_ops.pte_clear = lguest_pte_clear; - pv_mmu_ops.pmd_clear = lguest_pmd_clear; - pv_mmu_ops.set_pud = lguest_set_pud; -#endif - pv_mmu_ops.read_cr2 = lguest_read_cr2; - pv_mmu_ops.read_cr3 = lguest_read_cr3; - pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu; - pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mmu_mode; - pv_mmu_ops.lazy_mode.flush = paravirt_flush_lazy_mmu; - pv_mmu_ops.pte_update = lguest_pte_update; - -#ifdef CONFIG_X86_LOCAL_APIC - /* APIC read/write intercepts */ - set_lguest_basic_apic_ops(); -#endif - - x86_init.resources.memory_setup = lguest_memory_setup; - x86_init.irqs.intr_init = lguest_init_IRQ; - x86_init.timers.timer_init = lguest_time_init; - x86_platform.calibrate_tsc = lguest_tsc_khz; - x86_platform.get_wallclock = lguest_get_wallclock; - - /* - * Now is a good time to look at the implementations of these functions - * before returning to the rest of lguest_init(). - */ - - /*G:070 - * Now we've seen all the paravirt_ops, we return to - * lguest_init() where the rest of the fairly chaotic boot setup - * occurs. - */ - - /* - * The stack protector is a weird thing where gcc places a canary - * value on the stack and then checks it on return. This file is - * compiled with -fno-stack-protector it, so we got this far without - * problems. The value of the canary is kept at offset 20 from the - * %gs register, so we need to set that up before calling C functions - * in other files. - */ - setup_stack_canary_segment(0); - - /* - * We could just call load_stack_canary_segment(), but we might as well - * call switch_to_new_gdt() which loads the whole table and sets up the - * per-cpu segment descriptor register %fs as well. - */ - switch_to_new_gdt(0); - - /* - * The Host<->Guest Switcher lives at the top of our address space, and - * the Host told us how big it is when we made LGUEST_INIT hypercall: - * it put the answer in lguest_data.reserve_mem - */ - reserve_top_address(lguest_data.reserve_mem); - - /* Hook in our special panic hypercall code. */ - atomic_notifier_chain_register(&panic_notifier_list, &paniced); - - /* - * This is messy CPU setup stuff which the native boot code does before - * start_kernel, so we have to do, too: - */ - cpu_detect(&new_cpu_data); - /* head.S usually sets up the first capability word, so do it here. */ - new_cpu_data.x86_capability[CPUID_1_EDX] = cpuid_edx(1); - - /* Math is always hard! */ - set_cpu_cap(&new_cpu_data, X86_FEATURE_FPU); - - /* We don't have features. We have puppies! Puppies! */ -#ifdef CONFIG_X86_MCE - mca_cfg.disabled = true; -#endif -#ifdef CONFIG_ACPI - acpi_disabled = 1; -#endif - - /* - * We set the preferred console to "hvc". This is the "hypervisor - * virtual console" driver written by the PowerPC people, which we also - * adapted for lguest's use. - */ - add_preferred_console("hvc", 0, NULL); - - /* Register our very early console. */ - virtio_cons_early_init(early_put_chars); - - /* Don't let ACPI try to control our PCI interrupts. */ - disable_acpi(); - - /* We control them ourselves, by overriding these two hooks. */ - pcibios_enable_irq = lguest_enable_irq; - pcibios_disable_irq = lguest_disable_irq; - - /* - * Last of all, we set the power management poweroff hook to point to - * the Guest routine to power off, and the reboot hook to our restart - * routine. - */ - pm_power_off = lguest_power_off; - machine_ops.restart = lguest_restart; - - /* - * Now we're set up, call i386_start_kernel() in head32.c and we proceed - * to boot as normal. It never returns. - */ - i386_start_kernel(); -} -/* - * This marks the end of stage II of our journey, The Guest. - * - * It is now time for us to explore the layer of virtual drivers and complete - * our understanding of the Guest in "make Drivers". - */ diff --git a/arch/x86/lguest/head_32.S b/arch/x86/lguest/head_32.S deleted file mode 100644 index d5ae63f5ec5d..000000000000 --- a/arch/x86/lguest/head_32.S +++ /dev/null @@ -1,192 +0,0 @@ -#include -#include -#include -#include -#include -#include - -/*G:020 - - * Our story starts with the bzImage: booting starts at startup_32 in - * arch/x86/boot/compressed/head_32.S. This merely uncompresses the real - * kernel in place and then jumps into it: startup_32 in - * arch/x86/kernel/head_32.S. Both routines expects a boot header in the %esi - * register, which is created by the bootloader (the Launcher in our case). - * - * The startup_32 function does very little: it clears the uninitialized global - * C variables which we expect to be zero (ie. BSS) and then copies the boot - * header and kernel command line somewhere safe, and populates some initial - * page tables. Finally it checks the 'hardware_subarch' field. This was - * introduced in 2.6.24 for lguest and Xen: if it's set to '1' (lguest's - * assigned number), then it calls us here. - * - * WARNING: be very careful here! We're running at addresses equal to physical - * addresses (around 0), not above PAGE_OFFSET as most code expects - * (eg. 0xC0000000). Jumps are relative, so they're OK, but we can't touch any - * data without remembering to subtract __PAGE_OFFSET! - * - * The .section line puts this code in .init.text so it will be discarded after - * boot. - */ -.section .init.text, "ax", @progbits -ENTRY(lguest_entry) - /* - * We make the "initialization" hypercall now to tell the Host where - * our lguest_data struct is. - */ - movl $LHCALL_LGUEST_INIT, %eax - movl $lguest_data - __PAGE_OFFSET, %ebx - int $LGUEST_TRAP_ENTRY - - /* Now turn our pagetables on; setup by arch/x86/kernel/head_32.S. */ - movl $LHCALL_NEW_PGTABLE, %eax - movl $(initial_page_table - __PAGE_OFFSET), %ebx - int $LGUEST_TRAP_ENTRY - - /* Set up the initial stack so we can run C code. */ - movl $(init_thread_union+THREAD_SIZE),%esp - - /* Jumps are relative: we're running __PAGE_OFFSET too low. */ - jmp lguest_init+__PAGE_OFFSET - -/*G:055 - * We create a macro which puts the assembler code between lgstart_ and lgend_ - * markers. These templates are put in the .text section: they can't be - * discarded after boot as we may need to patch modules, too. - */ -.text -#define LGUEST_PATCH(name, insns...) \ - lgstart_##name: insns; lgend_##name:; \ - .globl lgstart_##name; .globl lgend_##name - -LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled) -LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax) - -/*G:033 - * But using those wrappers is inefficient (we'll see why that doesn't matter - * for save_fl and irq_disable later). If we write our routines carefully in - * assembler, we can avoid clobbering any registers and avoid jumping through - * the wrapper functions. - * - * I skipped over our first piece of assembler, but this one is worth studying - * in a bit more detail so I'll describe in easy stages. First, the routine to - * enable interrupts: - */ -ENTRY(lg_irq_enable) - /* - * The reverse of irq_disable, this sets lguest_data.irq_enabled to - * X86_EFLAGS_IF (ie. "Interrupts enabled"). - */ - movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled - /* - * But now we need to check if the Host wants to know: there might have - * been interrupts waiting to be delivered, in which case it will have - * set lguest_data.irq_pending to X86_EFLAGS_IF. If it's not zero, we - * jump to send_interrupts, otherwise we're done. - */ - cmpl $0, lguest_data+LGUEST_DATA_irq_pending - jnz send_interrupts - /* - * One cool thing about x86 is that you can do many things without using - * a register. In this case, the normal path hasn't needed to save or - * restore any registers at all! - */ - ret -send_interrupts: - /* - * OK, now we need a register: eax is used for the hypercall number, - * which is LHCALL_SEND_INTERRUPTS. - * - * We used not to bother with this pending detection at all, which was - * much simpler. Sooner or later the Host would realize it had to - * send us an interrupt. But that turns out to make performance 7 - * times worse on a simple tcp benchmark. So now we do this the hard - * way. - */ - pushl %eax - movl $LHCALL_SEND_INTERRUPTS, %eax - /* This is the actual hypercall trap. */ - int $LGUEST_TRAP_ENTRY - /* Put eax back the way we found it. */ - popl %eax - ret - -/* - * Finally, the "popf" or "restore flags" routine. The %eax register holds the - * flags (in practice, either X86_EFLAGS_IF or 0): if it's X86_EFLAGS_IF we're - * enabling interrupts again, if it's 0 we're leaving them off. - */ -ENTRY(lg_restore_fl) - /* This is just "lguest_data.irq_enabled = flags;" */ - movl %eax, lguest_data+LGUEST_DATA_irq_enabled - /* - * Now, if the %eax value has enabled interrupts and - * lguest_data.irq_pending is set, we want to tell the Host so it can - * deliver any outstanding interrupts. Fortunately, both values will - * be X86_EFLAGS_IF (ie. 512) in that case, and the "testl" - * instruction will AND them together for us. If both are set, we - * jump to send_interrupts. - */ - testl lguest_data+LGUEST_DATA_irq_pending, %eax - jnz send_interrupts - /* Again, the normal path has used no extra registers. Clever, huh? */ - ret -/*:*/ - -/* These demark the EIP where host should never deliver interrupts. */ -.global lguest_noirq_iret - -/*M:004 - * When the Host reflects a trap or injects an interrupt into the Guest, it - * sets the eflags interrupt bit on the stack based on lguest_data.irq_enabled, - * so the Guest iret logic does the right thing when restoring it. However, - * when the Host sets the Guest up for direct traps, such as system calls, the - * processor is the one to push eflags onto the stack, and the interrupt bit - * will be 1 (in reality, interrupts are always enabled in the Guest). - * - * This turns out to be harmless: the only trap which should happen under Linux - * with interrupts disabled is Page Fault (due to our lazy mapping of vmalloc - * regions), which has to be reflected through the Host anyway. If another - * trap *does* go off when interrupts are disabled, the Guest will panic, and - * we'll never get to this iret! -:*/ - -/*G:045 - * There is one final paravirt_op that the Guest implements, and glancing at it - * you can see why I left it to last. It's *cool*! It's in *assembler*! - * - * The "iret" instruction is used to return from an interrupt or trap. The - * stack looks like this: - * old address - * old code segment & privilege level - * old processor flags ("eflags") - * - * The "iret" instruction pops those values off the stack and restores them all - * at once. The only problem is that eflags includes the Interrupt Flag which - * the Guest can't change: the CPU will simply ignore it when we do an "iret". - * So we have to copy eflags from the stack to lguest_data.irq_enabled before - * we do the "iret". - * - * There are two problems with this: firstly, we can't clobber any registers - * and secondly, the whole thing needs to be atomic. The first problem - * is solved by using "push memory"/"pop memory" instruction pair for copying. - * - * The second is harder: copying eflags to lguest_data.irq_enabled will turn - * interrupts on before we're finished, so we could be interrupted before we - * return to userspace or wherever. Our solution to this is to tell the - * Host that it is *never* to interrupt us there, even if interrupts seem to be - * enabled. (It's not necessary to protect pop instruction, since - * data gets updated only after it completes, so we only need to protect - * one instruction, iret). - */ -ENTRY(lguest_iret) - pushl 2*4(%esp) - /* - * Note the %ss: segment prefix here. Normal data accesses use the - * "ds" segment, but that will have already been restored for whatever - * we're returning to (such as userspace): we can't trust it. The %ss: - * prefix makes sure we use the stack segment, which is still valid. - */ - popl %ss:lguest_data+LGUEST_DATA_irq_enabled -lguest_noirq_iret: - iret diff --git a/drivers/Makefile b/drivers/Makefile index dfdcda00bfe3..d90fdc413648 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -125,7 +125,6 @@ obj-$(CONFIG_ACCESSIBILITY) += accessibility/ obj-$(CONFIG_ISDN) += isdn/ obj-$(CONFIG_EDAC) += edac/ obj-$(CONFIG_EISA) += eisa/ -obj-y += lguest/ obj-$(CONFIG_CPU_FREQ) += cpufreq/ obj-$(CONFIG_CPU_IDLE) += cpuidle/ obj-y += mmc/ diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index 8ddc98279c8f..80aaf3420e12 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -470,7 +470,7 @@ config VIRTIO_BLK depends on VIRTIO ---help--- This is the virtual block driver for virtio. It can be used with - lguest or QEMU based VMMs (like KVM or Xen). Say Y or M. + QEMU based VMMs (like KVM or Xen). Say Y or M. config VIRTIO_BLK_SCSI bool "SCSI passthrough request for the Virtio block driver" diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig index ccd239ab879f..623714344600 100644 --- a/drivers/char/Kconfig +++ b/drivers/char/Kconfig @@ -161,7 +161,7 @@ config VIRTIO_CONSOLE depends on VIRTIO && TTY select HVC_DRIVER help - Virtio console for use with lguest and other hypervisors. + Virtio console for use with hypervisors. Also serves as a general-purpose serial device for data transfer between the guest and host. Character devices at diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c index ad843eb02ae7..4d229dde6522 100644 --- a/drivers/char/virtio_console.c +++ b/drivers/char/virtio_console.c @@ -1130,7 +1130,7 @@ static const struct file_operations port_fops = { * We turn the characters into a scatter-gather list, add it to the * output queue and then kick the Host. Then we sit here waiting for * it to finish: inefficient in theory, but in practice - * implementations will do it immediately (lguest's Launcher does). + * implementations will do it immediately. */ static int put_chars(u32 vtermno, const char *buf, int count) { diff --git a/drivers/lguest/Kconfig b/drivers/lguest/Kconfig deleted file mode 100644 index 169172d2ba05..000000000000 --- a/drivers/lguest/Kconfig +++ /dev/null @@ -1,13 +0,0 @@ -config LGUEST - tristate "Linux hypervisor example code" - depends on X86_32 && EVENTFD && TTY && PCI_DIRECT - select HVC_DRIVER - ---help--- - This is a very simple module which allows you to run - multiple instances of the same Linux kernel, using the - "lguest" command found in the tools/lguest directory. - - Note that "lguest" is pronounced to rhyme with "fell quest", - not "rustyvisor". See tools/lguest/lguest.txt. - - If unsure, say N. If curious, say M. If masochistic, say Y. diff --git a/drivers/lguest/Makefile b/drivers/lguest/Makefile deleted file mode 100644 index 16f52ee73994..000000000000 --- a/drivers/lguest/Makefile +++ /dev/null @@ -1,26 +0,0 @@ -# Host requires the other files, which can be a module. -obj-$(CONFIG_LGUEST) += lg.o -lg-y = core.o hypercalls.o page_tables.o interrupts_and_traps.o \ - segments.o lguest_user.o - -lg-$(CONFIG_X86_32) += x86/switcher_32.o x86/core.o - -Preparation Preparation!: PREFIX=P -Guest: PREFIX=G -Drivers: PREFIX=D -Launcher: PREFIX=L -Host: PREFIX=H -Switcher: PREFIX=S -Mastery: PREFIX=M -Beer: - @for f in Preparation Guest Drivers Launcher Host Switcher Mastery; do echo "{==- $$f -==}"; make -s $$f; done; echo "{==-==}" -Preparation Preparation! Guest Drivers Launcher Host Switcher Mastery: - @sh ../../tools/lguest/extract $(PREFIX) `find ../../* -name '*.[chS]' -wholename '*lguest*'` -Puppy: - @clear - @printf " __ \n (___()'\`;\n /, /\`\n \\\\\\\"--\\\\\\ \n" - @sleep 2; clear; printf "\n\n Sit!\n\n"; sleep 1; clear - @printf " __ \n ()'\`; \n /\\|\` \n / | \n(/_)_|_ \n" - @sleep 2; clear; printf "\n\n Stand!\n\n"; sleep 1; clear - @printf " __ \n ()'\`; \n /\\|\` \n /._.= \n /| / \n(_\_)_ \n" - @sleep 2; clear; printf "\n\n Good puppy!\n\n"; sleep 1; clear diff --git a/drivers/lguest/README b/drivers/lguest/README deleted file mode 100644 index b7db39a64c66..000000000000 --- a/drivers/lguest/README +++ /dev/null @@ -1,47 +0,0 @@ -Welcome, friend reader, to lguest. - -Lguest is an adventure, with you, the reader, as Hero. I can't think of many -5000-line projects which offer both such capability and glimpses of future -potential; it is an exciting time to be delving into the source! - -But be warned; this is an arduous journey of several hours or more! And as we -know, all true Heroes are driven by a Noble Goal. Thus I offer a Beer (or -equivalent) to anyone I meet who has completed this documentation. - -So get comfortable and keep your wits about you (both quick and humorous). -Along your way to the Noble Goal, you will also gain masterly insight into -lguest, and hypervisors and x86 virtualization in general. - -Our Quest is in seven parts: (best read with C highlighting turned on) - -I) Preparation - - In which our potential hero is flown quickly over the landscape for a - taste of its scope. Suitable for the armchair coders and other such - persons of faint constitution. - -II) Guest - - Where we encounter the first tantalising wisps of code, and come to - understand the details of the life of a Guest kernel. - -III) Drivers - - Whereby the Guest finds its voice and become useful, and our - understanding of the Guest is completed. - -IV) Launcher - - Where we trace back to the creation of the Guest, and thus begin our - understanding of the Host. - -V) Host - - Where we master the Host code, through a long and tortuous journey. - Indeed, it is here that our hero is tested in the Bit of Despair. - -VI) Switcher - - Where our understanding of the intertwined nature of Guests and Hosts - is completed. - -VII) Mastery - - Where our fully fledged hero grapples with the Great Question: - "What next?" - -make Preparation! -Rusty Russell. diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c deleted file mode 100644 index 395ed1961dbf..000000000000 --- a/drivers/lguest/core.c +++ /dev/null @@ -1,398 +0,0 @@ -/*P:400 - * This contains run_guest() which actually calls into the Host<->Guest - * Switcher and analyzes the return, such as determining if the Guest wants the - * Host to do something. This file also contains useful helper routines. -:*/ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "lg.h" - -unsigned long switcher_addr; -struct page **lg_switcher_pages; -static struct vm_struct *switcher_text_vma; -static struct vm_struct *switcher_stacks_vma; - -/* This One Big lock protects all inter-guest data structures. */ -DEFINE_MUTEX(lguest_lock); - -/*H:010 - * We need to set up the Switcher at a high virtual address. Remember the - * Switcher is a few hundred bytes of assembler code which actually changes the - * CPU to run the Guest, and then changes back to the Host when a trap or - * interrupt happens. - * - * The Switcher code must be at the same virtual address in the Guest as the - * Host since it will be running as the switchover occurs. - * - * Trying to map memory at a particular address is an unusual thing to do, so - * it's not a simple one-liner. - */ -static __init int map_switcher(void) -{ - int i, err; - - /* - * Map the Switcher in to high memory. - * - * It turns out that if we choose the address 0xFFC00000 (4MB under the - * top virtual address), it makes setting up the page tables really - * easy. - */ - - /* We assume Switcher text fits into a single page. */ - if (end_switcher_text - start_switcher_text > PAGE_SIZE) { - printk(KERN_ERR "lguest: switcher text too large (%zu)\n", - end_switcher_text - start_switcher_text); - return -EINVAL; - } - - /* - * We allocate an array of struct page pointers. map_vm_area() wants - * this, rather than just an array of pages. - */ - lg_switcher_pages = kmalloc(sizeof(lg_switcher_pages[0]) - * TOTAL_SWITCHER_PAGES, - GFP_KERNEL); - if (!lg_switcher_pages) { - err = -ENOMEM; - goto out; - } - - /* - * Now we actually allocate the pages. The Guest will see these pages, - * so we make sure they're zeroed. - */ - for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) { - lg_switcher_pages[i] = alloc_page(GFP_KERNEL|__GFP_ZERO); - if (!lg_switcher_pages[i]) { - err = -ENOMEM; - goto free_some_pages; - } - } - - /* - * Copy in the compiled-in Switcher code (from x86/switcher_32.S). - * It goes in the first page, which we map in momentarily. - */ - memcpy(kmap(lg_switcher_pages[0]), start_switcher_text, - end_switcher_text - start_switcher_text); - kunmap(lg_switcher_pages[0]); - - /* - * We place the Switcher underneath the fixmap area, which is the - * highest virtual address we can get. This is important, since we - * tell the Guest it can't access this memory, so we want its ceiling - * as high as possible. - */ - switcher_addr = FIXADDR_START - TOTAL_SWITCHER_PAGES*PAGE_SIZE; - - /* - * Now we reserve the "virtual memory area"s we want. We might - * not get them in theory, but in practice it's worked so far. - * - * We want the switcher text to be read-only and executable, and - * the stacks to be read-write and non-executable. - */ - switcher_text_vma = __get_vm_area(PAGE_SIZE, VM_ALLOC|VM_NO_GUARD, - switcher_addr, - switcher_addr + PAGE_SIZE); - - if (!switcher_text_vma) { - err = -ENOMEM; - printk("lguest: could not map switcher pages high\n"); - goto free_pages; - } - - switcher_stacks_vma = __get_vm_area(SWITCHER_STACK_PAGES * PAGE_SIZE, - VM_ALLOC|VM_NO_GUARD, - switcher_addr + PAGE_SIZE, - switcher_addr + TOTAL_SWITCHER_PAGES * PAGE_SIZE); - if (!switcher_stacks_vma) { - err = -ENOMEM; - printk("lguest: could not map switcher pages high\n"); - goto free_text_vma; - } - - /* - * This code actually sets up the pages we've allocated to appear at - * switcher_addr. map_vm_area() takes the vma we allocated above, the - * kind of pages we're mapping (kernel text pages and kernel writable - * pages respectively), and a pointer to our array of struct pages. - */ - err = map_vm_area(switcher_text_vma, PAGE_KERNEL_RX, lg_switcher_pages); - if (err) { - printk("lguest: text map_vm_area failed: %i\n", err); - goto free_vmas; - } - - err = map_vm_area(switcher_stacks_vma, PAGE_KERNEL, - lg_switcher_pages + SWITCHER_TEXT_PAGES); - if (err) { - printk("lguest: stacks map_vm_area failed: %i\n", err); - goto free_vmas; - } - - /* - * Now the Switcher is mapped at the right address, we can't fail! - */ - printk(KERN_INFO "lguest: mapped switcher at %p\n", - switcher_text_vma->addr); - /* And we succeeded... */ - return 0; - -free_vmas: - /* Undoes map_vm_area and __get_vm_area */ - vunmap(switcher_stacks_vma->addr); -free_text_vma: - vunmap(switcher_text_vma->addr); -free_pages: - i = TOTAL_SWITCHER_PAGES; -free_some_pages: - for (--i; i >= 0; i--) - __free_pages(lg_switcher_pages[i], 0); - kfree(lg_switcher_pages); -out: - return err; -} -/*:*/ - -/* Cleaning up the mapping when the module is unloaded is almost... too easy. */ -static void unmap_switcher(void) -{ - unsigned int i; - - /* vunmap() undoes *both* map_vm_area() and __get_vm_area(). */ - vunmap(switcher_text_vma->addr); - vunmap(switcher_stacks_vma->addr); - /* Now we just need to free the pages we copied the switcher into */ - for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) - __free_pages(lg_switcher_pages[i], 0); - kfree(lg_switcher_pages); -} - -/*H:032 - * Dealing With Guest Memory. - * - * Before we go too much further into the Host, we need to grok the routines - * we use to deal with Guest memory. - * - * When the Guest gives us (what it thinks is) a physical address, we can use - * the normal copy_from_user() & copy_to_user() on the corresponding place in - * the memory region allocated by the Launcher. - * - * But we can't trust the Guest: it might be trying to access the Launcher - * code. We have to check that the range is below the pfn_limit the Launcher - * gave us. We have to make sure that addr + len doesn't give us a false - * positive by overflowing, too. - */ -bool lguest_address_ok(const struct lguest *lg, - unsigned long addr, unsigned long len) -{ - return addr+len <= lg->pfn_limit * PAGE_SIZE && (addr+len >= addr); -} - -/* - * This routine copies memory from the Guest. Here we can see how useful the - * kill_lguest() routine we met in the Launcher can be: we return a random - * value (all zeroes) instead of needing to return an error. - */ -void __lgread(struct lg_cpu *cpu, void *b, unsigned long addr, unsigned bytes) -{ - if (!lguest_address_ok(cpu->lg, addr, bytes) - || copy_from_user(b, cpu->lg->mem_base + addr, bytes) != 0) { - /* copy_from_user should do this, but as we rely on it... */ - memset(b, 0, bytes); - kill_guest(cpu, "bad read address %#lx len %u", addr, bytes); - } -} - -/* This is the write (copy into Guest) version. */ -void __lgwrite(struct lg_cpu *cpu, unsigned long addr, const void *b, - unsigned bytes) -{ - if (!lguest_address_ok(cpu->lg, addr, bytes) - || copy_to_user(cpu->lg->mem_base + addr, b, bytes) != 0) - kill_guest(cpu, "bad write address %#lx len %u", addr, bytes); -} -/*:*/ - -/*H:030 - * Let's jump straight to the the main loop which runs the Guest. - * Remember, this is called by the Launcher reading /dev/lguest, and we keep - * going around and around until something interesting happens. - */ -int run_guest(struct lg_cpu *cpu, unsigned long __user *user) -{ - /* If the launcher asked for a register with LHREQ_GETREG */ - if (cpu->reg_read) { - if (put_user(*cpu->reg_read, user)) - return -EFAULT; - cpu->reg_read = NULL; - return sizeof(*cpu->reg_read); - } - - /* We stop running once the Guest is dead. */ - while (!cpu->lg->dead) { - unsigned int irq; - bool more; - - /* First we run any hypercalls the Guest wants done. */ - if (cpu->hcall) - do_hypercalls(cpu); - - /* Do we have to tell the Launcher about a trap? */ - if (cpu->pending.trap) { - if (copy_to_user(user, &cpu->pending, - sizeof(cpu->pending))) - return -EFAULT; - return sizeof(cpu->pending); - } - - /* - * All long-lived kernel loops need to check with this horrible - * thing called the freezer. If the Host is trying to suspend, - * it stops us. - */ - try_to_freeze(); - - /* Check for signals */ - if (signal_pending(current)) - return -ERESTARTSYS; - - /* - * Check if there are any interrupts which can be delivered now: - * if so, this sets up the hander to be executed when we next - * run the Guest. - */ - irq = interrupt_pending(cpu, &more); - if (irq < LGUEST_IRQS) - try_deliver_interrupt(cpu, irq, more); - - /* - * Just make absolutely sure the Guest is still alive. One of - * those hypercalls could have been fatal, for example. - */ - if (cpu->lg->dead) - break; - - /* - * If the Guest asked to be stopped, we sleep. The Guest's - * clock timer will wake us. - */ - if (cpu->halted) { - set_current_state(TASK_INTERRUPTIBLE); - /* - * Just before we sleep, make sure no interrupt snuck in - * which we should be doing. - */ - if (interrupt_pending(cpu, &more) < LGUEST_IRQS) - set_current_state(TASK_RUNNING); - else - schedule(); - continue; - } - - /* - * OK, now we're ready to jump into the Guest. First we put up - * the "Do Not Disturb" sign: - */ - local_irq_disable(); - - /* Actually run the Guest until something happens. */ - lguest_arch_run_guest(cpu); - - /* Now we're ready to be interrupted or moved to other CPUs */ - local_irq_enable(); - - /* Now we deal with whatever happened to the Guest. */ - lguest_arch_handle_trap(cpu); - } - - /* Special case: Guest is 'dead' but wants a reboot. */ - if (cpu->lg->dead == ERR_PTR(-ERESTART)) - return -ERESTART; - - /* The Guest is dead => "No such file or directory" */ - return -ENOENT; -} - -/*H:000 - * Welcome to the Host! - * - * By this point your brain has been tickled by the Guest code and numbed by - * the Launcher code; prepare for it to be stretched by the Host code. This is - * the heart. Let's begin at the initialization routine for the Host's lg - * module. - */ -static int __init init(void) -{ - int err; - - /* Lguest can't run under Xen, VMI or itself. It does Tricky Stuff. */ - if (get_kernel_rpl() != 0) { - printk("lguest is afraid of being a guest\n"); - return -EPERM; - } - - /* First we put the Switcher up in very high virtual memory. */ - err = map_switcher(); - if (err) - goto out; - - /* We might need to reserve an interrupt vector. */ - err = init_interrupts(); - if (err) - goto unmap; - - /* /dev/lguest needs to be registered. */ - err = lguest_device_init(); - if (err) - goto free_interrupts; - - /* Finally we do some architecture-specific setup. */ - lguest_arch_host_init(); - - /* All good! */ - return 0; - -free_interrupts: - free_interrupts(); -unmap: - unmap_switcher(); -out: - return err; -} - -/* Cleaning up is just the same code, backwards. With a little French. */ -static void __exit fini(void) -{ - lguest_device_remove(); - free_interrupts(); - unmap_switcher(); - - lguest_arch_host_fini(); -} -/*:*/ - -/* - * The Host side of lguest can be a module. This is a nice way for people to - * play with it. - */ -module_init(init); -module_exit(fini); -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Rusty Russell "); diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c deleted file mode 100644 index 601f81c04873..000000000000 --- a/drivers/lguest/hypercalls.c +++ /dev/null @@ -1,304 +0,0 @@ -/*P:500 - * Just as userspace programs request kernel operations through a system - * call, the Guest requests Host operations through a "hypercall". You might - * notice this nomenclature doesn't really follow any logic, but the name has - * been around for long enough that we're stuck with it. As you'd expect, this - * code is basically a one big switch statement. -:*/ - -/* Copyright (C) 2006 Rusty Russell IBM Corporation - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -*/ -#include -#include -#include -#include -#include -#include -#include "lg.h" - -/*H:120 - * This is the core hypercall routine: where the Guest gets what it wants. - * Or gets killed. Or, in the case of LHCALL_SHUTDOWN, both. - */ -static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args) -{ - switch (args->arg0) { - case LHCALL_FLUSH_ASYNC: - /* - * This call does nothing, except by breaking out of the Guest - * it makes us process all the asynchronous hypercalls. - */ - break; - case LHCALL_SEND_INTERRUPTS: - /* - * This call does nothing too, but by breaking out of the Guest - * it makes us process any pending interrupts. - */ - break; - case LHCALL_LGUEST_INIT: - /* - * You can't get here unless you're already initialized. Don't - * do that. - */ - kill_guest(cpu, "already have lguest_data"); - break; - case LHCALL_SHUTDOWN: { - char msg[128]; - /* - * Shutdown is such a trivial hypercall that we do it in five - * lines right here. - * - * If the lgread fails, it will call kill_guest() itself; the - * kill_guest() with the message will be ignored. - */ - __lgread(cpu, msg, args->arg1, sizeof(msg)); - msg[sizeof(msg)-1] = '\0'; - kill_guest(cpu, "CRASH: %s", msg); - if (args->arg2 == LGUEST_SHUTDOWN_RESTART) - cpu->lg->dead = ERR_PTR(-ERESTART); - break; - } - case LHCALL_FLUSH_TLB: - /* FLUSH_TLB comes in two flavors, depending on the argument: */ - if (args->arg1) - guest_pagetable_clear_all(cpu); - else - guest_pagetable_flush_user(cpu); - break; - - /* - * All these calls simply pass the arguments through to the right - * routines. - */ - case LHCALL_NEW_PGTABLE: - guest_new_pagetable(cpu, args->arg1); - break; - case LHCALL_SET_STACK: - guest_set_stack(cpu, args->arg1, args->arg2, args->arg3); - break; - case LHCALL_SET_PTE: -#ifdef CONFIG_X86_PAE - guest_set_pte(cpu, args->arg1, args->arg2, - __pte(args->arg3 | (u64)args->arg4 << 32)); -#else - guest_set_pte(cpu, args->arg1, args->arg2, __pte(args->arg3)); -#endif - break; - case LHCALL_SET_PGD: - guest_set_pgd(cpu->lg, args->arg1, args->arg2); - break; -#ifdef CONFIG_X86_PAE - case LHCALL_SET_PMD: - guest_set_pmd(cpu->lg, args->arg1, args->arg2); - break; -#endif - case LHCALL_SET_CLOCKEVENT: - guest_set_clockevent(cpu, args->arg1); - break; - case LHCALL_HALT: - /* Similarly, this sets the halted flag for run_guest(). */ - cpu->halted = 1; - break; - default: - /* It should be an architecture-specific hypercall. */ - if (lguest_arch_do_hcall(cpu, args)) - kill_guest(cpu, "Bad hypercall %li\n", args->arg0); - } -} - -/*H:124 - * Asynchronous hypercalls are easy: we just look in the array in the - * Guest's "struct lguest_data" to see if any new ones are marked "ready". - * - * We are careful to do these in order: obviously we respect the order the - * Guest put them in the ring, but we also promise the Guest that they will - * happen before any normal hypercall (which is why we check this before - * checking for a normal hcall). - */ -static void do_async_hcalls(struct lg_cpu *cpu) -{ - unsigned int i; - u8 st[LHCALL_RING_SIZE]; - - /* For simplicity, we copy the entire call status array in at once. */ - if (copy_from_user(&st, &cpu->lg->lguest_data->hcall_status, sizeof(st))) - return; - - /* We process "struct lguest_data"s hcalls[] ring once. */ - for (i = 0; i < ARRAY_SIZE(st); i++) { - struct hcall_args args; - /* - * We remember where we were up to from last time. This makes - * sure that the hypercalls are done in the order the Guest - * places them in the ring. - */ - unsigned int n = cpu->next_hcall; - - /* 0xFF means there's no call here (yet). */ - if (st[n] == 0xFF) - break; - - /* - * OK, we have hypercall. Increment the "next_hcall" cursor, - * and wrap back to 0 if we reach the end. - */ - if (++cpu->next_hcall == LHCALL_RING_SIZE) - cpu->next_hcall = 0; - - /* - * Copy the hypercall arguments into a local copy of the - * hcall_args struct. - */ - if (copy_from_user(&args, &cpu->lg->lguest_data->hcalls[n], - sizeof(struct hcall_args))) { - kill_guest(cpu, "Fetching async hypercalls"); - break; - } - - /* Do the hypercall, same as a normal one. */ - do_hcall(cpu, &args); - - /* Mark the hypercall done. */ - if (put_user(0xFF, &cpu->lg->lguest_data->hcall_status[n])) { - kill_guest(cpu, "Writing result for async hypercall"); - break; - } - - /* - * Stop doing hypercalls if they want to notify the Launcher: - * it needs to service this first. - */ - if (cpu->pending.trap) - break; - } -} - -/* - * Last of all, we look at what happens first of all. The very first time the - * Guest makes a hypercall, we end up here to set things up: - */ -static void initialize(struct lg_cpu *cpu) -{ - /* - * You can't do anything until you're initialized. The Guest knows the - * rules, so we're unforgiving here. - */ - if (cpu->hcall->arg0 != LHCALL_LGUEST_INIT) { - kill_guest(cpu, "hypercall %li before INIT", cpu->hcall->arg0); - return; - } - - if (lguest_arch_init_hypercalls(cpu)) - kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); - - /* - * The Guest tells us where we're not to deliver interrupts by putting - * the instruction address into "struct lguest_data". - */ - if (get_user(cpu->lg->noirq_iret, &cpu->lg->lguest_data->noirq_iret)) - kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); - - /* - * We write the current time into the Guest's data page once so it can - * set its clock. - */ - write_timestamp(cpu); - - /* page_tables.c will also do some setup. */ - page_table_guest_data_init(cpu); - - /* - * This is the one case where the above accesses might have been the - * first write to a Guest page. This may have caused a copy-on-write - * fault, but the old page might be (read-only) in the Guest - * pagetable. - */ - guest_pagetable_clear_all(cpu); -} -/*:*/ - -/*M:013 - * If a Guest reads from a page (so creates a mapping) that it has never - * written to, and then the Launcher writes to it (ie. the output of a virtual - * device), the Guest will still see the old page. In practice, this never - * happens: why would the Guest read a page which it has never written to? But - * a similar scenario might one day bite us, so it's worth mentioning. - * - * Note that if we used a shared anonymous mapping in the Launcher instead of - * mapping /dev/zero private, we wouldn't worry about cop-on-write. And we - * need that to switch the Launcher to processes (away from threads) anyway. -:*/ - -/*H:100 - * Hypercalls - * - * Remember from the Guest, hypercalls come in two flavors: normal and - * asynchronous. This file handles both of types. - */ -void do_hypercalls(struct lg_cpu *cpu) -{ - /* Not initialized yet? This hypercall must do it. */ - if (unlikely(!cpu->lg->lguest_data)) { - /* Set up the "struct lguest_data" */ - initialize(cpu); - /* Hcall is done. */ - cpu->hcall = NULL; - return; - } - - /* - * The Guest has initialized. - * - * Look in the hypercall ring for the async hypercalls: - */ - do_async_hcalls(cpu); - - /* - * If we stopped reading the hypercall ring because the Guest did a - * NOTIFY to the Launcher, we want to return now. Otherwise we do - * the hypercall. - */ - if (!cpu->pending.trap) { - do_hcall(cpu, cpu->hcall); - /* - * Tricky point: we reset the hcall pointer to mark the - * hypercall as "done". We use the hcall pointer rather than - * the trap number to indicate a hypercall is pending. - * Normally it doesn't matter: the Guest will run again and - * update the trap number before we come back here. - * - * However, if we are signalled or the Guest sends I/O to the - * Launcher, the run_guest() loop will exit without running the - * Guest. When it comes back it would try to re-run the - * hypercall. Finding that bug sucked. - */ - cpu->hcall = NULL; - } -} - -/* - * This routine supplies the Guest with time: it's used for wallclock time at - * initial boot and as a rough time source if the TSC isn't available. - */ -void write_timestamp(struct lg_cpu *cpu) -{ - struct timespec now; - ktime_get_real_ts(&now); - if (copy_to_user(&cpu->lg->lguest_data->time, - &now, sizeof(struct timespec))) - kill_guest(cpu, "Writing timestamp"); -} diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c deleted file mode 100644 index 67392b6ab845..000000000000 --- a/drivers/lguest/interrupts_and_traps.c +++ /dev/null @@ -1,706 +0,0 @@ -/*P:800 - * Interrupts (traps) are complicated enough to earn their own file. - * There are three classes of interrupts: - * - * 1) Real hardware interrupts which occur while we're running the Guest, - * 2) Interrupts for virtual devices attached to the Guest, and - * 3) Traps and faults from the Guest. - * - * Real hardware interrupts must be delivered to the Host, not the Guest. - * Virtual interrupts must be delivered to the Guest, but we make them look - * just like real hardware would deliver them. Traps from the Guest can be set - * up to go directly back into the Guest, but sometimes the Host wants to see - * them first, so we also have a way of "reflecting" them into the Guest as if - * they had been delivered to it directly. -:*/ -#include -#include -#include -#include -#include "lg.h" - -/* Allow Guests to use a non-128 (ie. non-Linux) syscall trap. */ -static unsigned int syscall_vector = IA32_SYSCALL_VECTOR; -module_param(syscall_vector, uint, 0444); - -/* The address of the interrupt handler is split into two bits: */ -static unsigned long idt_address(u32 lo, u32 hi) -{ - return (lo & 0x0000FFFF) | (hi & 0xFFFF0000); -} - -/* - * The "type" of the interrupt handler is a 4 bit field: we only support a - * couple of types. - */ -static int idt_type(u32 lo, u32 hi) -{ - return (hi >> 8) & 0xF; -} - -/* An IDT entry can't be used unless the "present" bit is set. */ -static bool idt_present(u32 lo, u32 hi) -{ - return (hi & 0x8000); -} - -/* - * We need a helper to "push" a value onto the Guest's stack, since that's a - * big part of what delivering an interrupt does. - */ -static void push_guest_stack(struct lg_cpu *cpu, unsigned long *gstack, u32 val) -{ - /* Stack grows upwards: move stack then write value. */ - *gstack -= 4; - lgwrite(cpu, *gstack, u32, val); -} - -/*H:210 - * The push_guest_interrupt_stack() routine saves Guest state on the stack for - * an interrupt or trap. The mechanics of delivering traps and interrupts to - * the Guest are the same, except some traps have an "error code" which gets - * pushed onto the stack as well: the caller tells us if this is one. - * - * We set up the stack just like the CPU does for a real interrupt, so it's - * identical for the Guest (and the standard "iret" instruction will undo - * it). - */ -static void push_guest_interrupt_stack(struct lg_cpu *cpu, bool has_err) -{ - unsigned long gstack, origstack; - u32 eflags, ss, irq_enable; - unsigned long virtstack; - - /* - * There are two cases for interrupts: one where the Guest is already - * in the kernel, and a more complex one where the Guest is in - * userspace. We check the privilege level to find out. - */ - if ((cpu->regs->ss&0x3) != GUEST_PL) { - /* - * The Guest told us their kernel stack with the SET_STACK - * hypercall: both the virtual address and the segment. - */ - virtstack = cpu->esp1; - ss = cpu->ss1; - - origstack = gstack = guest_pa(cpu, virtstack); - /* - * We push the old stack segment and pointer onto the new - * stack: when the Guest does an "iret" back from the interrupt - * handler the CPU will notice they're dropping privilege - * levels and expect these here. - */ - push_guest_stack(cpu, &gstack, cpu->regs->ss); - push_guest_stack(cpu, &gstack, cpu->regs->esp); - } else { - /* We're staying on the same Guest (kernel) stack. */ - virtstack = cpu->regs->esp; - ss = cpu->regs->ss; - - origstack = gstack = guest_pa(cpu, virtstack); - } - - /* - * Remember that we never let the Guest actually disable interrupts, so - * the "Interrupt Flag" bit is always set. We copy that bit from the - * Guest's "irq_enabled" field into the eflags word: we saw the Guest - * copy it back in "lguest_iret". - */ - eflags = cpu->regs->eflags; - if (get_user(irq_enable, &cpu->lg->lguest_data->irq_enabled) == 0 - && !(irq_enable & X86_EFLAGS_IF)) - eflags &= ~X86_EFLAGS_IF; - - /* - * An interrupt is expected to push three things on the stack: the old - * "eflags" word, the old code segment, and the old instruction - * pointer. - */ - push_guest_stack(cpu, &gstack, eflags); - push_guest_stack(cpu, &gstack, cpu->regs->cs); - push_guest_stack(cpu, &gstack, cpu->regs->eip); - - /* For the six traps which supply an error code, we push that, too. */ - if (has_err) - push_guest_stack(cpu, &gstack, cpu->regs->errcode); - - /* Adjust the stack pointer and stack segment. */ - cpu->regs->ss = ss; - cpu->regs->esp = virtstack + (gstack - origstack); -} - -/* - * This actually makes the Guest start executing the given interrupt/trap - * handler. - * - * "lo" and "hi" are the two parts of the Interrupt Descriptor Table for this - * interrupt or trap. It's split into two parts for traditional reasons: gcc - * on i386 used to be frightened by 64 bit numbers. - */ -static void guest_run_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi) -{ - /* If we're already in the kernel, we don't change stacks. */ - if ((cpu->regs->ss&0x3) != GUEST_PL) - cpu->regs->ss = cpu->esp1; - - /* - * Set the code segment and the address to execute. - */ - cpu->regs->cs = (__KERNEL_CS|GUEST_PL); - cpu->regs->eip = idt_address(lo, hi); - - /* - * Trapping always clears these flags: - * TF: Trap flag - * VM: Virtual 8086 mode - * RF: Resume - * NT: Nested task. - */ - cpu->regs->eflags &= - ~(X86_EFLAGS_TF|X86_EFLAGS_VM|X86_EFLAGS_RF|X86_EFLAGS_NT); - - /* - * There are two kinds of interrupt handlers: 0xE is an "interrupt - * gate" which expects interrupts to be disabled on entry. - */ - if (idt_type(lo, hi) == 0xE) - if (put_user(0, &cpu->lg->lguest_data->irq_enabled)) - kill_guest(cpu, "Disabling interrupts"); -} - -/* This restores the eflags word which was pushed on the stack by a trap */ -static void restore_eflags(struct lg_cpu *cpu) -{ - /* This is the physical address of the stack. */ - unsigned long stack_pa = guest_pa(cpu, cpu->regs->esp); - - /* - * Stack looks like this: - * Address Contents - * esp EIP - * esp + 4 CS - * esp + 8 EFLAGS - */ - cpu->regs->eflags = lgread(cpu, stack_pa + 8, u32); - cpu->regs->eflags &= - ~(X86_EFLAGS_TF|X86_EFLAGS_VM|X86_EFLAGS_RF|X86_EFLAGS_NT); -} - -/*H:205 - * Virtual Interrupts. - * - * interrupt_pending() returns the first pending interrupt which isn't blocked - * by the Guest. It is called before every entry to the Guest, and just before - * we go to sleep when the Guest has halted itself. - */ -unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more) -{ - unsigned int irq; - DECLARE_BITMAP(blk, LGUEST_IRQS); - - /* If the Guest hasn't even initialized yet, we can do nothing. */ - if (!cpu->lg->lguest_data) - return LGUEST_IRQS; - - /* - * Take our "irqs_pending" array and remove any interrupts the Guest - * wants blocked: the result ends up in "blk". - */ - if (copy_from_user(&blk, cpu->lg->lguest_data->blocked_interrupts, - sizeof(blk))) - return LGUEST_IRQS; - bitmap_andnot(blk, cpu->irqs_pending, blk, LGUEST_IRQS); - - /* Find the first interrupt. */ - irq = find_first_bit(blk, LGUEST_IRQS); - *more = find_next_bit(blk, LGUEST_IRQS, irq+1); - - return irq; -} - -/* - * This actually diverts the Guest to running an interrupt handler, once an - * interrupt has been identified by interrupt_pending(). - */ -void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more) -{ - struct desc_struct *idt; - - BUG_ON(irq >= LGUEST_IRQS); - - /* If they're halted, interrupts restart them. */ - if (cpu->halted) { - /* Re-enable interrupts. */ - if (put_user(X86_EFLAGS_IF, &cpu->lg->lguest_data->irq_enabled)) - kill_guest(cpu, "Re-enabling interrupts"); - cpu->halted = 0; - } else { - /* Otherwise we check if they have interrupts disabled. */ - u32 irq_enabled; - if (get_user(irq_enabled, &cpu->lg->lguest_data->irq_enabled)) - irq_enabled = 0; - if (!irq_enabled) { - /* Make sure they know an IRQ is pending. */ - put_user(X86_EFLAGS_IF, - &cpu->lg->lguest_data->irq_pending); - return; - } - } - - /* - * Look at the IDT entry the Guest gave us for this interrupt. The - * first 32 (FIRST_EXTERNAL_VECTOR) entries are for traps, so we skip - * over them. - */ - idt = &cpu->arch.idt[FIRST_EXTERNAL_VECTOR+irq]; - /* If they don't have a handler (yet?), we just ignore it */ - if (idt_present(idt->a, idt->b)) { - /* OK, mark it no longer pending and deliver it. */ - clear_bit(irq, cpu->irqs_pending); - - /* - * They may be about to iret, where they asked us never to - * deliver interrupts. In this case, we can emulate that iret - * then immediately deliver the interrupt. This is basically - * a noop: the iret would pop the interrupt frame and restore - * eflags, and then we'd set it up again. So just restore the - * eflags word and jump straight to the handler in this case. - * - * Denys Vlasenko points out that this isn't quite right: if - * the iret was returning to userspace, then that interrupt - * would reset the stack pointer (which the Guest told us - * about via LHCALL_SET_STACK). But unless the Guest is being - * *really* weird, that will be the same as the current stack - * anyway. - */ - if (cpu->regs->eip == cpu->lg->noirq_iret) { - restore_eflags(cpu); - } else { - /* - * set_guest_interrupt() takes a flag to say whether - * this interrupt pushes an error code onto the stack - * as well: virtual interrupts never do. - */ - push_guest_interrupt_stack(cpu, false); - } - /* Actually make Guest cpu jump to handler. */ - guest_run_interrupt(cpu, idt->a, idt->b); - } - - /* - * Every time we deliver an interrupt, we update the timestamp in the - * Guest's lguest_data struct. It would be better for the Guest if we - * did this more often, but it can actually be quite slow: doing it - * here is a compromise which means at least it gets updated every - * timer interrupt. - */ - write_timestamp(cpu); - - /* - * If there are no other interrupts we want to deliver, clear - * the pending flag. - */ - if (!more) - put_user(0, &cpu->lg->lguest_data->irq_pending); -} - -/* And this is the routine when we want to set an interrupt for the Guest. */ -void set_interrupt(struct lg_cpu *cpu, unsigned int irq) -{ - /* - * Next time the Guest runs, the core code will see if it can deliver - * this interrupt. - */ - set_bit(irq, cpu->irqs_pending); - - /* - * Make sure it sees it; it might be asleep (eg. halted), or running - * the Guest right now, in which case kick_process() will knock it out. - */ - if (!wake_up_process(cpu->tsk)) - kick_process(cpu->tsk); -} -/*:*/ - -/* - * Linux uses trap 128 for system calls. Plan9 uses 64, and Ron Minnich sent - * me a patch, so we support that too. It'd be a big step for lguest if half - * the Plan 9 user base were to start using it. - * - * Actually now I think of it, it's possible that Ron *is* half the Plan 9 - * userbase. Oh well. - */ -bool could_be_syscall(unsigned int num) -{ - /* Normal Linux IA32_SYSCALL_VECTOR or reserved vector? */ - return num == IA32_SYSCALL_VECTOR || num == syscall_vector; -} - -/* The syscall vector it wants must be unused by Host. */ -bool check_syscall_vector(struct lguest *lg) -{ - u32 vector; - - if (get_user(vector, &lg->lguest_data->syscall_vec)) - return false; - - return could_be_syscall(vector); -} - -int init_interrupts(void) -{ - /* If they want some strange system call vector, reserve it now */ - if (syscall_vector != IA32_SYSCALL_VECTOR) { - if (test_bit(syscall_vector, used_vectors) || - vector_used_by_percpu_irq(syscall_vector)) { - printk(KERN_ERR "lg: couldn't reserve syscall %u\n", - syscall_vector); - return -EBUSY; - } - set_bit(syscall_vector, used_vectors); - } - - return 0; -} - -void free_interrupts(void) -{ - if (syscall_vector != IA32_SYSCALL_VECTOR) - clear_bit(syscall_vector, used_vectors); -} - -/*H:220 - * Now we've got the routines to deliver interrupts, delivering traps like - * page fault is easy. The only trick is that Intel decided that some traps - * should have error codes: - */ -static bool has_err(unsigned int trap) -{ - return (trap == 8 || (trap >= 10 && trap <= 14) || trap == 17); -} - -/* deliver_trap() returns true if it could deliver the trap. */ -bool deliver_trap(struct lg_cpu *cpu, unsigned int num) -{ - /* - * Trap numbers are always 8 bit, but we set an impossible trap number - * for traps inside the Switcher, so check that here. - */ - if (num >= ARRAY_SIZE(cpu->arch.idt)) - return false; - - /* - * Early on the Guest hasn't set the IDT entries (or maybe it put a - * bogus one in): if we fail here, the Guest will be killed. - */ - if (!idt_present(cpu->arch.idt[num].a, cpu->arch.idt[num].b)) - return false; - push_guest_interrupt_stack(cpu, has_err(num)); - guest_run_interrupt(cpu, cpu->arch.idt[num].a, - cpu->arch.idt[num].b); - return true; -} - -/*H:250 - * Here's the hard part: returning to the Host every time a trap happens - * and then calling deliver_trap() and re-entering the Guest is slow. - * Particularly because Guest userspace system calls are traps (usually trap - * 128). - * - * So we'd like to set up the IDT to tell the CPU to deliver traps directly - * into the Guest. This is possible, but the complexities cause the size of - * this file to double! However, 150 lines of code is worth writing for taking - * system calls down from 1750ns to 270ns. Plus, if lguest didn't do it, all - * the other hypervisors would beat it up at lunchtime. - * - * This routine indicates if a particular trap number could be delivered - * directly. - * - * Unfortunately, Linux 4.6 started using an interrupt gate instead of a - * trap gate for syscalls, so this trick is ineffective. See Mastery for - * how we could do this anyway... - */ -static bool direct_trap(unsigned int num) -{ - /* - * Hardware interrupts don't go to the Guest at all (except system - * call). - */ - if (num >= FIRST_EXTERNAL_VECTOR && !could_be_syscall(num)) - return false; - - /* - * The Host needs to see page faults (for shadow paging and to save the - * fault address), general protection faults (in/out emulation) and - * device not available (TS handling) and of course, the hypercall trap. - */ - return num != 14 && num != 13 && num != 7 && num != LGUEST_TRAP_ENTRY; -} -/*:*/ - -/*M:005 - * The Guest has the ability to turn its interrupt gates into trap gates, - * if it is careful. The Host will let trap gates can go directly to the - * Guest, but the Guest needs the interrupts atomically disabled for an - * interrupt gate. The Host could provide a mechanism to register more - * "no-interrupt" regions, and the Guest could point the trap gate at - * instructions within that region, where it can safely disable interrupts. - */ - -/*M:006 - * The Guests do not use the sysenter (fast system call) instruction, - * because it's hardcoded to enter privilege level 0 and so can't go direct. - * It's about twice as fast as the older "int 0x80" system call, so it might - * still be worthwhile to handle it in the Switcher and lcall down to the - * Guest. The sysenter semantics are hairy tho: search for that keyword in - * entry.S -:*/ - -/*H:260 - * When we make traps go directly into the Guest, we need to make sure - * the kernel stack is valid (ie. mapped in the page tables). Otherwise, the - * CPU trying to deliver the trap will fault while trying to push the interrupt - * words on the stack: this is called a double fault, and it forces us to kill - * the Guest. - * - * Which is deeply unfair, because (literally!) it wasn't the Guests' fault. - */ -void pin_stack_pages(struct lg_cpu *cpu) -{ - unsigned int i; - - /* - * Depending on the CONFIG_4KSTACKS option, the Guest can have one or - * two pages of stack space. - */ - for (i = 0; i < cpu->lg->stack_pages; i++) - /* - * The stack grows *upwards*, so the address we're given is the - * start of the page after the kernel stack. Subtract one to - * get back onto the first stack page, and keep subtracting to - * get to the rest of the stack pages. - */ - pin_page(cpu, cpu->esp1 - 1 - i * PAGE_SIZE); -} - -/* - * Direct traps also mean that we need to know whenever the Guest wants to use - * a different kernel stack, so we can change the guest TSS to use that - * stack. The TSS entries expect a virtual address, so unlike most addresses - * the Guest gives us, the "esp" (stack pointer) value here is virtual, not - * physical. - * - * In Linux each process has its own kernel stack, so this happens a lot: we - * change stacks on each context switch. - */ -void guest_set_stack(struct lg_cpu *cpu, u32 seg, u32 esp, unsigned int pages) -{ - /* - * You're not allowed a stack segment with privilege level 0: bad Guest! - */ - if ((seg & 0x3) != GUEST_PL) - kill_guest(cpu, "bad stack segment %i", seg); - /* We only expect one or two stack pages. */ - if (pages > 2) - kill_guest(cpu, "bad stack pages %u", pages); - /* Save where the stack is, and how many pages */ - cpu->ss1 = seg; - cpu->esp1 = esp; - cpu->lg->stack_pages = pages; - /* Make sure the new stack pages are mapped */ - pin_stack_pages(cpu); -} - -/* - * All this reference to mapping stacks leads us neatly into the other complex - * part of the Host: page table handling. - */ - -/*H:235 - * This is the routine which actually checks the Guest's IDT entry and - * transfers it into the entry in "struct lguest": - */ -static void set_trap(struct lg_cpu *cpu, struct desc_struct *trap, - unsigned int num, u32 lo, u32 hi) -{ - u8 type = idt_type(lo, hi); - - /* We zero-out a not-present entry */ - if (!idt_present(lo, hi)) { - trap->a = trap->b = 0; - return; - } - - /* We only support interrupt and trap gates. */ - if (type != 0xE && type != 0xF) - kill_guest(cpu, "bad IDT type %i", type); - - /* - * We only copy the handler address, present bit, privilege level and - * type. The privilege level controls where the trap can be triggered - * manually with an "int" instruction. This is usually GUEST_PL, - * except for system calls which userspace can use. - */ - trap->a = ((__KERNEL_CS|GUEST_PL)<<16) | (lo&0x0000FFFF); - trap->b = (hi&0xFFFFEF00); -} - -/*H:230 - * While we're here, dealing with delivering traps and interrupts to the - * Guest, we might as well complete the picture: how the Guest tells us where - * it wants them to go. This would be simple, except making traps fast - * requires some tricks. - * - * We saw the Guest setting Interrupt Descriptor Table (IDT) entries with the - * LHCALL_LOAD_IDT_ENTRY hypercall before: that comes here. - */ -void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int num, u32 lo, u32 hi) -{ - /* - * Guest never handles: NMI, doublefault, spurious interrupt or - * hypercall. We ignore when it tries to set them. - */ - if (num == 2 || num == 8 || num == 15 || num == LGUEST_TRAP_ENTRY) - return; - - /* - * Mark the IDT as changed: next time the Guest runs we'll know we have - * to copy this again. - */ - cpu->changed |= CHANGED_IDT; - - /* Check that the Guest doesn't try to step outside the bounds. */ - if (num >= ARRAY_SIZE(cpu->arch.idt)) - kill_guest(cpu, "Setting idt entry %u", num); - else - set_trap(cpu, &cpu->arch.idt[num], num, lo, hi); -} - -/* - * The default entry for each interrupt points into the Switcher routines which - * simply return to the Host. The run_guest() loop will then call - * deliver_trap() to bounce it back into the Guest. - */ -static void default_idt_entry(struct desc_struct *idt, - int trap, - const unsigned long handler, - const struct desc_struct *base) -{ - /* A present interrupt gate. */ - u32 flags = 0x8e00; - - /* - * Set the privilege level on the entry for the hypercall: this allows - * the Guest to use the "int" instruction to trigger it. - */ - if (trap == LGUEST_TRAP_ENTRY) - flags |= (GUEST_PL << 13); - else if (base) - /* - * Copy privilege level from what Guest asked for. This allows - * debug (int 3) traps from Guest userspace, for example. - */ - flags |= (base->b & 0x6000); - - /* Now pack it into the IDT entry in its weird format. */ - idt->a = (LGUEST_CS<<16) | (handler&0x0000FFFF); - idt->b = (handler&0xFFFF0000) | flags; -} - -/* When the Guest first starts, we put default entries into the IDT. */ -void setup_default_idt_entries(struct lguest_ro_state *state, - const unsigned long *def) -{ - unsigned int i; - - for (i = 0; i < ARRAY_SIZE(state->guest_idt); i++) - default_idt_entry(&state->guest_idt[i], i, def[i], NULL); -} - -/*H:240 - * We don't use the IDT entries in the "struct lguest" directly, instead - * we copy them into the IDT which we've set up for Guests on this CPU, just - * before we run the Guest. This routine does that copy. - */ -void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt, - const unsigned long *def) -{ - unsigned int i; - - /* - * We can simply copy the direct traps, otherwise we use the default - * ones in the Switcher: they will return to the Host. - */ - for (i = 0; i < ARRAY_SIZE(cpu->arch.idt); i++) { - const struct desc_struct *gidt = &cpu->arch.idt[i]; - - /* If no Guest can ever override this trap, leave it alone. */ - if (!direct_trap(i)) - continue; - - /* - * Only trap gates (type 15) can go direct to the Guest. - * Interrupt gates (type 14) disable interrupts as they are - * entered, which we never let the Guest do. Not present - * entries (type 0x0) also can't go direct, of course. - * - * If it can't go direct, we still need to copy the priv. level: - * they might want to give userspace access to a software - * interrupt. - */ - if (idt_type(gidt->a, gidt->b) == 0xF) - idt[i] = *gidt; - else - default_idt_entry(&idt[i], i, def[i], gidt); - } -} - -/*H:200 - * The Guest Clock. - * - * There are two sources of virtual interrupts. We saw one in lguest_user.c: - * the Launcher sending interrupts for virtual devices. The other is the Guest - * timer interrupt. - * - * The Guest uses the LHCALL_SET_CLOCKEVENT hypercall to tell us how long to - * the next timer interrupt (in nanoseconds). We use the high-resolution timer - * infrastructure to set a callback at that time. - * - * 0 means "turn off the clock". - */ -void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta) -{ - ktime_t expires; - - if (unlikely(delta == 0)) { - /* Clock event device is shutting down. */ - hrtimer_cancel(&cpu->hrt); - return; - } - - /* - * We use wallclock time here, so the Guest might not be running for - * all the time between now and the timer interrupt it asked for. This - * is almost always the right thing to do. - */ - expires = ktime_add_ns(ktime_get_real(), delta); - hrtimer_start(&cpu->hrt, expires, HRTIMER_MODE_ABS); -} - -/* This is the function called when the Guest's timer expires. */ -static enum hrtimer_restart clockdev_fn(struct hrtimer *timer) -{ - struct lg_cpu *cpu = container_of(timer, struct lg_cpu, hrt); - - /* Remember the first interrupt is the timer interrupt. */ - set_interrupt(cpu, 0); - return HRTIMER_NORESTART; -} - -/* This sets up the timer for this Guest. */ -void init_clockdev(struct lg_cpu *cpu) -{ - hrtimer_init(&cpu->hrt, CLOCK_REALTIME, HRTIMER_MODE_ABS); - cpu->hrt.function = clockdev_fn; -} diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h deleted file mode 100644 index 2356a2318034..000000000000 --- a/drivers/lguest/lg.h +++ /dev/null @@ -1,258 +0,0 @@ -#ifndef _LGUEST_H -#define _LGUEST_H - -#ifndef __ASSEMBLY__ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -struct pgdir { - unsigned long gpgdir; - bool switcher_mapped; - int last_host_cpu; - pgd_t *pgdir; -}; - -/* We have two pages shared with guests, per cpu. */ -struct lguest_pages { - /* This is the stack page mapped rw in guest */ - char spare[PAGE_SIZE - sizeof(struct lguest_regs)]; - struct lguest_regs regs; - - /* This is the host state & guest descriptor page, ro in guest */ - struct lguest_ro_state state; -} __attribute__((aligned(PAGE_SIZE))); - -#define CHANGED_IDT 1 -#define CHANGED_GDT 2 -#define CHANGED_GDT_TLS 4 /* Actually a subset of CHANGED_GDT */ -#define CHANGED_ALL 3 - -struct lg_cpu { - unsigned int id; - struct lguest *lg; - struct task_struct *tsk; - struct mm_struct *mm; /* == tsk->mm, but that becomes NULL on exit */ - - u32 cr2; - u32 esp1; - u16 ss1; - - /* Bitmap of what has changed: see CHANGED_* above. */ - int changed; - - /* Pending operation. */ - struct lguest_pending pending; - - unsigned long *reg_read; /* register from LHREQ_GETREG */ - - /* At end of a page shared mapped over lguest_pages in guest. */ - unsigned long regs_page; - struct lguest_regs *regs; - - struct lguest_pages *last_pages; - - /* Initialization mode: linear map everything. */ - bool linear_pages; - int cpu_pgd; /* Which pgd this cpu is currently using */ - - /* If a hypercall was asked for, this points to the arguments. */ - struct hcall_args *hcall; - u32 next_hcall; - - /* Virtual clock device */ - struct hrtimer hrt; - - /* Did the Guest tell us to halt? */ - int halted; - - /* Pending virtual interrupts */ - DECLARE_BITMAP(irqs_pending, LGUEST_IRQS); - - struct lg_cpu_arch arch; -}; - -/* The private info the thread maintains about the guest. */ -struct lguest { - struct lguest_data __user *lguest_data; - struct lg_cpu cpus[NR_CPUS]; - unsigned int nr_cpus; - - /* Valid guest memory pages must be < this. */ - u32 pfn_limit; - - /* Device memory is >= pfn_limit and < device_limit. */ - u32 device_limit; - - /* - * This provides the offset to the base of guest-physical memory in the - * Launcher. - */ - void __user *mem_base; - unsigned long kernel_address; - - struct pgdir pgdirs[4]; - - unsigned long noirq_iret; - - unsigned int stack_pages; - u32 tsc_khz; - - /* Dead? */ - const char *dead; -}; - -extern struct mutex lguest_lock; - -/* core.c: */ -bool lguest_address_ok(const struct lguest *lg, - unsigned long addr, unsigned long len); -void __lgread(struct lg_cpu *, void *, unsigned long, unsigned); -void __lgwrite(struct lg_cpu *, unsigned long, const void *, unsigned); -extern struct page **lg_switcher_pages; - -/*H:035 - * Using memory-copy operations like that is usually inconvient, so we - * have the following helper macros which read and write a specific type (often - * an unsigned long). - * - * This reads into a variable of the given type then returns that. - */ -#define lgread(cpu, addr, type) \ - ({ type _v; __lgread((cpu), &_v, (addr), sizeof(_v)); _v; }) - -/* This checks that the variable is of the given type, then writes it out. */ -#define lgwrite(cpu, addr, type, val) \ - do { \ - typecheck(type, val); \ - __lgwrite((cpu), (addr), &(val), sizeof(val)); \ - } while(0) -/* (end of memory access helper routines) :*/ - -int run_guest(struct lg_cpu *cpu, unsigned long __user *user); - -/* - * Helper macros to obtain the first 12 or the last 20 bits, this is only the - * first step in the migration to the kernel types. pte_pfn is already defined - * in the kernel. - */ -#define pgd_flags(x) (pgd_val(x) & ~PAGE_MASK) -#define pgd_pfn(x) (pgd_val(x) >> PAGE_SHIFT) -#define pmd_flags(x) (pmd_val(x) & ~PAGE_MASK) -#define pmd_pfn(x) (pmd_val(x) >> PAGE_SHIFT) - -/* interrupts_and_traps.c: */ -unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more); -void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more); -void set_interrupt(struct lg_cpu *cpu, unsigned int irq); -bool deliver_trap(struct lg_cpu *cpu, unsigned int num); -void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int i, - u32 low, u32 hi); -void guest_set_stack(struct lg_cpu *cpu, u32 seg, u32 esp, unsigned int pages); -void pin_stack_pages(struct lg_cpu *cpu); -void setup_default_idt_entries(struct lguest_ro_state *state, - const unsigned long *def); -void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt, - const unsigned long *def); -void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta); -bool send_notify_to_eventfd(struct lg_cpu *cpu); -void init_clockdev(struct lg_cpu *cpu); -bool check_syscall_vector(struct lguest *lg); -bool could_be_syscall(unsigned int num); -int init_interrupts(void); -void free_interrupts(void); - -/* segments.c: */ -void setup_default_gdt_entries(struct lguest_ro_state *state); -void setup_guest_gdt(struct lg_cpu *cpu); -void load_guest_gdt_entry(struct lg_cpu *cpu, unsigned int i, - u32 low, u32 hi); -void guest_load_tls(struct lg_cpu *cpu, unsigned long tls_array); -void copy_gdt(const struct lg_cpu *cpu, struct desc_struct *gdt); -void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt); - -/* page_tables.c: */ -int init_guest_pagetable(struct lguest *lg); -void free_guest_pagetable(struct lguest *lg); -void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable); -void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 i); -#ifdef CONFIG_X86_PAE -void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 i); -#endif -void guest_pagetable_clear_all(struct lg_cpu *cpu); -void guest_pagetable_flush_user(struct lg_cpu *cpu); -void guest_set_pte(struct lg_cpu *cpu, unsigned long gpgdir, - unsigned long vaddr, pte_t val); -void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages); -bool demand_page(struct lg_cpu *cpu, unsigned long cr2, int errcode, - unsigned long *iomem); -void pin_page(struct lg_cpu *cpu, unsigned long vaddr); -bool __guest_pa(struct lg_cpu *cpu, unsigned long vaddr, unsigned long *paddr); -unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr); -void page_table_guest_data_init(struct lg_cpu *cpu); - -/* /core.c: */ -void lguest_arch_host_init(void); -void lguest_arch_host_fini(void); -void lguest_arch_run_guest(struct lg_cpu *cpu); -void lguest_arch_handle_trap(struct lg_cpu *cpu); -int lguest_arch_init_hypercalls(struct lg_cpu *cpu); -int lguest_arch_do_hcall(struct lg_cpu *cpu, struct hcall_args *args); -void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start); -unsigned long *lguest_arch_regptr(struct lg_cpu *cpu, size_t reg_off, bool any); - -/* /switcher.S: */ -extern char start_switcher_text[], end_switcher_text[], switch_to_guest[]; - -/* lguest_user.c: */ -int lguest_device_init(void); -void lguest_device_remove(void); - -/* hypercalls.c: */ -void do_hypercalls(struct lg_cpu *cpu); -void write_timestamp(struct lg_cpu *cpu); - -/*L:035 - * Let's step aside for the moment, to study one important routine that's used - * widely in the Host code. - * - * There are many cases where the Guest can do something invalid, like pass crap - * to a hypercall. Since only the Guest kernel can make hypercalls, it's quite - * acceptable to simply terminate the Guest and give the Launcher a nicely - * formatted reason. It's also simpler for the Guest itself, which doesn't - * need to check most hypercalls for "success"; if you're still running, it - * succeeded. - * - * Once this is called, the Guest will never run again, so most Host code can - * call this then continue as if nothing had happened. This means many - * functions don't have to explicitly return an error code, which keeps the - * code simple. - * - * It also means that this can be called more than once: only the first one is - * remembered. The only trick is that we still need to kill the Guest even if - * we can't allocate memory to store the reason. Linux has a neat way of - * packing error codes into invalid pointers, so we use that here. - * - * Like any macro which uses an "if", it is safely wrapped in a run-once "do { - * } while(0)". - */ -#define kill_guest(cpu, fmt...) \ -do { \ - if (!(cpu)->lg->dead) { \ - (cpu)->lg->dead = kasprintf(GFP_ATOMIC, fmt); \ - if (!(cpu)->lg->dead) \ - (cpu)->lg->dead = ERR_PTR(-ENOMEM); \ - } \ -} while(0) -/* (End of aside) :*/ - -#endif /* __ASSEMBLY__ */ -#endif /* _LGUEST_H */ diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c deleted file mode 100644 index 1a6787bc9386..000000000000 --- a/drivers/lguest/lguest_user.c +++ /dev/null @@ -1,446 +0,0 @@ -/*P:200 This contains all the /dev/lguest code, whereby the userspace - * launcher controls and communicates with the Guest. For example, - * the first write will tell us the Guest's memory layout and entry - * point. A read will run the Guest until something happens, such as - * a signal or the Guest accessing a device. -:*/ -#include -#include -#include -#include -#include -#include -#include -#include -#include "lg.h" - -/*L:052 - The Launcher can get the registers, and also set some of them. -*/ -static int getreg_setup(struct lg_cpu *cpu, const unsigned long __user *input) -{ - unsigned long which; - - /* We re-use the ptrace structure to specify which register to read. */ - if (get_user(which, input) != 0) - return -EFAULT; - - /* - * We set up the cpu register pointer, and their next read will - * actually get the value (instead of running the guest). - * - * The last argument 'true' says we can access any register. - */ - cpu->reg_read = lguest_arch_regptr(cpu, which, true); - if (!cpu->reg_read) - return -ENOENT; - - /* And because this is a write() call, we return the length used. */ - return sizeof(unsigned long) * 2; -} - -static int setreg(struct lg_cpu *cpu, const unsigned long __user *input) -{ - unsigned long which, value, *reg; - - /* We re-use the ptrace structure to specify which register to read. */ - if (get_user(which, input) != 0) - return -EFAULT; - input++; - if (get_user(value, input) != 0) - return -EFAULT; - - /* The last argument 'false' means we can't access all registers. */ - reg = lguest_arch_regptr(cpu, which, false); - if (!reg) - return -ENOENT; - - *reg = value; - - /* And because this is a write() call, we return the length used. */ - return sizeof(unsigned long) * 3; -} - -/*L:050 - * Sending an interrupt is done by writing LHREQ_IRQ and an interrupt - * number to /dev/lguest. - */ -static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input) -{ - unsigned long irq; - - if (get_user(irq, input) != 0) - return -EFAULT; - if (irq >= LGUEST_IRQS) - return -EINVAL; - - /* - * Next time the Guest runs, the core code will see if it can deliver - * this interrupt. - */ - set_interrupt(cpu, irq); - return 0; -} - -/*L:053 - * Deliver a trap: this is used by the Launcher if it can't emulate - * an instruction. - */ -static int trap(struct lg_cpu *cpu, const unsigned long __user *input) -{ - unsigned long trapnum; - - if (get_user(trapnum, input) != 0) - return -EFAULT; - - if (!deliver_trap(cpu, trapnum)) - return -EINVAL; - - return 0; -} - -/*L:040 - * Once our Guest is initialized, the Launcher makes it run by reading - * from /dev/lguest. - */ -static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) -{ - struct lguest *lg = file->private_data; - struct lg_cpu *cpu; - unsigned int cpu_id = *o; - - /* You must write LHREQ_INITIALIZE first! */ - if (!lg) - return -EINVAL; - - /* Watch out for arbitrary vcpu indexes! */ - if (cpu_id >= lg->nr_cpus) - return -EINVAL; - - cpu = &lg->cpus[cpu_id]; - - /* If you're not the task which owns the Guest, go away. */ - if (current != cpu->tsk) - return -EPERM; - - /* If the Guest is already dead, we indicate why */ - if (lg->dead) { - size_t len; - - /* lg->dead either contains an error code, or a string. */ - if (IS_ERR(lg->dead)) - return PTR_ERR(lg->dead); - - /* We can only return as much as the buffer they read with. */ - len = min(size, strlen(lg->dead)+1); - if (copy_to_user(user, lg->dead, len) != 0) - return -EFAULT; - return len; - } - - /* - * If we returned from read() last time because the Guest sent I/O, - * clear the flag. - */ - if (cpu->pending.trap) - cpu->pending.trap = 0; - - /* Run the Guest until something interesting happens. */ - return run_guest(cpu, (unsigned long __user *)user); -} - -/*L:025 - * This actually initializes a CPU. For the moment, a Guest is only - * uniprocessor, so "id" is always 0. - */ -static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip) -{ - /* We have a limited number of CPUs in the lguest struct. */ - if (id >= ARRAY_SIZE(cpu->lg->cpus)) - return -EINVAL; - - /* Set up this CPU's id, and pointer back to the lguest struct. */ - cpu->id = id; - cpu->lg = container_of(cpu, struct lguest, cpus[id]); - cpu->lg->nr_cpus++; - - /* Each CPU has a timer it can set. */ - init_clockdev(cpu); - - /* - * We need a complete page for the Guest registers: they are accessible - * to the Guest and we can only grant it access to whole pages. - */ - cpu->regs_page = get_zeroed_page(GFP_KERNEL); - if (!cpu->regs_page) - return -ENOMEM; - - /* We actually put the registers at the end of the page. */ - cpu->regs = (void *)cpu->regs_page + PAGE_SIZE - sizeof(*cpu->regs); - - /* - * Now we initialize the Guest's registers, handing it the start - * address. - */ - lguest_arch_setup_regs(cpu, start_ip); - - /* - * We keep a pointer to the Launcher task (ie. current task) for when - * other Guests want to wake this one (eg. console input). - */ - cpu->tsk = current; - - /* - * We need to keep a pointer to the Launcher's memory map, because if - * the Launcher dies we need to clean it up. If we don't keep a - * reference, it is destroyed before close() is called. - */ - cpu->mm = get_task_mm(cpu->tsk); - - /* - * We remember which CPU's pages this Guest used last, for optimization - * when the same Guest runs on the same CPU twice. - */ - cpu->last_pages = NULL; - - /* No error == success. */ - return 0; -} - -/*L:020 - * The initialization write supplies 3 pointer sized (32 or 64 bit) values (in - * addition to the LHREQ_INITIALIZE value). These are: - * - * base: The start of the Guest-physical memory inside the Launcher memory. - * - * pfnlimit: The highest (Guest-physical) page number the Guest should be - * allowed to access. The Guest memory lives inside the Launcher, so it sets - * this to ensure the Guest can only reach its own memory. - * - * start: The first instruction to execute ("eip" in x86-speak). - */ -static int initialize(struct file *file, const unsigned long __user *input) -{ - /* "struct lguest" contains all we (the Host) know about a Guest. */ - struct lguest *lg; - int err; - unsigned long args[4]; - - /* - * We grab the Big Lguest lock, which protects against multiple - * simultaneous initializations. - */ - mutex_lock(&lguest_lock); - /* You can't initialize twice! Close the device and start again... */ - if (file->private_data) { - err = -EBUSY; - goto unlock; - } - - if (copy_from_user(args, input, sizeof(args)) != 0) { - err = -EFAULT; - goto unlock; - } - - lg = kzalloc(sizeof(*lg), GFP_KERNEL); - if (!lg) { - err = -ENOMEM; - goto unlock; - } - - /* Populate the easy fields of our "struct lguest" */ - lg->mem_base = (void __user *)args[0]; - lg->pfn_limit = args[1]; - lg->device_limit = args[3]; - - /* This is the first cpu (cpu 0) and it will start booting at args[2] */ - err = lg_cpu_start(&lg->cpus[0], 0, args[2]); - if (err) - goto free_lg; - - /* - * Initialize the Guest's shadow page tables. This allocates - * memory, so can fail. - */ - err = init_guest_pagetable(lg); - if (err) - goto free_regs; - - /* We keep our "struct lguest" in the file's private_data. */ - file->private_data = lg; - - mutex_unlock(&lguest_lock); - - /* And because this is a write() call, we return the length used. */ - return sizeof(args); - -free_regs: - /* FIXME: This should be in free_vcpu */ - free_page(lg->cpus[0].regs_page); -free_lg: - kfree(lg); -unlock: - mutex_unlock(&lguest_lock); - return err; -} - -/*L:010 - * The first operation the Launcher does must be a write. All writes - * start with an unsigned long number: for the first write this must be - * LHREQ_INITIALIZE to set up the Guest. After that the Launcher can use - * writes of other values to send interrupts or set up receipt of notifications. - * - * Note that we overload the "offset" in the /dev/lguest file to indicate what - * CPU number we're dealing with. Currently this is always 0 since we only - * support uniprocessor Guests, but you can see the beginnings of SMP support - * here. - */ -static ssize_t write(struct file *file, const char __user *in, - size_t size, loff_t *off) -{ - /* - * Once the Guest is initialized, we hold the "struct lguest" in the - * file private data. - */ - struct lguest *lg = file->private_data; - const unsigned long __user *input = (const unsigned long __user *)in; - unsigned long req; - struct lg_cpu *uninitialized_var(cpu); - unsigned int cpu_id = *off; - - /* The first value tells us what this request is. */ - if (get_user(req, input) != 0) - return -EFAULT; - input++; - - /* If you haven't initialized, you must do that first. */ - if (req != LHREQ_INITIALIZE) { - if (!lg || (cpu_id >= lg->nr_cpus)) - return -EINVAL; - cpu = &lg->cpus[cpu_id]; - - /* Once the Guest is dead, you can only read() why it died. */ - if (lg->dead) - return -ENOENT; - } - - switch (req) { - case LHREQ_INITIALIZE: - return initialize(file, input); - case LHREQ_IRQ: - return user_send_irq(cpu, input); - case LHREQ_GETREG: - return getreg_setup(cpu, input); - case LHREQ_SETREG: - return setreg(cpu, input); - case LHREQ_TRAP: - return trap(cpu, input); - default: - return -EINVAL; - } -} - -static int open(struct inode *inode, struct file *file) -{ - file->private_data = NULL; - - return 0; -} - -/*L:060 - * The final piece of interface code is the close() routine. It reverses - * everything done in initialize(). This is usually called because the - * Launcher exited. - * - * Note that the close routine returns 0 or a negative error number: it can't - * really fail, but it can whine. I blame Sun for this wart, and K&R C for - * letting them do it. -:*/ -static int close(struct inode *inode, struct file *file) -{ - struct lguest *lg = file->private_data; - unsigned int i; - - /* If we never successfully initialized, there's nothing to clean up */ - if (!lg) - return 0; - - /* - * We need the big lock, to protect from inter-guest I/O and other - * Launchers initializing guests. - */ - mutex_lock(&lguest_lock); - - /* Free up the shadow page tables for the Guest. */ - free_guest_pagetable(lg); - - for (i = 0; i < lg->nr_cpus; i++) { - /* Cancels the hrtimer set via LHCALL_SET_CLOCKEVENT. */ - hrtimer_cancel(&lg->cpus[i].hrt); - /* We can free up the register page we allocated. */ - free_page(lg->cpus[i].regs_page); - /* - * Now all the memory cleanups are done, it's safe to release - * the Launcher's memory management structure. - */ - mmput(lg->cpus[i].mm); - } - - /* - * If lg->dead doesn't contain an error code it will be NULL or a - * kmalloc()ed string, either of which is ok to hand to kfree(). - */ - if (!IS_ERR(lg->dead)) - kfree(lg->dead); - /* Free the memory allocated to the lguest_struct */ - kfree(lg); - /* Release lock and exit. */ - mutex_unlock(&lguest_lock); - - return 0; -} - -/*L:000 - * Welcome to our journey through the Launcher! - * - * The Launcher is the Host userspace program which sets up, runs and services - * the Guest. In fact, many comments in the Drivers which refer to "the Host" - * doing things are inaccurate: the Launcher does all the device handling for - * the Guest, but the Guest can't know that. - * - * Just to confuse you: to the Host kernel, the Launcher *is* the Guest and we - * shall see more of that later. - * - * We begin our understanding with the Host kernel interface which the Launcher - * uses: reading and writing a character device called /dev/lguest. All the - * work happens in the read(), write() and close() routines: - */ -static const struct file_operations lguest_fops = { - .owner = THIS_MODULE, - .open = open, - .release = close, - .write = write, - .read = read, - .llseek = default_llseek, -}; -/*:*/ - -/* - * This is a textbook example of a "misc" character device. Populate a "struct - * miscdevice" and register it with misc_register(). - */ -static struct miscdevice lguest_dev = { - .minor = MISC_DYNAMIC_MINOR, - .name = "lguest", - .fops = &lguest_fops, -}; - -int __init lguest_device_init(void) -{ - return misc_register(&lguest_dev); -} - -void __exit lguest_device_remove(void) -{ - misc_deregister(&lguest_dev); -} diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c deleted file mode 100644 index 0bc127e9f16a..000000000000 --- a/drivers/lguest/page_tables.c +++ /dev/null @@ -1,1239 +0,0 @@ -/*P:700 - * The pagetable code, on the other hand, still shows the scars of - * previous encounters. It's functional, and as neat as it can be in the - * circumstances, but be wary, for these things are subtle and break easily. - * The Guest provides a virtual to physical mapping, but we can neither trust - * it nor use it: we verify and convert it here then point the CPU to the - * converted Guest pages when running the Guest. -:*/ - -/* Copyright (C) Rusty Russell IBM Corporation 2013. - * GPL v2 and any later version */ -#include -#include -#include -#include -#include -#include -#include -#include -#include "lg.h" - -/*M:008 - * We hold reference to pages, which prevents them from being swapped. - * It'd be nice to have a callback in the "struct mm_struct" when Linux wants - * to swap out. If we had this, and a shrinker callback to trim PTE pages, we - * could probably consider launching Guests as non-root. -:*/ - -/*H:300 - * The Page Table Code - * - * We use two-level page tables for the Guest, or three-level with PAE. If - * you're not entirely comfortable with virtual addresses, physical addresses - * and page tables then I recommend you review arch/x86/lguest/boot.c's "Page - * Table Handling" (with diagrams!). - * - * The Guest keeps page tables, but we maintain the actual ones here: these are - * called "shadow" page tables. Which is a very Guest-centric name: these are - * the real page tables the CPU uses, although we keep them up to date to - * reflect the Guest's. (See what I mean about weird naming? Since when do - * shadows reflect anything?) - * - * Anyway, this is the most complicated part of the Host code. There are seven - * parts to this: - * (i) Looking up a page table entry when the Guest faults, - * (ii) Making sure the Guest stack is mapped, - * (iii) Setting up a page table entry when the Guest tells us one has changed, - * (iv) Switching page tables, - * (v) Flushing (throwing away) page tables, - * (vi) Mapping the Switcher when the Guest is about to run, - * (vii) Setting up the page tables initially. -:*/ - -/* - * The Switcher uses the complete top PTE page. That's 1024 PTE entries (4MB) - * or 512 PTE entries with PAE (2MB). - */ -#define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1) - -/* - * For PAE we need the PMD index as well. We use the last 2MB, so we - * will need the last pmd entry of the last pmd page. - */ -#ifdef CONFIG_X86_PAE -#define CHECK_GPGD_MASK _PAGE_PRESENT -#else -#define CHECK_GPGD_MASK _PAGE_TABLE -#endif - -/*H:320 - * The page table code is curly enough to need helper functions to keep it - * clear and clean. The kernel itself provides many of them; one advantage - * of insisting that the Guest and Host use the same CONFIG_X86_PAE setting. - * - * There are two functions which return pointers to the shadow (aka "real") - * page tables. - * - * spgd_addr() takes the virtual address and returns a pointer to the top-level - * page directory entry (PGD) for that address. Since we keep track of several - * page tables, the "i" argument tells us which one we're interested in (it's - * usually the current one). - */ -static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr) -{ - unsigned int index = pgd_index(vaddr); - - /* Return a pointer index'th pgd entry for the i'th page table. */ - return &cpu->lg->pgdirs[i].pgdir[index]; -} - -#ifdef CONFIG_X86_PAE -/* - * This routine then takes the PGD entry given above, which contains the - * address of the PMD page. It then returns a pointer to the PMD entry for the - * given address. - */ -static pmd_t *spmd_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) -{ - unsigned int index = pmd_index(vaddr); - pmd_t *page; - - /* You should never call this if the PGD entry wasn't valid */ - BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT)); - page = __va(pgd_pfn(spgd) << PAGE_SHIFT); - - return &page[index]; -} -#endif - -/* - * This routine then takes the page directory entry returned above, which - * contains the address of the page table entry (PTE) page. It then returns a - * pointer to the PTE entry for the given address. - */ -static pte_t *spte_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) -{ -#ifdef CONFIG_X86_PAE - pmd_t *pmd = spmd_addr(cpu, spgd, vaddr); - pte_t *page = __va(pmd_pfn(*pmd) << PAGE_SHIFT); - - /* You should never call this if the PMD entry wasn't valid */ - BUG_ON(!(pmd_flags(*pmd) & _PAGE_PRESENT)); -#else - pte_t *page = __va(pgd_pfn(spgd) << PAGE_SHIFT); - /* You should never call this if the PGD entry wasn't valid */ - BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT)); -#endif - - return &page[pte_index(vaddr)]; -} - -/* - * These functions are just like the above, except they access the Guest - * page tables. Hence they return a Guest address. - */ -static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr) -{ - unsigned int index = vaddr >> (PGDIR_SHIFT); - return cpu->lg->pgdirs[cpu->cpu_pgd].gpgdir + index * sizeof(pgd_t); -} - -#ifdef CONFIG_X86_PAE -/* Follow the PGD to the PMD. */ -static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr) -{ - unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT; - BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT)); - return gpage + pmd_index(vaddr) * sizeof(pmd_t); -} - -/* Follow the PMD to the PTE. */ -static unsigned long gpte_addr(struct lg_cpu *cpu, - pmd_t gpmd, unsigned long vaddr) -{ - unsigned long gpage = pmd_pfn(gpmd) << PAGE_SHIFT; - - BUG_ON(!(pmd_flags(gpmd) & _PAGE_PRESENT)); - return gpage + pte_index(vaddr) * sizeof(pte_t); -} -#else -/* Follow the PGD to the PTE (no mid-level for !PAE). */ -static unsigned long gpte_addr(struct lg_cpu *cpu, - pgd_t gpgd, unsigned long vaddr) -{ - unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT; - - BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT)); - return gpage + pte_index(vaddr) * sizeof(pte_t); -} -#endif -/*:*/ - -/*M:007 - * get_pfn is slow: we could probably try to grab batches of pages here as - * an optimization (ie. pre-faulting). -:*/ - -/*H:350 - * This routine takes a page number given by the Guest and converts it to - * an actual, physical page number. It can fail for several reasons: the - * virtual address might not be mapped by the Launcher, the write flag is set - * and the page is read-only, or the write flag was set and the page was - * shared so had to be copied, but we ran out of memory. - * - * This holds a reference to the page, so release_pte() is careful to put that - * back. - */ -static unsigned long get_pfn(unsigned long virtpfn, int write) -{ - struct page *page; - - /* gup me one page at this address please! */ - if (get_user_pages_fast(virtpfn << PAGE_SHIFT, 1, write, &page) == 1) - return page_to_pfn(page); - - /* This value indicates failure. */ - return -1UL; -} - -/*H:340 - * Converting a Guest page table entry to a shadow (ie. real) page table - * entry can be a little tricky. The flags are (almost) the same, but the - * Guest PTE contains a virtual page number: the CPU needs the real page - * number. - */ -static pte_t gpte_to_spte(struct lg_cpu *cpu, pte_t gpte, int write) -{ - unsigned long pfn, base, flags; - - /* - * The Guest sets the global flag, because it thinks that it is using - * PGE. We only told it to use PGE so it would tell us whether it was - * flushing a kernel mapping or a userspace mapping. We don't actually - * use the global bit, so throw it away. - */ - flags = (pte_flags(gpte) & ~_PAGE_GLOBAL); - - /* The Guest's pages are offset inside the Launcher. */ - base = (unsigned long)cpu->lg->mem_base / PAGE_SIZE; - - /* - * We need a temporary "unsigned long" variable to hold the answer from - * get_pfn(), because it returns 0xFFFFFFFF on failure, which wouldn't - * fit in spte.pfn. get_pfn() finds the real physical number of the - * page, given the virtual number. - */ - pfn = get_pfn(base + pte_pfn(gpte), write); - if (pfn == -1UL) { - kill_guest(cpu, "failed to get page %lu", pte_pfn(gpte)); - /* - * When we destroy the Guest, we'll go through the shadow page - * tables and release_pte() them. Make sure we don't think - * this one is valid! - */ - flags = 0; - } - /* Now we assemble our shadow PTE from the page number and flags. */ - return pfn_pte(pfn, __pgprot(flags)); -} - -/*H:460 And to complete the chain, release_pte() looks like this: */ -static void release_pte(pte_t pte) -{ - /* - * Remember that get_user_pages_fast() took a reference to the page, in - * get_pfn()? We have to put it back now. - */ - if (pte_flags(pte) & _PAGE_PRESENT) - put_page(pte_page(pte)); -} -/*:*/ - -static bool gpte_in_iomem(struct lg_cpu *cpu, pte_t gpte) -{ - /* We don't handle large pages. */ - if (pte_flags(gpte) & _PAGE_PSE) - return false; - - return (pte_pfn(gpte) >= cpu->lg->pfn_limit - && pte_pfn(gpte) < cpu->lg->device_limit); -} - -static bool check_gpte(struct lg_cpu *cpu, pte_t gpte) -{ - if ((pte_flags(gpte) & _PAGE_PSE) || - pte_pfn(gpte) >= cpu->lg->pfn_limit) { - kill_guest(cpu, "bad page table entry"); - return false; - } - return true; -} - -static bool check_gpgd(struct lg_cpu *cpu, pgd_t gpgd) -{ - if ((pgd_flags(gpgd) & ~CHECK_GPGD_MASK) || - (pgd_pfn(gpgd) >= cpu->lg->pfn_limit)) { - kill_guest(cpu, "bad page directory entry"); - return false; - } - return true; -} - -#ifdef CONFIG_X86_PAE -static bool check_gpmd(struct lg_cpu *cpu, pmd_t gpmd) -{ - if ((pmd_flags(gpmd) & ~_PAGE_TABLE) || - (pmd_pfn(gpmd) >= cpu->lg->pfn_limit)) { - kill_guest(cpu, "bad page middle directory entry"); - return false; - } - return true; -} -#endif - -/*H:331 - * This is the core routine to walk the shadow page tables and find the page - * table entry for a specific address. - * - * If allocate is set, then we allocate any missing levels, setting the flags - * on the new page directory and mid-level directories using the arguments - * (which are copied from the Guest's page table entries). - */ -static pte_t *find_spte(struct lg_cpu *cpu, unsigned long vaddr, bool allocate, - int pgd_flags, int pmd_flags) -{ - pgd_t *spgd; - /* Mid level for PAE. */ -#ifdef CONFIG_X86_PAE - pmd_t *spmd; -#endif - - /* Get top level entry. */ - spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr); - if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) { - /* No shadow entry: allocate a new shadow PTE page. */ - unsigned long ptepage; - - /* If they didn't want us to allocate anything, stop. */ - if (!allocate) - return NULL; - - ptepage = get_zeroed_page(GFP_KERNEL); - /* - * This is not really the Guest's fault, but killing it is - * simple for this corner case. - */ - if (!ptepage) { - kill_guest(cpu, "out of memory allocating pte page"); - return NULL; - } - /* - * And we copy the flags to the shadow PGD entry. The page - * number in the shadow PGD is the page we just allocated. - */ - set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags)); - } - - /* - * Intel's Physical Address Extension actually uses three levels of - * page tables, so we need to look in the mid-level. - */ -#ifdef CONFIG_X86_PAE - /* Now look at the mid-level shadow entry. */ - spmd = spmd_addr(cpu, *spgd, vaddr); - - if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) { - /* No shadow entry: allocate a new shadow PTE page. */ - unsigned long ptepage; - - /* If they didn't want us to allocate anything, stop. */ - if (!allocate) - return NULL; - - ptepage = get_zeroed_page(GFP_KERNEL); - - /* - * This is not really the Guest's fault, but killing it is - * simple for this corner case. - */ - if (!ptepage) { - kill_guest(cpu, "out of memory allocating pmd page"); - return NULL; - } - - /* - * And we copy the flags to the shadow PMD entry. The page - * number in the shadow PMD is the page we just allocated. - */ - set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags)); - } -#endif - - /* Get the pointer to the shadow PTE entry we're going to set. */ - return spte_addr(cpu, *spgd, vaddr); -} - -/*H:330 - * (i) Looking up a page table entry when the Guest faults. - * - * We saw this call in run_guest(): when we see a page fault in the Guest, we - * come here. That's because we only set up the shadow page tables lazily as - * they're needed, so we get page faults all the time and quietly fix them up - * and return to the Guest without it knowing. - * - * If we fixed up the fault (ie. we mapped the address), this routine returns - * true. Otherwise, it was a real fault and we need to tell the Guest. - * - * There's a corner case: they're trying to access memory between - * pfn_limit and device_limit, which is I/O memory. In this case, we - * return false and set @iomem to the physical address, so the the - * Launcher can handle the instruction manually. - */ -bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode, - unsigned long *iomem) -{ - unsigned long gpte_ptr; - pte_t gpte; - pte_t *spte; - pmd_t gpmd; - pgd_t gpgd; - - *iomem = 0; - - /* We never demand page the Switcher, so trying is a mistake. */ - if (vaddr >= switcher_addr) - return false; - - /* First step: get the top-level Guest page table entry. */ - if (unlikely(cpu->linear_pages)) { - /* Faking up a linear mapping. */ - gpgd = __pgd(CHECK_GPGD_MASK); - } else { - gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); - /* Toplevel not present? We can't map it in. */ - if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) - return false; - - /* - * This kills the Guest if it has weird flags or tries to - * refer to a "physical" address outside the bounds. - */ - if (!check_gpgd(cpu, gpgd)) - return false; - } - - /* This "mid-level" entry is only used for non-linear, PAE mode. */ - gpmd = __pmd(_PAGE_TABLE); - -#ifdef CONFIG_X86_PAE - if (likely(!cpu->linear_pages)) { - gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); - /* Middle level not present? We can't map it in. */ - if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) - return false; - - /* - * This kills the Guest if it has weird flags or tries to - * refer to a "physical" address outside the bounds. - */ - if (!check_gpmd(cpu, gpmd)) - return false; - } - - /* - * OK, now we look at the lower level in the Guest page table: keep its - * address, because we might update it later. - */ - gpte_ptr = gpte_addr(cpu, gpmd, vaddr); -#else - /* - * OK, now we look at the lower level in the Guest page table: keep its - * address, because we might update it later. - */ - gpte_ptr = gpte_addr(cpu, gpgd, vaddr); -#endif - - if (unlikely(cpu->linear_pages)) { - /* Linear? Make up a PTE which points to same page. */ - gpte = __pte((vaddr & PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT); - } else { - /* Read the actual PTE value. */ - gpte = lgread(cpu, gpte_ptr, pte_t); - } - - /* If this page isn't in the Guest page tables, we can't page it in. */ - if (!(pte_flags(gpte) & _PAGE_PRESENT)) - return false; - - /* - * Check they're not trying to write to a page the Guest wants - * read-only (bit 2 of errcode == write). - */ - if ((errcode & 2) && !(pte_flags(gpte) & _PAGE_RW)) - return false; - - /* User access to a kernel-only page? (bit 3 == user access) */ - if ((errcode & 4) && !(pte_flags(gpte) & _PAGE_USER)) - return false; - - /* If they're accessing io memory, we expect a fault. */ - if (gpte_in_iomem(cpu, gpte)) { - *iomem = (pte_pfn(gpte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK); - return false; - } - - /* - * Check that the Guest PTE flags are OK, and the page number is below - * the pfn_limit (ie. not mapping the Launcher binary). - */ - if (!check_gpte(cpu, gpte)) - return false; - - /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */ - gpte = pte_mkyoung(gpte); - if (errcode & 2) - gpte = pte_mkdirty(gpte); - - /* Get the pointer to the shadow PTE entry we're going to set. */ - spte = find_spte(cpu, vaddr, true, pgd_flags(gpgd), pmd_flags(gpmd)); - if (!spte) - return false; - - /* - * If there was a valid shadow PTE entry here before, we release it. - * This can happen with a write to a previously read-only entry. - */ - release_pte(*spte); - - /* - * If this is a write, we insist that the Guest page is writable (the - * final arg to gpte_to_spte()). - */ - if (pte_dirty(gpte)) - *spte = gpte_to_spte(cpu, gpte, 1); - else - /* - * If this is a read, don't set the "writable" bit in the page - * table entry, even if the Guest says it's writable. That way - * we will come back here when a write does actually occur, so - * we can update the Guest's _PAGE_DIRTY flag. - */ - set_pte(spte, gpte_to_spte(cpu, pte_wrprotect(gpte), 0)); - - /* - * Finally, we write the Guest PTE entry back: we've set the - * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. - */ - if (likely(!cpu->linear_pages)) - lgwrite(cpu, gpte_ptr, pte_t, gpte); - - /* - * The fault is fixed, the page table is populated, the mapping - * manipulated, the result returned and the code complete. A small - * delay and a trace of alliteration are the only indications the Guest - * has that a page fault occurred at all. - */ - return true; -} - -/*H:360 - * (ii) Making sure the Guest stack is mapped. - * - * Remember that direct traps into the Guest need a mapped Guest kernel stack. - * pin_stack_pages() calls us here: we could simply call demand_page(), but as - * we've seen that logic is quite long, and usually the stack pages are already - * mapped, so it's overkill. - * - * This is a quick version which answers the question: is this virtual address - * mapped by the shadow page tables, and is it writable? - */ -static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr) -{ - pte_t *spte; - unsigned long flags; - - /* You can't put your stack in the Switcher! */ - if (vaddr >= switcher_addr) - return false; - - /* If there's no shadow PTE, it's not writable. */ - spte = find_spte(cpu, vaddr, false, 0, 0); - if (!spte) - return false; - - /* - * Check the flags on the pte entry itself: it must be present and - * writable. - */ - flags = pte_flags(*spte); - return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW); -} - -/* - * So, when pin_stack_pages() asks us to pin a page, we check if it's already - * in the page tables, and if not, we call demand_page() with error code 2 - * (meaning "write"). - */ -void pin_page(struct lg_cpu *cpu, unsigned long vaddr) -{ - unsigned long iomem; - - if (!page_writable(cpu, vaddr) && !demand_page(cpu, vaddr, 2, &iomem)) - kill_guest(cpu, "bad stack page %#lx", vaddr); -} -/*:*/ - -#ifdef CONFIG_X86_PAE -static void release_pmd(pmd_t *spmd) -{ - /* If the entry's not present, there's nothing to release. */ - if (pmd_flags(*spmd) & _PAGE_PRESENT) { - unsigned int i; - pte_t *ptepage = __va(pmd_pfn(*spmd) << PAGE_SHIFT); - /* For each entry in the page, we might need to release it. */ - for (i = 0; i < PTRS_PER_PTE; i++) - release_pte(ptepage[i]); - /* Now we can free the page of PTEs */ - free_page((long)ptepage); - /* And zero out the PMD entry so we never release it twice. */ - set_pmd(spmd, __pmd(0)); - } -} - -static void release_pgd(pgd_t *spgd) -{ - /* If the entry's not present, there's nothing to release. */ - if (pgd_flags(*spgd) & _PAGE_PRESENT) { - unsigned int i; - pmd_t *pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT); - - for (i = 0; i < PTRS_PER_PMD; i++) - release_pmd(&pmdpage[i]); - - /* Now we can free the page of PMDs */ - free_page((long)pmdpage); - /* And zero out the PGD entry so we never release it twice. */ - set_pgd(spgd, __pgd(0)); - } -} - -#else /* !CONFIG_X86_PAE */ -/*H:450 - * If we chase down the release_pgd() code, the non-PAE version looks like - * this. The PAE version is almost identical, but instead of calling - * release_pte it calls release_pmd(), which looks much like this. - */ -static void release_pgd(pgd_t *spgd) -{ - /* If the entry's not present, there's nothing to release. */ - if (pgd_flags(*spgd) & _PAGE_PRESENT) { - unsigned int i; - /* - * Converting the pfn to find the actual PTE page is easy: turn - * the page number into a physical address, then convert to a - * virtual address (easy for kernel pages like this one). - */ - pte_t *ptepage = __va(pgd_pfn(*spgd) << PAGE_SHIFT); - /* For each entry in the page, we might need to release it. */ - for (i = 0; i < PTRS_PER_PTE; i++) - release_pte(ptepage[i]); - /* Now we can free the page of PTEs */ - free_page((long)ptepage); - /* And zero out the PGD entry so we never release it twice. */ - *spgd = __pgd(0); - } -} -#endif - -/*H:445 - * We saw flush_user_mappings() twice: once from the flush_user_mappings() - * hypercall and once in new_pgdir() when we re-used a top-level pgdir page. - * It simply releases every PTE page from 0 up to the Guest's kernel address. - */ -static void flush_user_mappings(struct lguest *lg, int idx) -{ - unsigned int i; - /* Release every pgd entry up to the kernel's address. */ - for (i = 0; i < pgd_index(lg->kernel_address); i++) - release_pgd(lg->pgdirs[idx].pgdir + i); -} - -/*H:440 - * (v) Flushing (throwing away) page tables, - * - * The Guest has a hypercall to throw away the page tables: it's used when a - * large number of mappings have been changed. - */ -void guest_pagetable_flush_user(struct lg_cpu *cpu) -{ - /* Drop the userspace part of the current page table. */ - flush_user_mappings(cpu->lg, cpu->cpu_pgd); -} -/*:*/ - -/* We walk down the guest page tables to get a guest-physical address */ -bool __guest_pa(struct lg_cpu *cpu, unsigned long vaddr, unsigned long *paddr) -{ - pgd_t gpgd; - pte_t gpte; -#ifdef CONFIG_X86_PAE - pmd_t gpmd; -#endif - - /* Still not set up? Just map 1:1. */ - if (unlikely(cpu->linear_pages)) { - *paddr = vaddr; - return true; - } - - /* First step: get the top-level Guest page table entry. */ - gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); - /* Toplevel not present? We can't map it in. */ - if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) - goto fail; - -#ifdef CONFIG_X86_PAE - gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); - if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) - goto fail; - gpte = lgread(cpu, gpte_addr(cpu, gpmd, vaddr), pte_t); -#else - gpte = lgread(cpu, gpte_addr(cpu, gpgd, vaddr), pte_t); -#endif - if (!(pte_flags(gpte) & _PAGE_PRESENT)) - goto fail; - - *paddr = pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK); - return true; - -fail: - *paddr = -1UL; - return false; -} - -/* - * This is the version we normally use: kills the Guest if it uses a - * bad address - */ -unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr) -{ - unsigned long paddr; - - if (!__guest_pa(cpu, vaddr, &paddr)) - kill_guest(cpu, "Bad address %#lx", vaddr); - return paddr; -} - -/* - * We keep several page tables. This is a simple routine to find the page - * table (if any) corresponding to this top-level address the Guest has given - * us. - */ -static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable) -{ - unsigned int i; - for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) - if (lg->pgdirs[i].pgdir && lg->pgdirs[i].gpgdir == pgtable) - break; - return i; -} - -/*H:435 - * And this is us, creating the new page directory. If we really do - * allocate a new one (and so the kernel parts are not there), we set - * blank_pgdir. - */ -static unsigned int new_pgdir(struct lg_cpu *cpu, - unsigned long gpgdir, - int *blank_pgdir) -{ - unsigned int next; - - /* - * We pick one entry at random to throw out. Choosing the Least - * Recently Used might be better, but this is easy. - */ - next = prandom_u32() % ARRAY_SIZE(cpu->lg->pgdirs); - /* If it's never been allocated at all before, try now. */ - if (!cpu->lg->pgdirs[next].pgdir) { - cpu->lg->pgdirs[next].pgdir = - (pgd_t *)get_zeroed_page(GFP_KERNEL); - /* If the allocation fails, just keep using the one we have */ - if (!cpu->lg->pgdirs[next].pgdir) - next = cpu->cpu_pgd; - else { - /* - * This is a blank page, so there are no kernel - * mappings: caller must map the stack! - */ - *blank_pgdir = 1; - } - } - /* Record which Guest toplevel this shadows. */ - cpu->lg->pgdirs[next].gpgdir = gpgdir; - /* Release all the non-kernel mappings. */ - flush_user_mappings(cpu->lg, next); - - /* This hasn't run on any CPU at all. */ - cpu->lg->pgdirs[next].last_host_cpu = -1; - - return next; -} - -/*H:501 - * We do need the Switcher code mapped at all times, so we allocate that - * part of the Guest page table here. We map the Switcher code immediately, - * but defer mapping of the guest register page and IDT/LDT etc page until - * just before we run the guest in map_switcher_in_guest(). - * - * We *could* do this setup in map_switcher_in_guest(), but at that point - * we've interrupts disabled, and allocating pages like that is fraught: we - * can't sleep if we need to free up some memory. - */ -static bool allocate_switcher_mapping(struct lg_cpu *cpu) -{ - int i; - - for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) { - pte_t *pte = find_spte(cpu, switcher_addr + i * PAGE_SIZE, true, - CHECK_GPGD_MASK, _PAGE_TABLE); - if (!pte) - return false; - - /* - * Map the switcher page if not already there. It might - * already be there because we call allocate_switcher_mapping() - * in guest_set_pgd() just in case it did discard our Switcher - * mapping, but it probably didn't. - */ - if (i == 0 && !(pte_flags(*pte) & _PAGE_PRESENT)) { - /* Get a reference to the Switcher page. */ - get_page(lg_switcher_pages[0]); - /* Create a read-only, exectuable, kernel-style PTE */ - set_pte(pte, - mk_pte(lg_switcher_pages[0], PAGE_KERNEL_RX)); - } - } - cpu->lg->pgdirs[cpu->cpu_pgd].switcher_mapped = true; - return true; -} - -/*H:470 - * Finally, a routine which throws away everything: all PGD entries in all - * the shadow page tables, including the Guest's kernel mappings. This is used - * when we destroy the Guest. - */ -static void release_all_pagetables(struct lguest *lg) -{ - unsigned int i, j; - - /* Every shadow pagetable this Guest has */ - for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) { - if (!lg->pgdirs[i].pgdir) - continue; - - /* Every PGD entry. */ - for (j = 0; j < PTRS_PER_PGD; j++) - release_pgd(lg->pgdirs[i].pgdir + j); - lg->pgdirs[i].switcher_mapped = false; - lg->pgdirs[i].last_host_cpu = -1; - } -} - -/* - * We also throw away everything when a Guest tells us it's changed a kernel - * mapping. Since kernel mappings are in every page table, it's easiest to - * throw them all away. This traps the Guest in amber for a while as - * everything faults back in, but it's rare. - */ -void guest_pagetable_clear_all(struct lg_cpu *cpu) -{ - release_all_pagetables(cpu->lg); - /* We need the Guest kernel stack mapped again. */ - pin_stack_pages(cpu); - /* And we need Switcher allocated. */ - if (!allocate_switcher_mapping(cpu)) - kill_guest(cpu, "Cannot populate switcher mapping"); -} - -/*H:430 - * (iv) Switching page tables - * - * Now we've seen all the page table setting and manipulation, let's see - * what happens when the Guest changes page tables (ie. changes the top-level - * pgdir). This occurs on almost every context switch. - */ -void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable) -{ - int newpgdir, repin = 0; - - /* - * The very first time they call this, we're actually running without - * any page tables; we've been making it up. Throw them away now. - */ - if (unlikely(cpu->linear_pages)) { - release_all_pagetables(cpu->lg); - cpu->linear_pages = false; - /* Force allocation of a new pgdir. */ - newpgdir = ARRAY_SIZE(cpu->lg->pgdirs); - } else { - /* Look to see if we have this one already. */ - newpgdir = find_pgdir(cpu->lg, pgtable); - } - - /* - * If not, we allocate or mug an existing one: if it's a fresh one, - * repin gets set to 1. - */ - if (newpgdir == ARRAY_SIZE(cpu->lg->pgdirs)) - newpgdir = new_pgdir(cpu, pgtable, &repin); - /* Change the current pgd index to the new one. */ - cpu->cpu_pgd = newpgdir; - /* - * If it was completely blank, we map in the Guest kernel stack and - * the Switcher. - */ - if (repin) - pin_stack_pages(cpu); - - if (!cpu->lg->pgdirs[cpu->cpu_pgd].switcher_mapped) { - if (!allocate_switcher_mapping(cpu)) - kill_guest(cpu, "Cannot populate switcher mapping"); - } -} -/*:*/ - -/*M:009 - * Since we throw away all mappings when a kernel mapping changes, our - * performance sucks for guests using highmem. In fact, a guest with - * PAGE_OFFSET 0xc0000000 (the default) and more than about 700MB of RAM is - * usually slower than a Guest with less memory. - * - * This, of course, cannot be fixed. It would take some kind of... well, I - * don't know, but the term "puissant code-fu" comes to mind. -:*/ - -/*H:420 - * This is the routine which actually sets the page table entry for then - * "idx"'th shadow page table. - * - * Normally, we can just throw out the old entry and replace it with 0: if they - * use it demand_page() will put the new entry in. We need to do this anyway: - * The Guest expects _PAGE_ACCESSED to be set on its PTE the first time a page - * is read from, and _PAGE_DIRTY when it's written to. - * - * But Avi Kivity pointed out that most Operating Systems (Linux included) set - * these bits on PTEs immediately anyway. This is done to save the CPU from - * having to update them, but it helps us the same way: if they set - * _PAGE_ACCESSED then we can put a read-only PTE entry in immediately, and if - * they set _PAGE_DIRTY then we can put a writable PTE entry in immediately. - */ -static void __guest_set_pte(struct lg_cpu *cpu, int idx, - unsigned long vaddr, pte_t gpte) -{ - /* Look up the matching shadow page directory entry. */ - pgd_t *spgd = spgd_addr(cpu, idx, vaddr); -#ifdef CONFIG_X86_PAE - pmd_t *spmd; -#endif - - /* If the top level isn't present, there's no entry to update. */ - if (pgd_flags(*spgd) & _PAGE_PRESENT) { -#ifdef CONFIG_X86_PAE - spmd = spmd_addr(cpu, *spgd, vaddr); - if (pmd_flags(*spmd) & _PAGE_PRESENT) { -#endif - /* Otherwise, start by releasing the existing entry. */ - pte_t *spte = spte_addr(cpu, *spgd, vaddr); - release_pte(*spte); - - /* - * If they're setting this entry as dirty or accessed, - * we might as well put that entry they've given us in - * now. This shaves 10% off a copy-on-write - * micro-benchmark. - */ - if ((pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) - && !gpte_in_iomem(cpu, gpte)) { - if (!check_gpte(cpu, gpte)) - return; - set_pte(spte, - gpte_to_spte(cpu, gpte, - pte_flags(gpte) & _PAGE_DIRTY)); - } else { - /* - * Otherwise kill it and we can demand_page() - * it in later. - */ - set_pte(spte, __pte(0)); - } -#ifdef CONFIG_X86_PAE - } -#endif - } -} - -/*H:410 - * Updating a PTE entry is a little trickier. - * - * We keep track of several different page tables (the Guest uses one for each - * process, so it makes sense to cache at least a few). Each of these have - * identical kernel parts: ie. every mapping above PAGE_OFFSET is the same for - * all processes. So when the page table above that address changes, we update - * all the page tables, not just the current one. This is rare. - * - * The benefit is that when we have to track a new page table, we can keep all - * the kernel mappings. This speeds up context switch immensely. - */ -void guest_set_pte(struct lg_cpu *cpu, - unsigned long gpgdir, unsigned long vaddr, pte_t gpte) -{ - /* We don't let you remap the Switcher; we need it to get back! */ - if (vaddr >= switcher_addr) { - kill_guest(cpu, "attempt to set pte into Switcher pages"); - return; - } - - /* - * Kernel mappings must be changed on all top levels. Slow, but doesn't - * happen often. - */ - if (vaddr >= cpu->lg->kernel_address) { - unsigned int i; - for (i = 0; i < ARRAY_SIZE(cpu->lg->pgdirs); i++) - if (cpu->lg->pgdirs[i].pgdir) - __guest_set_pte(cpu, i, vaddr, gpte); - } else { - /* Is this page table one we have a shadow for? */ - int pgdir = find_pgdir(cpu->lg, gpgdir); - if (pgdir != ARRAY_SIZE(cpu->lg->pgdirs)) - /* If so, do the update. */ - __guest_set_pte(cpu, pgdir, vaddr, gpte); - } -} - -/*H:400 - * (iii) Setting up a page table entry when the Guest tells us one has changed. - * - * Just like we did in interrupts_and_traps.c, it makes sense for us to deal - * with the other side of page tables while we're here: what happens when the - * Guest asks for a page table to be updated? - * - * We already saw that demand_page() will fill in the shadow page tables when - * needed, so we can simply remove shadow page table entries whenever the Guest - * tells us they've changed. When the Guest tries to use the new entry it will - * fault and demand_page() will fix it up. - * - * So with that in mind here's our code to update a (top-level) PGD entry: - */ -void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx) -{ - int pgdir; - - if (idx > PTRS_PER_PGD) { - kill_guest(&lg->cpus[0], "Attempt to set pgd %u/%u", - idx, PTRS_PER_PGD); - return; - } - - /* If they're talking about a page table we have a shadow for... */ - pgdir = find_pgdir(lg, gpgdir); - if (pgdir < ARRAY_SIZE(lg->pgdirs)) { - /* ... throw it away. */ - release_pgd(lg->pgdirs[pgdir].pgdir + idx); - /* That might have been the Switcher mapping, remap it. */ - if (!allocate_switcher_mapping(&lg->cpus[0])) { - kill_guest(&lg->cpus[0], - "Cannot populate switcher mapping"); - } - lg->pgdirs[pgdir].last_host_cpu = -1; - } -} - -#ifdef CONFIG_X86_PAE -/* For setting a mid-level, we just throw everything away. It's easy. */ -void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx) -{ - guest_pagetable_clear_all(&lg->cpus[0]); -} -#endif - -/*H:500 - * (vii) Setting up the page tables initially. - * - * When a Guest is first created, set initialize a shadow page table which - * we will populate on future faults. The Guest doesn't have any actual - * pagetables yet, so we set linear_pages to tell demand_page() to fake it - * for the moment. - * - * We do need the Switcher to be mapped at all times, so we allocate that - * part of the Guest page table here. - */ -int init_guest_pagetable(struct lguest *lg) -{ - struct lg_cpu *cpu = &lg->cpus[0]; - int allocated = 0; - - /* lg (and lg->cpus[]) starts zeroed: this allocates a new pgdir */ - cpu->cpu_pgd = new_pgdir(cpu, 0, &allocated); - if (!allocated) - return -ENOMEM; - - /* We start with a linear mapping until the initialize. */ - cpu->linear_pages = true; - - /* Allocate the page tables for the Switcher. */ - if (!allocate_switcher_mapping(cpu)) { - release_all_pagetables(lg); - return -ENOMEM; - } - - return 0; -} - -/*H:508 When the Guest calls LHCALL_LGUEST_INIT we do more setup. */ -void page_table_guest_data_init(struct lg_cpu *cpu) -{ - /* - * We tell the Guest that it can't use the virtual addresses - * used by the Switcher. This trick is equivalent to 4GB - - * switcher_addr. - */ - u32 top = ~switcher_addr + 1; - - /* We get the kernel address: above this is all kernel memory. */ - if (get_user(cpu->lg->kernel_address, - &cpu->lg->lguest_data->kernel_address) - /* - * We tell the Guest that it can't use the top virtual - * addresses (used by the Switcher). - */ - || put_user(top, &cpu->lg->lguest_data->reserve_mem)) { - kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); - return; - } - - /* - * In flush_user_mappings() we loop from 0 to - * "pgd_index(lg->kernel_address)". This assumes it won't hit the - * Switcher mappings, so check that now. - */ - if (cpu->lg->kernel_address >= switcher_addr) - kill_guest(cpu, "bad kernel address %#lx", - cpu->lg->kernel_address); -} - -/* When a Guest dies, our cleanup is fairly simple. */ -void free_guest_pagetable(struct lguest *lg) -{ - unsigned int i; - - /* Throw away all page table pages. */ - release_all_pagetables(lg); - /* Now free the top levels: free_page() can handle 0 just fine. */ - for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) - free_page((long)lg->pgdirs[i].pgdir); -} - -/*H:481 - * This clears the Switcher mappings for cpu #i. - */ -static void remove_switcher_percpu_map(struct lg_cpu *cpu, unsigned int i) -{ - unsigned long base = switcher_addr + PAGE_SIZE + i * PAGE_SIZE*2; - pte_t *pte; - - /* Clear the mappings for both pages. */ - pte = find_spte(cpu, base, false, 0, 0); - release_pte(*pte); - set_pte(pte, __pte(0)); - - pte = find_spte(cpu, base + PAGE_SIZE, false, 0, 0); - release_pte(*pte); - set_pte(pte, __pte(0)); -} - -/*H:480 - * (vi) Mapping the Switcher when the Guest is about to run. - * - * The Switcher and the two pages for this CPU need to be visible in the Guest - * (and not the pages for other CPUs). - * - * The pages for the pagetables have all been allocated before: we just need - * to make sure the actual PTEs are up-to-date for the CPU we're about to run - * on. - */ -void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) -{ - unsigned long base; - struct page *percpu_switcher_page, *regs_page; - pte_t *pte; - struct pgdir *pgdir = &cpu->lg->pgdirs[cpu->cpu_pgd]; - - /* Switcher page should always be mapped by now! */ - BUG_ON(!pgdir->switcher_mapped); - - /* - * Remember that we have two pages for each Host CPU, so we can run a - * Guest on each CPU without them interfering. We need to make sure - * those pages are mapped correctly in the Guest, but since we usually - * run on the same CPU, we cache that, and only update the mappings - * when we move. - */ - if (pgdir->last_host_cpu == raw_smp_processor_id()) - return; - - /* -1 means unknown so we remove everything. */ - if (pgdir->last_host_cpu == -1) { - unsigned int i; - for_each_possible_cpu(i) - remove_switcher_percpu_map(cpu, i); - } else { - /* We know exactly what CPU mapping to remove. */ - remove_switcher_percpu_map(cpu, pgdir->last_host_cpu); - } - - /* - * When we're running the Guest, we want the Guest's "regs" page to - * appear where the first Switcher page for this CPU is. This is an - * optimization: when the Switcher saves the Guest registers, it saves - * them into the first page of this CPU's "struct lguest_pages": if we - * make sure the Guest's register page is already mapped there, we - * don't have to copy them out again. - */ - /* Find the shadow PTE for this regs page. */ - base = switcher_addr + PAGE_SIZE - + raw_smp_processor_id() * sizeof(struct lguest_pages); - pte = find_spte(cpu, base, false, 0, 0); - regs_page = pfn_to_page(__pa(cpu->regs_page) >> PAGE_SHIFT); - get_page(regs_page); - set_pte(pte, mk_pte(regs_page, __pgprot(__PAGE_KERNEL & ~_PAGE_GLOBAL))); - - /* - * We map the second page of the struct lguest_pages read-only in - * the Guest: the IDT, GDT and other things it's not supposed to - * change. - */ - pte = find_spte(cpu, base + PAGE_SIZE, false, 0, 0); - percpu_switcher_page - = lg_switcher_pages[1 + raw_smp_processor_id()*2 + 1]; - get_page(percpu_switcher_page); - set_pte(pte, mk_pte(percpu_switcher_page, - __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL))); - - pgdir->last_host_cpu = raw_smp_processor_id(); -} - -/*H:490 - * We've made it through the page table code. Perhaps our tired brains are - * still processing the details, or perhaps we're simply glad it's over. - * - * If nothing else, note that all this complexity in juggling shadow page tables - * in sync with the Guest's page tables is for one reason: for most Guests this - * page table dance determines how bad performance will be. This is why Xen - * uses exotic direct Guest pagetable manipulation, and why both Intel and AMD - * have implemented shadow page table support directly into hardware. - * - * There is just one file remaining in the Host. - */ diff --git a/drivers/lguest/segments.c b/drivers/lguest/segments.c deleted file mode 100644 index c4fb424dfddb..000000000000 --- a/drivers/lguest/segments.c +++ /dev/null @@ -1,228 +0,0 @@ -/*P:600 - * The x86 architecture has segments, which involve a table of descriptors - * which can be used to do funky things with virtual address interpretation. - * We originally used to use segments so the Guest couldn't alter the - * Guest<->Host Switcher, and then we had to trim Guest segments, and restore - * for userspace per-thread segments, but trim again for on userspace->kernel - * transitions... This nightmarish creation was contained within this file, - * where we knew not to tread without heavy armament and a change of underwear. - * - * In these modern times, the segment handling code consists of simple sanity - * checks, and the worst you'll experience reading this code is butterfly-rash - * from frolicking through its parklike serenity. -:*/ -#include "lg.h" - -/*H:600 - * Segments & The Global Descriptor Table - * - * (That title sounds like a bad Nerdcore group. Not to suggest that there are - * any good Nerdcore groups, but in high school a friend of mine had a band - * called Joe Fish and the Chips, so there are definitely worse band names). - * - * To refresh: the GDT is a table of 8-byte values describing segments. Once - * set up, these segments can be loaded into one of the 6 "segment registers". - * - * GDT entries are passed around as "struct desc_struct"s, which like IDT - * entries are split into two 32-bit members, "a" and "b". One day, someone - * will clean that up, and be declared a Hero. (No pressure, I'm just saying). - * - * Anyway, the GDT entry contains a base (the start address of the segment), a - * limit (the size of the segment - 1), and some flags. Sounds simple, and it - * would be, except those zany Intel engineers decided that it was too boring - * to put the base at one end, the limit at the other, and the flags in - * between. They decided to shotgun the bits at random throughout the 8 bytes, - * like so: - * - * 0 16 40 48 52 56 63 - * [ limit part 1 ][ base part 1 ][ flags ][li][fl][base ] - * mit ags part 2 - * part 2 - * - * As a result, this file contains a certain amount of magic numeracy. Let's - * begin. - */ - -/* - * There are several entries we don't let the Guest set. The TSS entry is the - * "Task State Segment" which controls all kinds of delicate things. The - * LGUEST_CS and LGUEST_DS entries are reserved for the Switcher, and the - * the Guest can't be trusted to deal with double faults. - */ -static bool ignored_gdt(unsigned int num) -{ - return (num == GDT_ENTRY_TSS - || num == GDT_ENTRY_LGUEST_CS - || num == GDT_ENTRY_LGUEST_DS - || num == GDT_ENTRY_DOUBLEFAULT_TSS); -} - -/*H:630 - * Once the Guest gave us new GDT entries, we fix them up a little. We - * don't care if they're invalid: the worst that can happen is a General - * Protection Fault in the Switcher when it restores a Guest segment register - * which tries to use that entry. Then we kill the Guest for causing such a - * mess: the message will be "unhandled trap 256". - */ -static void fixup_gdt_table(struct lg_cpu *cpu, unsigned start, unsigned end) -{ - unsigned int i; - - for (i = start; i < end; i++) { - /* - * We never copy these ones to real GDT, so we don't care what - * they say - */ - if (ignored_gdt(i)) - continue; - - /* - * Segment descriptors contain a privilege level: the Guest is - * sometimes careless and leaves this as 0, even though it's - * running at privilege level 1. If so, we fix it here. - */ - if (cpu->arch.gdt[i].dpl == 0) - cpu->arch.gdt[i].dpl |= GUEST_PL; - - /* - * Each descriptor has an "accessed" bit. If we don't set it - * now, the CPU will try to set it when the Guest first loads - * that entry into a segment register. But the GDT isn't - * writable by the Guest, so bad things can happen. - */ - cpu->arch.gdt[i].type |= 0x1; - } -} - -/*H:610 - * Like the IDT, we never simply use the GDT the Guest gives us. We keep - * a GDT for each CPU, and copy across the Guest's entries each time we want to - * run the Guest on that CPU. - * - * This routine is called at boot or modprobe time for each CPU to set up the - * constant GDT entries: the ones which are the same no matter what Guest we're - * running. - */ -void setup_default_gdt_entries(struct lguest_ro_state *state) -{ - struct desc_struct *gdt = state->guest_gdt; - unsigned long tss = (unsigned long)&state->guest_tss; - - /* The Switcher segments are full 0-4G segments, privilege level 0 */ - gdt[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT; - gdt[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT; - - /* - * The TSS segment refers to the TSS entry for this particular CPU. - */ - gdt[GDT_ENTRY_TSS].a = 0; - gdt[GDT_ENTRY_TSS].b = 0; - - gdt[GDT_ENTRY_TSS].limit0 = 0x67; - gdt[GDT_ENTRY_TSS].base0 = tss & 0xFFFF; - gdt[GDT_ENTRY_TSS].base1 = (tss >> 16) & 0xFF; - gdt[GDT_ENTRY_TSS].base2 = tss >> 24; - gdt[GDT_ENTRY_TSS].type = 0x9; /* 32-bit TSS (available) */ - gdt[GDT_ENTRY_TSS].p = 0x1; /* Entry is present */ - gdt[GDT_ENTRY_TSS].dpl = 0x0; /* Privilege level 0 */ - gdt[GDT_ENTRY_TSS].s = 0x0; /* system segment */ - -} - -/* - * This routine sets up the initial Guest GDT for booting. All entries start - * as 0 (unusable). - */ -void setup_guest_gdt(struct lg_cpu *cpu) -{ - /* - * Start with full 0-4G segments...except the Guest is allowed to use - * them, so set the privilege level appropriately in the flags. - */ - cpu->arch.gdt[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT; - cpu->arch.gdt[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT; - cpu->arch.gdt[GDT_ENTRY_KERNEL_CS].dpl |= GUEST_PL; - cpu->arch.gdt[GDT_ENTRY_KERNEL_DS].dpl |= GUEST_PL; -} - -/*H:650 - * An optimization of copy_gdt(), for just the three "thead-local storage" - * entries. - */ -void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt) -{ - unsigned int i; - - for (i = GDT_ENTRY_TLS_MIN; i <= GDT_ENTRY_TLS_MAX; i++) - gdt[i] = cpu->arch.gdt[i]; -} - -/*H:640 - * When the Guest is run on a different CPU, or the GDT entries have changed, - * copy_gdt() is called to copy the Guest's GDT entries across to this CPU's - * GDT. - */ -void copy_gdt(const struct lg_cpu *cpu, struct desc_struct *gdt) -{ - unsigned int i; - - /* - * The default entries from setup_default_gdt_entries() are not - * replaced. See ignored_gdt() above. - */ - for (i = 0; i < GDT_ENTRIES; i++) - if (!ignored_gdt(i)) - gdt[i] = cpu->arch.gdt[i]; -} - -/*H:620 - * This is where the Guest asks us to load a new GDT entry - * (LHCALL_LOAD_GDT_ENTRY). We tweak the entry and copy it in. - */ -void load_guest_gdt_entry(struct lg_cpu *cpu, u32 num, u32 lo, u32 hi) -{ - /* - * We assume the Guest has the same number of GDT entries as the - * Host, otherwise we'd have to dynamically allocate the Guest GDT. - */ - if (num >= ARRAY_SIZE(cpu->arch.gdt)) { - kill_guest(cpu, "too many gdt entries %i", num); - return; - } - - /* Set it up, then fix it. */ - cpu->arch.gdt[num].a = lo; - cpu->arch.gdt[num].b = hi; - fixup_gdt_table(cpu, num, num+1); - /* - * Mark that the GDT changed so the core knows it has to copy it again, - * even if the Guest is run on the same CPU. - */ - cpu->changed |= CHANGED_GDT; -} - -/* - * This is the fast-track version for just changing the three TLS entries. - * Remember that this happens on every context switch, so it's worth - * optimizing. But wouldn't it be neater to have a single hypercall to cover - * both cases? - */ -void guest_load_tls(struct lg_cpu *cpu, unsigned long gtls) -{ - struct desc_struct *tls = &cpu->arch.gdt[GDT_ENTRY_TLS_MIN]; - - __lgread(cpu, tls, gtls, sizeof(*tls)*GDT_ENTRY_TLS_ENTRIES); - fixup_gdt_table(cpu, GDT_ENTRY_TLS_MIN, GDT_ENTRY_TLS_MAX+1); - /* Note that just the TLS entries have changed. */ - cpu->changed |= CHANGED_GDT_TLS; -} - -/*H:660 - * With this, we have finished the Host. - * - * Five of the seven parts of our task are complete. You have made it through - * the Bit of Despair (I think that's somewhere in the page table code, - * myself). - * - * Next, we examine "make Switcher". It's short, but intense. - */ diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c deleted file mode 100644 index b4f79b923aea..000000000000 --- a/drivers/lguest/x86/core.c +++ /dev/null @@ -1,724 +0,0 @@ -/* - * Copyright (C) 2006, Rusty Russell IBM Corporation. - * Copyright (C) 2007, Jes Sorensen SGI. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ -/*P:450 - * This file contains the x86-specific lguest code. It used to be all - * mixed in with drivers/lguest/core.c but several foolhardy code slashers - * wrestled most of the dependencies out to here in preparation for porting - * lguest to other architectures (see what I mean by foolhardy?). - * - * This also contains a couple of non-obvious setup and teardown pieces which - * were implemented after days of debugging pain. -:*/ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "../lg.h" - -static int cpu_had_pge; - -static struct { - unsigned long offset; - unsigned short segment; -} lguest_entry; - -/* Offset from where switcher.S was compiled to where we've copied it */ -static unsigned long switcher_offset(void) -{ - return switcher_addr - (unsigned long)start_switcher_text; -} - -/* This cpu's struct lguest_pages (after the Switcher text page) */ -static struct lguest_pages *lguest_pages(unsigned int cpu) -{ - return &(((struct lguest_pages *)(switcher_addr + PAGE_SIZE))[cpu]); -} - -static DEFINE_PER_CPU(struct lg_cpu *, lg_last_cpu); - -/*S:010 - * We approach the Switcher. - * - * Remember that each CPU has two pages which are visible to the Guest when it - * runs on that CPU. This has to contain the state for that Guest: we copy the - * state in just before we run the Guest. - * - * Each Guest has "changed" flags which indicate what has changed in the Guest - * since it last ran. We saw this set in interrupts_and_traps.c and - * segments.c. - */ -static void copy_in_guest_info(struct lg_cpu *cpu, struct lguest_pages *pages) -{ - /* - * Copying all this data can be quite expensive. We usually run the - * same Guest we ran last time (and that Guest hasn't run anywhere else - * meanwhile). If that's not the case, we pretend everything in the - * Guest has changed. - */ - if (__this_cpu_read(lg_last_cpu) != cpu || cpu->last_pages != pages) { - __this_cpu_write(lg_last_cpu, cpu); - cpu->last_pages = pages; - cpu->changed = CHANGED_ALL; - } - - /* - * These copies are pretty cheap, so we do them unconditionally: */ - /* Save the current Host top-level page directory. - */ - pages->state.host_cr3 = __pa(current->mm->pgd); - /* - * Set up the Guest's page tables to see this CPU's pages (and no - * other CPU's pages). - */ - map_switcher_in_guest(cpu, pages); - /* - * Set up the two "TSS" members which tell the CPU what stack to use - * for traps which do directly into the Guest (ie. traps at privilege - * level 1). - */ - pages->state.guest_tss.sp1 = cpu->esp1; - pages->state.guest_tss.ss1 = cpu->ss1; - - /* Copy direct-to-Guest trap entries. */ - if (cpu->changed & CHANGED_IDT) - copy_traps(cpu, pages->state.guest_idt, default_idt_entries); - - /* Copy all GDT entries which the Guest can change. */ - if (cpu->changed & CHANGED_GDT) - copy_gdt(cpu, pages->state.guest_gdt); - /* If only the TLS entries have changed, copy them. */ - else if (cpu->changed & CHANGED_GDT_TLS) - copy_gdt_tls(cpu, pages->state.guest_gdt); - - /* Mark the Guest as unchanged for next time. */ - cpu->changed = 0; -} - -/* Finally: the code to actually call into the Switcher to run the Guest. */ -static void run_guest_once(struct lg_cpu *cpu, struct lguest_pages *pages) -{ - /* This is a dummy value we need for GCC's sake. */ - unsigned int clobber; - - /* - * Copy the guest-specific information into this CPU's "struct - * lguest_pages". - */ - copy_in_guest_info(cpu, pages); - - /* - * Set the trap number to 256 (impossible value). If we fault while - * switching to the Guest (bad segment registers or bug), this will - * cause us to abort the Guest. - */ - cpu->regs->trapnum = 256; - - /* - * Now: we push the "eflags" register on the stack, then do an "lcall". - * This is how we change from using the kernel code segment to using - * the dedicated lguest code segment, as well as jumping into the - * Switcher. - * - * The lcall also pushes the old code segment (KERNEL_CS) onto the - * stack, then the address of this call. This stack layout happens to - * exactly match the stack layout created by an interrupt... - */ - asm volatile("pushf; lcall *%4" - /* - * This is how we tell GCC that %eax ("a") and %ebx ("b") - * are changed by this routine. The "=" means output. - */ - : "=a"(clobber), "=b"(clobber) - /* - * %eax contains the pages pointer. ("0" refers to the - * 0-th argument above, ie "a"). %ebx contains the - * physical address of the Guest's top-level page - * directory. - */ - : "0"(pages), - "1"(__pa(cpu->lg->pgdirs[cpu->cpu_pgd].pgdir)), - "m"(lguest_entry) - /* - * We tell gcc that all these registers could change, - * which means we don't have to save and restore them in - * the Switcher. - */ - : "memory", "%edx", "%ecx", "%edi", "%esi"); -} -/*:*/ - -unsigned long *lguest_arch_regptr(struct lg_cpu *cpu, size_t reg_off, bool any) -{ - switch (reg_off) { - case offsetof(struct pt_regs, bx): - return &cpu->regs->ebx; - case offsetof(struct pt_regs, cx): - return &cpu->regs->ecx; - case offsetof(struct pt_regs, dx): - return &cpu->regs->edx; - case offsetof(struct pt_regs, si): - return &cpu->regs->esi; - case offsetof(struct pt_regs, di): - return &cpu->regs->edi; - case offsetof(struct pt_regs, bp): - return &cpu->regs->ebp; - case offsetof(struct pt_regs, ax): - return &cpu->regs->eax; - case offsetof(struct pt_regs, ip): - return &cpu->regs->eip; - case offsetof(struct pt_regs, sp): - return &cpu->regs->esp; - } - - /* Launcher can read these, but we don't allow any setting. */ - if (any) { - switch (reg_off) { - case offsetof(struct pt_regs, ds): - return &cpu->regs->ds; - case offsetof(struct pt_regs, es): - return &cpu->regs->es; - case offsetof(struct pt_regs, fs): - return &cpu->regs->fs; - case offsetof(struct pt_regs, gs): - return &cpu->regs->gs; - case offsetof(struct pt_regs, cs): - return &cpu->regs->cs; - case offsetof(struct pt_regs, flags): - return &cpu->regs->eflags; - case offsetof(struct pt_regs, ss): - return &cpu->regs->ss; - } - } - - return NULL; -} - -/*M:002 - * There are hooks in the scheduler which we can register to tell when we - * get kicked off the CPU (preempt_notifier_register()). This would allow us - * to lazily disable SYSENTER which would regain some performance, and should - * also simplify copy_in_guest_info(). Note that we'd still need to restore - * things when we exit to Launcher userspace, but that's fairly easy. - * - * We could also try using these hooks for PGE, but that might be too expensive. - * - * The hooks were designed for KVM, but we can also put them to good use. -:*/ - -/*H:040 - * This is the i386-specific code to setup and run the Guest. Interrupts - * are disabled: we own the CPU. - */ -void lguest_arch_run_guest(struct lg_cpu *cpu) -{ - /* - * SYSENTER is an optimized way of doing system calls. We can't allow - * it because it always jumps to privilege level 0. A normal Guest - * won't try it because we don't advertise it in CPUID, but a malicious - * Guest (or malicious Guest userspace program) could, so we tell the - * CPU to disable it before running the Guest. - */ - if (boot_cpu_has(X86_FEATURE_SEP)) - wrmsr(MSR_IA32_SYSENTER_CS, 0, 0); - - /* - * Now we actually run the Guest. It will return when something - * interesting happens, and we can examine its registers to see what it - * was doing. - */ - run_guest_once(cpu, lguest_pages(raw_smp_processor_id())); - - /* - * Note that the "regs" structure contains two extra entries which are - * not really registers: a trap number which says what interrupt or - * trap made the switcher code come back, and an error code which some - * traps set. - */ - - /* Restore SYSENTER if it's supposed to be on. */ - if (boot_cpu_has(X86_FEATURE_SEP)) - wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); - - /* - * If the Guest page faulted, then the cr2 register will tell us the - * bad virtual address. We have to grab this now, because once we - * re-enable interrupts an interrupt could fault and thus overwrite - * cr2, or we could even move off to a different CPU. - */ - if (cpu->regs->trapnum == 14) - cpu->arch.last_pagefault = read_cr2(); - /* - * Similarly, if we took a trap because the Guest used the FPU, - * we have to restore the FPU it expects to see. - * fpu__restore() may sleep and we may even move off to - * a different CPU. So all the critical stuff should be done - * before this. - */ - else if (cpu->regs->trapnum == 7 && !fpregs_active()) - fpu__restore(¤t->thread.fpu); -} - -/*H:130 - * Now we've examined the hypercall code; our Guest can make requests. - * Our Guest is usually so well behaved; it never tries to do things it isn't - * allowed to, and uses hypercalls instead. Unfortunately, Linux's paravirtual - * infrastructure isn't quite complete, because it doesn't contain replacements - * for the Intel I/O instructions. As a result, the Guest sometimes fumbles - * across one during the boot process as it probes for various things which are - * usually attached to a PC. - * - * When the Guest uses one of these instructions, we get a trap (General - * Protection Fault) and come here. We queue this to be sent out to the - * Launcher to handle. - */ - -/* - * The eip contains the *virtual* address of the Guest's instruction: - * we copy the instruction here so the Launcher doesn't have to walk - * the page tables to decode it. We handle the case (eg. in a kernel - * module) where the instruction is over two pages, and the pages are - * virtually but not physically contiguous. - * - * The longest possible x86 instruction is 15 bytes, but we don't handle - * anything that strange. - */ -static void copy_from_guest(struct lg_cpu *cpu, - void *dst, unsigned long vaddr, size_t len) -{ - size_t to_page_end = PAGE_SIZE - (vaddr % PAGE_SIZE); - unsigned long paddr; - - BUG_ON(len > PAGE_SIZE); - - /* If it goes over a page, copy in two parts. */ - if (len > to_page_end) { - /* But make sure the next page is mapped! */ - if (__guest_pa(cpu, vaddr + to_page_end, &paddr)) - copy_from_guest(cpu, dst + to_page_end, - vaddr + to_page_end, - len - to_page_end); - else - /* Otherwise fill with zeroes. */ - memset(dst + to_page_end, 0, len - to_page_end); - len = to_page_end; - } - - /* This will kill the guest if it isn't mapped, but that - * shouldn't happen. */ - __lgread(cpu, dst, guest_pa(cpu, vaddr), len); -} - - -static void setup_emulate_insn(struct lg_cpu *cpu) -{ - cpu->pending.trap = 13; - copy_from_guest(cpu, cpu->pending.insn, cpu->regs->eip, - sizeof(cpu->pending.insn)); -} - -static void setup_iomem_insn(struct lg_cpu *cpu, unsigned long iomem_addr) -{ - cpu->pending.trap = 14; - cpu->pending.addr = iomem_addr; - copy_from_guest(cpu, cpu->pending.insn, cpu->regs->eip, - sizeof(cpu->pending.insn)); -} - -/*H:050 Once we've re-enabled interrupts, we look at why the Guest exited. */ -void lguest_arch_handle_trap(struct lg_cpu *cpu) -{ - unsigned long iomem_addr; - - switch (cpu->regs->trapnum) { - case 13: /* We've intercepted a General Protection Fault. */ - /* Hand to Launcher to emulate those pesky IN and OUT insns */ - if (cpu->regs->errcode == 0) { - setup_emulate_insn(cpu); - return; - } - break; - case 14: /* We've intercepted a Page Fault. */ - /* - * The Guest accessed a virtual address that wasn't mapped. - * This happens a lot: we don't actually set up most of the page - * tables for the Guest at all when we start: as it runs it asks - * for more and more, and we set them up as required. In this - * case, we don't even tell the Guest that the fault happened. - * - * The errcode tells whether this was a read or a write, and - * whether kernel or userspace code. - */ - if (demand_page(cpu, cpu->arch.last_pagefault, - cpu->regs->errcode, &iomem_addr)) - return; - - /* Was this an access to memory mapped IO? */ - if (iomem_addr) { - /* Tell Launcher, let it handle it. */ - setup_iomem_insn(cpu, iomem_addr); - return; - } - - /* - * OK, it's really not there (or not OK): the Guest needs to - * know. We write out the cr2 value so it knows where the - * fault occurred. - * - * Note that if the Guest were really messed up, this could - * happen before it's done the LHCALL_LGUEST_INIT hypercall, so - * lg->lguest_data could be NULL - */ - if (cpu->lg->lguest_data && - put_user(cpu->arch.last_pagefault, - &cpu->lg->lguest_data->cr2)) - kill_guest(cpu, "Writing cr2"); - break; - case 7: /* We've intercepted a Device Not Available fault. */ - /* No special handling is needed here. */ - break; - case 32 ... 255: - /* This might be a syscall. */ - if (could_be_syscall(cpu->regs->trapnum)) - break; - - /* - * Other values mean a real interrupt occurred, in which case - * the Host handler has already been run. We just do a - * friendly check if another process should now be run, then - * return to run the Guest again. - */ - cond_resched(); - return; - case LGUEST_TRAP_ENTRY: - /* - * Our 'struct hcall_args' maps directly over our regs: we set - * up the pointer now to indicate a hypercall is pending. - */ - cpu->hcall = (struct hcall_args *)cpu->regs; - return; - } - - /* We didn't handle the trap, so it needs to go to the Guest. */ - if (!deliver_trap(cpu, cpu->regs->trapnum)) - /* - * If the Guest doesn't have a handler (either it hasn't - * registered any yet, or it's one of the faults we don't let - * it handle), it dies with this cryptic error message. - */ - kill_guest(cpu, "unhandled trap %li at %#lx (%#lx)", - cpu->regs->trapnum, cpu->regs->eip, - cpu->regs->trapnum == 14 ? cpu->arch.last_pagefault - : cpu->regs->errcode); -} - -/* - * Now we can look at each of the routines this calls, in increasing order of - * complexity: do_hypercalls(), emulate_insn(), maybe_do_interrupt(), - * deliver_trap() and demand_page(). After all those, we'll be ready to - * examine the Switcher, and our philosophical understanding of the Host/Guest - * duality will be complete. -:*/ -static void adjust_pge(void *on) -{ - if (on) - cr4_set_bits(X86_CR4_PGE); - else - cr4_clear_bits(X86_CR4_PGE); -} - -/*H:020 - * Now the Switcher is mapped and every thing else is ready, we need to do - * some more i386-specific initialization. - */ -void __init lguest_arch_host_init(void) -{ - int i; - - /* - * Most of the x86/switcher_32.S doesn't care that it's been moved; on - * Intel, jumps are relative, and it doesn't access any references to - * external code or data. - * - * The only exception is the interrupt handlers in switcher.S: their - * addresses are placed in a table (default_idt_entries), so we need to - * update the table with the new addresses. switcher_offset() is a - * convenience function which returns the distance between the - * compiled-in switcher code and the high-mapped copy we just made. - */ - for (i = 0; i < IDT_ENTRIES; i++) - default_idt_entries[i] += switcher_offset(); - - /* - * Set up the Switcher's per-cpu areas. - * - * Each CPU gets two pages of its own within the high-mapped region - * (aka. "struct lguest_pages"). Much of this can be initialized now, - * but some depends on what Guest we are running (which is set up in - * copy_in_guest_info()). - */ - for_each_possible_cpu(i) { - /* lguest_pages() returns this CPU's two pages. */ - struct lguest_pages *pages = lguest_pages(i); - /* This is a convenience pointer to make the code neater. */ - struct lguest_ro_state *state = &pages->state; - - /* - * The Global Descriptor Table: the Host has a different one - * for each CPU. We keep a descriptor for the GDT which says - * where it is and how big it is (the size is actually the last - * byte, not the size, hence the "-1"). - */ - state->host_gdt_desc.size = GDT_SIZE-1; - state->host_gdt_desc.address = (long)get_cpu_gdt_rw(i); - - /* - * All CPUs on the Host use the same Interrupt Descriptor - * Table, so we just use store_idt(), which gets this CPU's IDT - * descriptor. - */ - store_idt(&state->host_idt_desc); - - /* - * The descriptors for the Guest's GDT and IDT can be filled - * out now, too. We copy the GDT & IDT into ->guest_gdt and - * ->guest_idt before actually running the Guest. - */ - state->guest_idt_desc.size = sizeof(state->guest_idt)-1; - state->guest_idt_desc.address = (long)&state->guest_idt; - state->guest_gdt_desc.size = sizeof(state->guest_gdt)-1; - state->guest_gdt_desc.address = (long)&state->guest_gdt; - - /* - * We know where we want the stack to be when the Guest enters - * the Switcher: in pages->regs. The stack grows upwards, so - * we start it at the end of that structure. - */ - state->guest_tss.sp0 = (long)(&pages->regs + 1); - /* - * And this is the GDT entry to use for the stack: we keep a - * couple of special LGUEST entries. - */ - state->guest_tss.ss0 = LGUEST_DS; - - /* - * x86 can have a finegrained bitmap which indicates what I/O - * ports the process can use. We set it to the end of our - * structure, meaning "none". - */ - state->guest_tss.io_bitmap_base = sizeof(state->guest_tss); - - /* - * Some GDT entries are the same across all Guests, so we can - * set them up now. - */ - setup_default_gdt_entries(state); - /* Most IDT entries are the same for all Guests, too.*/ - setup_default_idt_entries(state, default_idt_entries); - - /* - * The Host needs to be able to use the LGUEST segments on this - * CPU, too, so put them in the Host GDT. - */ - get_cpu_gdt_rw(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT; - get_cpu_gdt_rw(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT; - } - - /* - * In the Switcher, we want the %cs segment register to use the - * LGUEST_CS GDT entry: we've put that in the Host and Guest GDTs, so - * it will be undisturbed when we switch. To change %cs and jump we - * need this structure to feed to Intel's "lcall" instruction. - */ - lguest_entry.offset = (long)switch_to_guest + switcher_offset(); - lguest_entry.segment = LGUEST_CS; - - /* - * Finally, we need to turn off "Page Global Enable". PGE is an - * optimization where page table entries are specially marked to show - * they never change. The Host kernel marks all the kernel pages this - * way because it's always present, even when userspace is running. - * - * Lguest breaks this: unbeknownst to the rest of the Host kernel, we - * switch to the Guest kernel. If you don't disable this on all CPUs, - * you'll get really weird bugs that you'll chase for two days. - * - * I used to turn PGE off every time we switched to the Guest and back - * on when we return, but that slowed the Switcher down noticibly. - */ - - /* - * We don't need the complexity of CPUs coming and going while we're - * doing this. - */ - get_online_cpus(); - if (boot_cpu_has(X86_FEATURE_PGE)) { /* We have a broader idea of "global". */ - /* Remember that this was originally set (for cleanup). */ - cpu_had_pge = 1; - /* - * adjust_pge is a helper function which sets or unsets the PGE - * bit on its CPU, depending on the argument (0 == unset). - */ - on_each_cpu(adjust_pge, (void *)0, 1); - /* Turn off the feature in the global feature set. */ - clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE); - } - put_online_cpus(); -} -/*:*/ - -void __exit lguest_arch_host_fini(void) -{ - /* If we had PGE before we started, turn it back on now. */ - get_online_cpus(); - if (cpu_had_pge) { - set_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE); - /* adjust_pge's argument "1" means set PGE. */ - on_each_cpu(adjust_pge, (void *)1, 1); - } - put_online_cpus(); -} - - -/*H:122 The i386-specific hypercalls simply farm out to the right functions. */ -int lguest_arch_do_hcall(struct lg_cpu *cpu, struct hcall_args *args) -{ - switch (args->arg0) { - case LHCALL_LOAD_GDT_ENTRY: - load_guest_gdt_entry(cpu, args->arg1, args->arg2, args->arg3); - break; - case LHCALL_LOAD_IDT_ENTRY: - load_guest_idt_entry(cpu, args->arg1, args->arg2, args->arg3); - break; - case LHCALL_LOAD_TLS: - guest_load_tls(cpu, args->arg1); - break; - default: - /* Bad Guest. Bad! */ - return -EIO; - } - return 0; -} - -/*H:126 i386-specific hypercall initialization: */ -int lguest_arch_init_hypercalls(struct lg_cpu *cpu) -{ - u32 tsc_speed; - - /* - * The pointer to the Guest's "struct lguest_data" is the only argument. - * We check that address now. - */ - if (!lguest_address_ok(cpu->lg, cpu->hcall->arg1, - sizeof(*cpu->lg->lguest_data))) - return -EFAULT; - - /* - * Having checked it, we simply set lg->lguest_data to point straight - * into the Launcher's memory at the right place and then use - * copy_to_user/from_user from now on, instead of lgread/write. I put - * this in to show that I'm not immune to writing stupid - * optimizations. - */ - cpu->lg->lguest_data = cpu->lg->mem_base + cpu->hcall->arg1; - - /* - * We insist that the Time Stamp Counter exist and doesn't change with - * cpu frequency. Some devious chip manufacturers decided that TSC - * changes could be handled in software. I decided that time going - * backwards might be good for benchmarks, but it's bad for users. - * - * We also insist that the TSC be stable: the kernel detects unreliable - * TSCs for its own purposes, and we use that here. - */ - if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && !check_tsc_unstable()) - tsc_speed = tsc_khz; - else - tsc_speed = 0; - if (put_user(tsc_speed, &cpu->lg->lguest_data->tsc_khz)) - return -EFAULT; - - /* The interrupt code might not like the system call vector. */ - if (!check_syscall_vector(cpu->lg)) - kill_guest(cpu, "bad syscall vector"); - - return 0; -} -/*:*/ - -/*L:030 - * Most of the Guest's registers are left alone: we used get_zeroed_page() to - * allocate the structure, so they will be 0. - */ -void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start) -{ - struct lguest_regs *regs = cpu->regs; - - /* - * There are four "segment" registers which the Guest needs to boot: - * The "code segment" register (cs) refers to the kernel code segment - * __KERNEL_CS, and the "data", "extra" and "stack" segment registers - * refer to the kernel data segment __KERNEL_DS. - * - * The privilege level is packed into the lower bits. The Guest runs - * at privilege level 1 (GUEST_PL). - */ - regs->ds = regs->es = regs->ss = __KERNEL_DS|GUEST_PL; - regs->cs = __KERNEL_CS|GUEST_PL; - - /* - * The "eflags" register contains miscellaneous flags. Bit 1 (0x002) - * is supposed to always be "1". Bit 9 (0x200) controls whether - * interrupts are enabled. We always leave interrupts enabled while - * running the Guest. - */ - regs->eflags = X86_EFLAGS_IF | X86_EFLAGS_FIXED; - - /* - * The "Extended Instruction Pointer" register says where the Guest is - * running. - */ - regs->eip = start; - - /* - * %esi points to our boot information, at physical address 0, so don't - * touch it. - */ - - /* There are a couple of GDT entries the Guest expects at boot. */ - setup_guest_gdt(cpu); -} diff --git a/drivers/lguest/x86/switcher_32.S b/drivers/lguest/x86/switcher_32.S deleted file mode 100644 index 40634b0db9f7..000000000000 --- a/drivers/lguest/x86/switcher_32.S +++ /dev/null @@ -1,388 +0,0 @@ -/*P:900 - * This is the Switcher: code which sits at 0xFFC00000 (or 0xFFE00000) astride - * both the Host and Guest to do the low-level Guest<->Host switch. It is as - * simple as it can be made, but it's naturally very specific to x86. - * - * You have now completed Preparation. If this has whet your appetite; if you - * are feeling invigorated and refreshed then the next, more challenging stage - * can be found in "make Guest". - :*/ - -/*M:012 - * Lguest is meant to be simple: my rule of thumb is that 1% more LOC must - * gain at least 1% more performance. Since neither LOC nor performance can be - * measured beforehand, it generally means implementing a feature then deciding - * if it's worth it. And once it's implemented, who can say no? - * - * This is why I haven't implemented this idea myself. I want to, but I - * haven't. You could, though. - * - * The main place where lguest performance sucks is Guest page faulting. When - * a Guest userspace process hits an unmapped page we switch back to the Host, - * walk the page tables, find it's not mapped, switch back to the Guest page - * fault handler, which calls a hypercall to set the page table entry, then - * finally returns to userspace. That's two round-trips. - * - * If we had a small walker in the Switcher, we could quickly check the Guest - * page table and if the page isn't mapped, immediately reflect the fault back - * into the Guest. This means the Switcher would have to know the top of the - * Guest page table and the page fault handler address. - * - * For simplicity, the Guest should only handle the case where the privilege - * level of the fault is 3 and probably only not present or write faults. It - * should also detect recursive faults, and hand the original fault to the - * Host (which is actually really easy). - * - * Two questions remain. Would the performance gain outweigh the complexity? - * And who would write the verse documenting it? -:*/ - -/*M:011 - * Lguest64 handles NMI. This gave me NMI envy (until I looked at their - * code). It's worth doing though, since it would let us use oprofile in the - * Host when a Guest is running. -:*/ - -/*S:100 - * Welcome to the Switcher itself! - * - * This file contains the low-level code which changes the CPU to run the Guest - * code, and returns to the Host when something happens. Understand this, and - * you understand the heart of our journey. - * - * Because this is in assembler rather than C, our tale switches from prose to - * verse. First I tried limericks: - * - * There once was an eax reg, - * To which our pointer was fed, - * It needed an add, - * Which asm-offsets.h had - * But this limerick is hurting my head. - * - * Next I tried haikus, but fitting the required reference to the seasons in - * every stanza was quickly becoming tiresome: - * - * The %eax reg - * Holds "struct lguest_pages" now: - * Cherry blossoms fall. - * - * Then I started with Heroic Verse, but the rhyming requirement leeched away - * the content density and led to some uniquely awful oblique rhymes: - * - * These constants are coming from struct offsets - * For use within the asm switcher text. - * - * Finally, I settled for something between heroic hexameter, and normal prose - * with inappropriate linebreaks. Anyway, it aint no Shakespeare. - */ - -// Not all kernel headers work from assembler -// But these ones are needed: the ENTRY() define -// And constants extracted from struct offsets -// To avoid magic numbers and breakage: -// Should they change the compiler can't save us -// Down here in the depths of assembler code. -#include -#include -#include -#include -#include - -// We mark the start of the code to copy -// It's placed in .text tho it's never run here -// You'll see the trick macro at the end -// Which interleaves data and text to effect. -.text -ENTRY(start_switcher_text) - -// When we reach switch_to_guest we have just left -// The safe and comforting shores of C code -// %eax has the "struct lguest_pages" to use -// Where we save state and still see it from the Guest -// And %ebx holds the Guest shadow pagetable: -// Once set we have truly left Host behind. -ENTRY(switch_to_guest) - // We told gcc all its regs could fade, - // Clobbered by our journey into the Guest - // We could have saved them, if we tried - // But time is our master and cycles count. - - // Segment registers must be saved for the Host - // We push them on the Host stack for later - pushl %es - pushl %ds - pushl %gs - pushl %fs - // But the compiler is fickle, and heeds - // No warning of %ebp clobbers - // When frame pointers are used. That register - // Must be saved and restored or chaos strikes. - pushl %ebp - // The Host's stack is done, now save it away - // In our "struct lguest_pages" at offset - // Distilled into asm-offsets.h - movl %esp, LGUEST_PAGES_host_sp(%eax) - - // All saved and there's now five steps before us: - // Stack, GDT, IDT, TSS - // Then last of all the page tables are flipped. - - // Yet beware that our stack pointer must be - // Always valid lest an NMI hits - // %edx does the duty here as we juggle - // %eax is lguest_pages: our stack lies within. - movl %eax, %edx - addl $LGUEST_PAGES_regs, %edx - movl %edx, %esp - - // The Guest's GDT we so carefully - // Placed in the "struct lguest_pages" before - lgdt LGUEST_PAGES_guest_gdt_desc(%eax) - - // The Guest's IDT we did partially - // Copy to "struct lguest_pages" as well. - lidt LGUEST_PAGES_guest_idt_desc(%eax) - - // The TSS entry which controls traps - // Must be loaded up with "ltr" now: - // The GDT entry that TSS uses - // Changes type when we load it: damn Intel! - // For after we switch over our page tables - // That entry will be read-only: we'd crash. - movl $(GDT_ENTRY_TSS*8), %edx - ltr %dx - - // Look back now, before we take this last step! - // The Host's TSS entry was also marked used; - // Let's clear it again for our return. - // The GDT descriptor of the Host - // Points to the table after two "size" bytes - movl (LGUEST_PAGES_host_gdt_desc+2)(%eax), %edx - // Clear "used" from type field (byte 5, bit 2) - andb $0xFD, (GDT_ENTRY_TSS*8 + 5)(%edx) - - // Once our page table's switched, the Guest is live! - // The Host fades as we run this final step. - // Our "struct lguest_pages" is now read-only. - movl %ebx, %cr3 - - // The page table change did one tricky thing: - // The Guest's register page has been mapped - // Writable under our %esp (stack) -- - // We can simply pop off all Guest regs. - popl %eax - popl %ebx - popl %ecx - popl %edx - popl %esi - popl %edi - popl %ebp - popl %gs - popl %fs - popl %ds - popl %es - - // Near the base of the stack lurk two strange fields - // Which we fill as we exit the Guest - // These are the trap number and its error - // We can simply step past them on our way. - addl $8, %esp - - // The last five stack slots hold return address - // And everything needed to switch privilege - // From Switcher's level 0 to Guest's 1, - // And the stack where the Guest had last left it. - // Interrupts are turned back on: we are Guest. - iret - -// We tread two paths to switch back to the Host -// Yet both must save Guest state and restore Host -// So we put the routine in a macro. -#define SWITCH_TO_HOST \ - /* We save the Guest state: all registers first \ - * Laid out just as "struct lguest_regs" defines */ \ - pushl %es; \ - pushl %ds; \ - pushl %fs; \ - pushl %gs; \ - pushl %ebp; \ - pushl %edi; \ - pushl %esi; \ - pushl %edx; \ - pushl %ecx; \ - pushl %ebx; \ - pushl %eax; \ - /* Our stack and our code are using segments \ - * Set in the TSS and IDT \ - * Yet if we were to touch data we'd use \ - * Whatever data segment the Guest had. \ - * Load the lguest ds segment for now. */ \ - movl $(LGUEST_DS), %eax; \ - movl %eax, %ds; \ - /* So where are we? Which CPU, which struct? \ - * The stack is our clue: our TSS starts \ - * It at the end of "struct lguest_pages". \ - * Or we may have stumbled while restoring \ - * Our Guest segment regs while in switch_to_guest, \ - * The fault pushed atop that part-unwound stack. \ - * If we round the stack down to the page start \ - * We're at the start of "struct lguest_pages". */ \ - movl %esp, %eax; \ - andl $(~(1 << PAGE_SHIFT - 1)), %eax; \ - /* Save our trap number: the switch will obscure it \ - * (In the Host the Guest regs are not mapped here) \ - * %ebx holds it safe for deliver_to_host */ \ - movl LGUEST_PAGES_regs_trapnum(%eax), %ebx; \ - /* The Host GDT, IDT and stack! \ - * All these lie safely hidden from the Guest: \ - * We must return to the Host page tables \ - * (Hence that was saved in struct lguest_pages) */ \ - movl LGUEST_PAGES_host_cr3(%eax), %edx; \ - movl %edx, %cr3; \ - /* As before, when we looked back at the Host \ - * As we left and marked TSS unused \ - * So must we now for the Guest left behind. */ \ - andb $0xFD, (LGUEST_PAGES_guest_gdt+GDT_ENTRY_TSS*8+5)(%eax); \ - /* Switch to Host's GDT, IDT. */ \ - lgdt LGUEST_PAGES_host_gdt_desc(%eax); \ - lidt LGUEST_PAGES_host_idt_desc(%eax); \ - /* Restore the Host's stack where its saved regs lie */ \ - movl LGUEST_PAGES_host_sp(%eax), %esp; \ - /* Last the TSS: our Host is returned */ \ - movl $(GDT_ENTRY_TSS*8), %edx; \ - ltr %dx; \ - /* Restore now the regs saved right at the first. */ \ - popl %ebp; \ - popl %fs; \ - popl %gs; \ - popl %ds; \ - popl %es - -// The first path is trod when the Guest has trapped: -// (Which trap it was has been pushed on the stack). -// We need only switch back, and the Host will decode -// Why we came home, and what needs to be done. -return_to_host: - SWITCH_TO_HOST - iret - -// We are lead to the second path like so: -// An interrupt, with some cause external -// Has ajerked us rudely from the Guest's code -// Again we must return home to the Host -deliver_to_host: - SWITCH_TO_HOST - // But now we must go home via that place - // Where that interrupt was supposed to go - // Had we not been ensconced, running the Guest. - // Here we see the trickness of run_guest_once(): - // The Host stack is formed like an interrupt - // With EIP, CS and EFLAGS layered. - // Interrupt handlers end with "iret" - // And that will take us home at long long last. - - // But first we must find the handler to call! - // The IDT descriptor for the Host - // Has two bytes for size, and four for address: - // %edx will hold it for us for now. - movl (LGUEST_PAGES_host_idt_desc+2)(%eax), %edx - // We now know the table address we need, - // And saved the trap's number inside %ebx. - // Yet the pointer to the handler is smeared - // Across the bits of the table entry. - // What oracle can tell us how to extract - // From such a convoluted encoding? - // I consulted gcc, and it gave - // These instructions, which I gladly credit: - leal (%edx,%ebx,8), %eax - movzwl (%eax),%edx - movl 4(%eax), %eax - xorw %ax, %ax - orl %eax, %edx - // Now the address of the handler's in %edx - // We call it now: its "iret" drops us home. - jmp *%edx - -// Every interrupt can come to us here -// But we must truly tell each apart. -// They number two hundred and fifty six -// And each must land in a different spot, -// Push its number on stack, and join the stream. - -// And worse, a mere six of the traps stand apart -// And push on their stack an addition: -// An error number, thirty two bits long -// So we punish the other two fifty -// And make them push a zero so they match. - -// Yet two fifty six entries is long -// And all will look most the same as the last -// So we create a macro which can make -// As many entries as we need to fill. - -// Note the change to .data then .text: -// We plant the address of each entry -// Into a (data) table for the Host -// To know where each Guest interrupt should go. -.macro IRQ_STUB N TARGET - .data; .long 1f; .text; 1: - // Trap eight, ten through fourteen and seventeen - // Supply an error number. Else zero. - .if (\N <> 8) && (\N < 10 || \N > 14) && (\N <> 17) - pushl $0 - .endif - pushl $\N - jmp \TARGET - ALIGN -.endm - -// This macro creates numerous entries -// Using GAS macros which out-power C's. -.macro IRQ_STUBS FIRST LAST TARGET - irq=\FIRST - .rept \LAST-\FIRST+1 - IRQ_STUB irq \TARGET - irq=irq+1 - .endr -.endm - -// Here's the marker for our pointer table -// Laid in the data section just before -// Each macro places the address of code -// Forming an array: each one points to text -// Which handles interrupt in its turn. -.data -.global default_idt_entries -default_idt_entries: -.text - // The first two traps go straight back to the Host - IRQ_STUBS 0 1 return_to_host - // We'll say nothing, yet, about NMI - IRQ_STUB 2 handle_nmi - // Other traps also return to the Host - IRQ_STUBS 3 31 return_to_host - // All interrupts go via their handlers - IRQ_STUBS 32 127 deliver_to_host - // 'Cept system calls coming from userspace - // Are to go to the Guest, never the Host. - IRQ_STUB 128 return_to_host - IRQ_STUBS 129 255 deliver_to_host - -// The NMI, what a fabulous beast -// Which swoops in and stops us no matter that -// We're suspended between heaven and hell, -// (Or more likely between the Host and Guest) -// When in it comes! We are dazed and confused -// So we do the simplest thing which one can. -// Though we've pushed the trap number and zero -// We discard them, return, and hope we live. -handle_nmi: - addl $8, %esp - iret - -// We are done; all that's left is Mastery -// And "make Mastery" is a journey long -// Designed to make your fingers itch to code. - -// Here ends the text, the file and poem. -ENTRY(end_switcher_text) diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index 83a1616903f8..aba0d652095b 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -333,7 +333,7 @@ config VIRTIO_NET depends on VIRTIO ---help--- This is the virtual network driver for virtio. It can be used with - lguest or QEMU based VMMs (like KVM or Xen). Say Y or M. + QEMU based VMMs (like KVM or Xen). Say Y or M. config NLMON tristate "Virtual netlink monitoring device" diff --git a/drivers/tty/hvc/Kconfig b/drivers/tty/hvc/Kconfig index b8d5ea0ae26b..fec457edad14 100644 --- a/drivers/tty/hvc/Kconfig +++ b/drivers/tty/hvc/Kconfig @@ -4,7 +4,7 @@ config HVC_DRIVER bool help Generic "hypervisor virtual console" infrastructure for various - hypervisors (pSeries, iSeries, Xen, lguest). + hypervisors (pSeries, iSeries, Xen). It will automatically be selected if one of the back-end console drivers is selected. diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig index 623f72334fa5..cff773f15b7e 100644 --- a/drivers/virtio/Kconfig +++ b/drivers/virtio/Kconfig @@ -2,8 +2,8 @@ config VIRTIO tristate ---help--- This option is selected by any driver which implements the virtio - bus, such as CONFIG_VIRTIO_PCI, CONFIG_VIRTIO_MMIO, CONFIG_LGUEST, - CONFIG_RPMSG or CONFIG_S390_GUEST. + bus, such as CONFIG_VIRTIO_PCI, CONFIG_VIRTIO_MMIO, CONFIG_RPMSG + or CONFIG_S390_GUEST. menu "Virtio drivers" diff --git a/include/linux/lguest.h b/include/linux/lguest.h deleted file mode 100644 index 6db19f35f7c5..000000000000 --- a/include/linux/lguest.h +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Things the lguest guest needs to know. Note: like all lguest interfaces, - * this is subject to wild and random change between versions. - */ -#ifndef _LINUX_LGUEST_H -#define _LINUX_LGUEST_H - -#ifndef __ASSEMBLY__ -#include -#include -#include - -#define LG_CLOCK_MIN_DELTA 100UL -#define LG_CLOCK_MAX_DELTA ULONG_MAX - -/*G:031 - * The second method of communicating with the Host is to via "struct - * lguest_data". Once the Guest's initialization hypercall tells the Host where - * this is, the Guest and Host both publish information in it. -:*/ -struct lguest_data { - /* - * 512 == enabled (same as eflags in normal hardware). The Guest - * changes interrupts so often that a hypercall is too slow. - */ - unsigned int irq_enabled; - /* Fine-grained interrupt disabling by the Guest */ - DECLARE_BITMAP(blocked_interrupts, LGUEST_IRQS); - - /* - * The Host writes the virtual address of the last page fault here, - * which saves the Guest a hypercall. CR2 is the native register where - * this address would normally be found. - */ - unsigned long cr2; - - /* Wallclock time set by the Host. */ - struct timespec time; - - /* - * Interrupt pending set by the Host. The Guest should do a hypercall - * if it re-enables interrupts and sees this set (to X86_EFLAGS_IF). - */ - int irq_pending; - - /* - * Async hypercall ring. Instead of directly making hypercalls, we can - * place them in here for processing the next time the Host wants. - * This batching can be quite efficient. - */ - - /* 0xFF == done (set by Host), 0 == pending (set by Guest). */ - u8 hcall_status[LHCALL_RING_SIZE]; - /* The actual registers for the hypercalls. */ - struct hcall_args hcalls[LHCALL_RING_SIZE]; - -/* Fields initialized by the Host at boot: */ - /* Memory not to try to access */ - unsigned long reserve_mem; - /* KHz for the TSC clock. */ - u32 tsc_khz; - -/* Fields initialized by the Guest at boot: */ - /* Instruction to suppress interrupts even if enabled */ - unsigned long noirq_iret; - /* Address above which page tables are all identical. */ - unsigned long kernel_address; - /* The vector to try to use for system calls (0x40 or 0x80). */ - unsigned int syscall_vec; -}; -extern struct lguest_data lguest_data; -#endif /* __ASSEMBLY__ */ -#endif /* _LINUX_LGUEST_H */ diff --git a/include/linux/lguest_launcher.h b/include/linux/lguest_launcher.h deleted file mode 100644 index acd5b12565cc..000000000000 --- a/include/linux/lguest_launcher.h +++ /dev/null @@ -1,44 +0,0 @@ -#ifndef _LINUX_LGUEST_LAUNCHER -#define _LINUX_LGUEST_LAUNCHER -/* Everything the "lguest" userspace program needs to know. */ -#include - -/*D:010 - * Drivers - * - * The Guest needs devices to do anything useful. Since we don't let it touch - * real devices (think of the damage it could do!) we provide virtual devices. - * We emulate a PCI bus with virtio devices on it; we used to have our own - * lguest bus which was far simpler, but this tests the virtio 1.0 standard. - * - * Virtio devices are also used by kvm, so we can simply reuse their optimized - * device drivers. And one day when everyone uses virtio, my plan will be - * complete. Bwahahahah! - */ - -/* Write command first word is a request. */ -enum lguest_req -{ - LHREQ_INITIALIZE, /* + base, pfnlimit, start */ - LHREQ_GETDMA, /* No longer used */ - LHREQ_IRQ, /* + irq */ - LHREQ_BREAK, /* No longer used */ - LHREQ_EVENTFD, /* No longer used. */ - LHREQ_GETREG, /* + offset within struct pt_regs (then read value). */ - LHREQ_SETREG, /* + offset within struct pt_regs, value. */ - LHREQ_TRAP, /* + trap number to deliver to guest. */ -}; - -/* - * This is what read() of the lguest fd populates. trap == - * LGUEST_TRAP_ENTRY for an LHCALL_NOTIFY (addr is the - * argument), 14 for a page fault in the MMIO region (addr is - * the trap address, insn is the instruction), or 13 for a GPF - * (insn is the instruction). - */ -struct lguest_pending { - __u8 trap; - __u8 insn[7]; - __u32 addr; -}; -#endif /* _LINUX_LGUEST_LAUNCHER */ diff --git a/include/uapi/linux/virtio_ring.h b/include/uapi/linux/virtio_ring.h index c07295969b7e..6d5d5faa989b 100644 --- a/include/uapi/linux/virtio_ring.h +++ b/include/uapi/linux/virtio_ring.h @@ -1,7 +1,7 @@ #ifndef _UAPI_LINUX_VIRTIO_RING_H #define _UAPI_LINUX_VIRTIO_RING_H -/* An interface for efficient virtio implementation, currently for use by KVM - * and lguest, but hopefully others soon. Do NOT change this since it will +/* An interface for efficient virtio implementation, currently for use by KVM, + * but hopefully others soon. Do NOT change this since it will * break existing servers and clients. * * This header is BSD licensed so anyone can use the definitions to implement diff --git a/tools/Makefile b/tools/Makefile index 221e1ce78b06..a19b176b914b 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -18,7 +18,6 @@ help: @echo ' iio - IIO tools' @echo ' kvm_stat - top-like utility for displaying kvm statistics' @echo ' leds - LEDs tools' - @echo ' lguest - a minimal 32-bit x86 hypervisor' @echo ' liblockdep - user-space wrapper for kernel locking-validator' @echo ' net - misc networking tools' @echo ' perf - Linux performance measurement and analysis tool' @@ -90,7 +89,7 @@ freefall: FORCE kvm_stat: FORCE $(call descend,kvm/$@) -all: acpi cgroup cpupower gpio hv firewire lguest liblockdep \ +all: acpi cgroup cpupower gpio hv firewire liblockdep \ perf selftests turbostat usb \ virtio vm net x86_energy_perf_policy \ tmon freefall objtool kvm_stat @@ -101,7 +100,7 @@ acpi_install: cpupower_install: $(call descend,power/$(@:_install=),install) -cgroup_install firewire_install gpio_install hv_install lguest_install perf_install usb_install virtio_install vm_install net_install objtool_install: +cgroup_install firewire_install gpio_install hv_install perf_install usb_install virtio_install vm_install net_install objtool_install: $(call descend,$(@:_install=),install) liblockdep_install: @@ -123,7 +122,7 @@ kvm_stat_install: $(call descend,kvm/$(@:_install=),install) install: acpi_install cgroup_install cpupower_install gpio_install \ - hv_install firewire_install lguest_install liblockdep_install \ + hv_install firewire_install liblockdep_install \ perf_install selftests_install turbostat_install usb_install \ virtio_install vm_install net_install x86_energy_perf_policy_install \ tmon_install freefall_install objtool_install kvm_stat_install @@ -134,7 +133,7 @@ acpi_clean: cpupower_clean: $(call descend,power/cpupower,clean) -cgroup_clean hv_clean firewire_clean lguest_clean spi_clean usb_clean virtio_clean vm_clean net_clean iio_clean gpio_clean objtool_clean leds_clean: +cgroup_clean hv_clean firewire_clean spi_clean usb_clean virtio_clean vm_clean net_clean iio_clean gpio_clean objtool_clean leds_clean: $(call descend,$(@:_clean=),clean) liblockdep_clean: @@ -168,7 +167,7 @@ freefall_clean: build_clean: $(call descend,build,clean) -clean: acpi_clean cgroup_clean cpupower_clean hv_clean firewire_clean lguest_clean \ +clean: acpi_clean cgroup_clean cpupower_clean hv_clean firewire_clean \ perf_clean selftests_clean turbostat_clean spi_clean usb_clean virtio_clean \ vm_clean net_clean iio_clean x86_energy_perf_policy_clean tmon_clean \ freefall_clean build_clean libbpf_clean libsubcmd_clean liblockdep_clean \ diff --git a/tools/lguest/.gitignore b/tools/lguest/.gitignore deleted file mode 100644 index 8d9a8383a52e..000000000000 --- a/tools/lguest/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -lguest -include diff --git a/tools/lguest/Makefile b/tools/lguest/Makefile deleted file mode 100644 index d04599a79802..000000000000 --- a/tools/lguest/Makefile +++ /dev/null @@ -1,14 +0,0 @@ -# This creates the demonstration utility "lguest" which runs a Linux guest. -CFLAGS:=-m32 -Wall -Wmissing-declarations -Wmissing-prototypes -O3 -U_FORTIFY_SOURCE -Iinclude - -all: lguest - -include/linux/virtio_types.h: ../../include/uapi/linux/virtio_types.h - mkdir -p include/linux 2>&1 || true - ln -sf ../../../../include/uapi/linux/virtio_types.h $@ - -lguest: include/linux/virtio_types.h - -clean: - rm -f lguest - rm -rf include diff --git a/tools/lguest/extract b/tools/lguest/extract deleted file mode 100644 index 7730bb6e4b94..000000000000 --- a/tools/lguest/extract +++ /dev/null @@ -1,58 +0,0 @@ -#! /bin/sh - -set -e - -PREFIX=$1 -shift - -trap 'rm -r $TMPDIR' 0 -TMPDIR=`mktemp -d` - -exec 3>/dev/null -for f; do - while IFS=" -" read -r LINE; do - case "$LINE" in - *$PREFIX:[0-9]*:\**) - NUM=`echo "$LINE" | sed "s/.*$PREFIX:\([0-9]*\).*/\1/"` - if [ -f $TMPDIR/$NUM ]; then - echo "$TMPDIR/$NUM already exits prior to $f" - exit 1 - fi - exec 3>>$TMPDIR/$NUM - echo $f | sed 's,\.\./,,g' > $TMPDIR/.$NUM - /bin/echo "$LINE" | sed -e "s/$PREFIX:[0-9]*//" -e "s/:\*/*/" >&3 - ;; - *$PREFIX:[0-9]*) - NUM=`echo "$LINE" | sed "s/.*$PREFIX:\([0-9]*\).*/\1/"` - if [ -f $TMPDIR/$NUM ]; then - echo "$TMPDIR/$NUM already exits prior to $f" - exit 1 - fi - exec 3>>$TMPDIR/$NUM - echo $f | sed 's,\.\./,,g' > $TMPDIR/.$NUM - /bin/echo "$LINE" | sed "s/$PREFIX:[0-9]*//" >&3 - ;; - *:\**) - /bin/echo "$LINE" | sed -e "s/:\*/*/" -e "s,/\*\*/,," >&3 - echo >&3 - exec 3>/dev/null - ;; - *) - /bin/echo "$LINE" >&3 - ;; - esac - done < $f - echo >&3 - exec 3>/dev/null -done - -LASTFILE="" -for f in $TMPDIR/*; do - if [ "$LASTFILE" != $(cat $TMPDIR/.$(basename $f) ) ]; then - LASTFILE=$(cat $TMPDIR/.$(basename $f) ) - echo "[ $LASTFILE ]" - fi - cat $f -done - diff --git a/tools/lguest/lguest.c b/tools/lguest/lguest.c deleted file mode 100644 index 897cd6f3f687..000000000000 --- a/tools/lguest/lguest.c +++ /dev/null @@ -1,3420 +0,0 @@ -/*P:100 - * This is the Launcher code, a simple program which lays out the "physical" - * memory for the new Guest by mapping the kernel image and the virtual - * devices, then opens /dev/lguest to tell the kernel about the Guest and - * control it. -:*/ -#define _LARGEFILE64_SOURCE -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifndef VIRTIO_F_ANY_LAYOUT -#define VIRTIO_F_ANY_LAYOUT 27 -#endif - -/*L:110 - * We can ignore the 43 include files we need for this program, but I do want - * to draw attention to the use of kernel-style types. - * - * As Linus said, "C is a Spartan language, and so should your naming be." I - * like these abbreviations, so we define them here. Note that u64 is always - * unsigned long long, which works on all Linux systems: this means that we can - * use %llu in printf for any u64. - */ -typedef unsigned long long u64; -typedef uint32_t u32; -typedef uint16_t u16; -typedef uint8_t u8; -/*:*/ - -#define VIRTIO_CONFIG_NO_LEGACY -#define VIRTIO_PCI_NO_LEGACY -#define VIRTIO_BLK_NO_LEGACY -#define VIRTIO_NET_NO_LEGACY - -/* Use in-kernel ones, which defines VIRTIO_F_VERSION_1 */ -#include "../../include/uapi/linux/virtio_config.h" -#include "../../include/uapi/linux/virtio_net.h" -#include "../../include/uapi/linux/virtio_blk.h" -#include "../../include/uapi/linux/virtio_console.h" -#include "../../include/uapi/linux/virtio_rng.h" -#include -#include "../../include/uapi/linux/virtio_pci.h" -#include -#include "../../include/linux/lguest_launcher.h" - -#define BRIDGE_PFX "bridge:" -#ifndef SIOCBRADDIF -#define SIOCBRADDIF 0x89a2 /* add interface to bridge */ -#endif -/* We can have up to 256 pages for devices. */ -#define DEVICE_PAGES 256 -/* This will occupy 3 pages: it must be a power of 2. */ -#define VIRTQUEUE_NUM 256 - -/*L:120 - * verbose is both a global flag and a macro. The C preprocessor allows - * this, and although I wouldn't recommend it, it works quite nicely here. - */ -static bool verbose; -#define verbose(args...) \ - do { if (verbose) printf(args); } while(0) -/*:*/ - -/* The pointer to the start of guest memory. */ -static void *guest_base; -/* The maximum guest physical address allowed, and maximum possible. */ -static unsigned long guest_limit, guest_max, guest_mmio; -/* The /dev/lguest file descriptor. */ -static int lguest_fd; - -/* a per-cpu variable indicating whose vcpu is currently running */ -static unsigned int __thread cpu_id; - -/* 5 bit device number in the PCI_CONFIG_ADDR => 32 only */ -#define MAX_PCI_DEVICES 32 - -/* This is our list of devices. */ -struct device_list { - /* Counter to assign interrupt numbers. */ - unsigned int next_irq; - - /* Counter to print out convenient device numbers. */ - unsigned int device_num; - - /* PCI devices. */ - struct device *pci[MAX_PCI_DEVICES]; -}; - -/* The list of Guest devices, based on command line arguments. */ -static struct device_list devices; - -/* - * Just like struct virtio_pci_cfg_cap in uapi/linux/virtio_pci.h, - * but uses a u32 explicitly for the data. - */ -struct virtio_pci_cfg_cap_u32 { - struct virtio_pci_cap cap; - u32 pci_cfg_data; /* Data for BAR access. */ -}; - -struct virtio_pci_mmio { - struct virtio_pci_common_cfg cfg; - u16 notify; - u8 isr; - u8 padding; - /* Device-specific configuration follows this. */ -}; - -/* This is the layout (little-endian) of the PCI config space. */ -struct pci_config { - u16 vendor_id, device_id; - u16 command, status; - u8 revid, prog_if, subclass, class; - u8 cacheline_size, lat_timer, header_type, bist; - u32 bar[6]; - u32 cardbus_cis_ptr; - u16 subsystem_vendor_id, subsystem_device_id; - u32 expansion_rom_addr; - u8 capabilities, reserved1[3]; - u32 reserved2; - u8 irq_line, irq_pin, min_grant, max_latency; - - /* Now, this is the linked capability list. */ - struct virtio_pci_cap common; - struct virtio_pci_notify_cap notify; - struct virtio_pci_cap isr; - struct virtio_pci_cap device; - struct virtio_pci_cfg_cap_u32 cfg_access; -}; - -/* The device structure describes a single device. */ -struct device { - /* The name of this device, for --verbose. */ - const char *name; - - /* Any queues attached to this device */ - struct virtqueue *vq; - - /* Is it operational */ - bool running; - - /* Has it written FEATURES_OK but not re-checked it? */ - bool wrote_features_ok; - - /* PCI configuration */ - union { - struct pci_config config; - u32 config_words[sizeof(struct pci_config) / sizeof(u32)]; - }; - - /* Features we offer, and those accepted. */ - u64 features, features_accepted; - - /* Device-specific config hangs off the end of this. */ - struct virtio_pci_mmio *mmio; - - /* PCI MMIO resources (all in BAR0) */ - size_t mmio_size; - u32 mmio_addr; - - /* Device-specific data. */ - void *priv; -}; - -/* The virtqueue structure describes a queue attached to a device. */ -struct virtqueue { - struct virtqueue *next; - - /* Which device owns me. */ - struct device *dev; - - /* Name for printing errors. */ - const char *name; - - /* The actual ring of buffers. */ - struct vring vring; - - /* The information about this virtqueue (we only use queue_size on) */ - struct virtio_pci_common_cfg pci_config; - - /* Last available index we saw. */ - u16 last_avail_idx; - - /* How many are used since we sent last irq? */ - unsigned int pending_used; - - /* Eventfd where Guest notifications arrive. */ - int eventfd; - - /* Function for the thread which is servicing this virtqueue. */ - void (*service)(struct virtqueue *vq); - pid_t thread; -}; - -/* Remember the arguments to the program so we can "reboot" */ -static char **main_args; - -/* The original tty settings to restore on exit. */ -static struct termios orig_term; - -/* - * We have to be careful with barriers: our devices are all run in separate - * threads and so we need to make sure that changes visible to the Guest happen - * in precise order. - */ -#define wmb() __asm__ __volatile__("" : : : "memory") -#define rmb() __asm__ __volatile__("lock; addl $0,0(%%esp)" : : : "memory") -#define mb() __asm__ __volatile__("lock; addl $0,0(%%esp)" : : : "memory") - -/* Wrapper for the last available index. Makes it easier to change. */ -#define lg_last_avail(vq) ((vq)->last_avail_idx) - -/* - * The virtio configuration space is defined to be little-endian. x86 is - * little-endian too, but it's nice to be explicit so we have these helpers. - */ -#define cpu_to_le16(v16) (v16) -#define cpu_to_le32(v32) (v32) -#define cpu_to_le64(v64) (v64) -#define le16_to_cpu(v16) (v16) -#define le32_to_cpu(v32) (v32) -#define le64_to_cpu(v64) (v64) - -/* - * A real device would ignore weird/non-compliant driver behaviour. We - * stop and flag it, to help debugging Linux problems. - */ -#define bad_driver(d, fmt, ...) \ - errx(1, "%s: bad driver: " fmt, (d)->name, ## __VA_ARGS__) -#define bad_driver_vq(vq, fmt, ...) \ - errx(1, "%s vq %s: bad driver: " fmt, (vq)->dev->name, \ - vq->name, ## __VA_ARGS__) - -/* Is this iovec empty? */ -static bool iov_empty(const struct iovec iov[], unsigned int num_iov) -{ - unsigned int i; - - for (i = 0; i < num_iov; i++) - if (iov[i].iov_len) - return false; - return true; -} - -/* Take len bytes from the front of this iovec. */ -static void iov_consume(struct device *d, - struct iovec iov[], unsigned num_iov, - void *dest, unsigned len) -{ - unsigned int i; - - for (i = 0; i < num_iov; i++) { - unsigned int used; - - used = iov[i].iov_len < len ? iov[i].iov_len : len; - if (dest) { - memcpy(dest, iov[i].iov_base, used); - dest += used; - } - iov[i].iov_base += used; - iov[i].iov_len -= used; - len -= used; - } - if (len != 0) - bad_driver(d, "iovec too short!"); -} - -/*L:100 - * The Launcher code itself takes us out into userspace, that scary place where - * pointers run wild and free! Unfortunately, like most userspace programs, - * it's quite boring (which is why everyone likes to hack on the kernel!). - * Perhaps if you make up an Lguest Drinking Game at this point, it will get - * you through this section. Or, maybe not. - * - * The Launcher sets up a big chunk of memory to be the Guest's "physical" - * memory and stores it in "guest_base". In other words, Guest physical == - * Launcher virtual with an offset. - * - * This can be tough to get your head around, but usually it just means that we - * use these trivial conversion functions when the Guest gives us its - * "physical" addresses: - */ -static void *from_guest_phys(unsigned long addr) -{ - return guest_base + addr; -} - -static unsigned long to_guest_phys(const void *addr) -{ - return (addr - guest_base); -} - -/*L:130 - * Loading the Kernel. - * - * We start with couple of simple helper routines. open_or_die() avoids - * error-checking code cluttering the callers: - */ -static int open_or_die(const char *name, int flags) -{ - int fd = open(name, flags); - if (fd < 0) - err(1, "Failed to open %s", name); - return fd; -} - -/* map_zeroed_pages() takes a number of pages. */ -static void *map_zeroed_pages(unsigned int num) -{ - int fd = open_or_die("/dev/zero", O_RDONLY); - void *addr; - - /* - * We use a private mapping (ie. if we write to the page, it will be - * copied). We allocate an extra two pages PROT_NONE to act as guard - * pages against read/write attempts that exceed allocated space. - */ - addr = mmap(NULL, getpagesize() * (num+2), - PROT_NONE, MAP_PRIVATE, fd, 0); - - if (addr == MAP_FAILED) - err(1, "Mmapping %u pages of /dev/zero", num); - - if (mprotect(addr + getpagesize(), getpagesize() * num, - PROT_READ|PROT_WRITE) == -1) - err(1, "mprotect rw %u pages failed", num); - - /* - * One neat mmap feature is that you can close the fd, and it - * stays mapped. - */ - close(fd); - - /* Return address after PROT_NONE page */ - return addr + getpagesize(); -} - -/* Get some bytes which won't be mapped into the guest. */ -static unsigned long get_mmio_region(size_t size) -{ - unsigned long addr = guest_mmio; - size_t i; - - if (!size) - return addr; - - /* Size has to be a power of 2 (and multiple of 16) */ - for (i = 1; i < size; i <<= 1); - - guest_mmio += i; - - return addr; -} - -/* - * This routine is used to load the kernel or initrd. It tries mmap, but if - * that fails (Plan 9's kernel file isn't nicely aligned on page boundaries), - * it falls back to reading the memory in. - */ -static void map_at(int fd, void *addr, unsigned long offset, unsigned long len) -{ - ssize_t r; - - /* - * We map writable even though for some segments are marked read-only. - * The kernel really wants to be writable: it patches its own - * instructions. - * - * MAP_PRIVATE means that the page won't be copied until a write is - * done to it. This allows us to share untouched memory between - * Guests. - */ - if (mmap(addr, len, PROT_READ|PROT_WRITE, - MAP_FIXED|MAP_PRIVATE, fd, offset) != MAP_FAILED) - return; - - /* pread does a seek and a read in one shot: saves a few lines. */ - r = pread(fd, addr, len, offset); - if (r != len) - err(1, "Reading offset %lu len %lu gave %zi", offset, len, r); -} - -/* - * This routine takes an open vmlinux image, which is in ELF, and maps it into - * the Guest memory. ELF = Embedded Linking Format, which is the format used - * by all modern binaries on Linux including the kernel. - * - * The ELF headers give *two* addresses: a physical address, and a virtual - * address. We use the physical address; the Guest will map itself to the - * virtual address. - * - * We return the starting address. - */ -static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr) -{ - Elf32_Phdr phdr[ehdr->e_phnum]; - unsigned int i; - - /* - * Sanity checks on the main ELF header: an x86 executable with a - * reasonable number of correctly-sized program headers. - */ - if (ehdr->e_type != ET_EXEC - || ehdr->e_machine != EM_386 - || ehdr->e_phentsize != sizeof(Elf32_Phdr) - || ehdr->e_phnum < 1 || ehdr->e_phnum > 65536U/sizeof(Elf32_Phdr)) - errx(1, "Malformed elf header"); - - /* - * An ELF executable contains an ELF header and a number of "program" - * headers which indicate which parts ("segments") of the program to - * load where. - */ - - /* We read in all the program headers at once: */ - if (lseek(elf_fd, ehdr->e_phoff, SEEK_SET) < 0) - err(1, "Seeking to program headers"); - if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr)) - err(1, "Reading program headers"); - - /* - * Try all the headers: there are usually only three. A read-only one, - * a read-write one, and a "note" section which we don't load. - */ - for (i = 0; i < ehdr->e_phnum; i++) { - /* If this isn't a loadable segment, we ignore it */ - if (phdr[i].p_type != PT_LOAD) - continue; - - verbose("Section %i: size %i addr %p\n", - i, phdr[i].p_memsz, (void *)phdr[i].p_paddr); - - /* We map this section of the file at its physical address. */ - map_at(elf_fd, from_guest_phys(phdr[i].p_paddr), - phdr[i].p_offset, phdr[i].p_filesz); - } - - /* The entry point is given in the ELF header. */ - return ehdr->e_entry; -} - -/*L:150 - * A bzImage, unlike an ELF file, is not meant to be loaded. You're supposed - * to jump into it and it will unpack itself. We used to have to perform some - * hairy magic because the unpacking code scared me. - * - * Fortunately, Jeremy Fitzhardinge convinced me it wasn't that hard and wrote - * a small patch to jump over the tricky bits in the Guest, so now we just read - * the funky header so we know where in the file to load, and away we go! - */ -static unsigned long load_bzimage(int fd) -{ - struct boot_params boot; - int r; - /* Modern bzImages get loaded at 1M. */ - void *p = from_guest_phys(0x100000); - - /* - * Go back to the start of the file and read the header. It should be - * a Linux boot header (see Documentation/x86/boot.txt) - */ - lseek(fd, 0, SEEK_SET); - read(fd, &boot, sizeof(boot)); - - /* Inside the setup_hdr, we expect the magic "HdrS" */ - if (memcmp(&boot.hdr.header, "HdrS", 4) != 0) - errx(1, "This doesn't look like a bzImage to me"); - - /* Skip over the extra sectors of the header. */ - lseek(fd, (boot.hdr.setup_sects+1) * 512, SEEK_SET); - - /* Now read everything into memory. in nice big chunks. */ - while ((r = read(fd, p, 65536)) > 0) - p += r; - - /* Finally, code32_start tells us where to enter the kernel. */ - return boot.hdr.code32_start; -} - -/*L:140 - * Loading the kernel is easy when it's a "vmlinux", but most kernels - * come wrapped up in the self-decompressing "bzImage" format. With a little - * work, we can load those, too. - */ -static unsigned long load_kernel(int fd) -{ - Elf32_Ehdr hdr; - - /* Read in the first few bytes. */ - if (read(fd, &hdr, sizeof(hdr)) != sizeof(hdr)) - err(1, "Reading kernel"); - - /* If it's an ELF file, it starts with "\177ELF" */ - if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0) - return map_elf(fd, &hdr); - - /* Otherwise we assume it's a bzImage, and try to load it. */ - return load_bzimage(fd); -} - -/* - * This is a trivial little helper to align pages. Andi Kleen hated it because - * it calls getpagesize() twice: "it's dumb code." - * - * Kernel guys get really het up about optimization, even when it's not - * necessary. I leave this code as a reaction against that. - */ -static inline unsigned long page_align(unsigned long addr) -{ - /* Add upwards and truncate downwards. */ - return ((addr + getpagesize()-1) & ~(getpagesize()-1)); -} - -/*L:180 - * An "initial ram disk" is a disk image loaded into memory along with the - * kernel which the kernel can use to boot from without needing any drivers. - * Most distributions now use this as standard: the initrd contains the code to - * load the appropriate driver modules for the current machine. - * - * Importantly, James Morris works for RedHat, and Fedora uses initrds for its - * kernels. He sent me this (and tells me when I break it). - */ -static unsigned long load_initrd(const char *name, unsigned long mem) -{ - int ifd; - struct stat st; - unsigned long len; - - ifd = open_or_die(name, O_RDONLY); - /* fstat() is needed to get the file size. */ - if (fstat(ifd, &st) < 0) - err(1, "fstat() on initrd '%s'", name); - - /* - * We map the initrd at the top of memory, but mmap wants it to be - * page-aligned, so we round the size up for that. - */ - len = page_align(st.st_size); - map_at(ifd, from_guest_phys(mem - len), 0, st.st_size); - /* - * Once a file is mapped, you can close the file descriptor. It's a - * little odd, but quite useful. - */ - close(ifd); - verbose("mapped initrd %s size=%lu @ %p\n", name, len, (void*)mem-len); - - /* We return the initrd size. */ - return len; -} -/*:*/ - -/* - * Simple routine to roll all the commandline arguments together with spaces - * between them. - */ -static void concat(char *dst, char *args[]) -{ - unsigned int i, len = 0; - - for (i = 0; args[i]; i++) { - if (i) { - strcat(dst+len, " "); - len++; - } - strcpy(dst+len, args[i]); - len += strlen(args[i]); - } - /* In case it's empty. */ - dst[len] = '\0'; -} - -/*L:185 - * This is where we actually tell the kernel to initialize the Guest. We - * saw the arguments it expects when we looked at initialize() in lguest_user.c: - * the base of Guest "physical" memory, the top physical page to allow and the - * entry point for the Guest. - */ -static void tell_kernel(unsigned long start) -{ - unsigned long args[] = { LHREQ_INITIALIZE, - (unsigned long)guest_base, - guest_limit / getpagesize(), start, - (guest_mmio+getpagesize()-1) / getpagesize() }; - verbose("Guest: %p - %p (%#lx, MMIO %#lx)\n", - guest_base, guest_base + guest_limit, - guest_limit, guest_mmio); - lguest_fd = open_or_die("/dev/lguest", O_RDWR); - if (write(lguest_fd, args, sizeof(args)) < 0) - err(1, "Writing to /dev/lguest"); -} -/*:*/ - -/*L:200 - * Device Handling. - * - * When the Guest gives us a buffer, it sends an array of addresses and sizes. - * We need to make sure it's not trying to reach into the Launcher itself, so - * we have a convenient routine which checks it and exits with an error message - * if something funny is going on: - */ -static void *_check_pointer(struct device *d, - unsigned long addr, unsigned int size, - unsigned int line) -{ - /* - * Check if the requested address and size exceeds the allocated memory, - * or addr + size wraps around. - */ - if ((addr + size) > guest_limit || (addr + size) < addr) - bad_driver(d, "%s:%i: Invalid address %#lx", - __FILE__, line, addr); - /* - * We return a pointer for the caller's convenience, now we know it's - * safe to use. - */ - return from_guest_phys(addr); -} -/* A macro which transparently hands the line number to the real function. */ -#define check_pointer(d,addr,size) _check_pointer(d, addr, size, __LINE__) - -/* - * Each buffer in the virtqueues is actually a chain of descriptors. This - * function returns the next descriptor in the chain, or vq->vring.num if we're - * at the end. - */ -static unsigned next_desc(struct device *d, struct vring_desc *desc, - unsigned int i, unsigned int max) -{ - unsigned int next; - - /* If this descriptor says it doesn't chain, we're done. */ - if (!(desc[i].flags & VRING_DESC_F_NEXT)) - return max; - - /* Check they're not leading us off end of descriptors. */ - next = desc[i].next; - /* Make sure compiler knows to grab that: we don't want it changing! */ - wmb(); - - if (next >= max) - bad_driver(d, "Desc next is %u", next); - - return next; -} - -/* - * This actually sends the interrupt for this virtqueue, if we've used a - * buffer. - */ -static void trigger_irq(struct virtqueue *vq) -{ - unsigned long buf[] = { LHREQ_IRQ, vq->dev->config.irq_line }; - - /* Don't inform them if nothing used. */ - if (!vq->pending_used) - return; - vq->pending_used = 0; - - /* - * 2.4.7.1: - * - * If the VIRTIO_F_EVENT_IDX feature bit is not negotiated: - * The driver MUST set flags to 0 or 1. - */ - if (vq->vring.avail->flags > 1) - bad_driver_vq(vq, "avail->flags = %u\n", vq->vring.avail->flags); - - /* - * 2.4.7.2: - * - * If the VIRTIO_F_EVENT_IDX feature bit is not negotiated: - * - * - The device MUST ignore the used_event value. - * - After the device writes a descriptor index into the used ring: - * - If flags is 1, the device SHOULD NOT send an interrupt. - * - If flags is 0, the device MUST send an interrupt. - */ - if (vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT) { - return; - } - - /* - * 4.1.4.5.1: - * - * If MSI-X capability is disabled, the device MUST set the Queue - * Interrupt bit in ISR status before sending a virtqueue notification - * to the driver. - */ - vq->dev->mmio->isr = 0x1; - - /* Send the Guest an interrupt tell them we used something up. */ - if (write(lguest_fd, buf, sizeof(buf)) != 0) - err(1, "Triggering irq %i", vq->dev->config.irq_line); -} - -/* - * This looks in the virtqueue for the first available buffer, and converts - * it to an iovec for convenient access. Since descriptors consist of some - * number of output then some number of input descriptors, it's actually two - * iovecs, but we pack them into one and note how many of each there were. - * - * This function waits if necessary, and returns the descriptor number found. - */ -static unsigned wait_for_vq_desc(struct virtqueue *vq, - struct iovec iov[], - unsigned int *out_num, unsigned int *in_num) -{ - unsigned int i, head, max; - struct vring_desc *desc; - u16 last_avail = lg_last_avail(vq); - - /* - * 2.4.7.1: - * - * The driver MUST handle spurious interrupts from the device. - * - * That's why this is a while loop. - */ - - /* There's nothing available? */ - while (last_avail == vq->vring.avail->idx) { - u64 event; - - /* - * Since we're about to sleep, now is a good time to tell the - * Guest about what we've used up to now. - */ - trigger_irq(vq); - - /* OK, now we need to know about added descriptors. */ - vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY; - - /* - * They could have slipped one in as we were doing that: make - * sure it's written, then check again. - */ - mb(); - if (last_avail != vq->vring.avail->idx) { - vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY; - break; - } - - /* Nothing new? Wait for eventfd to tell us they refilled. */ - if (read(vq->eventfd, &event, sizeof(event)) != sizeof(event)) - errx(1, "Event read failed?"); - - /* We don't need to be notified again. */ - vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY; - } - - /* Check it isn't doing very strange things with descriptor numbers. */ - if ((u16)(vq->vring.avail->idx - last_avail) > vq->vring.num) - bad_driver_vq(vq, "Guest moved used index from %u to %u", - last_avail, vq->vring.avail->idx); - - /* - * Make sure we read the descriptor number *after* we read the ring - * update; don't let the cpu or compiler change the order. - */ - rmb(); - - /* - * Grab the next descriptor number they're advertising, and increment - * the index we've seen. - */ - head = vq->vring.avail->ring[last_avail % vq->vring.num]; - lg_last_avail(vq)++; - - /* If their number is silly, that's a fatal mistake. */ - if (head >= vq->vring.num) - bad_driver_vq(vq, "Guest says index %u is available", head); - - /* When we start there are none of either input nor output. */ - *out_num = *in_num = 0; - - max = vq->vring.num; - desc = vq->vring.desc; - i = head; - - /* - * We have to read the descriptor after we read the descriptor number, - * but there's a data dependency there so the CPU shouldn't reorder - * that: no rmb() required. - */ - - do { - /* - * If this is an indirect entry, then this buffer contains a - * descriptor table which we handle as if it's any normal - * descriptor chain. - */ - if (desc[i].flags & VRING_DESC_F_INDIRECT) { - /* 2.4.5.3.1: - * - * The driver MUST NOT set the VIRTQ_DESC_F_INDIRECT - * flag unless the VIRTIO_F_INDIRECT_DESC feature was - * negotiated. - */ - if (!(vq->dev->features_accepted & - (1<vring.desc) - bad_driver_vq(vq, "Indirect within indirect"); - - /* - * Proposed update VIRTIO-134 spells this out: - * - * A driver MUST NOT set both VIRTQ_DESC_F_INDIRECT - * and VIRTQ_DESC_F_NEXT in flags. - */ - if (desc[i].flags & VRING_DESC_F_NEXT) - bad_driver_vq(vq, "indirect and next together"); - - if (desc[i].len % sizeof(struct vring_desc)) - bad_driver_vq(vq, - "Invalid size for indirect table"); - /* - * 2.4.5.3.2: - * - * The device MUST ignore the write-only flag - * (flags&VIRTQ_DESC_F_WRITE) in the descriptor that - * refers to an indirect table. - * - * We ignore it here: :) - */ - - max = desc[i].len / sizeof(struct vring_desc); - desc = check_pointer(vq->dev, desc[i].addr, desc[i].len); - i = 0; - - /* 2.4.5.3.1: - * - * A driver MUST NOT create a descriptor chain longer - * than the Queue Size of the device. - */ - if (max > vq->pci_config.queue_size) - bad_driver_vq(vq, - "indirect has too many entries"); - } - - /* Grab the first descriptor, and check it's OK. */ - iov[*out_num + *in_num].iov_len = desc[i].len; - iov[*out_num + *in_num].iov_base - = check_pointer(vq->dev, desc[i].addr, desc[i].len); - /* If this is an input descriptor, increment that count. */ - if (desc[i].flags & VRING_DESC_F_WRITE) - (*in_num)++; - else { - /* - * If it's an output descriptor, they're all supposed - * to come before any input descriptors. - */ - if (*in_num) - bad_driver_vq(vq, - "Descriptor has out after in"); - (*out_num)++; - } - - /* If we've got too many, that implies a descriptor loop. */ - if (*out_num + *in_num > max) - bad_driver_vq(vq, "Looped descriptor"); - } while ((i = next_desc(vq->dev, desc, i, max)) != max); - - return head; -} - -/* - * After we've used one of their buffers, we tell the Guest about it. Sometime - * later we'll want to send them an interrupt using trigger_irq(); note that - * wait_for_vq_desc() does that for us if it has to wait. - */ -static void add_used(struct virtqueue *vq, unsigned int head, int len) -{ - struct vring_used_elem *used; - - /* - * The virtqueue contains a ring of used buffers. Get a pointer to the - * next entry in that used ring. - */ - used = &vq->vring.used->ring[vq->vring.used->idx % vq->vring.num]; - used->id = head; - used->len = len; - /* Make sure buffer is written before we update index. */ - wmb(); - vq->vring.used->idx++; - vq->pending_used++; -} - -/* And here's the combo meal deal. Supersize me! */ -static void add_used_and_trigger(struct virtqueue *vq, unsigned head, int len) -{ - add_used(vq, head, len); - trigger_irq(vq); -} - -/* - * The Console - * - * We associate some data with the console for our exit hack. - */ -struct console_abort { - /* How many times have they hit ^C? */ - int count; - /* When did they start? */ - struct timeval start; -}; - -/* This is the routine which handles console input (ie. stdin). */ -static void console_input(struct virtqueue *vq) -{ - int len; - unsigned int head, in_num, out_num; - struct console_abort *abort = vq->dev->priv; - struct iovec iov[vq->vring.num]; - - /* Make sure there's a descriptor available. */ - head = wait_for_vq_desc(vq, iov, &out_num, &in_num); - if (out_num) - bad_driver_vq(vq, "Output buffers in console in queue?"); - - /* Read into it. This is where we usually wait. */ - len = readv(STDIN_FILENO, iov, in_num); - if (len <= 0) { - /* Ran out of input? */ - warnx("Failed to get console input, ignoring console."); - /* - * For simplicity, dying threads kill the whole Launcher. So - * just nap here. - */ - for (;;) - pause(); - } - - /* Tell the Guest we used a buffer. */ - add_used_and_trigger(vq, head, len); - - /* - * Three ^C within one second? Exit. - * - * This is such a hack, but works surprisingly well. Each ^C has to - * be in a buffer by itself, so they can't be too fast. But we check - * that we get three within about a second, so they can't be too - * slow. - */ - if (len != 1 || ((char *)iov[0].iov_base)[0] != 3) { - abort->count = 0; - return; - } - - abort->count++; - if (abort->count == 1) - gettimeofday(&abort->start, NULL); - else if (abort->count == 3) { - struct timeval now; - gettimeofday(&now, NULL); - /* Kill all Launcher processes with SIGINT, like normal ^C */ - if (now.tv_sec <= abort->start.tv_sec+1) - kill(0, SIGINT); - abort->count = 0; - } -} - -/* This is the routine which handles console output (ie. stdout). */ -static void console_output(struct virtqueue *vq) -{ - unsigned int head, out, in; - struct iovec iov[vq->vring.num]; - - /* We usually wait in here, for the Guest to give us something. */ - head = wait_for_vq_desc(vq, iov, &out, &in); - if (in) - bad_driver_vq(vq, "Input buffers in console output queue?"); - - /* writev can return a partial write, so we loop here. */ - while (!iov_empty(iov, out)) { - int len = writev(STDOUT_FILENO, iov, out); - if (len <= 0) { - warn("Write to stdout gave %i (%d)", len, errno); - break; - } - iov_consume(vq->dev, iov, out, NULL, len); - } - - /* - * We're finished with that buffer: if we're going to sleep, - * wait_for_vq_desc() will prod the Guest with an interrupt. - */ - add_used(vq, head, 0); -} - -/* - * The Network - * - * Handling output for network is also simple: we get all the output buffers - * and write them to /dev/net/tun. - */ -struct net_info { - int tunfd; -}; - -static void net_output(struct virtqueue *vq) -{ - struct net_info *net_info = vq->dev->priv; - unsigned int head, out, in; - struct iovec iov[vq->vring.num]; - - /* We usually wait in here for the Guest to give us a packet. */ - head = wait_for_vq_desc(vq, iov, &out, &in); - if (in) - bad_driver_vq(vq, "Input buffers in net output queue?"); - /* - * Send the whole thing through to /dev/net/tun. It expects the exact - * same format: what a coincidence! - */ - if (writev(net_info->tunfd, iov, out) < 0) - warnx("Write to tun failed (%d)?", errno); - - /* - * Done with that one; wait_for_vq_desc() will send the interrupt if - * all packets are processed. - */ - add_used(vq, head, 0); -} - -/* - * Handling network input is a bit trickier, because I've tried to optimize it. - * - * First we have a helper routine which tells is if from this file descriptor - * (ie. the /dev/net/tun device) will block: - */ -static bool will_block(int fd) -{ - fd_set fdset; - struct timeval zero = { 0, 0 }; - FD_ZERO(&fdset); - FD_SET(fd, &fdset); - return select(fd+1, &fdset, NULL, NULL, &zero) != 1; -} - -/* - * This handles packets coming in from the tun device to our Guest. Like all - * service routines, it gets called again as soon as it returns, so you don't - * see a while(1) loop here. - */ -static void net_input(struct virtqueue *vq) -{ - int len; - unsigned int head, out, in; - struct iovec iov[vq->vring.num]; - struct net_info *net_info = vq->dev->priv; - - /* - * Get a descriptor to write an incoming packet into. This will also - * send an interrupt if they're out of descriptors. - */ - head = wait_for_vq_desc(vq, iov, &out, &in); - if (out) - bad_driver_vq(vq, "Output buffers in net input queue?"); - - /* - * If it looks like we'll block reading from the tun device, send them - * an interrupt. - */ - if (vq->pending_used && will_block(net_info->tunfd)) - trigger_irq(vq); - - /* - * Read in the packet. This is where we normally wait (when there's no - * incoming network traffic). - */ - len = readv(net_info->tunfd, iov, in); - if (len <= 0) - warn("Failed to read from tun (%d).", errno); - - /* - * Mark that packet buffer as used, but don't interrupt here. We want - * to wait until we've done as much work as we can. - */ - add_used(vq, head, len); -} -/*:*/ - -/* This is the helper to create threads: run the service routine in a loop. */ -static int do_thread(void *_vq) -{ - struct virtqueue *vq = _vq; - - for (;;) - vq->service(vq); - return 0; -} - -/* - * When a child dies, we kill our entire process group with SIGTERM. This - * also has the side effect that the shell restores the console for us! - */ -static void kill_launcher(int signal) -{ - kill(0, SIGTERM); -} - -static void reset_vq_pci_config(struct virtqueue *vq) -{ - vq->pci_config.queue_size = VIRTQUEUE_NUM; - vq->pci_config.queue_enable = 0; -} - -static void reset_device(struct device *dev) -{ - struct virtqueue *vq; - - verbose("Resetting device %s\n", dev->name); - - /* Clear any features they've acked. */ - dev->features_accepted = 0; - - /* We're going to be explicitly killing threads, so ignore them. */ - signal(SIGCHLD, SIG_IGN); - - /* - * 4.1.4.3.1: - * - * The device MUST present a 0 in queue_enable on reset. - * - * This means we set it here, and reset the saved ones in every vq. - */ - dev->mmio->cfg.queue_enable = 0; - - /* Get rid of the virtqueue threads */ - for (vq = dev->vq; vq; vq = vq->next) { - vq->last_avail_idx = 0; - reset_vq_pci_config(vq); - if (vq->thread != (pid_t)-1) { - kill(vq->thread, SIGTERM); - waitpid(vq->thread, NULL, 0); - vq->thread = (pid_t)-1; - } - } - dev->running = false; - dev->wrote_features_ok = false; - - /* Now we care if threads die. */ - signal(SIGCHLD, (void *)kill_launcher); -} - -static void cleanup_devices(void) -{ - unsigned int i; - - for (i = 1; i < MAX_PCI_DEVICES; i++) { - struct device *d = devices.pci[i]; - if (!d) - continue; - reset_device(d); - } - - /* If we saved off the original terminal settings, restore them now. */ - if (orig_term.c_lflag & (ISIG|ICANON|ECHO)) - tcsetattr(STDIN_FILENO, TCSANOW, &orig_term); -} - -/*L:217 - * We do PCI. This is mainly done to let us test the kernel virtio PCI - * code. - */ - -/* Linux expects a PCI host bridge: ours is a dummy, and first on the bus. */ -static struct device pci_host_bridge; - -static void init_pci_host_bridge(void) -{ - pci_host_bridge.name = "PCI Host Bridge"; - pci_host_bridge.config.class = 0x06; /* bridge */ - pci_host_bridge.config.subclass = 0; /* host bridge */ - devices.pci[0] = &pci_host_bridge; -} - -/* The IO ports used to read the PCI config space. */ -#define PCI_CONFIG_ADDR 0xCF8 -#define PCI_CONFIG_DATA 0xCFC - -/* - * Not really portable, but does help readability: this is what the Guest - * writes to the PCI_CONFIG_ADDR IO port. - */ -union pci_config_addr { - struct { - unsigned mbz: 2; - unsigned offset: 6; - unsigned funcnum: 3; - unsigned devnum: 5; - unsigned busnum: 8; - unsigned reserved: 7; - unsigned enabled : 1; - } bits; - u32 val; -}; - -/* - * We cache what they wrote to the address port, so we know what they're - * talking about when they access the data port. - */ -static union pci_config_addr pci_config_addr; - -static struct device *find_pci_device(unsigned int index) -{ - return devices.pci[index]; -} - -/* PCI can do 1, 2 and 4 byte reads; we handle that here. */ -static void ioread(u16 off, u32 v, u32 mask, u32 *val) -{ - assert(off < 4); - assert(mask == 0xFF || mask == 0xFFFF || mask == 0xFFFFFFFF); - *val = (v >> (off * 8)) & mask; -} - -/* PCI can do 1, 2 and 4 byte writes; we handle that here. */ -static void iowrite(u16 off, u32 v, u32 mask, u32 *dst) -{ - assert(off < 4); - assert(mask == 0xFF || mask == 0xFFFF || mask == 0xFFFFFFFF); - *dst &= ~(mask << (off * 8)); - *dst |= (v & mask) << (off * 8); -} - -/* - * Where PCI_CONFIG_DATA accesses depends on the previous write to - * PCI_CONFIG_ADDR. - */ -static struct device *dev_and_reg(u32 *reg) -{ - if (!pci_config_addr.bits.enabled) - return NULL; - - if (pci_config_addr.bits.funcnum != 0) - return NULL; - - if (pci_config_addr.bits.busnum != 0) - return NULL; - - if (pci_config_addr.bits.offset * 4 >= sizeof(struct pci_config)) - return NULL; - - *reg = pci_config_addr.bits.offset; - return find_pci_device(pci_config_addr.bits.devnum); -} - -/* - * We can get invalid combinations of values while they're writing, so we - * only fault if they try to write with some invalid bar/offset/length. - */ -static bool valid_bar_access(struct device *d, - struct virtio_pci_cfg_cap_u32 *cfg_access) -{ - /* We only have 1 bar (BAR0) */ - if (cfg_access->cap.bar != 0) - return false; - - /* Check it's within BAR0. */ - if (cfg_access->cap.offset >= d->mmio_size - || cfg_access->cap.offset + cfg_access->cap.length > d->mmio_size) - return false; - - /* Check length is 1, 2 or 4. */ - if (cfg_access->cap.length != 1 - && cfg_access->cap.length != 2 - && cfg_access->cap.length != 4) - return false; - - /* - * 4.1.4.7.2: - * - * The driver MUST NOT write a cap.offset which is not a multiple of - * cap.length (ie. all accesses MUST be aligned). - */ - if (cfg_access->cap.offset % cfg_access->cap.length != 0) - return false; - - /* Return pointer into word in BAR0. */ - return true; -} - -/* Is this accessing the PCI config address port?. */ -static bool is_pci_addr_port(u16 port) -{ - return port >= PCI_CONFIG_ADDR && port < PCI_CONFIG_ADDR + 4; -} - -static bool pci_addr_iowrite(u16 port, u32 mask, u32 val) -{ - iowrite(port - PCI_CONFIG_ADDR, val, mask, - &pci_config_addr.val); - verbose("PCI%s: %#x/%x: bus %u dev %u func %u reg %u\n", - pci_config_addr.bits.enabled ? "" : " DISABLED", - val, mask, - pci_config_addr.bits.busnum, - pci_config_addr.bits.devnum, - pci_config_addr.bits.funcnum, - pci_config_addr.bits.offset); - return true; -} - -static void pci_addr_ioread(u16 port, u32 mask, u32 *val) -{ - ioread(port - PCI_CONFIG_ADDR, pci_config_addr.val, mask, val); -} - -/* Is this accessing the PCI config data port?. */ -static bool is_pci_data_port(u16 port) -{ - return port >= PCI_CONFIG_DATA && port < PCI_CONFIG_DATA + 4; -} - -static void emulate_mmio_write(struct device *d, u32 off, u32 val, u32 mask); - -static bool pci_data_iowrite(u16 port, u32 mask, u32 val) -{ - u32 reg, portoff; - struct device *d = dev_and_reg(®); - - /* Complain if they don't belong to a device. */ - if (!d) - return false; - - /* They can do 1 byte writes, etc. */ - portoff = port - PCI_CONFIG_DATA; - - /* - * PCI uses a weird way to determine the BAR size: the OS - * writes all 1's, and sees which ones stick. - */ - if (&d->config_words[reg] == &d->config.bar[0]) { - int i; - - iowrite(portoff, val, mask, &d->config.bar[0]); - for (i = 0; (1 << i) < d->mmio_size; i++) - d->config.bar[0] &= ~(1 << i); - return true; - } else if ((&d->config_words[reg] > &d->config.bar[0] - && &d->config_words[reg] <= &d->config.bar[6]) - || &d->config_words[reg] == &d->config.expansion_rom_addr) { - /* Allow writing to any other BAR, or expansion ROM */ - iowrite(portoff, val, mask, &d->config_words[reg]); - return true; - /* We let them override latency timer and cacheline size */ - } else if (&d->config_words[reg] == (void *)&d->config.cacheline_size) { - /* Only let them change the first two fields. */ - if (mask == 0xFFFFFFFF) - mask = 0xFFFF; - iowrite(portoff, val, mask, &d->config_words[reg]); - return true; - } else if (&d->config_words[reg] == (void *)&d->config.command - && mask == 0xFFFF) { - /* Ignore command writes. */ - return true; - } else if (&d->config_words[reg] - == (void *)&d->config.cfg_access.cap.bar - || &d->config_words[reg] - == &d->config.cfg_access.cap.length - || &d->config_words[reg] - == &d->config.cfg_access.cap.offset) { - - /* - * The VIRTIO_PCI_CAP_PCI_CFG capability - * provides a backdoor to access the MMIO - * regions without mapping them. Weird, but - * useful. - */ - iowrite(portoff, val, mask, &d->config_words[reg]); - return true; - } else if (&d->config_words[reg] == &d->config.cfg_access.pci_cfg_data) { - u32 write_mask; - - /* - * 4.1.4.7.1: - * - * Upon detecting driver write access to pci_cfg_data, the - * device MUST execute a write access at offset cap.offset at - * BAR selected by cap.bar using the first cap.length bytes - * from pci_cfg_data. - */ - - /* Must be bar 0 */ - if (!valid_bar_access(d, &d->config.cfg_access)) - return false; - - iowrite(portoff, val, mask, &d->config.cfg_access.pci_cfg_data); - - /* - * Now emulate a write. The mask we use is set by - * len, *not* this write! - */ - write_mask = (1ULL<<(8*d->config.cfg_access.cap.length)) - 1; - verbose("Window writing %#x/%#x to bar %u, offset %u len %u\n", - d->config.cfg_access.pci_cfg_data, write_mask, - d->config.cfg_access.cap.bar, - d->config.cfg_access.cap.offset, - d->config.cfg_access.cap.length); - - emulate_mmio_write(d, d->config.cfg_access.cap.offset, - d->config.cfg_access.pci_cfg_data, - write_mask); - return true; - } - - /* - * 4.1.4.1: - * - * The driver MUST NOT write into any field of the capability - * structure, with the exception of those with cap_type - * VIRTIO_PCI_CAP_PCI_CFG... - */ - return false; -} - -static u32 emulate_mmio_read(struct device *d, u32 off, u32 mask); - -static void pci_data_ioread(u16 port, u32 mask, u32 *val) -{ - u32 reg; - struct device *d = dev_and_reg(®); - - if (!d) - return; - - /* Read through the PCI MMIO access window is special */ - if (&d->config_words[reg] == &d->config.cfg_access.pci_cfg_data) { - u32 read_mask; - - /* - * 4.1.4.7.1: - * - * Upon detecting driver read access to pci_cfg_data, the - * device MUST execute a read access of length cap.length at - * offset cap.offset at BAR selected by cap.bar and store the - * first cap.length bytes in pci_cfg_data. - */ - /* Must be bar 0 */ - if (!valid_bar_access(d, &d->config.cfg_access)) - bad_driver(d, - "Invalid cfg_access to bar%u, offset %u len %u", - d->config.cfg_access.cap.bar, - d->config.cfg_access.cap.offset, - d->config.cfg_access.cap.length); - - /* - * Read into the window. The mask we use is set by - * len, *not* this read! - */ - read_mask = (1ULL<<(8*d->config.cfg_access.cap.length))-1; - d->config.cfg_access.pci_cfg_data - = emulate_mmio_read(d, - d->config.cfg_access.cap.offset, - read_mask); - verbose("Window read %#x/%#x from bar %u, offset %u len %u\n", - d->config.cfg_access.pci_cfg_data, read_mask, - d->config.cfg_access.cap.bar, - d->config.cfg_access.cap.offset, - d->config.cfg_access.cap.length); - } - ioread(port - PCI_CONFIG_DATA, d->config_words[reg], mask, val); -} - -/*L:216 - * This is where we emulate a handful of Guest instructions. It's ugly - * and we used to do it in the kernel but it grew over time. - */ - -/* - * We use the ptrace syscall's pt_regs struct to talk about registers - * to lguest: these macros convert the names to the offsets. - */ -#define getreg(name) getreg_off(offsetof(struct user_regs_struct, name)) -#define setreg(name, val) \ - setreg_off(offsetof(struct user_regs_struct, name), (val)) - -static u32 getreg_off(size_t offset) -{ - u32 r; - unsigned long args[] = { LHREQ_GETREG, offset }; - - if (pwrite(lguest_fd, args, sizeof(args), cpu_id) < 0) - err(1, "Getting register %u", offset); - if (pread(lguest_fd, &r, sizeof(r), cpu_id) != sizeof(r)) - err(1, "Reading register %u", offset); - - return r; -} - -static void setreg_off(size_t offset, u32 val) -{ - unsigned long args[] = { LHREQ_SETREG, offset, val }; - - if (pwrite(lguest_fd, args, sizeof(args), cpu_id) < 0) - err(1, "Setting register %u", offset); -} - -/* Get register by instruction encoding */ -static u32 getreg_num(unsigned regnum, u32 mask) -{ - /* 8 bit ops use regnums 4-7 for high parts of word */ - if (mask == 0xFF && (regnum & 0x4)) - return getreg_num(regnum & 0x3, 0xFFFF) >> 8; - - switch (regnum) { - case 0: return getreg(eax) & mask; - case 1: return getreg(ecx) & mask; - case 2: return getreg(edx) & mask; - case 3: return getreg(ebx) & mask; - case 4: return getreg(esp) & mask; - case 5: return getreg(ebp) & mask; - case 6: return getreg(esi) & mask; - case 7: return getreg(edi) & mask; - } - abort(); -} - -/* Set register by instruction encoding */ -static void setreg_num(unsigned regnum, u32 val, u32 mask) -{ - /* Don't try to set bits out of range */ - assert(~(val & ~mask)); - - /* 8 bit ops use regnums 4-7 for high parts of word */ - if (mask == 0xFF && (regnum & 0x4)) { - /* Construct the 16 bits we want. */ - val = (val << 8) | getreg_num(regnum & 0x3, 0xFF); - setreg_num(regnum & 0x3, val, 0xFFFF); - return; - } - - switch (regnum) { - case 0: setreg(eax, val | (getreg(eax) & ~mask)); return; - case 1: setreg(ecx, val | (getreg(ecx) & ~mask)); return; - case 2: setreg(edx, val | (getreg(edx) & ~mask)); return; - case 3: setreg(ebx, val | (getreg(ebx) & ~mask)); return; - case 4: setreg(esp, val | (getreg(esp) & ~mask)); return; - case 5: setreg(ebp, val | (getreg(ebp) & ~mask)); return; - case 6: setreg(esi, val | (getreg(esi) & ~mask)); return; - case 7: setreg(edi, val | (getreg(edi) & ~mask)); return; - } - abort(); -} - -/* Get bytes of displacement appended to instruction, from r/m encoding */ -static u32 insn_displacement_len(u8 mod_reg_rm) -{ - /* Switch on the mod bits */ - switch (mod_reg_rm >> 6) { - case 0: - /* If mod == 0, and r/m == 101, 16-bit displacement follows */ - if ((mod_reg_rm & 0x7) == 0x5) - return 2; - /* Normally, mod == 0 means no literal displacement */ - return 0; - case 1: - /* One byte displacement */ - return 1; - case 2: - /* Four byte displacement */ - return 4; - case 3: - /* Register mode */ - return 0; - } - abort(); -} - -static void emulate_insn(const u8 insn[]) -{ - unsigned long args[] = { LHREQ_TRAP, 13 }; - unsigned int insnlen = 0, in = 0, small_operand = 0, byte_access; - unsigned int eax, port, mask; - /* - * Default is to return all-ones on IO port reads, which traditionally - * means "there's nothing there". - */ - u32 val = 0xFFFFFFFF; - - /* - * This must be the Guest kernel trying to do something, not userspace! - * The bottom two bits of the CS segment register are the privilege - * level. - */ - if ((getreg(xcs) & 3) != 0x1) - goto no_emulate; - - /* Decoding x86 instructions is icky. */ - - /* - * Around 2.6.33, the kernel started using an emulation for the - * cmpxchg8b instruction in early boot on many configurations. This - * code isn't paravirtualized, and it tries to disable interrupts. - * Ignore it, which will Mostly Work. - */ - if (insn[insnlen] == 0xfa) { - /* "cli", or Clear Interrupt Enable instruction. Skip it. */ - insnlen = 1; - goto skip_insn; - } - - /* - * 0x66 is an "operand prefix". It means a 16, not 32 bit in/out. - */ - if (insn[insnlen] == 0x66) { - small_operand = 1; - /* The instruction is 1 byte so far, read the next byte. */ - insnlen = 1; - } - - /* If the lower bit isn't set, it's a single byte access */ - byte_access = !(insn[insnlen] & 1); - - /* - * Now we can ignore the lower bit and decode the 4 opcodes - * we need to emulate. - */ - switch (insn[insnlen] & 0xFE) { - case 0xE4: /* in ,%al */ - port = insn[insnlen+1]; - insnlen += 2; - in = 1; - break; - case 0xEC: /* in (%dx),%al */ - port = getreg(edx) & 0xFFFF; - insnlen += 1; - in = 1; - break; - case 0xE6: /* out %al, */ - port = insn[insnlen+1]; - insnlen += 2; - break; - case 0xEE: /* out %al,(%dx) */ - port = getreg(edx) & 0xFFFF; - insnlen += 1; - break; - default: - /* OK, we don't know what this is, can't emulate. */ - goto no_emulate; - } - - /* Set a mask of the 1, 2 or 4 bytes, depending on size of IO */ - if (byte_access) - mask = 0xFF; - else if (small_operand) - mask = 0xFFFF; - else - mask = 0xFFFFFFFF; - - /* - * If it was an "IN" instruction, they expect the result to be read - * into %eax, so we change %eax. - */ - eax = getreg(eax); - - if (in) { - /* This is the PS/2 keyboard status; 1 means ready for output */ - if (port == 0x64) - val = 1; - else if (is_pci_addr_port(port)) - pci_addr_ioread(port, mask, &val); - else if (is_pci_data_port(port)) - pci_data_ioread(port, mask, &val); - - /* Clear the bits we're about to read */ - eax &= ~mask; - /* Copy bits in from val. */ - eax |= val & mask; - /* Now update the register. */ - setreg(eax, eax); - } else { - if (is_pci_addr_port(port)) { - if (!pci_addr_iowrite(port, mask, eax)) - goto bad_io; - } else if (is_pci_data_port(port)) { - if (!pci_data_iowrite(port, mask, eax)) - goto bad_io; - } - /* There are many other ports, eg. CMOS clock, serial - * and parallel ports, so we ignore them all. */ - } - - verbose("IO %s of %x to %u: %#08x\n", - in ? "IN" : "OUT", mask, port, eax); -skip_insn: - /* Finally, we've "done" the instruction, so move past it. */ - setreg(eip, getreg(eip) + insnlen); - return; - -bad_io: - warnx("Attempt to %s port %u (%#x mask)", - in ? "read from" : "write to", port, mask); - -no_emulate: - /* Inject trap into Guest. */ - if (write(lguest_fd, args, sizeof(args)) < 0) - err(1, "Reinjecting trap 13 for fault at %#x", getreg(eip)); -} - -static struct device *find_mmio_region(unsigned long paddr, u32 *off) -{ - unsigned int i; - - for (i = 1; i < MAX_PCI_DEVICES; i++) { - struct device *d = devices.pci[i]; - - if (!d) - continue; - if (paddr < d->mmio_addr) - continue; - if (paddr >= d->mmio_addr + d->mmio_size) - continue; - *off = paddr - d->mmio_addr; - return d; - } - return NULL; -} - -/* FIXME: Use vq array. */ -static struct virtqueue *vq_by_num(struct device *d, u32 num) -{ - struct virtqueue *vq = d->vq; - - while (num-- && vq) - vq = vq->next; - - return vq; -} - -static void save_vq_config(const struct virtio_pci_common_cfg *cfg, - struct virtqueue *vq) -{ - vq->pci_config = *cfg; -} - -static void restore_vq_config(struct virtio_pci_common_cfg *cfg, - struct virtqueue *vq) -{ - /* Only restore the per-vq part */ - size_t off = offsetof(struct virtio_pci_common_cfg, queue_size); - - memcpy((void *)cfg + off, (void *)&vq->pci_config + off, - sizeof(*cfg) - off); -} - -/* - * 4.1.4.3.2: - * - * The driver MUST configure the other virtqueue fields before - * enabling the virtqueue with queue_enable. - * - * When they enable the virtqueue, we check that their setup is valid. - */ -static void check_virtqueue(struct device *d, struct virtqueue *vq) -{ - /* Because lguest is 32 bit, all the descriptor high bits must be 0 */ - if (vq->pci_config.queue_desc_hi - || vq->pci_config.queue_avail_hi - || vq->pci_config.queue_used_hi) - bad_driver_vq(vq, "invalid 64-bit queue address"); - - /* - * 2.4.1: - * - * The driver MUST ensure that the physical address of the first byte - * of each virtqueue part is a multiple of the specified alignment - * value in the above table. - */ - if (vq->pci_config.queue_desc_lo % 16 - || vq->pci_config.queue_avail_lo % 2 - || vq->pci_config.queue_used_lo % 4) - bad_driver_vq(vq, "invalid alignment in queue addresses"); - - /* Initialize the virtqueue and check they're all in range. */ - vq->vring.num = vq->pci_config.queue_size; - vq->vring.desc = check_pointer(vq->dev, - vq->pci_config.queue_desc_lo, - sizeof(*vq->vring.desc) * vq->vring.num); - vq->vring.avail = check_pointer(vq->dev, - vq->pci_config.queue_avail_lo, - sizeof(*vq->vring.avail) - + (sizeof(vq->vring.avail->ring[0]) - * vq->vring.num)); - vq->vring.used = check_pointer(vq->dev, - vq->pci_config.queue_used_lo, - sizeof(*vq->vring.used) - + (sizeof(vq->vring.used->ring[0]) - * vq->vring.num)); - - /* - * 2.4.9.1: - * - * The driver MUST initialize flags in the used ring to 0 - * when allocating the used ring. - */ - if (vq->vring.used->flags != 0) - bad_driver_vq(vq, "invalid initial used.flags %#x", - vq->vring.used->flags); -} - -static void start_virtqueue(struct virtqueue *vq) -{ - /* - * Create stack for thread. Since the stack grows upwards, we point - * the stack pointer to the end of this region. - */ - char *stack = malloc(32768); - - /* Create a zero-initialized eventfd. */ - vq->eventfd = eventfd(0, 0); - if (vq->eventfd < 0) - err(1, "Creating eventfd"); - - /* - * CLONE_VM: because it has to access the Guest memory, and SIGCHLD so - * we get a signal if it dies. - */ - vq->thread = clone(do_thread, stack + 32768, CLONE_VM | SIGCHLD, vq); - if (vq->thread == (pid_t)-1) - err(1, "Creating clone"); -} - -static void start_virtqueues(struct device *d) -{ - struct virtqueue *vq; - - for (vq = d->vq; vq; vq = vq->next) { - if (vq->pci_config.queue_enable) - start_virtqueue(vq); - } -} - -static void emulate_mmio_write(struct device *d, u32 off, u32 val, u32 mask) -{ - struct virtqueue *vq; - - switch (off) { - case offsetof(struct virtio_pci_mmio, cfg.device_feature_select): - /* - * 4.1.4.3.1: - * - * The device MUST present the feature bits it is offering in - * device_feature, starting at bit device_feature_select ∗ 32 - * for any device_feature_select written by the driver - */ - if (val == 0) - d->mmio->cfg.device_feature = d->features; - else if (val == 1) - d->mmio->cfg.device_feature = (d->features >> 32); - else - d->mmio->cfg.device_feature = 0; - goto feature_write_through32; - case offsetof(struct virtio_pci_mmio, cfg.guest_feature_select): - if (val > 1) - bad_driver(d, "Unexpected driver select %u", val); - goto feature_write_through32; - case offsetof(struct virtio_pci_mmio, cfg.guest_feature): - if (d->mmio->cfg.guest_feature_select == 0) { - d->features_accepted &= ~((u64)0xFFFFFFFF); - d->features_accepted |= val; - } else { - assert(d->mmio->cfg.guest_feature_select == 1); - d->features_accepted &= 0xFFFFFFFF; - d->features_accepted |= ((u64)val) << 32; - } - /* - * 2.2.1: - * - * The driver MUST NOT accept a feature which the device did - * not offer - */ - if (d->features_accepted & ~d->features) - bad_driver(d, "over-accepted features %#llx of %#llx", - d->features_accepted, d->features); - goto feature_write_through32; - case offsetof(struct virtio_pci_mmio, cfg.device_status): { - u8 prev; - - verbose("%s: device status -> %#x\n", d->name, val); - /* - * 4.1.4.3.1: - * - * The device MUST reset when 0 is written to device_status, - * and present a 0 in device_status once that is done. - */ - if (val == 0) { - reset_device(d); - goto write_through8; - } - - /* 2.1.1: The driver MUST NOT clear a device status bit. */ - if (d->mmio->cfg.device_status & ~val) - bad_driver(d, "unset of device status bit %#x -> %#x", - d->mmio->cfg.device_status, val); - - /* - * 2.1.2: - * - * The device MUST NOT consume buffers or notify the driver - * before DRIVER_OK. - */ - if (val & VIRTIO_CONFIG_S_DRIVER_OK - && !(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER_OK)) - start_virtqueues(d); - - /* - * 3.1.1: - * - * The driver MUST follow this sequence to initialize a device: - * - Reset the device. - * - Set the ACKNOWLEDGE status bit: the guest OS has - * notice the device. - * - Set the DRIVER status bit: the guest OS knows how - * to drive the device. - * - Read device feature bits, and write the subset - * of feature bits understood by the OS and driver - * to the device. During this step the driver MAY - * read (but MUST NOT write) the device-specific - * configuration fields to check that it can - * support the device before accepting it. - * - Set the FEATURES_OK status bit. The driver - * MUST not accept new feature bits after this - * step. - * - Re-read device status to ensure the FEATURES_OK - * bit is still set: otherwise, the device does - * not support our subset of features and the - * device is unusable. - * - Perform device-specific setup, including - * discovery of virtqueues for the device, - * optional per-bus setup, reading and possibly - * writing the device’s virtio configuration - * space, and population of virtqueues. - * - Set the DRIVER_OK status bit. At this point the - * device is “live”. - */ - prev = 0; - switch (val & ~d->mmio->cfg.device_status) { - case VIRTIO_CONFIG_S_DRIVER_OK: - prev |= VIRTIO_CONFIG_S_FEATURES_OK; /* fall thru */ - case VIRTIO_CONFIG_S_FEATURES_OK: - prev |= VIRTIO_CONFIG_S_DRIVER; /* fall thru */ - case VIRTIO_CONFIG_S_DRIVER: - prev |= VIRTIO_CONFIG_S_ACKNOWLEDGE; /* fall thru */ - case VIRTIO_CONFIG_S_ACKNOWLEDGE: - break; - default: - bad_driver(d, "unknown device status bit %#x -> %#x", - d->mmio->cfg.device_status, val); - } - if (d->mmio->cfg.device_status != prev) - bad_driver(d, "unexpected status transition %#x -> %#x", - d->mmio->cfg.device_status, val); - - /* If they just wrote FEATURES_OK, we make sure they read */ - switch (val & ~d->mmio->cfg.device_status) { - case VIRTIO_CONFIG_S_FEATURES_OK: - d->wrote_features_ok = true; - break; - case VIRTIO_CONFIG_S_DRIVER_OK: - if (d->wrote_features_ok) - bad_driver(d, "did not re-read FEATURES_OK"); - break; - } - goto write_through8; - } - case offsetof(struct virtio_pci_mmio, cfg.queue_select): - vq = vq_by_num(d, val); - /* - * 4.1.4.3.1: - * - * The device MUST present a 0 in queue_size if the virtqueue - * corresponding to the current queue_select is unavailable. - */ - if (!vq) { - d->mmio->cfg.queue_size = 0; - goto write_through16; - } - /* Save registers for old vq, if it was a valid vq */ - if (d->mmio->cfg.queue_size) - save_vq_config(&d->mmio->cfg, - vq_by_num(d, d->mmio->cfg.queue_select)); - /* Restore the registers for the queue they asked for */ - restore_vq_config(&d->mmio->cfg, vq); - goto write_through16; - case offsetof(struct virtio_pci_mmio, cfg.queue_size): - /* - * 4.1.4.3.2: - * - * The driver MUST NOT write a value which is not a power of 2 - * to queue_size. - */ - if (val & (val-1)) - bad_driver(d, "invalid queue size %u", val); - if (d->mmio->cfg.queue_enable) - bad_driver(d, "changing queue size on live device"); - goto write_through16; - case offsetof(struct virtio_pci_mmio, cfg.queue_msix_vector): - bad_driver(d, "attempt to set MSIX vector to %u", val); - case offsetof(struct virtio_pci_mmio, cfg.queue_enable): { - struct virtqueue *vq = vq_by_num(d, d->mmio->cfg.queue_select); - - /* - * 4.1.4.3.2: - * - * The driver MUST NOT write a 0 to queue_enable. - */ - if (val != 1) - bad_driver(d, "setting queue_enable to %u", val); - - /* - * 3.1.1: - * - * 7. Perform device-specific setup, including discovery of - * virtqueues for the device, optional per-bus setup, - * reading and possibly writing the device’s virtio - * configuration space, and population of virtqueues. - * 8. Set the DRIVER_OK status bit. - * - * All our devices require all virtqueues to be enabled, so - * they should have done that before setting DRIVER_OK. - */ - if (d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER_OK) - bad_driver(d, "enabling vq after DRIVER_OK"); - - d->mmio->cfg.queue_enable = val; - save_vq_config(&d->mmio->cfg, vq); - check_virtqueue(d, vq); - goto write_through16; - } - case offsetof(struct virtio_pci_mmio, cfg.queue_notify_off): - bad_driver(d, "attempt to write to queue_notify_off"); - case offsetof(struct virtio_pci_mmio, cfg.queue_desc_lo): - case offsetof(struct virtio_pci_mmio, cfg.queue_desc_hi): - case offsetof(struct virtio_pci_mmio, cfg.queue_avail_lo): - case offsetof(struct virtio_pci_mmio, cfg.queue_avail_hi): - case offsetof(struct virtio_pci_mmio, cfg.queue_used_lo): - case offsetof(struct virtio_pci_mmio, cfg.queue_used_hi): - /* - * 4.1.4.3.2: - * - * The driver MUST configure the other virtqueue fields before - * enabling the virtqueue with queue_enable. - */ - if (d->mmio->cfg.queue_enable) - bad_driver(d, "changing queue on live device"); - - /* - * 3.1.1: - * - * The driver MUST follow this sequence to initialize a device: - *... - * 5. Set the FEATURES_OK status bit. The driver MUST not - * accept new feature bits after this step. - */ - if (!(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_FEATURES_OK)) - bad_driver(d, "setting up vq before FEATURES_OK"); - - /* - * 6. Re-read device status to ensure the FEATURES_OK bit is - * still set... - */ - if (d->wrote_features_ok) - bad_driver(d, "didn't re-read FEATURES_OK before setup"); - - goto write_through32; - case offsetof(struct virtio_pci_mmio, notify): - vq = vq_by_num(d, val); - if (!vq) - bad_driver(d, "Invalid vq notification on %u", val); - /* Notify the process handling this vq by adding 1 to eventfd */ - write(vq->eventfd, "\1\0\0\0\0\0\0\0", 8); - goto write_through16; - case offsetof(struct virtio_pci_mmio, isr): - bad_driver(d, "Unexpected write to isr"); - /* Weird corner case: write to emerg_wr of console */ - case sizeof(struct virtio_pci_mmio) - + offsetof(struct virtio_console_config, emerg_wr): - if (strcmp(d->name, "console") == 0) { - char c = val; - write(STDOUT_FILENO, &c, 1); - goto write_through32; - } - /* Fall through... */ - default: - /* - * 4.1.4.3.2: - * - * The driver MUST NOT write to device_feature, num_queues, - * config_generation or queue_notify_off. - */ - bad_driver(d, "Unexpected write to offset %u", off); - } - -feature_write_through32: - /* - * 3.1.1: - * - * The driver MUST follow this sequence to initialize a device: - *... - * - Set the DRIVER status bit: the guest OS knows how - * to drive the device. - * - Read device feature bits, and write the subset - * of feature bits understood by the OS and driver - * to the device. - *... - * - Set the FEATURES_OK status bit. The driver MUST not - * accept new feature bits after this step. - */ - if (!(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER)) - bad_driver(d, "feature write before VIRTIO_CONFIG_S_DRIVER"); - if (d->mmio->cfg.device_status & VIRTIO_CONFIG_S_FEATURES_OK) - bad_driver(d, "feature write after VIRTIO_CONFIG_S_FEATURES_OK"); - - /* - * 4.1.3.1: - * - * The driver MUST access each field using the “natural” access - * method, i.e. 32-bit accesses for 32-bit fields, 16-bit accesses for - * 16-bit fields and 8-bit accesses for 8-bit fields. - */ -write_through32: - if (mask != 0xFFFFFFFF) { - bad_driver(d, "non-32-bit write to offset %u (%#x)", - off, getreg(eip)); - return; - } - memcpy((char *)d->mmio + off, &val, 4); - return; - -write_through16: - if (mask != 0xFFFF) - bad_driver(d, "non-16-bit write to offset %u (%#x)", - off, getreg(eip)); - memcpy((char *)d->mmio + off, &val, 2); - return; - -write_through8: - if (mask != 0xFF) - bad_driver(d, "non-8-bit write to offset %u (%#x)", - off, getreg(eip)); - memcpy((char *)d->mmio + off, &val, 1); - return; -} - -static u32 emulate_mmio_read(struct device *d, u32 off, u32 mask) -{ - u8 isr; - u32 val = 0; - - switch (off) { - case offsetof(struct virtio_pci_mmio, cfg.device_feature_select): - case offsetof(struct virtio_pci_mmio, cfg.device_feature): - case offsetof(struct virtio_pci_mmio, cfg.guest_feature_select): - case offsetof(struct virtio_pci_mmio, cfg.guest_feature): - /* - * 3.1.1: - * - * The driver MUST follow this sequence to initialize a device: - *... - * - Set the DRIVER status bit: the guest OS knows how - * to drive the device. - * - Read device feature bits, and write the subset - * of feature bits understood by the OS and driver - * to the device. - */ - if (!(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER)) - bad_driver(d, - "feature read before VIRTIO_CONFIG_S_DRIVER"); - goto read_through32; - case offsetof(struct virtio_pci_mmio, cfg.msix_config): - bad_driver(d, "read of msix_config"); - case offsetof(struct virtio_pci_mmio, cfg.num_queues): - goto read_through16; - case offsetof(struct virtio_pci_mmio, cfg.device_status): - /* As they did read, any write of FEATURES_OK is now fine. */ - d->wrote_features_ok = false; - goto read_through8; - case offsetof(struct virtio_pci_mmio, cfg.config_generation): - /* - * 4.1.4.3.1: - * - * The device MUST present a changed config_generation after - * the driver has read a device-specific configuration value - * which has changed since any part of the device-specific - * configuration was last read. - * - * This is simple: none of our devices change config, so this - * is always 0. - */ - goto read_through8; - case offsetof(struct virtio_pci_mmio, notify): - /* - * 3.1.1: - * - * The driver MUST NOT notify the device before setting - * DRIVER_OK. - */ - if (!(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER_OK)) - bad_driver(d, "notify before VIRTIO_CONFIG_S_DRIVER_OK"); - goto read_through16; - case offsetof(struct virtio_pci_mmio, isr): - if (mask != 0xFF) - bad_driver(d, "non-8-bit read from offset %u (%#x)", - off, getreg(eip)); - isr = d->mmio->isr; - /* - * 4.1.4.5.1: - * - * The device MUST reset ISR status to 0 on driver read. - */ - d->mmio->isr = 0; - return isr; - case offsetof(struct virtio_pci_mmio, padding): - bad_driver(d, "read from padding (%#x)", getreg(eip)); - default: - /* Read from device config space, beware unaligned overflow */ - if (off > d->mmio_size - 4) - bad_driver(d, "read past end (%#x)", getreg(eip)); - - /* - * 3.1.1: - * The driver MUST follow this sequence to initialize a device: - *... - * 3. Set the DRIVER status bit: the guest OS knows how to - * drive the device. - * 4. Read device feature bits, and write the subset of - * feature bits understood by the OS and driver to the - * device. During this step the driver MAY read (but MUST NOT - * write) the device-specific configuration fields to check - * that it can support the device before accepting it. - */ - if (!(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER)) - bad_driver(d, - "config read before VIRTIO_CONFIG_S_DRIVER"); - - if (mask == 0xFFFFFFFF) - goto read_through32; - else if (mask == 0xFFFF) - goto read_through16; - else - goto read_through8; - } - - /* - * 4.1.3.1: - * - * The driver MUST access each field using the “natural” access - * method, i.e. 32-bit accesses for 32-bit fields, 16-bit accesses for - * 16-bit fields and 8-bit accesses for 8-bit fields. - */ -read_through32: - if (mask != 0xFFFFFFFF) - bad_driver(d, "non-32-bit read to offset %u (%#x)", - off, getreg(eip)); - memcpy(&val, (char *)d->mmio + off, 4); - return val; - -read_through16: - if (mask != 0xFFFF) - bad_driver(d, "non-16-bit read to offset %u (%#x)", - off, getreg(eip)); - memcpy(&val, (char *)d->mmio + off, 2); - return val; - -read_through8: - if (mask != 0xFF) - bad_driver(d, "non-8-bit read to offset %u (%#x)", - off, getreg(eip)); - memcpy(&val, (char *)d->mmio + off, 1); - return val; -} - -static void emulate_mmio(unsigned long paddr, const u8 *insn) -{ - u32 val, off, mask = 0xFFFFFFFF, insnlen = 0; - struct device *d = find_mmio_region(paddr, &off); - unsigned long args[] = { LHREQ_TRAP, 14 }; - - if (!d) { - warnx("MMIO touching %#08lx (not a device)", paddr); - goto reinject; - } - - /* Prefix makes it a 16 bit op */ - if (insn[0] == 0x66) { - mask = 0xFFFF; - insnlen++; - } - - /* iowrite */ - if (insn[insnlen] == 0x89) { - /* Next byte is r/m byte: bits 3-5 are register. */ - val = getreg_num((insn[insnlen+1] >> 3) & 0x7, mask); - emulate_mmio_write(d, off, val, mask); - insnlen += 2 + insn_displacement_len(insn[insnlen+1]); - } else if (insn[insnlen] == 0x8b) { /* ioread */ - /* Next byte is r/m byte: bits 3-5 are register. */ - val = emulate_mmio_read(d, off, mask); - setreg_num((insn[insnlen+1] >> 3) & 0x7, val, mask); - insnlen += 2 + insn_displacement_len(insn[insnlen+1]); - } else if (insn[0] == 0x88) { /* 8-bit iowrite */ - mask = 0xff; - /* Next byte is r/m byte: bits 3-5 are register. */ - val = getreg_num((insn[1] >> 3) & 0x7, mask); - emulate_mmio_write(d, off, val, mask); - insnlen = 2 + insn_displacement_len(insn[1]); - } else if (insn[0] == 0x8a) { /* 8-bit ioread */ - mask = 0xff; - val = emulate_mmio_read(d, off, mask); - setreg_num((insn[1] >> 3) & 0x7, val, mask); - insnlen = 2 + insn_displacement_len(insn[1]); - } else { - warnx("Unknown MMIO instruction touching %#08lx:" - " %02x %02x %02x %02x at %u", - paddr, insn[0], insn[1], insn[2], insn[3], getreg(eip)); - reinject: - /* Inject trap into Guest. */ - if (write(lguest_fd, args, sizeof(args)) < 0) - err(1, "Reinjecting trap 14 for fault at %#x", - getreg(eip)); - return; - } - - /* Finally, we've "done" the instruction, so move past it. */ - setreg(eip, getreg(eip) + insnlen); -} - -/*L:190 - * Device Setup - * - * All devices need a descriptor so the Guest knows it exists, and a "struct - * device" so the Launcher can keep track of it. We have common helper - * routines to allocate and manage them. - */ -static void add_pci_virtqueue(struct device *dev, - void (*service)(struct virtqueue *), - const char *name) -{ - struct virtqueue **i, *vq = malloc(sizeof(*vq)); - - /* Initialize the virtqueue */ - vq->next = NULL; - vq->last_avail_idx = 0; - vq->dev = dev; - vq->name = name; - - /* - * This is the routine the service thread will run, and its Process ID - * once it's running. - */ - vq->service = service; - vq->thread = (pid_t)-1; - - /* Initialize the configuration. */ - reset_vq_pci_config(vq); - vq->pci_config.queue_notify_off = 0; - - /* Add one to the number of queues */ - vq->dev->mmio->cfg.num_queues++; - - /* - * Add to tail of list, so dev->vq is first vq, dev->vq->next is - * second. - */ - for (i = &dev->vq; *i; i = &(*i)->next); - *i = vq; -} - -/* The Guest accesses the feature bits via the PCI common config MMIO region */ -static void add_pci_feature(struct device *dev, unsigned bit) -{ - dev->features |= (1ULL << bit); -} - -/* For devices with no config. */ -static void no_device_config(struct device *dev) -{ - dev->mmio_addr = get_mmio_region(dev->mmio_size); - - dev->config.bar[0] = dev->mmio_addr; - /* Bottom 4 bits must be zero */ - assert(~(dev->config.bar[0] & 0xF)); -} - -/* This puts the device config into BAR0 */ -static void set_device_config(struct device *dev, const void *conf, size_t len) -{ - /* Set up BAR 0 */ - dev->mmio_size += len; - dev->mmio = realloc(dev->mmio, dev->mmio_size); - memcpy(dev->mmio + 1, conf, len); - - /* - * 4.1.4.6: - * - * The device MUST present at least one VIRTIO_PCI_CAP_DEVICE_CFG - * capability for any device type which has a device-specific - * configuration. - */ - /* Hook up device cfg */ - dev->config.cfg_access.cap.cap_next - = offsetof(struct pci_config, device); - - /* - * 4.1.4.6.1: - * - * The offset for the device-specific configuration MUST be 4-byte - * aligned. - */ - assert(dev->config.cfg_access.cap.cap_next % 4 == 0); - - /* Fix up device cfg field length. */ - dev->config.device.length = len; - - /* The rest is the same as the no-config case */ - no_device_config(dev); -} - -static void init_cap(struct virtio_pci_cap *cap, size_t caplen, int type, - size_t bar_offset, size_t bar_bytes, u8 next) -{ - cap->cap_vndr = PCI_CAP_ID_VNDR; - cap->cap_next = next; - cap->cap_len = caplen; - cap->cfg_type = type; - cap->bar = 0; - memset(cap->padding, 0, sizeof(cap->padding)); - cap->offset = bar_offset; - cap->length = bar_bytes; -} - -/* - * This sets up the pci_config structure, as defined in the virtio 1.0 - * standard (and PCI standard). - */ -static void init_pci_config(struct pci_config *pci, u16 type, - u8 class, u8 subclass) -{ - size_t bar_offset, bar_len; - - /* - * 4.1.4.4.1: - * - * The device MUST either present notify_off_multiplier as an even - * power of 2, or present notify_off_multiplier as 0. - * - * 2.1.2: - * - * The device MUST initialize device status to 0 upon reset. - */ - memset(pci, 0, sizeof(*pci)); - - /* 4.1.2.1: Devices MUST have the PCI Vendor ID 0x1AF4 */ - pci->vendor_id = 0x1AF4; - /* 4.1.2.1: ... PCI Device ID calculated by adding 0x1040 ... */ - pci->device_id = 0x1040 + type; - - /* - * PCI have specific codes for different types of devices. - * Linux doesn't care, but it's a good clue for people looking - * at the device. - */ - pci->class = class; - pci->subclass = subclass; - - /* - * 4.1.2.1: - * - * Non-transitional devices SHOULD have a PCI Revision ID of 1 or - * higher - */ - pci->revid = 1; - - /* - * 4.1.2.1: - * - * Non-transitional devices SHOULD have a PCI Subsystem Device ID of - * 0x40 or higher. - */ - pci->subsystem_device_id = 0x40; - - /* We use our dummy interrupt controller, and irq_line is the irq */ - pci->irq_line = devices.next_irq++; - pci->irq_pin = 0; - - /* Support for extended capabilities. */ - pci->status = (1 << 4); - - /* Link them in. */ - /* - * 4.1.4.3.1: - * - * The device MUST present at least one common configuration - * capability. - */ - pci->capabilities = offsetof(struct pci_config, common); - - /* 4.1.4.3.1 ... offset MUST be 4-byte aligned. */ - assert(pci->capabilities % 4 == 0); - - bar_offset = offsetof(struct virtio_pci_mmio, cfg); - bar_len = sizeof(((struct virtio_pci_mmio *)0)->cfg); - init_cap(&pci->common, sizeof(pci->common), VIRTIO_PCI_CAP_COMMON_CFG, - bar_offset, bar_len, - offsetof(struct pci_config, notify)); - - /* - * 4.1.4.4.1: - * - * The device MUST present at least one notification capability. - */ - bar_offset += bar_len; - bar_len = sizeof(((struct virtio_pci_mmio *)0)->notify); - - /* - * 4.1.4.4.1: - * - * The cap.offset MUST be 2-byte aligned. - */ - assert(pci->common.cap_next % 2 == 0); - - /* FIXME: Use a non-zero notify_off, for per-queue notification? */ - /* - * 4.1.4.4.1: - * - * The value cap.length presented by the device MUST be at least 2 and - * MUST be large enough to support queue notification offsets for all - * supported queues in all possible configurations. - */ - assert(bar_len >= 2); - - init_cap(&pci->notify.cap, sizeof(pci->notify), - VIRTIO_PCI_CAP_NOTIFY_CFG, - bar_offset, bar_len, - offsetof(struct pci_config, isr)); - - bar_offset += bar_len; - bar_len = sizeof(((struct virtio_pci_mmio *)0)->isr); - /* - * 4.1.4.5.1: - * - * The device MUST present at least one VIRTIO_PCI_CAP_ISR_CFG - * capability. - */ - init_cap(&pci->isr, sizeof(pci->isr), - VIRTIO_PCI_CAP_ISR_CFG, - bar_offset, bar_len, - offsetof(struct pci_config, cfg_access)); - - /* - * 4.1.4.7.1: - * - * The device MUST present at least one VIRTIO_PCI_CAP_PCI_CFG - * capability. - */ - /* This doesn't have any presence in the BAR */ - init_cap(&pci->cfg_access.cap, sizeof(pci->cfg_access), - VIRTIO_PCI_CAP_PCI_CFG, - 0, 0, 0); - - bar_offset += bar_len + sizeof(((struct virtio_pci_mmio *)0)->padding); - assert(bar_offset == sizeof(struct virtio_pci_mmio)); - - /* - * This gets sewn in and length set in set_device_config(). - * Some devices don't have a device configuration interface, so - * we never expose this if we don't call set_device_config(). - */ - init_cap(&pci->device, sizeof(pci->device), VIRTIO_PCI_CAP_DEVICE_CFG, - bar_offset, 0, 0); -} - -/* - * This routine does all the creation and setup of a new device, but we don't - * actually place the MMIO region until we know the size (if any) of the - * device-specific config. And we don't actually start the service threads - * until later. - * - * See what I mean about userspace being boring? - */ -static struct device *new_pci_device(const char *name, u16 type, - u8 class, u8 subclass) -{ - struct device *dev = malloc(sizeof(*dev)); - - /* Now we populate the fields one at a time. */ - dev->name = name; - dev->vq = NULL; - dev->running = false; - dev->wrote_features_ok = false; - dev->mmio_size = sizeof(struct virtio_pci_mmio); - dev->mmio = calloc(1, dev->mmio_size); - dev->features = (u64)1 << VIRTIO_F_VERSION_1; - dev->features_accepted = 0; - - if (devices.device_num + 1 >= MAX_PCI_DEVICES) - errx(1, "Can only handle 31 PCI devices"); - - init_pci_config(&dev->config, type, class, subclass); - assert(!devices.pci[devices.device_num+1]); - devices.pci[++devices.device_num] = dev; - - return dev; -} - -/* - * Our first setup routine is the console. It's a fairly simple device, but - * UNIX tty handling makes it uglier than it could be. - */ -static void setup_console(void) -{ - struct device *dev; - struct virtio_console_config conf; - - /* If we can save the initial standard input settings... */ - if (tcgetattr(STDIN_FILENO, &orig_term) == 0) { - struct termios term = orig_term; - /* - * Then we turn off echo, line buffering and ^C etc: We want a - * raw input stream to the Guest. - */ - term.c_lflag &= ~(ISIG|ICANON|ECHO); - tcsetattr(STDIN_FILENO, TCSANOW, &term); - } - - dev = new_pci_device("console", VIRTIO_ID_CONSOLE, 0x07, 0x00); - - /* We store the console state in dev->priv, and initialize it. */ - dev->priv = malloc(sizeof(struct console_abort)); - ((struct console_abort *)dev->priv)->count = 0; - - /* - * The console needs two virtqueues: the input then the output. When - * they put something the input queue, we make sure we're listening to - * stdin. When they put something in the output queue, we write it to - * stdout. - */ - add_pci_virtqueue(dev, console_input, "input"); - add_pci_virtqueue(dev, console_output, "output"); - - /* We need a configuration area for the emerg_wr early writes. */ - add_pci_feature(dev, VIRTIO_CONSOLE_F_EMERG_WRITE); - set_device_config(dev, &conf, sizeof(conf)); - - verbose("device %u: console\n", devices.device_num); -} -/*:*/ - -/*M:010 - * Inter-guest networking is an interesting area. Simplest is to have a - * --sharenet= option which opens or creates a named pipe. This can be - * used to send packets to another guest in a 1:1 manner. - * - * More sophisticated is to use one of the tools developed for project like UML - * to do networking. - * - * Faster is to do virtio bonding in kernel. Doing this 1:1 would be - * completely generic ("here's my vring, attach to your vring") and would work - * for any traffic. Of course, namespace and permissions issues need to be - * dealt with. A more sophisticated "multi-channel" virtio_net.c could hide - * multiple inter-guest channels behind one interface, although it would - * require some manner of hotplugging new virtio channels. - * - * Finally, we could use a virtio network switch in the kernel, ie. vhost. -:*/ - -static u32 str2ip(const char *ipaddr) -{ - unsigned int b[4]; - - if (sscanf(ipaddr, "%u.%u.%u.%u", &b[0], &b[1], &b[2], &b[3]) != 4) - errx(1, "Failed to parse IP address '%s'", ipaddr); - return (b[0] << 24) | (b[1] << 16) | (b[2] << 8) | b[3]; -} - -static void str2mac(const char *macaddr, unsigned char mac[6]) -{ - unsigned int m[6]; - if (sscanf(macaddr, "%02x:%02x:%02x:%02x:%02x:%02x", - &m[0], &m[1], &m[2], &m[3], &m[4], &m[5]) != 6) - errx(1, "Failed to parse mac address '%s'", macaddr); - mac[0] = m[0]; - mac[1] = m[1]; - mac[2] = m[2]; - mac[3] = m[3]; - mac[4] = m[4]; - mac[5] = m[5]; -} - -/* - * This code is "adapted" from libbridge: it attaches the Host end of the - * network device to the bridge device specified by the command line. - * - * This is yet another James Morris contribution (I'm an IP-level guy, so I - * dislike bridging), and I just try not to break it. - */ -static void add_to_bridge(int fd, const char *if_name, const char *br_name) -{ - int ifidx; - struct ifreq ifr; - - if (!*br_name) - errx(1, "must specify bridge name"); - - ifidx = if_nametoindex(if_name); - if (!ifidx) - errx(1, "interface %s does not exist!", if_name); - - strncpy(ifr.ifr_name, br_name, IFNAMSIZ); - ifr.ifr_name[IFNAMSIZ-1] = '\0'; - ifr.ifr_ifindex = ifidx; - if (ioctl(fd, SIOCBRADDIF, &ifr) < 0) - err(1, "can't add %s to bridge %s", if_name, br_name); -} - -/* - * This sets up the Host end of the network device with an IP address, brings - * it up so packets will flow, the copies the MAC address into the hwaddr - * pointer. - */ -static void configure_device(int fd, const char *tapif, u32 ipaddr) -{ - struct ifreq ifr; - struct sockaddr_in sin; - - memset(&ifr, 0, sizeof(ifr)); - strcpy(ifr.ifr_name, tapif); - - /* Don't read these incantations. Just cut & paste them like I did! */ - sin.sin_family = AF_INET; - sin.sin_addr.s_addr = htonl(ipaddr); - memcpy(&ifr.ifr_addr, &sin, sizeof(sin)); - if (ioctl(fd, SIOCSIFADDR, &ifr) != 0) - err(1, "Setting %s interface address", tapif); - ifr.ifr_flags = IFF_UP; - if (ioctl(fd, SIOCSIFFLAGS, &ifr) != 0) - err(1, "Bringing interface %s up", tapif); -} - -static int get_tun_device(char tapif[IFNAMSIZ]) -{ - struct ifreq ifr; - int vnet_hdr_sz; - int netfd; - - /* Start with this zeroed. Messy but sure. */ - memset(&ifr, 0, sizeof(ifr)); - - /* - * We open the /dev/net/tun device and tell it we want a tap device. A - * tap device is like a tun device, only somehow different. To tell - * the truth, I completely blundered my way through this code, but it - * works now! - */ - netfd = open_or_die("/dev/net/tun", O_RDWR); - ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR; - strcpy(ifr.ifr_name, "tap%d"); - if (ioctl(netfd, TUNSETIFF, &ifr) != 0) - err(1, "configuring /dev/net/tun"); - - if (ioctl(netfd, TUNSETOFFLOAD, - TUN_F_CSUM|TUN_F_TSO4|TUN_F_TSO6|TUN_F_TSO_ECN) != 0) - err(1, "Could not set features for tun device"); - - /* - * We don't need checksums calculated for packets coming in this - * device: trust us! - */ - ioctl(netfd, TUNSETNOCSUM, 1); - - /* - * In virtio before 1.0 (aka legacy virtio), we added a 16-bit - * field at the end of the network header iff - * VIRTIO_NET_F_MRG_RXBUF was negotiated. For virtio 1.0, - * that became the norm, but we need to tell the tun device - * about our expanded header (which is called - * virtio_net_hdr_mrg_rxbuf in the legacy system). - */ - vnet_hdr_sz = sizeof(struct virtio_net_hdr_v1); - if (ioctl(netfd, TUNSETVNETHDRSZ, &vnet_hdr_sz) != 0) - err(1, "Setting tun header size to %u", vnet_hdr_sz); - - memcpy(tapif, ifr.ifr_name, IFNAMSIZ); - return netfd; -} - -/*L:195 - * Our network is a Host<->Guest network. This can either use bridging or - * routing, but the principle is the same: it uses the "tun" device to inject - * packets into the Host as if they came in from a normal network card. We - * just shunt packets between the Guest and the tun device. - */ -static void setup_tun_net(char *arg) -{ - struct device *dev; - struct net_info *net_info = malloc(sizeof(*net_info)); - int ipfd; - u32 ip = INADDR_ANY; - bool bridging = false; - char tapif[IFNAMSIZ], *p; - struct virtio_net_config conf; - - net_info->tunfd = get_tun_device(tapif); - - /* First we create a new network device. */ - dev = new_pci_device("net", VIRTIO_ID_NET, 0x02, 0x00); - dev->priv = net_info; - - /* Network devices need a recv and a send queue, just like console. */ - add_pci_virtqueue(dev, net_input, "rx"); - add_pci_virtqueue(dev, net_output, "tx"); - - /* - * We need a socket to perform the magic network ioctls to bring up the - * tap interface, connect to the bridge etc. Any socket will do! - */ - ipfd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP); - if (ipfd < 0) - err(1, "opening IP socket"); - - /* If the command line was --tunnet=bridge: do bridging. */ - if (!strncmp(BRIDGE_PFX, arg, strlen(BRIDGE_PFX))) { - arg += strlen(BRIDGE_PFX); - bridging = true; - } - - /* A mac address may follow the bridge name or IP address */ - p = strchr(arg, ':'); - if (p) { - str2mac(p+1, conf.mac); - add_pci_feature(dev, VIRTIO_NET_F_MAC); - *p = '\0'; - } - - /* arg is now either an IP address or a bridge name */ - if (bridging) - add_to_bridge(ipfd, tapif, arg); - else - ip = str2ip(arg); - - /* Set up the tun device. */ - configure_device(ipfd, tapif, ip); - - /* Expect Guest to handle everything except UFO */ - add_pci_feature(dev, VIRTIO_NET_F_CSUM); - add_pci_feature(dev, VIRTIO_NET_F_GUEST_CSUM); - add_pci_feature(dev, VIRTIO_NET_F_GUEST_TSO4); - add_pci_feature(dev, VIRTIO_NET_F_GUEST_TSO6); - add_pci_feature(dev, VIRTIO_NET_F_GUEST_ECN); - add_pci_feature(dev, VIRTIO_NET_F_HOST_TSO4); - add_pci_feature(dev, VIRTIO_NET_F_HOST_TSO6); - add_pci_feature(dev, VIRTIO_NET_F_HOST_ECN); - /* We handle indirect ring entries */ - add_pci_feature(dev, VIRTIO_RING_F_INDIRECT_DESC); - set_device_config(dev, &conf, sizeof(conf)); - - /* We don't need the socket any more; setup is done. */ - close(ipfd); - - if (bridging) - verbose("device %u: tun %s attached to bridge: %s\n", - devices.device_num, tapif, arg); - else - verbose("device %u: tun %s: %s\n", - devices.device_num, tapif, arg); -} -/*:*/ - -/* This hangs off device->priv. */ -struct vblk_info { - /* The size of the file. */ - off64_t len; - - /* The file descriptor for the file. */ - int fd; - -}; - -/*L:210 - * The Disk - * - * The disk only has one virtqueue, so it only has one thread. It is really - * simple: the Guest asks for a block number and we read or write that position - * in the file. - * - * Before we serviced each virtqueue in a separate thread, that was unacceptably - * slow: the Guest waits until the read is finished before running anything - * else, even if it could have been doing useful work. - * - * We could have used async I/O, except it's reputed to suck so hard that - * characters actually go missing from your code when you try to use it. - */ -static void blk_request(struct virtqueue *vq) -{ - struct vblk_info *vblk = vq->dev->priv; - unsigned int head, out_num, in_num, wlen; - int ret, i; - u8 *in; - struct virtio_blk_outhdr out; - struct iovec iov[vq->vring.num]; - off64_t off; - - /* - * Get the next request, where we normally wait. It triggers the - * interrupt to acknowledge previously serviced requests (if any). - */ - head = wait_for_vq_desc(vq, iov, &out_num, &in_num); - - /* Copy the output header from the front of the iov (adjusts iov) */ - iov_consume(vq->dev, iov, out_num, &out, sizeof(out)); - - /* Find and trim end of iov input array, for our status byte. */ - in = NULL; - for (i = out_num + in_num - 1; i >= out_num; i--) { - if (iov[i].iov_len > 0) { - in = iov[i].iov_base + iov[i].iov_len - 1; - iov[i].iov_len--; - break; - } - } - if (!in) - bad_driver_vq(vq, "Bad virtblk cmd with no room for status"); - - /* - * For historical reasons, block operations are expressed in 512 byte - * "sectors". - */ - off = out.sector * 512; - - if (out.type & VIRTIO_BLK_T_OUT) { - /* - * Write - * - * Move to the right location in the block file. This can fail - * if they try to write past end. - */ - if (lseek64(vblk->fd, off, SEEK_SET) != off) - err(1, "Bad seek to sector %llu", out.sector); - - ret = writev(vblk->fd, iov, out_num); - verbose("WRITE to sector %llu: %i\n", out.sector, ret); - - /* - * Grr... Now we know how long the descriptor they sent was, we - * make sure they didn't try to write over the end of the block - * file (possibly extending it). - */ - if (ret > 0 && off + ret > vblk->len) { - /* Trim it back to the correct length */ - ftruncate64(vblk->fd, vblk->len); - /* Die, bad Guest, die. */ - bad_driver_vq(vq, "Write past end %llu+%u", off, ret); - } - - wlen = sizeof(*in); - *in = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR); - } else if (out.type & VIRTIO_BLK_T_FLUSH) { - /* Flush */ - ret = fdatasync(vblk->fd); - verbose("FLUSH fdatasync: %i\n", ret); - wlen = sizeof(*in); - *in = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR); - } else { - /* - * Read - * - * Move to the right location in the block file. This can fail - * if they try to read past end. - */ - if (lseek64(vblk->fd, off, SEEK_SET) != off) - err(1, "Bad seek to sector %llu", out.sector); - - ret = readv(vblk->fd, iov + out_num, in_num); - if (ret >= 0) { - wlen = sizeof(*in) + ret; - *in = VIRTIO_BLK_S_OK; - } else { - wlen = sizeof(*in); - *in = VIRTIO_BLK_S_IOERR; - } - } - - /* Finished that request. */ - add_used(vq, head, wlen); -} - -/*L:198 This actually sets up a virtual block device. */ -static void setup_block_file(const char *filename) -{ - struct device *dev; - struct vblk_info *vblk; - struct virtio_blk_config conf; - - /* Create the device. */ - dev = new_pci_device("block", VIRTIO_ID_BLOCK, 0x01, 0x80); - - /* The device has one virtqueue, where the Guest places requests. */ - add_pci_virtqueue(dev, blk_request, "request"); - - /* Allocate the room for our own bookkeeping */ - vblk = dev->priv = malloc(sizeof(*vblk)); - - /* First we open the file and store the length. */ - vblk->fd = open_or_die(filename, O_RDWR|O_LARGEFILE); - vblk->len = lseek64(vblk->fd, 0, SEEK_END); - - /* Tell Guest how many sectors this device has. */ - conf.capacity = cpu_to_le64(vblk->len / 512); - - /* - * Tell Guest not to put in too many descriptors at once: two are used - * for the in and out elements. - */ - add_pci_feature(dev, VIRTIO_BLK_F_SEG_MAX); - conf.seg_max = cpu_to_le32(VIRTQUEUE_NUM - 2); - - set_device_config(dev, &conf, sizeof(struct virtio_blk_config)); - - verbose("device %u: virtblock %llu sectors\n", - devices.device_num, le64_to_cpu(conf.capacity)); -} - -/*L:211 - * Our random number generator device reads from /dev/urandom into the Guest's - * input buffers. The usual case is that the Guest doesn't want random numbers - * and so has no buffers although /dev/urandom is still readable, whereas - * console is the reverse. - * - * The same logic applies, however. - */ -struct rng_info { - int rfd; -}; - -static void rng_input(struct virtqueue *vq) -{ - int len; - unsigned int head, in_num, out_num, totlen = 0; - struct rng_info *rng_info = vq->dev->priv; - struct iovec iov[vq->vring.num]; - - /* First we need a buffer from the Guests's virtqueue. */ - head = wait_for_vq_desc(vq, iov, &out_num, &in_num); - if (out_num) - bad_driver_vq(vq, "Output buffers in rng?"); - - /* - * Just like the console write, we loop to cover the whole iovec. - * In this case, short reads actually happen quite a bit. - */ - while (!iov_empty(iov, in_num)) { - len = readv(rng_info->rfd, iov, in_num); - if (len <= 0) - err(1, "Read from /dev/urandom gave %i", len); - iov_consume(vq->dev, iov, in_num, NULL, len); - totlen += len; - } - - /* Tell the Guest about the new input. */ - add_used(vq, head, totlen); -} - -/*L:199 - * This creates a "hardware" random number device for the Guest. - */ -static void setup_rng(void) -{ - struct device *dev; - struct rng_info *rng_info = malloc(sizeof(*rng_info)); - - /* Our device's private info simply contains the /dev/urandom fd. */ - rng_info->rfd = open_or_die("/dev/urandom", O_RDONLY); - - /* Create the new device. */ - dev = new_pci_device("rng", VIRTIO_ID_RNG, 0xff, 0); - dev->priv = rng_info; - - /* The device has one virtqueue, where the Guest places inbufs. */ - add_pci_virtqueue(dev, rng_input, "input"); - - /* We don't have any configuration space */ - no_device_config(dev); - - verbose("device %u: rng\n", devices.device_num); -} -/* That's the end of device setup. */ - -/*L:230 Reboot is pretty easy: clean up and exec() the Launcher afresh. */ -static void __attribute__((noreturn)) restart_guest(void) -{ - unsigned int i; - - /* - * Since we don't track all open fds, we simply close everything beyond - * stderr. - */ - for (i = 3; i < FD_SETSIZE; i++) - close(i); - - /* Reset all the devices (kills all threads). */ - cleanup_devices(); - - execv(main_args[0], main_args); - err(1, "Could not exec %s", main_args[0]); -} - -/*L:220 - * Finally we reach the core of the Launcher which runs the Guest, serves - * its input and output, and finally, lays it to rest. - */ -static void __attribute__((noreturn)) run_guest(void) -{ - for (;;) { - struct lguest_pending notify; - int readval; - - /* We read from the /dev/lguest device to run the Guest. */ - readval = pread(lguest_fd, ¬ify, sizeof(notify), cpu_id); - if (readval == sizeof(notify)) { - if (notify.trap == 13) { - verbose("Emulating instruction at %#x\n", - getreg(eip)); - emulate_insn(notify.insn); - } else if (notify.trap == 14) { - verbose("Emulating MMIO at %#x\n", - getreg(eip)); - emulate_mmio(notify.addr, notify.insn); - } else - errx(1, "Unknown trap %i addr %#08x\n", - notify.trap, notify.addr); - /* ENOENT means the Guest died. Reading tells us why. */ - } else if (errno == ENOENT) { - char reason[1024] = { 0 }; - pread(lguest_fd, reason, sizeof(reason)-1, cpu_id); - errx(1, "%s", reason); - /* ERESTART means that we need to reboot the guest */ - } else if (errno == ERESTART) { - restart_guest(); - /* Anything else means a bug or incompatible change. */ - } else - err(1, "Running guest failed"); - } -} -/*L:240 - * This is the end of the Launcher. The good news: we are over halfway - * through! The bad news: the most fiendish part of the code still lies ahead - * of us. - * - * Are you ready? Take a deep breath and join me in the core of the Host, in - * "make Host". -:*/ - -static struct option opts[] = { - { "verbose", 0, NULL, 'v' }, - { "tunnet", 1, NULL, 't' }, - { "block", 1, NULL, 'b' }, - { "rng", 0, NULL, 'r' }, - { "initrd", 1, NULL, 'i' }, - { "username", 1, NULL, 'u' }, - { "chroot", 1, NULL, 'c' }, - { NULL }, -}; -static void usage(void) -{ - errx(1, "Usage: lguest [--verbose] " - "[--tunnet=(:|bridge::)\n" - "|--block=|--initrd=]...\n" - " vmlinux [args...]"); -} - -/*L:105 The main routine is where the real work begins: */ -int main(int argc, char *argv[]) -{ - /* Memory, code startpoint and size of the (optional) initrd. */ - unsigned long mem = 0, start, initrd_size = 0; - /* Two temporaries. */ - int i, c; - /* The boot information for the Guest. */ - struct boot_params *boot; - /* If they specify an initrd file to load. */ - const char *initrd_name = NULL; - - /* Password structure for initgroups/setres[gu]id */ - struct passwd *user_details = NULL; - - /* Directory to chroot to */ - char *chroot_path = NULL; - - /* Save the args: we "reboot" by execing ourselves again. */ - main_args = argv; - - /* - * First we initialize the device list. We remember next interrupt - * number to use for devices (1: remember that 0 is used by the timer). - */ - devices.next_irq = 1; - - /* We're CPU 0. In fact, that's the only CPU possible right now. */ - cpu_id = 0; - - /* - * We need to know how much memory so we can set up the device - * descriptor and memory pages for the devices as we parse the command - * line. So we quickly look through the arguments to find the amount - * of memory now. - */ - for (i = 1; i < argc; i++) { - if (argv[i][0] != '-') { - mem = atoi(argv[i]) * 1024 * 1024; - /* - * We start by mapping anonymous pages over all of - * guest-physical memory range. This fills it with 0, - * and ensures that the Guest won't be killed when it - * tries to access it. - */ - guest_base = map_zeroed_pages(mem / getpagesize() - + DEVICE_PAGES); - guest_limit = mem; - guest_max = guest_mmio = mem + DEVICE_PAGES*getpagesize(); - break; - } - } - - /* If we exit via err(), this kills all the threads, restores tty. */ - atexit(cleanup_devices); - - /* We always have a console device, and it's always device 1. */ - setup_console(); - - /* The options are fairly straight-forward */ - while ((c = getopt_long(argc, argv, "v", opts, NULL)) != EOF) { - switch (c) { - case 'v': - verbose = true; - break; - case 't': - setup_tun_net(optarg); - break; - case 'b': - setup_block_file(optarg); - break; - case 'r': - setup_rng(); - break; - case 'i': - initrd_name = optarg; - break; - case 'u': - user_details = getpwnam(optarg); - if (!user_details) - err(1, "getpwnam failed, incorrect username?"); - break; - case 'c': - chroot_path = optarg; - break; - default: - warnx("Unknown argument %s", argv[optind]); - usage(); - } - } - /* - * After the other arguments we expect memory and kernel image name, - * followed by command line arguments for the kernel. - */ - if (optind + 2 > argc) - usage(); - - verbose("Guest base is at %p\n", guest_base); - - /* Initialize the (fake) PCI host bridge device. */ - init_pci_host_bridge(); - - /* Now we load the kernel */ - start = load_kernel(open_or_die(argv[optind+1], O_RDONLY)); - - /* Boot information is stashed at physical address 0 */ - boot = from_guest_phys(0); - - /* Map the initrd image if requested (at top of physical memory) */ - if (initrd_name) { - initrd_size = load_initrd(initrd_name, mem); - /* - * These are the location in the Linux boot header where the - * start and size of the initrd are expected to be found. - */ - boot->hdr.ramdisk_image = mem - initrd_size; - boot->hdr.ramdisk_size = initrd_size; - /* The bootloader type 0xFF means "unknown"; that's OK. */ - boot->hdr.type_of_loader = 0xFF; - } - - /* - * The Linux boot header contains an "E820" memory map: ours is a - * simple, single region. - */ - boot->e820_entries = 1; - boot->e820_table[0] = ((struct e820_entry) { 0, mem, E820_TYPE_RAM }); - /* - * The boot header contains a command line pointer: we put the command - * line after the boot header. - */ - boot->hdr.cmd_line_ptr = to_guest_phys(boot + 1); - /* We use a simple helper to copy the arguments separated by spaces. */ - concat((char *)(boot + 1), argv+optind+2); - - /* Set kernel alignment to 16M (CONFIG_PHYSICAL_ALIGN) */ - boot->hdr.kernel_alignment = 0x1000000; - - /* Boot protocol version: 2.07 supports the fields for lguest. */ - boot->hdr.version = 0x207; - - /* X86_SUBARCH_LGUEST tells the Guest it's an lguest. */ - boot->hdr.hardware_subarch = X86_SUBARCH_LGUEST; - - /* Tell the entry path not to try to reload segment registers. */ - boot->hdr.loadflags |= KEEP_SEGMENTS; - - /* We don't support tboot: */ - boot->tboot_addr = 0; - - /* Ensure this is 0 to prevent APM from loading: */ - boot->apm_bios_info.version = 0; - - /* We tell the kernel to initialize the Guest. */ - tell_kernel(start); - - /* Ensure that we terminate if a device-servicing child dies. */ - signal(SIGCHLD, kill_launcher); - - /* If requested, chroot to a directory */ - if (chroot_path) { - if (chroot(chroot_path) != 0) - err(1, "chroot(\"%s\") failed", chroot_path); - - if (chdir("/") != 0) - err(1, "chdir(\"/\") failed"); - - verbose("chroot done\n"); - } - - /* If requested, drop privileges */ - if (user_details) { - uid_t u; - gid_t g; - - u = user_details->pw_uid; - g = user_details->pw_gid; - - if (initgroups(user_details->pw_name, g) != 0) - err(1, "initgroups failed"); - - if (setresgid(g, g, g) != 0) - err(1, "setresgid failed"); - - if (setresuid(u, u, u) != 0) - err(1, "setresuid failed"); - - verbose("Dropping privileges completed\n"); - } - - /* Finally, run the Guest. This doesn't return. */ - run_guest(); -} -/*:*/ - -/*M:999 - * Mastery is done: you now know everything I do. - * - * But surely you have seen code, features and bugs in your wanderings which - * you now yearn to attack? That is the real game, and I look forward to you - * patching and forking lguest into the Your-Name-Here-visor. - * - * Farewell, and good coding! - * Rusty Russell. - */ diff --git a/tools/lguest/lguest.txt b/tools/lguest/lguest.txt deleted file mode 100644 index 06e1f4649511..000000000000 --- a/tools/lguest/lguest.txt +++ /dev/null @@ -1,125 +0,0 @@ - __ - (___()'`; Rusty's Remarkably Unreliable Guide to Lguest - /, /` - or, A Young Coder's Illustrated Hypervisor - \\"--\\ http://lguest.ozlabs.org - -Lguest is designed to be a minimal 32-bit x86 hypervisor for the Linux kernel, -for Linux developers and users to experiment with virtualization with the -minimum of complexity. Nonetheless, it should have sufficient features to -make it useful for specific tasks, and, of course, you are encouraged to fork -and enhance it (see drivers/lguest/README). - -Features: - -- Kernel module which runs in a normal kernel. -- Simple I/O model for communication. -- Simple program to create new guests. -- Logo contains cute puppies: http://lguest.ozlabs.org - -Developer features: - -- Fun to hack on. -- No ABI: being tied to a specific kernel anyway, you can change anything. -- Many opportunities for improvement or feature implementation. - -Running Lguest: - -- The easiest way to run lguest is to use same kernel as guest and host. - You can configure them differently, but usually it's easiest not to. - - You will need to configure your kernel with the following options: - - "Processor type and features": - "Paravirtualized guest support" = Y - "Lguest guest support" = Y - "High Memory Support" = off/4GB - "Alignment value to which kernel should be aligned" = 0x100000 - (CONFIG_PARAVIRT=y, CONFIG_LGUEST_GUEST=y, CONFIG_HIGHMEM64G=n and - CONFIG_PHYSICAL_ALIGN=0x100000) - - "Device Drivers": - "Block devices" - "Virtio block driver" = M/Y - "Network device support" - "Universal TUN/TAP device driver support" = M/Y - "Virtio network driver" = M/Y - (CONFIG_VIRTIO_BLK=m, CONFIG_VIRTIO_NET=m and CONFIG_TUN=m) - - "Virtualization" - "Linux hypervisor example code" = M/Y - (CONFIG_LGUEST=m) - -- A tool called "lguest" is available in this directory: type "make" - to build it. If you didn't build your kernel in-tree, use "make - O=". - -- Create or find a root disk image. There are several useful ones - around, such as the xm-test tiny root image at - http://xm-test.xensource.com/ramdisks/initrd-1.1-i386.img - - For more serious work, I usually use a distribution ISO image and - install it under qemu, then make multiple copies: - - dd if=/dev/zero of=rootfile bs=1M count=2048 - qemu -cdrom image.iso -hda rootfile -net user -net nic -boot d - - Make sure that you install a getty on /dev/hvc0 if you want to log in on the - console! - -- "modprobe lg" if you built it as a module. - -- Run an lguest as root: - - tools/lguest/lguest 64 vmlinux --tunnet=192.168.19.1 \ - --block=rootfile root=/dev/vda - - Explanation: - 64: the amount of memory to use, in MB. - - vmlinux: the kernel image found in the top of your build directory. You - can also use a standard bzImage. - - --tunnet=192.168.19.1: configures a "tap" device for networking with this - IP address. - - --block=rootfile: a file or block device which becomes /dev/vda - inside the guest. - - root=/dev/vda: this (and anything else on the command line) are - kernel boot parameters. - -- Configuring networking. I usually have the host masquerade, using - "iptables -t nat -A POSTROUTING -o eth0 -j MASQUERADE" and "echo 1 > - /proc/sys/net/ipv4/ip_forward". In this example, I would configure - eth0 inside the guest at 192.168.19.2. - - Another method is to bridge the tap device to an external interface - using --tunnet=bridge:, and perhaps run dhcp on the guest - to obtain an IP address. The bridge needs to be configured first: - this option simply adds the tap interface to it. - - A simple example on my system: - - ifconfig eth0 0.0.0.0 - brctl addbr lg0 - ifconfig lg0 up - brctl addif lg0 eth0 - dhclient lg0 - - Then use --tunnet=bridge:lg0 when launching the guest. - - See: - - http://www.linuxfoundation.org/collaborate/workgroups/networking/bridge - - for general information on how to get bridging to work. - -- Random number generation. Using the --rng option will provide a - /dev/hwrng in the guest that will read from the host's /dev/random. - Use this option in conjunction with rng-tools (see ../hw_random.txt) - to provide entropy to the guest kernel's /dev/random. - -There is a helpful mailing list at http://ozlabs.org/mailman/listinfo/lguest - -Good luck! -Rusty Russell rusty@rustcorp.com.au.