From 489f1d6526ab68ca1842398fa3ae95c597fe3d32 Mon Sep 17 00:00:00 2001 From: "Dong, Eddie" Date: Mon, 7 Jan 2008 11:14:20 +0200 Subject: [PATCH 001/147] KVM: MMU: Update shadow ptes on partial guest pte writes A guest partial guest pte write will leave shadow_trap_nonpresent_pte in spte, which generates a vmexit at the next guest access through that pte. This patch improves this by reading the full guest pte in advance and thus being able to update the spte and eliminate the vmexit. This helps pae guests which use two 32-bit writes to set a single 64-bit pte. [truncation fix by Eric] Signed-off-by: Yaozu (Eddie) Dong Signed-off-by: Feng (Eric) Liu Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 23 ++++++++++++++++------- arch/x86/kvm/paging_tmpl.h | 7 ++----- 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index e55af12e11b7..28f9a44060cc 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1329,8 +1329,7 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, - const void *new, int bytes, - int offset_in_pte) + const void *new) { if (sp->role.level != PT_PAGE_TABLE_LEVEL) { ++vcpu->kvm->stat.mmu_pde_zapped; @@ -1339,9 +1338,9 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, ++vcpu->kvm->stat.mmu_pte_updated; if (sp->role.glevels == PT32_ROOT_LEVEL) - paging32_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte); + paging32_update_pte(vcpu, sp, spte, new); else - paging64_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte); + paging64_update_pte(vcpu, sp, spte, new); } static bool need_remote_flush(u64 old, u64 new) @@ -1423,7 +1422,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, struct hlist_node *node, *n; struct hlist_head *bucket; unsigned index; - u64 entry; + u64 entry, gentry; u64 *spte; unsigned offset = offset_in_page(gpa); unsigned pte_size; @@ -1433,6 +1432,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, int level; int flooded = 0; int npte; + int r; pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes); mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes); @@ -1496,11 +1496,20 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, continue; } spte = &sp->spt[page_offset / sizeof(*spte)]; + if ((gpa & (pte_size - 1)) || (bytes < pte_size)) { + gentry = 0; + r = kvm_read_guest_atomic(vcpu->kvm, + gpa & ~(u64)(pte_size - 1), + &gentry, pte_size); + new = (const void *)&gentry; + if (r < 0) + new = NULL; + } while (npte--) { entry = *spte; mmu_pte_write_zap_pte(vcpu, sp, spte); - mmu_pte_write_new_pte(vcpu, sp, spte, new, bytes, - page_offset & (pte_size - 1)); + if (new) + mmu_pte_write_new_pte(vcpu, sp, spte, new); mmu_pte_write_flush_tlb(vcpu, entry, *spte); ++spte; } diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index ecc0856268c4..c2fd2b96144f 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -243,8 +243,7 @@ err: } static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, - u64 *spte, const void *pte, int bytes, - int offset_in_pte) + u64 *spte, const void *pte) { pt_element_t gpte; unsigned pte_access; @@ -252,12 +251,10 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, gpte = *(const pt_element_t *)pte; if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) { - if (!offset_in_pte && !is_present_pte(gpte)) + if (!is_present_pte(gpte)) set_shadow_pte(spte, shadow_notrap_nonpresent_pte); return; } - if (bytes < sizeof(pt_element_t)) - return; pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte); pte_access = page->role.access & FNAME(gpte_access)(vcpu, gpte); if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn) From 1ae0a13def678876b9acfb5ac1e2cf7d5d45a60d Mon Sep 17 00:00:00 2001 From: "Dong, Eddie" Date: Mon, 7 Jan 2008 13:20:25 +0200 Subject: [PATCH 002/147] KVM: MMU: Simplify hash table indexing Signed-off-by: Yaozu (Eddie) Dong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 10 +++++----- include/asm-x86/kvm_host.h | 3 ++- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 28f9a44060cc..6f8392d4034e 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -559,7 +559,7 @@ static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) static unsigned kvm_page_table_hashfn(gfn_t gfn) { - return gfn; + return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1); } static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, @@ -663,7 +663,7 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) struct hlist_node *node; pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn); - index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; + index = kvm_page_table_hashfn(gfn); bucket = &kvm->arch.mmu_page_hash[index]; hlist_for_each_entry(sp, node, bucket, hash_link) if (sp->gfn == gfn && !sp->role.metaphysical) { @@ -701,7 +701,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, } pgprintk("%s: looking gfn %lx role %x\n", __FUNCTION__, gfn, role.word); - index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; + index = kvm_page_table_hashfn(gfn); bucket = &vcpu->kvm->arch.mmu_page_hash[index]; hlist_for_each_entry(sp, node, bucket, hash_link) if (sp->gfn == gfn && sp->role.word == role.word) { @@ -840,7 +840,7 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn); r = 0; - index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; + index = kvm_page_table_hashfn(gfn); bucket = &kvm->arch.mmu_page_hash[index]; hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) if (sp->gfn == gfn && !sp->role.metaphysical) { @@ -1450,7 +1450,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, vcpu->arch.last_pt_write_count = 1; vcpu->arch.last_pte_updated = NULL; } - index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; + index = kvm_page_table_hashfn(gfn); bucket = &vcpu->kvm->arch.mmu_page_hash[index]; hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) { if (sp->gfn != gfn || sp->role.metaphysical) diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h index 68ee390b2844..e076790ee794 100644 --- a/include/asm-x86/kvm_host.h +++ b/include/asm-x86/kvm_host.h @@ -58,7 +58,8 @@ #define KVM_PERMILLE_MMU_PAGES 20 #define KVM_MIN_ALLOC_MMU_PAGES 64 -#define KVM_NUM_MMU_PAGES 1024 +#define KVM_MMU_HASH_SHIFT 10 +#define KVM_NUM_MMU_PAGES (1 << KVM_MMU_HASH_SHIFT) #define KVM_MIN_FREE_MMU_PAGES 5 #define KVM_REFILL_PAGES 25 #define KVM_MAX_CPUID_ENTRIES 40 From e09d082c03e137015bc0a17ca77e4b9dca08a5d7 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Fri, 18 Jan 2008 12:38:59 +0200 Subject: [PATCH 003/147] KVM: x86 emulator: add support for group decoding Certain x86 instructions use bits 3:5 of the byte following the opcode as an opcode extension, with the decode sometimes depending on bits 6:7 as well. Add support for this in the main decoding table rather than an ad-hock adaptation per opcode. Signed-off-by: Avi Kivity --- arch/x86/kvm/x86_emulate.c | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index 79586003397a..46ecf349a116 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -65,6 +65,9 @@ #define MemAbs (1<<9) /* Memory operand is absolute displacement */ #define String (1<<10) /* String instruction (rep capable) */ #define Stack (1<<11) /* Stack instruction (push/pop) */ +#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ +#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ +#define GroupMask 0xff /* Group number stored in bits 0:7 */ static u16 opcode_table[256] = { /* 0x00 - 0x07 */ @@ -229,6 +232,12 @@ static u16 twobyte_table[256] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; +static u16 group_table[] = { +}; + +static u16 group2_table[] = { +}; + /* EFLAGS bit definitions. */ #define EFLG_OF (1<<11) #define EFLG_DF (1<<10) @@ -763,7 +772,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) struct decode_cache *c = &ctxt->decode; int rc = 0; int mode = ctxt->mode; - int def_op_bytes, def_ad_bytes; + int def_op_bytes, def_ad_bytes, group; /* Shadow copy of register state. Committed on successful emulation. */ @@ -864,12 +873,24 @@ done_prefixes: c->b = insn_fetch(u8, 1, c->eip); c->d = twobyte_table[c->b]; } + } - /* Unrecognised? */ - if (c->d == 0) { - DPRINTF("Cannot emulate %02x\n", c->b); - return -1; - } + if (c->d & Group) { + group = c->d & GroupMask; + c->modrm = insn_fetch(u8, 1, c->eip); + --c->eip; + + group = (group << 3) + ((c->modrm >> 3) & 7); + if ((c->d & GroupDual) && (c->modrm >> 6) == 3) + c->d = group2_table[group]; + else + c->d = group_table[group]; + } + + /* Unrecognised? */ + if (c->d == 0) { + DPRINTF("Cannot emulate %02x\n", c->b); + return -1; } if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack)) From 43bb19cd3398d3f544d8e2d6ed6c5c5d7b4e5819 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Fri, 18 Jan 2008 12:46:50 +0200 Subject: [PATCH 004/147] KVM: x86 emulator: group decoding for group 1A This adds group decode support for opcode 0x8f. Signed-off-by: Avi Kivity --- arch/x86/kvm/x86_emulate.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index 46ecf349a116..cf1ce7c316a9 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -69,6 +69,10 @@ #define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ #define GroupMask 0xff /* Group number stored in bits 0:7 */ +enum { + Group1A, +}; + static u16 opcode_table[256] = { /* 0x00 - 0x07 */ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, @@ -133,7 +137,7 @@ static u16 opcode_table[256] = { /* 0x88 - 0x8F */ ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - 0, ModRM | DstReg, 0, DstMem | SrcNone | ModRM | Mov | Stack, + 0, ModRM | DstReg, 0, Group | Group1A, /* 0x90 - 0x9F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, @@ -233,6 +237,8 @@ static u16 twobyte_table[256] = { }; static u16 group_table[] = { + [Group1A*8] = + DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0, }; static u16 group2_table[] = { From 7d858a19efe5844a98e060931570359b70dea6d1 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Fri, 18 Jan 2008 12:58:04 +0200 Subject: [PATCH 005/147] KVM: x86 emulator: Group decoding for group 3 This adds group decoding support for opcodes 0xf6, 0xf7 (group 3). Signed-off-by: Avi Kivity --- arch/x86/kvm/x86_emulate.c | 34 ++++++++++------------------------ 1 file changed, 10 insertions(+), 24 deletions(-) diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index cf1ce7c316a9..52e65ae37f06 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -70,7 +70,7 @@ #define GroupMask 0xff /* Group number stored in bits 0:7 */ enum { - Group1A, + Group1A, Group3_Byte, Group3, }; static u16 opcode_table[256] = { @@ -171,8 +171,7 @@ static u16 opcode_table[256] = { 0, 0, 0, 0, /* 0xF0 - 0xF7 */ 0, 0, 0, 0, - ImplicitOps, ImplicitOps, - ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, + ImplicitOps, ImplicitOps, Group | Group3_Byte, Group | Group3, /* 0xF8 - 0xFF */ ImplicitOps, 0, ImplicitOps, ImplicitOps, 0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM @@ -239,6 +238,14 @@ static u16 twobyte_table[256] = { static u16 group_table[] = { [Group1A*8] = DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0, + [Group3_Byte*8] = + ByteOp | SrcImm | DstMem | ModRM, 0, + ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, + 0, 0, 0, 0, + [Group3*8] = + DstMem | SrcImm | ModRM | SrcImm, 0, + DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, + 0, 0, 0, 0, }; static u16 group2_table[] = { @@ -1070,26 +1077,6 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt, switch (c->modrm_reg) { case 0 ... 1: /* test */ - /* - * Special case in Grp3: test has an immediate - * source operand. - */ - c->src.type = OP_IMM; - c->src.ptr = (unsigned long *)c->eip; - c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - if (c->src.bytes == 8) - c->src.bytes = 4; - switch (c->src.bytes) { - case 1: - c->src.val = insn_fetch(s8, 1, c->eip); - break; - case 2: - c->src.val = insn_fetch(s16, 2, c->eip); - break; - case 4: - c->src.val = insn_fetch(s32, 4, c->eip); - break; - } emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); break; case 2: /* not */ @@ -1103,7 +1090,6 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt, rc = X86EMUL_UNHANDLEABLE; break; } -done: return rc; } From fd60754e4ffa992586346dd56451723b4c096626 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Fri, 18 Jan 2008 13:12:26 +0200 Subject: [PATCH 006/147] KVM: x86 emulator: Group decoding for groups 4 and 5 Add group decoding support for opcode 0xfe (group 4) and 0xff (group 5). Signed-off-by: Avi Kivity --- arch/x86/kvm/x86_emulate.c | 40 ++++++++++---------------------------- 1 file changed, 10 insertions(+), 30 deletions(-) diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index 52e65ae37f06..7310368c4857 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -70,7 +70,7 @@ #define GroupMask 0xff /* Group number stored in bits 0:7 */ enum { - Group1A, Group3_Byte, Group3, + Group1A, Group3_Byte, Group3, Group4, Group5, }; static u16 opcode_table[256] = { @@ -174,7 +174,7 @@ static u16 opcode_table[256] = { ImplicitOps, ImplicitOps, Group | Group3_Byte, Group | Group3, /* 0xF8 - 0xFF */ ImplicitOps, 0, ImplicitOps, ImplicitOps, - 0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM + 0, 0, Group | Group4, Group | Group5, }; static u16 twobyte_table[256] = { @@ -246,6 +246,12 @@ static u16 group_table[] = { DstMem | SrcImm | ModRM | SrcImm, 0, DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, 0, 0, 0, 0, + [Group4*8] = + ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, + 0, 0, 0, 0, 0, 0, + [Group5*8] = + DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, 0, 0, + SrcMem | ModRM, 0, SrcMem | ModRM | Stack, 0, }; static u16 group2_table[] = { @@ -1097,7 +1103,6 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { struct decode_cache *c = &ctxt->decode; - int rc; switch (c->modrm_reg) { case 0: /* inc */ @@ -1107,36 +1112,11 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt, emulate_1op("dec", c->dst, ctxt->eflags); break; case 4: /* jmp abs */ - if (c->b == 0xff) - c->eip = c->dst.val; - else { - DPRINTF("Cannot emulate %02x\n", c->b); - return X86EMUL_UNHANDLEABLE; - } + c->eip = c->src.val; break; case 6: /* push */ - - /* 64-bit mode: PUSH always pushes a 64-bit operand. */ - - if (ctxt->mode == X86EMUL_MODE_PROT64) { - c->dst.bytes = 8; - rc = ops->read_std((unsigned long)c->dst.ptr, - &c->dst.val, 8, ctxt->vcpu); - if (rc != 0) - return rc; - } - register_address_increment(c->regs[VCPU_REGS_RSP], - -c->dst.bytes); - rc = ops->write_emulated(register_address(ctxt->ss_base, - c->regs[VCPU_REGS_RSP]), &c->dst.val, - c->dst.bytes, ctxt->vcpu); - if (rc != 0) - return rc; - c->dst.type = OP_NONE; + emulate_push(ctxt); break; - default: - DPRINTF("Cannot emulate %02x\n", c->b); - return X86EMUL_UNHANDLEABLE; } return 0; } From d95058a1a7170ae2af2939cbdab0ff5d5e005238 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Fri, 18 Jan 2008 13:36:50 +0200 Subject: [PATCH 007/147] KVM: x86 emulator: add group 7 decoding This adds group decoding for opcode 0x0f 0x01 (group 7). Signed-off-by: Avi Kivity --- arch/x86/kvm/x86_emulate.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index 7310368c4857..ef6de16bb597 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -70,7 +70,7 @@ #define GroupMask 0xff /* Group number stored in bits 0:7 */ enum { - Group1A, Group3_Byte, Group3, Group4, Group5, + Group1A, Group3_Byte, Group3, Group4, Group5, Group7, }; static u16 opcode_table[256] = { @@ -179,7 +179,7 @@ static u16 opcode_table[256] = { static u16 twobyte_table[256] = { /* 0x00 - 0x0F */ - 0, SrcMem | ModRM | DstReg, 0, 0, 0, 0, ImplicitOps, 0, + 0, Group | GroupDual | Group7, 0, 0, 0, 0, ImplicitOps, 0, ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0, /* 0x10 - 0x1F */ 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0, @@ -252,9 +252,14 @@ static u16 group_table[] = { [Group5*8] = DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, 0, 0, SrcMem | ModRM, 0, SrcMem | ModRM | Stack, 0, + [Group7*8] = + 0, 0, ModRM | SrcMem, ModRM | SrcMem, + SrcNone | ModRM | DstMem, 0, SrcMem | ModRM, SrcMem | ModRM | ByteOp, }; static u16 group2_table[] = { + [Group7*8] = + SrcNone | ModRM, 0, 0, 0, SrcNone | ModRM | DstMem, 0, SrcMem | ModRM, 0, }; /* EFLAGS bit definitions. */ From 5c5027425ec23ded452879ee5d0775a9a90fb9bf Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Tue, 22 Jan 2008 20:46:14 +0100 Subject: [PATCH 008/147] KVM: constify function pointer tables Signed-off-by: Jan Engelhardt Signed-off-by: Avi Kivity --- virt/kvm/kvm_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index b2e12893e3f4..04595fe77efb 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -705,7 +705,7 @@ static int kvm_vcpu_release(struct inode *inode, struct file *filp) return 0; } -static struct file_operations kvm_vcpu_fops = { +static const struct file_operations kvm_vcpu_fops = { .release = kvm_vcpu_release, .unlocked_ioctl = kvm_vcpu_ioctl, .compat_ioctl = kvm_vcpu_ioctl, @@ -1005,7 +1005,7 @@ static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma) return 0; } -static struct file_operations kvm_vm_fops = { +static const struct file_operations kvm_vm_fops = { .release = kvm_vm_release, .unlocked_ioctl = kvm_vm_ioctl, .compat_ioctl = kvm_vm_ioctl, From 09566765efd034feba45611f9d0ae9a702f8bb1d Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 23 Jan 2008 18:14:23 +0200 Subject: [PATCH 009/147] KVM: Only x86 has pio Signed-off-by: Avi Kivity --- virt/kvm/kvm_main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 04595fe77efb..121e65cccc58 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -678,8 +678,10 @@ static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf) if (vmf->pgoff == 0) page = virt_to_page(vcpu->run); +#ifdef CONFIG_X86 else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET) page = virt_to_page(vcpu->arch.pio_data); +#endif else return VM_FAULT_SIGBUS; get_page(page); From 1d6ad2073e5354912291277c606a57fd37330f04 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 23 Jan 2008 22:26:09 +0200 Subject: [PATCH 010/147] KVM: x86 emulator: group decoding for group 1 instructions Opcodes 0x80-0x83 Signed-off-by: Avi Kivity --- arch/x86/kvm/x86_emulate.c | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index ef6de16bb597..22900f70172b 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -70,6 +70,7 @@ #define GroupMask 0xff /* Group number stored in bits 0:7 */ enum { + Group1_80, Group1_81, Group1_82, Group1_83, Group1A, Group3_Byte, Group3, Group4, Group5, Group7, }; @@ -130,8 +131,8 @@ static u16 opcode_table[256] = { ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, /* 0x80 - 0x87 */ - ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, - ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, + Group | Group1_80, Group | Group1_81, + Group | Group1_82, Group | Group1_83, ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, /* 0x88 - 0x8F */ @@ -236,6 +237,26 @@ static u16 twobyte_table[256] = { }; static u16 group_table[] = { + [Group1_80*8] = + ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, + ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, + ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, + ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, + [Group1_81*8] = + DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, + DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, + DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, + DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, + [Group1_82*8] = + ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, + ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, + ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, + ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, + [Group1_83*8] = + DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, + DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, + DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, + DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, [Group1A*8] = DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0, [Group3_Byte*8] = From d196e343361c229496adeda42335856da9d057de Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 24 Jan 2008 11:44:11 +0200 Subject: [PATCH 011/147] KVM: MMU: Decouple mmio from shadow page tables Currently an mmio guest pte is encoded in the shadow pagetable as a not-present trapping pte, with the SHADOW_IO_MARK bit set. However nothing is ever done with this information, so maintaining it is a useless complication. This patch moves the check for mmio to before shadow ptes are instantiated, so the shadow code is never invoked for ptes that reference mmio. The code is simpler, and with future work, can be made to handle mmio concurrently. Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 34 +++++++++++++++------------------- arch/x86/kvm/paging_tmpl.h | 17 ++++++++--------- 2 files changed, 23 insertions(+), 28 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 6f8392d4034e..6651dfadae50 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -101,8 +101,6 @@ static int dbg = 1; #define PT_FIRST_AVAIL_BITS_SHIFT 9 #define PT64_SECOND_AVAIL_BITS_SHIFT 52 -#define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) - #define VALID_PAGE(x) ((x) != INVALID_PAGE) #define PT64_LEVEL_BITS 9 @@ -200,7 +198,6 @@ static int is_present_pte(unsigned long pte) static int is_shadow_present_pte(u64 pte) { - pte &= ~PT_SHADOW_IO_MARK; return pte != shadow_trap_nonpresent_pte && pte != shadow_notrap_nonpresent_pte; } @@ -215,11 +212,6 @@ static int is_dirty_pte(unsigned long pte) return pte & PT_DIRTY_MASK; } -static int is_io_pte(unsigned long pte) -{ - return pte & PT_SHADOW_IO_MARK; -} - static int is_rmap_pte(u64 pte) { return is_shadow_present_pte(pte); @@ -538,7 +530,7 @@ static int is_empty_shadow_page(u64 *spt) u64 *end; for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++) - if ((*pos & ~PT_SHADOW_IO_MARK) != shadow_trap_nonpresent_pte) { + if (*pos != shadow_trap_nonpresent_pte) { printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__, pos, *pos); return 0; @@ -926,13 +918,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, if (pte_access & ACC_USER_MASK) spte |= PT_USER_MASK; - if (is_error_page(page)) { - set_shadow_pte(shadow_pte, - shadow_trap_nonpresent_pte | PT_SHADOW_IO_MARK); - kvm_release_page_clean(page); - return; - } - spte |= page_to_phys(page); if ((pte_access & ACC_WRITE_MASK) @@ -1002,7 +987,7 @@ static int __nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, if (level == 1) { mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL, 0, write, 1, &pt_write, gfn, page); - return pt_write || is_io_pte(table[index]); + return pt_write; } if (table[index] == shadow_trap_nonpresent_pte) { @@ -1039,6 +1024,13 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) page = gfn_to_page(vcpu->kvm, gfn); up_read(¤t->mm->mmap_sem); + /* mmio */ + if (is_error_page(page)) { + kvm_release_page_clean(page); + up_read(&vcpu->kvm->slots_lock); + return 1; + } + spin_lock(&vcpu->kvm->mmu_lock); kvm_mmu_free_some_pages(vcpu); r = __nonpaging_map(vcpu, v, write, gfn, page); @@ -1406,10 +1398,14 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, return; gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; - down_read(¤t->mm->mmap_sem); + down_read(&vcpu->kvm->slots_lock); page = gfn_to_page(vcpu->kvm, gfn); - up_read(¤t->mm->mmap_sem); + up_read(&vcpu->kvm->slots_lock); + if (is_error_page(page)) { + kvm_release_page_clean(page); + return; + } vcpu->arch.update_pte.gfn = gfn; vcpu->arch.update_pte.page = page; } diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index c2fd2b96144f..4b55f462e2b3 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -399,6 +399,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, page = gfn_to_page(vcpu->kvm, walker.gfn); up_read(¤t->mm->mmap_sem); + /* mmio */ + if (is_error_page(page)) { + pgprintk("gfn %x is mmio\n", walker.gfn); + kvm_release_page_clean(page); + up_read(&vcpu->kvm->slots_lock); + return 1; + } + spin_lock(&vcpu->kvm->mmu_lock); kvm_mmu_free_some_pages(vcpu); shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, @@ -409,15 +417,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, if (!write_pt) vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ - /* - * mmio: emulate if accessible, otherwise its a guest fault. - */ - if (shadow_pte && is_io_pte(*shadow_pte)) { - spin_unlock(&vcpu->kvm->mmu_lock); - up_read(&vcpu->kvm->slots_lock); - return 1; - } - ++vcpu->stat.pf_fixed; kvm_mmu_audit(vcpu, "post page fault (fixed)"); spin_unlock(&vcpu->kvm->mmu_lock); From adb1ff46754a87f3f6c9e7ee0a92f9a8a183bb38 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 24 Jan 2008 15:13:08 +0200 Subject: [PATCH 012/147] KVM: Limit vcpu mmap size to one page on non-x86 The second page is only needed on archs that support pio. Noted by Carsten Otte. Signed-off-by: Avi Kivity --- virt/kvm/kvm_main.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 121e65cccc58..7972e3aa2cae 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1061,7 +1061,10 @@ static long kvm_dev_ioctl(struct file *filp, r = -EINVAL; if (arg) goto out; - r = 2 * PAGE_SIZE; + r = PAGE_SIZE; /* struct kvm_run */ +#ifdef CONFIG_X86 + r += PAGE_SIZE; /* pio data page */ +#endif break; default: return kvm_arch_dev_ioctl(filp, ioctl, arg); From 2384d2b32640839a4d4d260ca7c5aa4edbf68d91 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Thu, 17 Jan 2008 15:14:33 +0800 Subject: [PATCH 013/147] KVM: VMX: Enable Virtual Processor Identification (VPID) To allow TLB entries to be retained across VM entry and VM exit, the VMM can now identify distinct address spaces through a new virtual-processor ID (VPID) field of the VMCS. [avi: drop vpid_sync_all()] [avi: add "cc" to asm constraints] Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 80 +++++++++++++++++++++++++++++++++++--- arch/x86/kvm/vmx.h | 6 +++ include/asm-x86/kvm_host.h | 1 + 3 files changed, 82 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 8e1462880d1f..1157e8a4059b 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -37,6 +37,9 @@ MODULE_LICENSE("GPL"); static int bypass_guest_pf = 1; module_param(bypass_guest_pf, bool, 0); +static int enable_vpid = 1; +module_param(enable_vpid, bool, 0); + struct vmcs { u32 revision_id; u32 abort; @@ -71,6 +74,7 @@ struct vcpu_vmx { unsigned rip; } irq; } rmode; + int vpid; }; static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) @@ -86,6 +90,9 @@ static DEFINE_PER_CPU(struct vmcs *, current_vmcs); static struct page *vmx_io_bitmap_a; static struct page *vmx_io_bitmap_b; +static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS); +static DEFINE_SPINLOCK(vmx_vpid_lock); + static struct vmcs_config { int size; int order; @@ -204,6 +211,12 @@ static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm) (irqchip_in_kernel(kvm))); } +static inline int cpu_has_vmx_vpid(void) +{ + return (vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_ENABLE_VPID); +} + static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) { int i; @@ -214,6 +227,20 @@ static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) return -1; } +static inline void __invvpid(int ext, u16 vpid, gva_t gva) +{ + struct { + u64 vpid : 16; + u64 rsvd : 48; + u64 gva; + } operand = { vpid, 0, gva }; + + asm volatile (ASM_VMX_INVVPID + /* CF==1 or ZF==1 --> rc = -1 */ + "; ja 1f ; ud2 ; 1:" + : : "a"(&operand), "c"(ext) : "cc", "memory"); +} + static struct kvm_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) { int i; @@ -257,6 +284,14 @@ static void vcpu_clear(struct vcpu_vmx *vmx) vmx->launched = 0; } +static inline void vpid_sync_vcpu_all(struct vcpu_vmx *vmx) +{ + if (vmx->vpid == 0) + return; + + __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0); +} + static unsigned long vmcs_readl(unsigned long field) { unsigned long value; @@ -490,6 +525,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (vcpu->cpu != cpu) { vcpu_clear(vmx); kvm_migrate_apic_timer(vcpu); + vpid_sync_vcpu_all(vmx); } if (per_cpu(current_vmcs, cpu) != vmx->vmcs) { @@ -971,7 +1007,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) { min = 0; opt = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | - SECONDARY_EXEC_WBINVD_EXITING; + SECONDARY_EXEC_WBINVD_EXITING | + SECONDARY_EXEC_ENABLE_VPID; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS2, &_cpu_based_2nd_exec_control) < 0) return -EIO; @@ -1239,6 +1276,11 @@ static void exit_lmode(struct kvm_vcpu *vcpu) #endif +static void vmx_flush_tlb(struct kvm_vcpu *vcpu) +{ + vpid_sync_vcpu_all(to_vmx(vcpu)); +} + static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) { vcpu->arch.cr4 &= KVM_GUEST_CR4_MASK; @@ -1275,6 +1317,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { + vmx_flush_tlb(vcpu); vmcs_writel(GUEST_CR3, cr3); if (vcpu->arch.cr0 & X86_CR0_PE) vmx_fpu_deactivate(vcpu); @@ -1494,6 +1537,22 @@ out: return r; } +static void allocate_vpid(struct vcpu_vmx *vmx) +{ + int vpid; + + vmx->vpid = 0; + if (!enable_vpid || !cpu_has_vmx_vpid()) + return; + spin_lock(&vmx_vpid_lock); + vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS); + if (vpid < VMX_NR_VPIDS) { + vmx->vpid = vpid; + __set_bit(vpid, vmx_vpid_bitmap); + } + spin_unlock(&vmx_vpid_lock); +} + /* * Sets up the vmcs for emulated real mode. */ @@ -1532,6 +1591,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + if (vmx->vpid == 0) + exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); } @@ -1704,6 +1765,9 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(vmx->vcpu.kvm->arch.apic_access_page)); + if (vmx->vpid != 0) + vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); + vmx->vcpu.arch.cr0 = 0x60000010; vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */ vmx_set_cr4(&vmx->vcpu, 0); @@ -1713,6 +1777,8 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmx_fpu_activate(&vmx->vcpu); update_exception_bitmap(&vmx->vcpu); + vpid_sync_vcpu_all(vmx); + return 0; out: @@ -2221,10 +2287,6 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) return 0; } -static void vmx_flush_tlb(struct kvm_vcpu *vcpu) -{ -} - static void update_tpr_threshold(struct kvm_vcpu *vcpu) { int max_irr, tpr; @@ -2489,6 +2551,10 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + spin_lock(&vmx_vpid_lock); + if (vmx->vpid != 0) + __clear_bit(vmx->vpid, vmx_vpid_bitmap); + spin_unlock(&vmx_vpid_lock); vmx_free_vmcs(vcpu); kfree(vmx->host_msrs); kfree(vmx->guest_msrs); @@ -2505,6 +2571,8 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) if (!vmx) return ERR_PTR(-ENOMEM); + allocate_vpid(vmx); + err = kvm_vcpu_init(&vmx->vcpu, kvm, id); if (err) goto free_vcpu; @@ -2652,6 +2720,8 @@ static int __init vmx_init(void) memset(iova, 0xff, PAGE_SIZE); kunmap(vmx_io_bitmap_b); + set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ + r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE); if (r) goto out1; diff --git a/arch/x86/kvm/vmx.h b/arch/x86/kvm/vmx.h index d52ae8d7303d..436ce0f29092 100644 --- a/arch/x86/kvm/vmx.h +++ b/arch/x86/kvm/vmx.h @@ -49,6 +49,7 @@ * Definitions of Secondary Processor-Based VM-Execution Controls. */ #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001 +#define SECONDARY_EXEC_ENABLE_VPID 0x00000020 #define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 @@ -65,6 +66,7 @@ /* VMCS Encodings */ enum vmcs_field { + VIRTUAL_PROCESSOR_ID = 0x00000000, GUEST_ES_SELECTOR = 0x00000800, GUEST_CS_SELECTOR = 0x00000802, GUEST_SS_SELECTOR = 0x00000804, @@ -321,4 +323,8 @@ enum vmcs_field { #define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT 9 +#define VMX_NR_VPIDS (1 << 16) +#define VMX_VPID_EXTENT_SINGLE_CONTEXT 1 +#define VMX_VPID_EXTENT_ALL_CONTEXT 2 + #endif diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h index e076790ee794..28e8177ea4a0 100644 --- a/include/asm-x86/kvm_host.h +++ b/include/asm-x86/kvm_host.h @@ -601,6 +601,7 @@ static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code) #define ASM_VMX_VMWRITE_RSP_RDX ".byte 0x0f, 0x79, 0xd4" #define ASM_VMX_VMXOFF ".byte 0x0f, 0x01, 0xc4" #define ASM_VMX_VMXON_RAX ".byte 0xf3, 0x0f, 0xc7, 0x30" +#define ASM_VMX_INVVPID ".byte 0x66, 0x0f, 0x38, 0x81, 0x08" #define MSR_IA32_TIME_STAMP_COUNTER 0x010 From 31bb117eb48f2629e030ca547ca89a1c34150183 Mon Sep 17 00:00:00 2001 From: Hollis Blanchard Date: Mon, 28 Jan 2008 17:42:34 -0600 Subject: [PATCH 014/147] KVM: Use CONFIG_PREEMPT_NOTIFIERS around struct preempt_notifier This allows kvm_host.h to be #included even when struct preempt_notifier is undefined. This is needed to build ppc asm-offsets.h. Signed-off-by: Hollis Blanchard Signed-off-by: Avi Kivity --- include/linux/kvm_host.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 928b0d59e9ba..b90ca368dcf6 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -67,7 +67,9 @@ void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_vcpu { struct kvm *kvm; +#ifdef CONFIG_PREEMPT_NOTIFIERS struct preempt_notifier preempt_notifier; +#endif int vcpu_id; struct mutex mutex; int cpu; From 0aac03f07b37da96e00371e66973d5ffaae578a4 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 30 Jan 2008 19:57:35 +0100 Subject: [PATCH 015/147] KVM: Disable pagefaults during copy_from_user_inatomic() With CONFIG_PREEMPT=n, this is needed in order to disable the fault-in code from sleeping. Signed-off-by: Andrea Arcangeli Signed-off-by: Avi Kivity --- virt/kvm/kvm_main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 7972e3aa2cae..cf6df5167af6 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -554,7 +554,9 @@ int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data, addr = gfn_to_hva(kvm, gfn); if (kvm_is_error_hva(addr)) return -EFAULT; + pagefault_disable(); r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len); + pagefault_enable(); if (r) return -EFAULT; return 0; From f2b4b7ddf633ffa24ce7c89c9e0d8a06463484e3 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 31 Jan 2008 14:57:37 +0100 Subject: [PATCH 016/147] KVM: make EFER_RESERVED_BITS configurable for architecture code This patch give the SVM and VMX implementations the ability to add some bits the guest can set in its EFER register. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 11 +++++++++-- include/asm-x86/kvm_host.h | 1 + 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6b01552bd1f1..ec9265b354b0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -41,7 +41,7 @@ | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) -#define EFER_RESERVED_BITS 0xfffffffffffff2fe +static u64 __read_mostly efer_reserved_bits = 0xfffffffffffff2fe; #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU @@ -428,7 +428,7 @@ static u32 emulated_msrs[] = { static void set_efer(struct kvm_vcpu *vcpu, u64 efer) { - if (efer & EFER_RESERVED_BITS) { + if (efer & efer_reserved_bits) { printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n", efer); kvm_inject_gp(vcpu, 0); @@ -452,6 +452,13 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer) #endif +void kvm_enable_efer_bits(u64 mask) +{ + efer_reserved_bits &= ~mask; +} +EXPORT_SYMBOL_GPL(kvm_enable_efer_bits); + + /* * Writes msr value into into the appropriate "register". * Returns 0 on success, non-0 otherwise. diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h index 28e8177ea4a0..274f153c8704 100644 --- a/include/asm-x86/kvm_host.h +++ b/include/asm-x86/kvm_host.h @@ -430,6 +430,7 @@ void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw, unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr); void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long value, unsigned long *rflags); +void kvm_enable_efer_bits(u64); int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data); int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); From 50a37eb4e05efaa7bac6a948fd4db1a48c728b99 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 31 Jan 2008 14:57:38 +0100 Subject: [PATCH 017/147] KVM: align valid EFER bits with the features of the host system This patch aligns the bits the guest can set in the EFER register with the features in the host processor. Currently it lets EFER.NX disabled if the processor does not support it and enables EFER.LME and EFER.LMA only for KVM on 64 bit hosts. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 3 +++ arch/x86/kvm/vmx.c | 4 ++++ arch/x86/kvm/x86.c | 10 +++++++++- 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 1a582f1090e8..ff3bc74af728 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -403,6 +403,9 @@ static __init int svm_hardware_setup(void) set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_ESP, 1, 1); set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_EIP, 1, 1); + if (boot_cpu_has(X86_FEATURE_NX)) + kvm_enable_efer_bits(EFER_NX); + for_each_online_cpu(cpu) { r = svm_cpu_init(cpu); if (r) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 1157e8a4059b..a509910f6b53 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1117,6 +1117,10 @@ static __init int hardware_setup(void) { if (setup_vmcs_config(&vmcs_config) < 0) return -EIO; + + if (boot_cpu_has(X86_FEATURE_NX)) + kvm_enable_efer_bits(EFER_NX); + return alloc_kvm_area(); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ec9265b354b0..db16f2353e4b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -41,7 +41,15 @@ | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) -static u64 __read_mostly efer_reserved_bits = 0xfffffffffffff2fe; +/* EFER defaults: + * - enable syscall per default because its emulated by KVM + * - enable LME and LMA per default on 64 bit KVM + */ +#ifdef CONFIG_X86_64 +static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffafeULL; +#else +static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL; +#endif #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU From 9f62e19a1107466b9e9501e23a9dd5acb81fdca1 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 31 Jan 2008 14:57:39 +0100 Subject: [PATCH 018/147] KVM: VMX: unifdef the EFER specific code To allow access to the EFER register in 32bit KVM the EFER specific code has to be exported to the x86 generic code. This patch does this in a backwards compatible manner. [avi: add check for EFER-less hosts] Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a509910f6b53..76944f2c883b 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1335,14 +1335,14 @@ static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) vcpu->arch.cr4 = cr4; } -#ifdef CONFIG_X86_64 - static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER); vcpu->arch.shadow_efer = efer; + if (!msr) + return; if (efer & EFER_LMA) { vmcs_write32(VM_ENTRY_CONTROLS, vmcs_read32(VM_ENTRY_CONTROLS) | @@ -1359,8 +1359,6 @@ static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) setup_msrs(vmx); } -#endif - static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) { struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; @@ -1775,9 +1773,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmx->vcpu.arch.cr0 = 0x60000010; vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */ vmx_set_cr4(&vmx->vcpu, 0); -#ifdef CONFIG_X86_64 vmx_set_efer(&vmx->vcpu, 0); -#endif vmx_fpu_activate(&vmx->vcpu); update_exception_bitmap(&vmx->vcpu); @@ -2668,9 +2664,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .set_cr0 = vmx_set_cr0, .set_cr3 = vmx_set_cr3, .set_cr4 = vmx_set_cr4, -#ifdef CONFIG_X86_64 .set_efer = vmx_set_efer, -#endif .get_idt = vmx_get_idt, .set_idt = vmx_set_idt, .get_gdt = vmx_get_gdt, From 9457a712a2f464c4b21bb7f78998775c69673a0c Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 31 Jan 2008 14:57:40 +0100 Subject: [PATCH 019/147] KVM: allow access to EFER in 32bit KVM This patch makes the EFER register accessible on a 32bit KVM host. This is necessary to boot 32 bit PAE guests under SVM. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index db16f2353e4b..38edb2f558ea 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -432,8 +432,6 @@ static u32 emulated_msrs[] = { MSR_IA32_MISC_ENABLE, }; -#ifdef CONFIG_X86_64 - static void set_efer(struct kvm_vcpu *vcpu, u64 efer) { if (efer & efer_reserved_bits) { @@ -458,8 +456,6 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer) vcpu->arch.shadow_efer = efer; } -#endif - void kvm_enable_efer_bits(u64 mask) { efer_reserved_bits &= ~mask; @@ -489,11 +485,9 @@ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) { switch (msr) { -#ifdef CONFIG_X86_64 case MSR_EFER: set_efer(vcpu, data); break; -#endif case MSR_IA32_MC0_STATUS: pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n", __FUNCTION__, data); @@ -571,11 +565,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_IA32_MISC_ENABLE: data = vcpu->arch.ia32_misc_enable_msr; break; -#ifdef CONFIG_X86_64 case MSR_EFER: data = vcpu->arch.shadow_efer; break; -#endif default: pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); return 1; @@ -2880,9 +2872,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, set_cr8(vcpu, sregs->cr8); mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer; -#ifdef CONFIG_X86_64 kvm_x86_ops->set_efer(vcpu, sregs->efer); -#endif kvm_set_apic_base(vcpu, sregs->apic_base); kvm_x86_ops->decache_cr4_guest_bits(vcpu); From 33bd6a0b3e8baed6469c8e68ea1b16cb50c4f5af Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 7 Feb 2008 13:47:38 +0100 Subject: [PATCH 020/147] KVM: SVM: move feature detection to hardware setup code By moving the SVM feature detection from the each_cpu code to the hardware setup code it runs only once. As an additional advance the feature check is now available earlier in the module setup process. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index ff3bc74af728..5f527dc0e162 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -302,7 +302,6 @@ static void svm_hardware_enable(void *garbage) svm_data->asid_generation = 1; svm_data->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1; svm_data->next_asid = svm_data->max_asid + 1; - svm_features = cpuid_edx(SVM_CPUID_FUNC); asm volatile ("sgdt %0" : "=m"(gdt_descr)); gdt = (struct desc_struct *)gdt_descr.address; @@ -411,6 +410,9 @@ static __init int svm_hardware_setup(void) if (r) goto err_2; } + + svm_features = cpuid_edx(SVM_CPUID_FUNC); + return 0; err_2: From e3da3acdb32c1804a5c853feebcc037b7434076f Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 7 Feb 2008 13:47:39 +0100 Subject: [PATCH 021/147] KVM: SVM: add detection of Nested Paging feature Let SVM detect if the Nested Paging feature is available on the hardware. Disable it to keep this patch series bisectable. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 5f527dc0e162..c12a75953b5b 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -47,6 +47,8 @@ MODULE_LICENSE("GPL"); #define SVM_FEATURE_LBRV (1 << 1) #define SVM_DEATURE_SVML (1 << 2) +static bool npt_enabled = false; + static void kvm_reput_irq(struct vcpu_svm *svm); static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) @@ -413,6 +415,12 @@ static __init int svm_hardware_setup(void) svm_features = cpuid_edx(SVM_CPUID_FUNC); + if (!svm_has(SVM_FEATURE_NPT)) + npt_enabled = false; + + if (npt_enabled) + printk(KERN_INFO "kvm: Nested Paging enabled\n"); + return 0; err_2: From 6c7dac72d5c7dc0e09512dce865398167be9a8f7 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 7 Feb 2008 13:47:40 +0100 Subject: [PATCH 022/147] KVM: SVM: add module parameter to disable Nested Paging To disable the use of the Nested Paging feature even if it is available in hardware this patch adds a module parameter. Nested Paging can be disabled by passing npt=0 to the kvm_amd module. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index c12a75953b5b..fb5d6c2e6a08 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -48,6 +48,9 @@ MODULE_LICENSE("GPL"); #define SVM_DEATURE_SVML (1 << 2) static bool npt_enabled = false; +static int npt = 1; + +module_param(npt, int, S_IRUGO); static void kvm_reput_irq(struct vcpu_svm *svm); @@ -418,6 +421,11 @@ static __init int svm_hardware_setup(void) if (!svm_has(SVM_FEATURE_NPT)) npt_enabled = false; + if (npt_enabled && !npt) { + printk(KERN_INFO "kvm: Nested Paging disabled\n"); + npt_enabled = false; + } + if (npt_enabled) printk(KERN_INFO "kvm: Nested Paging enabled\n"); From 1855267210e1a8c9d41fe3a3c7a0d42eca5fb7cd Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 7 Feb 2008 13:47:41 +0100 Subject: [PATCH 023/147] KVM: export information about NPT to generic x86 code The generic x86 code has to know if the specific implementation uses Nested Paging. In the generic code Nested Paging is called Two Dimensional Paging (TDP) to avoid confusion with (future) TDP implementations of other vendors. This patch exports the availability of TDP to the generic x86 code. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 15 +++++++++++++++ arch/x86/kvm/svm.c | 4 +++- include/asm-x86/kvm_host.h | 2 ++ 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 6651dfadae50..21cfa289d0fe 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -32,6 +32,15 @@ #include #include +/* + * When setting this variable to true it enables Two-Dimensional-Paging + * where the hardware walks 2 page tables: + * 1. the guest-virtual to guest-physical + * 2. while doing 1. it walks guest-physical to host-physical + * If the hardware supports that we don't need to do shadow paging. + */ +static bool tdp_enabled = false; + #undef MMU_DEBUG #undef AUDIT @@ -1582,6 +1591,12 @@ out: } EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); +void kvm_enable_tdp(void) +{ + tdp_enabled = true; +} +EXPORT_SYMBOL_GPL(kvm_enable_tdp); + static void free_mmu_pages(struct kvm_vcpu *vcpu) { struct kvm_mmu_page *sp; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index fb5d6c2e6a08..9e29a13136c4 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -426,8 +426,10 @@ static __init int svm_hardware_setup(void) npt_enabled = false; } - if (npt_enabled) + if (npt_enabled) { printk(KERN_INFO "kvm: Nested Paging enabled\n"); + kvm_enable_tdp(); + } return 0; diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h index 274f153c8704..5c6ba2212b1b 100644 --- a/include/asm-x86/kvm_host.h +++ b/include/asm-x86/kvm_host.h @@ -493,6 +493,8 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu); int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code); +void kvm_enable_tdp(void); + int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3); int complete_pio(struct kvm_vcpu *vcpu); From 4d9976bbdc09e08b69fc12fee2042c3528187b32 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 7 Feb 2008 13:47:42 +0100 Subject: [PATCH 024/147] KVM: MMU: make the __nonpaging_map function generic The mapping function for the nonpaging case in the softmmu does basically the same as required for Nested Paging. Make this function generic so it can be used for both. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 21cfa289d0fe..33cd7c982dd3 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -979,10 +979,9 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) { } -static int __nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, - gfn_t gfn, struct page *page) +static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, + gfn_t gfn, struct page *page, int level) { - int level = PT32E_ROOT_LEVEL; hpa_t table_addr = vcpu->arch.mmu.root_hpa; int pt_write = 0; @@ -1042,7 +1041,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) spin_lock(&vcpu->kvm->mmu_lock); kvm_mmu_free_some_pages(vcpu); - r = __nonpaging_map(vcpu, v, write, gfn, page); + r = __direct_map(vcpu, v, write, gfn, page, PT32E_ROOT_LEVEL); spin_unlock(&vcpu->kvm->mmu_lock); up_read(&vcpu->kvm->slots_lock); From cc4b6871e771e76dc1de06adb8aed261a1c66be8 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 7 Feb 2008 13:47:43 +0100 Subject: [PATCH 025/147] KVM: export the load_pdptrs() function to modules The load_pdptrs() function is required in the SVM module for NPT support. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 1 + include/asm-x86/kvm_host.h | 2 ++ 2 files changed, 3 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 38edb2f558ea..0c910c774a9b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -213,6 +213,7 @@ out: return ret; } +EXPORT_SYMBOL_GPL(load_pdptrs); static bool pdptrs_changed(struct kvm_vcpu *vcpu) { diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h index 5c6ba2212b1b..623249890a0b 100644 --- a/include/asm-x86/kvm_host.h +++ b/include/asm-x86/kvm_host.h @@ -411,6 +411,8 @@ void kvm_mmu_zap_all(struct kvm *kvm); unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm); void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages); +int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3); + enum emulation_result { EMULATE_DONE, /* no further processing */ EMULATE_DO_MMIO, /* kvm_run filled with mmio request */ From fb72d1674d860b0c9ef9b66b7f4f01fe5b3d2c00 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 7 Feb 2008 13:47:44 +0100 Subject: [PATCH 026/147] KVM: MMU: add TDP support to the KVM MMU This patch contains the changes to the KVM MMU necessary for support of the Nested Paging feature in AMD Barcelona and Phenom Processors. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 79 ++++++++++++++++++++++++++++++++++++++++++++-- arch/x86/kvm/mmu.h | 6 ++++ 2 files changed, 82 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 33cd7c982dd3..f7541fe22cd8 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1097,6 +1097,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu) int i; gfn_t root_gfn; struct kvm_mmu_page *sp; + int metaphysical = 0; root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT; @@ -1105,14 +1106,20 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu) hpa_t root = vcpu->arch.mmu.root_hpa; ASSERT(!VALID_PAGE(root)); + if (tdp_enabled) + metaphysical = 1; sp = kvm_mmu_get_page(vcpu, root_gfn, 0, - PT64_ROOT_LEVEL, 0, ACC_ALL, NULL); + PT64_ROOT_LEVEL, metaphysical, + ACC_ALL, NULL); root = __pa(sp->spt); ++sp->root_count; vcpu->arch.mmu.root_hpa = root; return; } #endif + metaphysical = !is_paging(vcpu); + if (tdp_enabled) + metaphysical = 1; for (i = 0; i < 4; ++i) { hpa_t root = vcpu->arch.mmu.pae_root[i]; @@ -1126,7 +1133,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu) } else if (vcpu->arch.mmu.root_level == 0) root_gfn = 0; sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, - PT32_ROOT_LEVEL, !is_paging(vcpu), + PT32_ROOT_LEVEL, metaphysical, ACC_ALL, NULL); root = __pa(sp->spt); ++sp->root_count; @@ -1160,6 +1167,36 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, error_code & PFERR_WRITE_MASK, gfn); } +static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, + u32 error_code) +{ + struct page *page; + int r; + + ASSERT(vcpu); + ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); + + r = mmu_topup_memory_caches(vcpu); + if (r) + return r; + + down_read(¤t->mm->mmap_sem); + page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); + if (is_error_page(page)) { + kvm_release_page_clean(page); + up_read(¤t->mm->mmap_sem); + return 1; + } + spin_lock(&vcpu->kvm->mmu_lock); + kvm_mmu_free_some_pages(vcpu); + r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK, + gpa >> PAGE_SHIFT, page, TDP_ROOT_LEVEL); + spin_unlock(&vcpu->kvm->mmu_lock); + up_read(¤t->mm->mmap_sem); + + return r; +} + static void nonpaging_free(struct kvm_vcpu *vcpu) { mmu_free_roots(vcpu); @@ -1253,7 +1290,35 @@ static int paging32E_init_context(struct kvm_vcpu *vcpu) return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL); } -static int init_kvm_mmu(struct kvm_vcpu *vcpu) +static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu *context = &vcpu->arch.mmu; + + context->new_cr3 = nonpaging_new_cr3; + context->page_fault = tdp_page_fault; + context->free = nonpaging_free; + context->prefetch_page = nonpaging_prefetch_page; + context->shadow_root_level = TDP_ROOT_LEVEL; + context->root_hpa = INVALID_PAGE; + + if (!is_paging(vcpu)) { + context->gva_to_gpa = nonpaging_gva_to_gpa; + context->root_level = 0; + } else if (is_long_mode(vcpu)) { + context->gva_to_gpa = paging64_gva_to_gpa; + context->root_level = PT64_ROOT_LEVEL; + } else if (is_pae(vcpu)) { + context->gva_to_gpa = paging64_gva_to_gpa; + context->root_level = PT32E_ROOT_LEVEL; + } else { + context->gva_to_gpa = paging32_gva_to_gpa; + context->root_level = PT32_ROOT_LEVEL; + } + + return 0; +} + +static int init_kvm_softmmu(struct kvm_vcpu *vcpu) { ASSERT(vcpu); ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); @@ -1268,6 +1333,14 @@ static int init_kvm_mmu(struct kvm_vcpu *vcpu) return paging32_init_context(vcpu); } +static int init_kvm_mmu(struct kvm_vcpu *vcpu) +{ + if (tdp_enabled) + return init_kvm_tdp_mmu(vcpu); + else + return init_kvm_softmmu(vcpu); +} + static void destroy_kvm_mmu(struct kvm_vcpu *vcpu) { ASSERT(vcpu); diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 1fce19ec7a23..e64e9f56a65e 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -3,6 +3,12 @@ #include +#ifdef CONFIG_X86_64 +#define TDP_ROOT_LEVEL PT64_ROOT_LEVEL +#else +#define TDP_ROOT_LEVEL PT32E_ROOT_LEVEL +#endif + static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) { if (unlikely(vcpu->kvm->arch.n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES)) From 709ddebf81cb40e3c36c6109a7892e8b93a09464 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 7 Feb 2008 13:47:45 +0100 Subject: [PATCH 027/147] KVM: SVM: add support for Nested Paging This patch contains the SVM architecture dependent changes for KVM to enable support for the Nested Paging feature of AMD Barcelona and Phenom processors. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 72 ++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 67 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 9e29a13136c4..8e9d4a5dacda 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -47,7 +47,12 @@ MODULE_LICENSE("GPL"); #define SVM_FEATURE_LBRV (1 << 1) #define SVM_DEATURE_SVML (1 << 2) +/* enable NPT for AMD64 and X86 with PAE */ +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) +static bool npt_enabled = true; +#else static bool npt_enabled = false; +#endif static int npt = 1; module_param(npt, int, S_IRUGO); @@ -187,7 +192,7 @@ static inline void flush_guest_tlb(struct kvm_vcpu *vcpu) static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) { - if (!(efer & EFER_LMA)) + if (!npt_enabled && !(efer & EFER_LMA)) efer &= ~EFER_LME; to_svm(vcpu)->vmcb->save.efer = efer | MSR_EFER_SVME_MASK; @@ -573,6 +578,22 @@ static void init_vmcb(struct vmcb *vmcb) save->cr0 = 0x00000010 | X86_CR0_PG | X86_CR0_WP; save->cr4 = X86_CR4_PAE; /* rdx = ?? */ + + if (npt_enabled) { + /* Setup VMCB for Nested Paging */ + control->nested_ctl = 1; + control->intercept_exceptions &= ~(1 << PF_VECTOR); + control->intercept_cr_read &= ~(INTERCEPT_CR0_MASK| + INTERCEPT_CR3_MASK); + control->intercept_cr_write &= ~(INTERCEPT_CR0_MASK| + INTERCEPT_CR3_MASK); + save->g_pat = 0x0007040600070406ULL; + /* enable caching because the QEMU Bios doesn't enable it */ + save->cr0 = X86_CR0_ET; + save->cr3 = 0; + save->cr4 = 0; + } + } static int svm_vcpu_reset(struct kvm_vcpu *vcpu) @@ -807,6 +828,9 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) } } #endif + if (npt_enabled) + goto set; + if ((vcpu->arch.cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) { svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); vcpu->fpu_active = 1; @@ -814,18 +838,26 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) vcpu->arch.cr0 = cr0; cr0 |= X86_CR0_PG | X86_CR0_WP; - cr0 &= ~(X86_CR0_CD | X86_CR0_NW); if (!vcpu->fpu_active) { svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR); cr0 |= X86_CR0_TS; } +set: + /* + * re-enable caching here because the QEMU bios + * does not do it - this results in some delay at + * reboot + */ + cr0 &= ~(X86_CR0_CD | X86_CR0_NW); svm->vmcb->save.cr0 = cr0; } static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { vcpu->arch.cr4 = cr4; - to_svm(vcpu)->vmcb->save.cr4 = cr4 | X86_CR4_PAE; + if (!npt_enabled) + cr4 |= X86_CR4_PAE; + to_svm(vcpu)->vmcb->save.cr4 = cr4; } static void svm_set_segment(struct kvm_vcpu *vcpu, @@ -1313,14 +1345,34 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm, [SVM_EXIT_WBINVD] = emulate_on_interception, [SVM_EXIT_MONITOR] = invalid_op_interception, [SVM_EXIT_MWAIT] = invalid_op_interception, + [SVM_EXIT_NPF] = pf_interception, }; - static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); u32 exit_code = svm->vmcb->control.exit_code; + if (npt_enabled) { + int mmu_reload = 0; + if ((vcpu->arch.cr0 ^ svm->vmcb->save.cr0) & X86_CR0_PG) { + svm_set_cr0(vcpu, svm->vmcb->save.cr0); + mmu_reload = 1; + } + vcpu->arch.cr0 = svm->vmcb->save.cr0; + vcpu->arch.cr3 = svm->vmcb->save.cr3; + if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { + if (!load_pdptrs(vcpu, vcpu->arch.cr3)) { + kvm_inject_gp(vcpu, 0); + return 1; + } + } + if (mmu_reload) { + kvm_mmu_reset_context(vcpu); + kvm_mmu_load(vcpu); + } + } + kvm_reput_irq(svm); if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) { @@ -1331,7 +1383,8 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) } if (is_external_interrupt(svm->vmcb->control.exit_int_info) && - exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR) + exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR && + exit_code != SVM_EXIT_NPF) printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x " "exit_code 0x%x\n", __FUNCTION__, svm->vmcb->control.exit_int_info, @@ -1522,6 +1575,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) svm->host_dr6 = read_dr6(); svm->host_dr7 = read_dr7(); svm->vmcb->save.cr2 = vcpu->arch.cr2; + /* required for live migration with NPT */ + if (npt_enabled) + svm->vmcb->save.cr3 = vcpu->arch.cr3; if (svm->vmcb->save.dr7 & 0xff) { write_dr7(0); @@ -1665,6 +1721,12 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) { struct vcpu_svm *svm = to_svm(vcpu); + if (npt_enabled) { + svm->vmcb->control.nested_cr3 = root; + force_new_asid(vcpu); + return; + } + svm->vmcb->save.cr3 = root; force_new_asid(vcpu); From 2e11384c2c6f1ce662b1e5b05ba49b216a052f2a Mon Sep 17 00:00:00 2001 From: Ryan Harper Date: Mon, 11 Feb 2008 10:26:38 -0600 Subject: [PATCH 028/147] KVM: VMX: fix typo in VMX header define Looking at Intel Volume 3b, page 148, table 20-11 and noticed that the field name is 'Deliver' not 'Deliever'. Attached patch changes the define name and its user in vmx.c Signed-off-by: Ryan Harper Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 6 +++--- arch/x86/kvm/vmx.h | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 76944f2c883b..2d5ccec3f9e7 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -632,7 +632,7 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, { vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, nr | INTR_TYPE_EXCEPTION - | (has_error_code ? INTR_INFO_DELIEVER_CODE_MASK : 0) + | (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0) | INTR_INFO_VALID_MASK); if (has_error_code) vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); @@ -1935,7 +1935,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) error_code = 0; rip = vmcs_readl(GUEST_RIP); - if (intr_info & INTR_INFO_DELIEVER_CODE_MASK) + if (intr_info & INTR_INFO_DELIVER_CODE_MASK) error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); if (is_page_fault(intr_info)) { cr2 = vmcs_readl(EXIT_QUALIFICATION); @@ -2351,7 +2351,7 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu) vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, vmcs_read32(VM_EXIT_INSTRUCTION_LEN)); - if (unlikely(idtv_info_field & INTR_INFO_DELIEVER_CODE_MASK)) + if (unlikely(idtv_info_field & INTR_INFO_DELIVER_CODE_MASK)) vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, vmcs_read32(IDT_VECTORING_ERROR_CODE)); if (unlikely(has_ext_irq)) diff --git a/arch/x86/kvm/vmx.h b/arch/x86/kvm/vmx.h index 436ce0f29092..5dff4606b988 100644 --- a/arch/x86/kvm/vmx.h +++ b/arch/x86/kvm/vmx.h @@ -233,12 +233,12 @@ enum vmcs_field { */ #define INTR_INFO_VECTOR_MASK 0xff /* 7:0 */ #define INTR_INFO_INTR_TYPE_MASK 0x700 /* 10:8 */ -#define INTR_INFO_DELIEVER_CODE_MASK 0x800 /* 11 */ +#define INTR_INFO_DELIVER_CODE_MASK 0x800 /* 11 */ #define INTR_INFO_VALID_MASK 0x80000000 /* 31 */ #define VECTORING_INFO_VECTOR_MASK INTR_INFO_VECTOR_MASK #define VECTORING_INFO_TYPE_MASK INTR_INFO_INTR_TYPE_MASK -#define VECTORING_INFO_DELIEVER_CODE_MASK INTR_INFO_DELIEVER_CODE_MASK +#define VECTORING_INFO_DELIVER_CODE_MASK INTR_INFO_DELIVER_CODE_MASK #define VECTORING_INFO_VALID_MASK INTR_INFO_VALID_MASK #define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */ From e6101a96c9efb74c98bba6322d4c5ea89e47e0fe Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 13 Feb 2008 18:58:45 +0100 Subject: [PATCH 029/147] KVM: SVM: let init_vmcb() take struct vcpu_svm as parameter Change the parameter of the init_vmcb() function in the kvm-amd module from struct vmcb to struct vcpu_svm. Signed-off-by: Joerg Roedel Signed-off-by: Markus Rechberger Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 8e9d4a5dacda..d934819733ce 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -471,10 +471,10 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type) seg->base = 0; } -static void init_vmcb(struct vmcb *vmcb) +static void init_vmcb(struct vcpu_svm *svm) { - struct vmcb_control_area *control = &vmcb->control; - struct vmcb_save_area *save = &vmcb->save; + struct vmcb_control_area *control = &svm->vmcb->control; + struct vmcb_save_area *save = &svm->vmcb->save; control->intercept_cr_read = INTERCEPT_CR0_MASK | INTERCEPT_CR3_MASK | @@ -600,7 +600,7 @@ static int svm_vcpu_reset(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - init_vmcb(svm->vmcb); + init_vmcb(svm); if (vcpu->vcpu_id != 0) { svm->vmcb->save.rip = 0; @@ -638,7 +638,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; svm->asid_generation = 0; memset(svm->db_regs, 0, sizeof(svm->db_regs)); - init_vmcb(svm->vmcb); + init_vmcb(svm); fx_init(&svm->vcpu); svm->vcpu.fpu_active = 1; @@ -1024,7 +1024,7 @@ static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) * so reinitialize it. */ clear_page(svm->vmcb); - init_vmcb(svm->vmcb); + init_vmcb(svm); kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; return 0; From f65c229c3e7743c6654c16b9ec6248466b5eef21 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 13 Feb 2008 18:58:46 +0100 Subject: [PATCH 030/147] KVM: SVM: allocate the MSR permission map per VCPU This patch changes the kvm-amd module to allocate the SVM MSR permission map per VCPU instead of a global map for all VCPUs. With this we have more flexibility allowing specific guests to access virtualized MSRs. This is required for LBR virtualization. Signed-off-by: Joerg Roedel Signed-off-by: Markus Rechberger Signed-off-by: Avi Kivity --- arch/x86/kvm/kvm_svm.h | 2 ++ arch/x86/kvm/svm.c | 67 ++++++++++++++++++++---------------------- 2 files changed, 34 insertions(+), 35 deletions(-) diff --git a/arch/x86/kvm/kvm_svm.h b/arch/x86/kvm/kvm_svm.h index ecdfe97e4635..65ef0fc2c036 100644 --- a/arch/x86/kvm/kvm_svm.h +++ b/arch/x86/kvm/kvm_svm.h @@ -39,6 +39,8 @@ struct vcpu_svm { unsigned long host_db_regs[NUM_DB_REGS]; unsigned long host_dr6; unsigned long host_dr7; + + u32 *msrpm; }; #endif diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index d934819733ce..281a2ffe1224 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -65,7 +65,6 @@ static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) } unsigned long iopm_base; -unsigned long msrpm_base; struct kvm_ldttss_desc { u16 limit0; @@ -370,12 +369,29 @@ static void set_msr_interception(u32 *msrpm, unsigned msr, BUG(); } +static void svm_vcpu_init_msrpm(u32 *msrpm) +{ + memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER)); + +#ifdef CONFIG_X86_64 + set_msr_interception(msrpm, MSR_GS_BASE, 1, 1); + set_msr_interception(msrpm, MSR_FS_BASE, 1, 1); + set_msr_interception(msrpm, MSR_KERNEL_GS_BASE, 1, 1); + set_msr_interception(msrpm, MSR_LSTAR, 1, 1); + set_msr_interception(msrpm, MSR_CSTAR, 1, 1); + set_msr_interception(msrpm, MSR_SYSCALL_MASK, 1, 1); +#endif + set_msr_interception(msrpm, MSR_K6_STAR, 1, 1); + set_msr_interception(msrpm, MSR_IA32_SYSENTER_CS, 1, 1); + set_msr_interception(msrpm, MSR_IA32_SYSENTER_ESP, 1, 1); + set_msr_interception(msrpm, MSR_IA32_SYSENTER_EIP, 1, 1); +} + static __init int svm_hardware_setup(void) { int cpu; struct page *iopm_pages; - struct page *msrpm_pages; - void *iopm_va, *msrpm_va; + void *iopm_va; int r; iopm_pages = alloc_pages(GFP_KERNEL, IOPM_ALLOC_ORDER); @@ -388,37 +404,13 @@ static __init int svm_hardware_setup(void) clear_bit(0x80, iopm_va); /* allow direct access to PC debug port */ iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT; - - msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER); - - r = -ENOMEM; - if (!msrpm_pages) - goto err_1; - - msrpm_va = page_address(msrpm_pages); - memset(msrpm_va, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER)); - msrpm_base = page_to_pfn(msrpm_pages) << PAGE_SHIFT; - -#ifdef CONFIG_X86_64 - set_msr_interception(msrpm_va, MSR_GS_BASE, 1, 1); - set_msr_interception(msrpm_va, MSR_FS_BASE, 1, 1); - set_msr_interception(msrpm_va, MSR_KERNEL_GS_BASE, 1, 1); - set_msr_interception(msrpm_va, MSR_LSTAR, 1, 1); - set_msr_interception(msrpm_va, MSR_CSTAR, 1, 1); - set_msr_interception(msrpm_va, MSR_SYSCALL_MASK, 1, 1); -#endif - set_msr_interception(msrpm_va, MSR_K6_STAR, 1, 1); - set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_CS, 1, 1); - set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_ESP, 1, 1); - set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_EIP, 1, 1); - if (boot_cpu_has(X86_FEATURE_NX)) kvm_enable_efer_bits(EFER_NX); for_each_online_cpu(cpu) { r = svm_cpu_init(cpu); if (r) - goto err_2; + goto err; } svm_features = cpuid_edx(SVM_CPUID_FUNC); @@ -438,10 +430,7 @@ static __init int svm_hardware_setup(void) return 0; -err_2: - __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER); - msrpm_base = 0; -err_1: +err: __free_pages(iopm_pages, IOPM_ALLOC_ORDER); iopm_base = 0; return r; @@ -449,9 +438,8 @@ err_1: static __exit void svm_hardware_unsetup(void) { - __free_pages(pfn_to_page(msrpm_base >> PAGE_SHIFT), MSRPM_ALLOC_ORDER); __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER); - iopm_base = msrpm_base = 0; + iopm_base = 0; } static void init_seg(struct vmcb_seg *seg) @@ -536,7 +524,7 @@ static void init_vmcb(struct vcpu_svm *svm) (1ULL << INTERCEPT_MWAIT); control->iopm_base_pa = iopm_base; - control->msrpm_base_pa = msrpm_base; + control->msrpm_base_pa = __pa(svm->msrpm); control->tsc_offset = 0; control->int_ctl = V_INTR_MASKING_MASK; @@ -615,6 +603,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) { struct vcpu_svm *svm; struct page *page; + struct page *msrpm_pages; int err; svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); @@ -633,6 +622,13 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) goto uninit; } + err = -ENOMEM; + msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER); + if (!msrpm_pages) + goto uninit; + svm->msrpm = page_address(msrpm_pages); + svm_vcpu_init_msrpm(svm->msrpm); + svm->vmcb = page_address(page); clear_page(svm->vmcb); svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; @@ -661,6 +657,7 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); __free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT)); + __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER); kvm_vcpu_uninit(vcpu); kmem_cache_free(kvm_vcpu_cache, svm); } From 24e09cbf480a72f9c952af4ca77b159503dca44b Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 13 Feb 2008 18:58:47 +0100 Subject: [PATCH 031/147] KVM: SVM: enable LBR virtualization This patch implements the Last Branch Record Virtualization (LBRV) feature of the AMD Barcelona and Phenom processors into the kvm-amd module. It will only be enabled if the guest enables last branch recording in the DEBUG_CTL MSR. So there is no increased world switch overhead when the guest doesn't use these MSRs. Signed-off-by: Joerg Roedel Signed-off-by: Markus Rechberger Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 39 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 281a2ffe1224..7d73e935dcc1 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -47,6 +47,8 @@ MODULE_LICENSE("GPL"); #define SVM_FEATURE_LBRV (1 << 1) #define SVM_DEATURE_SVML (1 << 2) +#define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) + /* enable NPT for AMD64 and X86 with PAE */ #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) static bool npt_enabled = true; @@ -387,6 +389,28 @@ static void svm_vcpu_init_msrpm(u32 *msrpm) set_msr_interception(msrpm, MSR_IA32_SYSENTER_EIP, 1, 1); } +static void svm_enable_lbrv(struct vcpu_svm *svm) +{ + u32 *msrpm = svm->msrpm; + + svm->vmcb->control.lbr_ctl = 1; + set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1); + set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1); + set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 1, 1); + set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 1, 1); +} + +static void svm_disable_lbrv(struct vcpu_svm *svm) +{ + u32 *msrpm = svm->msrpm; + + svm->vmcb->control.lbr_ctl = 0; + set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0); + set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0); + set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 0, 0); + set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0); +} + static __init int svm_hardware_setup(void) { int cpu; @@ -1231,8 +1255,19 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) svm->vmcb->save.sysenter_esp = data; break; case MSR_IA32_DEBUGCTLMSR: - pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n", - __FUNCTION__, data); + if (!svm_has(SVM_FEATURE_LBRV)) { + pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n", + __FUNCTION__, data); + break; + } + if (data & DEBUGCTL_RESERVED_BITS) + return 1; + + svm->vmcb->save.dbgctl = data; + if (data & (1ULL<<0)) + svm_enable_lbrv(svm); + else + svm_disable_lbrv(svm); break; case MSR_K7_EVNTSEL0: case MSR_K7_EVNTSEL1: From 18068523d3a0b41fcee5b53cdb437a0ab4d65e4b Mon Sep 17 00:00:00 2001 From: Glauber de Oliveira Costa Date: Fri, 15 Feb 2008 17:52:47 -0200 Subject: [PATCH 032/147] KVM: paravirtualized clocksource: host part This is the host part of kvm clocksource implementation. As it does not include clockevents, it is a fairly simple implementation. We only have to register a per-vcpu area, and start writing to it periodically. The area is binary compatible with xen, as we use the same shadow_info structure. [marcelo: fix bad_page on MSR_KVM_SYSTEM_TIME] [avi: save full value of the msr, even if enable bit is clear] [avi: clear previous value of time_page] Signed-off-by: Glauber de Oliveira Costa Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 113 ++++++++++++++++++++++++++++++++++++- include/asm-x86/kvm_host.h | 7 +++ include/asm-x86/kvm_para.h | 25 ++++++++ include/linux/kvm.h | 1 + 4 files changed, 145 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0c910c774a9b..256c0fc92b67 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -19,6 +19,7 @@ #include "irq.h" #include "mmu.h" +#include #include #include #include @@ -424,7 +425,7 @@ static u32 msrs_to_save[] = { #ifdef CONFIG_X86_64 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, #endif - MSR_IA32_TIME_STAMP_COUNTER, + MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, }; static unsigned num_msrs_to_save; @@ -482,6 +483,70 @@ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) return kvm_set_msr(vcpu, index, *data); } +static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) +{ + static int version; + struct kvm_wall_clock wc; + struct timespec wc_ts; + + if (!wall_clock) + return; + + version++; + + down_read(&kvm->slots_lock); + kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); + + wc_ts = current_kernel_time(); + wc.wc_sec = wc_ts.tv_sec; + wc.wc_nsec = wc_ts.tv_nsec; + wc.wc_version = version; + + kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc)); + + version++; + kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); + up_read(&kvm->slots_lock); +} + +static void kvm_write_guest_time(struct kvm_vcpu *v) +{ + struct timespec ts; + unsigned long flags; + struct kvm_vcpu_arch *vcpu = &v->arch; + void *shared_kaddr; + + if ((!vcpu->time_page)) + return; + + /* Keep irq disabled to prevent changes to the clock */ + local_irq_save(flags); + kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER, + &vcpu->hv_clock.tsc_timestamp); + ktime_get_ts(&ts); + local_irq_restore(flags); + + /* With all the info we got, fill in the values */ + + vcpu->hv_clock.system_time = ts.tv_nsec + + (NSEC_PER_SEC * (u64)ts.tv_sec); + /* + * The interface expects us to write an even number signaling that the + * update is finished. Since the guest won't see the intermediate + * state, we just write "2" at the end + */ + vcpu->hv_clock.version = 2; + + shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0); + + memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock, + sizeof(vcpu->hv_clock)); + + kunmap_atomic(shared_kaddr, KM_USER0); + + mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT); +} + int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) { @@ -511,6 +576,44 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) case MSR_IA32_MISC_ENABLE: vcpu->arch.ia32_misc_enable_msr = data; break; + case MSR_KVM_WALL_CLOCK: + vcpu->kvm->arch.wall_clock = data; + kvm_write_wall_clock(vcpu->kvm, data); + break; + case MSR_KVM_SYSTEM_TIME: { + if (vcpu->arch.time_page) { + kvm_release_page_dirty(vcpu->arch.time_page); + vcpu->arch.time_page = NULL; + } + + vcpu->arch.time = data; + + /* we verify if the enable bit is set... */ + if (!(data & 1)) + break; + + /* ...but clean it before doing the actual write */ + vcpu->arch.time_offset = data & ~(PAGE_MASK | 1); + + vcpu->arch.hv_clock.tsc_to_system_mul = + clocksource_khz2mult(tsc_khz, 22); + vcpu->arch.hv_clock.tsc_shift = 22; + + down_read(¤t->mm->mmap_sem); + down_read(&vcpu->kvm->slots_lock); + vcpu->arch.time_page = + gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT); + up_read(&vcpu->kvm->slots_lock); + up_read(¤t->mm->mmap_sem); + + if (is_error_page(vcpu->arch.time_page)) { + kvm_release_page_clean(vcpu->arch.time_page); + vcpu->arch.time_page = NULL; + } + + kvm_write_guest_time(vcpu); + break; + } default: pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data); return 1; @@ -569,6 +672,12 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_EFER: data = vcpu->arch.shadow_efer; break; + case MSR_KVM_WALL_CLOCK: + data = vcpu->kvm->arch.wall_clock; + break; + case MSR_KVM_SYSTEM_TIME: + data = vcpu->arch.time; + break; default: pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); return 1; @@ -696,6 +805,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_USER_MEMORY: case KVM_CAP_SET_TSS_ADDR: case KVM_CAP_EXT_CPUID: + case KVM_CAP_CLOCKSOURCE: r = 1; break; case KVM_CAP_VAPIC: @@ -771,6 +881,7 @@ out: void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { kvm_x86_ops->vcpu_load(vcpu, cpu); + kvm_write_guest_time(vcpu); } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h index 623249890a0b..90c80fd830fc 100644 --- a/include/asm-x86/kvm_host.h +++ b/include/asm-x86/kvm_host.h @@ -262,6 +262,11 @@ struct kvm_vcpu_arch { /* emulate context */ struct x86_emulate_ctxt emulate_ctxt; + + gpa_t time; + struct kvm_vcpu_time_info hv_clock; + unsigned int time_offset; + struct page *time_page; }; struct kvm_mem_alias { @@ -288,6 +293,8 @@ struct kvm_arch{ int round_robin_prev_vcpu; unsigned int tss_addr; struct page *apic_access_page; + + gpa_t wall_clock; }; struct kvm_vm_stat { diff --git a/include/asm-x86/kvm_para.h b/include/asm-x86/kvm_para.h index c6f3fd8d8c53..5ab7d3dbd357 100644 --- a/include/asm-x86/kvm_para.h +++ b/include/asm-x86/kvm_para.h @@ -10,10 +10,35 @@ * paravirtualization, the appropriate feature bit should be checked. */ #define KVM_CPUID_FEATURES 0x40000001 +#define KVM_FEATURE_CLOCKSOURCE 0 + +#define MSR_KVM_WALL_CLOCK 0x11 +#define MSR_KVM_SYSTEM_TIME 0x12 #ifdef __KERNEL__ #include +/* xen binary-compatible interface. See xen headers for details */ +struct kvm_vcpu_time_info { + uint32_t version; + uint32_t pad0; + uint64_t tsc_timestamp; + uint64_t system_time; + uint32_t tsc_to_system_mul; + int8_t tsc_shift; + int8_t pad[3]; +} __attribute__((__packed__)); /* 32 bytes */ + +struct kvm_wall_clock { + uint32_t wc_version; + uint32_t wc_sec; + uint32_t wc_nsec; +} __attribute__((__packed__)); + + +extern void kvmclock_init(void); + + /* This instruction is vmcall. On non-VT architectures, it will generate a * trap that we will then rewrite to the appropriate instruction. */ diff --git a/include/linux/kvm.h b/include/linux/kvm.h index c1ec04fd000d..94540b3c6872 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -233,6 +233,7 @@ struct kvm_vapic_addr { #define KVM_CAP_SET_TSS_ADDR 4 #define KVM_CAP_VAPIC 6 #define KVM_CAP_EXT_CPUID 7 +#define KVM_CAP_CLOCKSOURCE 8 /* * ioctls for VM fds From 790c73f6289a204f858ffdcbe4a2b38e91657ec6 Mon Sep 17 00:00:00 2001 From: Glauber de Oliveira Costa Date: Fri, 15 Feb 2008 17:52:48 -0200 Subject: [PATCH 033/147] x86: KVM guest: paravirtualized clocksource This is the guest part of kvm clock implementation It does not do tsc-only timing, as tsc can have deltas between cpus, and it did not seem worthy to me to keep adjusting them. We do use it, however, for fine-grained adjustment. Other than that, time comes from the host. [randy dunlap: add missing include] [randy dunlap: disallow on Voyager or Visual WS] Signed-off-by: Glauber de Oliveira Costa Signed-off-by: Randy Dunlap Signed-off-by: Avi Kivity --- arch/x86/Kconfig | 11 +++ arch/x86/kernel/Makefile | 1 + arch/x86/kernel/kvmclock.c | 160 +++++++++++++++++++++++++++++++++++++ arch/x86/kernel/setup_32.c | 5 ++ arch/x86/kernel/setup_64.c | 5 ++ 5 files changed, 182 insertions(+) create mode 100644 arch/x86/kernel/kvmclock.c diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 2fadf794483d..40cedc255eda 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -373,6 +373,17 @@ config VMI at the moment), by linking the kernel to a GPL-ed ROM module provided by the hypervisor. +config KVM_CLOCK + bool "KVM paravirtualized clock" + select PARAVIRT + depends on !(X86_VISWS || X86_VOYAGER) + help + Turning on this option will allow you to run a paravirtualized clock + when running over the KVM hypervisor. Instead of relying on a PIT + (or probably other) emulation by the underlying device model, the host + provides the guest with timing infrastructure such as time of day, and + system time + source "arch/x86/lguest/Kconfig" config PARAVIRT diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 90e092d0af0c..483047a33024 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -80,6 +80,7 @@ obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o +obj-$(CONFIG_KVM_CLOCK) += kvmclock.o obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o ifdef CONFIG_INPUT_PCSPKR diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c new file mode 100644 index 000000000000..b999f5e5b3bf --- /dev/null +++ b/arch/x86/kernel/kvmclock.c @@ -0,0 +1,160 @@ +/* KVM paravirtual clock driver. A clocksource implementation + Copyright (C) 2008 Glauber de Oliveira Costa, Red Hat Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +*/ + +#include +#include +#include +#include +#include +#include + +#define KVM_SCALE 22 + +static int kvmclock = 1; + +static int parse_no_kvmclock(char *arg) +{ + kvmclock = 0; + return 0; +} +early_param("no-kvmclock", parse_no_kvmclock); + +/* The hypervisor will put information about time periodically here */ +static DEFINE_PER_CPU_SHARED_ALIGNED(struct kvm_vcpu_time_info, hv_clock); +#define get_clock(cpu, field) per_cpu(hv_clock, cpu).field + +static inline u64 kvm_get_delta(u64 last_tsc) +{ + int cpu = smp_processor_id(); + u64 delta = native_read_tsc() - last_tsc; + return (delta * get_clock(cpu, tsc_to_system_mul)) >> KVM_SCALE; +} + +static struct kvm_wall_clock wall_clock; +static cycle_t kvm_clock_read(void); +/* + * The wallclock is the time of day when we booted. Since then, some time may + * have elapsed since the hypervisor wrote the data. So we try to account for + * that with system time + */ +unsigned long kvm_get_wallclock(void) +{ + u32 wc_sec, wc_nsec; + u64 delta; + struct timespec ts; + int version, nsec; + int low, high; + + low = (int)__pa(&wall_clock); + high = ((u64)__pa(&wall_clock) >> 32); + + delta = kvm_clock_read(); + + native_write_msr(MSR_KVM_WALL_CLOCK, low, high); + do { + version = wall_clock.wc_version; + rmb(); + wc_sec = wall_clock.wc_sec; + wc_nsec = wall_clock.wc_nsec; + rmb(); + } while ((wall_clock.wc_version != version) || (version & 1)); + + delta = kvm_clock_read() - delta; + delta += wc_nsec; + nsec = do_div(delta, NSEC_PER_SEC); + set_normalized_timespec(&ts, wc_sec + delta, nsec); + /* + * Of all mechanisms of time adjustment I've tested, this one + * was the champion! + */ + return ts.tv_sec + 1; +} + +int kvm_set_wallclock(unsigned long now) +{ + return 0; +} + +/* + * This is our read_clock function. The host puts an tsc timestamp each time + * it updates a new time. Without the tsc adjustment, we can have a situation + * in which a vcpu starts to run earlier (smaller system_time), but probes + * time later (compared to another vcpu), leading to backwards time + */ +static cycle_t kvm_clock_read(void) +{ + u64 last_tsc, now; + int cpu; + + preempt_disable(); + cpu = smp_processor_id(); + + last_tsc = get_clock(cpu, tsc_timestamp); + now = get_clock(cpu, system_time); + + now += kvm_get_delta(last_tsc); + preempt_enable(); + + return now; +} +static struct clocksource kvm_clock = { + .name = "kvm-clock", + .read = kvm_clock_read, + .rating = 400, + .mask = CLOCKSOURCE_MASK(64), + .mult = 1 << KVM_SCALE, + .shift = KVM_SCALE, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, +}; + +static int kvm_register_clock(void) +{ + int cpu = smp_processor_id(); + int low, high; + low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1; + high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32); + + return native_write_msr_safe(MSR_KVM_SYSTEM_TIME, low, high); +} + +static void kvm_setup_secondary_clock(void) +{ + /* + * Now that the first cpu already had this clocksource initialized, + * we shouldn't fail. + */ + WARN_ON(kvm_register_clock()); + /* ok, done with our trickery, call native */ + setup_secondary_APIC_clock(); +} + +void __init kvmclock_init(void) +{ + if (!kvm_para_available()) + return; + + if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) { + if (kvm_register_clock()) + return; + pv_time_ops.get_wallclock = kvm_get_wallclock; + pv_time_ops.set_wallclock = kvm_set_wallclock; + pv_time_ops.sched_clock = kvm_clock_read; + pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock; + clocksource_register(&kvm_clock); + } +} diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index 44cc9b933932..5a849ddd09ee 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -47,6 +47,7 @@ #include #include #include +#include #include