From 61a734d305e16944b42730ef582a7171dc733321 Mon Sep 17 00:00:00 2001 From: Ross Lagerwall Date: Mon, 18 Aug 2014 10:41:36 +0100 Subject: [PATCH 1/5] xen/manage: Always freeze/thaw processes when suspend/resuming Always freeze processes when suspending and thaw processes when resuming to prevent a race noticeable with HVM guests. This prevents a deadlock where the khubd kthread (which is designed to be freezable) acquires a usb device lock and then tries to allocate memory which requires the disk which hasn't been resumed yet. Meanwhile, the xenwatch thread deadlocks waiting for the usb device lock. Freezing processes fixes this because the khubd thread is only thawed after the xenwatch thread finishes resuming all the devices. Signed-off-by: Ross Lagerwall Signed-off-by: David Vrabel Cc: stable@vger.kernel.org --- drivers/xen/manage.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index 5f1e1f3cd186..f8bb36f9d9ce 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -103,16 +103,11 @@ static void do_suspend(void) shutting_down = SHUTDOWN_SUSPEND; -#ifdef CONFIG_PREEMPT - /* If the kernel is preemptible, we need to freeze all the processes - to prevent them from being in the middle of a pagetable update - during suspend. */ err = freeze_processes(); if (err) { pr_err("%s: freeze failed %d\n", __func__, err); goto out; } -#endif err = dpm_suspend_start(PMSG_FREEZE); if (err) { @@ -157,10 +152,8 @@ out_resume: dpm_resume_end(si.cancelled ? 
PMSG_THAW : PMSG_RESTORE); out_thaw: -#ifdef CONFIG_PREEMPT thaw_processes(); out: -#endif shutting_down = SHUTDOWN_INVALID; } #endif /* CONFIG_HIBERNATE_CALLBACKS */ From 3dcf63677d4eb7fdfc13290c8558c301d2588fe8 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Mon, 1 Sep 2014 18:52:44 +0100 Subject: [PATCH 2/5] xen/balloon: cancel ballooning if adding new memory failed If the balloon driver is adding additional memory regions to the balloon and add_memory() fails it will likely continuously fail so cancel the balloon operation. Signed-off-by: David Vrabel Reviewed-by: Daniel Kiper --- drivers/xen/balloon.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index 5c660c77f03b..1e0a317d3dcd 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -230,8 +230,8 @@ static enum bp_state reserve_additional_memory(long credit) rc = add_memory(nid, hotplug_start_paddr, balloon_hotplug << PAGE_SHIFT); if (rc) { - pr_info("%s: add_memory() failed: %i\n", __func__, rc); - return BP_EAGAIN; + pr_warn("Cannot add additional memory (%i)\n", rc); + return BP_ECANCELED; } balloon_hotplug -= credit; From e9de2e5fd602c4f5ddf212d3837b19ad4f5878ad Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Tue, 2 Sep 2014 15:21:29 +0100 Subject: [PATCH 3/5] xen/gntalloc: fix oops after running out of grant refs Only set gref->gref_id if foreign access was successfully granted and the grant ref is valid. If gref->gref_id == -ENOSPC the test in __del_gref() would incorrectly attempt to end foreign access (because grant_ref_t is unsigned). 
Signed-off-by: David Vrabel Reported-by: Dave Scott Reviewed-by: Boris Ostrovsky --- drivers/xen/gntalloc.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/xen/gntalloc.c b/drivers/xen/gntalloc.c index 787d17945418..8ed2bb4f6f21 100644 --- a/drivers/xen/gntalloc.c +++ b/drivers/xen/gntalloc.c @@ -141,13 +141,11 @@ static int add_grefs(struct ioctl_gntalloc_alloc_gref *op, goto undo; /* Grant foreign access to the page. */ - gref->gref_id = gnttab_grant_foreign_access(op->domid, + rc = gnttab_grant_foreign_access(op->domid, pfn_to_mfn(page_to_pfn(gref->page)), readonly); - if ((int)gref->gref_id < 0) { - rc = gref->gref_id; + if (rc < 0) goto undo; - } - gref_ids[i] = gref->gref_id; + gref_ids[i] = gref->gref_id = rc; } /* Add to gref lists. */ @@ -193,7 +191,7 @@ static void __del_gref(struct gntalloc_gref *gref) gref->notify.flags = 0; - if (gref->gref_id > 0) { + if (gref->gref_id) { if (gnttab_query_foreign_access(gref->gref_id)) return; From 5903c6bd1a48d90b99e207ec2a6a7673cbbb732d Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Tue, 2 Sep 2014 15:21:30 +0100 Subject: [PATCH 4/5] xen/gntalloc: safely delete grefs in add_grefs() undo path If a gref could not be added (perhaps because the limit has been reached or there are no more grant references available), the undo path may crash because __del_gref() frees the gref while it is being used for a list iteration. A comment suggests that using list_for_each_entry() is safe since the gref isn't removed from the list being iterated over, but it is freed and thus list_for_each_entry_safe() must be used. Also, explicitly delete the gref from the local per-file list, even though this is not strictly necessary. 
Signed-off-by: David Vrabel Reviewed-by: Boris Ostrovsky --- drivers/xen/gntalloc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/xen/gntalloc.c b/drivers/xen/gntalloc.c index 8ed2bb4f6f21..e53fe191738c 100644 --- a/drivers/xen/gntalloc.c +++ b/drivers/xen/gntalloc.c @@ -124,7 +124,7 @@ static int add_grefs(struct ioctl_gntalloc_alloc_gref *op, int i, rc, readonly; LIST_HEAD(queue_gref); LIST_HEAD(queue_file); - struct gntalloc_gref *gref; + struct gntalloc_gref *gref, *next; readonly = !(op->flags & GNTALLOC_FLAG_WRITABLE); rc = -ENOMEM; @@ -160,8 +160,8 @@ undo: mutex_lock(&gref_mutex); gref_size -= (op->count - i); - list_for_each_entry(gref, &queue_file, next_file) { - /* __del_gref does not remove from queue_file */ + list_for_each_entry_safe(gref, next, &queue_file, next_file) { + list_del(&gref->next_file); __del_gref(gref); } From 0b5a50635fc916cf46e3de0b819a61fc3f17e7ee Mon Sep 17 00:00:00 2001 From: Stefan Bader Date: Tue, 2 Sep 2014 11:16:01 +0100 Subject: [PATCH 5/5] x86/xen: don't copy bogus duplicate entries into kernel page tables When RANDOMIZE_BASE (KASLR) is enabled, or the sum of all loaded modules exceeds 512 MiB, then loading modules fails with a warning (and hence a vmalloc allocation failure) because the PTEs for the newly-allocated vmalloc address space are not zero. WARNING: CPU: 0 PID: 494 at linux/mm/vmalloc.c:128 vmap_page_range_noflush+0x2a1/0x360() This is caused by xen_setup_kernel_pagetable() copying level2_kernel_pgt into level2_fixmap_pgt, overwriting many non-present entries. Without KASLR, the normal kernel image size only covers the first half of level2_kernel_pgt and module space starts after that. L4[511]->level3_kernel_pgt[510]->level2_kernel_pgt[ 0..255]->kernel [256..511]->module [511]->level2_fixmap_pgt[ 0..505]->module This allows 512 MiB of module vmalloc space to be used before having to use the corrupted level2_fixmap_pgt entries. 
With KASLR enabled, the kernel image uses the full PUD range of 1G and module space starts in the level2_fixmap_pgt. So basically: L4[511]->level3_kernel_pgt[510]->level2_kernel_pgt[0..511]->kernel [511]->level2_fixmap_pgt[0..505]->module And now no module vmalloc space can be used without using the corrupt level2_fixmap_pgt entries. Fix this by properly converting the level2_fixmap_pgt entries to MFNs, and setting level1_fixmap_pgt as read-only. A number of comments were also using the wrong L3 offset for level2_kernel_pgt. These have been corrected. Signed-off-by: Stefan Bader Signed-off-by: David Vrabel Reviewed-by: Boris Ostrovsky Cc: stable@vger.kernel.org --- arch/x86/include/asm/pgtable_64.h | 1 + arch/x86/xen/mmu.c | 27 ++++++++++++--------------- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 5be9063545d2..3874693c0e53 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -19,6 +19,7 @@ extern pud_t level3_ident_pgt[512]; extern pmd_t level2_kernel_pgt[512]; extern pmd_t level2_fixmap_pgt[512]; extern pmd_t level2_ident_pgt[512]; +extern pte_t level1_fixmap_pgt[512]; extern pgd_t init_level4_pgt[]; #define swapper_pg_dir init_level4_pgt diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index e8a1201c3293..16fb0099b7f2 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1866,12 +1866,11 @@ static void __init check_pt_base(unsigned long *pt_base, unsigned long *pt_end, * * We can construct this by grafting the Xen provided pagetable into * head_64.S's preconstructed pagetables. We copy the Xen L2's into - * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt. This - * means that only the kernel has a physical mapping to start with - - * but that's enough to get __va working. We need to fill in the rest - * of the physical mapping once some sort of allocator has been set - * up. 
- * NOTE: for PVH, the page tables are native. + * level2_ident_pgt, and level2_kernel_pgt. This means that only the + * kernel has a physical mapping to start with - but that's enough to + * get __va working. We need to fill in the rest of the physical + * mapping once some sort of allocator has been set up. NOTE: for + * PVH, the page tables are native. */ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) { @@ -1902,8 +1901,11 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) /* L3_i[0] -> level2_ident_pgt */ convert_pfn_mfn(level3_ident_pgt); /* L3_k[510] -> level2_kernel_pgt - * L3_i[511] -> level2_fixmap_pgt */ + * L3_k[511] -> level2_fixmap_pgt */ convert_pfn_mfn(level3_kernel_pgt); + + /* L3_k[511][506] -> level1_fixmap_pgt */ + convert_pfn_mfn(level2_fixmap_pgt); } /* We get [511][511] and have Xen's version of level2_kernel_pgt */ l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd); @@ -1913,21 +1915,15 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) addr[1] = (unsigned long)l3; addr[2] = (unsigned long)l2; /* Graft it onto L4[272][0]. Note that we creating an aliasing problem: - * Both L4[272][0] and L4[511][511] have entries that point to the same + * Both L4[272][0] and L4[511][510] have entries that point to the same * L2 (PMD) tables. Meaning that if you modify it in __va space * it will be also modified in the __ka space! 
(But if you just * modify the PMD table to point to other PTE's or none, then you * are OK - which is what cleanup_highmap does) */ copy_page(level2_ident_pgt, l2); - /* Graft it onto L4[511][511] */ + /* Graft it onto L4[511][510] */ copy_page(level2_kernel_pgt, l2); - /* Get [511][510] and graft that in level2_fixmap_pgt */ - l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd); - l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud); - copy_page(level2_fixmap_pgt, l2); - /* Note that we don't do anything with level1_fixmap_pgt which - * we don't need. */ if (!xen_feature(XENFEAT_auto_translated_physmap)) { /* Make pagetable pieces RO */ set_page_prot(init_level4_pgt, PAGE_KERNEL_RO); @@ -1937,6 +1933,7 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO); set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO); + set_page_prot(level1_fixmap_pgt, PAGE_KERNEL_RO); /* Pin down new L4 */ pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,