alistair23-linux/arch/i386/mm/pageattr.c
Jeremy Fitzhardinge 5311ab62cd [PATCH] i386: PARAVIRT: Allow paravirt backend to choose kernel PMD sharing
Normally when running in PAE mode, the 4th PMD maps the kernel address space,
which can be shared among all processes (since they all need the same kernel
mappings).

Xen, however, does not allow guests to have the kernel pmd shared between page
tables, so parameterize pgtable.c to allow both modes of operation.

There are several side-effects of this.  One is that vmalloc will update the
kernel address space mappings, and those updates need to be propagated into
all processes if the kernel mappings are not intrinsically shared.  In the
non-PAE case, this is done by maintaining a pgd_list of all processes; this
list is used when all process pagetables must be updated.  pgd_list is
threaded via otherwise unused entries in the page structure for the pgd, which
means that the pgd must be page-sized for this to work.

Normally the PAE pgd is only 4x64 byte entries large, but Xen requires the PAE
pgd to page aligned anyway, so this patch forces the pgd to be page
aligned+sized when the kernel pmd is unshared, to accomodate both these
requirements.

Also, since there may be several distinct kernel pmds (if the user/kernel
split is below 3G), there's no point in allocating them from a slab cache;
they're just allocated with get_free_page and initialized appropriately.  (Of
course the could be cached if there is just a single kernel pmd - which is the
default with a 3G user/kernel split - but it doesn't seem worthwhile to add
yet another case into this code).

[ Many thanks to wli for review comments. ]

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Zachary Amsden <zach@vmware.com>
Cc: Christoph Lameter <clameter@sgi.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2007-05-02 19:27:13 +02:00

259 lines
6.3 KiB
C

/*
* Copyright 2002 Andi Kleen, SuSE Labs.
* Thanks to Ben LaHaise for precious feedback.
*/
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/highmem.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <asm/uaccess.h>
#include <asm/processor.h>
#include <asm/tlbflush.h>
#include <asm/pgalloc.h>
#include <asm/sections.h>
static DEFINE_SPINLOCK(cpa_lock);
static struct list_head df_list = LIST_HEAD_INIT(df_list);
pte_t *lookup_address(unsigned long address)
{
pgd_t *pgd = pgd_offset_k(address);
pud_t *pud;
pmd_t *pmd;
if (pgd_none(*pgd))
return NULL;
pud = pud_offset(pgd, address);
if (pud_none(*pud))
return NULL;
pmd = pmd_offset(pud, address);
if (pmd_none(*pmd))
return NULL;
if (pmd_large(*pmd))
return (pte_t *)pmd;
return pte_offset_kernel(pmd, address);
}
static struct page *split_large_page(unsigned long address, pgprot_t prot,
pgprot_t ref_prot)
{
int i;
unsigned long addr;
struct page *base;
pte_t *pbase;
spin_unlock_irq(&cpa_lock);
base = alloc_pages(GFP_KERNEL, 0);
spin_lock_irq(&cpa_lock);
if (!base)
return NULL;
/*
* page_private is used to track the number of entries in
* the page table page that have non standard attributes.
*/
SetPagePrivate(base);
page_private(base) = 0;
address = __pa(address);
addr = address & LARGE_PAGE_MASK;
pbase = (pte_t *)page_address(base);
paravirt_alloc_pt(page_to_pfn(base));
for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
set_pte(&pbase[i], pfn_pte(addr >> PAGE_SHIFT,
addr == address ? prot : ref_prot));
}
return base;
}
static void flush_kernel_map(void *arg)
{
unsigned long adr = (unsigned long)arg;
if (adr && cpu_has_clflush) {
int i;
for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size)
asm volatile("clflush (%0)" :: "r" (adr + i));
} else if (boot_cpu_data.x86_model >= 4)
wbinvd();
/* Flush all to work around Errata in early athlons regarding
* large page flushing.
*/
__flush_tlb_all();
}
static void set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
{
struct page *page;
unsigned long flags;
set_pte_atomic(kpte, pte); /* change init_mm */
if (SHARED_KERNEL_PMD)
return;
spin_lock_irqsave(&pgd_lock, flags);
for (page = pgd_list; page; page = (struct page *)page->index) {
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
pgd = (pgd_t *)page_address(page) + pgd_index(address);
pud = pud_offset(pgd, address);
pmd = pmd_offset(pud, address);
set_pte_atomic((pte_t *)pmd, pte);
}
spin_unlock_irqrestore(&pgd_lock, flags);
}
/*
* No more special protections in this 2/4MB area - revert to a
* large page again.
*/
static inline void revert_page(struct page *kpte_page, unsigned long address)
{
pgprot_t ref_prot;
pte_t *linear;
ref_prot =
((address & LARGE_PAGE_MASK) < (unsigned long)&_etext)
? PAGE_KERNEL_LARGE_EXEC : PAGE_KERNEL_LARGE;
linear = (pte_t *)
pmd_offset(pud_offset(pgd_offset_k(address), address), address);
set_pmd_pte(linear, address,
pfn_pte((__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT,
ref_prot));
}
static int
__change_page_attr(struct page *page, pgprot_t prot)
{
pte_t *kpte;
unsigned long address;
struct page *kpte_page;
BUG_ON(PageHighMem(page));
address = (unsigned long)page_address(page);
kpte = lookup_address(address);
if (!kpte)
return -EINVAL;
kpte_page = virt_to_page(kpte);
if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL)) {
if (!pte_huge(*kpte)) {
set_pte_atomic(kpte, mk_pte(page, prot));
} else {
pgprot_t ref_prot;
struct page *split;
ref_prot =
((address & LARGE_PAGE_MASK) < (unsigned long)&_etext)
? PAGE_KERNEL_EXEC : PAGE_KERNEL;
split = split_large_page(address, prot, ref_prot);
if (!split)
return -ENOMEM;
set_pmd_pte(kpte,address,mk_pte(split, ref_prot));
kpte_page = split;
}
page_private(kpte_page)++;
} else if (!pte_huge(*kpte)) {
set_pte_atomic(kpte, mk_pte(page, PAGE_KERNEL));
BUG_ON(page_private(kpte_page) == 0);
page_private(kpte_page)--;
} else
BUG();
/*
* If the pte was reserved, it means it was created at boot
* time (not via split_large_page) and in turn we must not
* replace it with a largepage.
*/
if (!PageReserved(kpte_page)) {
if (cpu_has_pse && (page_private(kpte_page) == 0)) {
ClearPagePrivate(kpte_page);
paravirt_release_pt(page_to_pfn(kpte_page));
list_add(&kpte_page->lru, &df_list);
revert_page(kpte_page, address);
}
}
return 0;
}
static inline void flush_map(void *adr)
{
on_each_cpu(flush_kernel_map, adr, 1, 1);
}
/*
* Change the page attributes of an page in the linear mapping.
*
* This should be used when a page is mapped with a different caching policy
* than write-back somewhere - some CPUs do not like it when mappings with
* different caching policies exist. This changes the page attributes of the
* in kernel linear mapping too.
*
* The caller needs to ensure that there are no conflicting mappings elsewhere.
* This function only deals with the kernel linear map.
*
* Caller must call global_flush_tlb() after this.
*/
int change_page_attr(struct page *page, int numpages, pgprot_t prot)
{
int err = 0;
int i;
unsigned long flags;
spin_lock_irqsave(&cpa_lock, flags);
for (i = 0; i < numpages; i++, page++) {
err = __change_page_attr(page, prot);
if (err)
break;
}
spin_unlock_irqrestore(&cpa_lock, flags);
return err;
}
void global_flush_tlb(void)
{
struct list_head l;
struct page *pg, *next;
BUG_ON(irqs_disabled());
spin_lock_irq(&cpa_lock);
list_replace_init(&df_list, &l);
spin_unlock_irq(&cpa_lock);
if (!cpu_has_clflush)
flush_map(NULL);
list_for_each_entry_safe(pg, next, &l, lru) {
if (cpu_has_clflush)
flush_map(page_address(pg));
__free_page(pg);
}
}
#ifdef CONFIG_DEBUG_PAGEALLOC
void kernel_map_pages(struct page *page, int numpages, int enable)
{
if (PageHighMem(page))
return;
if (!enable)
debug_check_no_locks_freed(page_address(page),
numpages * PAGE_SIZE);
/* the return value is ignored - the calls cannot fail,
* large pages are disabled at boot time.
*/
change_page_attr(page, numpages, enable ? PAGE_KERNEL : __pgprot(0));
/* we should perform an IPI and flush all tlbs,
* but that can deadlock->flush only current cpu.
*/
__flush_tlb_all();
}
#endif
EXPORT_SYMBOL(change_page_attr);
EXPORT_SYMBOL(global_flush_tlb);