From 655d406a7c80bffc03263d071b6ba1e0fcf548f9 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Fri, 6 Feb 2009 18:46:48 -0800 Subject: [PATCH 01/41] xen: add /proc/xen/xsd_{kva,port} to xenfs These are used by the userspace xenstore daemon, which runs in dom0. Xenstored is what's behind the xenfs "xenbus" filesystem. [ Impact: provide mapping and port to usermode for xenstore ] Signed-off-by: Ian Campbell Signed-off-by: Jeremy Fitzhardinge --- drivers/xen/xenfs/Makefile | 3 +- drivers/xen/xenfs/super.c | 54 +++++++++++++++++++++++++++- drivers/xen/xenfs/xenfs.h | 2 ++ drivers/xen/xenfs/xenstored.c | 68 +++++++++++++++++++++++++++++++++++ 4 files changed, 125 insertions(+), 2 deletions(-) create mode 100644 drivers/xen/xenfs/xenstored.c diff --git a/drivers/xen/xenfs/Makefile b/drivers/xen/xenfs/Makefile index 25275c3bbdff..5d45ff13cc01 100644 --- a/drivers/xen/xenfs/Makefile +++ b/drivers/xen/xenfs/Makefile @@ -1,3 +1,4 @@ obj-$(CONFIG_XENFS) += xenfs.o -xenfs-objs = super.o xenbus.o \ No newline at end of file +xenfs-y = super.o xenbus.o +xenfs-$(CONFIG_XEN_DOM0) += xenstored.o diff --git a/drivers/xen/xenfs/super.c b/drivers/xen/xenfs/super.c index 78bfab0700ba..3cf7707217f2 100644 --- a/drivers/xen/xenfs/super.c +++ b/drivers/xen/xenfs/super.c @@ -22,6 +22,46 @@ MODULE_DESCRIPTION("Xen filesystem"); MODULE_LICENSE("GPL"); +static struct inode *xenfs_make_inode(struct super_block *sb, int mode) +{ + struct inode *ret = new_inode(sb); + + if (ret) { + ret->i_mode = mode; + ret->i_uid = ret->i_gid = 0; + ret->i_blocks = 0; + ret->i_atime = ret->i_mtime = ret->i_ctime = CURRENT_TIME; + } + return ret; +} + +static struct dentry *xenfs_create_file(struct super_block *sb, + struct dentry *parent, + const char *name, + const struct file_operations *fops, + void *data, + int mode) +{ + struct dentry *dentry; + struct inode *inode; + + dentry = d_alloc_name(parent, name); + if (!dentry) + return NULL; + + inode = xenfs_make_inode(sb, S_IFREG | mode); + if (!inode) { + dput(dentry); + return NULL; + } + + inode->i_fop = fops; + inode->i_private = data; + + d_add(dentry, inode); + return dentry; +} + static ssize_t capabilities_read(struct file *file, char __user *buf, size_t size, loff_t *off) { @@ -45,8 +85,20 @@ static int xenfs_fill_super(struct super_block *sb, void *data, int silent) { "capabilities", &capabilities_file_ops, S_IRUGO }, {""}, }; + int rc; - return simple_fill_super(sb, XENFS_SUPER_MAGIC, xenfs_files); + rc = simple_fill_super(sb, XENFS_SUPER_MAGIC, xenfs_files); + if (rc < 0) + return rc; + + if (xen_initial_domain()) { + xenfs_create_file(sb, sb->s_root, "xsd_kva", + &xsd_kva_file_ops, NULL, S_IRUSR|S_IWUSR); + xenfs_create_file(sb, sb->s_root, "xsd_port", + &xsd_port_file_ops, NULL, S_IRUSR|S_IWUSR); + } + + return rc; } static int xenfs_get_sb(struct file_system_type *fs_type, diff --git a/drivers/xen/xenfs/xenfs.h b/drivers/xen/xenfs/xenfs.h index 51f08b2d0bf1..5056306e7aa8 100644 --- a/drivers/xen/xenfs/xenfs.h +++ b/drivers/xen/xenfs/xenfs.h @@ -2,5 +2,7 @@ #define _XENFS_XENBUS_H extern const struct file_operations xenbus_file_ops; +extern const struct file_operations xsd_kva_file_ops; +extern const struct file_operations xsd_port_file_ops; #endif /* _XENFS_XENBUS_H */ diff --git a/drivers/xen/xenfs/xenstored.c b/drivers/xen/xenfs/xenstored.c new file mode 100644 index 000000000000..fef20dbc6a5c --- /dev/null +++ b/drivers/xen/xenfs/xenstored.c @@ -0,0 +1,68 @@ +#include +#include +#include +#include + +#include + +#include "xenfs.h" +#include 
"../xenbus/xenbus_comms.h" + +static ssize_t xsd_read(struct file *file, char __user *buf, + size_t size, loff_t *off) +{ + const char *str = (const char *)file->private_data; + return simple_read_from_buffer(buf, size, off, str, strlen(str)); +} + +static int xsd_release(struct inode *inode, struct file *file) +{ + kfree(file->private_data); + return 0; +} + +static int xsd_kva_open(struct inode *inode, struct file *file) +{ + file->private_data = (void *)kasprintf(GFP_KERNEL, "0x%p", + xen_store_interface); + if (!file->private_data) + return -ENOMEM; + return 0; +} + +static int xsd_kva_mmap(struct file *file, struct vm_area_struct *vma) +{ + size_t size = vma->vm_end - vma->vm_start; + + if ((size > PAGE_SIZE) || (vma->vm_pgoff != 0)) + return -EINVAL; + + if (remap_pfn_range(vma, vma->vm_start, + virt_to_pfn(xen_store_interface), + size, vma->vm_page_prot)) + return -EAGAIN; + + return 0; +} + +const struct file_operations xsd_kva_file_ops = { + .open = xsd_kva_open, + .mmap = xsd_kva_mmap, + .read = xsd_read, + .release = xsd_release, +}; + +static int xsd_port_open(struct inode *inode, struct file *file) +{ + file->private_data = (void *)kasprintf(GFP_KERNEL, "%d", + xen_store_evtchn); + if (!file->private_data) + return -ENOMEM; + return 0; +} + +const struct file_operations xsd_port_file_ops = { + .open = xsd_port_open, + .read = xsd_read, + .release = xsd_release, +}; From eba3ff8b99863bcc9e66b8d528e4750229e29693 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 9 Feb 2009 12:05:49 -0800 Subject: [PATCH 02/41] xen: add xen_set_domain_pte() Add xen_set_domain_pte() to allow setting a pte mapping a page from another domain. The common case is to map from DOMID_IO, the pseudo domain which owns all IO pages, but will also be used in the privcmd interface to map other domain pages. 
[ Impact: new Xen-internal API for cross-domain mappings ] Signed-off-by: Jeremy Fitzhardinge --- arch/x86/include/asm/xen/page.h | 1 + arch/x86/xen/mmu.c | 10 ++++++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index bf5f7d32bd08..5e0eb8758919 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -159,6 +159,7 @@ static inline pte_t __pte_ma(pteval_t x) #define pgd_val_ma(x) ((x).pgd) +void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid); xmaddr_t arbitrary_virt_to_machine(void *address); unsigned long arbitrary_virt_to_mfn(void *vaddr); diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 42086ac406af..1ceb0f2fa0af 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -395,7 +395,7 @@ static bool xen_iomap_pte(pte_t pte) return pte_flags(pte) & _PAGE_IOMAP; } -static void xen_set_iomap_pte(pte_t *ptep, pte_t pteval) +void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid) { struct multicall_space mcs; struct mmu_update *u; @@ -407,10 +407,16 @@ static void xen_set_iomap_pte(pte_t *ptep, pte_t pteval) u->ptr = arbitrary_virt_to_machine(ptep).maddr; u->val = pte_val_ma(pteval); - MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_IO); + MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, domid); xen_mc_issue(PARAVIRT_LAZY_MMU); } +EXPORT_SYMBOL_GPL(xen_set_domain_pte); + +static void xen_set_iomap_pte(pte_t *ptep, pte_t pteval) +{ + xen_set_domain_pte(ptep, pteval, DOMID_IO); +} static void xen_extend_mmu_update(const struct mmu_update *update) { From 1246ae0bb992f106a245eea2b8dd901ced868e7a Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 9 Feb 2009 12:05:49 -0800 Subject: [PATCH 03/41] xen: add variable hypercall caller Allow non-constant hypercall to be called, for privcmd. [ Impact: make arbitrary hypercalls; needed for privcmd ] Signed-off-by: Jeremy Fitzhardinge --- arch/x86/include/asm/xen/hypercall.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index 7fda040a76cd..a3c28ae4025b 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -200,6 +200,23 @@ extern struct { char _entry[32]; } hypercall_page[]; (type)__res; \ }) +static inline long +privcmd_call(unsigned call, + unsigned long a1, unsigned long a2, + unsigned long a3, unsigned long a4, + unsigned long a5) +{ + __HYPERCALL_DECLS; + __HYPERCALL_5ARG(a1, a2, a3, a4, a5); + + asm volatile("call *%[call]" + : __HYPERCALL_5PARAM + : [call] "a" (&hypercall_page[call]) + : __HYPERCALL_CLOBBER5); + + return (long)__res; +} + static inline int HYPERVISOR_set_trap_table(struct trap_info *table) { From 1c5de1939c204bde9cce87f4eb3d26e9f9eb732b Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 9 Feb 2009 12:05:49 -0800 Subject: [PATCH 04/41] xen: add privcmd driver The privcmd interface in xenfs allows the tool stack in the privileged domain to get fairly direct access to the hypervisor in order to do various management things such as domain construction. 
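As a sketch of the intended userspace usage (not part of this patch; the
hypercall chosen and the lack of error handling are illustrative), the
toolstack opens the privcmd node and passes a struct privcmd_hypercall
through IOCTL_PRIVCMD_HYPERCALL:

        #include <fcntl.h>
        #include <sys/ioctl.h>
        #include <xen/privcmd.h>

        int main(void)
        {
                struct privcmd_hypercall call = {
                        .op  = 17,      /* e.g. __HYPERVISOR_xen_version */
                        .arg = { 0 },   /* XENVER_version */
                };
                int fd = open("/proc/xen/privcmd", O_RDWR);

                /* the hypercall's return value comes back from the ioctl */
                return ioctl(fd, IOCTL_PRIVCMD_HYPERCALL, &call) < 0;
        }
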
[ Impact: new xenfs interface for privileged operations ] Signed-off-by: Jeremy Fitzhardinge --- drivers/xen/xenfs/Makefile | 2 +- drivers/xen/xenfs/privcmd.c | 436 ++++++++++++++++++++++++++++++++++++ drivers/xen/xenfs/super.c | 2 + drivers/xen/xenfs/xenfs.h | 1 + include/xen/Kbuild | 1 + include/xen/privcmd.h | 80 +++++++ 6 files changed, 521 insertions(+), 1 deletion(-) create mode 100644 drivers/xen/xenfs/privcmd.c create mode 100644 include/xen/privcmd.h diff --git a/drivers/xen/xenfs/Makefile b/drivers/xen/xenfs/Makefile index 5d45ff13cc01..4a0be9a82af3 100644 --- a/drivers/xen/xenfs/Makefile +++ b/drivers/xen/xenfs/Makefile @@ -1,4 +1,4 @@ obj-$(CONFIG_XENFS) += xenfs.o xenfs-y = super.o xenbus.o -xenfs-$(CONFIG_XEN_DOM0) += xenstored.o +xenfs-$(CONFIG_XEN_DOM0) += xenstored.o privcmd.o diff --git a/drivers/xen/xenfs/privcmd.c b/drivers/xen/xenfs/privcmd.c new file mode 100644 index 000000000000..c7192f314f86 --- /dev/null +++ b/drivers/xen/xenfs/privcmd.c @@ -0,0 +1,436 @@ +/****************************************************************************** + * privcmd.c + * + * Interface to privileged domain-0 commands. + * + * Copyright (c) 2002-2004, K A Fraser, B Dragovic + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#ifndef HAVE_ARCH_PRIVCMD_MMAP +static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma); +#endif + +struct remap_data { + unsigned long mfn; + unsigned domid; + pgprot_t prot; +}; + +static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token, + unsigned long addr, void *data) +{ + struct remap_data *rmd = data; + pte_t pte = pte_mkspecial(pfn_pte(rmd->mfn++, rmd->prot)); + + xen_set_domain_pte(ptep, pte, rmd->domid); + + return 0; +} + +int remap_domain_mfn_range(struct vm_area_struct *vma, unsigned long addr, + unsigned long mfn, unsigned long size, + pgprot_t prot, unsigned domid) +{ + struct remap_data rmd; + int err; + + prot = __pgprot(pgprot_val(prot) | _PAGE_IOMAP); + + vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; + + rmd.mfn = mfn; + rmd.prot = prot; + rmd.domid = domid; + + err = apply_to_page_range(vma->vm_mm, addr, size, + remap_area_mfn_pte_fn, &rmd); + + return err; +} + +static long privcmd_ioctl_hypercall(void __user *udata) +{ + struct privcmd_hypercall hypercall; + long ret; + + if (copy_from_user(&hypercall, udata, sizeof(hypercall))) + return -EFAULT; + + ret = privcmd_call(hypercall.op, + hypercall.arg[0], hypercall.arg[1], + hypercall.arg[2], hypercall.arg[3], + hypercall.arg[4]); + + return ret; +} + +static void free_page_list(struct list_head *pages) +{ + struct page *p, *n; + + list_for_each_entry_safe(p, n, pages, lru) + __free_page(p); + + INIT_LIST_HEAD(pages); +} + +/* + * Given an array of items in userspace, return a list of pages + * containing the data. If copying fails, either because of memory + * allocation failure or a problem reading user memory, return an + * error code; its up to the caller to dispose of any partial list. 
+ */ +static int gather_array(struct list_head *pagelist, + unsigned nelem, size_t size, + void __user *data) +{ + unsigned pageidx; + void *pagedata; + int ret; + + if (size > PAGE_SIZE) + return 0; + + pageidx = PAGE_SIZE; + pagedata = NULL; /* quiet, gcc */ + while (nelem--) { + if (pageidx > PAGE_SIZE-size) { + struct page *page = alloc_page(GFP_KERNEL); + + ret = -ENOMEM; + if (page == NULL) + goto fail; + + pagedata = page_address(page); + + list_add_tail(&page->lru, pagelist); + pageidx = 0; + } + + ret = -EFAULT; + if (copy_from_user(pagedata + pageidx, data, size)) + goto fail; + + data += size; + pageidx += size; + } + + ret = 0; + +fail: + return ret; +} + +/* + * Call function "fn" on each element of the array fragmented + * over a list of pages. + */ +static int traverse_pages(unsigned nelem, size_t size, + struct list_head *pos, + int (*fn)(void *data, void *state), + void *state) +{ + void *pagedata; + unsigned pageidx; + int ret; + + BUG_ON(size > PAGE_SIZE); + + pageidx = PAGE_SIZE; + pagedata = NULL; /* hush, gcc */ + + while (nelem--) { + if (pageidx > PAGE_SIZE-size) { + struct page *page; + pos = pos->next; + page = list_entry(pos, struct page, lru); + pagedata = page_address(page); + pageidx = 0; + } + + ret = (*fn)(pagedata + pageidx, state); + if (ret) + break; + pageidx += size; + } + + return ret; +} + +struct mmap_mfn_state { + unsigned long va; + struct vm_area_struct *vma; + domid_t domain; +}; + +static int mmap_mfn_range(void *data, void *state) +{ + struct privcmd_mmap_entry *msg = data; + struct mmap_mfn_state *st = state; + struct vm_area_struct *vma = st->vma; + int rc; + + /* Do not allow range to wrap the address space. */ + if ((msg->npages > (LONG_MAX >> PAGE_SHIFT)) || + ((unsigned long)(msg->npages << PAGE_SHIFT) >= -st->va)) + return -EINVAL; + + /* Range chunks must be contiguous in va space. 
*/ + if ((msg->va != st->va) || + ((msg->va+(msg->npages< vma->vm_end)) + return -EINVAL; + + rc = remap_domain_mfn_range(vma, + msg->va & PAGE_MASK, + msg->mfn, + msg->npages << PAGE_SHIFT, + vma->vm_page_prot, + st->domain); + if (rc < 0) + return rc; + + st->va += msg->npages << PAGE_SHIFT; + + return 0; +} + +static long privcmd_ioctl_mmap(void __user *udata) +{ + struct privcmd_mmap mmapcmd; + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + int rc; + LIST_HEAD(pagelist); + struct mmap_mfn_state state; + + if (!xen_initial_domain()) + return -EPERM; + + if (copy_from_user(&mmapcmd, udata, sizeof(mmapcmd))) + return -EFAULT; + + rc = gather_array(&pagelist, + mmapcmd.num, sizeof(struct privcmd_mmap_entry), + mmapcmd.entry); + + if (rc || list_empty(&pagelist)) + goto out; + + down_write(&mm->mmap_sem); + + { + struct page *page = list_first_entry(&pagelist, + struct page, lru); + struct privcmd_mmap_entry *msg = page_address(page); + + vma = find_vma(mm, msg->va); + rc = -EINVAL; + + if (!vma || (msg->va != vma->vm_start) || + !privcmd_enforce_singleshot_mapping(vma)) + goto out_up; + } + + state.va = vma->vm_start; + state.vma = vma; + state.domain = mmapcmd.dom; + + rc = traverse_pages(mmapcmd.num, sizeof(struct privcmd_mmap_entry), + &pagelist, + mmap_mfn_range, &state); + + +out_up: + up_write(&mm->mmap_sem); + +out: + free_page_list(&pagelist); + + return rc; +} + +struct mmap_batch_state { + domid_t domain; + unsigned long va; + struct vm_area_struct *vma; + int err; + + xen_pfn_t __user *user; +}; + +static int mmap_batch_fn(void *data, void *state) +{ + xen_pfn_t *mfnp = data; + struct mmap_batch_state *st = state; + + if (remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, + *mfnp, PAGE_SIZE, + st->vma->vm_page_prot, st->domain) < 0) { + *mfnp |= 0xf0000000U; + st->err++; + } + st->va += PAGE_SIZE; + + return 0; +} + +static int mmap_return_errors(void *data, void *state) +{ + xen_pfn_t *mfnp = data; + struct mmap_batch_state *st = state; + + put_user(*mfnp, st->user++); + + return 0; +} + +static long privcmd_ioctl_mmap_batch(void __user *udata) +{ + int ret; + struct privcmd_mmapbatch m; + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + unsigned long nr_pages; + LIST_HEAD(pagelist); + struct mmap_batch_state state; + + if (!xen_initial_domain()) + return -EPERM; + + if (copy_from_user(&m, udata, sizeof(m))) + return -EFAULT; + + nr_pages = m.num; + if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT))) + return -EINVAL; + + ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t), + m.arr); + + if (ret || list_empty(&pagelist)) + goto out; + + down_write(&mm->mmap_sem); + + vma = find_vma(mm, m.addr); + ret = -EINVAL; + if (!vma || + (m.addr != vma->vm_start) || + ((m.addr + (nr_pages << PAGE_SHIFT)) != vma->vm_end) || + !privcmd_enforce_singleshot_mapping(vma)) { + up_write(&mm->mmap_sem); + goto out; + } + + state.domain = m.dom; + state.vma = vma; + state.va = m.addr; + state.err = 0; + + ret = traverse_pages(m.num, sizeof(xen_pfn_t), + &pagelist, mmap_batch_fn, &state); + + up_write(&mm->mmap_sem); + + if (state.err > 0) { + ret = state.err; + + state.user = udata; + traverse_pages(m.num, sizeof(xen_pfn_t), + &pagelist, + mmap_return_errors, &state); + } + +out: + free_page_list(&pagelist); + + return ret; +} + +static long privcmd_ioctl(struct file *file, + unsigned int cmd, unsigned long data) +{ + int ret = -ENOSYS; + void __user *udata = (void __user *) data; + + switch (cmd) { + case IOCTL_PRIVCMD_HYPERCALL: + ret = 
privcmd_ioctl_hypercall(udata); + break; + + case IOCTL_PRIVCMD_MMAP: + ret = privcmd_ioctl_mmap(udata); + break; + + case IOCTL_PRIVCMD_MMAPBATCH: + ret = privcmd_ioctl_mmap_batch(udata); + break; + + default: + ret = -EINVAL; + break; + } + + return ret; +} + +#ifndef HAVE_ARCH_PRIVCMD_MMAP +static int privcmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + return VM_FAULT_SIGBUS; +} + +static struct vm_operations_struct privcmd_vm_ops = { + .fault = privcmd_fault +}; + +static int privcmd_mmap(struct file *file, struct vm_area_struct *vma) +{ + /* Unsupported for auto-translate guests. */ + if (xen_feature(XENFEAT_auto_translated_physmap)) + return -ENOSYS; + + /* DONTCOPY is essential for Xen as copy_page_range is broken. */ + vma->vm_flags |= VM_RESERVED | VM_IO | VM_DONTCOPY; + vma->vm_ops = &privcmd_vm_ops; + vma->vm_private_data = NULL; + + return 0; +} + +static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma) +{ + return (xchg(&vma->vm_private_data, (void *)1) == NULL); +} +#endif + +const struct file_operations privcmd_file_ops = { + .unlocked_ioctl = privcmd_ioctl, + .mmap = privcmd_mmap, +}; diff --git a/drivers/xen/xenfs/super.c b/drivers/xen/xenfs/super.c index 3cf7707217f2..8c7462866e90 100644 --- a/drivers/xen/xenfs/super.c +++ b/drivers/xen/xenfs/super.c @@ -96,6 +96,8 @@ static int xenfs_fill_super(struct super_block *sb, void *data, int silent) &xsd_kva_file_ops, NULL, S_IRUSR|S_IWUSR); xenfs_create_file(sb, sb->s_root, "xsd_port", &xsd_port_file_ops, NULL, S_IRUSR|S_IWUSR); + xenfs_create_file(sb, sb->s_root, "privcmd", + &privcmd_file_ops, NULL, S_IRUSR|S_IWUSR); } return rc; diff --git a/drivers/xen/xenfs/xenfs.h b/drivers/xen/xenfs/xenfs.h index 5056306e7aa8..b68aa6200003 100644 --- a/drivers/xen/xenfs/xenfs.h +++ b/drivers/xen/xenfs/xenfs.h @@ -2,6 +2,7 @@ #define _XENFS_XENBUS_H extern const struct file_operations xenbus_file_ops; +extern const struct file_operations privcmd_file_ops; extern const struct file_operations xsd_kva_file_ops; extern const struct file_operations xsd_port_file_ops; diff --git a/include/xen/Kbuild b/include/xen/Kbuild index 4e65c16a445b..84ad8f02fee5 100644 --- a/include/xen/Kbuild +++ b/include/xen/Kbuild @@ -1 +1,2 @@ header-y += evtchn.h +header-y += privcmd.h diff --git a/include/xen/privcmd.h b/include/xen/privcmd.h new file mode 100644 index 000000000000..b42cdfd92fee --- /dev/null +++ b/include/xen/privcmd.h @@ -0,0 +1,80 @@ +/****************************************************************************** + * privcmd.h + * + * Interface to /proc/xen/privcmd. + * + * Copyright (c) 2003-2005, K A Fraser + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef __LINUX_PUBLIC_PRIVCMD_H__ +#define __LINUX_PUBLIC_PRIVCMD_H__ + +#include + +typedef unsigned long xen_pfn_t; + +#ifndef __user +#define __user +#endif + +struct privcmd_hypercall { + __u64 op; + __u64 arg[5]; +}; + +struct privcmd_mmap_entry { + __u64 va; + __u64 mfn; + __u64 npages; +}; + +struct privcmd_mmap { + int num; + domid_t dom; /* target domain */ + struct privcmd_mmap_entry __user *entry; +}; + +struct privcmd_mmapbatch { + int num; /* number of pages to populate */ + domid_t dom; /* target domain */ + __u64 addr; /* virtual address */ + xen_pfn_t __user *arr; /* array of mfns - top nibble set on err */ +}; + +/* + * @cmd: IOCTL_PRIVCMD_HYPERCALL + * @arg: &privcmd_hypercall_t + * Return: Value returned from execution of the specified hypercall. + */ +#define IOCTL_PRIVCMD_HYPERCALL \ + _IOC(_IOC_NONE, 'P', 0, sizeof(struct privcmd_hypercall)) +#define IOCTL_PRIVCMD_MMAP \ + _IOC(_IOC_NONE, 'P', 2, sizeof(struct privcmd_mmap)) +#define IOCTL_PRIVCMD_MMAPBATCH \ + _IOC(_IOC_NONE, 'P', 3, sizeof(struct privcmd_mmapbatch)) + +#endif /* __LINUX_PUBLIC_PRIVCMD_H__ */ From 24a89b5be4cf2b7f1b49b56b6cb4a7b71fccf241 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 9 Feb 2009 12:05:49 -0800 Subject: [PATCH 05/41] xen/privcmd: create address space to allow writable mmaps These are necessary to allow writeable mmap of the privcmd node to succeed without being marked read-only for writenotify purposes. 
Which in turn is necessary to allow mappings of foreign guest pages [ Impact: bugfix: allow writable mappings ] Signed-off-by: Ian Campbell Signed-off-by: Jeremy Fitzhardinge --- drivers/xen/xenfs/super.c | 42 +++++++++++++++++++++++++++++++++++---- 1 file changed, 38 insertions(+), 4 deletions(-) diff --git a/drivers/xen/xenfs/super.c b/drivers/xen/xenfs/super.c index 8c7462866e90..23f1cca5a2e9 100644 --- a/drivers/xen/xenfs/super.c +++ b/drivers/xen/xenfs/super.c @@ -12,6 +12,8 @@ #include #include #include +#include +#include #include @@ -22,12 +24,30 @@ MODULE_DESCRIPTION("Xen filesystem"); MODULE_LICENSE("GPL"); +static int xenfs_set_page_dirty(struct page *page) +{ + if (!PageDirty(page)) + SetPageDirty(page); + return 0; +} + +static const struct address_space_operations xenfs_aops = { + .set_page_dirty = xenfs_set_page_dirty, +}; + +static struct backing_dev_info xenfs_backing_dev_info = { + .ra_pages = 0, /* No readahead */ + .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, +}; + static struct inode *xenfs_make_inode(struct super_block *sb, int mode) { struct inode *ret = new_inode(sb); if (ret) { ret->i_mode = mode; + ret->i_mapping->a_ops = &xenfs_aops; + ret->i_mapping->backing_dev_info = &xenfs_backing_dev_info; ret->i_uid = ret->i_gid = 0; ret->i_blocks = 0; ret->i_atime = ret->i_mtime = ret->i_ctime = CURRENT_TIME; @@ -119,11 +139,25 @@ static struct file_system_type xenfs_type = { static int __init xenfs_init(void) { - if (xen_domain()) - return register_filesystem(&xenfs_type); + int err; + if (!xen_domain()) { + printk(KERN_INFO "xenfs: not registering filesystem on non-xen platform\n"); + return 0; + } - printk(KERN_INFO "XENFS: not registering filesystem on non-xen platform\n"); - return 0; + err = register_filesystem(&xenfs_type); + if (err) { + printk(KERN_ERR "xenfs: Unable to register filesystem!\n"); + goto out; + } + + err = bdi_init(&xenfs_backing_dev_info); + if (err) + unregister_filesystem(&xenfs_type); + + out: + + return err; } static void __exit xenfs_exit(void) From 35f8c1c343f2918ea24f05282d14e711887d8278 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 6 Mar 2009 09:56:26 -0800 Subject: [PATCH 06/41] xen/xenfs: set_page_dirty is supposed to return true if it dirties I don't think it matters at all in this case (there's only one caller which checks the return value), but may as well be strictly correct. [ Impact: cleanup ] Signed-off-by: Jeremy Fitzhardinge --- drivers/xen/xenfs/super.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/xen/xenfs/super.c b/drivers/xen/xenfs/super.c index 23f1cca5a2e9..afaa6ede0168 100644 --- a/drivers/xen/xenfs/super.c +++ b/drivers/xen/xenfs/super.c @@ -26,9 +26,7 @@ MODULE_LICENSE("GPL"); static int xenfs_set_page_dirty(struct page *page) { - if (!PageDirty(page)) - SetPageDirty(page); - return 0; + return !TestSetPageDirty(page); } static const struct address_space_operations xenfs_aops = { From 441c7416b55d3d48b4aaafc5bdd804092387d877 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 6 Mar 2009 09:56:59 -0800 Subject: [PATCH 07/41] xen/privcmd: print SIGBUS faults Print more detail about privcmd mapping faults for debugging. 
[ Impact: debug ] Signed-off-by: Jeremy Fitzhardinge --- drivers/xen/xenfs/privcmd.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/xen/xenfs/privcmd.c b/drivers/xen/xenfs/privcmd.c index c7192f314f86..6b602f505363 100644 --- a/drivers/xen/xenfs/privcmd.c +++ b/drivers/xen/xenfs/privcmd.c @@ -403,6 +403,10 @@ static long privcmd_ioctl(struct file *file, #ifndef HAVE_ARCH_PRIVCMD_MMAP static int privcmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { + printk(KERN_DEBUG "privcmd_fault: vma=%p %lx-%lx, pgoff=%lx, uv=%p\n", + vma, vma->vm_start, vma->vm_end, + vmf->pgoff, vmf->virtual_address); + return VM_FAULT_SIGBUS; } From f31fdf510531333dea95f0a92e6eaa1c3a7541e2 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Sun, 8 Mar 2009 04:10:00 -0700 Subject: [PATCH 08/41] xen/privcmd: make sure vma is ours before doing anything to it Test vma->vm_ops is our operations to make sure we created it. We don't want to stomp on other random vmas. [ Impact: bugfix; prevent ioctl from affecting other mappings ] Signed-off-by: Jeremy Fitzhardinge --- drivers/xen/xenfs/privcmd.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/xen/xenfs/privcmd.c b/drivers/xen/xenfs/privcmd.c index 6b602f505363..80526afd3063 100644 --- a/drivers/xen/xenfs/privcmd.c +++ b/drivers/xen/xenfs/privcmd.c @@ -310,6 +310,8 @@ static int mmap_return_errors(void *data, void *state) return 0; } +static struct vm_operations_struct privcmd_vm_ops; + static long privcmd_ioctl_mmap_batch(void __user *udata) { int ret; @@ -341,6 +343,7 @@ static long privcmd_ioctl_mmap_batch(void __user *udata) vma = find_vma(mm, m.addr); ret = -EINVAL; if (!vma || + vma->vm_ops != &privcmd_vm_ops || (m.addr != vma->vm_start) || ((m.addr + (nr_pages << PAGE_SHIFT)) != vma->vm_end) || !privcmd_enforce_singleshot_mapping(vma)) { From 8e3e99918b9ccd6bc2369ddbcd74056f8796e1e0 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Sat, 21 Mar 2009 23:51:26 -0700 Subject: [PATCH 09/41] xenbus: export xen_store_interface for xenfs xen_store_interface is needed by xenfs, and xenfs may be a module. [ Impact: build fix for modular xenfs ] Signed-off-by: Jeremy Fitzhardinge --- drivers/xen/xenbus/xenbus_probe.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c index d409495876f1..132939f36020 100644 --- a/drivers/xen/xenbus/xenbus_probe.c +++ b/drivers/xen/xenbus/xenbus_probe.c @@ -64,9 +64,11 @@ int xen_store_evtchn; -EXPORT_SYMBOL(xen_store_evtchn); +EXPORT_SYMBOL_GPL(xen_store_evtchn); struct xenstore_domain_interface *xen_store_interface; +EXPORT_SYMBOL_GPL(xen_store_interface); + static unsigned long xen_store_mfn; static BLOCKING_NOTIFIER_HEAD(xenstore_chain); From f020e2905166e12f9a8f109fe968cb5a9db887e9 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Wed, 20 May 2009 15:42:14 +0100 Subject: [PATCH 10/41] privcmd: MMAPBATCH: Fix error handling/reporting On error IOCTL_PRIVCMD_MMAPBATCH is expected to set the top nibble of the effected MFN and return 0. Currently it leaves the MFN unmodified and returns the number of failures. Therefore: - reimplement remap_domain_mfn_range() using direct HYPERVISOR_mmu_update() calls and small batches. The xen_set_domain_pte() interface does not report errors and since some failures are expected/normal using the multicall infrastructure is too noisy. - return 0 as expected - writeback the updated MFN list to mmapbatch->arr not over mmapbatch, smashing the caller's stack. 
- remap_domain_mfn_range can be static. With this change I am able to start an HVM domain. Signed-off-by: Ian Campbell Cc: Jeremy Fitzhardinge Signed-off-by: Jeremy Fitzhardinge --- drivers/xen/xenfs/privcmd.c | 56 +++++++++++++++++++++++++++---------- 1 file changed, 41 insertions(+), 15 deletions(-) diff --git a/drivers/xen/xenfs/privcmd.c b/drivers/xen/xenfs/privcmd.c index 80526afd3063..438223ae0fc3 100644 --- a/drivers/xen/xenfs/privcmd.c +++ b/drivers/xen/xenfs/privcmd.c @@ -32,14 +32,16 @@ #include #include +#define REMAP_BATCH_SIZE 16 + #ifndef HAVE_ARCH_PRIVCMD_MMAP static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma); #endif struct remap_data { unsigned long mfn; - unsigned domid; pgprot_t prot; + struct mmu_update *mmu_update; }; static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token, @@ -48,17 +50,23 @@ static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token, struct remap_data *rmd = data; pte_t pte = pte_mkspecial(pfn_pte(rmd->mfn++, rmd->prot)); - xen_set_domain_pte(ptep, pte, rmd->domid); + rmd->mmu_update->ptr = arbitrary_virt_to_machine(ptep).maddr; + rmd->mmu_update->val = pte_val_ma(pte); + rmd->mmu_update++; return 0; } -int remap_domain_mfn_range(struct vm_area_struct *vma, unsigned long addr, - unsigned long mfn, unsigned long size, - pgprot_t prot, unsigned domid) +static int remap_domain_mfn_range(struct vm_area_struct *vma, + unsigned long addr, + unsigned long mfn, int nr, + pgprot_t prot, unsigned domid) { struct remap_data rmd; - int err; + struct mmu_update mmu_update[REMAP_BATCH_SIZE]; + int batch; + unsigned long range; + int err = 0; prot = __pgprot(pgprot_val(prot) | _PAGE_IOMAP); @@ -66,10 +74,29 @@ int remap_domain_mfn_range(struct vm_area_struct *vma, unsigned long addr, rmd.mfn = mfn; rmd.prot = prot; - rmd.domid = domid; - err = apply_to_page_range(vma->vm_mm, addr, size, - remap_area_mfn_pte_fn, &rmd); + while (nr) { + batch = min(REMAP_BATCH_SIZE, nr); + range = (unsigned long)batch << PAGE_SHIFT; + + rmd.mmu_update = mmu_update; + err = apply_to_page_range(vma->vm_mm, addr, range, + remap_area_mfn_pte_fn, &rmd); + if (err) + goto out; + + err = -EFAULT; + if (HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid) < 0) + goto out; + + nr -= batch; + addr += range; + } + + err = 0; +out: + + flush_tlb_all(); return err; } @@ -158,7 +185,7 @@ static int traverse_pages(unsigned nelem, size_t size, { void *pagedata; unsigned pageidx; - int ret; + int ret = 0; BUG_ON(size > PAGE_SIZE); @@ -208,8 +235,7 @@ static int mmap_mfn_range(void *data, void *state) rc = remap_domain_mfn_range(vma, msg->va & PAGE_MASK, - msg->mfn, - msg->npages << PAGE_SHIFT, + msg->mfn, msg->npages, vma->vm_page_prot, st->domain); if (rc < 0) @@ -290,7 +316,7 @@ static int mmap_batch_fn(void *data, void *state) struct mmap_batch_state *st = state; if (remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, - *mfnp, PAGE_SIZE, + *mfnp, 1, st->vma->vm_page_prot, st->domain) < 0) { *mfnp |= 0xf0000000U; st->err++; @@ -362,9 +388,9 @@ static long privcmd_ioctl_mmap_batch(void __user *udata) up_write(&mm->mmap_sem); if (state.err > 0) { - ret = state.err; + ret = 0; - state.user = udata; + state.user = m.arr; traverse_pages(m.num, sizeof(xen_pfn_t), &pagelist, mmap_return_errors, &state); From de1ef2065c4675ab1062ebc8d1cb6c5f42b61d04 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Thu, 21 May 2009 10:09:46 +0100 Subject: [PATCH 11/41] xen/privcmd: move remap_domain_mfn_range() to core xen code and export. 
This allows xenfs to be built as a module, previously it required flush_tlb_all and arbitrary_virt_to_machine to be exported. Signed-off-by: Ian Campbell Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/mmu.c | 66 ++++++++++++++++++++++++++++++ drivers/xen/xenfs/privcmd.c | 81 ++++--------------------------------- include/xen/xen-ops.h | 5 +++ 3 files changed, 79 insertions(+), 73 deletions(-) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 1ceb0f2fa0af..f08ea045620f 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -2265,6 +2265,72 @@ void __init xen_hvm_init_mmu_ops(void) } #endif +#define REMAP_BATCH_SIZE 16 + +struct remap_data { + unsigned long mfn; + pgprot_t prot; + struct mmu_update *mmu_update; +}; + +static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token, + unsigned long addr, void *data) +{ + struct remap_data *rmd = data; + pte_t pte = pte_mkspecial(pfn_pte(rmd->mfn++, rmd->prot)); + + rmd->mmu_update->ptr = arbitrary_virt_to_machine(ptep).maddr; + rmd->mmu_update->val = pte_val_ma(pte); + rmd->mmu_update++; + + return 0; +} + +int xen_remap_domain_mfn_range(struct vm_area_struct *vma, + unsigned long addr, + unsigned long mfn, int nr, + pgprot_t prot, unsigned domid) +{ + struct remap_data rmd; + struct mmu_update mmu_update[REMAP_BATCH_SIZE]; + int batch; + unsigned long range; + int err = 0; + + prot = __pgprot(pgprot_val(prot) | _PAGE_IOMAP); + + vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; + + rmd.mfn = mfn; + rmd.prot = prot; + + while (nr) { + batch = min(REMAP_BATCH_SIZE, nr); + range = (unsigned long)batch << PAGE_SHIFT; + + rmd.mmu_update = mmu_update; + err = apply_to_page_range(vma->vm_mm, addr, range, + remap_area_mfn_pte_fn, &rmd); + if (err) + goto out; + + err = -EFAULT; + if (HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid) < 0) + goto out; + + nr -= batch; + addr += range; + } + + err = 0; +out: + + flush_tlb_all(); + + return err; +} +EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range); + #ifdef CONFIG_XEN_DEBUG_FS static struct dentry *d_mmu_debug; diff --git a/drivers/xen/xenfs/privcmd.c b/drivers/xen/xenfs/privcmd.c index 438223ae0fc3..f80be7f6eb95 100644 --- a/drivers/xen/xenfs/privcmd.c +++ b/drivers/xen/xenfs/privcmd.c @@ -31,76 +31,12 @@ #include #include #include - -#define REMAP_BATCH_SIZE 16 +#include #ifndef HAVE_ARCH_PRIVCMD_MMAP static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma); #endif -struct remap_data { - unsigned long mfn; - pgprot_t prot; - struct mmu_update *mmu_update; -}; - -static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token, - unsigned long addr, void *data) -{ - struct remap_data *rmd = data; - pte_t pte = pte_mkspecial(pfn_pte(rmd->mfn++, rmd->prot)); - - rmd->mmu_update->ptr = arbitrary_virt_to_machine(ptep).maddr; - rmd->mmu_update->val = pte_val_ma(pte); - rmd->mmu_update++; - - return 0; -} - -static int remap_domain_mfn_range(struct vm_area_struct *vma, - unsigned long addr, - unsigned long mfn, int nr, - pgprot_t prot, unsigned domid) -{ - struct remap_data rmd; - struct mmu_update mmu_update[REMAP_BATCH_SIZE]; - int batch; - unsigned long range; - int err = 0; - - prot = __pgprot(pgprot_val(prot) | _PAGE_IOMAP); - - vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; - - rmd.mfn = mfn; - rmd.prot = prot; - - while (nr) { - batch = min(REMAP_BATCH_SIZE, nr); - range = (unsigned long)batch << PAGE_SHIFT; - - rmd.mmu_update = mmu_update; - err = apply_to_page_range(vma->vm_mm, addr, range, - remap_area_mfn_pte_fn, &rmd); - if (err) - goto out; - - err = -EFAULT; - 
if (HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid) < 0) - goto out; - - nr -= batch; - addr += range; - } - - err = 0; -out: - - flush_tlb_all(); - - return err; -} - static long privcmd_ioctl_hypercall(void __user *udata) { struct privcmd_hypercall hypercall; @@ -233,11 +169,11 @@ static int mmap_mfn_range(void *data, void *state) ((msg->va+(msg->npages< vma->vm_end)) return -EINVAL; - rc = remap_domain_mfn_range(vma, - msg->va & PAGE_MASK, - msg->mfn, msg->npages, - vma->vm_page_prot, - st->domain); + rc = xen_remap_domain_mfn_range(vma, + msg->va & PAGE_MASK, + msg->mfn, msg->npages, + vma->vm_page_prot, + st->domain); if (rc < 0) return rc; @@ -315,9 +251,8 @@ static int mmap_batch_fn(void *data, void *state) xen_pfn_t *mfnp = data; struct mmap_batch_state *st = state; - if (remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, - *mfnp, 1, - st->vma->vm_page_prot, st->domain) < 0) { + if (xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1, + st->vma->vm_page_prot, st->domain) < 0) { *mfnp |= 0xf0000000U; st->err++; } diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h index 351f4051f6d8..98b92154a264 100644 --- a/include/xen/xen-ops.h +++ b/include/xen/xen-ops.h @@ -23,4 +23,9 @@ int xen_create_contiguous_region(unsigned long vstart, unsigned int order, void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order); +int xen_remap_domain_mfn_range(struct vm_area_struct *vma, + unsigned long addr, + unsigned long mfn, int nr, + pgprot_t prot, unsigned domid); + #endif /* INCLUDE_XEN_OPS_H */ From 9387377eb79a44f453fd27c3d00a2e5da587e369 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 6 Oct 2010 08:51:32 -0700 Subject: [PATCH 12/41] xen/privcmd: make privcmd visible in domU It has its uses in a domU as well as dom0. Xen will prevent an unprivileged domain from doing anything untoward. Signed-off-by: Jeremy Fitzhardinge --- drivers/xen/xenfs/Makefile | 4 ++-- drivers/xen/xenfs/super.c | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/xen/xenfs/Makefile b/drivers/xen/xenfs/Makefile index 4a0be9a82af3..4fde9440fe1f 100644 --- a/drivers/xen/xenfs/Makefile +++ b/drivers/xen/xenfs/Makefile @@ -1,4 +1,4 @@ obj-$(CONFIG_XENFS) += xenfs.o -xenfs-y = super.o xenbus.o -xenfs-$(CONFIG_XEN_DOM0) += xenstored.o privcmd.o +xenfs-y = super.o xenbus.o privcmd.o +xenfs-$(CONFIG_XEN_DOM0) += xenstored.o diff --git a/drivers/xen/xenfs/super.c b/drivers/xen/xenfs/super.c index afaa6ede0168..984891e9a394 100644 --- a/drivers/xen/xenfs/super.c +++ b/drivers/xen/xenfs/super.c @@ -101,6 +101,7 @@ static int xenfs_fill_super(struct super_block *sb, void *data, int silent) [1] = {}, { "xenbus", &xenbus_file_ops, S_IRUSR|S_IWUSR }, { "capabilities", &capabilities_file_ops, S_IRUGO }, + { "privcmd", &privcmd_file_ops, S_IRUSR|S_IWUSR }, {""}, }; int rc; @@ -114,8 +115,6 @@ static int xenfs_fill_super(struct super_block *sb, void *data, int silent) &xsd_kva_file_ops, NULL, S_IRUSR|S_IWUSR); xenfs_create_file(sb, sb->s_root, "xsd_port", &xsd_port_file_ops, NULL, S_IRUSR|S_IWUSR); - xenfs_create_file(sb, sb->s_root, "privcmd", - &privcmd_file_ops, NULL, S_IRUSR|S_IWUSR); } return rc; From 5e941c093989dfb6b67148d2410d79b1be8debfe Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 26 Aug 2010 15:31:36 -0700 Subject: [PATCH 13/41] x86: add RESERVE_BRK_ARRAY() helper Useful when converting static arrays into boottime brk allocated objects. 
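The conversion pattern used by the following p2m patches looks roughly like
this (the array name and size are placeholders):

        /* before: static storage sized for the worst case */
        static unsigned long foo[MAX_ENTRIES] __page_aligned_bss;

        /* after: reserve brk space at link time ... */
        static RESERVE_BRK_ARRAY(unsigned long, foo, MAX_ENTRIES);

        /* ... and allocate it once during early boot */
        foo = extend_brk(sizeof(*foo) * MAX_ENTRIES, PAGE_SIZE);
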
Signed-off-by: Jeremy Fitzhardinge --- arch/x86/include/asm/setup.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index ef292c792d74..d6763b139a84 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -93,6 +93,11 @@ void *extend_brk(size_t size, size_t align); : : "i" (sz)); \ } +/* Helper for reserving space for arrays of things */ +#define RESERVE_BRK_ARRAY(type, name, entries) \ + type *name; \ + RESERVE_BRK(name, sizeof(type) * entries) + #ifdef __i386__ void __init i386_start_kernel(void); From a171ce6e7b4d967b9f9b8ba7c076a8a6d26e432b Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 26 Aug 2010 15:04:48 -0700 Subject: [PATCH 14/41] xen: dynamically allocate p2m space Use early brk mechanism to allocate p2m tables, to save memory when booting non-Xen. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/mmu.c | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 42086ac406af..ecbdcf0d45d4 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -174,18 +174,16 @@ DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ #define TOP_ENTRIES (MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE) /* Placeholder for holes in the address space */ -static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] __page_aligned_data = - { [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL }; +static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_ENTRIES_PER_PAGE); /* Array of pointers to pages containing p2m entries */ -static unsigned long *p2m_top[TOP_ENTRIES] __page_aligned_data = - { [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] }; +static RESERVE_BRK_ARRAY(unsigned long *, p2m_top, TOP_ENTRIES); /* Arrays of p2m arrays expressed in mfns used for save/restore */ -static unsigned long p2m_top_mfn[TOP_ENTRIES] __page_aligned_bss; +static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, TOP_ENTRIES); -static unsigned long p2m_top_mfn_list[TOP_ENTRIES / P2M_ENTRIES_PER_PAGE] - __page_aligned_bss; +static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn_list, + (TOP_ENTRIES / P2M_ENTRIES_PER_PAGE)); static inline unsigned p2m_top_index(unsigned long pfn) { @@ -209,7 +207,7 @@ void xen_build_mfn_list_list(void) p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]); } - for (idx = 0; idx < ARRAY_SIZE(p2m_top_mfn_list); idx++) { + for (idx = 0; idx < TOP_ENTRIES/P2M_ENTRIES_PER_PAGE; idx++) { unsigned topidx = idx * P2M_ENTRIES_PER_PAGE; p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]); } @@ -230,6 +228,22 @@ void __init xen_build_dynamic_phys_to_machine(void) unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list; unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages); unsigned pfn; + unsigned i; + + p2m_missing = extend_brk(sizeof(*p2m_missing) * P2M_ENTRIES_PER_PAGE, + PAGE_SIZE); + for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++) + p2m_missing[i] = ~0UL; + + p2m_top = extend_brk(sizeof(*p2m_top) * TOP_ENTRIES, + PAGE_SIZE); + for (i = 0; i < TOP_ENTRIES; i++) + p2m_top[i] = p2m_missing; + + p2m_top_mfn = extend_brk(sizeof(*p2m_top_mfn) * TOP_ENTRIES, PAGE_SIZE); + p2m_top_mfn_list = extend_brk(sizeof(*p2m_top_mfn_list) * + (TOP_ENTRIES / P2M_ENTRIES_PER_PAGE), + PAGE_SIZE); for (pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) { unsigned topidx = p2m_top_index(pfn); From a2e875298729540300a9a0324ee66e3b7883a912 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 26 Aug 2010 
16:08:31 -0700 Subject: [PATCH 15/41] xen: allocate p2m size based on actual max size Allocate p2m tables based on the actual runtime maximum pfn rather than the static config-time limit. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/mmu.c | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index ecbdcf0d45d4..151813d97552 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -169,25 +169,27 @@ DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ */ #define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK) +static unsigned long max_p2m_pfn __read_mostly = MAX_DOMAIN_PAGES; -#define P2M_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) -#define TOP_ENTRIES (MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE) +#define P2M_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) +#define TOP_ENTRIES(pages) ((pages) / P2M_ENTRIES_PER_PAGE) +#define MAX_TOP_ENTRIES TOP_ENTRIES(MAX_DOMAIN_PAGES) /* Placeholder for holes in the address space */ static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_ENTRIES_PER_PAGE); /* Array of pointers to pages containing p2m entries */ -static RESERVE_BRK_ARRAY(unsigned long *, p2m_top, TOP_ENTRIES); +static RESERVE_BRK_ARRAY(unsigned long *, p2m_top, MAX_TOP_ENTRIES); /* Arrays of p2m arrays expressed in mfns used for save/restore */ -static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, TOP_ENTRIES); +static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, MAX_TOP_ENTRIES); static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn_list, - (TOP_ENTRIES / P2M_ENTRIES_PER_PAGE)); + (MAX_TOP_ENTRIES / P2M_ENTRIES_PER_PAGE)); static inline unsigned p2m_top_index(unsigned long pfn) { - BUG_ON(pfn >= MAX_DOMAIN_PAGES); + BUG_ON(pfn >= max_p2m_pfn); return pfn / P2M_ENTRIES_PER_PAGE; } @@ -201,13 +203,15 @@ void xen_build_mfn_list_list(void) { unsigned pfn, idx; - for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_ENTRIES_PER_PAGE) { + for (pfn = 0; pfn < max_p2m_pfn; pfn += P2M_ENTRIES_PER_PAGE) { unsigned topidx = p2m_top_index(pfn); p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]); } - for (idx = 0; idx < TOP_ENTRIES/P2M_ENTRIES_PER_PAGE; idx++) { + for (idx = 0; + idx < TOP_ENTRIES(max_p2m_pfn)/P2M_ENTRIES_PER_PAGE; + idx++) { unsigned topidx = idx * P2M_ENTRIES_PER_PAGE; p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]); } @@ -230,19 +234,22 @@ void __init xen_build_dynamic_phys_to_machine(void) unsigned pfn; unsigned i; + max_p2m_pfn = max_pfn; + p2m_missing = extend_brk(sizeof(*p2m_missing) * P2M_ENTRIES_PER_PAGE, PAGE_SIZE); for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++) p2m_missing[i] = ~0UL; - p2m_top = extend_brk(sizeof(*p2m_top) * TOP_ENTRIES, + p2m_top = extend_brk(sizeof(*p2m_top) * TOP_ENTRIES(max_pfn), PAGE_SIZE); - for (i = 0; i < TOP_ENTRIES; i++) + for (i = 0; i < TOP_ENTRIES(max_pfn); i++) p2m_top[i] = p2m_missing; - p2m_top_mfn = extend_brk(sizeof(*p2m_top_mfn) * TOP_ENTRIES, PAGE_SIZE); + p2m_top_mfn = extend_brk(sizeof(*p2m_top_mfn) * TOP_ENTRIES(max_pfn), + PAGE_SIZE); p2m_top_mfn_list = extend_brk(sizeof(*p2m_top_mfn_list) * - (TOP_ENTRIES / P2M_ENTRIES_PER_PAGE), + (TOP_ENTRIES(max_pfn) / P2M_ENTRIES_PER_PAGE), PAGE_SIZE); for (pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) { @@ -258,7 +265,7 @@ unsigned long get_phys_to_machine(unsigned long pfn) { unsigned topidx, idx; - if (unlikely(pfn >= MAX_DOMAIN_PAGES)) + if (unlikely(pfn >= max_p2m_pfn)) return INVALID_P2M_ENTRY; topidx = p2m_top_index(pfn); @@ -304,7 +311,7 @@ 
bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) { unsigned topidx, idx; - if (unlikely(pfn >= MAX_DOMAIN_PAGES)) { + if (unlikely(pfn >= max_p2m_pfn)) { BUG_ON(mfn != INVALID_P2M_ENTRY); return true; } From f0991802bb4368e33848e7f823caa487d23555fb Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 26 Aug 2010 16:16:28 -0700 Subject: [PATCH 16/41] xen: use early_brk for level2_kernel_pgt Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/mmu.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 151813d97552..71c6af6c89a5 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1843,13 +1843,15 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, return pgd; } #else /* !CONFIG_X86_64 */ -static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss; +static RESERVE_BRK_ARRAY(pmd_t, level2_kernel_pgt, PTRS_PER_PMD); __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) { pmd_t *kernel_pmd; + level2_kernel_pgt = extend_brk(sizeof(pmd_t *) * PTRS_PER_PMD, PAGE_SIZE); + max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) + xen_start_info->nr_pt_frames * PAGE_SIZE + 512*1024); From 764f0138b9f54aa96761810055a74fce1e58c300 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 26 Aug 2010 16:23:51 -0700 Subject: [PATCH 17/41] xen: allocate level1_ident_pgt Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/mmu.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 71c6af6c89a5..3de42d1e475b 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -138,7 +138,8 @@ static inline void check_zero(void) * large enough to allocate page table pages to allocate the rest. * Each page can map 2MB. 
*/ -static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss; +#define LEVEL1_IDENT_ENTRIES (PTRS_PER_PTE * 4) +static RESERVE_BRK_ARRAY(pte_t, level1_ident_pgt, LEVEL1_IDENT_ENTRIES); #ifdef CONFIG_X86_64 /* l3 pud for userspace vsyscall mapping */ @@ -1718,6 +1719,9 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) unsigned ident_pte; unsigned long pfn; + level1_ident_pgt = extend_brk(sizeof(pte_t) * LEVEL1_IDENT_ENTRIES, + PAGE_SIZE); + ident_pte = 0; pfn = 0; for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) { @@ -1728,7 +1732,7 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) pte_page = m2v(pmd[pmdidx].pmd); else { /* Check for free pte pages */ - if (ident_pte == ARRAY_SIZE(level1_ident_pgt)) + if (ident_pte == LEVEL1_IDENT_ENTRIES) break; pte_page = &level1_ident_pgt[ident_pte]; From 1e17fc7eff56d23a835d5d33e71d813aa9eb8ecc Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 3 Sep 2010 15:04:08 -0700 Subject: [PATCH 18/41] xen: remove noise about registering vcpu info Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/enlighten.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 7d46c8441418..ee304b52d8b7 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -135,9 +135,6 @@ static void xen_vcpu_setup(int cpu) info.mfn = arbitrary_virt_to_mfn(vcpup); info.offset = offset_in_page(vcpup); - printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %llx, offset %d\n", - cpu, vcpup, info.mfn, info.offset); - /* Check to see if the hypervisor will put the vcpu_info structure where we want it, which allows direct access via a percpu-variable. */ @@ -151,9 +148,6 @@ static void xen_vcpu_setup(int cpu) /* This cpu is using the registered vcpu info, even if later ones fail to. 
*/ per_cpu(xen_vcpu, cpu) = vcpup; - - printk(KERN_DEBUG "cpu %d using vcpu_info at %p\n", - cpu, vcpup); } } @@ -873,8 +867,6 @@ void xen_setup_vcpu_info_placement(void) /* xen_vcpu_setup managed to place the vcpu_info within the percpu area for all cpus, so make use of it */ if (have_vcpu_info_placement) { - printk(KERN_INFO "Xen: using vcpu_info placement\n"); - pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct); pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct); pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct); From 3588fe2e3f36543664beafedd3bb6dc3ffa896c5 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 27 Aug 2010 17:30:24 -0700 Subject: [PATCH 19/41] xen/events: change to using fasteoi Change event delivery to: - mask+clear event in the upcall function - use handle_fasteoi_irq as the handler - unmask in the eoi function (and handle migration) Signed-off-by: Jeremy Fitzhardinge --- drivers/xen/events.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 13365ba35218..8beb2bc60f7d 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -378,7 +378,7 @@ int bind_evtchn_to_irq(unsigned int evtchn) irq = find_unbound_irq(); set_irq_chip_and_handler_name(irq, &xen_dynamic_chip, - handle_edge_irq, "event"); + handle_fasteoi_irq, "event"); evtchn_to_irq[evtchn] = irq; irq_info[irq] = mk_evtchn_info(evtchn); @@ -664,6 +664,9 @@ static void __xen_evtchn_do_upcall(void) int irq = evtchn_to_irq[port]; struct irq_desc *desc; + mask_evtchn(port); + clear_evtchn(port); + if (irq != -1) { desc = irq_to_desc(irq); if (desc) @@ -801,10 +804,10 @@ static void ack_dynirq(unsigned int irq) { int evtchn = evtchn_from_irq(irq); - move_native_irq(irq); + move_masked_irq(irq); if (VALID_EVTCHN(evtchn)) - clear_evtchn(evtchn); + unmask_evtchn(evtchn); } static int retrigger_dynirq(unsigned int irq) @@ -960,7 +963,7 @@ static struct irq_chip xen_dynamic_chip __read_mostly = { .mask = disable_dynirq, .unmask = enable_dynirq, - .ack = ack_dynirq, + .eoi = ack_dynirq, .set_affinity = set_affinity_irq, .retrigger = retrigger_dynirq, }; From b7eb4ad39134ee5b09634a710e50c2990f533231 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 26 Aug 2010 17:06:58 -0700 Subject: [PATCH 20/41] xen: set shared_info->arch.max_pfn to max_p2m_pfn Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 3de42d1e475b..909ad637c38d 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -224,7 +224,7 @@ void xen_setup_mfn_list_list(void) HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = virt_to_mfn(p2m_top_mfn_list); - HYPERVISOR_shared_info->arch.max_pfn = xen_start_info->nr_pages; + HYPERVISOR_shared_info->arch.max_pfn = max_p2m_mfn; } /* Set up p2m_top to point to the domain-builder provided p2m pages */ From 1f2d9dd309feb08fdbc711fa03841650dfff87d8 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 26 Aug 2010 17:11:35 -0700 Subject: [PATCH 21/41] xen: set the actual extent of the mfn_list_list Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 909ad637c38d..fcff8c829799 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -224,7 +224,7 @@ void xen_setup_mfn_list_list(void) 
HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = virt_to_mfn(p2m_top_mfn_list); - HYPERVISOR_shared_info->arch.max_pfn = max_p2m_mfn; + HYPERVISOR_shared_info->arch.max_pfn = max_p2m_pfn; } /* Set up p2m_top to point to the domain-builder provided p2m pages */ From bbbf61eff92c7c236f57ee1953ad84055443717e Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 26 Aug 2010 17:12:17 -0700 Subject: [PATCH 22/41] xen: make install_p2mtop_page() static Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/mmu.c | 4 ++-- arch/x86/xen/mmu.h | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index fcff8c829799..00969099b057 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -275,8 +275,8 @@ unsigned long get_phys_to_machine(unsigned long pfn) } EXPORT_SYMBOL_GPL(get_phys_to_machine); -/* install a new p2m_top page */ -bool install_p2mtop_page(unsigned long pfn, unsigned long *p) +/* install a new p2m_top page */ +static bool install_p2mtop_page(unsigned long pfn, unsigned long *p) { unsigned topidx = p2m_top_index(pfn); unsigned long **pfnp, *mfnp; diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h index fa938c4aa2f7..537bb9aab777 100644 --- a/arch/x86/xen/mmu.h +++ b/arch/x86/xen/mmu.h @@ -12,7 +12,6 @@ enum pt_level { bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn); -bool install_p2mtop_page(unsigned long pfn, unsigned long *p); void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); From 58e05027b530ff081ecea68e38de8d59db8f87e0 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 27 Aug 2010 13:28:48 -0700 Subject: [PATCH 23/41] xen: convert p2m to a 3 level tree Make the p2m structure a 3 level tree which covers the full possible physical space. The p2m structure contains mappings from the domain's pfns to system-wide mfns. The structure has 3 levels and two roots. The first root is for the domain's own use, and is linked with virtual addresses. The second is all mfn references, and is used by Xen on save/restore to allow it to update the p2m mapping for the domain. At boot, the domain builder provides a simple flat p2m array for all the initially present pages. We construct the two levels above that using the early_brk allocator. After early boot time, set_phys_to_machine() will allocate any missing levels using the normal kernel allocator (at GFP_KERNEL, so it must be called in a normal blocking context). Because the early_brk() API requires us to pre-reserve the maximum amount of memory we could allocate, there is still a CONFIG_XEN_MAX_DOMAIN_MEMORY config option, but its only negative side-effect is to increase the kernel's apparent bss size. However, since all unused brk memory is returned to the heap, there's no real downside to making it large. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/Kconfig | 11 +- arch/x86/xen/mmu.c | 324 ++++++++++++++++++++++++++++++++----------- 2 files changed, 249 insertions(+), 86 deletions(-) diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 68128a1b401a..90a7f5ad6916 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -19,15 +19,12 @@ config XEN_PVHVM depends on X86_LOCAL_APIC config XEN_MAX_DOMAIN_MEMORY - int "Maximum allowed size of a domain in gigabytes" - default 8 if X86_32 - default 32 if X86_64 + int + default 128 depends on XEN help - The pseudo-physical to machine address array is sized - according to the maximum possible memory size of a Xen - domain. 
This array uses 1 page per gigabyte, so there's no - need to be too stingy here. + This only affects the sizing of some bss arrays, the unused + portions of which are freed. config XEN_SAVE_RESTORE bool diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 00969099b057..d4c7265cf0a0 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -170,51 +170,162 @@ DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ */ #define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK) -static unsigned long max_p2m_pfn __read_mostly = MAX_DOMAIN_PAGES; +/* + * Xen leaves the responsibility for maintaining p2m mappings to the + * guests themselves, but it must also access and update the p2m array + * during suspend/resume when all the pages are reallocated. + * + * The p2m table is logically a flat array, but we implement it as a + * three-level tree to allow the address space to be sparse. + * + * Xen + * | + * p2m_top p2m_top_mfn + * / \ / \ + * p2m_mid p2m_mid p2m_mid_mfn p2m_mid_mfn + * / \ / \ / / + * p2m p2m p2m p2m p2m p2m p2m ... + * + * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the + * maximum representable pseudo-physical address space is: + * P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages + * + * P2M_PER_PAGE depends on the architecture, as a mfn is always + * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to + * 512 and 1024 entries respectively. + */ -#define P2M_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) -#define TOP_ENTRIES(pages) ((pages) / P2M_ENTRIES_PER_PAGE) -#define MAX_TOP_ENTRIES TOP_ENTRIES(MAX_DOMAIN_PAGES) +static unsigned long max_p2m_pfn __read_mostly; -/* Placeholder for holes in the address space */ -static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_ENTRIES_PER_PAGE); +#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) +#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *)) +#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **)) - /* Array of pointers to pages containing p2m entries */ -static RESERVE_BRK_ARRAY(unsigned long *, p2m_top, MAX_TOP_ENTRIES); +#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE) -/* Arrays of p2m arrays expressed in mfns used for save/restore */ -static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, MAX_TOP_ENTRIES); +/* Placeholders for holes in the address space */ +static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE); +static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE); +static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE); -static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn_list, - (MAX_TOP_ENTRIES / P2M_ENTRIES_PER_PAGE)); +static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE); +static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE); + +RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); +RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); static inline unsigned p2m_top_index(unsigned long pfn) { - BUG_ON(pfn >= max_p2m_pfn); - return pfn / P2M_ENTRIES_PER_PAGE; + BUG_ON(pfn >= MAX_P2M_PFN); + return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE); +} + +static inline unsigned p2m_mid_index(unsigned long pfn) +{ + return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE; } static inline unsigned p2m_index(unsigned long pfn) { - return pfn % P2M_ENTRIES_PER_PAGE; + return pfn % P2M_PER_PAGE; } -/* Build the parallel p2m_top_mfn structures */ +static void 
p2m_top_init(unsigned long ***top) +{ + unsigned i; + + for (i = 0; i < P2M_TOP_PER_PAGE; i++) + top[i] = p2m_mid_missing; +} + +static void p2m_top_mfn_init(unsigned long *top) +{ + unsigned i; + + for (i = 0; i < P2M_TOP_PER_PAGE; i++) + top[i] = virt_to_mfn(p2m_mid_missing_mfn); +} + +static void p2m_mid_init(unsigned long **mid) +{ + unsigned i; + + for (i = 0; i < P2M_MID_PER_PAGE; i++) + mid[i] = p2m_missing; +} + +static void p2m_mid_mfn_init(unsigned long *mid) +{ + unsigned i; + + for (i = 0; i < P2M_MID_PER_PAGE; i++) + mid[i] = virt_to_mfn(p2m_missing); +} + +static void p2m_init(unsigned long *p2m) +{ + unsigned i; + + for (i = 0; i < P2M_MID_PER_PAGE; i++) + p2m[i] = INVALID_P2M_ENTRY; +} + +/* + * Build the parallel p2m_top_mfn and p2m_mid_mfn structures + * + * This is called both at boot time, and after resuming from suspend: + * - At boot time we're called very early, and must use extend_brk() + * to allocate memory. + * + * - After resume we're called from within stop_machine, but the mfn + * tree should alreay be completely allocated. + */ void xen_build_mfn_list_list(void) { - unsigned pfn, idx; + unsigned pfn, i; - for (pfn = 0; pfn < max_p2m_pfn; pfn += P2M_ENTRIES_PER_PAGE) { - unsigned topidx = p2m_top_index(pfn); + /* Pre-initialize p2m_top_mfn to be completely missing */ + if (p2m_top_mfn == NULL) { + p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_mid_mfn_init(p2m_mid_missing_mfn); - p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]); + p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_top_mfn_init(p2m_top_mfn); } - for (idx = 0; - idx < TOP_ENTRIES(max_p2m_pfn)/P2M_ENTRIES_PER_PAGE; - idx++) { - unsigned topidx = idx * P2M_ENTRIES_PER_PAGE; - p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]); + for (pfn = 0; pfn < max_p2m_pfn; pfn += P2M_PER_PAGE) { + unsigned topidx = p2m_top_index(pfn); + unsigned mididx = p2m_mid_index(pfn); + unsigned long **mid; + unsigned long mid_mfn; + unsigned long *mid_mfn_p; + + mid = p2m_top[topidx]; + + /* Don't bother allocating any mfn mid levels if + they're just missing */ + if (mid[mididx] == p2m_missing) + continue; + + mid_mfn = p2m_top_mfn[topidx]; + mid_mfn_p = mfn_to_virt(mid_mfn); + + if (mid_mfn_p == p2m_mid_missing_mfn) { + /* + * XXX boot-time only! We should never find + * missing parts of the mfn tree after + * runtime. extend_brk() will BUG if we call + * it too late. 
+ */ + mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_mid_mfn_init(mid_mfn_p); + + mid_mfn = virt_to_mfn(mid_mfn_p); + + p2m_top_mfn[topidx] = mid_mfn; + } + + mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]); } } @@ -223,7 +334,7 @@ void xen_setup_mfn_list_list(void) BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = - virt_to_mfn(p2m_top_mfn_list); + virt_to_mfn(p2m_top_mfn); HYPERVISOR_shared_info->arch.max_pfn = max_p2m_pfn; } @@ -233,99 +344,154 @@ void __init xen_build_dynamic_phys_to_machine(void) unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list; unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages); unsigned pfn; - unsigned i; max_p2m_pfn = max_pfn; - p2m_missing = extend_brk(sizeof(*p2m_missing) * P2M_ENTRIES_PER_PAGE, - PAGE_SIZE); - for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++) - p2m_missing[i] = ~0UL; + p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_init(p2m_missing); - p2m_top = extend_brk(sizeof(*p2m_top) * TOP_ENTRIES(max_pfn), - PAGE_SIZE); - for (i = 0; i < TOP_ENTRIES(max_pfn); i++) - p2m_top[i] = p2m_missing; + p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_mid_init(p2m_mid_missing); - p2m_top_mfn = extend_brk(sizeof(*p2m_top_mfn) * TOP_ENTRIES(max_pfn), - PAGE_SIZE); - p2m_top_mfn_list = extend_brk(sizeof(*p2m_top_mfn_list) * - (TOP_ENTRIES(max_pfn) / P2M_ENTRIES_PER_PAGE), - PAGE_SIZE); + p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_top_init(p2m_top); - for (pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) { + /* + * The domain builder gives us a pre-constructed p2m array in + * mfn_list for all the pages initially given to us, so we just + * need to graft that into our tree structure. + */ + for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) { unsigned topidx = p2m_top_index(pfn); + unsigned mididx = p2m_mid_index(pfn); - p2m_top[topidx] = &mfn_list[pfn]; + if (p2m_top[topidx] == p2m_mid_missing) { + unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_mid_init(mid); + + p2m_top[topidx] = mid; + } + + p2m_top[topidx][mididx] = &mfn_list[pfn]; } + /* Allocate and initialize top and mid mfn levels */ xen_build_mfn_list_list(); } unsigned long get_phys_to_machine(unsigned long pfn) { - unsigned topidx, idx; + unsigned topidx, mididx, idx; - if (unlikely(pfn >= max_p2m_pfn)) + if (unlikely(pfn >= MAX_P2M_PFN)) return INVALID_P2M_ENTRY; topidx = p2m_top_index(pfn); + mididx = p2m_mid_index(pfn); idx = p2m_index(pfn); - return p2m_top[topidx][idx]; + + return p2m_top[topidx][mididx][idx]; } EXPORT_SYMBOL_GPL(get_phys_to_machine); -/* install a new p2m_top page */ -static bool install_p2mtop_page(unsigned long pfn, unsigned long *p) +static void *alloc_p2m_page(void) { - unsigned topidx = p2m_top_index(pfn); - unsigned long **pfnp, *mfnp; - unsigned i; - - pfnp = &p2m_top[topidx]; - mfnp = &p2m_top_mfn[topidx]; - - for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++) - p[i] = INVALID_P2M_ENTRY; - - if (cmpxchg(pfnp, p2m_missing, p) == p2m_missing) { - *mfnp = virt_to_mfn(p); - return true; - } - - return false; + return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT); } -static void alloc_p2m(unsigned long pfn) +static void free_p2m_page(void *p) { - unsigned long *p; + free_page((unsigned long)p); +} - p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL); - BUG_ON(p == NULL); +/* + * Fully allocate the p2m structure for a given pfn. 
We need to check + * that both the top and mid levels are allocated, and make sure the + * parallel mfn tree is kept in sync. We may race with other cpus, so + * the new pages are installed with cmpxchg; if we lose the race then + * simply free the page we allocated and use the one that's there. + */ +static bool alloc_p2m(unsigned long pfn) +{ + unsigned topidx, mididx; + unsigned long ***top_p, **mid; + unsigned long *top_mfn_p, *mid_mfn; - if (!install_p2mtop_page(pfn, p)) - free_page((unsigned long)p); + topidx = p2m_top_index(pfn); + mididx = p2m_mid_index(pfn); + + top_p = &p2m_top[topidx]; + mid = *top_p; + + if (mid == p2m_mid_missing) { + /* Mid level is missing, allocate a new one */ + mid = alloc_p2m_page(); + if (!mid) + return false; + + p2m_mid_init(mid); + + if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing) + free_p2m_page(mid); + } + + top_mfn_p = &p2m_top_mfn[topidx]; + mid_mfn = mfn_to_virt(*top_mfn_p); + + if (mid_mfn == p2m_mid_missing_mfn) { + /* Separately check the mid mfn level */ + unsigned long missing_mfn; + unsigned long mid_mfn_mfn; + + mid_mfn = alloc_p2m_page(); + if (!mid_mfn) + return false; + + p2m_mid_mfn_init(mid_mfn); + + missing_mfn = virt_to_mfn(p2m_mid_missing_mfn); + mid_mfn_mfn = virt_to_mfn(mid_mfn); + if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn) + free_p2m_page(mid_mfn); + } + + if (p2m_top[topidx][mididx] == p2m_missing) { + /* p2m leaf page is missing */ + unsigned long *p2m; + + p2m = alloc_p2m_page(); + if (!p2m) + return false; + + p2m_init(p2m); + + if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing) + free_p2m_page(p2m); + else + mid_mfn[mididx] = virt_to_mfn(p2m); + } + + return true; } /* Try to install p2m mapping; fail if intermediate bits missing */ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) { - unsigned topidx, idx; + unsigned topidx, mididx, idx; - if (unlikely(pfn >= max_p2m_pfn)) { + if (unlikely(pfn >= MAX_P2M_PFN)) { BUG_ON(mfn != INVALID_P2M_ENTRY); return true; } topidx = p2m_top_index(pfn); - if (p2m_top[topidx] == p2m_missing) { - if (mfn == INVALID_P2M_ENTRY) - return true; - return false; - } - + mididx = p2m_mid_index(pfn); idx = p2m_index(pfn); - p2m_top[topidx][idx] = mfn; + + if (p2m_top[topidx][mididx] == p2m_missing) + return mfn == INVALID_P2M_ENTRY; + + p2m_top[topidx][mididx][idx] = mfn; return true; } @@ -338,7 +504,7 @@ void set_phys_to_machine(unsigned long pfn, unsigned long mfn) } if (unlikely(!__set_phys_to_machine(pfn, mfn))) { - alloc_p2m(pfn); + WARN(!alloc_p2m(pfn), "Can't allocate p2m for %lx, %lx", pfn, mfn); if (!__set_phys_to_machine(pfn, mfn)) BUG(); From c3798062f100c3e1d4ae1241bc536f3b1f28a6ca Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 27 Aug 2010 13:42:04 -0700 Subject: [PATCH 24/41] xen: add return value to set_phys_to_machine() set_phys_to_machine() can return false on failure, which means a memory allocation failure for the p2m structure. It can only fail if setting the mfn for a pfn in previously unused address space. It is guaranteed to succeed if you're setting a mapping to INVALID_P2M_ENTRY or updating the mfn for an existing pfn. 
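For illustration only (this sketch is not part of the patch, and the helper names are hypothetical): a caller that installs a mapping for a previously unused pfn can now propagate the failure instead of hitting BUG(), while callers that only clear entries need no check, assuming the declarations from asm/xen/page.h are in scope.

	/* Hypothetical caller sketch, not code from this series. */
	static int example_install_mapping(unsigned long pfn, unsigned long mfn)
	{
		/* May allocate intermediate p2m pages, so the only
		 * failure mode is memory exhaustion. */
		if (!set_phys_to_machine(pfn, mfn))
			return -ENOMEM;
		return 0;
	}

	/* Guaranteed to succeed: INVALID_P2M_ENTRY never needs allocation. */
	static void example_clear_mapping(unsigned long pfn)
	{
		set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
	}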
Signed-off-by: Jeremy Fitzhardinge --- arch/x86/include/asm/xen/page.h | 2 +- arch/x86/xen/mmu.c | 13 ++++++++----- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index bf5f7d32bd08..e40ca6e67bb5 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -37,7 +37,7 @@ typedef struct xpaddr { extern unsigned long get_phys_to_machine(unsigned long pfn); -extern void set_phys_to_machine(unsigned long pfn, unsigned long mfn); +extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn); static inline unsigned long pfn_to_mfn(unsigned long pfn) { diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index d4c7265cf0a0..b96513437236 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -282,7 +282,7 @@ static void p2m_init(unsigned long *p2m) */ void xen_build_mfn_list_list(void) { - unsigned pfn, i; + unsigned pfn; /* Pre-initialize p2m_top_mfn to be completely missing */ if (p2m_top_mfn == NULL) { @@ -496,19 +496,22 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) return true; } -void set_phys_to_machine(unsigned long pfn, unsigned long mfn) +bool set_phys_to_machine(unsigned long pfn, unsigned long mfn) { if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) { BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY); - return; + return true; } if (unlikely(!__set_phys_to_machine(pfn, mfn))) { - WARN(!alloc_p2m(pfn), "Can't allocate p2m for %lx, %lx", pfn, mfn); + if (!alloc_p2m(pfn)) + return false; if (!__set_phys_to_machine(pfn, mfn)) - BUG(); + return false; } + + return true; } unsigned long arbitrary_virt_to_mfn(void *vaddr) From 33a847502b0338351cebd8fc0c68ac796cfadbbd Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 27 Aug 2010 15:18:19 -0700 Subject: [PATCH 25/41] xen: defer building p2m mfn structures until kernel is mapped When building mfn parts of p2m structure, we rely on being able to use mfn_to_virt, which in turn requires kernel to be mapped into the linear area (which is distinct from the kernel image mapping on 64-bit). 
Defer calling xen_build_mfn_list_list() until after xen_setup_kernel_pagetable(); Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/enlighten.c | 3 +++ arch/x86/xen/mmu.c | 3 --- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index ee304b52d8b7..d8873014b5ed 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1178,6 +1178,9 @@ asmlinkage void __init xen_start_kernel(void) xen_raw_console_write("mapping kernel into physical memory\n"); pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages); + /* Allocate and initialize top and mid mfn levels for p2m structure */ + xen_build_mfn_list_list(); + init_mm.pgd = pgd; /* keep using Xen gdt for now; no urgent need to change it */ diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index b96513437236..9b43bb398d37 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -374,9 +374,6 @@ void __init xen_build_dynamic_phys_to_machine(void) p2m_top[topidx][mididx] = &mfn_list[pfn]; } - - /* Allocate and initialize top and mid mfn levels */ - xen_build_mfn_list_list(); } unsigned long get_phys_to_machine(unsigned long pfn) From cfd8951e082a589637f9de3c33efd3218fdb3c03 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 31 Aug 2010 14:06:22 -0700 Subject: [PATCH 26/41] xen: don't map missing memory When setting up a pte for a missing pfn (no matching mfn), just create an empty pte rather than a junk mapping. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/include/asm/xen/page.h | 9 ++++++++- arch/x86/xen/mmu.c | 15 ++++++++++++++- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index e40ca6e67bb5..875f5a08a6c7 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -41,10 +41,17 @@ extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn); static inline unsigned long pfn_to_mfn(unsigned long pfn) { + unsigned long mfn; + if (xen_feature(XENFEAT_auto_translated_physmap)) return pfn; - return get_phys_to_machine(pfn) & ~FOREIGN_FRAME_BIT; + mfn = get_phys_to_machine(pfn); + + if (mfn != INVALID_P2M_ENTRY) + mfn &= ~FOREIGN_FRAME_BIT; + + return mfn; } static inline int phys_to_machine_mapping_valid(unsigned long pfn) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 9b43bb398d37..4c63b7f452dd 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -745,7 +745,20 @@ static pteval_t pte_pfn_to_mfn(pteval_t val) if (val & _PAGE_PRESENT) { unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; pteval_t flags = val & PTE_FLAGS_MASK; - val = ((pteval_t)pfn_to_mfn(pfn) << PAGE_SHIFT) | flags; + unsigned long mfn = pfn_to_mfn(pfn); + + /* + * If there's no mfn for the pfn, then just create an + * empty non-present pte. Unfortunately this loses + * information about the original pfn, so + * pte_mfn_to_pfn is asymmetric. + */ + if (unlikely(mfn == INVALID_P2M_ENTRY)) { + mfn = 0; + flags = 0; + } + + val = ((pteval_t)mfn << PAGE_SHIFT) | flags; } return val; From 35ae11fd146384d222f3bb1f17eed1970cc92c36 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Fri, 6 Feb 2009 19:09:48 -0800 Subject: [PATCH 27/41] xen: Use host-provided E820 map Rather than simply using a flat memory map from Xen, use its provided E820 map. This allows the domain builder to tell the domain to reserve space for more pages than those initially provided at domain-build time. 
It also allows the host to specify holes in the address space (for PCI-passthrough, for example). Signed-off-by: Ian Campbell Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/setup.c | 38 ++++++++++++++++++++++++++++++++-- include/xen/interface/memory.h | 29 ++++++++++++++++++++++++++ 2 files changed, 65 insertions(+), 2 deletions(-) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 328b00305426..dd2eb2a9303f 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -19,6 +19,7 @@ #include #include +#include #include #include #include @@ -107,13 +108,46 @@ static unsigned long __init xen_return_unused_memory(unsigned long max_pfn, char * __init xen_memory_setup(void) { + static struct e820entry map[E820MAX] __initdata; + unsigned long max_pfn = xen_start_info->nr_pages; + unsigned long long mem_end; + int rc; + struct xen_memory_map memmap; + int i; max_pfn = min(MAX_DOMAIN_PAGES, max_pfn); + mem_end = PFN_PHYS(max_pfn); + + memmap.nr_entries = E820MAX; + set_xen_guest_handle(memmap.buffer, map); + + rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap); + if (rc == -ENOSYS) { + memmap.nr_entries = 1; + map[0].addr = 0ULL; + map[0].size = mem_end; + /* 8MB slack (to balance backend allocations). */ + map[0].size += 8ULL << 20; + map[0].type = E820_RAM; + rc = 0; + } + BUG_ON(rc); e820.nr_map = 0; - - e820_add_region(0, PFN_PHYS((u64)max_pfn), E820_RAM); + for (i = 0; i < memmap.nr_entries; i++) { + unsigned long long end = map[i].addr + map[i].size; + if (map[i].type == E820_RAM) { + if (map[i].addr > mem_end) + continue; + if (end > mem_end) { + /* Truncate region to max_mem. */ + map[i].size -= end - mem_end; + } + } + if (map[i].size > 0) + e820_add_region(map[i].addr, map[i].size, map[i].type); + } /* * Even though this is normal, usable memory under Xen, reserve diff --git a/include/xen/interface/memory.h b/include/xen/interface/memory.h index d3938d3e71f8..d7a6c13bde69 100644 --- a/include/xen/interface/memory.h +++ b/include/xen/interface/memory.h @@ -186,6 +186,35 @@ struct xen_translate_gpfn_list { }; DEFINE_GUEST_HANDLE_STRUCT(xen_translate_gpfn_list); +/* + * Returns the pseudo-physical memory map as it was when the domain + * was started (specified by XENMEM_set_memory_map). + * arg == addr of struct xen_memory_map. + */ +#define XENMEM_memory_map 9 +struct xen_memory_map { + /* + * On call the number of entries which can be stored in buffer. On + * return the number of entries which have been stored in + * buffer. + */ + unsigned int nr_entries; + + /* + * Entries in the buffer are in the same format as returned by the + * BIOS INT 0x15 EAX=0xE820 call. + */ + GUEST_HANDLE(void) buffer; +}; +DEFINE_GUEST_HANDLE_STRUCT(xen_memory_map); + +/* + * Returns the real physical memory map. Passes the same structure as + * XENMEM_memory_map. + * arg == addr of struct xen_memory_map. + */ +#define XENMEM_machine_memory_map 10 + /* * Prevent the balloon driver from changing the memory reservation From 42ee1471e9b879479a15debac752314a596c738e Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 30 Aug 2010 16:41:02 -0700 Subject: [PATCH 28/41] xen: implement "extra" memory to reserve space for pages not present at boot When using the e820 map to get the initial pseudo-physical address space, look for either Xen-provided memory which doesn't lie within an E820 region, or an E820 RAM region which extends beyond the Xen-provided memory range. Count these pages, and add them to a new "extra memory" range. 
This range has an E820 RAM range to describe it - so the kernel will allocate page structures for it - but it is also marked reserved so that the kernel will not attempt to use it. The balloon driver can then add this range as a set of currently ballooned-out pages, which can be used to extend the domain beyond its original size. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/setup.c | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index dd2eb2a9303f..f9a99eaddcdc 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -34,6 +34,26 @@ extern void xen_sysenter_target(void); extern void xen_syscall_target(void); extern void xen_syscall32_target(void); +/* Amount of extra memory space we add to the e820 ranges */ +phys_addr_t xen_extra_mem_start, xen_extra_mem_size; + +static __init void xen_add_extra_mem(unsigned long pages) +{ + u64 size = (u64)pages * PAGE_SIZE; + + if (!pages) + return; + + e820_add_region(xen_extra_mem_start + xen_extra_mem_size, size, E820_RAM); + sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); + + reserve_early(xen_extra_mem_start + xen_extra_mem_size, + xen_extra_mem_start + xen_extra_mem_size + size, + "XEN EXTRA"); + + xen_extra_mem_size += size; +} + static unsigned long __init xen_release_chunk(phys_addr_t start_addr, phys_addr_t end_addr) { @@ -105,7 +125,6 @@ static unsigned long __init xen_return_unused_memory(unsigned long max_pfn, /** * machine_specific_memory_setup - Hook for machine specific memory setup. **/ - char * __init xen_memory_setup(void) { static struct e820entry map[E820MAX] __initdata; @@ -114,6 +133,7 @@ char * __init xen_memory_setup(void) unsigned long long mem_end; int rc; struct xen_memory_map memmap; + unsigned long extra_pages = 0; int i; max_pfn = min(MAX_DOMAIN_PAGES, max_pfn); @@ -135,6 +155,7 @@ char * __init xen_memory_setup(void) BUG_ON(rc); e820.nr_map = 0; + xen_extra_mem_start = mem_end; for (i = 0; i < memmap.nr_entries; i++) { unsigned long long end = map[i].addr + map[i].size; if (map[i].type == E820_RAM) { @@ -143,6 +164,8 @@ char * __init xen_memory_setup(void) if (end > mem_end) { /* Truncate region to max_mem. */ map[i].size -= end - mem_end; + + extra_pages += PFN_DOWN(end - mem_end); } } if (map[i].size > 0) @@ -169,7 +192,9 @@ char * __init xen_memory_setup(void) sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); - xen_return_unused_memory(xen_start_info->nr_pages, &e820); + extra_pages += xen_return_unused_memory(xen_start_info->nr_pages, &e820); + + xen_add_extra_mem(extra_pages); return "Xen"; } From 36bc251b87f88147e9d8346e4b431f42353c3d38 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 2 Sep 2010 17:07:03 -0700 Subject: [PATCH 29/41] xen: make sure xen_extra_mem_start is beyond all non-RAM e820 If Xen gives us non-RAM E820 entries (dom0 only, typically), then make sure the extra RAM region is beyond them. It's OK for the extra space to grow into E820 regions, however. 
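As a rough standalone sketch of that rule (the entry layout and names below are invented for illustration; this is not the kernel's e820 code): walk the map and push the start of the extra region past the end of every non-RAM entry, while RAM entries are ignored.

	/* Illustration only: a simplified e820-style entry. */
	struct demo_e820_entry {
		unsigned long long addr;
		unsigned long long size;
		int type;	/* 1 == RAM, anything else == non-RAM */
	};

	static unsigned long long demo_extra_start(const struct demo_e820_entry *map,
						   int nr_entries,
						   unsigned long long mem_end)
	{
		unsigned long long extra_start = mem_end;
		int i;

		for (i = 0; i < nr_entries; i++) {
			unsigned long long end = map[i].addr + map[i].size;

			/* Reserved/ACPI/etc. regions must end below the
			 * extra area; RAM regions may be overlapped. */
			if (map[i].type != 1 && end > extra_start)
				extra_start = end;
		}

		return extra_start;
	}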
Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/setup.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index f9a99eaddcdc..eac010008cd2 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -167,7 +167,8 @@ char * __init xen_memory_setup(void) extra_pages += PFN_DOWN(end - mem_end); } - } + } else if (map[i].type != E820_RAM) + xen_extra_mem_start = end; if (map[i].size > 0) e820_add_region(map[i].addr, map[i].size, map[i].type); } From b5b43ced7a6e79d30df3232b37dc82c5d8dfa843 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 2 Sep 2010 17:10:12 -0700 Subject: [PATCH 30/41] xen: add extra pages for E820 RAM regions, even if beyond mem_end If an entire E820 RAM region is beyond mem_end, still add its pages to the extra area so that space can be used by the kernel. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/setup.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index eac010008cd2..1e85e26efa69 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -158,9 +158,8 @@ char * __init xen_memory_setup(void) xen_extra_mem_start = mem_end; for (i = 0; i < memmap.nr_entries; i++) { unsigned long long end = map[i].addr + map[i].size; + if (map[i].type == E820_RAM) { - if (map[i].addr > mem_end) - continue; if (end > mem_end) { /* Truncate region to max_mem. */ map[i].size -= end - mem_end; @@ -169,7 +168,9 @@ char * __init xen_memory_setup(void) } } else if (map[i].type != E820_RAM) xen_extra_mem_start = end; - if (map[i].size > 0) + + if ((map[i].type != E820_RAM || map[i].addr < mem_end) && + map[i].size > 0) e820_add_region(map[i].addr, map[i].size, map[i].type); } From 698bb8d14a5b577b6841acaccdf5095d3b7c7389 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 14 Sep 2010 10:19:14 -0700 Subject: [PATCH 31/41] xen: limit extra memory to a certain ratio of base If extra memory is very much larger than the base memory size then all of the base memory can be filled with structures reserved to describe the extra memory, leaving no space for anything else. Even at the maximum ratio there will be little space for anything else, but this change is intended to at least allow the system to boot rather than crash mysteriously. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/setup.c | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 1e85e26efa69..6c9039e92f81 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -37,6 +37,18 @@ extern void xen_syscall32_target(void); /* Amount of extra memory space we add to the e820 ranges */ phys_addr_t xen_extra_mem_start, xen_extra_mem_size; +/* + * The maximum amount of extra memory compared to the base size. The + * main scaling factor is the size of struct page. At extreme ratios + * of base:extra, all the base memory can be filled with page + * structures for the extra memory, leaving no space for anything + * else. + * + * 10x seems like a reasonable balance between scaling flexibility and + * leaving a practically usable system. 
+ */ +#define EXTRA_MEM_RATIO (10) + static __init void xen_add_extra_mem(unsigned long pages) { u64 size = (u64)pages * PAGE_SIZE; @@ -134,6 +146,7 @@ char * __init xen_memory_setup(void) int rc; struct xen_memory_map memmap; unsigned long extra_pages = 0; + unsigned long extra_limit; int i; max_pfn = min(MAX_DOMAIN_PAGES, max_pfn); @@ -196,6 +209,25 @@ char * __init xen_memory_setup(void) extra_pages += xen_return_unused_memory(xen_start_info->nr_pages, &e820); + /* + * Clamp the amount of extra memory to a EXTRA_MEM_RATIO + * factor the base size. On non-highmem systems, the base + * size is the full initial memory allocation; on highmem it + * is limited to the max size of lowmem, so that it doesn't + * get completely filled. + * + * In principle there could be a problem in lowmem systems if + * the initial memory is also very large with respect to + * lowmem, but we won't try to deal with that here. + */ + extra_limit = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)), + max_pfn + extra_pages); + + if (extra_limit >= max_pfn) + extra_pages = extra_limit - max_pfn; + else + extra_pages = 0; + xen_add_extra_mem(extra_pages); return "Xen"; From 2f7acb208523a3bf5f1830f01c29f7feda045169 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 15 Sep 2010 13:32:49 -0700 Subject: [PATCH 32/41] xen: make sure xen_max_p2m_pfn is up to date Keep xen_max_p2m_pfn up to date with the end of the extra memory we're adding. It is possible that it will be too high since memory may be truncated by a "mem=" option on the kernel command line, but that won't matter. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/mmu.c | 8 ++++---- arch/x86/xen/setup.c | 2 ++ arch/x86/xen/xen-ops.h | 1 + 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 4c63b7f452dd..b2371671b11c 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -195,7 +195,7 @@ DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ * 512 and 1024 entries respectively. 
*/ -static unsigned long max_p2m_pfn __read_mostly; +unsigned long xen_max_p2m_pfn __read_mostly; #define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) #define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *)) @@ -293,7 +293,7 @@ void xen_build_mfn_list_list(void) p2m_top_mfn_init(p2m_top_mfn); } - for (pfn = 0; pfn < max_p2m_pfn; pfn += P2M_PER_PAGE) { + for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) { unsigned topidx = p2m_top_index(pfn); unsigned mididx = p2m_mid_index(pfn); unsigned long **mid; @@ -335,7 +335,7 @@ void xen_setup_mfn_list_list(void) HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = virt_to_mfn(p2m_top_mfn); - HYPERVISOR_shared_info->arch.max_pfn = max_p2m_pfn; + HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn; } /* Set up p2m_top to point to the domain-builder provided p2m pages */ @@ -345,7 +345,7 @@ void __init xen_build_dynamic_phys_to_machine(void) unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages); unsigned pfn; - max_p2m_pfn = max_pfn; + xen_max_p2m_pfn = max_pfn; p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); p2m_init(p2m_missing); diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 6c9039e92f81..cad2fcd130ec 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -64,6 +64,8 @@ static __init void xen_add_extra_mem(unsigned long pages) "XEN EXTRA"); xen_extra_mem_size += size; + + xen_max_p2m_pfn = PFN_DOWN(xen_extra_mem_start + xen_extra_mem_size); } static unsigned long __init xen_release_chunk(phys_addr_t start_addr, diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 7c8ab86163e9..d505e98e03f8 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -30,6 +30,7 @@ void xen_setup_machphys_mapping(void); pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn); void xen_ident_map_ISA(void); void xen_reserve_top(void); +extern unsigned long xen_max_p2m_pfn; char * __init xen_memory_setup(void); void __init xen_arch_setup(void); From 41f2e4771a4f1ba26c35438daf32917b9ef7858d Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 30 Mar 2010 11:47:40 -0700 Subject: [PATCH 33/41] xen: add support for PAT Convert Linux PAT entries into Xen ones when constructing ptes. Linux doesn't use _PAGE_PAT for ptes, so the only difference in the first 4 entries is that Linux uses _PAGE_PWT for WC, whereas Xen (and default) use it for WT. xen_pte_val does the inverse conversion. We hard-code assumptions about Linux's current PAT layout, but a warning on the wrmsr to MSR_IA32_CR_PAT should point out any problems. If necessary we could go to a more general table-based conversion between Linux and Xen PAT entries. hugetlbfs poses a problem at the moment, the x86 architecture uses the same flag for _PAGE_PAT and _PAGE_PSE, which changes meaning depending on which pagetable level we're using. At the moment this should be OK so long as nobody tries to do a pte_val on a hugetlbfs pte. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/enlighten.c | 5 ++++ arch/x86/xen/mmu.c | 53 +++++++++++++++++++++++++++++++++++++--- arch/x86/xen/xen-ops.h | 2 ++ 3 files changed, 57 insertions(+), 3 deletions(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index d8873014b5ed..b860e576ed0c 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -829,6 +829,11 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) Xen console noise. 
*/ break; + case MSR_IA32_CR_PAT: + if (smp_processor_id() == 0) + xen_set_pat(((u64)high << 32) | low); + break; + default: ret = native_write_msr_safe(msr, low, high); } diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index b2371671b11c..67b41017f7b8 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -55,6 +55,7 @@ #include #include #include +#include #include #include @@ -780,10 +781,18 @@ static pteval_t iomap_pte(pteval_t val) pteval_t xen_pte_val(pte_t pte) { - if (xen_initial_domain() && (pte.pte & _PAGE_IOMAP)) - return pte.pte; + pteval_t pteval = pte.pte; - return pte_mfn_to_pfn(pte.pte); + /* If this is a WC pte, convert back from Xen WC to Linux WC */ + if ((pteval & (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT)) == _PAGE_PAT) { + WARN_ON(!pat_enabled); + pteval = (pteval & ~_PAGE_PAT) | _PAGE_PWT; + } + + if (xen_initial_domain() && (pteval & _PAGE_IOMAP)) + return pteval; + + return pte_mfn_to_pfn(pteval); } PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val); @@ -793,10 +802,48 @@ pgdval_t xen_pgd_val(pgd_t pgd) } PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val); +/* + * Xen's PAT setup is part of its ABI, though I assume entries 6 & 7 + * are reserved for now, to correspond to the Intel-reserved PAT + * types. + * + * We expect Linux's PAT set as follows: + * + * Idx PTE flags Linux Xen Default + * 0 WB WB WB + * 1 PWT WC WT WT + * 2 PCD UC- UC- UC- + * 3 PCD PWT UC UC UC + * 4 PAT WB WC WB + * 5 PAT PWT WC WP WT + * 6 PAT PCD UC- UC UC- + * 7 PAT PCD PWT UC UC UC + */ + +void xen_set_pat(u64 pat) +{ + /* We expect Linux to use a PAT setting of + * UC UC- WC WB (ignoring the PAT flag) */ + WARN_ON(pat != 0x0007010600070106ull); +} + pte_t xen_make_pte(pteval_t pte) { phys_addr_t addr = (pte & PTE_PFN_MASK); + /* If Linux is trying to set a WC pte, then map to the Xen WC. + * If _PAGE_PAT is set, then it probably means it is really + * _PAGE_PSE, so avoid fiddling with the PAT mapping and hope + * things work out OK... + * + * (We should never see kernel mappings with _PAGE_PSE set, + * but we could see hugetlbfs mappings, I think.). + */ + if (pat_enabled && !WARN_ON(pte & _PAGE_PAT)) { + if ((pte & (_PAGE_PCD | _PAGE_PWT)) == _PAGE_PWT) + pte = (pte & ~(_PAGE_PCD | _PAGE_PWT)) | _PAGE_PAT; + } + /* * Unprivileged domains are allowed to do IOMAPpings for * PCI passthrough, but not map ISA space. The ISA diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index d505e98e03f8..64044747348e 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -32,6 +32,8 @@ void xen_ident_map_ISA(void); void xen_reserve_top(void); extern unsigned long xen_max_p2m_pfn; +void xen_set_pat(u64); + char * __init xen_memory_setup(void); void __init xen_arch_setup(void); void __init xen_init_IRQ(void); From 3654581e47adc07072aebe239818485b68ea04f0 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 29 Sep 2010 16:54:33 -0700 Subject: [PATCH 34/41] xen: don't add extra_pages for RAM after mem_end If an E820 region is entirely beyond mem_end, don't attempt to truncate it and add the truncated pages to extra_pages, as they will be negative. Also, make sure the extra memory region starts after all BIOS provided E820 regions (and in the case of RAM regions, post-clipping). 
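The clipping rule can be sketched on its own as follows (simplified, hypothetical names, 4 KiB pages assumed; not the patch's actual code): only a RAM region that straddles mem_end is truncated and contributes its clipped tail to extra_pages; a region entirely beyond mem_end is left untouched, so a negative size can never be computed.

	/* Sketch only: clip one RAM region against mem_end and return
	 * the number of pages clipped off the end. */
	static unsigned long demo_clip_ram(unsigned long long addr,
					   unsigned long long *size,
					   unsigned long long mem_end)
	{
		unsigned long long end = addr + *size;

		if (addr >= mem_end)	/* entirely beyond the limit */
			return 0;
		if (end <= mem_end)	/* entirely below the limit */
			return 0;

		/* Straddles mem_end: truncate and count the tail. */
		*size -= end - mem_end;
		return (unsigned long)((end - mem_end) >> 12);	/* PFN_DOWN() */
	}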
Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/setup.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index cad2fcd130ec..7a4ab05cff8a 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -52,20 +52,19 @@ phys_addr_t xen_extra_mem_start, xen_extra_mem_size; static __init void xen_add_extra_mem(unsigned long pages) { u64 size = (u64)pages * PAGE_SIZE; + u64 extra_start = xen_extra_mem_start + xen_extra_mem_size; if (!pages) return; - e820_add_region(xen_extra_mem_start + xen_extra_mem_size, size, E820_RAM); + e820_add_region(extra_start, size, E820_RAM); sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); - reserve_early(xen_extra_mem_start + xen_extra_mem_size, - xen_extra_mem_start + xen_extra_mem_size + size, - "XEN EXTRA"); + reserve_early(extra_start, extra_start + size, "XEN EXTRA"); xen_extra_mem_size += size; - xen_max_p2m_pfn = PFN_DOWN(xen_extra_mem_start + xen_extra_mem_size); + xen_max_p2m_pfn = PFN_DOWN(extra_start + size); } static unsigned long __init xen_release_chunk(phys_addr_t start_addr, @@ -175,15 +174,21 @@ char * __init xen_memory_setup(void) unsigned long long end = map[i].addr + map[i].size; if (map[i].type == E820_RAM) { - if (end > mem_end) { + if (map[i].addr < mem_end && end > mem_end) { /* Truncate region to max_mem. */ - map[i].size -= end - mem_end; + u64 delta = end - mem_end; - extra_pages += PFN_DOWN(end - mem_end); + map[i].size -= delta; + extra_pages += PFN_DOWN(delta); + + end = mem_end; } - } + } + + if (end > xen_extra_mem_start) + xen_extra_mem_start = end; + /* If region is non-RAM or below mem_end, add what remains */ if ((map[i].type != E820_RAM || map[i].addr < mem_end) && map[i].size > 0) e820_add_region(map[i].addr, map[i].size, map[i].type); From 403a85ff001710bb92689790b9a5c1c80e4b37a6 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 14 Oct 2010 11:38:47 -0700 Subject: [PATCH 35/41] xen/hvc: only notify if we actually sent something Don't spam dom0/xenconsoled with events unless we've actually added something to the ring. Signed-off-by: Jeremy Fitzhardinge --- drivers/char/hvc_xen.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/char/hvc_xen.c b/drivers/char/hvc_xen.c index 60446f82a3fc..6b8e6d18a8e6 100644 --- a/drivers/char/hvc_xen.c +++ b/drivers/char/hvc_xen.c @@ -74,7 +74,8 @@ static int __write_console(const char *data, int len) wmb(); /* write ring before updating pointer */ intf->out_prod = prod; - notify_daemon(); + if (sent) + notify_daemon(); return sent; } From b0097adeec27e30223c989561ab0f7aa60d1fe93 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Fri, 8 Oct 2010 16:59:12 +0100 Subject: [PATCH 36/41] xen: ensure that all event channels start off bound to VCPU 0 All event channels start bound to VCPU 0, so ensure that cpu_evtchn_mask is initialised to reflect this. Otherwise there is a race after registering an event channel but before the affinity is explicitly set where the event channel can be delivered. If this happens then the event channel remains pending in the L1 (evtchn_pending) array but is cleared in L2 (evtchn_pending_sel); this means the event channel cannot be reraised until another event channel happens to trigger the same L2 entry on that VCPU. sizeof(cpu_evtchn_mask(0))==sizeof(unsigned long*), which is not correct, and causes only the first 32 or 64 event channels (depending on architecture) to be initially bound to VCPU0.
Use sizeof(struct cpu_evtchn_s) instead. Signed-off-by: Ian Campbell Cc: Jeremy Fitzhardinge Cc: stable@kernel.org --- drivers/xen/events.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 8beb2bc60f7d..b4e73011a80e 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -261,7 +261,7 @@ static void init_evtchn_cpu_bindings(void) } #endif - memset(cpu_evtchn_mask(0), ~0, sizeof(cpu_evtchn_mask(0))); + memset(cpu_evtchn_mask(0), ~0, sizeof(struct cpu_evtchn_s)); } static inline void clear_evtchn(int port) From a52521f149c42b35a28423ee30be9a7afa51dfbf Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 22 Sep 2010 15:28:52 -0700 Subject: [PATCH 37/41] xen: set up IRQ before binding virq to evtchn Make sure the irq is set up before binding a virq event channel to it. Signed-off-by: Jeremy Fitzhardinge --- drivers/xen/events.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/xen/events.c b/drivers/xen/events.c index b4e73011a80e..a3362479cfcf 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -436,6 +436,11 @@ static int bind_virq_to_irq(unsigned int virq, unsigned int cpu) irq = per_cpu(virq_to_irq, cpu)[virq]; if (irq == -1) { + irq = find_unbound_irq(); + + set_irq_chip_and_handler_name(irq, &xen_percpu_chip, + handle_percpu_irq, "virq"); + bind_virq.virq = virq; bind_virq.vcpu = cpu; if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, @@ -443,11 +448,6 @@ static int bind_virq_to_irq(unsigned int virq, unsigned int cpu) BUG(); evtchn = bind_virq.port; - irq = find_unbound_irq(); - - set_irq_chip_and_handler_name(irq, &xen_percpu_chip, - handle_percpu_irq, "virq"); - evtchn_to_irq[evtchn] = irq; irq_info[irq] = mk_virq_info(evtchn, virq); From cb52e6d9ed9bb9cf74f8396a603ecf12b64c1ec1 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Fri, 15 Oct 2010 11:52:46 +0100 Subject: [PATCH 38/41] xen: improvements to VIRQ_DEBUG output * Fix bitmask formatting on 64 bit by specifying correct field widths. * Output both global and local masked and pending information. * Indicate in list of pending interrupts whether they are pending in the L2, masked globally and/or masked locally. Signed-off-by: Ian Campbell Signed-off-by: Jeremy Fitzhardinge --- drivers/xen/events.c | 76 ++++++++++++++++++++++++++++++++------------ 1 file changed, 55 insertions(+), 21 deletions(-) diff --git a/drivers/xen/events.c b/drivers/xen/events.c index a3362479cfcf..b0cf80bf4169 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -579,41 +579,75 @@ irqreturn_t xen_debug_interrupt(int irq, void *dev_id) { struct shared_info *sh = HYPERVISOR_shared_info; int cpu = smp_processor_id(); + unsigned long *cpu_evtchn = cpu_evtchn_mask(cpu); int i; unsigned long flags; static DEFINE_SPINLOCK(debug_lock); + struct vcpu_info *v; spin_lock_irqsave(&debug_lock, flags); - printk("vcpu %d\n ", cpu); + printk("\nvcpu %d\n ", cpu); for_each_online_cpu(i) { - struct vcpu_info *v = per_cpu(xen_vcpu, i); - printk("%d: masked=%d pending=%d event_sel %08lx\n ", i, - (get_irq_regs() && i == cpu) ? xen_irqs_disabled(get_irq_regs()) : v->evtchn_upcall_mask, - v->evtchn_upcall_pending, - v->evtchn_pending_sel); + int pending; + v = per_cpu(xen_vcpu, i); + pending = (get_irq_regs() && i == cpu) + ? 
xen_irqs_disabled(get_irq_regs()) + : v->evtchn_upcall_mask; + printk("%d: masked=%d pending=%d event_sel %0*lx\n ", i, + pending, v->evtchn_upcall_pending, + (int)(sizeof(v->evtchn_pending_sel)*2), + v->evtchn_pending_sel); } - printk("pending:\n "); - for(i = ARRAY_SIZE(sh->evtchn_pending)-1; i >= 0; i--) - printk("%08lx%s", sh->evtchn_pending[i], - i % 8 == 0 ? "\n " : " "); - printk("\nmasks:\n "); - for(i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) - printk("%08lx%s", sh->evtchn_mask[i], - i % 8 == 0 ? "\n " : " "); + v = per_cpu(xen_vcpu, cpu); - printk("\nunmasked:\n "); - for(i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) - printk("%08lx%s", sh->evtchn_pending[i] & ~sh->evtchn_mask[i], - i % 8 == 0 ? "\n " : " "); + printk("\npending:\n "); + for (i = ARRAY_SIZE(sh->evtchn_pending)-1; i >= 0; i--) + printk("%0*lx%s", (int)sizeof(sh->evtchn_pending[0])*2, + sh->evtchn_pending[i], + i % 8 == 0 ? "\n " : " "); + printk("\nglobal mask:\n "); + for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) + printk("%0*lx%s", + (int)(sizeof(sh->evtchn_mask[0])*2), + sh->evtchn_mask[i], + i % 8 == 0 ? "\n " : " "); + + printk("\nglobally unmasked:\n "); + for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) + printk("%0*lx%s", (int)(sizeof(sh->evtchn_mask[0])*2), + sh->evtchn_pending[i] & ~sh->evtchn_mask[i], + i % 8 == 0 ? "\n " : " "); + + printk("\nlocal cpu%d mask:\n ", cpu); + for (i = (NR_EVENT_CHANNELS/BITS_PER_LONG)-1; i >= 0; i--) + printk("%0*lx%s", (int)(sizeof(cpu_evtchn[0])*2), + cpu_evtchn[i], + i % 8 == 0 ? "\n " : " "); + + printk("\nlocally unmasked:\n "); + for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) { + unsigned long pending = sh->evtchn_pending[i] + & ~sh->evtchn_mask[i] + & cpu_evtchn[i]; + printk("%0*lx%s", (int)(sizeof(sh->evtchn_mask[0])*2), + pending, i % 8 == 0 ? "\n " : " "); + } printk("\npending list:\n"); - for(i = 0; i < NR_EVENT_CHANNELS; i++) { + for (i = 0; i < NR_EVENT_CHANNELS; i++) { if (sync_test_bit(i, sh->evtchn_pending)) { - printk(" %d: event %d -> irq %d\n", + int word_idx = i / BITS_PER_LONG; + printk(" %d: event %d -> irq %d%s%s%s\n", cpu_from_evtchn(i), i, - evtchn_to_irq[i]); + evtchn_to_irq[i], + sync_test_bit(word_idx, &v->evtchn_pending_sel) + ? "" : " l2-clear", + !sync_test_bit(i, sh->evtchn_mask) + ? "" : " globally-masked", + sync_test_bit(i, cpu_evtchn) + ? "" : " locally-masked"); } } From 375b2a9ada6d105483aab22f1af1d727bc3c418d Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Thu, 21 Oct 2010 11:00:46 +0100 Subject: [PATCH 39/41] xen: correctly rebuild mfn list list after migration. Otherwise the second migration attempt fails because the mfn_list_list still refers to all the old mfns. We need to update the entires in both p2m_top_mfn and the mid_mfn pages which p2m_top_mfn refers to. In order to do this we need to keep track of the virtual addresses mapping the p2m_mid_mfn pages since we cannot rely on mfn_to_virt(p2m_top_mfn[idx]) since p2m_top_mfn[idx] will still contain the old MFN after a migration, which may now belong to another domain and hence have a different mapping in the m2p. Therefore add and maintain a third top level page, p2m_top_mfn_p[], which tracks the virtual addresses of the mfns contained in p2m_top_mfn[]. We also need to update the content of the p2m_mid_missing_mfn page on resume to refer to the page's new mfn. p2m_missing does not need updating since the migration process takes care of the leaf p2m pages for us. 
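Conceptually, the invariant kept by the new p2m_top_mfn_p[] array can be sketched as below (a hedged illustration with simplified names; the real rebuild also re-initialises the missing-page entries and skips wholly missing mid levels). It assumes the usual Xen virt_to_mfn() helper is in scope.

	/* Sketch of the post-migration fix-up, not the series' actual loop. */
	static void demo_refresh_top_mfns(unsigned long *top_mfn,
					  unsigned long **top_mfn_p,
					  unsigned int nr_entries)
	{
		unsigned int i;

		for (i = 0; i < nr_entries; i++) {
			/* top_mfn[i] may hold a stale pre-migration mfn,
			 * but top_mfn_p[i] is a kernel virtual address and
			 * stays valid, so recompute the mfn from it rather
			 * than translating the old mfn back with
			 * mfn_to_virt(). */
			top_mfn[i] = virt_to_mfn(top_mfn_p[i]);
		}
	}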
Signed-off-by: Ian Campbell Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/mmu.c | 50 ++++++++++++++++++++++++++++++++++------------ 1 file changed, 37 insertions(+), 13 deletions(-) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 67b41017f7b8..e41683cf290a 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -187,6 +187,8 @@ DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ * / \ / \ / / * p2m p2m p2m p2m p2m p2m p2m ... * + * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p. + * * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the * maximum representable pseudo-physical address space is: * P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages @@ -211,6 +213,7 @@ static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE); static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE); static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE); +static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE); RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); @@ -247,6 +250,14 @@ static void p2m_top_mfn_init(unsigned long *top) top[i] = virt_to_mfn(p2m_mid_missing_mfn); } +static void p2m_top_mfn_p_init(unsigned long **top) +{ + unsigned i; + + for (i = 0; i < P2M_TOP_PER_PAGE; i++) + top[i] = p2m_mid_missing_mfn; +} + static void p2m_mid_init(unsigned long **mid) { unsigned i; @@ -283,33 +294,43 @@ static void p2m_init(unsigned long *p2m) */ void xen_build_mfn_list_list(void) { - unsigned pfn; + unsigned long pfn; /* Pre-initialize p2m_top_mfn to be completely missing */ if (p2m_top_mfn == NULL) { p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); p2m_mid_mfn_init(p2m_mid_missing_mfn); + p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_top_mfn_p_init(p2m_top_mfn_p); + p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); p2m_top_mfn_init(p2m_top_mfn); + } else { + /* Reinitialise, mfn's all change after migration */ + p2m_mid_mfn_init(p2m_mid_missing_mfn); } for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) { unsigned topidx = p2m_top_index(pfn); unsigned mididx = p2m_mid_index(pfn); unsigned long **mid; - unsigned long mid_mfn; unsigned long *mid_mfn_p; mid = p2m_top[topidx]; + mid_mfn_p = p2m_top_mfn_p[topidx]; /* Don't bother allocating any mfn mid levels if - they're just missing */ - if (mid[mididx] == p2m_missing) + * they're just missing, just update the stored mfn, + * since all could have changed over a migrate. 
+ */ + if (mid == p2m_mid_missing) { + BUG_ON(mididx); + BUG_ON(mid_mfn_p != p2m_mid_missing_mfn); + p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn); + pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE; continue; - - mid_mfn = p2m_top_mfn[topidx]; - mid_mfn_p = mfn_to_virt(mid_mfn); + } if (mid_mfn_p == p2m_mid_missing_mfn) { /* @@ -321,11 +342,10 @@ void xen_build_mfn_list_list(void) mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); p2m_mid_mfn_init(mid_mfn_p); - mid_mfn = virt_to_mfn(mid_mfn_p); - - p2m_top_mfn[topidx] = mid_mfn; + p2m_top_mfn_p[topidx] = mid_mfn_p; } + p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p); mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]); } } @@ -344,7 +364,7 @@ void __init xen_build_dynamic_phys_to_machine(void) { unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list; unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages); - unsigned pfn; + unsigned long pfn; xen_max_p2m_pfn = max_pfn; @@ -434,7 +454,9 @@ static bool alloc_p2m(unsigned long pfn) } top_mfn_p = &p2m_top_mfn[topidx]; - mid_mfn = mfn_to_virt(*top_mfn_p); + mid_mfn = p2m_top_mfn_p[topidx]; + + BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p); if (mid_mfn == p2m_mid_missing_mfn) { /* Separately check the mid mfn level */ @@ -446,11 +468,13 @@ static bool alloc_p2m(unsigned long pfn) return false; p2m_mid_mfn_init(mid_mfn); - + missing_mfn = virt_to_mfn(p2m_mid_missing_mfn); mid_mfn_mfn = virt_to_mfn(mid_mfn); if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn) free_p2m_page(mid_mfn); + else + p2m_top_mfn_p[topidx] = mid_mfn; } if (p2m_top[topidx][mididx] == p2m_missing) { From 9e9a5fcb04e3af077d1be32710298b852210d93f Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Thu, 2 Sep 2010 16:16:00 +0100 Subject: [PATCH 40/41] xen: use host E820 map for dom0 When running as initial domain, get the real physical memory map from xen using the XENMEM_machine_memory_map hypercall and use it to setup the e820 regions. Signed-off-by: Ian Campbell Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Stefano Stabellini Reviewed-by: Konrad Rzeszutek Wilk --- arch/x86/xen/setup.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 7a4ab05cff8a..0ce9d58cb29d 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -149,6 +149,7 @@ char * __init xen_memory_setup(void) unsigned long extra_pages = 0; unsigned long extra_limit; int i; + int op; max_pfn = min(MAX_DOMAIN_PAGES, max_pfn); mem_end = PFN_PHYS(max_pfn); @@ -156,7 +157,10 @@ char * __init xen_memory_setup(void) memmap.nr_entries = E820MAX; set_xen_guest_handle(memmap.buffer, map); - rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap); + op = xen_initial_domain() ? 
+ XENMEM_machine_memory_map : + XENMEM_memory_map; + rc = HYPERVISOR_memory_op(op, &memmap); if (rc == -ENOSYS) { memmap.nr_entries = 1; map[0].addr = 0ULL; @@ -235,7 +239,8 @@ char * __init xen_memory_setup(void) else extra_pages = 0; - xen_add_extra_mem(extra_pages); + if (!xen_initial_domain()) + xen_add_extra_mem(extra_pages); return "Xen"; } From 45263cb0993de738e158c625c84a5feb18bed317 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Mon, 25 Oct 2010 16:32:29 -0700 Subject: [PATCH 41/41] xen: include xen/xen.h for definition of xen_initial_domain() CC arch/x86/xen/setup.o arch/x86/xen/setup.c: In function 'xen_memory_setup': arch/x86/xen/setup.c:161: error: implicit declaration of function 'xen_initial_domain' Signed-off-by: Ian Campbell Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/setup.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 0ce9d58cb29d..8e2c9f21fa37 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -17,6 +17,7 @@ #include #include +#include #include #include #include