diff --git a/fs/proc/base.c b/fs/proc/base.c index 1bd646d3fe9a..9004db04efa0 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -783,7 +783,7 @@ out_no_task: } #endif -static loff_t mem_lseek(struct file * file, loff_t offset, int orig) +loff_t mem_lseek(struct file *file, loff_t offset, int orig) { switch (orig) { case 0: @@ -2252,6 +2252,7 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_MMU REG("clear_refs", S_IWUSR, clear_refs), REG("smaps", S_IRUGO, smaps), + REG("pagemap", S_IRUSR, pagemap), #endif #ifdef CONFIG_SECURITY DIR("attr", S_IRUGO|S_IXUGO, attr_dir), @@ -2580,6 +2581,7 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_MMU REG("clear_refs", S_IWUSR, clear_refs), REG("smaps", S_IRUGO, smaps), + REG("pagemap", S_IRUSR, pagemap), #endif #ifdef CONFIG_SECURITY DIR("attr", S_IRUGO|S_IXUGO, attr_dir), diff --git a/fs/proc/internal.h b/fs/proc/internal.h index ddfaeec37492..7d57e8069924 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -52,11 +52,13 @@ extern int proc_tid_stat(struct task_struct *, char *); extern int proc_tgid_stat(struct task_struct *, char *); extern int proc_pid_status(struct task_struct *, char *); extern int proc_pid_statm(struct task_struct *, char *); +extern loff_t mem_lseek(struct file *file, loff_t offset, int orig); extern const struct file_operations proc_maps_operations; extern const struct file_operations proc_numa_maps_operations; extern const struct file_operations proc_smaps_operations; extern const struct file_operations proc_clear_refs_operations; +extern const struct file_operations proc_pagemap_operations; void free_proc_entry(struct proc_dir_entry *de); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 308fc5451e43..bbd9b145051d 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -5,7 +5,10 @@ #include #include #include +#include #include +#include +#include #include #include @@ -519,6 +522,202 @@ const struct file_operations proc_clear_refs_operations = { .write = clear_refs_write, }; +struct pagemapread { + char __user *out, *end; +}; + +#define PM_ENTRY_BYTES sizeof(u64) +#define PM_RESERVED_BITS 3 +#define PM_RESERVED_OFFSET (64 - PM_RESERVED_BITS) +#define PM_RESERVED_MASK (((1LL<out + PM_ENTRY_BYTES >= pm->end) { + if (copy_to_user(pm->out, &pfn, pm->end - pm->out)) + return -EFAULT; + pm->out = pm->end; + return PM_END_OF_BUFFER; + } + + if (put_user(pfn, pm->out)) + return -EFAULT; + pm->out += PM_ENTRY_BYTES; + return 0; +} + +static int pagemap_pte_hole(unsigned long start, unsigned long end, + void *private) +{ + struct pagemapread *pm = private; + unsigned long addr; + int err = 0; + for (addr = start; addr < end; addr += PAGE_SIZE) { + err = add_to_pagemap(addr, PM_NOT_PRESENT, pm); + if (err) + break; + } + return err; +} + +u64 swap_pte_to_pagemap_entry(pte_t pte) +{ + swp_entry_t e = pte_to_swp_entry(pte); + return PM_SWAP | swp_type(e) | (swp_offset(e) << MAX_SWAPFILES_SHIFT); +} + +static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, + void *private) +{ + struct pagemapread *pm = private; + pte_t *pte; + int err = 0; + + for (; addr != end; addr += PAGE_SIZE) { + u64 pfn = PM_NOT_PRESENT; + pte = pte_offset_map(pmd, addr); + if (is_swap_pte(*pte)) + pfn = swap_pte_to_pagemap_entry(*pte); + else if (pte_present(*pte)) + pfn = pte_pfn(*pte); + /* unmap so we're not in atomic when we copy to userspace */ + pte_unmap(pte); + err = add_to_pagemap(addr, pfn, pm); + if (err) + return err; + } + + cond_resched(); + + return err; +} + +static struct mm_walk pagemap_walk = { + .pmd_entry = pagemap_pte_range, + .pte_hole = pagemap_pte_hole +}; + +/* + * /proc/pid/pagemap - an array mapping virtual pages to pfns + * + * For each page in the address space, this file contains one 64-bit + * entry representing the corresponding physical page frame number + * (PFN) if the page is present. If there is a swap entry for the + * physical page, then an encoding of the swap file number and the + * page's offset into the swap file are returned. If no page is + * present at all, PM_NOT_PRESENT is returned. This allows determining + * precisely which pages are mapped (or in swap) and comparing mapped + * pages between processes. + * + * Efficient users of this interface will use /proc/pid/maps to + * determine which areas of memory are actually mapped and llseek to + * skip over unmapped regions. + */ +static ssize_t pagemap_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); + struct page **pages, *page; + unsigned long uaddr, uend; + struct mm_struct *mm; + struct pagemapread pm; + int pagecount; + int ret = -ESRCH; + + if (!task) + goto out; + + ret = -EACCES; + if (!ptrace_may_attach(task)) + goto out; + + ret = -EINVAL; + /* file position must be aligned */ + if (*ppos % PM_ENTRY_BYTES) + goto out; + + ret = 0; + mm = get_task_mm(task); + if (!mm) + goto out; + + ret = -ENOMEM; + uaddr = (unsigned long)buf & PAGE_MASK; + uend = (unsigned long)(buf + count); + pagecount = (PAGE_ALIGN(uend) - uaddr) / PAGE_SIZE; + pages = kmalloc(pagecount * sizeof(struct page *), GFP_KERNEL); + if (!pages) + goto out_task; + + down_read(¤t->mm->mmap_sem); + ret = get_user_pages(current, current->mm, uaddr, pagecount, + 1, 0, pages, NULL); + up_read(¤t->mm->mmap_sem); + + if (ret < 0) + goto out_free; + + pm.out = buf; + pm.end = buf + count; + + if (!ptrace_may_attach(task)) { + ret = -EIO; + } else { + unsigned long src = *ppos; + unsigned long svpfn = src / PM_ENTRY_BYTES; + unsigned long start_vaddr = svpfn << PAGE_SHIFT; + unsigned long end_vaddr = TASK_SIZE_OF(task); + + /* watch out for wraparound */ + if (svpfn > TASK_SIZE_OF(task) >> PAGE_SHIFT) + start_vaddr = end_vaddr; + + /* + * The odds are that this will stop walking way + * before end_vaddr, because the length of the + * user buffer is tracked in "pm", and the walk + * will stop when we hit the end of the buffer. + */ + ret = walk_page_range(mm, start_vaddr, end_vaddr, + &pagemap_walk, &pm); + if (ret == PM_END_OF_BUFFER) + ret = 0; + /* don't need mmap_sem for these, but this looks cleaner */ + *ppos += pm.out - buf; + if (!ret) + ret = pm.out - buf; + } + + for (; pagecount; pagecount--) { + page = pages[pagecount-1]; + if (!PageReserved(page)) + SetPageDirty(page); + page_cache_release(page); + } + mmput(mm); +out_free: + kfree(pages); +out_task: + put_task_struct(task); +out: + return ret; +} + +const struct file_operations proc_pagemap_operations = { + .llseek = mem_lseek, /* borrow this */ + .read = pagemap_read, +}; + #ifdef CONFIG_NUMA extern int show_numa_map(struct seq_file *m, void *v); @@ -552,4 +751,3 @@ const struct file_operations proc_numa_maps_operations = { .release = seq_release_private, }; #endif -