1
0
Fork 0
alistair23-linux/mm/nommu.c

1480 lines
35 KiB
C
Raw Normal View History

/*
* linux/mm/nommu.c
*
* Replacement code for mm functions to support CPU's that don't
* have any form of memory management unit (thus no virtual memory).
*
* See Documentation/nommu-mmap.txt
*
* Copyright (c) 2004-2005 David Howells <dhowells@redhat.com>
* Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com>
* Copyright (c) 2000-2001 D Jeff Dionne <jeff@uClinux.org>
* Copyright (c) 2002 Greg Ungerer <gerg@snapgear.com>
* Copyright (c) 2007 Paul Mundt <lethal@linux-sh.org>
*/
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/file.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/tracehook.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/mount.h>
#include <linux/personality.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <asm/uaccess.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
void *high_memory;
struct page *mem_map;
unsigned long max_mapnr;
unsigned long num_physpages;
unsigned long askedalloc, realalloc;
atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0);
int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
int sysctl_overcommit_ratio = 50; /* default is 50% */
int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
int heap_stack_gap = 0;
EXPORT_SYMBOL(mem_map);
EXPORT_SYMBOL(num_physpages);
/* list of shareable VMAs */
struct rb_root nommu_vma_tree = RB_ROOT;
DECLARE_RWSEM(nommu_vma_sem);
struct vm_operations_struct generic_file_vm_ops = {
};
/*
* Handle all mappings that got truncated by a "truncate()"
* system call.
*
* NOTE! We have to be ready to update the memory sharing
* between the file and the memory map for a potential last
* incomplete page. Ugly, but necessary.
*/
int vmtruncate(struct inode *inode, loff_t offset)
{
struct address_space *mapping = inode->i_mapping;
unsigned long limit;
if (inode->i_size < offset)
goto do_expand;
i_size_write(inode, offset);
truncate_inode_pages(mapping, offset);
goto out_truncate;
do_expand:
limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
if (limit != RLIM_INFINITY && offset > limit)
goto out_sig;
if (offset > inode->i_sb->s_maxbytes)
goto out;
i_size_write(inode, offset);
out_truncate:
if (inode->i_op && inode->i_op->truncate)
inode->i_op->truncate(inode);
return 0;
out_sig:
send_sig(SIGXFSZ, current, 0);
out:
return -EFBIG;
}
EXPORT_SYMBOL(vmtruncate);
/*
* Return the total memory allocated for this pointer, not
* just what the caller asked for.
*
* Doesn't have to be accurate, i.e. may have races.
*/
unsigned int kobjsize(const void *objp)
{
struct page *page;
mm/nommu.c: return 0 from kobjsize with invalid objects Don't perform kobjsize operations on objects the kernel doesn't manage. On Blackfin, drivers can get dma coherent memory by calling a function dma_alloc_coherent(). We do this in nommu by configuring a chunk of uncached memory at the top of memory. Since we don't want the kernel to use the uncached memory, we lie to the kernel, and tell it that it's max memory is between 0, and the start of the uncached dma coherent section. this all works well, until this memory gets exposed into userspace (with a frame buffer), when you look at the process's maps, it shows the framebuf: root:/proc> cat maps [snip] 03f0ef00-03f34700 rw-p 00000000 1f:00 192 /dev/fb0 root:/proc> This is outside the "normal" range for the kernel. When the kernel tries to find the size of this object (when you run ps), it dies in nommu.c in kobjsize. BUG_ON(page->index >= MAX_ORDER); since the page we are referring to is outside what the kernel thinks is it's max valid memory. root:~> while [ 1 ]; ps > /dev/null; done kernel BUG at mm/nommu.c:119! Kernel panic - not syncing: BUG! We fixed this by adding a check to reject out of range object pointers as it already does that for NULL pointers. Signed-off-by: Michael Hennerich <Michael.Hennerich@analog.com> Signed-off-by: Robin Getz <rgetz@blackfin.uclinux.org> Acked-by: David Howells <dhowells@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 03:13:38 -06:00
/*
* If the object we have should not have ksize performed on it,
* return size of 0
*/
nommu: Correct kobjsize() page validity checks. This implements a few changes on top of the recent kobjsize() refactoring introduced by commit 6cfd53fc03670c7a544a56d441eb1a6cc800d72b. As Christoph points out: virt_to_head_page cannot return NULL. virt_to_page also does not return NULL. pfn_valid() needs to be used to figure out if a page is valid. Otherwise the page struct reference that was returned may have PageReserved() set to indicate that it is not a valid page. As discussed further in the thread, virt_addr_valid() is the preferable way to validate the object pointer in this case. In addition to fixing up the reserved page case, it also has the benefit of encapsulating the hack introduced by commit 4016a1390d07f15b267eecb20e76a48fd5c524ef on the impacted platforms, allowing us to get rid of the extra checking in kobjsize() for the platforms that don't perform this type of bizarre memory_end abuse (every nommu platform that isn't blackfin). If blackfin decides to get in line with every other platform and use PageReserved for the DMA pages in question, kobjsize() will also continue to work fine. It also turns out that compound_order() will give us back 0-order for non-head pages, so we can get rid of the PageCompound check and just use compound_order() directly. Clean that up while we're at it. Signed-off-by: Paul Mundt <lethal@linux-sh.org> Reviewed-by: Christoph Lameter <clameter@sgi.com> Acked-by: David Howells <dhowells@redhat.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-06-12 01:29:55 -06:00
if (!objp || !virt_addr_valid(objp))
nommu: fix kobjsize() for SLOB and SLUB kobjsize() has been abusing page->index as a method for sorting out compound order, which blows up both for page cache pages, and SLOB's reuse of the index in struct slob_page. Presently we are not able to accurately size arbitrary pointers that don't come from kmalloc(), so the best we can do is sort out the compound order from the head page if it's a compound page, or default to 0-order if it's impossible to ksize() the object. Obviously this leaves quite a bit to be desired in terms of object sizing accuracy, but the behaviour is unchanged over the existing implementation, while fixing the page->index oopses originally reported here: http://marc.info/?l=linux-mm&m=121127773325245&w=2 Accuracy could also be improved by having SLUB and SLOB both set PG_slab on ksizeable pages, rather than just handling the __GFP_COMP cases irregardless of the PG_slab setting, as made possibly with Pekka's patches: http://marc.info/?l=linux-kernel&m=121139439900534&w=2 http://marc.info/?l=linux-kernel&m=121139440000537&w=2 http://marc.info/?l=linux-kernel&m=121139440000540&w=2 This is primarily a bugfix for nommu systems for 2.6.26, with the aim being to gradually kill off kobjsize() and its particular brand of object abuse entirely. Reviewed-by: Pekka Enberg <penberg@cs.helsinki.fi> Signed-off-by: Paul Mundt <lethal@linux-sh.org> Acked-by: David Howells <dhowells@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-06-05 23:46:08 -06:00
return 0;
page = virt_to_head_page(objp);
/*
* If the allocator sets PageSlab, we know the pointer came from
* kmalloc().
*/
if (PageSlab(page))
return ksize(objp);
nommu: fix kobjsize() for SLOB and SLUB kobjsize() has been abusing page->index as a method for sorting out compound order, which blows up both for page cache pages, and SLOB's reuse of the index in struct slob_page. Presently we are not able to accurately size arbitrary pointers that don't come from kmalloc(), so the best we can do is sort out the compound order from the head page if it's a compound page, or default to 0-order if it's impossible to ksize() the object. Obviously this leaves quite a bit to be desired in terms of object sizing accuracy, but the behaviour is unchanged over the existing implementation, while fixing the page->index oopses originally reported here: http://marc.info/?l=linux-mm&m=121127773325245&w=2 Accuracy could also be improved by having SLUB and SLOB both set PG_slab on ksizeable pages, rather than just handling the __GFP_COMP cases irregardless of the PG_slab setting, as made possibly with Pekka's patches: http://marc.info/?l=linux-kernel&m=121139439900534&w=2 http://marc.info/?l=linux-kernel&m=121139440000537&w=2 http://marc.info/?l=linux-kernel&m=121139440000540&w=2 This is primarily a bugfix for nommu systems for 2.6.26, with the aim being to gradually kill off kobjsize() and its particular brand of object abuse entirely. Reviewed-by: Pekka Enberg <penberg@cs.helsinki.fi> Signed-off-by: Paul Mundt <lethal@linux-sh.org> Acked-by: David Howells <dhowells@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-06-05 23:46:08 -06:00
/*
* The ksize() function is only guaranteed to work for pointers
nommu: Correct kobjsize() page validity checks. This implements a few changes on top of the recent kobjsize() refactoring introduced by commit 6cfd53fc03670c7a544a56d441eb1a6cc800d72b. As Christoph points out: virt_to_head_page cannot return NULL. virt_to_page also does not return NULL. pfn_valid() needs to be used to figure out if a page is valid. Otherwise the page struct reference that was returned may have PageReserved() set to indicate that it is not a valid page. As discussed further in the thread, virt_addr_valid() is the preferable way to validate the object pointer in this case. In addition to fixing up the reserved page case, it also has the benefit of encapsulating the hack introduced by commit 4016a1390d07f15b267eecb20e76a48fd5c524ef on the impacted platforms, allowing us to get rid of the extra checking in kobjsize() for the platforms that don't perform this type of bizarre memory_end abuse (every nommu platform that isn't blackfin). If blackfin decides to get in line with every other platform and use PageReserved for the DMA pages in question, kobjsize() will also continue to work fine. It also turns out that compound_order() will give us back 0-order for non-head pages, so we can get rid of the PageCompound check and just use compound_order() directly. Clean that up while we're at it. Signed-off-by: Paul Mundt <lethal@linux-sh.org> Reviewed-by: Christoph Lameter <clameter@sgi.com> Acked-by: David Howells <dhowells@redhat.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-06-12 01:29:55 -06:00
* returned by kmalloc(). So handle arbitrary pointers here.
nommu: fix kobjsize() for SLOB and SLUB kobjsize() has been abusing page->index as a method for sorting out compound order, which blows up both for page cache pages, and SLOB's reuse of the index in struct slob_page. Presently we are not able to accurately size arbitrary pointers that don't come from kmalloc(), so the best we can do is sort out the compound order from the head page if it's a compound page, or default to 0-order if it's impossible to ksize() the object. Obviously this leaves quite a bit to be desired in terms of object sizing accuracy, but the behaviour is unchanged over the existing implementation, while fixing the page->index oopses originally reported here: http://marc.info/?l=linux-mm&m=121127773325245&w=2 Accuracy could also be improved by having SLUB and SLOB both set PG_slab on ksizeable pages, rather than just handling the __GFP_COMP cases irregardless of the PG_slab setting, as made possibly with Pekka's patches: http://marc.info/?l=linux-kernel&m=121139439900534&w=2 http://marc.info/?l=linux-kernel&m=121139440000537&w=2 http://marc.info/?l=linux-kernel&m=121139440000540&w=2 This is primarily a bugfix for nommu systems for 2.6.26, with the aim being to gradually kill off kobjsize() and its particular brand of object abuse entirely. Reviewed-by: Pekka Enberg <penberg@cs.helsinki.fi> Signed-off-by: Paul Mundt <lethal@linux-sh.org> Acked-by: David Howells <dhowells@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-06-05 23:46:08 -06:00
*/
nommu: Correct kobjsize() page validity checks. This implements a few changes on top of the recent kobjsize() refactoring introduced by commit 6cfd53fc03670c7a544a56d441eb1a6cc800d72b. As Christoph points out: virt_to_head_page cannot return NULL. virt_to_page also does not return NULL. pfn_valid() needs to be used to figure out if a page is valid. Otherwise the page struct reference that was returned may have PageReserved() set to indicate that it is not a valid page. As discussed further in the thread, virt_addr_valid() is the preferable way to validate the object pointer in this case. In addition to fixing up the reserved page case, it also has the benefit of encapsulating the hack introduced by commit 4016a1390d07f15b267eecb20e76a48fd5c524ef on the impacted platforms, allowing us to get rid of the extra checking in kobjsize() for the platforms that don't perform this type of bizarre memory_end abuse (every nommu platform that isn't blackfin). If blackfin decides to get in line with every other platform and use PageReserved for the DMA pages in question, kobjsize() will also continue to work fine. It also turns out that compound_order() will give us back 0-order for non-head pages, so we can get rid of the PageCompound check and just use compound_order() directly. Clean that up while we're at it. Signed-off-by: Paul Mundt <lethal@linux-sh.org> Reviewed-by: Christoph Lameter <clameter@sgi.com> Acked-by: David Howells <dhowells@redhat.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-06-12 01:29:55 -06:00
return PAGE_SIZE << compound_order(page);
}
/*
* get a list of pages in an address range belonging to the specified process
* and indicate the VMA that covers each page
* - this is potentially dodgy as we may end incrementing the page count of a
* slab page or a secondary page from a compound page
* - don't permit access to VMAs that don't support it, such as I/O mappings
*/
int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, int len, int write, int force,
struct page **pages, struct vm_area_struct **vmas)
{
struct vm_area_struct *vma;
unsigned long vm_flags;
int i;
/* calculate required read or write permissions.
* - if 'force' is set, we only require the "MAY" flags.
*/
vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
for (i = 0; i < len; i++) {
vma = find_vma(mm, start);
if (!vma)
goto finish_or_fault;
/* protect what we can, including chardevs */
if (vma->vm_flags & (VM_IO | VM_PFNMAP) ||
!(vm_flags & vma->vm_flags))
goto finish_or_fault;
if (pages) {
pages[i] = virt_to_page(start);
if (pages[i])
page_cache_get(pages[i]);
}
if (vmas)
vmas[i] = vma;
start += PAGE_SIZE;
}
return i;
finish_or_fault:
return i ? : -EFAULT;
}
EXPORT_SYMBOL(get_user_pages);
DEFINE_RWLOCK(vmlist_lock);
struct vm_struct *vmlist;
void vfree(const void *addr)
{
kfree(addr);
}
EXPORT_SYMBOL(vfree);
void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
{
/*
* You can't specify __GFP_HIGHMEM with kmalloc() since kmalloc()
* returns only a logical address.
*/
return kmalloc(size, (gfp_mask | __GFP_COMP) & ~__GFP_HIGHMEM);
}
EXPORT_SYMBOL(__vmalloc);
void *vmalloc_user(unsigned long size)
{
void *ret;
ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
PAGE_KERNEL);
if (ret) {
struct vm_area_struct *vma;
down_write(&current->mm->mmap_sem);
vma = find_vma(current->mm, (unsigned long)ret);
if (vma)
vma->vm_flags |= VM_USERMAP;
up_write(&current->mm->mmap_sem);
}
return ret;
}
EXPORT_SYMBOL(vmalloc_user);
struct page *vmalloc_to_page(const void *addr)
{
return virt_to_page(addr);
}
EXPORT_SYMBOL(vmalloc_to_page);
unsigned long vmalloc_to_pfn(const void *addr)
{
return page_to_pfn(virt_to_page(addr));
}
EXPORT_SYMBOL(vmalloc_to_pfn);
long vread(char *buf, char *addr, unsigned long count)
{
memcpy(buf, addr, count);
return count;
}
long vwrite(char *buf, char *addr, unsigned long count)
{
/* Don't allow overflow */
if ((unsigned long) addr + count < count)
count = -(unsigned long) addr;
memcpy(addr, buf, count);
return(count);
}
/*
* vmalloc - allocate virtually continguos memory
*
* @size: allocation size
*
* Allocate enough pages to cover @size from the page level
* allocator and map them into continguos kernel virtual space.
*
* For tight control over page level allocator and protection flags
* use __vmalloc() instead.
*/
void *vmalloc(unsigned long size)
{
return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL);
}
EXPORT_SYMBOL(vmalloc);
void *vmalloc_node(unsigned long size, int node)
{
return vmalloc(size);
}
EXPORT_SYMBOL(vmalloc_node);
/**
* vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
* @size: allocation size
*
* Allocate enough 32bit PA addressable pages to cover @size from the
* page level allocator and map them into continguos kernel virtual space.
*/
void *vmalloc_32(unsigned long size)
{
return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL);
}
EXPORT_SYMBOL(vmalloc_32);
/**
* vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory
* @size: allocation size
*
* The resulting memory area is 32bit addressable and zeroed so it can be
* mapped to userspace without leaking data.
*
* VM_USERMAP is set on the corresponding VMA so that subsequent calls to
* remap_vmalloc_range() are permissible.
*/
void *vmalloc_32_user(unsigned long size)
{
/*
* We'll have to sort out the ZONE_DMA bits for 64-bit,
* but for now this can simply use vmalloc_user() directly.
*/
return vmalloc_user(size);
}
EXPORT_SYMBOL(vmalloc_32_user);
void *vmap(struct page **pages, unsigned int count, unsigned long flags, pgprot_t prot)
{
BUG();
return NULL;
}
EXPORT_SYMBOL(vmap);
void vunmap(const void *addr)
{
BUG();
}
EXPORT_SYMBOL(vunmap);
/*
* Implement a stub for vmalloc_sync_all() if the architecture chose not to
* have one.
*/
void __attribute__((weak)) vmalloc_sync_all(void)
{
}
int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
struct page *page)
{
return -EINVAL;
}
EXPORT_SYMBOL(vm_insert_page);
/*
* sys_brk() for the most part doesn't need the global kernel
* lock, except when an application is doing something nasty
* like trying to un-brk an area that has already been mapped
* to a regular file. in this case, the unmapping will need
* to invoke file system routines that need the global lock.
*/
asmlinkage unsigned long sys_brk(unsigned long brk)
{
struct mm_struct *mm = current->mm;
if (brk < mm->start_brk || brk > mm->context.end_brk)
return mm->brk;
if (mm->brk == brk)
return mm->brk;
/*
* Always allow shrinking brk
*/
if (brk <= mm->brk) {
mm->brk = brk;
return brk;
}
/*
* Ok, looks good - let it rip.
*/
return mm->brk = brk;
}
#ifdef DEBUG
static void show_process_blocks(void)
{
struct vm_list_struct *vml;
printk("Process blocks %d:", current->pid);
for (vml = &current->mm->context.vmlist; vml; vml = vml->next) {
printk(" %p: %p", vml, vml->vma);
if (vml->vma)
printk(" (%d @%lx #%d)",
kobjsize((void *) vml->vma->vm_start),
vml->vma->vm_start,
atomic_read(&vml->vma->vm_usage));
printk(vml->next ? " ->" : ".\n");
}
}
#endif /* DEBUG */
/*
* add a VMA into a process's mm_struct in the appropriate place in the list
* - should be called with mm->mmap_sem held writelocked
*/
static void add_vma_to_mm(struct mm_struct *mm, struct vm_list_struct *vml)
{
struct vm_list_struct **ppv;
for (ppv = &current->mm->context.vmlist; *ppv; ppv = &(*ppv)->next)
if ((*ppv)->vma->vm_start > vml->vma->vm_start)
break;
vml->next = *ppv;
*ppv = vml;
}
/*
* look up the first VMA in which addr resides, NULL if none
* - should be called with mm->mmap_sem at least held readlocked
*/
struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
{
struct vm_list_struct *loop, *vml;
/* search the vm_start ordered list */
vml = NULL;
for (loop = mm->context.vmlist; loop; loop = loop->next) {
if (loop->vma->vm_start > addr)
break;
vml = loop;
}
if (vml && vml->vma->vm_end > addr)
return vml->vma;
return NULL;
}
EXPORT_SYMBOL(find_vma);
[PATCH] NOMMU: Make futexes work under NOMMU conditions Make futexes work under NOMMU conditions. This can be tested by running this in one shell: #define SYSERROR(X, Y) \ do { if ((long)(X) == -1L) { perror(Y); exit(1); }} while(0) int main() { int shmid, tmp, *f, n; shmid = shmget(23, 4, IPC_CREAT|0666); SYSERROR(shmid, "shmget"); f = shmat(shmid, NULL, 0); SYSERROR(f, "shmat"); n = *f; printf("WAIT: %p{%x}\n", f, n); tmp = futex(f, FUTEX_WAIT, n, NULL, NULL, 0); SYSERROR(tmp, "futex"); printf("WAITED: %d\n", tmp); tmp = shmdt(f); SYSERROR(tmp, "shmdt"); exit(0); } And then this in the other shell: #define SYSERROR(X, Y) \ do { if ((long)(X) == -1L) { perror(Y); exit(1); }} while(0) int main() { int shmid, tmp, *f; shmid = shmget(23, 4, IPC_CREAT|0666); SYSERROR(shmid, "shmget"); f = shmat(shmid, NULL, 0); SYSERROR(f, "shmat"); (*f)++; printf("WAKE: %p{%x}\n", f, *f); tmp = futex(f, FUTEX_WAKE, 1, NULL, NULL, 0); SYSERROR(tmp, "futex"); printf("WOKE: %d\n", tmp); tmp = shmdt(f); SYSERROR(tmp, "shmdt"); exit(0); } The first program will set up a SYSV IPC SHM segment and wait on a futex in it for the number at the start to change. The program will increment that number and wake the first program up. This leads to output of the form: SHELL 1 SHELL 2 ======================= ======================= # /dowait WAIT: 0xc32ac000{0} # /dowake WAKE: 0xc32ac000{1} WAITED: 0 WOKE: 1 Signed-off-by: David Howells <dhowells@redhat.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-27 02:50:22 -06:00
/*
* find a VMA
* - we don't extend stack VMAs under NOMMU conditions
*/
struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr)
{
return find_vma(mm, addr);
}
int expand_stack(struct vm_area_struct *vma, unsigned long address)
{
return -ENOMEM;
}
/*
* look up the first VMA exactly that exactly matches addr
* - should be called with mm->mmap_sem at least held readlocked
*/
static inline struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
unsigned long addr)
{
struct vm_list_struct *vml;
/* search the vm_start ordered list */
for (vml = mm->context.vmlist; vml; vml = vml->next) {
if (vml->vma->vm_start == addr)
return vml->vma;
if (vml->vma->vm_start > addr)
break;
}
return NULL;
}
/*
* find a VMA in the global tree
*/
static inline struct vm_area_struct *find_nommu_vma(unsigned long start)
{
struct vm_area_struct *vma;
struct rb_node *n = nommu_vma_tree.rb_node;
while (n) {
vma = rb_entry(n, struct vm_area_struct, vm_rb);
if (start < vma->vm_start)
n = n->rb_left;
else if (start > vma->vm_start)
n = n->rb_right;
else
return vma;
}
return NULL;
}
/*
* add a VMA in the global tree
*/
static void add_nommu_vma(struct vm_area_struct *vma)
{
struct vm_area_struct *pvma;
struct address_space *mapping;
struct rb_node **p = &nommu_vma_tree.rb_node;
struct rb_node *parent = NULL;
/* add the VMA to the mapping */
if (vma->vm_file) {
mapping = vma->vm_file->f_mapping;
flush_dcache_mmap_lock(mapping);
vma_prio_tree_insert(vma, &mapping->i_mmap);
flush_dcache_mmap_unlock(mapping);
}
/* add the VMA to the master list */
while (*p) {
parent = *p;
pvma = rb_entry(parent, struct vm_area_struct, vm_rb);
if (vma->vm_start < pvma->vm_start) {
p = &(*p)->rb_left;
}
else if (vma->vm_start > pvma->vm_start) {
p = &(*p)->rb_right;
}
else {
/* mappings are at the same address - this can only
* happen for shared-mem chardevs and shared file
* mappings backed by ramfs/tmpfs */
BUG_ON(!(pvma->vm_flags & VM_SHARED));
if (vma < pvma)
p = &(*p)->rb_left;
else if (vma > pvma)
p = &(*p)->rb_right;
else
BUG();
}
}
rb_link_node(&vma->vm_rb, parent, p);
rb_insert_color(&vma->vm_rb, &nommu_vma_tree);
}
/*
* delete a VMA from the global list
*/
static void delete_nommu_vma(struct vm_area_struct *vma)
{
struct address_space *mapping;
/* remove the VMA from the mapping */
if (vma->vm_file) {
mapping = vma->vm_file->f_mapping;
flush_dcache_mmap_lock(mapping);
vma_prio_tree_remove(vma, &mapping->i_mmap);
flush_dcache_mmap_unlock(mapping);
}
/* remove from the master list */
rb_erase(&vma->vm_rb, &nommu_vma_tree);
}
/*
* determine whether a mapping should be permitted and, if so, what sort of
* mapping we're capable of supporting
*/
static int validate_mmap_request(struct file *file,
unsigned long addr,
unsigned long len,
unsigned long prot,
unsigned long flags,
unsigned long pgoff,
unsigned long *_capabilities)
{
unsigned long capabilities;
unsigned long reqprot = prot;
int ret;
/* do the simple checks first */
if (flags & MAP_FIXED || addr) {
printk(KERN_DEBUG
"%d: Can't do fixed-address/overlay mmap of RAM\n",
current->pid);
return -EINVAL;
}
if ((flags & MAP_TYPE) != MAP_PRIVATE &&
(flags & MAP_TYPE) != MAP_SHARED)
return -EINVAL;
if (!len)
return -EINVAL;
/* Careful about overflows.. */
len = PAGE_ALIGN(len);
if (!len || len > TASK_SIZE)
return -ENOMEM;
/* offset overflow? */
if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
return -EOVERFLOW;
if (file) {
/* validate file mapping requests */
struct address_space *mapping;
/* files must support mmap */
if (!file->f_op || !file->f_op->mmap)
return -ENODEV;
/* work out if what we've got could possibly be shared
* - we support chardevs that provide their own "memory"
* - we support files/blockdevs that are memory backed
*/
mapping = file->f_mapping;
if (!mapping)
mapping = file->f_path.dentry->d_inode->i_mapping;
capabilities = 0;
if (mapping && mapping->backing_dev_info)
capabilities = mapping->backing_dev_info->capabilities;
if (!capabilities) {
/* no explicit capabilities set, so assume some
* defaults */
switch (file->f_path.dentry->d_inode->i_mode & S_IFMT) {
case S_IFREG:
case S_IFBLK:
capabilities = BDI_CAP_MAP_COPY;
break;
case S_IFCHR:
capabilities =
BDI_CAP_MAP_DIRECT |
BDI_CAP_READ_MAP |
BDI_CAP_WRITE_MAP;
break;
default:
return -EINVAL;
}
}
/* eliminate any capabilities that we can't support on this
* device */
if (!file->f_op->get_unmapped_area)
capabilities &= ~BDI_CAP_MAP_DIRECT;
if (!file->f_op->read)
capabilities &= ~BDI_CAP_MAP_COPY;
if (flags & MAP_SHARED) {
/* do checks for writing, appending and locking */
if ((prot & PROT_WRITE) &&
!(file->f_mode & FMODE_WRITE))
return -EACCES;
if (IS_APPEND(file->f_path.dentry->d_inode) &&
(file->f_mode & FMODE_WRITE))
return -EACCES;
if (locks_verify_locked(file->f_path.dentry->d_inode))
return -EAGAIN;
if (!(capabilities & BDI_CAP_MAP_DIRECT))
return -ENODEV;
if (((prot & PROT_READ) && !(capabilities & BDI_CAP_READ_MAP)) ||
((prot & PROT_WRITE) && !(capabilities & BDI_CAP_WRITE_MAP)) ||
((prot & PROT_EXEC) && !(capabilities & BDI_CAP_EXEC_MAP))
) {
printk("MAP_SHARED not completely supported on !MMU\n");
return -EINVAL;
}
/* we mustn't privatise shared mappings */
capabilities &= ~BDI_CAP_MAP_COPY;
}
else {
/* we're going to read the file into private memory we
* allocate */
if (!(capabilities & BDI_CAP_MAP_COPY))
return -ENODEV;
/* we don't permit a private writable mapping to be
* shared with the backing device */
if (prot & PROT_WRITE)
capabilities &= ~BDI_CAP_MAP_DIRECT;
}
/* handle executable mappings and implied executable
* mappings */
if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) {
if (prot & PROT_EXEC)
return -EPERM;
}
else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) {
/* handle implication of PROT_EXEC by PROT_READ */
if (current->personality & READ_IMPLIES_EXEC) {
if (capabilities & BDI_CAP_EXEC_MAP)
prot |= PROT_EXEC;
}
}
else if ((prot & PROT_READ) &&
(prot & PROT_EXEC) &&
!(capabilities & BDI_CAP_EXEC_MAP)
) {
/* backing file is not executable, try to copy */
capabilities &= ~BDI_CAP_MAP_DIRECT;
}
}
else {
/* anonymous mappings are always memory backed and can be
* privately mapped
*/
capabilities = BDI_CAP_MAP_COPY;
/* handle PROT_EXEC implication by PROT_READ */
if ((prot & PROT_READ) &&
(current->personality & READ_IMPLIES_EXEC))
prot |= PROT_EXEC;
}
/* allow the security API to have its say */
ret = security_file_mmap(file, reqprot, prot, flags, addr, 0);
if (ret < 0)
return ret;
/* looks okay */
*_capabilities = capabilities;
return 0;
}
/*
* we've determined that we can make the mapping, now translate what we
* now know into VMA flags
*/
static unsigned long determine_vm_flags(struct file *file,
unsigned long prot,
unsigned long flags,
unsigned long capabilities)
{
unsigned long vm_flags;
vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags);
vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
/* vm_flags |= mm->def_flags; */
if (!(capabilities & BDI_CAP_MAP_DIRECT)) {
/* attempt to share read-only copies of mapped file chunks */
if (file && !(prot & PROT_WRITE))
vm_flags |= VM_MAYSHARE;
}
else {
/* overlay a shareable mapping on the backing device or inode
* if possible - used for chardevs, ramfs/tmpfs/shmfs and
* romfs/cramfs */
if (flags & MAP_SHARED)
vm_flags |= VM_MAYSHARE | VM_SHARED;
else if ((((vm_flags & capabilities) ^ vm_flags) & BDI_CAP_VMFLAGS) == 0)
vm_flags |= VM_MAYSHARE;
}
/* refuse to let anyone share private mappings with this process if
* it's being traced - otherwise breakpoints set in it may interfere
* with another untraced process
*/
if ((flags & MAP_PRIVATE) && tracehook_expect_breakpoints(current))
vm_flags &= ~VM_MAYSHARE;
return vm_flags;
}
/*
* set up a shared mapping on a file
*/
static int do_mmap_shared_file(struct vm_area_struct *vma, unsigned long len)
{
int ret;
ret = vma->vm_file->f_op->mmap(vma->vm_file, vma);
if (ret != -ENOSYS)
return ret;
/* getting an ENOSYS error indicates that direct mmap isn't
* possible (as opposed to tried but failed) so we'll fall
* through to making a private copy of the data and mapping
* that if we can */
return -ENODEV;
}
/*
* set up a private mapping or an anonymous shared mapping
*/
static int do_mmap_private(struct vm_area_struct *vma, unsigned long len)
{
void *base;
int ret;
/* invoke the file's mapping function so that it can keep track of
* shared mappings on devices or memory
* - VM_MAYSHARE will be set if it may attempt to share
*/
if (vma->vm_file) {
ret = vma->vm_file->f_op->mmap(vma->vm_file, vma);
if (ret != -ENOSYS) {
/* shouldn't return success if we're not sharing */
BUG_ON(ret == 0 && !(vma->vm_flags & VM_MAYSHARE));
return ret; /* success or a real error */
}
/* getting an ENOSYS error indicates that direct mmap isn't
* possible (as opposed to tried but failed) so we'll try to
* make a private copy of the data and map that instead */
}
/* allocate some memory to hold the mapping
* - note that this may not return a page-aligned address if the object
* we're allocating is smaller than a page
*/
base = kmalloc(len, GFP_KERNEL|__GFP_COMP);
if (!base)
goto enomem;
vma->vm_start = (unsigned long) base;
vma->vm_end = vma->vm_start + len;
vma->vm_flags |= VM_MAPPED_COPY;
#ifdef WARN_ON_SLACK
if (len + WARN_ON_SLACK <= kobjsize(result))
printk("Allocation of %lu bytes from process %d has %lu bytes of slack\n",
len, current->pid, kobjsize(result) - len);
#endif
if (vma->vm_file) {
/* read the contents of a file into the copy */
mm_segment_t old_fs;
loff_t fpos;
fpos = vma->vm_pgoff;
fpos <<= PAGE_SHIFT;
old_fs = get_fs();
set_fs(KERNEL_DS);
ret = vma->vm_file->f_op->read(vma->vm_file, base, len, &fpos);
set_fs(old_fs);
if (ret < 0)
goto error_free;
/* clear the last little bit */
if (ret < len)
memset(base + ret, 0, len - ret);
} else {
/* if it's an anonymous mapping, then just clear it */
memset(base, 0, len);
}
return 0;
error_free:
kfree(base);
vma->vm_start = 0;
return ret;
enomem:
printk("Allocation of length %lu from process %d failed\n",
len, current->pid);
show_free_areas();
return -ENOMEM;
}
/*
* handle mapping creation for uClinux
*/
unsigned long do_mmap_pgoff(struct file *file,
unsigned long addr,
unsigned long len,
unsigned long prot,
unsigned long flags,
unsigned long pgoff)
{
struct vm_list_struct *vml = NULL;
struct vm_area_struct *vma = NULL;
struct rb_node *rb;
unsigned long capabilities, vm_flags;
void *result;
int ret;
if (!(flags & MAP_FIXED))
addr = round_hint_to_min(addr);
/* decide whether we should attempt the mapping, and if so what sort of
* mapping */
ret = validate_mmap_request(file, addr, len, prot, flags, pgoff,
&capabilities);
if (ret < 0)
return ret;
/* we've determined that we can make the mapping, now translate what we
* now know into VMA flags */
vm_flags = determine_vm_flags(file, prot, flags, capabilities);
/* we're going to need to record the mapping if it works */
vml = kzalloc(sizeof(struct vm_list_struct), GFP_KERNEL);
if (!vml)
goto error_getting_vml;
down_write(&nommu_vma_sem);
/* if we want to share, we need to check for VMAs created by other
* mmap() calls that overlap with our proposed mapping
* - we can only share with an exact match on most regular files
* - shared mappings on character devices and memory backed files are
* permitted to overlap inexactly as far as we are concerned for in
* these cases, sharing is handled in the driver or filesystem rather
* than here
*/
if (vm_flags & VM_MAYSHARE) {
unsigned long pglen = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
unsigned long vmpglen;
[PATCH] NOMMU: make SYSV SHM nattch work correctly Make the SYSV SHM nattch counter work correctly by forcing multiple VMAs to be produced to represent MAP_SHARED segments, even if they overlap exactly. Using this test program: http://people.redhat.com/~dhowells/doshm.c Run as: doshm sysv I can see nattch going from one before the patch: # /doshm sysv Command: sysv shmid: 65536 memory: 0xc3700000 c0b00000-c0b04000 rw-p 00000000 00:00 0 c0bb0000-c0bba788 r-xs 00000000 00:0b 14582157 /lib/ld-uClibc-0.9.28.so c3180000-c31dede4 r-xs 00000000 00:0b 14582179 /lib/libuClibc-0.9.28.so c3520000-c352278c rw-p 00000000 00:0b 13763417 /doshm c3584000-c35865e8 r-xs 00000000 00:0b 13763417 /doshm c3588000-c358aa00 rw-p 00008000 00:0b 14582157 /lib/ld-uClibc-0.9.28.so c3590000-c359b6c0 rw-p 00000000 00:00 0 c3620000-c3640000 rwxp 00000000 00:00 0 c3700000-c37fa000 rw-S 00000000 00:06 1411 /SYSV00000000 (deleted) c3700000-c37fa000 rw-S 00000000 00:06 1411 /SYSV00000000 (deleted) nattch 1 To two after the patch: # /doshm sysv Command: sysv shmid: 0 memory: 0xc3700000 c0bb0000-c0bba788 r-xs 00000000 00:0b 14582157 /lib/ld-uClibc-0.9.28.so c3180000-c31dede4 r-xs 00000000 00:0b 14582179 /lib/libuClibc-0.9.28.so c3320000-c3340000 rwxp 00000000 00:00 0 c3530000-c35325e8 r-xs 00000000 00:0b 13763417 /doshm c3534000-c353678c rw-p 00000000 00:0b 13763417 /doshm c3538000-c353aa00 rw-p 00008000 00:0b 14582157 /lib/ld-uClibc-0.9.28.so c3590000-c359b6c0 rw-p 00000000 00:00 0 c35a4000-c35a8000 rw-p 00000000 00:00 0 c3700000-c37fa000 rw-S 00000000 00:06 1369 /SYSV00000000 (deleted) c3700000-c37fa000 rw-S 00000000 00:06 1369 /SYSV00000000 (deleted) nattch 2 That's +1 to nattch for each shmat() made. Signed-off-by: David Howells <dhowells@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-03-22 02:11:24 -06:00
/* suppress VMA sharing for shared regions */
if (vm_flags & VM_SHARED &&
capabilities & BDI_CAP_MAP_DIRECT)
goto dont_share_VMAs;
for (rb = rb_first(&nommu_vma_tree); rb; rb = rb_next(rb)) {
vma = rb_entry(rb, struct vm_area_struct, vm_rb);
if (!(vma->vm_flags & VM_MAYSHARE))
continue;
/* search for overlapping mappings on the same file */
if (vma->vm_file->f_path.dentry->d_inode != file->f_path.dentry->d_inode)
continue;
if (vma->vm_pgoff >= pgoff + pglen)
continue;
vmpglen = vma->vm_end - vma->vm_start + PAGE_SIZE - 1;
vmpglen >>= PAGE_SHIFT;
if (pgoff >= vma->vm_pgoff + vmpglen)
continue;
/* handle inexactly overlapping matches between mappings */
if (vma->vm_pgoff != pgoff || vmpglen != pglen) {
if (!(capabilities & BDI_CAP_MAP_DIRECT))
goto sharing_violation;
continue;
}
/* we've found a VMA we can share */
atomic_inc(&vma->vm_usage);
vml->vma = vma;
result = (void *) vma->vm_start;
goto shared;
}
[PATCH] NOMMU: make SYSV SHM nattch work correctly Make the SYSV SHM nattch counter work correctly by forcing multiple VMAs to be produced to represent MAP_SHARED segments, even if they overlap exactly. Using this test program: http://people.redhat.com/~dhowells/doshm.c Run as: doshm sysv I can see nattch going from one before the patch: # /doshm sysv Command: sysv shmid: 65536 memory: 0xc3700000 c0b00000-c0b04000 rw-p 00000000 00:00 0 c0bb0000-c0bba788 r-xs 00000000 00:0b 14582157 /lib/ld-uClibc-0.9.28.so c3180000-c31dede4 r-xs 00000000 00:0b 14582179 /lib/libuClibc-0.9.28.so c3520000-c352278c rw-p 00000000 00:0b 13763417 /doshm c3584000-c35865e8 r-xs 00000000 00:0b 13763417 /doshm c3588000-c358aa00 rw-p 00008000 00:0b 14582157 /lib/ld-uClibc-0.9.28.so c3590000-c359b6c0 rw-p 00000000 00:00 0 c3620000-c3640000 rwxp 00000000 00:00 0 c3700000-c37fa000 rw-S 00000000 00:06 1411 /SYSV00000000 (deleted) c3700000-c37fa000 rw-S 00000000 00:06 1411 /SYSV00000000 (deleted) nattch 1 To two after the patch: # /doshm sysv Command: sysv shmid: 0 memory: 0xc3700000 c0bb0000-c0bba788 r-xs 00000000 00:0b 14582157 /lib/ld-uClibc-0.9.28.so c3180000-c31dede4 r-xs 00000000 00:0b 14582179 /lib/libuClibc-0.9.28.so c3320000-c3340000 rwxp 00000000 00:00 0 c3530000-c35325e8 r-xs 00000000 00:0b 13763417 /doshm c3534000-c353678c rw-p 00000000 00:0b 13763417 /doshm c3538000-c353aa00 rw-p 00008000 00:0b 14582157 /lib/ld-uClibc-0.9.28.so c3590000-c359b6c0 rw-p 00000000 00:00 0 c35a4000-c35a8000 rw-p 00000000 00:00 0 c3700000-c37fa000 rw-S 00000000 00:06 1369 /SYSV00000000 (deleted) c3700000-c37fa000 rw-S 00000000 00:06 1369 /SYSV00000000 (deleted) nattch 2 That's +1 to nattch for each shmat() made. Signed-off-by: David Howells <dhowells@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-03-22 02:11:24 -06:00
dont_share_VMAs:
vma = NULL;
/* obtain the address at which to make a shared mapping
* - this is the hook for quasi-memory character devices to
* tell us the location of a shared mapping
*/
if (file && file->f_op->get_unmapped_area) {
addr = file->f_op->get_unmapped_area(file, addr, len,
pgoff, flags);
if (IS_ERR((void *) addr)) {
ret = addr;
if (ret != (unsigned long) -ENOSYS)
goto error;
/* the driver refused to tell us where to site
* the mapping so we'll have to attempt to copy
* it */
ret = (unsigned long) -ENODEV;
if (!(capabilities & BDI_CAP_MAP_COPY))
goto error;
capabilities &= ~BDI_CAP_MAP_DIRECT;
}
}
}
/* we're going to need a VMA struct as well */
vma = kzalloc(sizeof(struct vm_area_struct), GFP_KERNEL);
if (!vma)
goto error_getting_vma;
INIT_LIST_HEAD(&vma->anon_vma_node);
atomic_set(&vma->vm_usage, 1);
if (file) {
get_file(file);
if (vm_flags & VM_EXECUTABLE) {
added_exe_file_vma(current->mm);
vma->vm_mm = current->mm;
}
}
vma->vm_file = file;
vma->vm_flags = vm_flags;
vma->vm_start = addr;
vma->vm_end = addr + len;
vma->vm_pgoff = pgoff;
vml->vma = vma;
/* set up the mapping */
if (file && vma->vm_flags & VM_SHARED)
ret = do_mmap_shared_file(vma, len);
else
ret = do_mmap_private(vma, len);
if (ret < 0)
goto error;
/* okay... we have a mapping; now we have to register it */
result = (void *) vma->vm_start;
if (vma->vm_flags & VM_MAPPED_COPY) {
realalloc += kobjsize(result);
askedalloc += len;
}
realalloc += kobjsize(vma);
askedalloc += sizeof(*vma);
current->mm->total_vm += len >> PAGE_SHIFT;
add_nommu_vma(vma);
shared:
realalloc += kobjsize(vml);
askedalloc += sizeof(*vml);
add_vma_to_mm(current->mm, vml);
up_write(&nommu_vma_sem);
if (prot & PROT_EXEC)
flush_icache_range((unsigned long) result,
(unsigned long) result + len);
#ifdef DEBUG
printk("do_mmap:\n");
show_process_blocks();
#endif
return (unsigned long) result;
error:
up_write(&nommu_vma_sem);
kfree(vml);
if (vma) {
if (vma->vm_file) {
fput(vma->vm_file);
if (vma->vm_flags & VM_EXECUTABLE)
removed_exe_file_vma(vma->vm_mm);
}
kfree(vma);
}
return ret;
sharing_violation:
up_write(&nommu_vma_sem);
printk("Attempt to share mismatched mappings\n");
kfree(vml);
return -EINVAL;
error_getting_vma:
up_write(&nommu_vma_sem);
kfree(vml);
printk("Allocation of vma for %lu byte allocation from process %d failed\n",
len, current->pid);
show_free_areas();
return -ENOMEM;
error_getting_vml:
printk("Allocation of vml for %lu byte allocation from process %d failed\n",
len, current->pid);
show_free_areas();
return -ENOMEM;
}
EXPORT_SYMBOL(do_mmap_pgoff);
/*
* handle mapping disposal for uClinux
*/
static void put_vma(struct mm_struct *mm, struct vm_area_struct *vma)
{
if (vma) {
down_write(&nommu_vma_sem);
if (atomic_dec_and_test(&vma->vm_usage)) {
delete_nommu_vma(vma);
if (vma->vm_ops && vma->vm_ops->close)
vma->vm_ops->close(vma);
/* IO memory and memory shared directly out of the pagecache from
* ramfs/tmpfs mustn't be released here */
if (vma->vm_flags & VM_MAPPED_COPY) {
realalloc -= kobjsize((void *) vma->vm_start);
askedalloc -= vma->vm_end - vma->vm_start;
kfree((void *) vma->vm_start);
}
realalloc -= kobjsize(vma);
askedalloc -= sizeof(*vma);
if (vma->vm_file) {
fput(vma->vm_file);
if (vma->vm_flags & VM_EXECUTABLE)
removed_exe_file_vma(mm);
}
kfree(vma);
}
up_write(&nommu_vma_sem);
}
}
/*
* release a mapping
* - under NOMMU conditions the parameters must match exactly to the mapping to
* be removed
*/
int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len)
{
struct vm_list_struct *vml, **parent;
unsigned long end = addr + len;
#ifdef DEBUG
printk("do_munmap:\n");
#endif
for (parent = &mm->context.vmlist; *parent; parent = &(*parent)->next) {
if ((*parent)->vma->vm_start > addr)
break;
if ((*parent)->vma->vm_start == addr &&
((len == 0) || ((*parent)->vma->vm_end == end)))
goto found;
}
printk("munmap of non-mmaped memory by process %d (%s): %p\n",
current->pid, current->comm, (void *) addr);
return -EINVAL;
found:
vml = *parent;
put_vma(mm, vml->vma);
*parent = vml->next;
realalloc -= kobjsize(vml);
askedalloc -= sizeof(*vml);
kfree(vml);
[PATCH] mm: update_hiwaters just in time update_mem_hiwater has attracted various criticisms, in particular from those concerned with mm scalability. Originally it was called whenever rss or total_vm got raised. Then many of those callsites were replaced by a timer tick call from account_system_time. Now Frank van Maarseveen reports that to be found inadequate. How about this? Works for Frank. Replace update_mem_hiwater, a poor combination of two unrelated ops, by macros update_hiwater_rss and update_hiwater_vm. Don't attempt to keep mm->hiwater_rss up to date at timer tick, nor every time we raise rss (usually by 1): those are hot paths. Do the opposite, update only when about to lower rss (usually by many), or just before final accounting in do_exit. Handle mm->hiwater_vm in the same way, though it's much less of an issue. Demand that whoever collects these hiwater statistics do the work of taking the maximum with rss or total_vm. And there has been no collector of these hiwater statistics in the tree. The new convention needs an example, so match Frank's usage by adding a VmPeak line above VmSize to /proc/<pid>/status, and also a VmHWM line above VmRSS (High-Water-Mark or High-Water-Memory). There was a particular anomaly during mremap move, that hiwater_vm might be captured too high. A fleeting such anomaly remains, but it's quickly corrected now, whereas before it would stick. What locking? None: if the app is racy then these statistics will be racy, it's not worth any overhead to make them exact. But whenever it suits, hiwater_vm is updated under exclusive mmap_sem, and hiwater_rss under page_table_lock (for now) or with preemption disabled (later on): without going to any trouble, minimize the time between reading current values and updating, to minimize those occasions when a racing thread bumps a count up and back down in between. Signed-off-by: Hugh Dickins <hugh@veritas.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-29 19:16:18 -06:00
update_hiwater_vm(mm);
mm->total_vm -= len >> PAGE_SHIFT;
#ifdef DEBUG
show_process_blocks();
#endif
return 0;
}
EXPORT_SYMBOL(do_munmap);
asmlinkage long sys_munmap(unsigned long addr, size_t len)
{
int ret;
struct mm_struct *mm = current->mm;
down_write(&mm->mmap_sem);
ret = do_munmap(mm, addr, len);
up_write(&mm->mmap_sem);
return ret;
}
/*
* Release all mappings
*/
void exit_mmap(struct mm_struct * mm)
{
struct vm_list_struct *tmp;
if (mm) {
#ifdef DEBUG
printk("Exit_mmap:\n");
#endif
mm->total_vm = 0;
while ((tmp = mm->context.vmlist)) {
mm->context.vmlist = tmp->next;
put_vma(mm, tmp->vma);
realalloc -= kobjsize(tmp);
askedalloc -= sizeof(*tmp);
kfree(tmp);
}
#ifdef DEBUG
show_process_blocks();
#endif
}
}
unsigned long do_brk(unsigned long addr, unsigned long len)
{
return -ENOMEM;
}
/*
* expand (or shrink) an existing mapping, potentially moving it at the same
* time (controlled by the MREMAP_MAYMOVE flag and available VM space)
*
* under NOMMU conditions, we only permit changing a mapping's size, and only
* as long as it stays within the hole allocated by the kmalloc() call in
* do_mmap_pgoff() and the block is not shareable
*
* MREMAP_FIXED is not supported under NOMMU conditions
*/
unsigned long do_mremap(unsigned long addr,
unsigned long old_len, unsigned long new_len,
unsigned long flags, unsigned long new_addr)
{
struct vm_area_struct *vma;
/* insanity checks first */
if (new_len == 0)
return (unsigned long) -EINVAL;
if (flags & MREMAP_FIXED && new_addr != addr)
return (unsigned long) -EINVAL;
vma = find_vma_exact(current->mm, addr);
if (!vma)
return (unsigned long) -EINVAL;
if (vma->vm_end != vma->vm_start + old_len)
return (unsigned long) -EFAULT;
if (vma->vm_flags & VM_MAYSHARE)
return (unsigned long) -EPERM;
if (new_len > kobjsize((void *) addr))
return (unsigned long) -ENOMEM;
/* all checks complete - do it */
vma->vm_end = vma->vm_start + new_len;
askedalloc -= old_len;
askedalloc += new_len;
return vma->vm_start;
}
EXPORT_SYMBOL(do_mremap);
asmlinkage unsigned long sys_mremap(unsigned long addr,
unsigned long old_len, unsigned long new_len,
unsigned long flags, unsigned long new_addr)
{
unsigned long ret;
down_write(&current->mm->mmap_sem);
ret = do_mremap(addr, old_len, new_len, flags, new_addr);
up_write(&current->mm->mmap_sem);
return ret;
}
struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
unsigned int foll_flags)
{
return NULL;
}
int remap_pfn_range(struct vm_area_struct *vma, unsigned long from,
unsigned long to, unsigned long size, pgprot_t prot)
{
vma->vm_start = vma->vm_pgoff << PAGE_SHIFT;
return 0;
}
EXPORT_SYMBOL(remap_pfn_range);
int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
unsigned long pgoff)
{
unsigned int size = vma->vm_end - vma->vm_start;
if (!(vma->vm_flags & VM_USERMAP))
return -EINVAL;
vma->vm_start = (unsigned long)(addr + (pgoff << PAGE_SHIFT));
vma->vm_end = vma->vm_start + size;
return 0;
}
EXPORT_SYMBOL(remap_vmalloc_range);
void swap_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
{
}
unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr,
unsigned long len, unsigned long pgoff, unsigned long flags)
{
return -ENOMEM;
}
[PATCH] Avoiding mmap fragmentation Ingo recently introduced a great speedup for allocating new mmaps using the free_area_cache pointer which boosts the specweb SSL benchmark by 4-5% and causes huge performance increases in thread creation. The downside of this patch is that it does lead to fragmentation in the mmap-ed areas (visible via /proc/self/maps), such that some applications that work fine under 2.4 kernels quickly run out of memory on any 2.6 kernel. The problem is twofold: 1) the free_area_cache is used to continue a search for memory where the last search ended. Before the change new areas were always searched from the base address on. So now new small areas are cluttering holes of all sizes throughout the whole mmap-able region whereas before small holes tended to close holes near the base leaving holes far from the base large and available for larger requests. 2) the free_area_cache also is set to the location of the last munmap-ed area so in scenarios where we allocate e.g. five regions of 1K each, then free regions 4 2 3 in this order the next request for 1K will be placed in the position of the old region 3, whereas before we appended it to the still active region 1, placing it at the location of the old region 2. Before we had 1 free region of 2K, now we only get two free regions of 1K -> fragmentation. The patch addresses thes issues by introducing yet another cache descriptor cached_hole_size that contains the largest known hole size below the current free_area_cache. If a new request comes in the size is compared against the cached_hole_size and if the request can be filled with a hole below free_area_cache the search is started from the base instead. The results look promising: Whereas 2.6.12-rc4 fragments quickly and my (earlier posted) leakme.c test program terminates after 50000+ iterations with 96 distinct and fragmented maps in /proc/self/maps it performs nicely (as expected) with thread creation, Ingo's test_str02 with 20000 threads requires 0.7s system time. Taking out Ingo's patch (un-patch available per request) by basically deleting all mentions of free_area_cache from the kernel and starting the search for new memory always at the respective bases we observe: leakme terminates successfully with 11 distinctive hardly fragmented areas in /proc/self/maps but thread creating is gringdingly slow: 30+s(!) system time for Ingo's test_str02 with 20000 threads. Now - drumroll ;-) the appended patch works fine with leakme: it ends with only 7 distinct areas in /proc/self/maps and also thread creation seems sufficiently fast with 0.71s for 20000 threads. Signed-off-by: Wolfgang Wander <wwc@rentec.com> Credit-to: "Richard Purdie" <rpurdie@rpsys.net> Signed-off-by: Ken Chen <kenneth.w.chen@intel.com> Acked-by: Ingo Molnar <mingo@elte.hu> (partly) Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-21 18:14:49 -06:00
void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
{
}
void unmap_mapping_range(struct address_space *mapping,
loff_t const holebegin, loff_t const holelen,
int even_cows)
{
}
EXPORT_SYMBOL(unmap_mapping_range);
/*
* ask for an unmapped area at which to create a mapping on a file
*/
unsigned long get_unmapped_area(struct file *file, unsigned long addr,
unsigned long len, unsigned long pgoff,
unsigned long flags)
{
unsigned long (*get_area)(struct file *, unsigned long, unsigned long,
unsigned long, unsigned long);
get_area = current->mm->get_unmapped_area;
if (file && file->f_op && file->f_op->get_unmapped_area)
get_area = file->f_op->get_unmapped_area;
if (!get_area)
return -ENOSYS;
return get_area(file, addr, len, pgoff, flags);
}
EXPORT_SYMBOL(get_unmapped_area);
/*
* Check that a process has enough memory to allocate a new virtual
* mapping. 0 means there is enough memory for the allocation to
* succeed and -ENOMEM implies there is not.
*
* We currently support three overcommit policies, which are set via the
* vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting
*
* Strict overcommit modes added 2002 Feb 26 by Alan Cox.
* Additional code 2002 Jul 20 by Robert Love.
*
* cap_sys_admin is 1 if the process has admin privileges, 0 otherwise.
*
* Note this is a helper function intended to be used by LSMs which
* wish to use this logic.
*/
int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
{
unsigned long free, allowed;
vm_acct_memory(pages);
/*
* Sometimes we want to use more memory than we have
*/
if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
return 0;
if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
unsigned long n;
free = global_page_state(NR_FILE_PAGES);
free += nr_swap_pages;
/*
* Any slabs which are created with the
* SLAB_RECLAIM_ACCOUNT flag claim to have contents
* which are reclaimable, under pressure. The dentry
* cache and most inode caches should fall into this
*/
free += global_page_state(NR_SLAB_RECLAIMABLE);
/*
* Leave the last 3% for root
*/
if (!cap_sys_admin)
free -= free / 32;
if (free > pages)
return 0;
/*
* nr_free_pages() is very expensive on large systems,
* only call if we're about to fail.
*/
n = nr_free_pages();
/*
* Leave reserved pages. The pages are not for anonymous pages.
*/
if (n <= totalreserve_pages)
goto error;
else
n -= totalreserve_pages;
/*
* Leave the last 3% for root
*/
if (!cap_sys_admin)
n -= n / 32;
free += n;
if (free > pages)
return 0;
goto error;
}
allowed = totalram_pages * sysctl_overcommit_ratio / 100;
/*
* Leave the last 3% for root
*/
if (!cap_sys_admin)
allowed -= allowed / 32;
allowed += total_swap_pages;
/* Don't let a single process grow too big:
leave 3% of the size of this process for other processes */
allowed -= current->mm->total_vm / 32;
/*
* cast `allowed' as a signed long because vm_committed_space
* sometimes has a negative value
*/
if (atomic_long_read(&vm_committed_space) < (long)allowed)
return 0;
error:
vm_unacct_memory(pages);
return -ENOMEM;
}
int in_gate_area_no_task(unsigned long addr)
{
return 0;
}
int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
BUG();
return 0;
}
EXPORT_SYMBOL(filemap_fault);
/*
* Access another process' address space.
* - source/target buffer must be kernel space
*/
int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
{
struct vm_area_struct *vma;
struct mm_struct *mm;
if (addr + len < addr)
return 0;
mm = get_task_mm(tsk);
if (!mm)
return 0;
down_read(&mm->mmap_sem);
/* the access must start within one of the target process's mappings */
vma = find_vma(mm, addr);
if (vma) {
/* don't overrun this mapping */
if (addr + len >= vma->vm_end)
len = vma->vm_end - addr;
/* only read or write mappings where it is permitted */
if (write && vma->vm_flags & VM_MAYWRITE)
len -= copy_to_user((void *) addr, buf, len);
else if (!write && vma->vm_flags & VM_MAYREAD)
len -= copy_from_user(buf, (void *) addr, len);
else
len = 0;
} else {
len = 0;
}
up_read(&mm->mmap_sem);
mmput(mm);
return len;
}