From fc47e7b592dc45b03c27b0a4c8f2d215dae9944d Mon Sep 17 00:00:00 2001 From: Jarek Poplawski Date: Wed, 6 Sep 2006 00:03:39 -0700 Subject: [PATCH 01/21] [PATCH] lockdep ifdef fix With CONFIG_SMP=y CONFIG_PREEMPT=y CONFIG_LOCKDEP=y CONFIG_DEBUG_LOCK_ALLOC=y # CONFIG_PROVE_LOCKING is not set spin_unlock_irqrestore() goes through lockdep but spin_lock_irqsave() doesn't. Apparently, bad things happen. Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/spinlock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/spinlock.c b/kernel/spinlock.c index bfd6ad9c0330..fb524b009eef 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -72,7 +72,7 @@ EXPORT_SYMBOL(_write_trylock); * not re-enabled during lock-acquire (which the preempt-spin-ops do): */ #if !defined(CONFIG_PREEMPT) || !defined(CONFIG_SMP) || \ - defined(CONFIG_PROVE_LOCKING) + defined(CONFIG_DEBUG_LOCK_ALLOC) void __lockfunc _read_lock(rwlock_t *lock) { From 6dba28379edc08327ede01ff41bd3c9dd46a7fa0 Mon Sep 17 00:00:00 2001 From: Henrik Kretzschmar Date: Wed, 6 Sep 2006 00:03:40 -0700 Subject: [PATCH 02/21] [PATCH] Documentation for lock_key in struct hrtimer_base Fixes an error message on make xmldocs. Signed-off-by: Henrik Kretzschmar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hrtimer.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index e4bccbcc2750..4fc379de6c2f 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -80,6 +80,7 @@ struct hrtimer_sleeper { * @get_softirq_time: function to retrieve the current time from the softirq * @curr_timer: the timer which is executing a callback right now * @softirq_time: the time when running the hrtimer queue in the softirq + * @lock_key: the lock_class_key for use with lockdep */ struct hrtimer_base { clockid_t index; From fe2bbc4832659b7ffc867cac03e0a92ae81e11e4 Mon Sep 17 00:00:00 2001 From: Henrik Kretzschmar Date: Wed, 6 Sep 2006 00:03:41 -0700 Subject: [PATCH 03/21] [PATCH] add missing desctiption in super.c Adds kernel-doc for alloc_super() type in fs/super.c. Signed-off-by: Henrik Kretzschmar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/super.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/super.c b/fs/super.c index 6d4e8174b6db..5c4c94d5495e 100644 --- a/fs/super.c +++ b/fs/super.c @@ -49,6 +49,7 @@ DEFINE_SPINLOCK(sb_lock); /** * alloc_super - create new superblock + * @type: filesystem type superblock should belong to * * Allocates and initializes a new &struct super_block. alloc_super() * returns a pointer new superblock or %NULL if allocation had failed. From 96dd7421a06a5bc6eb731323b95efcb2fd864854 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 6 Sep 2006 00:03:42 -0700 Subject: [PATCH 04/21] [PATCH] prevent timespec/timeval to ktime_t overflow Frank v. Waveren pointed out that on 64bit machines the timespec to ktime_t conversion might overflow. This is also true for timeval to ktime_t conversions. This breaks a "sleep inf" on 64bit machines. While a timespec/timeval with tx.sec = MAX_LONG is valid by specification the internal representation of ktime_t is based on nanoseconds. The conversion of seconds to nanoseconds overflows for seconds values >= (MAX_LONG / NSEC_PER_SEC). Check the seconds argument to the conversion and limit it to the maximum time which can be represented by ktime_t. Signed-off-by: Thomas Gleixner Cc: Ingo Molnar Cc: Frank v Waveren Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ktime.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/include/linux/ktime.h b/include/linux/ktime.h index ed3396dcc4f7..84eeecd60a02 100644 --- a/include/linux/ktime.h +++ b/include/linux/ktime.h @@ -56,7 +56,8 @@ typedef union { #endif } ktime_t; -#define KTIME_MAX (~((u64)1 << 63)) +#define KTIME_MAX ((s64)~((u64)1 << 63)) +#define KTIME_SEC_MAX (KTIME_MAX / NSEC_PER_SEC) /* * ktime_t definitions when using the 64-bit scalar representation: @@ -73,6 +74,10 @@ typedef union { */ static inline ktime_t ktime_set(const long secs, const unsigned long nsecs) { +#if (BITS_PER_LONG == 64) + if (unlikely(secs >= KTIME_SEC_MAX)) + return (ktime_t){ .tv64 = KTIME_MAX }; +#endif return (ktime_t) { .tv64 = (s64)secs * NSEC_PER_SEC + (s64)nsecs }; } From 471b40d0dfc17bf0161629950b82524d41bc37ce Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Wed, 6 Sep 2006 00:03:43 -0700 Subject: [PATCH 05/21] [PATCH] prevent swsusp with PAE PAE + swsusp results in hard-to-debug crash about 50% of time during resume. Cause is known, fix needs to be ported from x86-64 (but we can't make it to 2.6.18, and I'd like this to be worked around in 2.6.18). Signed-off-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/Kconfig | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index ae44a70aae8a..619ecabf7c58 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -56,7 +56,7 @@ config PM_TRACE config SOFTWARE_SUSPEND bool "Software Suspend" - depends on PM && SWAP && (X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP) + depends on PM && SWAP && ((X86 && (!SMP || SUSPEND_SMP) && !X86_PAE) || ((FRV || PPC32) && !SMP)) ---help--- Enable the possibility of suspending the machine. It doesn't need ACPI or APM. @@ -78,6 +78,10 @@ config SOFTWARE_SUSPEND For more information take a look at . + (For now, swsusp is incompatible with PAE aka HIGHMEM_64G on i386. + we need identity mapping for resume to work, and that is trivial + to get with 4MB pages, but less than trivial on PAE). + config PM_STD_PARTITION string "Default resume partition" depends on SOFTWARE_SUSPEND From 068c4579fe5c21e84c7cb2ba89db80899e25104e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 6 Sep 2006 00:03:44 -0700 Subject: [PATCH 06/21] [PATCH] lockdep: do not touch console state when tainting the kernel Remove an unintended console_verbose() side-effect from add_taint(). Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/panic.c b/kernel/panic.c index 9b8dcfd1ca93..8010b9b17aca 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -173,7 +173,7 @@ const char *print_tainted(void) void add_taint(unsigned flag) { - debug_locks_off(); /* can't trust the integrity of the kernel anymore */ + debug_locks = 0; /* can't trust the integrity of the kernel anymore */ tainted |= flag; } EXPORT_SYMBOL(add_taint); From bb98ad77d8451a3ccf9478738ffe7ec63394fcdf Mon Sep 17 00:00:00 2001 From: Ismail Donmez Date: Wed, 6 Sep 2006 00:03:44 -0700 Subject: [PATCH 07/21] [PATCH] Move linux/device.h include in linux/atmdev.h to #ifdef __KERNEL__ section linux/device.h header is not included in the David Woodhouse's kernel-headers git tree which is used for userspace kernel headers. Which results in compile errors when building iproute2. Attached patch moves linux/device.h include under the #ifdef __KERNEL__ section. Signed-off-by: Ismail Donmez Signed-off-by: David Woodhouse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/atmdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/atmdev.h b/include/linux/atmdev.h index 41788a31c438..2096e5c72827 100644 --- a/include/linux/atmdev.h +++ b/include/linux/atmdev.h @@ -7,7 +7,6 @@ #define LINUX_ATMDEV_H -#include #include #include #include @@ -210,6 +209,7 @@ struct atm_cirange { #ifdef __KERNEL__ +#include #include /* wait_queue_head_t */ #include /* struct timeval */ #include From b4a228346c1a7d09f565e750d2e988c5671e9fa3 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Wed, 6 Sep 2006 09:03:26 -0700 Subject: [PATCH 08/21] [PATCH] Remove unneeded asm-i386/cpufeature.h from user visibility. Signed-off-by: David Woodhouse Signed-off-by: Linus Torvalds --- include/asm-i386/Kbuild | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/asm-i386/Kbuild b/include/asm-i386/Kbuild index c064a8e9170f..335b2fa4e066 100644 --- a/include/asm-i386/Kbuild +++ b/include/asm-i386/Kbuild @@ -1,5 +1,5 @@ include include/asm-generic/Kbuild.asm -header-y += boot.h cpufeature.h debugreg.h ldt.h setup.h ucontext.h +header-y += boot.h debugreg.h ldt.h setup.h ucontext.h unifdef-y += mtrr.h vm86.h From ebd6c17109aed086908ae3b0949265fd07712659 Mon Sep 17 00:00:00 2001 From: john stultz Date: Wed, 6 Sep 2006 11:02:45 +0100 Subject: [PATCH 09/21] [PATCH] FRV: Use the generic time stuff for FRV Use the generic time stuff for FRV. Signed-off-by: John Stultz Signed-Off-By: David Howells Signed-off-by: Linus Torvalds --- arch/frv/Kconfig | 4 +++ arch/frv/kernel/time.c | 81 ------------------------------------------ 2 files changed, 4 insertions(+), 81 deletions(-) diff --git a/arch/frv/Kconfig b/arch/frv/Kconfig index 95a3892b8d1b..a601a17cf568 100644 --- a/arch/frv/Kconfig +++ b/arch/frv/Kconfig @@ -29,6 +29,10 @@ config GENERIC_HARDIRQS bool default n +config GENERIC_TIME + bool + default y + config TIME_LOW_RES bool default y diff --git a/arch/frv/kernel/time.c b/arch/frv/kernel/time.c index d5b64e193d92..68a77fe3bb40 100644 --- a/arch/frv/kernel/time.c +++ b/arch/frv/kernel/time.c @@ -32,8 +32,6 @@ #define TICK_SIZE (tick_nsec / 1000) -extern unsigned long wall_jiffies; - unsigned long __nongprelbss __clkin_clock_speed_HZ; unsigned long __nongprelbss __ext_bus_clock_speed_HZ; unsigned long __nongprelbss __res_bus_clock_speed_HZ; @@ -144,85 +142,6 @@ void time_init(void) time_divisor_init(); } -/* - * This version of gettimeofday has near microsecond resolution. - */ -void do_gettimeofday(struct timeval *tv) -{ - unsigned long seq; - unsigned long usec, sec; - unsigned long max_ntp_tick; - - do { - unsigned long lost; - - seq = read_seqbegin(&xtime_lock); - - usec = 0; - lost = jiffies - wall_jiffies; - - /* - * If time_adjust is negative then NTP is slowing the clock - * so make sure not to go into next possible interval. - * Better to lose some accuracy than have time go backwards.. - */ - if (unlikely(time_adjust < 0)) { - max_ntp_tick = (USEC_PER_SEC / HZ) - tickadj; - usec = min(usec, max_ntp_tick); - - if (lost) - usec += lost * max_ntp_tick; - } - else if (unlikely(lost)) - usec += lost * (USEC_PER_SEC / HZ); - - sec = xtime.tv_sec; - usec += (xtime.tv_nsec / 1000); - } while (read_seqretry(&xtime_lock, seq)); - - while (usec >= 1000000) { - usec -= 1000000; - sec++; - } - - tv->tv_sec = sec; - tv->tv_usec = usec; -} - -EXPORT_SYMBOL(do_gettimeofday); - -int do_settimeofday(struct timespec *tv) -{ - time_t wtm_sec, sec = tv->tv_sec; - long wtm_nsec, nsec = tv->tv_nsec; - - if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) - return -EINVAL; - - write_seqlock_irq(&xtime_lock); - /* - * This is revolting. We need to set "xtime" correctly. However, the - * value in this location is the value at the most recent update of - * wall time. Discover what correction gettimeofday() would have - * made, and then undo it! - */ - nsec -= 0 * NSEC_PER_USEC; - nsec -= (jiffies - wall_jiffies) * TICK_NSEC; - - wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); - wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); - - set_normalized_timespec(&xtime, sec, nsec); - set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); - - ntp_clear(); - write_sequnlock_irq(&xtime_lock); - clock_was_set(); - return 0; -} - -EXPORT_SYMBOL(do_settimeofday); - /* * Scheduler clock - returns current time in nanosec units. */ From 3a459756810912d2c2bf188cef566af255936b4d Mon Sep 17 00:00:00 2001 From: Kirill Korotaev Date: Thu, 7 Sep 2006 14:17:04 +0400 Subject: [PATCH 10/21] [PATCH] IA64,sparc: local DoS with corrupted ELFs This prevents cross-region mappings on IA64 and SPARC which could lead to system crash. They were correctly trapped for normal mmap() calls, but not for the kernel internal calls generated by executable loading. This code just moves the architecture-specific cross-region checks into an arch-specific "arch_mmap_check()" macro, and defines that for the architectures that needed it (ia64, sparc and sparc64). Architectures that don't have any special requirements can just ignore the new cross-region check, since the mmap() code will just notice on its own when the macro isn't defined. Signed-off-by: Pavel Emelianov Signed-off-by: Kirill Korotaev Acked-by: David Miller Signed-off-by: Greg Kroah-Hartman [ Cleaned up to not affect architectures that don't need it ] Signed-off-by: Linus Torvalds --- arch/ia64/kernel/sys_ia64.c | 28 ++++++++++++++----------- arch/sparc/kernel/sys_sparc.c | 27 ++++++++++++++----------- arch/sparc64/kernel/sys_sparc.c | 36 ++++++++++++++++++--------------- include/asm-ia64/mman.h | 8 ++++++++ include/asm-sparc/mman.h | 8 ++++++++ include/asm-sparc64/mman.h | 8 ++++++++ mm/mmap.c | 17 ++++++++++++++-- 7 files changed, 90 insertions(+), 42 deletions(-) diff --git a/arch/ia64/kernel/sys_ia64.c b/arch/ia64/kernel/sys_ia64.c index 40722d88607a..9ef62a3fbfad 100644 --- a/arch/ia64/kernel/sys_ia64.c +++ b/arch/ia64/kernel/sys_ia64.c @@ -163,10 +163,25 @@ sys_pipe (void) return retval; } +int ia64_mmap_check(unsigned long addr, unsigned long len, + unsigned long flags) +{ + unsigned long roff; + + /* + * Don't permit mappings into unmapped space, the virtual page table + * of a region, or across a region boundary. Note: RGN_MAP_LIMIT is + * equal to 2^n-PAGE_SIZE (for some integer n <= 61) and len > 0. + */ + roff = REGION_OFFSET(addr); + if ((len > RGN_MAP_LIMIT) || (roff > (RGN_MAP_LIMIT - len))) + return -EINVAL; + return 0; +} + static inline unsigned long do_mmap2 (unsigned long addr, unsigned long len, int prot, int flags, int fd, unsigned long pgoff) { - unsigned long roff; struct file *file = NULL; flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); @@ -188,17 +203,6 @@ do_mmap2 (unsigned long addr, unsigned long len, int prot, int flags, int fd, un goto out; } - /* - * Don't permit mappings into unmapped space, the virtual page table of a region, - * or across a region boundary. Note: RGN_MAP_LIMIT is equal to 2^n-PAGE_SIZE - * (for some integer n <= 61) and len > 0. - */ - roff = REGION_OFFSET(addr); - if ((len > RGN_MAP_LIMIT) || (roff > (RGN_MAP_LIMIT - len))) { - addr = -EINVAL; - goto out; - } - down_write(¤t->mm->mmap_sem); addr = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); up_write(¤t->mm->mmap_sem); diff --git a/arch/sparc/kernel/sys_sparc.c b/arch/sparc/kernel/sys_sparc.c index a41c8a5c2007..94ff58c9d4a9 100644 --- a/arch/sparc/kernel/sys_sparc.c +++ b/arch/sparc/kernel/sys_sparc.c @@ -219,6 +219,21 @@ out: return err; } +int sparc_mmap_check(unsigned long addr, unsigned long len, unsigned long flags) +{ + if (ARCH_SUN4C_SUN4 && + (len > 0x20000000 || + ((flags & MAP_FIXED) && + addr < 0xe0000000 && addr + len > 0x20000000))) + return -EINVAL; + + /* See asm-sparc/uaccess.h */ + if (len > TASK_SIZE - PAGE_SIZE || addr + len > TASK_SIZE - PAGE_SIZE) + return -EINVAL; + + return 0; +} + /* Linux version of mmap */ static unsigned long do_mmap2(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, @@ -233,25 +248,13 @@ static unsigned long do_mmap2(unsigned long addr, unsigned long len, goto out; } - retval = -EINVAL; len = PAGE_ALIGN(len); - if (ARCH_SUN4C_SUN4 && - (len > 0x20000000 || - ((flags & MAP_FIXED) && - addr < 0xe0000000 && addr + len > 0x20000000))) - goto out_putf; - - /* See asm-sparc/uaccess.h */ - if (len > TASK_SIZE - PAGE_SIZE || addr + len > TASK_SIZE - PAGE_SIZE) - goto out_putf; - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); down_write(¤t->mm->mmap_sem); retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); up_write(¤t->mm->mmap_sem); -out_putf: if (file) fput(file); out: diff --git a/arch/sparc64/kernel/sys_sparc.c b/arch/sparc64/kernel/sys_sparc.c index 054d0abdb7ee..bf5f14ee73de 100644 --- a/arch/sparc64/kernel/sys_sparc.c +++ b/arch/sparc64/kernel/sys_sparc.c @@ -548,6 +548,26 @@ asmlinkage long sparc64_personality(unsigned long personality) return ret; } +int sparc64_mmap_check(unsigned long addr, unsigned long len, + unsigned long flags) +{ + if (test_thread_flag(TIF_32BIT)) { + if (len >= STACK_TOP32) + return -EINVAL; + + if ((flags & MAP_FIXED) && addr > STACK_TOP32 - len) + return -EINVAL; + } else { + if (len >= VA_EXCLUDE_START) + return -EINVAL; + + if ((flags & MAP_FIXED) && invalid_64bit_range(addr, len)) + return -EINVAL; + } + + return 0; +} + /* Linux version of mmap */ asmlinkage unsigned long sys_mmap(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, @@ -563,27 +583,11 @@ asmlinkage unsigned long sys_mmap(unsigned long addr, unsigned long len, } flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); len = PAGE_ALIGN(len); - retval = -EINVAL; - - if (test_thread_flag(TIF_32BIT)) { - if (len >= STACK_TOP32) - goto out_putf; - - if ((flags & MAP_FIXED) && addr > STACK_TOP32 - len) - goto out_putf; - } else { - if (len >= VA_EXCLUDE_START) - goto out_putf; - - if ((flags & MAP_FIXED) && invalid_64bit_range(addr, len)) - goto out_putf; - } down_write(¤t->mm->mmap_sem); retval = do_mmap(file, addr, len, prot, flags, off); up_write(¤t->mm->mmap_sem); -out_putf: if (file) fput(file); out: diff --git a/include/asm-ia64/mman.h b/include/asm-ia64/mman.h index 6ba179f12718..c73b87832a1e 100644 --- a/include/asm-ia64/mman.h +++ b/include/asm-ia64/mman.h @@ -22,4 +22,12 @@ #define MCL_CURRENT 1 /* lock all current mappings */ #define MCL_FUTURE 2 /* lock all future mappings */ +#ifdef __KERNEL__ +#ifndef __ASSEMBLY__ +#define arch_mmap_check ia64_mmap_check +int ia64_mmap_check(unsigned long addr, unsigned long len, + unsigned long flags); +#endif +#endif + #endif /* _ASM_IA64_MMAN_H */ diff --git a/include/asm-sparc/mman.h b/include/asm-sparc/mman.h index 88d1886abf3b..b7dc40bc68f4 100644 --- a/include/asm-sparc/mman.h +++ b/include/asm-sparc/mman.h @@ -35,4 +35,12 @@ #define MADV_FREE 0x5 /* (Solaris) contents can be freed */ +#ifdef __KERNEL__ +#ifndef __ASSEMBLY__ +#define arch_mmap_check sparc_mmap_check +int sparc_mmap_check(unsigned long addr, unsigned long len, + unsigned long flags); +#endif +#endif + #endif /* __SPARC_MMAN_H__ */ diff --git a/include/asm-sparc64/mman.h b/include/asm-sparc64/mman.h index 6fd878e61435..8cc1860be630 100644 --- a/include/asm-sparc64/mman.h +++ b/include/asm-sparc64/mman.h @@ -35,4 +35,12 @@ #define MADV_FREE 0x5 /* (Solaris) contents can be freed */ +#ifdef __KERNEL__ +#ifndef __ASSEMBLY__ +#define arch_mmap_check sparc64_mmap_check +int sparc64_mmap_check(unsigned long addr, unsigned long len, + unsigned long flags); +#endif +#endif + #endif /* __SPARC64_MMAN_H__ */ diff --git a/mm/mmap.c b/mm/mmap.c index c1868ecdbc5f..e66a0b524aff 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -30,6 +30,10 @@ #include #include +#ifndef arch_mmap_check +#define arch_mmap_check(addr, len, flags) (0) +#endif + static void unmap_region(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, unsigned long start, unsigned long end); @@ -913,6 +917,10 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, if (!len) return -EINVAL; + error = arch_mmap_check(addr, len, flags); + if (error) + return error; + /* Careful about overflows.. */ len = PAGE_ALIGN(len); if (!len || len > TASK_SIZE) @@ -1859,6 +1867,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len) unsigned long flags; struct rb_node ** rb_link, * rb_parent; pgoff_t pgoff = addr >> PAGE_SHIFT; + int error; len = PAGE_ALIGN(len); if (!len) @@ -1867,6 +1876,12 @@ unsigned long do_brk(unsigned long addr, unsigned long len) if ((addr + len) > TASK_SIZE || (addr + len) < addr) return -EINVAL; + flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; + + error = arch_mmap_check(addr, len, flags); + if (error) + return error; + /* * mlock MCL_FUTURE? */ @@ -1907,8 +1922,6 @@ unsigned long do_brk(unsigned long addr, unsigned long len) if (security_vm_enough_memory(len >> PAGE_SHIFT)) return -ENOMEM; - flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; - /* Can we just expand an old private anonymous mapping? */ if (vma_merge(mm, prev, addr, addr + len, flags, NULL, NULL, pgoff, NULL)) From c5780e976e19faff345fcef4a01db87108b51a44 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 8 Sep 2006 09:47:15 -0700 Subject: [PATCH 11/21] [PATCH] Use the correct restart option for futex_lock_pi The current implementation of futex_lock_pi returns -ERESTART_RESTARTBLOCK in case that the lock operation has been interrupted by a signal. This results in a return of -EINTR to userspace in case there is an handler for the signal. This is wrong, because userspace expects that the lock function does not return in any case of signal delivery. This was not caught by my insufficient test case, but triggered a nasty userspace problem in an high load application scenario. Unfortunately also glibc does not check for this invalid return value. Using -ERSTARTNOINTR makes sure, that the interrupted syscall is restarted. The restart block related code can be safely removed, as the possible timeout argument is an absolute time value. Signed-off-by: Thomas Gleixner Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/futex.c | 84 +++++++------------------------------------------- 1 file changed, 11 insertions(+), 73 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index b9b8aea5389e..9d260e838cff 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1120,9 +1120,10 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time) * if there are waiters then it will block, it does PI, etc. (Due to * races the kernel might see a 0 value of the futex too.) */ -static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock, - struct hrtimer_sleeper *to) +static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec, + long nsec, int trylock) { + struct hrtimer_sleeper timeout, *to = NULL; struct task_struct *curr = current; struct futex_hash_bucket *hb; u32 uval, newval, curval; @@ -1132,6 +1133,13 @@ static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock, if (refill_pi_state_cache()) return -ENOMEM; + if (sec != MAX_SCHEDULE_TIMEOUT) { + to = &timeout; + hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS); + hrtimer_init_sleeper(to, current); + to->timer.expires = ktime_set(sec, nsec); + } + q.pi_state = NULL; retry: down_read(&curr->mm->mmap_sem); @@ -1307,7 +1315,7 @@ static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock, if (!detect && ret == -EDEADLK && 0) force_sig(SIGKILL, current); - return ret; + return ret != -EINTR ? ret : -ERESTARTNOINTR; out_unlock_release_sem: queue_unlock(&q, hb); @@ -1341,76 +1349,6 @@ static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock, return ret; } -/* - * Restart handler - */ -static long futex_lock_pi_restart(struct restart_block *restart) -{ - struct hrtimer_sleeper timeout, *to = NULL; - int ret; - - restart->fn = do_no_restart_syscall; - - if (restart->arg2 || restart->arg3) { - to = &timeout; - hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS); - hrtimer_init_sleeper(to, current); - to->timer.expires.tv64 = ((u64)restart->arg1 << 32) | - (u64) restart->arg0; - } - - pr_debug("lock_pi restart: %p, %d (%d)\n", - (u32 __user *)restart->arg0, current->pid); - - ret = do_futex_lock_pi((u32 __user *)restart->arg0, restart->arg1, - 0, to); - - if (ret != -EINTR) - return ret; - - restart->fn = futex_lock_pi_restart; - - /* The other values are filled in */ - return -ERESTART_RESTARTBLOCK; -} - -/* - * Called from the syscall entry below. - */ -static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec, - long nsec, int trylock) -{ - struct hrtimer_sleeper timeout, *to = NULL; - struct restart_block *restart; - int ret; - - if (sec != MAX_SCHEDULE_TIMEOUT) { - to = &timeout; - hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS); - hrtimer_init_sleeper(to, current); - to->timer.expires = ktime_set(sec, nsec); - } - - ret = do_futex_lock_pi(uaddr, detect, trylock, to); - - if (ret != -EINTR) - return ret; - - pr_debug("lock_pi interrupted: %p, %d (%d)\n", uaddr, current->pid); - - restart = ¤t_thread_info()->restart_block; - restart->fn = futex_lock_pi_restart; - restart->arg0 = (unsigned long) uaddr; - restart->arg1 = detect; - if (to) { - restart->arg2 = to->timer.expires.tv64 & 0xFFFFFFFF; - restart->arg3 = to->timer.expires.tv64 >> 32; - } else - restart->arg2 = restart->arg3 = 0; - - return -ERESTART_RESTARTBLOCK; -} - /* * Userspace attempted a TID -> 0 atomic transition, and failed. * This is the in-kernel slowpath: we look up the PI state (if any), From 4495c9e5cabc82bb4ce930eb5d3dc7544f3f8389 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 8 Sep 2006 09:47:24 -0700 Subject: [PATCH 12/21] [PATCH] optical /proc/ide/*/media Sergey Vlasov reported that his "FUJITSU MCC3064AP, ATAPI OPTICAL drive" pops up as UNKNOWN in /proc/ide/*/media . Closes #4145. Signed-off-by: Alexey Dobriyan Cc: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/ide/ide-proc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/ide/ide-proc.c b/drivers/ide/ide-proc.c index c12f1b71e934..41b74b13a00c 100644 --- a/drivers/ide/ide-proc.c +++ b/drivers/ide/ide-proc.c @@ -376,6 +376,8 @@ static int proc_ide_read_media break; case ide_floppy:media = "floppy\n"; break; + case ide_optical:media = "optical\n"; + break; default: media = "UNKNOWN\n"; break; } From 67bb2c692cc02e53c23e4debc92c3a79ddc52a8c Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 8 Sep 2006 09:47:34 -0700 Subject: [PATCH 13/21] [PATCH] sh: fix FPN_START typo Not that it passes allmodconfig without it... Signed-off-by: Alexey Dobriyan Cc: Paul Mundt Cc: Kazumoto Kojima Cc: Mark Haverkamp Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-sh/page.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/asm-sh/page.h b/include/asm-sh/page.h index a5559e38744e..5a057b00f19a 100644 --- a/include/asm-sh/page.h +++ b/include/asm-sh/page.h @@ -104,7 +104,7 @@ typedef struct { unsigned long pgprot; } pgprot_t; /* PFN start number, because of __MEMORY_START */ #define PFN_START (__MEMORY_START >> PAGE_SHIFT) -#define ARCH_PFN_OFFSET (FPN_START) +#define ARCH_PFN_OFFSET (PFN_START) #define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) #define pfn_valid(pfn) (((pfn) - PFN_START) < max_mapnr) #define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) From 47d4b9066df023670a61e74565a75293cf15a441 Mon Sep 17 00:00:00 2001 From: David Wang Date: Fri, 8 Sep 2006 09:47:51 -0700 Subject: [PATCH 14/21] [PATCH] sis5513: add SiS south bridge ID 0x966 and 0x968 New SiS south bridge device ID is 0x966. Next coming product will be 0x968. (Will be released in Q4, this year) We don't make any updates to the IDE controller. Signed-off-by: David Wang Cc: Jeff Garzik Cc: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/ide/pci/sis5513.c | 2 ++ include/linux/pci_ids.h | 2 ++ 2 files changed, 4 insertions(+) diff --git a/drivers/ide/pci/sis5513.c b/drivers/ide/pci/sis5513.c index 8a6c23ac8cc1..f03196c5db37 100644 --- a/drivers/ide/pci/sis5513.c +++ b/drivers/ide/pci/sis5513.c @@ -86,6 +86,8 @@ static const struct { u8 chipset_family; u8 flags; } SiSHostChipInfo[] = { + { "SiS968", PCI_DEVICE_ID_SI_968, ATA_133 }, + { "SiS966", PCI_DEVICE_ID_SI_966, ATA_133 }, { "SiS965", PCI_DEVICE_ID_SI_965, ATA_133 }, { "SiS745", PCI_DEVICE_ID_SI_745, ATA_100 }, { "SiS735", PCI_DEVICE_ID_SI_735, ATA_100 }, diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index c91164ea3dec..7a249155ee4e 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -648,6 +648,8 @@ #define PCI_DEVICE_ID_SI_962 0x0962 #define PCI_DEVICE_ID_SI_963 0x0963 #define PCI_DEVICE_ID_SI_965 0x0965 +#define PCI_DEVICE_ID_SI_966 0x0966 +#define PCI_DEVICE_ID_SI_968 0x0968 #define PCI_DEVICE_ID_SI_5511 0x5511 #define PCI_DEVICE_ID_SI_5513 0x5513 #define PCI_DEVICE_ID_SI_5517 0x5517 From 3665d0e58fa44f50c744f85c7e8ad21d5b10e206 Mon Sep 17 00:00:00 2001 From: Badari Pulavarty Date: Fri, 8 Sep 2006 09:48:21 -0700 Subject: [PATCH 15/21] [PATCH] ext3_getblk() should handle HOLE correctly It has been reported that ext3_getblk() is not doing the right thing and triggering following WARN(): BUG: warning at fs/ext3/inode.c:1016/ext3_getblk() ext3_getblk+0x98/0x2a6 md_wakeup_thread+0x26/0x2a ext3_bread+0x1f/0x88 ext3_quota_read+0x136/0x1ae v1_read_dqblk+0x61/0xac dquot_acquire+0xf6/0x107 ext3_acquire_dquot+0x46/0x68 dqget+0x155/0x1e7 dquot_transfer+0x3e0/0x3e9 dput+0x23/0x13e ext3_setattr+0xc3/0x240 current_fs_time+0x52/0x6a notify_change+0x2bd/0x30d chown_common+0x9c/0xc5 strncpy_from_user+0x3b/0x68 do_path_lookup+0xdf/0x266 __user_walk_fd+0x44/0x5a sys_chown+0x4a/0x55 vfs_write+0xe7/0x13c sys_mkdir+0x1f/0x23 syscall_call+0x7/0xb Looking at the code, it looks like it's not handle HOLE correctly. It ends up returning -EIO. Here is the patch to fix it. If we really want to be paranoid, we can allow return values 0 (HOLE), 1 (we asked for one block) and return -EIO for more than 1 block. But I really don't see a reason for doing it - all we need is the block# here. (doesn't matter how many blocks are mapped). ext3_get_blocks_handle() returns number of blocks it mapped. It returns 0 in case of HOLE. ext3_getblk() should handle HOLE properly (currently its dumping warning stack and returning -EIO). Signed-off-by: Badari Pulavarty Acked-by: Mingming Cao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext3/inode.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index c5ee9f0691e3..0f0b1eadb98d 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -1009,11 +1009,14 @@ struct buffer_head *ext3_getblk(handle_t *handle, struct inode *inode, buffer_trace_init(&dummy.b_history); err = ext3_get_blocks_handle(handle, inode, block, 1, &dummy, create, 1); - if (err == 1) { + /* + * ext3_get_blocks_handle() returns number of blocks + * mapped. 0 in case of a HOLE. + */ + if (err > 0) { + if (err > 1) + WARN_ON(1); err = 0; - } else if (err >= 0) { - WARN_ON(1); - err = -EIO; } *errp = err; if (!err && buffer_mapped(&dummy)) { From 016eb4a0ed06a3677d67a584da901f0e9a63c666 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 8 Sep 2006 09:48:38 -0700 Subject: [PATCH 16/21] [PATCH] invalidate_complete_page() race fix If a CPU faults this page into pagetables after invalidate_mapping_pages() checked page_mapped(), invalidate_complete_page() will still proceed to remove the page from pagecache. This leaves the page-faulting process with a detached page. If it was MAP_SHARED then file data loss will ensue. Fix that up by checking the page's refcount after taking tree_lock. Cc: Nick Piggin Cc: Hugh Dickins Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/truncate.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/mm/truncate.c b/mm/truncate.c index cf1b015df4a7..c6ab55ec6883 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -68,10 +68,10 @@ invalidate_complete_page(struct address_space *mapping, struct page *page) return 0; write_lock_irq(&mapping->tree_lock); - if (PageDirty(page)) { - write_unlock_irq(&mapping->tree_lock); - return 0; - } + if (PageDirty(page)) + goto failed; + if (page_count(page) != 2) /* caller's ref + pagecache ref */ + goto failed; BUG_ON(PagePrivate(page)); __remove_from_page_cache(page); @@ -79,6 +79,9 @@ invalidate_complete_page(struct address_space *mapping, struct page *page) ClearPageUptodate(page); page_cache_release(page); /* pagecache ref */ return 1; +failed: + write_unlock_irq(&mapping->tree_lock); + return 0; } /** From e9f7bee1df223dcf83743b46cb06c08d95497ec0 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 8 Sep 2006 09:48:54 -0700 Subject: [PATCH 17/21] [PATCH] NFS: large non-page-aligned direct I/O clobbers memory The logic in nfs_direct_read_schedule and nfs_direct_write_schedule can allow data->npages to be one larger than rpages. This causes a page pointer to be written beyond the end of the pagevec in nfs_read_data (or nfs_write_data). Fix this by making nfs_(read|write)_alloc() calculate the size of the pagevec array, and initialise data->npages. Also get rid of the redundant argument to nfs_commit_alloc(). Signed-off-by: Trond Myklebust Cc: Chuck Lever Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfs/direct.c | 50 ++++++++++++----------------------------- fs/nfs/read.c | 24 +++++++++++--------- fs/nfs/write.c | 37 +++++++++++++----------------- include/linux/nfs_fs.h | 6 ++--- include/linux/nfs_xdr.h | 4 ++-- 5 files changed, 47 insertions(+), 74 deletions(-) diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index fecd3b095deb..76ca1cbc38f9 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -100,25 +100,6 @@ static inline int put_dreq(struct nfs_direct_req *dreq) return atomic_dec_and_test(&dreq->io_count); } -/* - * "size" is never larger than rsize or wsize. - */ -static inline int nfs_direct_count_pages(unsigned long user_addr, size_t size) -{ - int page_count; - - page_count = (user_addr + size + PAGE_SIZE - 1) >> PAGE_SHIFT; - page_count -= user_addr >> PAGE_SHIFT; - BUG_ON(page_count < 0); - - return page_count; -} - -static inline unsigned int nfs_max_pages(unsigned int size) -{ - return (size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; -} - /** * nfs_direct_IO - NFS address space operation for direct I/O * @rw: direction (read or write) @@ -276,28 +257,24 @@ static ssize_t nfs_direct_read_schedule(struct nfs_direct_req *dreq, unsigned lo struct nfs_open_context *ctx = dreq->ctx; struct inode *inode = ctx->dentry->d_inode; size_t rsize = NFS_SERVER(inode)->rsize; - unsigned int rpages = nfs_max_pages(rsize); unsigned int pgbase; int result; ssize_t started = 0; get_dreq(dreq); - pgbase = user_addr & ~PAGE_MASK; do { struct nfs_read_data *data; size_t bytes; + pgbase = user_addr & ~PAGE_MASK; + bytes = min(rsize,count); + result = -ENOMEM; - data = nfs_readdata_alloc(rpages); + data = nfs_readdata_alloc(pgbase + bytes); if (unlikely(!data)) break; - bytes = rsize; - if (count < rsize) - bytes = count; - - data->npages = nfs_direct_count_pages(user_addr, bytes); down_read(¤t->mm->mmap_sem); result = get_user_pages(current, current->mm, user_addr, data->npages, 1, 0, data->pagevec, NULL); @@ -344,8 +321,10 @@ static ssize_t nfs_direct_read_schedule(struct nfs_direct_req *dreq, unsigned lo started += bytes; user_addr += bytes; pos += bytes; + /* FIXME: Remove this unnecessary math from final patch */ pgbase += bytes; pgbase &= ~PAGE_MASK; + BUG_ON(pgbase != (user_addr & ~PAGE_MASK)); count -= bytes; } while (count != 0); @@ -524,7 +503,7 @@ static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode static void nfs_alloc_commit_data(struct nfs_direct_req *dreq) { - dreq->commit_data = nfs_commit_alloc(0); + dreq->commit_data = nfs_commit_alloc(); if (dreq->commit_data != NULL) dreq->commit_data->req = (struct nfs_page *) dreq; } @@ -605,28 +584,24 @@ static ssize_t nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned l struct nfs_open_context *ctx = dreq->ctx; struct inode *inode = ctx->dentry->d_inode; size_t wsize = NFS_SERVER(inode)->wsize; - unsigned int wpages = nfs_max_pages(wsize); unsigned int pgbase; int result; ssize_t started = 0; get_dreq(dreq); - pgbase = user_addr & ~PAGE_MASK; do { struct nfs_write_data *data; size_t bytes; + pgbase = user_addr & ~PAGE_MASK; + bytes = min(wsize,count); + result = -ENOMEM; - data = nfs_writedata_alloc(wpages); + data = nfs_writedata_alloc(pgbase + bytes); if (unlikely(!data)) break; - bytes = wsize; - if (count < wsize) - bytes = count; - - data->npages = nfs_direct_count_pages(user_addr, bytes); down_read(¤t->mm->mmap_sem); result = get_user_pages(current, current->mm, user_addr, data->npages, 0, 0, data->pagevec, NULL); @@ -676,8 +651,11 @@ static ssize_t nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned l started += bytes; user_addr += bytes; pos += bytes; + + /* FIXME: Remove this useless math from the final patch */ pgbase += bytes; pgbase &= ~PAGE_MASK; + BUG_ON(pgbase != (user_addr & ~PAGE_MASK)); count -= bytes; } while (count != 0); diff --git a/fs/nfs/read.c b/fs/nfs/read.c index da9cf11c326f..7a9ee00e0c61 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -43,13 +43,15 @@ static mempool_t *nfs_rdata_mempool; #define MIN_POOL_READ (32) -struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount) +struct nfs_read_data *nfs_readdata_alloc(size_t len) { + unsigned int pagecount = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, SLAB_NOFS); if (p) { memset(p, 0, sizeof(*p)); INIT_LIST_HEAD(&p->pages); + p->npages = pagecount; if (pagecount <= ARRAY_SIZE(p->page_array)) p->pagevec = p->page_array; else { @@ -140,7 +142,7 @@ static int nfs_readpage_sync(struct nfs_open_context *ctx, struct inode *inode, int result; struct nfs_read_data *rdata; - rdata = nfs_readdata_alloc(1); + rdata = nfs_readdata_alloc(count); if (!rdata) return -ENOMEM; @@ -336,25 +338,25 @@ static int nfs_pagein_multi(struct list_head *head, struct inode *inode) struct nfs_page *req = nfs_list_entry(head->next); struct page *page = req->wb_page; struct nfs_read_data *data; - unsigned int rsize = NFS_SERVER(inode)->rsize; - unsigned int nbytes, offset; + size_t rsize = NFS_SERVER(inode)->rsize, nbytes; + unsigned int offset; int requests = 0; LIST_HEAD(list); nfs_list_remove_request(req); nbytes = req->wb_bytes; - for(;;) { - data = nfs_readdata_alloc(1); + do { + size_t len = min(nbytes,rsize); + + data = nfs_readdata_alloc(len); if (!data) goto out_bad; INIT_LIST_HEAD(&data->pages); list_add(&data->pages, &list); requests++; - if (nbytes <= rsize) - break; - nbytes -= rsize; - } + nbytes -= len; + } while(nbytes != 0); atomic_set(&req->wb_complete, requests); ClearPageError(page); @@ -402,7 +404,7 @@ static int nfs_pagein_one(struct list_head *head, struct inode *inode) if (NFS_SERVER(inode)->rsize < PAGE_CACHE_SIZE) return nfs_pagein_multi(head, inode); - data = nfs_readdata_alloc(NFS_SERVER(inode)->rpages); + data = nfs_readdata_alloc(NFS_SERVER(inode)->rsize); if (!data) goto out_bad; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 50774991f8d5..8ab3cf10d792 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -90,22 +90,13 @@ static mempool_t *nfs_commit_mempool; static DECLARE_WAIT_QUEUE_HEAD(nfs_write_congestion); -struct nfs_write_data *nfs_commit_alloc(unsigned int pagecount) +struct nfs_write_data *nfs_commit_alloc(void) { struct nfs_write_data *p = mempool_alloc(nfs_commit_mempool, SLAB_NOFS); if (p) { memset(p, 0, sizeof(*p)); INIT_LIST_HEAD(&p->pages); - if (pagecount <= ARRAY_SIZE(p->page_array)) - p->pagevec = p->page_array; - else { - p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_NOFS); - if (!p->pagevec) { - mempool_free(p, nfs_commit_mempool); - p = NULL; - } - } } return p; } @@ -117,13 +108,15 @@ void nfs_commit_free(struct nfs_write_data *p) mempool_free(p, nfs_commit_mempool); } -struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount) +struct nfs_write_data *nfs_writedata_alloc(size_t len) { + unsigned int pagecount = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; struct nfs_write_data *p = mempool_alloc(nfs_wdata_mempool, SLAB_NOFS); if (p) { memset(p, 0, sizeof(*p)); INIT_LIST_HEAD(&p->pages); + p->npages = pagecount; if (pagecount <= ARRAY_SIZE(p->page_array)) p->pagevec = p->page_array; else { @@ -208,7 +201,7 @@ static int nfs_writepage_sync(struct nfs_open_context *ctx, struct inode *inode, int result, written = 0; struct nfs_write_data *wdata; - wdata = nfs_writedata_alloc(1); + wdata = nfs_writedata_alloc(wsize); if (!wdata) return -ENOMEM; @@ -999,24 +992,24 @@ static int nfs_flush_multi(struct inode *inode, struct list_head *head, int how) struct nfs_page *req = nfs_list_entry(head->next); struct page *page = req->wb_page; struct nfs_write_data *data; - unsigned int wsize = NFS_SERVER(inode)->wsize; - unsigned int nbytes, offset; + size_t wsize = NFS_SERVER(inode)->wsize, nbytes; + unsigned int offset; int requests = 0; LIST_HEAD(list); nfs_list_remove_request(req); nbytes = req->wb_bytes; - for (;;) { - data = nfs_writedata_alloc(1); + do { + size_t len = min(nbytes, wsize); + + data = nfs_writedata_alloc(len); if (!data) goto out_bad; list_add(&data->pages, &list); requests++; - if (nbytes <= wsize) - break; - nbytes -= wsize; - } + nbytes -= len; + } while (nbytes != 0); atomic_set(&req->wb_complete, requests); ClearPageError(page); @@ -1070,7 +1063,7 @@ static int nfs_flush_one(struct inode *inode, struct list_head *head, int how) struct nfs_write_data *data; unsigned int count; - data = nfs_writedata_alloc(NFS_SERVER(inode)->wpages); + data = nfs_writedata_alloc(NFS_SERVER(inode)->wsize); if (!data) goto out_bad; @@ -1378,7 +1371,7 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how) struct nfs_write_data *data; struct nfs_page *req; - data = nfs_commit_alloc(NFS_SERVER(inode)->wpages); + data = nfs_commit_alloc(); if (!data) goto out_bad; diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 247434553ae8..530b1e6173b1 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -427,7 +427,7 @@ extern int nfs_writeback_done(struct rpc_task *, struct nfs_write_data *); extern void nfs_writedata_release(void *); #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) -struct nfs_write_data *nfs_commit_alloc(unsigned int pagecount); +struct nfs_write_data *nfs_commit_alloc(void); void nfs_commit_free(struct nfs_write_data *p); #endif @@ -478,7 +478,7 @@ static inline int nfs_wb_page(struct inode *inode, struct page* page) /* * Allocate nfs_write_data structures */ -extern struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount); +extern struct nfs_write_data *nfs_writedata_alloc(size_t len); /* * linux/fs/nfs/read.c @@ -492,7 +492,7 @@ extern void nfs_readdata_release(void *data); /* * Allocate nfs_read_data structures */ -extern struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount); +extern struct nfs_read_data *nfs_readdata_alloc(size_t len); /* * linux/fs/nfs3proc.c diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index db9cbf68e12b..41e5a19199e9 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -729,7 +729,7 @@ struct nfs_read_data { struct list_head pages; /* Coalesced read requests */ struct nfs_page *req; /* multi ops per nfs_page */ struct page **pagevec; - unsigned int npages; /* active pages in pagevec */ + unsigned int npages; /* Max length of pagevec */ struct nfs_readargs args; struct nfs_readres res; #ifdef CONFIG_NFS_V4 @@ -748,7 +748,7 @@ struct nfs_write_data { struct list_head pages; /* Coalesced requests we wish to flush */ struct nfs_page *req; /* multi ops per nfs_page */ struct page **pagevec; - unsigned int npages; /* active pages in pagevec */ + unsigned int npages; /* Max length of pagevec */ struct nfs_writeargs args; /* argument struct */ struct nfs_writeres res; /* result struct */ #ifdef CONFIG_NFS_V4 From b8444d00762703e1b6146fce12ce2684885f8bf6 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Fri, 25 Aug 2006 14:00:19 -0700 Subject: [PATCH 18/21] [IA64] correct file descriptor reference counting in perfmon Fix a bug in sys_perfmonctl() whereby it was not correctly decrementing the file descriptor reference count. Signed-off-by: stephane eranian Signed-off-by: Tony Luck --- arch/ia64/kernel/perfmon.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index c7ccd6ee1ddf..84a7e52f56f6 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -4936,13 +4936,15 @@ abort_locked: if (likely(ctx)) { DPRINT(("context unlocked\n")); UNPROTECT_CTX(ctx, flags); - fput(file); } /* copy argument back to user, if needed */ if (call_made && PFM_CMD_RW_ARG(cmd) && copy_to_user(arg, args_k, base_sz*count)) ret = -EFAULT; error_args: + if (file) + fput(file); + kfree(args_k); DPRINT(("cmd=%s ret=%ld\n", PFM_CMD_NAME(cmd), ret)); From 2636255488484e04d6d54303d2b0ec30f7ef7e02 Mon Sep 17 00:00:00 2001 From: Andreas Schwab Date: Mon, 4 Sep 2006 21:56:09 +0200 Subject: [PATCH 19/21] [IA64] Unwire set/get_robust_list The syscalls set/get_robust_list must not be wired up until futex_atomic_cmpxchg_inatomic is implemented. Otherwise the kernel will hang in handle_futex_death. Signed-off-by: Andreas Schwab Signed-off-by: Tony Luck --- arch/ia64/kernel/entry.S | 4 ++-- include/asm-ia64/unistd.h | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S index 12701cf32d99..fef06571be99 100644 --- a/arch/ia64/kernel/entry.S +++ b/arch/ia64/kernel/entry.S @@ -1605,8 +1605,8 @@ sys_call_table: data8 sys_ni_syscall // 1295 reserved for ppoll data8 sys_unshare data8 sys_splice - data8 sys_set_robust_list - data8 sys_get_robust_list + data8 sys_ni_syscall // reserved for set_robust_list + data8 sys_ni_syscall // reserved for get_robust_list data8 sys_sync_file_range // 1300 data8 sys_tee data8 sys_vmsplice diff --git a/include/asm-ia64/unistd.h b/include/asm-ia64/unistd.h index bb0eb727dcd0..f581662c5ab8 100644 --- a/include/asm-ia64/unistd.h +++ b/include/asm-ia64/unistd.h @@ -286,8 +286,7 @@ /* 1294, 1295 reserved for pselect/ppoll */ #define __NR_unshare 1296 #define __NR_splice 1297 -#define __NR_set_robust_list 1298 -#define __NR_get_robust_list 1299 +/* 1298, 1299 reserved for set_robust_list/get_robust_list */ #define __NR_sync_file_range 1300 #define __NR_tee 1301 #define __NR_vmsplice 1302 From 1c7d67073e2d196597f541351bc9b109c8a93528 Mon Sep 17 00:00:00 2001 From: Jack Steiner Date: Thu, 31 Aug 2006 11:34:47 -0500 Subject: [PATCH 20/21] [IA64] Save register stack contents on cpu start The SN PROM uses the register stack in the slave loop. The contents must be preserved for the OS to return to the slave loop via offlining a cpu or for kexec. A 'flushrs" is needed to force the stack to be written to memory prior to changing bspstore. Signed-off-by: Jack Steiner Signed-off-by: Tony Luck --- arch/ia64/kernel/head.S | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/ia64/kernel/head.S b/arch/ia64/kernel/head.S index 29236f0c62b5..44d540efa6d1 100644 --- a/arch/ia64/kernel/head.S +++ b/arch/ia64/kernel/head.S @@ -197,6 +197,11 @@ start_ap: ;; srlz.i ;; + { + flushrs // must be first insn in group + srlz.i + } + ;; /* * Save the region registers, predicate before they get clobbered */ From 38f5745c5a90641079fd5b48600ae63f7ab6edcd Mon Sep 17 00:00:00 2001 From: Jack Steiner Date: Thu, 31 Aug 2006 11:35:57 -0500 Subject: [PATCH 21/21] [IA64] SN fix for cpu hotplug/kexec The sn_cpu_init() is required for cpu initialization on SN platforms. Change __init to __cpuinit so that the function is not freed with init code/data. Signed-off-by: Jack Steiner Signed-off-by: Tony Luck --- arch/ia64/sn/kernel/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/ia64/sn/kernel/setup.c b/arch/ia64/sn/kernel/setup.c index c119e8b620de..5f2dcba7fa8d 100644 --- a/arch/ia64/sn/kernel/setup.c +++ b/arch/ia64/sn/kernel/setup.c @@ -565,7 +565,7 @@ static void __init sn_init_pdas(char **cmdline_p) * Also sets up a few fields in the nodepda. Also known as * platform_cpu_init() by the ia64 machvec code. */ -void __init sn_cpu_init(void) +void __cpuinit sn_cpu_init(void) { int cpuid; int cpuphyid;