From e2690695ce5085677b84fbf2e38d2ed57cad39cd Mon Sep 17 00:00:00 2001 From: Peter Huewe Date: Sun, 15 Apr 2012 02:20:27 +0200 Subject: [PATCH 001/178] fuse: Convert to kstrtoul_from_user This patch replaces the code for getting a number from a userspace buffer with a simple call to kstrtoul_from_user. This makes it easier to read and less error prone. Signed-off-by: Peter Huewe Signed-off-by: Miklos Szeredi --- fs/fuse/control.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/fs/fuse/control.c b/fs/fuse/control.c index 42593c587d48..03ff5b1eba93 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -75,19 +75,13 @@ static ssize_t fuse_conn_limit_write(struct file *file, const char __user *buf, unsigned global_limit) { unsigned long t; - char tmp[32]; unsigned limit = (1 << 16) - 1; int err; - if (*ppos || count >= sizeof(tmp) - 1) + if (*ppos) return -EINVAL; - if (copy_from_user(tmp, buf, count)) - return -EINVAL; - - tmp[count] = '\0'; - - err = strict_strtoul(tmp, 0, &t); + err = kstrtoul_from_user(buf, count, 0, &t); if (err) return err; From 05ba1f0823004e947748523782e9c2f07f3bff0d Mon Sep 17 00:00:00 2001 From: Anatol Pomozov Date: Sun, 22 Apr 2012 18:45:24 -0700 Subject: [PATCH 002/178] fuse: add FALLOCATE operation The fallocate filesystem operation preallocates media space for the given file. If fallocate returns success, then any subsequent write to the given range never fails with a 'not enough space' error. Signed-off-by: Anatol Pomozov Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 33 +++++++++++++++++++++++++++++++++ include/linux/fuse.h | 14 +++++++++++++- 2 files changed, 46 insertions(+), 1 deletion(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 504e61b7fd75..e3fee88831d4 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2171,6 +2171,37 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, return ret; } +long fuse_file_fallocate(struct file *file, int mode, loff_t offset, + loff_t length) +{ + struct fuse_file *ff = file->private_data; + struct fuse_conn *fc = ff->fc; + struct fuse_req *req; + struct fuse_fallocate_in inarg = { + .fh = ff->fh, + .offset = offset, + .length = length, + .mode = mode + }; + int err; + + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + req->in.h.opcode = FUSE_FALLOCATE; + req->in.h.nodeid = ff->nodeid; + req->in.numargs = 1; + req->in.args[0].size = sizeof(inarg); + req->in.args[0].value = &inarg; + fuse_request_send(fc, req); + err = req->out.h.error; + fuse_put_request(fc, req); + + return err; +} +EXPORT_SYMBOL_GPL(fuse_file_fallocate); + static const struct file_operations fuse_file_operations = { .llseek = fuse_file_llseek, .read = do_sync_read, @@ -2188,6 +2219,7 @@ static const struct file_operations fuse_file_operations = { .unlocked_ioctl = fuse_file_ioctl, .compat_ioctl = fuse_file_compat_ioctl, .poll = fuse_file_poll, + .fallocate = fuse_file_fallocate, }; static const struct file_operations fuse_direct_io_file_operations = { @@ -2204,6 +2236,7 @@ static const struct file_operations fuse_direct_io_file_operations = { .unlocked_ioctl = fuse_file_ioctl, .compat_ioctl = fuse_file_compat_ioctl, .poll = fuse_file_poll, + .fallocate = fuse_file_fallocate, /* no splice_read */ }; diff --git a/include/linux/fuse.h b/include/linux/fuse.h index 8f2ab8fef929..9303348965fb 100644 --- a/include/linux/fuse.h +++ b/include/linux/fuse.h @@ -54,6 +54,9 @@ * 7.18 * - add FUSE_IOCTL_DIR flag * - add FUSE_NOTIFY_DELETE + * + * 7.19 + * - add FUSE_FALLOCATE */ #ifndef 
_LINUX_FUSE_H @@ -85,7 +88,7 @@ #define FUSE_KERNEL_VERSION 7 /** Minor version number of this interface */ -#define FUSE_KERNEL_MINOR_VERSION 18 +#define FUSE_KERNEL_MINOR_VERSION 19 /** The node ID of the root inode */ #define FUSE_ROOT_ID 1 @@ -278,6 +281,7 @@ enum fuse_opcode { FUSE_POLL = 40, FUSE_NOTIFY_REPLY = 41, FUSE_BATCH_FORGET = 42, + FUSE_FALLOCATE = 43, /* CUSE specific operations */ CUSE_INIT = 4096, @@ -571,6 +575,14 @@ struct fuse_notify_poll_wakeup_out { __u64 kh; }; +struct fuse_fallocate_in { + __u64 fh; + __u64 offset; + __u64 length; + __u32 mode; + __u32 padding; +}; + struct fuse_in_header { __u32 len; __u32 opcode; From 519c6040ce04474bc893774f866fd8d907b20429 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 26 Apr 2012 10:56:36 +0200 Subject: [PATCH 003/178] fuse: optimize fallocate on permanent failure If the userspace filesystem doesn't support fallocate, remember this and don't send the request next time. Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 7 +++++++ fs/fuse/fuse_i.h | 3 +++ 2 files changed, 10 insertions(+) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index e3fee88831d4..bbfd571b37e1 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2185,6 +2185,9 @@ long fuse_file_fallocate(struct file *file, int mode, loff_t offset, }; int err; + if (fc->no_fallocate) + return -EOPNOTSUPP; + req = fuse_get_req(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -2196,6 +2199,10 @@ long fuse_file_fallocate(struct file *file, int mode, loff_t offset, req->in.args[0].value = &inarg; fuse_request_send(fc, req); err = req->out.h.error; + if (err == -ENOSYS) { + fc->no_fallocate = 1; + err = -EOPNOTSUPP; + } fuse_put_request(fc, req); return err; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 572cefc78012..f38fb795f03c 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -478,6 +478,9 @@ struct fuse_conn { /** Are BSD file locking primitives not implemented by fs? */ unsigned no_flock:1; + /** Is fallocate not implemented by fs? */ + unsigned no_fallocate:1; + /** The number of requests waiting for completion */ atomic_t num_waiting; From c5971456964290da7e98222892797b71ef793e62 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Thu, 3 May 2012 14:48:26 +0100 Subject: [PATCH 004/178] ACPI battery: only refresh the sysfs files when pertinent information changes We only need to regenerate the sysfs files when the capacity units change; avoid the update otherwise. The origin of this issue dates back to 2.6.38: da8aeb92d4853f37e281f11fddf61f9c7d84c3cd (ACPI / Battery: Update information on info notification and resume) cc: Signed-off-by: Andy Whitcroft Tested-by: Ralf Jung Signed-off-by: Len Brown --- drivers/acpi/battery.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/acpi/battery.c b/drivers/acpi/battery.c index 86933ca8b472..7dd3f9fb9f3f 100644 --- a/drivers/acpi/battery.c +++ b/drivers/acpi/battery.c @@ -643,11 +643,19 @@ static int acpi_battery_update(struct acpi_battery *battery) static void acpi_battery_refresh(struct acpi_battery *battery) { + int power_unit; + if (!battery->bat.dev) return; + power_unit = battery->power_unit; + acpi_battery_get_info(battery); - /* The battery may have changed its reporting units. */ + + if (power_unit == battery->power_unit) + return; + + /* The battery has changed its reporting units. 
*/ sysfs_remove_battery(battery); sysfs_add_battery(battery); } From d8e725f356fd5f225ad97f21213fc007e409c9f5 Mon Sep 17 00:00:00 2001 From: Marco Aurelio da Costa Date: Fri, 4 May 2012 18:53:44 +0200 Subject: [PATCH 005/178] ACPI: Ignore invalid _PSS entries, but use valid ones The EliteBook 8560W has non-initialized entries in its _PSS ACPI table. Instead of bailing out when the first non-initialized entry is found, ignore it and use only the valid entries. Only bail out if there is no valid entry at all. [v3: Fixes suggested by Konrad] Signed-off-by: Marco Aurelio da Costa Signed-off-by: Len Brown --- drivers/acpi/processor_perflib.c | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/drivers/acpi/processor_perflib.c b/drivers/acpi/processor_perflib.c index 0af48a8554cd..a093dc163a42 100644 --- a/drivers/acpi/processor_perflib.c +++ b/drivers/acpi/processor_perflib.c @@ -333,6 +333,7 @@ static int acpi_processor_get_performance_states(struct acpi_processor *pr) struct acpi_buffer state = { 0, NULL }; union acpi_object *pss = NULL; int i; + int last_invalid = -1; status = acpi_evaluate_object(pr->handle, "_PSS", NULL, &buffer); @@ -394,14 +395,33 @@ static int acpi_processor_get_performance_states(struct acpi_processor *pr) ((u32)(px->core_frequency * 1000) != (px->core_frequency * 1000))) { printk(KERN_ERR FW_BUG PREFIX - "Invalid BIOS _PSS frequency: 0x%llx MHz\n", - px->core_frequency); - result = -EFAULT; - kfree(pr->performance->states); - goto end; + "Invalid BIOS _PSS frequency found for processor %d: 0x%llx MHz\n", + pr->id, px->core_frequency); + if (last_invalid == -1) + last_invalid = i; + } else { + if (last_invalid != -1) { + /* + * Copy this valid entry over last_invalid entry + */ + memcpy(&(pr->performance->states[last_invalid]), + px, sizeof(struct acpi_processor_px)); + ++last_invalid; + } } } + if (last_invalid == 0) { + printk(KERN_ERR FW_BUG PREFIX + "No valid BIOS _PSS frequency found for processor %d\n", pr->id); + result = -EFAULT; + kfree(pr->performance->states); + pr->performance->states = NULL; + } + + if (last_invalid > 0) + pr->performance->state_count = last_invalid; + end: kfree(buffer.pointer); From 45c72cd73c788dd18c8113d4a404d6b4a01decf1 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Thu, 10 May 2012 19:49:38 +0400 Subject: [PATCH 006/178] fuse: fix stat call on 32 bit platforms Currently we store attr->ino in inode->i_ino, return attr->ino the first time, and then return inode->i_ino until the attribute timeout expires. That's wrong on 32 bit platforms because attr->ino is 64 bit and inode->i_ino is 32 bit in this case. Fix this by saving the 64 bit ino in the fuse_inode structure and returning it every time we call getattr. Also squash attr->ino into inode->i_ino explicitly. 
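To illustrate the squashing idea outside the kernel, here is a minimal userspace sketch (hedged: squash_ino64() and my_ino_t are invented names for illustration only; the patch itself adds fuse_squash_ino() below):

    /*
     * Sketch: fold a 64-bit inode number into a 32-bit ino_t by XOR-ing
     * the high half into the low half, so distinct 64-bit values are
     * less likely to collide than with plain truncation.
     */
    #include <stdint.h>
    #include <stdio.h>

    typedef uint32_t my_ino_t;      /* stands in for a 32-bit ino_t */

    static my_ino_t squash_ino64(uint64_t ino64)
    {
        my_ino_t ino = (my_ino_t) ino64;

        if (sizeof(my_ino_t) < sizeof(uint64_t))
            ino ^= ino64 >> (sizeof(uint64_t) - sizeof(my_ino_t)) * 8;
        return ino;
    }

    int main(void)
    {
        /* 0x100000001 and 0x200000001 truncate to the same 32 bits,
         * but fold to different values (0 and 3). */
        printf("%x %x\n", squash_ino64(0x100000001ULL),
               squash_ino64(0x200000001ULL));
        return 0;
    }
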
Signed-off-by: Pavel Shilovsky Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 1 + fs/fuse/fuse_i.h | 3 +++ fs/fuse/inode.c | 17 ++++++++++++++++- 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index df5ac048dc74..bc438320cac5 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -863,6 +863,7 @@ int fuse_update_attributes(struct inode *inode, struct kstat *stat, if (stat) { generic_fillattr(inode, stat); stat->mode = fi->orig_i_mode; + stat->ino = fi->orig_ino; } } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index f38fb795f03c..771fb6322c07 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -82,6 +82,9 @@ struct fuse_inode { preserve the original mode */ umode_t orig_i_mode; + /** 64 bit inode number */ + u64 orig_ino; + /** Version of last attribute change */ u64 attr_version; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 26783eb2b1fc..a59cf5e673d7 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -91,6 +91,7 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) fi->nlookup = 0; fi->attr_version = 0; fi->writectr = 0; + fi->orig_ino = 0; INIT_LIST_HEAD(&fi->write_files); INIT_LIST_HEAD(&fi->queued_writes); INIT_LIST_HEAD(&fi->writepages); @@ -139,6 +140,18 @@ static int fuse_remount_fs(struct super_block *sb, int *flags, char *data) return 0; } +/* + * ino_t is 32-bits on 32-bit arch. We have to squash the 64-bit value down + * so that it will fit. + */ +static ino_t fuse_squash_ino(u64 ino64) +{ + ino_t ino = (ino_t) ino64; + if (sizeof(ino_t) < sizeof(u64)) + ino ^= ino64 >> (sizeof(u64) - sizeof(ino_t)) * 8; + return ino; +} + void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, u64 attr_valid) { @@ -148,7 +161,7 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, fi->attr_version = ++fc->attr_version; fi->i_time = attr_valid; - inode->i_ino = attr->ino; + inode->i_ino = fuse_squash_ino(attr->ino); inode->i_mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777); set_nlink(inode, attr->nlink); inode->i_uid = attr->uid; @@ -174,6 +187,8 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, fi->orig_i_mode = inode->i_mode; if (!(fc->flags & FUSE_DEFAULT_PERMISSIONS)) inode->i_mode &= ~S_ISVTX; + + fi->orig_ino = attr->ino; } void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, From 203627bbc90377c509e32450c67c5d957ba2d989 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 10 May 2012 19:49:38 +0400 Subject: [PATCH 007/178] fuse: fix blksize calculation Don't use inode->i_blkbits, which might be stale; instead, calculate the blksize information from the freshly obtained attributes. 
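To illustrate the calculation, a hedged userspace sketch (pick_blksize() and ilog2_pow2() are invented names standing in for the logic this patch adds to fuse_fillattr()):

    /*
     * Sketch of the blksize derivation: use the server-supplied blksize
     * when nonzero, otherwise fall back to the superblock's block size
     * bits, then expand back to a byte count.
     */
    #include <stdint.h>

    /* userspace stand-in for the kernel's ilog2(): floor of log2(n) */
    static unsigned int ilog2_pow2(uint32_t n)
    {
        unsigned int bits = 0;

        while (n >>= 1)
            bits++;
        return bits;
    }

    static uint32_t pick_blksize(uint32_t attr_blksize, unsigned int sb_blkbits)
    {
        unsigned int blkbits;

        blkbits = attr_blksize ? ilog2_pow2(attr_blksize) : sb_blkbits;
        return 1u << blkbits;
    }
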
Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index bc438320cac5..334e0b18a014 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -775,6 +775,8 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr, struct kstat *stat) { + unsigned int blkbits; + stat->dev = inode->i_sb->s_dev; stat->ino = attr->ino; stat->mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777); @@ -790,7 +792,13 @@ static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr, stat->ctime.tv_nsec = attr->ctimensec; stat->size = attr->size; stat->blocks = attr->blocks; - stat->blksize = (1 << inode->i_blkbits); + + if (attr->blksize != 0) + blkbits = ilog2(attr->blksize); + else + blkbits = inode->i_sb->s_blocksize_bits; + + stat->blksize = 1 << blkbits; } static int fuse_do_getattr(struct inode *inode, struct kstat *stat, From c3ba9698152b17fdc2c7cd0f7cbeb571e3367e9d Mon Sep 17 00:00:00 2001 From: Dan Magenheimer Date: Mon, 9 Apr 2012 17:06:54 -0600 Subject: [PATCH 008/178] mm: frontswap: add frontswap header file Frontswap is the alter ego of cleancache, the "yang" to cleancache's "yin"... and more precisely frontswap is the provider of anonymous pages to transcendent memory to nicely complement cleancache's providing of clean pagecache pages to transcendent memory. For optimal use of transcendent memory, both are necessary... because a kernel under memory pressure first reclaims clean pagecache pages and, when under more memory pressure, starts swapping anonymous pages. Frontswap and cleancache (which was merged at 3.0) are the "frontends" and the only necessary changes to the core kernel for transcendent memory; all other supporting code -- the "backends" -- is implemented as drivers. See the LWN.net article "Transcendent memory in a nutshell" for a detailed overview of frontswap and related kernel parts: https://lwn.net/Articles/454795/ Frontswap code was first posted publicly in January 2009 and on LKML in May 2009, and has remained functionally stable for nearly three years now. It is barely invasive, touching only the swap subsystem and adds less than 100 lines of code to existing swap subsystem code files. It has improved syntactically substantially between V1 and this posting of V14, thanks to the review of a few kernel developers, and has adapted easily to at least one major swap subsystem change. As of 3.4, there are three in-tree users of frontswap patiently waiting for this patchset and for CONFIG_FRONTSWAP to be enabled: zcache (staging driver merged at 2.6.39), Xen tmem (merged at 3.0 and 3.1) and RAMster (staging driver merged at 3.4). In addition, a RFC has been posted for a KVM backend. The frontswap patchset has been in linux-next since next-110603. Earlier versions of frontswap already ship in the Oracle Unbreakable Enterprise Kernel and SuSE SLES. This patch, 1of4, provides the header file for the core code for frontswap that interfaces between the hooks in the swap subsystem and a frontswap backend via frontswap_ops. 
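As a hedged sketch of how a backend is expected to hook in (the my_* callbacks and module are invented for illustration; struct frontswap_ops and frontswap_register_ops() are as defined in the header below):

    /* Minimal do-nothing backend registering itself with frontswap. */
    #include <linux/frontswap.h>
    #include <linux/module.h>

    static void my_init(unsigned type)
    {
        /* prepare per-swap-device ("type") state here */
    }

    static int my_put_page(unsigned type, pgoff_t offset, struct page *page)
    {
        return -1;      /* reject everything: page goes to disk as usual */
    }

    static int my_get_page(unsigned type, pgoff_t offset, struct page *page)
    {
        return -1;      /* nothing was stored, so nothing to return */
    }

    static void my_invalidate_page(unsigned type, pgoff_t offset) { }
    static void my_invalidate_area(unsigned type) { }

    static struct frontswap_ops my_ops = {
        .init            = my_init,
        .put_page        = my_put_page,
        .get_page        = my_get_page,
        .invalidate_page = my_invalidate_page,
        .invalidate_area = my_invalidate_area,
    };

    static int __init my_backend_init(void)
    {
        /* returns the previously registered ops, if any */
        struct frontswap_ops old = frontswap_register_ops(&my_ops);

        (void)old;      /* a real backend would chain or reject 'old' */
        return 0;
    }
    module_init(my_backend_init);
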
--- New file added: include/linux/frontswap.h [v14: add support for writethrough, per suggestion by aarcange@redhat.com] [v14: rebase to 3.4-rc2] [v11: konrad.wilk@oracle.com: squashed s/flush/invalidate/ in] [v10: no change] [v9: akpm@linux-foundation.org: change "flush" to "invalidate", part 1] [v8: rebase to 3.0-rc4] [v7: rebase to 3.0-rc3] [v7: JBeulich@novell.com: new static inlines resolve to no-ops if not config'd] [v7: JBeulich@novell.com: avoid redundant shifts/divides for *_bit lib calls] [v6: rebase to 3.1-rc1] [v5: no change from v4] [v4: rebase to 2.6.39] Signed-off-by: Dan Magenheimer Reviewed-by: Andrew Morton Acked-by: Jan Beulich Acked-by: Seth Jennings Cc: Jeremy Fitzhardinge Cc: Hugh Dickins Cc: Johannes Weiner Cc: Nitin Gupta Cc: Matthew Wilcox Cc: Chris Mason Cc: Rik Riel [v15: int/bool on some functions] Signed-off-by: Konrad Wilk --- include/linux/frontswap.h | 127 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 127 insertions(+) create mode 100644 include/linux/frontswap.h diff --git a/include/linux/frontswap.h b/include/linux/frontswap.h new file mode 100644 index 000000000000..68ff7af5c5fb --- /dev/null +++ b/include/linux/frontswap.h @@ -0,0 +1,127 @@ +#ifndef _LINUX_FRONTSWAP_H +#define _LINUX_FRONTSWAP_H + +#include +#include +#include + +struct frontswap_ops { + void (*init)(unsigned); + int (*put_page)(unsigned, pgoff_t, struct page *); + int (*get_page)(unsigned, pgoff_t, struct page *); + void (*invalidate_page)(unsigned, pgoff_t); + void (*invalidate_area)(unsigned); +}; + +extern bool frontswap_enabled; +extern struct frontswap_ops + frontswap_register_ops(struct frontswap_ops *ops); +extern void frontswap_shrink(unsigned long); +extern unsigned long frontswap_curr_pages(void); +extern void frontswap_writethrough(bool); + +extern void __frontswap_init(unsigned type); +extern int __frontswap_put_page(struct page *page); +extern int __frontswap_get_page(struct page *page); +extern void __frontswap_invalidate_page(unsigned, pgoff_t); +extern void __frontswap_invalidate_area(unsigned); + +#ifdef CONFIG_FRONTSWAP + +static inline bool frontswap_test(struct swap_info_struct *sis, pgoff_t offset) +{ + bool ret = false; + + if (frontswap_enabled && sis->frontswap_map) + ret = test_bit(offset, sis->frontswap_map); + return ret; +} + +static inline void frontswap_set(struct swap_info_struct *sis, pgoff_t offset) +{ + if (frontswap_enabled && sis->frontswap_map) + set_bit(offset, sis->frontswap_map); +} + +static inline void frontswap_clear(struct swap_info_struct *sis, pgoff_t offset) +{ + if (frontswap_enabled && sis->frontswap_map) + clear_bit(offset, sis->frontswap_map); +} + +static inline void frontswap_map_set(struct swap_info_struct *p, + unsigned long *map) +{ + p->frontswap_map = map; +} + +static inline unsigned long *frontswap_map_get(struct swap_info_struct *p) +{ + return p->frontswap_map; +} +#else +/* all inline routines become no-ops and all externs are ignored */ + +#define frontswap_enabled (0) + +static inline bool frontswap_test(struct swap_info_struct *sis, pgoff_t offset) +{ + return false; +} + +static inline void frontswap_set(struct swap_info_struct *sis, pgoff_t offset) +{ +} + +static inline void frontswap_clear(struct swap_info_struct *sis, pgoff_t offset) +{ +} + +static inline void frontswap_map_set(struct swap_info_struct *p, + unsigned long *map) +{ +} + +static inline unsigned long *frontswap_map_get(struct swap_info_struct *p) +{ + return NULL; +} +#endif + +static inline int frontswap_put_page(struct page *page) +{ + int 
ret = -1; + + if (frontswap_enabled) + ret = __frontswap_put_page(page); + return ret; +} + +static inline int frontswap_get_page(struct page *page) +{ + int ret = -1; + + if (frontswap_enabled) + ret = __frontswap_get_page(page); + return ret; +} + +static inline void frontswap_invalidate_page(unsigned type, pgoff_t offset) +{ + if (frontswap_enabled) + __frontswap_invalidate_page(type, offset); +} + +static inline void frontswap_invalidate_area(unsigned type) +{ + if (frontswap_enabled) + __frontswap_invalidate_area(type); +} + +static inline void frontswap_init(unsigned type) +{ + if (frontswap_enabled) + __frontswap_init(type); +} + +#endif /* _LINUX_FRONTSWAP_H */ From 38b5faf4b178d5279b1fca5d7dadc68881342660 Mon Sep 17 00:00:00 2001 From: Dan Magenheimer Date: Mon, 9 Apr 2012 17:08:06 -0600 Subject: [PATCH 009/178] mm: frontswap: core swap subsystem hooks and headers This patch, 2of4, contains the changes to the core swap subsystem. This includes: (1) makes available core swap data structures (swap_lock, swap_list and swap_info) that are needed by frontswap.c but we don't need to expose them to the dozens of files that include swap.h so we create a new swapfile.h just to extern-ify these and modify their declarations to non-static (2) adds frontswap-related elements to swap_info_struct. Frontswap_map points to vzalloc'ed one-bit-per-swap-page metadata that indicates whether the swap page is in frontswap or in the device and frontswap_pages counts how many pages are in frontswap. (3) adds hooks in the swap subsystem and extends try_to_unuse so that frontswap_shrink can do a "partial swapoff". Note that a failed frontswap_map allocation is safe... failure is noted by lack of "FS" in the subsequent printk. --- [v14: rebase to 3.4-rc2] [v10: no change] [v9: akpm@linux-foundation.org: mark some statics __read_mostly] [v9: akpm@linux-foundation.org: add clarifying comments] [v9: akpm@linux-foundation.org: no need to loop repeating try_to_unuse] [v9: error27@gmail.com: remove superfluous check for NULL] [v8: rebase to 3.0-rc4] [v8: kamezawa.hiroyu@jp.fujitsu.com: change counter to atomic_t to avoid races] [v8: kamezawa.hiroyu@jp.fujitsu.com: comment to clarify informational counters] [v7: rebase to 3.0-rc3] [v7: JBeulich@novell.com: add new swap struct elements only if config'd] [v6: rebase to 3.0-rc1] [v6: lliubbo@gmail.com: fix null pointer deref if vzalloc fails] [v6: konrad.wilk@oracl.com: various checks and code clarifications/comments] [v5: no change from v4] [v4: rebase to 2.6.39] Signed-off-by: Dan Magenheimer Reviewed-by: Kamezawa Hiroyuki Acked-by: Jan Beulich Acked-by: Seth Jennings Cc: Jeremy Fitzhardinge Cc: Hugh Dickins Cc: Johannes Weiner Cc: Nitin Gupta Cc: Matthew Wilcox Cc: Chris Mason Cc: Rik Riel Cc: Andrew Morton [v11: Rebased, fixed mm/swapfile.c context change] Signed-off-by: Konrad Rzeszutek Wilk --- include/linux/swap.h | 4 +++ include/linux/swapfile.h | 13 ++++++++++ mm/page_io.c | 12 +++++++++ mm/swapfile.c | 54 ++++++++++++++++++++++++++++++---------- 4 files changed, 70 insertions(+), 13 deletions(-) create mode 100644 include/linux/swapfile.h diff --git a/include/linux/swap.h b/include/linux/swap.h index b1fd5c7925fe..50a55e2d58ec 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -197,6 +197,10 @@ struct swap_info_struct { struct block_device *bdev; /* swap device or bdev of swap file */ struct file *swap_file; /* seldom referenced */ unsigned int old_block_size; /* seldom referenced */ +#ifdef CONFIG_FRONTSWAP + unsigned long *frontswap_map; /* 
frontswap in-use, one bit per page */ + atomic_t frontswap_pages; /* frontswap pages in-use counter */ +#endif }; struct swap_list_t { diff --git a/include/linux/swapfile.h b/include/linux/swapfile.h new file mode 100644 index 000000000000..e282624e8c10 --- /dev/null +++ b/include/linux/swapfile.h @@ -0,0 +1,13 @@ +#ifndef _LINUX_SWAPFILE_H +#define _LINUX_SWAPFILE_H + +/* + * these were static in swapfile.c but frontswap.c needs them and we don't + * want to expose them to the dozens of source files that include swap.h + */ +extern spinlock_t swap_lock; +extern struct swap_list_t swap_list; +extern struct swap_info_struct *swap_info[]; +extern int try_to_unuse(unsigned int, bool, unsigned long); + +#endif /* _LINUX_SWAPFILE_H */ diff --git a/mm/page_io.c b/mm/page_io.c index dc76b4d0611e..651a91259317 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -18,6 +18,7 @@ #include #include #include +#include #include static struct bio *get_swap_bio(gfp_t gfp_flags, @@ -98,6 +99,12 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) unlock_page(page); goto out; } + if (frontswap_put_page(page) == 0) { + set_page_writeback(page); + unlock_page(page); + end_page_writeback(page); + goto out; + } bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write); if (bio == NULL) { set_page_dirty(page); @@ -122,6 +129,11 @@ int swap_readpage(struct page *page) VM_BUG_ON(!PageLocked(page)); VM_BUG_ON(PageUptodate(page)); + if (frontswap_get_page(page) == 0) { + SetPageUptodate(page); + unlock_page(page); + goto out; + } bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read); if (bio == NULL) { unlock_page(page); diff --git a/mm/swapfile.c b/mm/swapfile.c index fafc26d1b1dc..9c7be87175c5 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -31,6 +31,8 @@ #include #include #include +#include +#include #include #include @@ -42,7 +44,7 @@ static bool swap_count_continued(struct swap_info_struct *, pgoff_t, static void free_swap_count_continuations(struct swap_info_struct *); static sector_t map_swap_entry(swp_entry_t, struct block_device**); -static DEFINE_SPINLOCK(swap_lock); +DEFINE_SPINLOCK(swap_lock); static unsigned int nr_swapfiles; long nr_swap_pages; long total_swap_pages; @@ -53,9 +55,9 @@ static const char Unused_file[] = "Unused swap file entry "; static const char Bad_offset[] = "Bad swap offset entry "; static const char Unused_offset[] = "Unused swap offset entry "; -static struct swap_list_t swap_list = {-1, -1}; +struct swap_list_t swap_list = {-1, -1}; -static struct swap_info_struct *swap_info[MAX_SWAPFILES]; +struct swap_info_struct *swap_info[MAX_SWAPFILES]; static DEFINE_MUTEX(swapon_mutex); @@ -556,6 +558,7 @@ static unsigned char swap_entry_free(struct swap_info_struct *p, swap_list.next = p->type; nr_swap_pages++; p->inuse_pages--; + frontswap_invalidate_page(p->type, offset); if ((p->flags & SWP_BLKDEV) && disk->fops->swap_slot_free_notify) disk->fops->swap_slot_free_notify(p->bdev, offset); @@ -1016,11 +1019,12 @@ static int unuse_mm(struct mm_struct *mm, } /* - * Scan swap_map from current position to next entry still in use. + * Scan swap_map (or frontswap_map if frontswap parameter is true) + * from current position to next entry still in use. * Recycle to start on reaching the end, returning 0 when empty. 
*/ static unsigned int find_next_to_unuse(struct swap_info_struct *si, - unsigned int prev) + unsigned int prev, bool frontswap) { unsigned int max = si->max; unsigned int i = prev; @@ -1046,6 +1050,12 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, prev = 0; i = 1; } + if (frontswap) { + if (frontswap_test(si, i)) + break; + else + continue; + } count = si->swap_map[i]; if (count && swap_count(count) != SWAP_MAP_BAD) break; @@ -1057,8 +1067,12 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, * We completely avoid races by reading each swap page in advance, * and then search for the process using it. All the necessary * page table adjustments can then be made atomically. + * + * if the boolean frontswap is true, only unuse pages_to_unuse pages; + * pages_to_unuse==0 means all pages; ignored if frontswap is false */ -static int try_to_unuse(unsigned int type) +int try_to_unuse(unsigned int type, bool frontswap, + unsigned long pages_to_unuse) { struct swap_info_struct *si = swap_info[type]; struct mm_struct *start_mm; @@ -1091,7 +1105,7 @@ static int try_to_unuse(unsigned int type) * one pass through swap_map is enough, but not necessarily: * there are races when an instance of an entry might be missed. */ - while ((i = find_next_to_unuse(si, i)) != 0) { + while ((i = find_next_to_unuse(si, i, frontswap)) != 0) { if (signal_pending(current)) { retval = -EINTR; break; @@ -1258,6 +1272,10 @@ static int try_to_unuse(unsigned int type) * interactive performance. */ cond_resched(); + if (frontswap && pages_to_unuse > 0) { + if (!--pages_to_unuse) + break; + } } mmput(start_mm); @@ -1517,7 +1535,8 @@ bad_bmap: } static void enable_swap_info(struct swap_info_struct *p, int prio, - unsigned char *swap_map) + unsigned char *swap_map, + unsigned long *frontswap_map) { int i, prev; @@ -1527,6 +1546,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio, else p->prio = --least_priority; p->swap_map = swap_map; + frontswap_map_set(p, frontswap_map); p->flags |= SWP_WRITEOK; nr_swap_pages += p->pages; total_swap_pages += p->pages; @@ -1543,6 +1563,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio, swap_list.head = swap_list.next = p->type; else swap_info[prev]->next = p->type; + frontswap_init(p->type); spin_unlock(&swap_lock); } @@ -1616,7 +1637,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) spin_unlock(&swap_lock); oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX); - err = try_to_unuse(type); + err = try_to_unuse(type, false, 0); /* force all pages to be unused */ compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj); if (err) { @@ -1627,7 +1648,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) * sys_swapoff for this swap_info_struct at this point. 
*/ /* re-insert swap space back into swap_list */ - enable_swap_info(p, p->prio, p->swap_map); + enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p)); goto out_dput; } @@ -1653,9 +1674,11 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) swap_map = p->swap_map; p->swap_map = NULL; p->flags = 0; + frontswap_invalidate_area(type); spin_unlock(&swap_lock); mutex_unlock(&swapon_mutex); vfree(swap_map); + vfree(frontswap_map_get(p)); /* Destroy swap account informatin */ swap_cgroup_swapoff(type); @@ -2019,6 +2042,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) sector_t span; unsigned long maxpages; unsigned char *swap_map = NULL; + unsigned long *frontswap_map = NULL; struct page *page = NULL; struct inode *inode = NULL; @@ -2102,6 +2126,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) error = nr_extents; goto bad_swap; } + /* frontswap enabled? set up bit-per-page map for frontswap */ + if (frontswap_enabled) + frontswap_map = vzalloc(maxpages / sizeof(long)); if (p->bdev) { if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { @@ -2117,14 +2144,15 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (swap_flags & SWAP_FLAG_PREFER) prio = (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; - enable_swap_info(p, prio, swap_map); + enable_swap_info(p, prio, swap_map, frontswap_map); printk(KERN_INFO "Adding %uk swap on %s. " - "Priority:%d extents:%d across:%lluk %s%s\n", + "Priority:%d extents:%d across:%lluk %s%s%s\n", p->pages<<(PAGE_SHIFT-10), name, p->prio, nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), (p->flags & SWP_SOLIDSTATE) ? "SS" : "", - (p->flags & SWP_DISCARDABLE) ? "D" : ""); + (p->flags & SWP_DISCARDABLE) ? "D" : "", + (frontswap_map) ? "FS" : ""); mutex_unlock(&swapon_mutex); atomic_inc(&proc_poll_event); From 29f233cfffe7fbc6672938117ce7e4154a2f515f Mon Sep 17 00:00:00 2001 From: Dan Magenheimer Date: Mon, 9 Apr 2012 17:09:27 -0600 Subject: [PATCH 010/178] mm: frontswap: core frontswap functionality This patch, 3of4, provides the core frontswap code that interfaces between the hooks in the swap subsystem and a frontswap backend via frontswap_ops. 
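As a usage sketch of the entry points added here (hedged: my_page_limit and my_enforce_limit are invented names), a backend can combine the informational and shrink hooks to cap its footprint:

    /*
     * Sketch: push frontswap usage back under a backend-chosen cap using
     * frontswap_curr_pages() and frontswap_shrink() from this patch.
     */
    #include <linux/frontswap.h>

    static unsigned long my_page_limit = 16384;     /* invented knob */

    static void my_enforce_limit(void)
    {
        unsigned long cur = frontswap_curr_pages();

        if (cur > my_page_limit)
            /* "partial swapoff": unuse pages until at most
             * my_page_limit remain in frontswap */
            frontswap_shrink(my_page_limit);
    }
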
--- New file added: mm/frontswap.c [v14: add support for writethrough, per suggestion by aarcange@redhat.com] [v11: sjenning@linux.vnet.ibm.com: s/puts/failed_puts/] [v10: sjenning@linux.vnet.ibm.com: fix debugfs calls on 32-bit] [v9: akpm@linux-foundation.org: change "flush" to "invalidate", part 1] [v9: akpm@linux-foundation.org: mark some statics __read_mostly] [v9: akpm@linux-foundation.org: add clarifying comments] [v9: akpm@linux-foundation.org: no need to loop repeating try_to_unuse] [v9: error27@gmail.com: remove superfluous check for NULL] [v8: rebase to 3.0-rc4] [v8: kamezawa.hiroyu@jp.fujitsu.com: add comment to clarify find_next_to_unuse] [v7: rebase to 3.0-rc3] [v7: JBeulich@novell.com: use new static inlines, no-ops if not config'd] [v6: rebase to 3.1-rc1] [v6: lliubbo@gmail.com: use vzalloc] [v6: lliubbo@gmail.com: fix null pointer deref if vzalloc fails] [v6: konrad.wilk@oracl.com: various checks and code clarifications/comments] [v4: rebase to 2.6.39] Signed-off-by: Dan Magenheimer Acked-by: Jan Beulich Acked-by: Seth Jennings Cc: Jeremy Fitzhardinge Cc: Hugh Dickins Cc: Johannes Weiner Cc: Nitin Gupta Cc: Matthew Wilcox Cc: Chris Mason Cc: Rik Riel Cc: Andrew Morton [v12: Squashed s/flush/invalidate/ in] [v15: A bit of cleanup and separate DEBUGFS] Signed-off-by: Konrad Wilk --- mm/frontswap.c | 314 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 314 insertions(+) create mode 100644 mm/frontswap.c diff --git a/mm/frontswap.c b/mm/frontswap.c new file mode 100644 index 000000000000..8c0a5f8683f0 --- /dev/null +++ b/mm/frontswap.c @@ -0,0 +1,314 @@ +/* + * Frontswap frontend + * + * This code provides the generic "frontend" layer to call a matching + * "backend" driver implementation of frontswap. See + * Documentation/vm/frontswap.txt for more information. + * + * Copyright (C) 2009-2012 Oracle Corp. All rights reserved. + * Author: Dan Magenheimer + * + * This work is licensed under the terms of the GNU GPL, version 2. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * frontswap_ops is set by frontswap_register_ops to contain the pointers + * to the frontswap "backend" implementation functions. + */ +static struct frontswap_ops frontswap_ops __read_mostly; + +/* + * This global enablement flag reduces overhead on systems where frontswap_ops + * has not been registered, so is preferred to the slower alternative: a + * function call that checks a non-global. + */ +bool frontswap_enabled __read_mostly; +EXPORT_SYMBOL(frontswap_enabled); + +/* + * If enabled, frontswap_put will return failure even on success. As + * a result, the swap subsystem will always write the page to swap, in + * effect converting frontswap into a writethrough cache. In this mode, + * there is no direct reduction in swap writes, but a frontswap backend + * can unilaterally "reclaim" any pages in use with no data loss, thus + * providing increased control over maximum memory usage due to frontswap. + */ +static bool frontswap_writethrough_enabled __read_mostly; + +#ifdef CONFIG_DEBUG_FS +/* + * Counters available via /sys/kernel/debug/frontswap (if debugfs is + * properly configured). These are for information only so are not protected + * against increment races. 
+ */ +static u64 frontswap_gets; +static u64 frontswap_succ_puts; +static u64 frontswap_failed_puts; +static u64 frontswap_invalidates; + +static inline void inc_frontswap_gets(void) { + frontswap_gets++; +} +static inline void inc_frontswap_succ_puts(void) { + frontswap_succ_puts++; +} +static inline void inc_frontswap_failed_puts(void) { + frontswap_failed_puts++; +} +static inline void inc_frontswap_invalidates(void) { + frontswap_invalidates++; +} +#else +static inline void inc_frontswap_gets(void) { } +static inline void inc_frontswap_succ_puts(void) { } +static inline void inc_frontswap_failed_puts(void) { } +static inline void inc_frontswap_invalidates(void) { } +#endif +/* + * Register operations for frontswap, returning previous thus allowing + * detection of multiple backends and possible nesting. + */ +struct frontswap_ops frontswap_register_ops(struct frontswap_ops *ops) +{ + struct frontswap_ops old = frontswap_ops; + + frontswap_ops = *ops; + frontswap_enabled = true; + return old; +} +EXPORT_SYMBOL(frontswap_register_ops); + +/* + * Enable/disable frontswap writethrough (see above). + */ +void frontswap_writethrough(bool enable) +{ + frontswap_writethrough_enabled = enable; +} +EXPORT_SYMBOL(frontswap_writethrough); + +/* + * Called when a swap device is swapon'd. + */ +void __frontswap_init(unsigned type) +{ + struct swap_info_struct *sis = swap_info[type]; + + BUG_ON(sis == NULL); + if (sis->frontswap_map == NULL) + return; + if (frontswap_enabled) + (*frontswap_ops.init)(type); +} +EXPORT_SYMBOL(__frontswap_init); + +/* + * "Put" data from a page to frontswap and associate it with the page's + * swaptype and offset. Page must be locked and in the swap cache. + * If frontswap already contains a page with matching swaptype and + * offset, the frontswap implmentation may either overwrite the data and + * return success or invalidate the page from frontswap and return failure. + */ +int __frontswap_put_page(struct page *page) +{ + int ret = -1, dup = 0; + swp_entry_t entry = { .val = page_private(page), }; + int type = swp_type(entry); + struct swap_info_struct *sis = swap_info[type]; + pgoff_t offset = swp_offset(entry); + + BUG_ON(!PageLocked(page)); + BUG_ON(sis == NULL); + if (frontswap_test(sis, offset)) + dup = 1; + ret = (*frontswap_ops.put_page)(type, offset, page); + if (ret == 0) { + frontswap_set(sis, offset); + inc_frontswap_succ_puts(); + if (!dup) + atomic_inc(&sis->frontswap_pages); + } else if (dup) { + /* + failed dup always results in automatic invalidate of + the (older) page from frontswap + */ + frontswap_clear(sis, offset); + atomic_dec(&sis->frontswap_pages); + inc_frontswap_failed_puts(); + } else + inc_frontswap_failed_puts(); + if (frontswap_writethrough_enabled) + /* report failure so swap also writes to swap device */ + ret = -1; + return ret; +} +EXPORT_SYMBOL(__frontswap_put_page); + +/* + * "Get" data from frontswap associated with swaptype and offset that were + * specified when the data was put to frontswap and use it to fill the + * specified page with data. Page must be locked and in the swap cache. 
+ */ +int __frontswap_get_page(struct page *page) +{ + int ret = -1; + swp_entry_t entry = { .val = page_private(page), }; + int type = swp_type(entry); + struct swap_info_struct *sis = swap_info[type]; + pgoff_t offset = swp_offset(entry); + + BUG_ON(!PageLocked(page)); + BUG_ON(sis == NULL); + if (frontswap_test(sis, offset)) + ret = (*frontswap_ops.get_page)(type, offset, page); + if (ret == 0) + inc_frontswap_gets(); + return ret; +} +EXPORT_SYMBOL(__frontswap_get_page); + +/* + * Invalidate any data from frontswap associated with the specified swaptype + * and offset so that a subsequent "get" will fail. + */ +void __frontswap_invalidate_page(unsigned type, pgoff_t offset) +{ + struct swap_info_struct *sis = swap_info[type]; + + BUG_ON(sis == NULL); + if (frontswap_test(sis, offset)) { + (*frontswap_ops.invalidate_page)(type, offset); + atomic_dec(&sis->frontswap_pages); + frontswap_clear(sis, offset); + inc_frontswap_invalidates(); + } +} +EXPORT_SYMBOL(__frontswap_invalidate_page); + +/* + * Invalidate all data from frontswap associated with all offsets for the + * specified swaptype. + */ +void __frontswap_invalidate_area(unsigned type) +{ + struct swap_info_struct *sis = swap_info[type]; + + BUG_ON(sis == NULL); + if (sis->frontswap_map == NULL) + return; + (*frontswap_ops.invalidate_area)(type); + atomic_set(&sis->frontswap_pages, 0); + memset(sis->frontswap_map, 0, sis->max / sizeof(long)); +} +EXPORT_SYMBOL(__frontswap_invalidate_area); + +/* + * Frontswap, like a true swap device, may unnecessarily retain pages + * under certain circumstances; "shrink" frontswap is essentially a + * "partial swapoff" and works by calling try_to_unuse to attempt to + * unuse enough frontswap pages to attempt to -- subject to memory + * constraints -- reduce the number of pages in frontswap to the + * number given in the parameter target_pages. + */ +void frontswap_shrink(unsigned long target_pages) +{ + struct swap_info_struct *si = NULL; + int si_frontswap_pages; + unsigned long total_pages = 0, total_pages_to_unuse; + unsigned long pages = 0, pages_to_unuse = 0; + int type; + bool locked = false; + + /* + * we don't want to hold swap_lock while doing a very + * lengthy try_to_unuse, but swap_list may change + * so restart scan from swap_list.head each time + */ + spin_lock(&swap_lock); + locked = true; + total_pages = 0; + for (type = swap_list.head; type >= 0; type = si->next) { + si = swap_info[type]; + total_pages += atomic_read(&si->frontswap_pages); + } + if (total_pages <= target_pages) + goto out; + total_pages_to_unuse = total_pages - target_pages; + for (type = swap_list.head; type >= 0; type = si->next) { + si = swap_info[type]; + si_frontswap_pages = atomic_read(&si->frontswap_pages); + if (total_pages_to_unuse < si_frontswap_pages) + pages = pages_to_unuse = total_pages_to_unuse; + else { + pages = si_frontswap_pages; + pages_to_unuse = 0; /* unuse all */ + } + /* ensure there is enough RAM to fetch pages from frontswap */ + if (security_vm_enough_memory_mm(current->mm, pages)) + continue; + vm_unacct_memory(pages); + break; + } + if (type < 0) + goto out; + locked = false; + spin_unlock(&swap_lock); + try_to_unuse(type, true, pages_to_unuse); +out: + if (locked) + spin_unlock(&swap_lock); + return; +} +EXPORT_SYMBOL(frontswap_shrink); + +/* + * Count and return the number of frontswap pages across all + * swap devices. This is exported so that backend drivers can + * determine current usage without reading debugfs. 
+ */ +unsigned long frontswap_curr_pages(void) +{ + int type; + unsigned long totalpages = 0; + struct swap_info_struct *si = NULL; + + spin_lock(&swap_lock); + for (type = swap_list.head; type >= 0; type = si->next) { + si = swap_info[type]; + totalpages += atomic_read(&si->frontswap_pages); + } + spin_unlock(&swap_lock); + return totalpages; +} +EXPORT_SYMBOL(frontswap_curr_pages); + +static int __init init_frontswap(void) +{ +#ifdef CONFIG_DEBUG_FS + struct dentry *root = debugfs_create_dir("frontswap", NULL); + if (root == NULL) + return -ENXIO; + debugfs_create_u64("gets", S_IRUGO, root, &frontswap_gets); + debugfs_create_u64("succ_puts", S_IRUGO, root, &frontswap_succ_puts); + debugfs_create_u64("failed_puts", S_IRUGO, root, + &frontswap_failed_puts); + debugfs_create_u64("invalidates", S_IRUGO, + root, &frontswap_invalidates); +#endif + return 0; +} + +module_init(init_frontswap); From 27c6aec214264992603526d47da9dabddf3521b3 Mon Sep 17 00:00:00 2001 From: Dan Magenheimer Date: Mon, 9 Apr 2012 17:10:34 -0600 Subject: [PATCH 011/178] mm: frontswap: config and doc files This patch 4of4 adds configuration and documentation files including a FAQ. [v14: updated docs/FAQ to use zcache and RAMster as examples] [v10: no change] [v9: akpm@linux-foundation.org: sysfs->debugfs; no longer need Doc/ABI file] [v8: rebase to 3.0-rc4] [v7: rebase to 3.0-rc3] [v6: rebase to 3.0-rc1] [v5: change config default to n] [v4: rebase to 2.6.39] Signed-off-by: Dan Magenheimer Acked-by: Jan Beulich Acked-by: Seth Jennings Cc: Jeremy Fitzhardinge Cc: Hugh Dickins Cc: Johannes Weiner Cc: Nitin Gupta Cc: Matthew Wilcox Cc: Chris Mason Cc: Rik Riel Cc: Andrew Morton Signed-off-by: Konrad Rzeszutek Wilk --- Documentation/vm/frontswap.txt | 278 +++++++++++++++++++++++++++++++++ mm/Kconfig | 17 ++ mm/Makefile | 1 + 3 files changed, 296 insertions(+) create mode 100644 Documentation/vm/frontswap.txt diff --git a/Documentation/vm/frontswap.txt b/Documentation/vm/frontswap.txt new file mode 100644 index 000000000000..a9f731af0fac --- /dev/null +++ b/Documentation/vm/frontswap.txt @@ -0,0 +1,278 @@ +Frontswap provides a "transcendent memory" interface for swap pages. +In some environments, dramatic performance savings may be obtained because +swapped pages are saved in RAM (or a RAM-like device) instead of a swap disk. + +(Note, frontswap -- and cleancache (merged at 3.0) -- are the "frontends" +and the only necessary changes to the core kernel for transcendent memory; +all other supporting code -- the "backends" -- is implemented as drivers. +See the LWN.net article "Transcendent memory in a nutshell" for a detailed +overview of frontswap and related kernel parts: +https://lwn.net/Articles/454795/ ) + +Frontswap is so named because it can be thought of as the opposite of +a "backing" store for a swap device. The storage is assumed to be +a synchronous concurrency-safe page-oriented "pseudo-RAM device" conforming +to the requirements of transcendent memory (such as Xen's "tmem", or +in-kernel compressed memory, aka "zcache", or future RAM-like devices); +this pseudo-RAM device is not directly accessible or addressable by the +kernel and is of unknown and possibly time-varying size. The driver +links itself to frontswap by calling frontswap_register_ops to set the +frontswap_ops funcs appropriately and the functions it provides must +conform to certain policies as follows: + +An "init" prepares the device to receive frontswap pages associated +with the specified swap device number (aka "type"). 
A "put_page" will +copy the page to transcendent memory and associate it with the type and +offset associated with the page. A "get_page" will copy the page, if found, +from transcendent memory into kernel memory, but will NOT remove the page +from from transcendent memory. An "invalidate_page" will remove the page +from transcendent memory and an "invalidate_area" will remove ALL pages +associated with the swap type (e.g., like swapoff) and notify the "device" +to refuse further puts with that swap type. + +Once a page is successfully put, a matching get on the page will normally +succeed. So when the kernel finds itself in a situation where it needs +to swap out a page, it first attempts to use frontswap. If the put returns +success, the data has been successfully saved to transcendent memory and +a disk write and, if the data is later read back, a disk read are avoided. +If a put returns failure, transcendent memory has rejected the data, and the +page can be written to swap as usual. + +If a backend chooses, frontswap can be configured as a "writethrough +cache" by calling frontswap_writethrough(). In this mode, the reduction +in swap device writes is lost (and also a non-trivial performance advantage) +in order to allow the backend to arbitrarily "reclaim" space used to +store frontswap pages to more completely manage its memory usage. + +Note that if a page is put and the page already exists in transcendent memory +(a "duplicate" put), either the put succeeds and the data is overwritten, +or the put fails AND the page is invalidated. This ensures stale data may +never be obtained from frontswap. + +If properly configured, monitoring of frontswap is done via debugfs in +the /sys/kernel/debug/frontswap directory. The effectiveness of +frontswap can be measured (across all swap devices) with: + +failed_puts - how many put attempts have failed +gets - how many gets were attempted (all should succeed) +succ_puts - how many put attempts have succeeded +invalidates - how many invalidates were attempted + +A backend implementation may provide additional metrics. + +FAQ + +1) Where's the value? + +When a workload starts swapping, performance falls through the floor. +Frontswap significantly increases performance in many such workloads by +providing a clean, dynamic interface to read and write swap pages to +"transcendent memory" that is otherwise not directly addressable to the kernel. +This interface is ideal when data is transformed to a different form +and size (such as with compression) or secretly moved (as might be +useful for write-balancing for some RAM-like devices). Swap pages (and +evicted page-cache pages) are a great use for this kind of slower-than-RAM- +but-much-faster-than-disk "pseudo-RAM device" and the frontswap (and +cleancache) interface to transcendent memory provides a nice way to read +and write -- and indirectly "name" -- the pages. + +Frontswap -- and cleancache -- with a fairly small impact on the kernel, +provides a huge amount of flexibility for more dynamic, flexible RAM +utilization in various system configurations: + +In the single kernel case, aka "zcache", pages are compressed and +stored in local memory, thus increasing the total anonymous pages +that can be safely kept in RAM. Zcache essentially trades off CPU +cycles used in compression/decompression for better memory utilization. 
+Benchmarks have shown little or no impact when memory pressure is +low while providing a significant performance improvement (25%+) +on some workloads under high memory pressure. + +"RAMster" builds on zcache by adding "peer-to-peer" transcendent memory +support for clustered systems. Frontswap pages are locally compressed +as in zcache, but then "remotified" to another system's RAM. This +allows RAM to be dynamically load-balanced back-and-forth as needed, +i.e. when system A is overcommitted, it can swap to system B, and +vice versa. RAMster can also be configured as a memory server so +many servers in a cluster can swap, dynamically as needed, to a single +server configured with a large amount of RAM... without pre-configuring +how much of the RAM is available for each of the clients! + +In the virtual case, the whole point of virtualization is to statistically +multiplex physical resources across the varying demands of multiple +virtual machines. This is really hard to do with RAM and efforts to do +it well with no kernel changes have essentially failed (except in some +well-publicized special-case workloads). +Specifically, the Xen Transcendent Memory backend allows otherwise +"fallow" hypervisor-owned RAM to not only be "time-shared" between multiple +virtual machines, but the pages can be compressed and deduplicated to +optimize RAM utilization. And when guest OS's are induced to surrender +underutilized RAM (e.g. with "selfballooning"), sudden unexpected +memory pressure may result in swapping; frontswap allows those pages +to be swapped to and from hypervisor RAM (if overall host system memory +conditions allow), thus mitigating the potentially awful performance impact +of unplanned swapping. + +A KVM implementation is underway and has been RFC'ed to lkml. And, +using frontswap, investigation is also underway on the use of NVM as +a memory extension technology. + +2) Sure there may be performance advantages in some situations, but + what's the space/time overhead of frontswap? + +If CONFIG_FRONTSWAP is disabled, every frontswap hook compiles into +nothingness and the only overhead is a few extra bytes per swapon'ed +swap device. If CONFIG_FRONTSWAP is enabled but no frontswap "backend" +registers, there is one extra global variable compared to zero for +every swap page read or written. If CONFIG_FRONTSWAP is enabled +AND a frontswap backend registers AND the backend fails every "put" +request (i.e. provides no memory despite claiming it might), +CPU overhead is still negligible -- and since every frontswap fail +precedes a swap page write-to-disk, the system is highly likely +to be I/O bound and using a small fraction of a percent of a CPU +will be irrelevant anyway. + +As for space, if CONFIG_FRONTSWAP is enabled AND a frontswap backend +registers, one bit is allocated for every swap page for every swap +device that is swapon'd. This is added to the EIGHT bits (which +was sixteen until about 2.6.34) that the kernel already allocates +for every swap page for every swap device that is swapon'd. (Hugh +Dickins has observed that frontswap could probably steal one of +the existing eight bits, but let's worry about that minor optimization +later.) For very large swap disks (which are rare) on a standard +4K pagesize, this is 1MB per 32GB swap. + +When swap pages are stored in transcendent memory instead of written +out to disk, there is a side effect that this may create more memory +pressure that can potentially outweigh the other advantages. 
A +backend, such as zcache, must implement policies to carefully (but +dynamically) manage memory limits to ensure this doesn't happen. + +3) OK, how about a quick overview of what this frontswap patch does + in terms that a kernel hacker can grok? + +Let's assume that a frontswap "backend" has registered during +kernel initialization; this registration indicates that this +frontswap backend has access to some "memory" that is not directly +accessible by the kernel. Exactly how much memory it provides is +entirely dynamic and random. + +Whenever a swap-device is swapon'd frontswap_init() is called, +passing the swap device number (aka "type") as a parameter. +This notifies frontswap to expect attempts to "put" swap pages +associated with that number. + +Whenever the swap subsystem is readying a page to write to a swap +device (cf. swap_writepage()), frontswap_put_page is called. Frontswap +consults with the frontswap backend and if the backend says it does NOT +have room, frontswap_put_page returns -1 and the kernel swaps the page +to the swap device as normal. Note that the response from the frontswap +backend is unpredictable to the kernel; it may choose to never accept a +page, it could accept every ninth page, or it might accept every +page. But if the backend does accept a page, the data from the page +has already been copied and associated with the type and offset, +and the backend guarantees the persistence of the data. In this case, +frontswap sets a bit in the "frontswap_map" for the swap device +corresponding to the page offset on the swap device to which it would +otherwise have written the data. + +When the swap subsystem needs to swap-in a page (swap_readpage()), +it first calls frontswap_get_page() which checks the frontswap_map to +see if the page was earlier accepted by the frontswap backend. If +it was, the page of data is filled from the frontswap backend and +the swap-in is complete. If not, the normal swap-in code is +executed to obtain the page of data from the real swap device. + +So every time the frontswap backend accepts a page, a swap device read +and (potentially) a swap device write are replaced by a "frontswap backend +put" and (possibly) a "frontswap backend get", which are presumably much +faster. + +4) Can't frontswap be configured as a "special" swap device that is + just higher priority than any real swap device (e.g. like zswap, + or maybe swap-over-nbd/NFS)? + +No. First, the existing swap subsystem doesn't allow for any kind of +swap hierarchy. Perhaps it could be rewritten to accommodate a hierarchy, +but this would require fairly drastic changes. Even if it were +rewritten, the existing swap subsystem uses the block I/O layer which +assumes a swap device is fixed size and any page in it is linearly +addressable. Frontswap barely touches the existing swap subsystem, +and works around the constraints of the block I/O subsystem to provide +a great deal of flexibility and dynamicity. + +For example, the acceptance of any swap page by the frontswap backend is +entirely unpredictable. This is critical to the definition of frontswap +backends because it grants completely dynamic discretion to the +backend. In zcache, one cannot know a priori how compressible a page is. +"Poorly" compressible pages can be rejected, and "poorly" can itself be +defined dynamically depending on current memory constraints. + +Further, frontswap is entirely synchronous whereas a real swap +device is, by definition, asynchronous and uses block I/O. 
The +block I/O layer is not only unnecessary, but may perform "optimizations" +that are inappropriate for a RAM-oriented device including delaying +the write of some pages for a significant amount of time. Synchrony is +required to ensure the dynamicity of the backend and to avoid thorny race +conditions that would unnecessarily and greatly complicate frontswap +and/or the block I/O subsystem. That said, only the initial "put" +and "get" operations need be synchronous. A separate asynchronous thread +is free to manipulate the pages stored by frontswap. For example, +the "remotification" thread in RAMster uses standard asynchronous +kernel sockets to move compressed frontswap pages to a remote machine. +Similarly, a KVM guest-side implementation could do in-guest compression +and use "batched" hypercalls. + +In a virtualized environment, the dynamicity allows the hypervisor +(or host OS) to do "intelligent overcommit". For example, it can +choose to accept pages only until host-swapping might be imminent, +then force guests to do their own swapping. + +There is a downside to the transcendent memory specifications for +frontswap: Since any "put" might fail, there must always be a real +slot on a real swap device to swap the page. Thus frontswap must be +implemented as a "shadow" to every swapon'd device with the potential +capability of holding every page that the swap device might have held +and the possibility that it might hold no pages at all. This means +that frontswap cannot contain more pages than the total of swapon'd +swap devices. For example, if NO swap device is configured on some +installation, frontswap is useless. Swapless portable devices +can still use frontswap but a backend for such devices must configure +some kind of "ghost" swap device and ensure that it is never used. + +5) Why this weird definition about "duplicate puts"? If a page + has been previously successfully put, can't it always be + successfully overwritten? + +Nearly always it can, but no, sometimes it cannot. Consider an example +where data is compressed and the original 4K page has been compressed +to 1K. Now an attempt is made to overwrite the page with data that +is non-compressible and so would take the entire 4K. But the backend +has no more space. In this case, the put must be rejected. Whenever +frontswap rejects a put that would overwrite, it also must invalidate +the old data and ensure that it is no longer accessible. Since the +swap subsystem then writes the new data to the real swap device, +this is the correct course of action to ensure coherency. + +6) What is frontswap_shrink for? + +When the (non-frontswap) swap subsystem swaps out a page to a real +swap device, that page is only taking up low-value pre-allocated disk +space. But if frontswap has placed a page in transcendent memory, that +page may be taking up valuable real estate. The frontswap_shrink +routine allows code outside of the swap subsystem to force pages out +of the memory managed by frontswap and back into kernel-addressable memory. +For example, in RAMster, a "suction driver" thread will attempt +to "repatriate" pages sent to a remote machine back to the local machine; +this is driven using the frontswap_shrink mechanism when memory pressure +subsides. + +7) Why does the frontswap patch create the new include file swapfile.h? + +The frontswap code depends on some swap-subsystem-internal data +structures that have, over the years, moved back and forth between +static and global. 
This seemed a reasonable compromise: Define +them as global but declare them in a new include file that isn't +included by the large number of source files that include swap.h. + +Dan Magenheimer, last updated April 9, 2012 diff --git a/mm/Kconfig b/mm/Kconfig index e338407f1225..2613c910935a 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -379,3 +379,20 @@ config CLEANCACHE in a negligible performance hit. If unsure, say Y to enable cleancache + +config FRONTSWAP + bool "Enable frontswap to cache swap pages if tmem is present" + depends on SWAP + default n + help + Frontswap is so named because it can be thought of as the opposite + of a "backing" store for a swap device. The data is stored into + "transcendent memory", memory that is not directly accessible or + addressable by the kernel and is of unknown and possibly + time-varying size. When space in transcendent memory is available, + a significant swap I/O reduction may be achieved. When none is + available, all frontswap calls are reduced to a single pointer- + compare-against-NULL resulting in a negligible performance hit + and swap data is stored as normal on the matching swap device. + + If unsure, say Y to enable frontswap. diff --git a/mm/Makefile b/mm/Makefile index 50ec00ef2a0e..306742a28266 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -26,6 +26,7 @@ obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o obj-$(CONFIG_BOUNCE) += bounce.o obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o +obj-$(CONFIG_FRONTSWAP) += frontswap.o obj-$(CONFIG_HAS_DMA) += dmapool.o obj-$(CONFIG_HUGETLBFS) += hugetlb.o obj-$(CONFIG_NUMA) += mempolicy.o From 839a1f79ed0ce026e6d1106f202eaaae929b4200 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Mon, 16 Apr 2012 17:06:35 -0400 Subject: [PATCH 012/178] MAINTAINER: Add myself for the frontswap API Signed-off-by: Konrad Rzeszutek Wilk --- MAINTAINERS | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 2dcfca850639..bc8905dc2252 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2876,6 +2876,13 @@ F: Documentation/power/freezing-of-tasks.txt F: include/linux/freezer.h F: kernel/freezer.c +FRONTSWAP API +M: Konrad Rzeszutek Wilk +L: linux-kernel@vger.kernel.org +S: Maintained +F: mm/frontswap.c +F: include/linux/frontswap.h + FS-CACHE: LOCAL CACHING FOR NETWORK FILESYSTEMS M: David Howells L: linux-cachefs@redhat.com From 165c8aed5bbc6bdddbccae0ba9db451732558ff9 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Tue, 15 May 2012 11:32:15 -0400 Subject: [PATCH 013/178] frontswap: s/put_page/store/g s/get_page/load Sounds so much more natural. Suggested-by: Andrea Arcangeli Signed-off-by: Konrad Rzeszutek Wilk --- Documentation/vm/frontswap.txt | 50 ++++++++++++------------ drivers/staging/ramster/zcache-main.c | 8 ++-- drivers/staging/zcache/zcache-main.c | 10 ++--- drivers/xen/tmem.c | 8 ++-- include/linux/frontswap.h | 16 ++++---- mm/frontswap.c | 56 +++++++++++++-------------- mm/page_io.c | 4 +- 7 files changed, 76 insertions(+), 76 deletions(-) diff --git a/Documentation/vm/frontswap.txt b/Documentation/vm/frontswap.txt index a9f731af0fac..37067cf455f4 100644 --- a/Documentation/vm/frontswap.txt +++ b/Documentation/vm/frontswap.txt @@ -21,21 +21,21 @@ frontswap_ops funcs appropriately and the functions it provides must conform to certain policies as follows: An "init" prepares the device to receive frontswap pages associated -with the specified swap device number (aka "type"). A "put_page" will +with the specified swap device number (aka "type"). 
A "store" will copy the page to transcendent memory and associate it with the type and -offset associated with the page. A "get_page" will copy the page, if found, +offset associated with the page. A "load" will copy the page, if found, from transcendent memory into kernel memory, but will NOT remove the page from from transcendent memory. An "invalidate_page" will remove the page from transcendent memory and an "invalidate_area" will remove ALL pages associated with the swap type (e.g., like swapoff) and notify the "device" -to refuse further puts with that swap type. +to refuse further stores with that swap type. -Once a page is successfully put, a matching get on the page will normally +Once a page is successfully stored, a matching load on the page will normally succeed. So when the kernel finds itself in a situation where it needs -to swap out a page, it first attempts to use frontswap. If the put returns +to swap out a page, it first attempts to use frontswap. If the store returns success, the data has been successfully saved to transcendent memory and a disk write and, if the data is later read back, a disk read are avoided. -If a put returns failure, transcendent memory has rejected the data, and the +If a store returns failure, transcendent memory has rejected the data, and the page can be written to swap as usual. If a backend chooses, frontswap can be configured as a "writethrough @@ -44,18 +44,18 @@ in swap device writes is lost (and also a non-trivial performance advantage) in order to allow the backend to arbitrarily "reclaim" space used to store frontswap pages to more completely manage its memory usage. -Note that if a page is put and the page already exists in transcendent memory -(a "duplicate" put), either the put succeeds and the data is overwritten, -or the put fails AND the page is invalidated. This ensures stale data may +Note that if a page is stored and the page already exists in transcendent memory +(a "duplicate" store), either the store succeeds and the data is overwritten, +or the store fails AND the page is invalidated. This ensures stale data may never be obtained from frontswap. If properly configured, monitoring of frontswap is done via debugfs in the /sys/kernel/debug/frontswap directory. The effectiveness of frontswap can be measured (across all swap devices) with: -failed_puts - how many put attempts have failed -gets - how many gets were attempted (all should succeed) -succ_puts - how many put attempts have succeeded +failed_stores - how many store attempts have failed +loads - how many loads were attempted (all should succeed) +succ_stores - how many store attempts have succeeded invalidates - how many invalidates were attempted A backend implementation may provide additional metrics. @@ -125,7 +125,7 @@ nothingness and the only overhead is a few extra bytes per swapon'ed swap device. If CONFIG_FRONTSWAP is enabled but no frontswap "backend" registers, there is one extra global variable compared to zero for every swap page read or written. If CONFIG_FRONTSWAP is enabled -AND a frontswap backend registers AND the backend fails every "put" +AND a frontswap backend registers AND the backend fails every "store" request (i.e. provides no memory despite claiming it might), CPU overhead is still negligible -- and since every frontswap fail precedes a swap page write-to-disk, the system is highly likely @@ -159,13 +159,13 @@ entirely dynamic and random. 
 Whenever a swap-device is swapon'd frontswap_init() is called,
 passing the swap device number (aka "type") as a parameter.
-This notifies frontswap to expect attempts to "put" swap pages
+This notifies frontswap to expect attempts to "store" swap pages
 associated with that number.
 
 Whenever the swap subsystem is readying a page to write to a swap
-device (cf. swap_writepage()), frontswap_put_page is called. Frontswap
+device (cf. swap_writepage()), frontswap_store is called. Frontswap
 consults with the frontswap backend and if the backend says it does NOT
-have room, frontswap_put_page returns -1 and the kernel swaps the page
+have room, frontswap_store returns -1 and the kernel swaps the page
 to the swap device as normal. Note that the response from the frontswap
 backend is unpredictable to the kernel; it may choose to never accept a
 page, it could accept every ninth page, or it might accept every
@@ -177,7 +177,7 @@ corresponding to the page offset on the swap device to which it would
 otherwise have written the data.
 
 When the swap subsystem needs to swap-in a page (swap_readpage()),
-it first calls frontswap_get_page() which checks the frontswap_map to
+it first calls frontswap_load() which checks the frontswap_map to
 see if the page was earlier accepted by the frontswap backend. If
 it was, the page of data is filled from the frontswap backend and
 the swap-in is complete. If not, the normal swap-in code is
@@ -185,7 +185,7 @@ executed to obtain the page of data from the real swap device.
 
 So every time the frontswap backend accepts a page, a swap device read
 and (potentially) a swap device write are replaced by a "frontswap backend
-put" and (possibly) a "frontswap backend get", which are presumably much
+store" and (possibly) a "frontswap backend load", which are presumably much
 faster.
 
 4) Can't frontswap be configured as a "special" swap device that is
    just higher priority than any real swap device (e.g. like zswap,
    or maybe swap-over-nbd/NFS)?
@@ -215,8 +215,8 @@ that are inappropriate for a RAM-oriented device including delaying
 the write of some pages for a significant amount of time. Synchrony is
 required to ensure the dynamicity of the backend and to avoid thorny race
 conditions that would unnecessarily and greatly complicate frontswap
-and/or the block I/O subsystem. That said, only the initial "put"
-and "get" operations need be synchronous. A separate asynchronous thread
+and/or the block I/O subsystem. That said, only the initial "store"
+and "load" operations need be synchronous. A separate asynchronous thread
 is free to manipulate the pages stored by frontswap. For example,
 the "remotification" thread in RAMster uses standard asynchronous
 kernel sockets to move compressed frontswap pages to a remote machine.
@@ -229,7 +229,7 @@ choose to accept pages only until host-swapping might be imminent,
 then force guests to do their own swapping.
 
 There is a downside to the transcendent memory specifications for
-frontswap: Since any "put" might fail, there must always be a real
+frontswap: Since any "store" might fail, there must always be a real
 slot on a real swap device to swap the page. Thus frontswap must be
 implemented as a "shadow" to every swapon'd device with the potential
 capability of holding every page that the swap device might have held
@@ -240,16 +240,16 @@ installation, frontswap is useless. Swapless portable devices
 can still use frontswap but a backend for such devices must configure
 some kind of "ghost" swap device and ensure that it is never used.
 
-5) Why this weird definition about "duplicate puts"?
If a page
-   has been previously successfully put, can't it always be
+5) Why this weird definition about "duplicate stores"? If a page
+   has been previously successfully stored, can't it always be
    successfully overwritten?
 
 Nearly always it can, but no, sometimes it cannot. Consider an example
 where data is compressed and the original 4K page has been compressed
 to 1K. Now an attempt is made to overwrite the page with data that
 is non-compressible and so would take the entire 4K. But the backend
-has no more space. In this case, the put must be rejected. Whenever
-frontswap rejects a put that would overwrite, it also must invalidate
+has no more space. In this case, the store must be rejected. Whenever
+frontswap rejects a store that would overwrite, it also must invalidate
 the old data and ensure that it is no longer accessible. Since the
 swap subsystem then writes the new data to the real swap device,
 this is the correct course of action to ensure coherency.
diff --git a/drivers/staging/ramster/zcache-main.c b/drivers/staging/ramster/zcache-main.c
index 68b2e053a0e6..2627b3de0d21 100644
--- a/drivers/staging/ramster/zcache-main.c
+++ b/drivers/staging/ramster/zcache-main.c
@@ -3002,7 +3002,7 @@ static inline struct tmem_oid oswiz(unsigned type, u32 ind)
 	return oid;
 }
 
-static int zcache_frontswap_put_page(unsigned type, pgoff_t offset,
+static int zcache_frontswap_store(unsigned type, pgoff_t offset,
 				struct page *page)
 {
 	u64 ind64 = (u64)offset;
@@ -3025,7 +3025,7 @@ static int zcache_frontswap_put_page(unsigned type, pgoff_t offset,
 
 /* returns 0 if the page was successfully gotten from frontswap, -1 if
  * was not present (should never happen!)
*/ -static int zcache_frontswap_get_page(unsigned type, pgoff_t offset, +static int zcache_frontswap_load(unsigned type, pgoff_t offset, struct page *page) { u64 ind64 = (u64)offset; @@ -1919,8 +1919,8 @@ static void zcache_frontswap_init(unsigned ignored) } static struct frontswap_ops zcache_frontswap_ops = { - .put_page = zcache_frontswap_put_page, - .get_page = zcache_frontswap_get_page, + .store = zcache_frontswap_store, + .load = zcache_frontswap_load, .invalidate_page = zcache_frontswap_flush_page, .invalidate_area = zcache_frontswap_flush_area, .init = zcache_frontswap_init diff --git a/drivers/xen/tmem.c b/drivers/xen/tmem.c index dcb79521e6c8..89f264c67420 100644 --- a/drivers/xen/tmem.c +++ b/drivers/xen/tmem.c @@ -269,7 +269,7 @@ static inline struct tmem_oid oswiz(unsigned type, u32 ind) } /* returns 0 if the page was successfully put into frontswap, -1 if not */ -static int tmem_frontswap_put_page(unsigned type, pgoff_t offset, +static int tmem_frontswap_store(unsigned type, pgoff_t offset, struct page *page) { u64 ind64 = (u64)offset; @@ -295,7 +295,7 @@ static int tmem_frontswap_put_page(unsigned type, pgoff_t offset, * returns 0 if the page was successfully gotten from frontswap, -1 if * was not present (should never happen!) */ -static int tmem_frontswap_get_page(unsigned type, pgoff_t offset, +static int tmem_frontswap_load(unsigned type, pgoff_t offset, struct page *page) { u64 ind64 = (u64)offset; @@ -362,8 +362,8 @@ static int __init no_frontswap(char *s) __setup("nofrontswap", no_frontswap); static struct frontswap_ops __initdata tmem_frontswap_ops = { - .put_page = tmem_frontswap_put_page, - .get_page = tmem_frontswap_get_page, + .store = tmem_frontswap_store, + .load = tmem_frontswap_load, .invalidate_page = tmem_frontswap_flush_page, .invalidate_area = tmem_frontswap_flush_area, .init = tmem_frontswap_init diff --git a/include/linux/frontswap.h b/include/linux/frontswap.h index 68ff7af5c5fb..0e4e2eec5c1d 100644 --- a/include/linux/frontswap.h +++ b/include/linux/frontswap.h @@ -7,8 +7,8 @@ struct frontswap_ops { void (*init)(unsigned); - int (*put_page)(unsigned, pgoff_t, struct page *); - int (*get_page)(unsigned, pgoff_t, struct page *); + int (*store)(unsigned, pgoff_t, struct page *); + int (*load)(unsigned, pgoff_t, struct page *); void (*invalidate_page)(unsigned, pgoff_t); void (*invalidate_area)(unsigned); }; @@ -21,8 +21,8 @@ extern unsigned long frontswap_curr_pages(void); extern void frontswap_writethrough(bool); extern void __frontswap_init(unsigned type); -extern int __frontswap_put_page(struct page *page); -extern int __frontswap_get_page(struct page *page); +extern int __frontswap_store(struct page *page); +extern int __frontswap_load(struct page *page); extern void __frontswap_invalidate_page(unsigned, pgoff_t); extern void __frontswap_invalidate_area(unsigned); @@ -88,21 +88,21 @@ static inline unsigned long *frontswap_map_get(struct swap_info_struct *p) } #endif -static inline int frontswap_put_page(struct page *page) +static inline int frontswap_store(struct page *page) { int ret = -1; if (frontswap_enabled) - ret = __frontswap_put_page(page); + ret = __frontswap_store(page); return ret; } -static inline int frontswap_get_page(struct page *page) +static inline int frontswap_load(struct page *page) { int ret = -1; if (frontswap_enabled) - ret = __frontswap_get_page(page); + ret = __frontswap_load(page); return ret; } diff --git a/mm/frontswap.c b/mm/frontswap.c index 8c0a5f8683f0..e25025574a02 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c @@ 
-39,7 +39,7 @@ bool frontswap_enabled __read_mostly;
 EXPORT_SYMBOL(frontswap_enabled);
 
 /*
- * If enabled, frontswap_put will return failure even on success. As
+ * If enabled, frontswap_store will return failure even on success. As
  * a result, the swap subsystem will always write the page to swap, in
  * effect converting frontswap into a writethrough cache. In this mode,
  * there is no direct reduction in swap writes, but a frontswap backend
@@ -54,27 +54,27 @@ static bool frontswap_writethrough_enabled __read_mostly;
  * properly configured). These are for information only so are not protected
  * against increment races.
  */
-static u64 frontswap_gets;
-static u64 frontswap_succ_puts;
-static u64 frontswap_failed_puts;
+static u64 frontswap_loads;
+static u64 frontswap_succ_stores;
+static u64 frontswap_failed_stores;
 static u64 frontswap_invalidates;
 
-static inline void inc_frontswap_gets(void) {
-	frontswap_gets++;
+static inline void inc_frontswap_loads(void) {
+	frontswap_loads++;
 }
-static inline void inc_frontswap_succ_puts(void) {
-	frontswap_succ_puts++;
+static inline void inc_frontswap_succ_stores(void) {
+	frontswap_succ_stores++;
 }
-static inline void inc_frontswap_failed_puts(void) {
-	frontswap_failed_puts++;
+static inline void inc_frontswap_failed_stores(void) {
+	frontswap_failed_stores++;
 }
 static inline void inc_frontswap_invalidates(void) {
 	frontswap_invalidates++;
 }
 #else
-static inline void inc_frontswap_gets(void) { }
-static inline void inc_frontswap_succ_puts(void) { }
-static inline void inc_frontswap_failed_puts(void) { }
+static inline void inc_frontswap_loads(void) { }
+static inline void inc_frontswap_succ_stores(void) { }
+static inline void inc_frontswap_failed_stores(void) { }
 static inline void inc_frontswap_invalidates(void) { }
 #endif
 /*
@@ -116,13 +116,13 @@ void __frontswap_init(unsigned type)
 EXPORT_SYMBOL(__frontswap_init);
 
 /*
- * "Put" data from a page to frontswap and associate it with the page's
+ * "Store" data from a page to frontswap and associate it with the page's
  * swaptype and offset. Page must be locked and in the swap cache.
  * If frontswap already contains a page with matching swaptype and
  * offset, the frontswap implementation may either overwrite the data and
  * return success or invalidate the page from frontswap and return failure.
  */
-int __frontswap_put_page(struct page *page)
+int __frontswap_store(struct page *page)
 {
 	int ret = -1, dup = 0;
 	swp_entry_t entry = { .val = page_private(page), };
@@ -134,10 +134,10 @@ int __frontswap_put_page(struct page *page)
 	BUG_ON(sis == NULL);
 	if (frontswap_test(sis, offset))
 		dup = 1;
-	ret = (*frontswap_ops.put_page)(type, offset, page);
+	ret = (*frontswap_ops.store)(type, offset, page);
 	if (ret == 0) {
 		frontswap_set(sis, offset);
-		inc_frontswap_succ_puts();
+		inc_frontswap_succ_stores();
 		if (!dup)
 			atomic_inc(&sis->frontswap_pages);
 	} else if (dup) {
@@ -147,22 +147,22 @@ int __frontswap_put_page(struct page *page)
 		 */
 		frontswap_clear(sis, offset);
 		atomic_dec(&sis->frontswap_pages);
-		inc_frontswap_failed_puts();
+		inc_frontswap_failed_stores();
 	} else
-		inc_frontswap_failed_puts();
+		inc_frontswap_failed_stores();
 	if (frontswap_writethrough_enabled)
 		/* report failure so swap also writes to swap device */
 		ret = -1;
 	return ret;
 }
-EXPORT_SYMBOL(__frontswap_put_page);
+EXPORT_SYMBOL(__frontswap_store);
 
 /*
  * "Get" data from frontswap associated with swaptype and offset that were
  * specified when the data was put to frontswap and use it to fill the
  * specified page with data.
Page must be locked and in the swap cache. */ -int __frontswap_get_page(struct page *page) +int __frontswap_load(struct page *page) { int ret = -1; swp_entry_t entry = { .val = page_private(page), }; @@ -173,12 +173,12 @@ int __frontswap_get_page(struct page *page) BUG_ON(!PageLocked(page)); BUG_ON(sis == NULL); if (frontswap_test(sis, offset)) - ret = (*frontswap_ops.get_page)(type, offset, page); + ret = (*frontswap_ops.load)(type, offset, page); if (ret == 0) - inc_frontswap_gets(); + inc_frontswap_loads(); return ret; } -EXPORT_SYMBOL(__frontswap_get_page); +EXPORT_SYMBOL(__frontswap_load); /* * Invalidate any data from frontswap associated with the specified swaptype @@ -301,10 +301,10 @@ static int __init init_frontswap(void) struct dentry *root = debugfs_create_dir("frontswap", NULL); if (root == NULL) return -ENXIO; - debugfs_create_u64("gets", S_IRUGO, root, &frontswap_gets); - debugfs_create_u64("succ_puts", S_IRUGO, root, &frontswap_succ_puts); - debugfs_create_u64("failed_puts", S_IRUGO, root, - &frontswap_failed_puts); + debugfs_create_u64("loads", S_IRUGO, root, &frontswap_loads); + debugfs_create_u64("succ_stores", S_IRUGO, root, &frontswap_succ_stores); + debugfs_create_u64("failed_stores", S_IRUGO, root, + &frontswap_failed_stores); debugfs_create_u64("invalidates", S_IRUGO, root, &frontswap_invalidates); #endif diff --git a/mm/page_io.c b/mm/page_io.c index 651a91259317..34f02923744c 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -99,7 +99,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) unlock_page(page); goto out; } - if (frontswap_put_page(page) == 0) { + if (frontswap_store(page) == 0) { set_page_writeback(page); unlock_page(page); end_page_writeback(page); @@ -129,7 +129,7 @@ int swap_readpage(struct page *page) VM_BUG_ON(!PageLocked(page)); VM_BUG_ON(PageUptodate(page)); - if (frontswap_get_page(page) == 0) { + if (frontswap_load(page) == 0) { SetPageUptodate(page); unlock_page(page); goto out; From 23812b9d9e497580d38c62ebdc6f308733b0a32a Mon Sep 17 00:00:00 2001 From: Ning Jiang Date: Tue, 22 May 2012 00:19:20 +0800 Subject: [PATCH 014/178] genirq: Add IRQS_PENDING for nested and simple irq Every interrupt which is an active wakeup source needs the ability to abort suspend if there is a pending irq. Right now only edge and level irqs can do that. | +---------+ | INTC | +---------+ | GPIO_IRQ +------------+ | gpio-exp | +------------+ | | GPIO0_IRQ GPIO1_IRQ In the above diagram, gpio expander has irq number GPIO_IRQ, it is connected with two sub GPIO pins, GPIO0 and GPIO1. During suspend, we set IRQF_NO_SUSPEND for GPIO_IRQ so that gpio expander driver can handle the sub irq GPIO0_IRQ and GPIO1_IRQ, and these two irqs themselves can further be handled by simple or nested irq in some drivers(typically gpio and mfd driver). If they are used as wakeup sources during suspend, we want them to be able to abort suspend too. Setting IRQS_PENDING flag in handle_nested_irq() and handle_simple_irq() when the irq is disabled allows check_wakeup_irqs() to identify such irqs as source for aborting suspend. 
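
For reference, the consumer of the flag is check_wakeup_irqs() in
kernel/irq/pm.c; abridged to the part that matters here, it fails
suspend with -EBUSY when a wakeup irq has caught an event:

	int check_wakeup_irqs(void)
	{
		struct irq_desc *desc;
		int irq;

		for_each_irq_desc(irq, desc) {
			/* A wakeup irq with IRQS_PENDING set means an
			 * event arrived while the line was disabled,
			 * so suspend must be aborted.
			 */
			if (irqd_is_wakeup_set(&desc->irq_data) &&
			    (desc->istate & IRQS_PENDING))
				return -EBUSY;
		}
		return 0;
	}
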
Signed-off-by: Ning Jiang Cc: rjw@sisk.pl Link: http://lkml.kernel.org/r/CAH3Oq6T905%2B3fkF43NAMMFvJvq7dsk_so6T2vQ8ZJrA5xiU3YA@mail.gmail.com Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index fc275e4f629b..eebd6d5cfb44 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -275,8 +275,10 @@ void handle_nested_irq(unsigned int irq) kstat_incr_irqs_this_cpu(irq, desc); action = desc->action; - if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) + if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) { + desc->istate |= IRQS_PENDING; goto out_unlock; + } irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); raw_spin_unlock_irq(&desc->lock); @@ -324,8 +326,10 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc) desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); kstat_incr_irqs_this_cpu(irq, desc); - if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) + if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { + desc->istate |= IRQS_PENDING; goto out_unlock; + } handle_irq_event(desc); From 818b0f3bfb236ae66cac3ff38e86b9e47f24b7aa Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Fri, 30 Mar 2012 23:11:34 +0800 Subject: [PATCH 015/178] genirq: Introduce irq_do_set_affinity() to reduce duplicated code All invocations of chip->irq_set_affinity() are doing the same return value checks. Let them all use a common function. [ tglx: removed the silly likely while at it ] Signed-off-by: Jiang Liu Cc: Jiang Liu Cc: Keping Chen Link: http://lkml.kernel.org/r/1333120296-13563-3-git-send-email-jiang.liu@huawei.com Signed-off-by: Thomas Gleixner --- kernel/irq/internals.h | 3 +++ kernel/irq/manage.c | 39 ++++++++++++++++++++++----------------- kernel/irq/migration.c | 13 ++----------- 3 files changed, 27 insertions(+), 28 deletions(-) diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 8e5c56b3b7d9..001fa5bab490 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -101,6 +101,9 @@ extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask); extern void irq_set_thread_affinity(struct irq_desc *desc); +extern int irq_do_set_affinity(struct irq_data *data, + const struct cpumask *dest, bool force); + /* Inline functions for support of irq chips on slow busses */ static inline void chip_bus_lock(struct irq_desc *desc) { diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index bb32326afe87..a1b903380bcf 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -139,6 +139,25 @@ static inline void irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { } #endif +int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, + bool force) +{ + struct irq_desc *desc = irq_data_to_desc(data); + struct irq_chip *chip = irq_data_get_irq_chip(data); + int ret; + + ret = chip->irq_set_affinity(data, mask, false); + switch (ret) { + case IRQ_SET_MASK_OK: + cpumask_copy(data->affinity, mask); + case IRQ_SET_MASK_OK_NOCOPY: + irq_set_thread_affinity(desc); + ret = 0; + } + + return ret; +} + int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask) { struct irq_chip *chip = irq_data_get_irq_chip(data); @@ -149,14 +168,7 @@ int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask) return -EINVAL; if (irq_can_move_pcntxt(data)) { - ret = chip->irq_set_affinity(data, mask, false); - switch (ret) { - case IRQ_SET_MASK_OK: - cpumask_copy(data->affinity, 
mask); - case IRQ_SET_MASK_OK_NOCOPY: - irq_set_thread_affinity(desc); - ret = 0; - } + ret = irq_do_set_affinity(data, mask, false); } else { irqd_set_move_pending(data); irq_copy_pending(desc, mask); @@ -280,9 +292,8 @@ EXPORT_SYMBOL_GPL(irq_set_affinity_notifier); static int setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) { - struct irq_chip *chip = irq_desc_get_chip(desc); struct cpumask *set = irq_default_affinity; - int ret, node = desc->irq_data.node; + int node = desc->irq_data.node; /* Excludes PER_CPU and NO_BALANCE interrupts */ if (!irq_can_set_affinity(irq)) @@ -308,13 +319,7 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) if (cpumask_intersects(mask, nodemask)) cpumask_and(mask, mask, nodemask); } - ret = chip->irq_set_affinity(&desc->irq_data, mask, false); - switch (ret) { - case IRQ_SET_MASK_OK: - cpumask_copy(desc->irq_data.affinity, mask); - case IRQ_SET_MASK_OK_NOCOPY: - irq_set_thread_affinity(desc); - } + irq_do_set_affinity(&desc->irq_data, mask, false); return 0; } #else diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index c3c89751b327..ca3f4aaff707 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -42,17 +42,8 @@ void irq_move_masked_irq(struct irq_data *idata) * For correct operation this depends on the caller * masking the irqs. */ - if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask) - < nr_cpu_ids)) { - int ret = chip->irq_set_affinity(&desc->irq_data, - desc->pending_mask, false); - switch (ret) { - case IRQ_SET_MASK_OK: - cpumask_copy(desc->irq_data.affinity, desc->pending_mask); - case IRQ_SET_MASK_OK_NOCOPY: - irq_set_thread_affinity(desc); - } - } + if (cpumask_any_and(desc->pending_mask, cpu_online_mask) < nr_cpu_ids) + irq_do_set_affinity(&desc->irq_data, desc->pending_mask, false); cpumask_clear(desc->pending_mask); } From ee74d13229fb606353ff56f4927fa93b37e95bbe Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Thu, 24 May 2012 20:40:55 +0530 Subject: [PATCH 016/178] smpboot, idle: Optimize calls to smp_processor_id() in idle_threads_init() While trying to initialize idle threads for all cpus, idle_threads_init() calls smp_processor_id() in a loop, which is unnecessary. The intent is to initialize idle threads for all non-boot cpus. So just use a variable to note the boot cpu and use it in the loop. Signed-off-by: Srivatsa S. Bhat Cc: suresh.b.siddha@intel.com Cc: venki@google.com Cc: nikunj@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/20120524151055.2549.64309.stgit@srivatsabhat.in.ibm.com Signed-off-by: Thomas Gleixner --- kernel/smpboot.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/smpboot.c b/kernel/smpboot.c index e1a797e028a3..0f2162f808a7 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -52,10 +52,12 @@ static inline void idle_init(unsigned int cpu) */ void __init idle_threads_init(void) { - unsigned int cpu; + unsigned int cpu, boot_cpu; + + boot_cpu = smp_processor_id(); for_each_possible_cpu(cpu) { - if (cpu != smp_processor_id()) + if (cpu != boot_cpu) idle_init(cpu); } } From 4a70d2d9909b43ed88043b98cabe2c7fbd563021 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Thu, 24 May 2012 20:41:00 +0530 Subject: [PATCH 017/178] smpboot, idle: Fix comment mismatch over idle_threads_init() The comment over idle_threads_init() really talks about the functionality of idle_init(). Move that comment to idle_init(), and add a suitable comment over idle_threads_init(). 
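
For reference, the kernel-doc shape these comments follow is (minimal
sketch):

	/**
	 * function_name - Short, one-line description
	 * @arg: Description of the argument
	 *
	 * Optional longer description.
	 */
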
Signed-off-by: Srivatsa S. Bhat Cc: suresh.b.siddha@intel.com Cc: venki@google.com Cc: nikunj@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/20120524151100.2549.66501.stgit@srivatsabhat.in.ibm.com Signed-off-by: Thomas Gleixner --- kernel/smpboot.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 0f2162f808a7..98f60c5caa1b 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -31,6 +31,12 @@ void __init idle_thread_set_boot_cpu(void) per_cpu(idle_threads, smp_processor_id()) = current; } +/** + * idle_init - Initialize the idle thread for a cpu + * @cpu: The cpu for which the idle thread should be initialized + * + * Creates the thread if it does not exist. + */ static inline void idle_init(unsigned int cpu) { struct task_struct *tsk = per_cpu(idle_threads, cpu); @@ -45,10 +51,7 @@ static inline void idle_init(unsigned int cpu) } /** - * idle_thread_init - Initialize the idle thread for a cpu - * @cpu: The cpu for which the idle thread should be initialized - * - * Creates the thread if it does not exist. + * idle_threads_init - Initialize idle threads for all cpus */ void __init idle_threads_init(void) { From 5307c9556bc17e3cd26d4e94fc3b2565921834de Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Tue, 8 May 2012 12:20:58 +0200 Subject: [PATCH 018/178] tick: Add tick skew boot option Let the user decide whether power consumption or jitter is the more important consideration for their machines. Quoting removal commit af5ab277ded04bd9bc6b048c5a2f0e7d70ef0867: "Historically, Linux has tried to make the regular timer tick on the various CPUs not happen at the same time, to avoid contention on xtime_lock. Nowadays, with the tickless kernel, this contention no longer happens since time keeping and updating are done differently. In addition, this skew is actually hurting power consumption in a measurable way on many-core systems." Problems: - Contrary to the above, systems do encounter contention on both xtime_lock and RCU structure locks when the tick is synchronized. - Moderate sized RT systems suffer intolerable jitter due to the tick being synchronized. - SGI reports the same for their large systems. - Fully utilized systems reap no power saving benefit from skew removal, but do suffer from resulting induced lock contention. - 0209f649 rcu: limit rcu_node leaf-level fanout This patch was born to combat lock contention which testing showed to have been _induced by_ skew removal. Skew the tick, contention disappeared virtually completely. Signed-off-by: Mike Galbraith Link: http://lkml.kernel.org/r/1336472458.21924.78.camel@marge.simpson.net Signed-off-by: Thomas Gleixner --- Documentation/kernel-parameters.txt | 9 +++++++++ kernel/time/tick-sched.c | 18 ++++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index b69cfdc12112..ea38cd1f0aba 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2532,6 +2532,15 @@ bytes respectively. Such letter suffixes can also be entirely omitted. sched_debug [KNL] Enables verbose scheduler debug messages. + skew_tick= [KNL] Offset the periodic timer tick per cpu to mitigate + xtime_lock contention on larger systems, and/or RCU lock + contention on all systems with CONFIG_MAXSMP set. + Format: { "0" | "1" } + 0 -- disable. (may be 1 via CONFIG_CMDLINE="skew_tick=1" + 1 -- enable. 
+ Note: increases power consumption, thus should only be + enabled if running jitter sensitive (HPC/RT) workloads. + security= [SECURITY] Choose a security module to enable at boot. If this boot parameter is not specified, only the first security module asking for security registration will be diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 6a3a5b9ff561..4eddbb5ea9c5 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -814,6 +814,8 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) return HRTIMER_RESTART; } +static int sched_skew_tick; + /** * tick_setup_sched_timer - setup the tick emulation timer */ @@ -831,6 +833,14 @@ void tick_setup_sched_timer(void) /* Get the next period (per cpu) */ hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); + /* Offset the tick to avert xtime_lock contention. */ + if (sched_skew_tick) { + u64 offset = ktime_to_ns(tick_period) >> 1; + do_div(offset, num_possible_cpus()); + offset *= smp_processor_id(); + hrtimer_add_expires_ns(&ts->sched_timer, offset); + } + for (;;) { hrtimer_forward(&ts->sched_timer, now, tick_period); hrtimer_start_expires(&ts->sched_timer, @@ -910,3 +920,11 @@ int tick_check_oneshot_change(int allow_nohz) tick_nohz_switch_to_nohz(); return 0; } + +static int __init skew_tick(char *str) +{ + get_option(&str, &sched_skew_tick); + + return 0; +} +early_param("skew_tick", skew_tick); From e5400321a6f15ce0fe77c8455954f213ef7dcc54 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Wed, 9 May 2012 23:39:34 +0900 Subject: [PATCH 019/178] clockevents: Make clockevents_config() a global symbol Make clockevents_config() into a global symbol to allow it to be used by compiled-in clockevent drivers. This is needed by drivers that want to update the timer frequency after registration time. 
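
A driver using this typically registers early with a placeholder
frequency and reconfigures once the real rate is known, along these
lines (sketch; the em_sti driver added later in this series follows
exactly this pattern):

	/* at probe time: register with a dummy 1 Hz value */
	clockevents_config_and_register(ced, 1, 2, 0xffffffff);

	/* later, once the clock is enabled and its rate measured */
	rate = clk_get_rate(p->clk);
	clockevents_config(ced, rate);
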
Signed-off-by: Magnus Damm Tested-by: Simon Horman Cc: arnd@arndb.de Cc: johnstul@us.ibm.com Cc: rjw@sisk.pl Cc: lethal@linux-sh.org Cc: gregkh@linuxfoundation.org Cc: olof@lixom.net Cc: Magnus Damm Link: http://lkml.kernel.org/r/20120509143934.27521.46553.sendpatchset@w520 Signed-off-by: Thomas Gleixner --- include/linux/clockchips.h | 1 + kernel/time/clockevents.c | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index 81e803e90aa4..acba894374a1 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -132,6 +132,7 @@ extern u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt); extern void clockevents_register_device(struct clock_event_device *dev); +extern void clockevents_config(struct clock_event_device *dev, u32 freq); extern void clockevents_config_and_register(struct clock_event_device *dev, u32 freq, unsigned long min_delta, unsigned long max_delta); diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 9cd928f7a7c6..7e1ce012a851 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -297,8 +297,7 @@ void clockevents_register_device(struct clock_event_device *dev) } EXPORT_SYMBOL_GPL(clockevents_register_device); -static void clockevents_config(struct clock_event_device *dev, - u32 freq) +void clockevents_config(struct clock_event_device *dev, u32 freq) { u64 sec; From b9dbf9517784084ee9496f9f17f9754c1d021a9e Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Fri, 25 May 2012 16:03:44 +0900 Subject: [PATCH 020/178] clocksource: em_sti: Emma Mobile STI driver The STI hardware is based on a single 48-bit 32kHz counter that together with two individual compare registers can generate interrupts. There are no timer operating modes selectable which means that the timer can not clear on match. This driver is providing clocksource support for the 48-bit counter. Clockevents are also supported using the same timer in oneshot mode. Signed-off-by: Magnus Damm Cc: horms@verge.net.au Cc: arnd@arndb.de Cc: johnstul@us.ibm.com Cc: rjw@sisk.pl Cc: lethal@linux-sh.org Cc: gregkh@linuxfoundation.org Cc: olof@lixom.net Link: http://lkml.kernel.org/r/20120525070344.23443.69756.sendpatchset@w520 Signed-off-by: Thomas Gleixner --- arch/arm/mach-shmobile/Kconfig | 6 + drivers/clocksource/Makefile | 1 + drivers/clocksource/em_sti.c | 399 +++++++++++++++++++++++++++++++++ 3 files changed, 406 insertions(+) create mode 100644 drivers/clocksource/em_sti.c diff --git a/arch/arm/mach-shmobile/Kconfig b/arch/arm/mach-shmobile/Kconfig index f31383c32f9c..df33909205e2 100644 --- a/arch/arm/mach-shmobile/Kconfig +++ b/arch/arm/mach-shmobile/Kconfig @@ -186,6 +186,12 @@ config SH_TIMER_TMU help This enables build of the TMU timer driver. +config EM_TIMER_STI + bool "STI timer driver" + default y + help + This enables build of the STI timer driver. 
+
 endmenu
 
 config SH_CLK_CPG
diff --git a/drivers/clocksource/Makefile b/drivers/clocksource/Makefile
index 8d81a1d32653..dd3e661a124d 100644
--- a/drivers/clocksource/Makefile
+++ b/drivers/clocksource/Makefile
@@ -6,6 +6,7 @@ obj-$(CONFIG_CS5535_CLOCK_EVENT_SRC)	+= cs5535-clockevt.o
 obj-$(CONFIG_SH_TIMER_CMT)	+= sh_cmt.o
 obj-$(CONFIG_SH_TIMER_MTU2)	+= sh_mtu2.o
 obj-$(CONFIG_SH_TIMER_TMU)	+= sh_tmu.o
+obj-$(CONFIG_EM_TIMER_STI)	+= em_sti.o
 obj-$(CONFIG_CLKBLD_I8253)	+= i8253.o
 obj-$(CONFIG_CLKSRC_MMIO)	+= mmio.o
 obj-$(CONFIG_DW_APB_TIMER)	+= dw_apb_timer.o
diff --git a/drivers/clocksource/em_sti.c b/drivers/clocksource/em_sti.c
new file mode 100644
index 000000000000..719584a5952b
--- /dev/null
+++ b/drivers/clocksource/em_sti.c
@@ -0,0 +1,399 @@
+/*
+ * Emma Mobile Timer Support - STI
+ *
+ * Copyright (C) 2012 Magnus Damm
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/init.h>
+#include <linux/platform_device.h>
+#include <linux/spinlock.h>
+#include <linux/interrupt.h>
+#include <linux/ioport.h>
+#include <linux/io.h>
+#include <linux/clk.h>
+#include <linux/irq.h>
+#include <linux/err.h>
+#include <linux/delay.h>
+#include <linux/clocksource.h>
+#include <linux/clockevents.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+
+enum { USER_CLOCKSOURCE, USER_CLOCKEVENT, USER_NR };
+
+struct em_sti_priv {
+	void __iomem *base;
+	struct clk *clk;
+	struct platform_device *pdev;
+	unsigned int active[USER_NR];
+	unsigned long rate;
+	raw_spinlock_t lock;
+	struct clock_event_device ced;
+	struct clocksource cs;
+};
+
+#define STI_CONTROL 0x00
+#define STI_COMPA_H 0x10
+#define STI_COMPA_L 0x14
+#define STI_COMPB_H 0x18
+#define STI_COMPB_L 0x1c
+#define STI_COUNT_H 0x20
+#define STI_COUNT_L 0x24
+#define STI_COUNT_RAW_H 0x28
+#define STI_COUNT_RAW_L 0x2c
+#define STI_SET_H 0x30
+#define STI_SET_L 0x34
+#define STI_INTSTATUS 0x40
+#define STI_INTRAWSTATUS 0x44
+#define STI_INTENSET 0x48
+#define STI_INTENCLR 0x4c
+#define STI_INTFFCLR 0x50
+
+static inline unsigned long em_sti_read(struct em_sti_priv *p, int offs)
+{
+	return ioread32(p->base + offs);
+}
+
+static inline void em_sti_write(struct em_sti_priv *p, int offs,
+				unsigned long value)
+{
+	iowrite32(value, p->base + offs);
+}
+
+static int em_sti_enable(struct em_sti_priv *p)
+{
+	int ret;
+
+	/* enable clock */
+	ret = clk_enable(p->clk);
+	if (ret) {
+		dev_err(&p->pdev->dev, "cannot enable clock\n");
+		return ret;
+	}
+
+	/* configure channel, periodic mode and maximum timeout */
+	p->rate = clk_get_rate(p->clk);
+
+	/* reset the counter */
+	em_sti_write(p, STI_SET_H, 0x40000000);
+	em_sti_write(p, STI_SET_L, 0x00000000);
+
+	/* mask and clear pending interrupts */
+	em_sti_write(p, STI_INTENCLR, 3);
+	em_sti_write(p, STI_INTFFCLR, 3);
+
+	/* enable updates of counter registers */
+	em_sti_write(p, STI_CONTROL, 1);
+
+	return 0;
+}
+
+static void em_sti_disable(struct em_sti_priv *p)
+{
+	/* mask interrupts */
+	em_sti_write(p, STI_INTENCLR, 3);
+
+	/* stop clock */
+	clk_disable(p->clk);
+}
+
+static cycle_t em_sti_count(struct em_sti_priv *p)
+{
+	cycle_t ticks;
+	unsigned long flags;
+
+	/* the STI hardware buffers
the 48-bit count, but to + * break it out into two 32-bit access the registers + * must be accessed in a certain order. + * Always read STI_COUNT_H before STI_COUNT_L. + */ + raw_spin_lock_irqsave(&p->lock, flags); + ticks = (cycle_t)(em_sti_read(p, STI_COUNT_H) & 0xffff) << 32; + ticks |= em_sti_read(p, STI_COUNT_L); + raw_spin_unlock_irqrestore(&p->lock, flags); + + return ticks; +} + +static cycle_t em_sti_set_next(struct em_sti_priv *p, cycle_t next) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&p->lock, flags); + + /* mask compare A interrupt */ + em_sti_write(p, STI_INTENCLR, 1); + + /* update compare A value */ + em_sti_write(p, STI_COMPA_H, next >> 32); + em_sti_write(p, STI_COMPA_L, next & 0xffffffff); + + /* clear compare A interrupt source */ + em_sti_write(p, STI_INTFFCLR, 1); + + /* unmask compare A interrupt */ + em_sti_write(p, STI_INTENSET, 1); + + raw_spin_unlock_irqrestore(&p->lock, flags); + + return next; +} + +static irqreturn_t em_sti_interrupt(int irq, void *dev_id) +{ + struct em_sti_priv *p = dev_id; + + p->ced.event_handler(&p->ced); + return IRQ_HANDLED; +} + +static int em_sti_start(struct em_sti_priv *p, unsigned int user) +{ + unsigned long flags; + int used_before; + int ret = 0; + + raw_spin_lock_irqsave(&p->lock, flags); + used_before = p->active[USER_CLOCKSOURCE] | p->active[USER_CLOCKEVENT]; + if (!used_before) + ret = em_sti_enable(p); + + if (!ret) + p->active[user] = 1; + raw_spin_unlock_irqrestore(&p->lock, flags); + + return ret; +} + +static void em_sti_stop(struct em_sti_priv *p, unsigned int user) +{ + unsigned long flags; + int used_before, used_after; + + raw_spin_lock_irqsave(&p->lock, flags); + used_before = p->active[USER_CLOCKSOURCE] | p->active[USER_CLOCKEVENT]; + p->active[user] = 0; + used_after = p->active[USER_CLOCKSOURCE] | p->active[USER_CLOCKEVENT]; + + if (used_before && !used_after) + em_sti_disable(p); + raw_spin_unlock_irqrestore(&p->lock, flags); +} + +static struct em_sti_priv *cs_to_em_sti(struct clocksource *cs) +{ + return container_of(cs, struct em_sti_priv, cs); +} + +static cycle_t em_sti_clocksource_read(struct clocksource *cs) +{ + return em_sti_count(cs_to_em_sti(cs)); +} + +static int em_sti_clocksource_enable(struct clocksource *cs) +{ + int ret; + struct em_sti_priv *p = cs_to_em_sti(cs); + + ret = em_sti_start(p, USER_CLOCKSOURCE); + if (!ret) + __clocksource_updatefreq_hz(cs, p->rate); + return ret; +} + +static void em_sti_clocksource_disable(struct clocksource *cs) +{ + em_sti_stop(cs_to_em_sti(cs), USER_CLOCKSOURCE); +} + +static void em_sti_clocksource_resume(struct clocksource *cs) +{ + em_sti_clocksource_enable(cs); +} + +static int em_sti_register_clocksource(struct em_sti_priv *p) +{ + struct clocksource *cs = &p->cs; + + memset(cs, 0, sizeof(*cs)); + cs->name = dev_name(&p->pdev->dev); + cs->rating = 200; + cs->read = em_sti_clocksource_read; + cs->enable = em_sti_clocksource_enable; + cs->disable = em_sti_clocksource_disable; + cs->suspend = em_sti_clocksource_disable; + cs->resume = em_sti_clocksource_resume; + cs->mask = CLOCKSOURCE_MASK(48); + cs->flags = CLOCK_SOURCE_IS_CONTINUOUS; + + dev_info(&p->pdev->dev, "used as clock source\n"); + + /* Register with dummy 1 Hz value, gets updated in ->enable() */ + clocksource_register_hz(cs, 1); + return 0; +} + +static struct em_sti_priv *ced_to_em_sti(struct clock_event_device *ced) +{ + return container_of(ced, struct em_sti_priv, ced); +} + +static void em_sti_clock_event_mode(enum clock_event_mode mode, + struct clock_event_device *ced) +{ + 
struct em_sti_priv *p = ced_to_em_sti(ced); + + /* deal with old setting first */ + switch (ced->mode) { + case CLOCK_EVT_MODE_ONESHOT: + em_sti_stop(p, USER_CLOCKEVENT); + break; + default: + break; + } + + switch (mode) { + case CLOCK_EVT_MODE_ONESHOT: + dev_info(&p->pdev->dev, "used for oneshot clock events\n"); + em_sti_start(p, USER_CLOCKEVENT); + clockevents_config(&p->ced, p->rate); + break; + case CLOCK_EVT_MODE_SHUTDOWN: + case CLOCK_EVT_MODE_UNUSED: + em_sti_stop(p, USER_CLOCKEVENT); + break; + default: + break; + } +} + +static int em_sti_clock_event_next(unsigned long delta, + struct clock_event_device *ced) +{ + struct em_sti_priv *p = ced_to_em_sti(ced); + cycle_t next; + int safe; + + next = em_sti_set_next(p, em_sti_count(p) + delta); + safe = em_sti_count(p) < (next - 1); + + return !safe; +} + +static void em_sti_register_clockevent(struct em_sti_priv *p) +{ + struct clock_event_device *ced = &p->ced; + + memset(ced, 0, sizeof(*ced)); + ced->name = dev_name(&p->pdev->dev); + ced->features = CLOCK_EVT_FEAT_ONESHOT; + ced->rating = 200; + ced->cpumask = cpumask_of(0); + ced->set_next_event = em_sti_clock_event_next; + ced->set_mode = em_sti_clock_event_mode; + + dev_info(&p->pdev->dev, "used for clock events\n"); + + /* Register with dummy 1 Hz value, gets updated in ->set_mode() */ + clockevents_config_and_register(ced, 1, 2, 0xffffffff); +} + +static int __devinit em_sti_probe(struct platform_device *pdev) +{ + struct em_sti_priv *p; + struct resource *res; + int irq, ret; + + p = kzalloc(sizeof(*p), GFP_KERNEL); + if (p == NULL) { + dev_err(&pdev->dev, "failed to allocate driver data\n"); + ret = -ENOMEM; + goto err0; + } + + p->pdev = pdev; + platform_set_drvdata(pdev, p); + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!res) { + dev_err(&pdev->dev, "failed to get I/O memory\n"); + ret = -EINVAL; + goto err0; + } + + irq = platform_get_irq(pdev, 0); + if (irq < 0) { + dev_err(&pdev->dev, "failed to get irq\n"); + ret = -EINVAL; + goto err0; + } + + /* map memory, let base point to the STI instance */ + p->base = ioremap_nocache(res->start, resource_size(res)); + if (p->base == NULL) { + dev_err(&pdev->dev, "failed to remap I/O memory\n"); + ret = -ENXIO; + goto err0; + } + + /* get hold of clock */ + p->clk = clk_get(&pdev->dev, "sclk"); + if (IS_ERR(p->clk)) { + dev_err(&pdev->dev, "cannot get clock\n"); + ret = PTR_ERR(p->clk); + goto err1; + } + + if (request_irq(irq, em_sti_interrupt, + IRQF_TIMER | IRQF_IRQPOLL | IRQF_NOBALANCING, + dev_name(&pdev->dev), p)) { + dev_err(&pdev->dev, "failed to request low IRQ\n"); + ret = -ENOENT; + goto err2; + } + + raw_spin_lock_init(&p->lock); + em_sti_register_clockevent(p); + em_sti_register_clocksource(p); + return 0; + +err2: + clk_put(p->clk); +err1: + iounmap(p->base); +err0: + kfree(p); + return ret; +} + +static int __devexit em_sti_remove(struct platform_device *pdev) +{ + return -EBUSY; /* cannot unregister clockevent and clocksource */ +} + +static struct platform_driver em_sti_device_driver = { + .probe = em_sti_probe, + .remove = __devexit_p(em_sti_remove), + .driver = { + .name = "em_sti", + } +}; + +module_platform_driver(em_sti_device_driver); + +MODULE_AUTHOR("Magnus Damm"); +MODULE_DESCRIPTION("Renesas Emma Mobile STI Timer Driver"); +MODULE_LICENSE("GPL v2"); From fc0830fe017d02b7b4995b5c402b484b65d9dfc6 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Wed, 9 May 2012 23:39:50 +0900 Subject: [PATCH 021/178] clocksource: em_sti: Add DT support Update the em-sti driver to support DT. 
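
A matching device tree node would look roughly like this (the unit
address, register window size and interrupt specifier are illustrative
placeholders, not taken from a real dtsi):

	sti@e0180000 {
		compatible = "renesas,em-sti";
		reg = <0xe0180000 0x54>;
		interrupts = <0 125 0>;
	};
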
Signed-off-by: Magnus Damm Cc: arnd@arndb.de Cc: horms@verge.net.au Cc: johnstul@us.ibm.com Cc: rjw@sisk.pl Cc: lethal@linux-sh.org Cc: gregkh@linuxfoundation.org Cc: olof@lixom.net Link: http://lkml.kernel.org/r/20120509143950.27521.7949.sendpatchset@w520 Signed-off-by: Thomas Gleixner --- drivers/clocksource/em_sti.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/clocksource/em_sti.c b/drivers/clocksource/em_sti.c index 719584a5952b..372051d1bba8 100644 --- a/drivers/clocksource/em_sti.c +++ b/drivers/clocksource/em_sti.c @@ -384,11 +384,18 @@ static int __devexit em_sti_remove(struct platform_device *pdev) return -EBUSY; /* cannot unregister clockevent and clocksource */ } +static const struct of_device_id em_sti_dt_ids[] __devinitconst = { + { .compatible = "renesas,em-sti", }, + {}, +}; +MODULE_DEVICE_TABLE(of, em_sti_dt_ids); + static struct platform_driver em_sti_device_driver = { .probe = em_sti_probe, .remove = __devexit_p(em_sti_remove), .driver = { .name = "em_sti", + .of_match_table = em_sti_dt_ids, } }; From 62cf20b32aee4ae889a2eb40fd41c0eab73de970 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 25 May 2012 14:08:57 +0200 Subject: [PATCH 022/178] tick: Move skew_tick option into the HIGH_RES_TIMER section commit 5307c95 (tick: Add tick skew boot option) broke the !CONFIG_HIGH_RES_TIMERS build. Move the boot option parsing into the CONFIG_HIGH_RES_TIMERS section. Reported-by: Ingo Molnar Signed-off-by: Thomas Gleixner Cc: Mike Galbraith --- kernel/time/tick-sched.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 4eddbb5ea9c5..efd386667536 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -816,6 +816,14 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) static int sched_skew_tick; +static int __init skew_tick(char *str) +{ + get_option(&str, &sched_skew_tick); + + return 0; +} +early_param("skew_tick", skew_tick); + /** * tick_setup_sched_timer - setup the tick emulation timer */ @@ -920,11 +928,3 @@ int tick_check_oneshot_change(int allow_nohz) tick_nohz_switch_to_nohz(); return 0; } - -static int __init skew_tick(char *str) -{ - get_option(&str, &sched_skew_tick); - - return 0; -} -early_param("skew_tick", skew_tick); From fa980ca87d15bb8a1317853f257a505990f3ffde Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 24 May 2012 08:24:39 -0700 Subject: [PATCH 023/178] cgroup: superblock can't be released with active dentries 48ddbe1946 "cgroup: make css->refcnt clearing on cgroup removal optional" allowed a css to linger after the associated cgroup is removed. As a css holds a reference on the cgroup's dentry, it means that cgroup dentries may linger for a while. cgroup_create() does grab an active reference on the superblock to prevent it from going away while there are !root cgroups; however, the reference is put from cgroup_diput() which is invoked on cgroup removal, so cgroup dentries which are removed but persisting due to lingering csses already have released their superblock active refs allowing superblock to be killed while those dentries are around. Given the right condition, this makes cgroup_kill_sb() call kill_litter_super() with dentries with non-zero d_count leading to BUG() in shrink_dcache_for_umount_subtree(). Fix it by adding cgroup_dops->d_release() operation and moving deactivate_super() to it. 
cgroup_diput() now marks dentry->d_fsdata with itself if superblock should be deactivated and cgroup_d_release() deactivates the superblock on dentry release. Signed-off-by: Tejun Heo Reported-by: Sasha Levin Tested-by: Sasha Levin LKML-Reference: Acked-by: Li Zefan --- kernel/cgroup.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ad8eae5bb801..e887b55f1f29 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -896,10 +896,13 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) mutex_unlock(&cgroup_mutex); /* - * Drop the active superblock reference that we took when we - * created the cgroup + * We want to drop the active superblock reference from the + * cgroup creation after all the dentry refs are gone - + * kill_sb gets mighty unhappy otherwise. Mark + * dentry->d_fsdata with cgroup_diput() to tell + * cgroup_d_release() to call deactivate_super(). */ - deactivate_super(cgrp->root->sb); + dentry->d_fsdata = cgroup_diput; /* * if we're getting rid of the cgroup, refcount should ensure @@ -925,6 +928,13 @@ static int cgroup_delete(const struct dentry *d) return 1; } +static void cgroup_d_release(struct dentry *dentry) +{ + /* did cgroup_diput() tell me to deactivate super? */ + if (dentry->d_fsdata == cgroup_diput) + deactivate_super(dentry->d_sb); +} + static void remove_dir(struct dentry *d) { struct dentry *parent = dget(d->d_parent); @@ -1532,6 +1542,7 @@ static int cgroup_get_rootdir(struct super_block *sb) static const struct dentry_operations cgroup_dops = { .d_iput = cgroup_diput, .d_delete = cgroup_delete, + .d_release = cgroup_d_release, }; struct inode *inode = From 2aa4ee2a8805ec0260dde971e9e6699917c868a7 Mon Sep 17 00:00:00 2001 From: Jim Kukunas Date: Mon, 28 May 2012 14:10:22 +1000 Subject: [PATCH 024/178] lib/raid6: fix sparse warnings in recovery functions Make the recovery functions static to fix the following sparse warnings: lib/raid6/recov.c:25:6: warning: symbol 'raid6_2data_recov_intx1' was not declared. Should it be static? lib/raid6/recov.c:69:6: warning: symbol 'raid6_datap_recov_intx1' was not declared. Should it be static? lib/raid6/recov_ssse3.c:22:6: warning: symbol 'raid6_2data_recov_ssse3' was not declared. Should it be static? lib/raid6/recov_ssse3.c:197:6: warning: symbol 'raid6_datap_recov_ssse3' was not declared. Should it be static? Reported-by: Fengguang Wu Signed-off-by: Jim Kukunas Signed-off-by: NeilBrown --- lib/raid6/recov.c | 7 ++++--- lib/raid6/recov_ssse3.c | 7 ++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/lib/raid6/recov.c b/lib/raid6/recov.c index 1805a5cc5daa..a95bccb8497d 100644 --- a/lib/raid6/recov.c +++ b/lib/raid6/recov.c @@ -22,8 +22,8 @@ #include /* Recover two failed data blocks. 
*/ -void raid6_2data_recov_intx1(int disks, size_t bytes, int faila, int failb, - void **ptrs) +static void raid6_2data_recov_intx1(int disks, size_t bytes, int faila, + int failb, void **ptrs) { u8 *p, *q, *dp, *dq; u8 px, qx, db; @@ -66,7 +66,8 @@ void raid6_2data_recov_intx1(int disks, size_t bytes, int faila, int failb, } /* Recover failure of one data block plus the P block */ -void raid6_datap_recov_intx1(int disks, size_t bytes, int faila, void **ptrs) +static void raid6_datap_recov_intx1(int disks, size_t bytes, int faila, + void **ptrs) { u8 *p, *q, *dq; const u8 *qmul; /* Q multiplier table */ diff --git a/lib/raid6/recov_ssse3.c b/lib/raid6/recov_ssse3.c index 37ae61930559..ecb710c0b4d9 100644 --- a/lib/raid6/recov_ssse3.c +++ b/lib/raid6/recov_ssse3.c @@ -19,8 +19,8 @@ static int raid6_has_ssse3(void) boot_cpu_has(X86_FEATURE_SSSE3); } -void raid6_2data_recov_ssse3(int disks, size_t bytes, int faila, int failb, - void **ptrs) +static void raid6_2data_recov_ssse3(int disks, size_t bytes, int faila, + int failb, void **ptrs) { u8 *p, *q, *dp, *dq; const u8 *pbmul; /* P multiplier table for B data */ @@ -194,7 +194,8 @@ void raid6_2data_recov_ssse3(int disks, size_t bytes, int faila, int failb, } -void raid6_datap_recov_ssse3(int disks, size_t bytes, int faila, void **ptrs) +static void raid6_datap_recov_ssse3(int disks, size_t bytes, int faila, + void **ptrs) { u8 *p, *q, *dq; const u8 *qmul; /* Q multiplier table */ From c3b20037926e607b6cdaecbf9d3103e2ca63bc31 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 28 May 2012 22:33:02 +0100 Subject: [PATCH 025/178] drm/i915: Reset last_retired_head when resetting ring When we reset the ring control registers, including the HEAD and TAIL of the ring, we also need to reset associated state. In this instance, we were failing to reset the cached value of ring->last_retired_head and so upon the first request for more space following a resume would potentially (depending on a narrow race window) believe that the HEAD had advanced much further than reality. This is a regression from: commit a71d8d94525e8fd855c0466fb586ae1cb008f3a2 Author: Chris Wilson Date: Wed Feb 15 11:25:36 2012 +0000 drm/i915: Record the tail at each request and use it to estimate the head Signed-off-by: Chris Wilson Cc: stable@vger.kernel.org # 3.4 Signed-off-by: Daniel Vetter --- drivers/gpu/drm/i915/intel_ringbuffer.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c index b59b6d5b7583..490df551e312 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.c +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c @@ -326,6 +326,7 @@ static int init_ring_common(struct intel_ring_buffer *ring) ring->head = I915_READ_HEAD(ring); ring->tail = I915_READ_TAIL(ring) & TAIL_ADDR; ring->space = ring_space(ring); + ring->last_retired_head = -1; } return 0; From c10d7a13846bffa5c77f1122500b687ab902e2d2 Mon Sep 17 00:00:00 2001 From: Daniel Drake Date: Thu, 10 May 2012 00:08:43 +0100 Subject: [PATCH 026/178] ACPI / PM: Generate wakeup events on fixed power button When the system is woken up by the ACPI fixed power button, currently there is no way of userspace becoming aware that the power button was pressed. OLPC would like to know this, so that we can respond appropriately. For example, if the system was woken up by a network packet, we know we can go back to sleep very quickly. But if the user explicitly woke the system with the power button, we're going to want to stay awake for a while. 
The wakeup count mechanism seems like a good fit for communicating this. Mark the fixed power button as wakeup-enabled, and increment its wakeup counter when the system is woken with the power button. (The wakeup counter is also incremented when the power button is pressed during system operation; this is already handled by an existing acpi-button codepath). Signed-off-by: Daniel Drake Acked-by: Zhang Rui Signed-off-by: Rafael J. Wysocki --- drivers/acpi/scan.c | 1 + drivers/acpi/sleep.c | 45 ++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 44 insertions(+), 2 deletions(-) diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index 85cbfdccc97c..c8a1f3b68110 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -1567,6 +1567,7 @@ static int acpi_bus_scan_fixed(void) ACPI_BUS_TYPE_POWER_BUTTON, ACPI_STA_DEFAULT, &ops); + device_init_wakeup(&device->dev, true); } if ((acpi_gbl_FADT.flags & ACPI_FADT_SLEEP_BUTTON) == 0) { diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c index eb6fd233764b..a564fc3ffa1c 100644 --- a/drivers/acpi/sleep.c +++ b/drivers/acpi/sleep.c @@ -57,6 +57,7 @@ MODULE_PARM_DESC(gts, "Enable evaluation of _GTS on suspend."); MODULE_PARM_DESC(bfs, "Enable evaluation of _BFS on resume".); static u8 sleep_states[ACPI_S_STATE_COUNT]; +static bool pwr_btn_event_pending; static void acpi_sleep_tts_switch(u32 acpi_state) { @@ -186,6 +187,14 @@ static int acpi_pm_prepare(void) return error; } +static int find_powerf_dev(struct device *dev, void *data) +{ + struct acpi_device *device = to_acpi_device(dev); + const char *hid = acpi_device_hid(device); + + return !strcmp(hid, ACPI_BUTTON_HID_POWERF); +} + /** * acpi_pm_finish - Instruct the platform to leave a sleep state. * @@ -194,6 +203,7 @@ static int acpi_pm_prepare(void) */ static void acpi_pm_finish(void) { + struct device *pwr_btn_dev; u32 acpi_state = acpi_target_sleep_state; acpi_ec_unblock_transactions(); @@ -211,6 +221,23 @@ static void acpi_pm_finish(void) acpi_set_firmware_waking_vector((acpi_physical_address) 0); acpi_target_sleep_state = ACPI_STATE_S0; + + /* If we were woken with the fixed power button, provide a small + * hint to userspace in the form of a wakeup event on the fixed power + * button device (if it can be found). + * + * We delay the event generation til now, as the PM layer requires + * timekeeping to be running before we generate events. */ + if (!pwr_btn_event_pending) + return; + + pwr_btn_event_pending = false; + pwr_btn_dev = bus_find_device(&acpi_bus_type, NULL, NULL, + find_powerf_dev); + if (pwr_btn_dev) { + pm_wakeup_event(pwr_btn_dev, 0); + put_device(pwr_btn_dev); + } } /** @@ -300,9 +327,23 @@ static int acpi_suspend_enter(suspend_state_t pm_state) /* ACPI 3.0 specs (P62) says that it's the responsibility * of the OSPM to clear the status bit [ implying that the * POWER_BUTTON event should not reach userspace ] + * + * However, we do generate a small hint for userspace in the form of + * a wakeup event. We flag this condition for now and generate the + * event later, as we're currently too early in resume to be able to + * generate wakeup events. 
*/ - if (ACPI_SUCCESS(status) && (acpi_state == ACPI_STATE_S3)) - acpi_clear_event(ACPI_EVENT_POWER_BUTTON); + if (ACPI_SUCCESS(status) && (acpi_state == ACPI_STATE_S3)) { + acpi_event_status pwr_btn_status; + + acpi_get_event_status(ACPI_EVENT_POWER_BUTTON, &pwr_btn_status); + + if (pwr_btn_status & ACPI_EVENT_FLAG_SET) { + acpi_clear_event(ACPI_EVENT_POWER_BUTTON); + /* Flag for later */ + pwr_btn_event_pending = true; + } + } /* * Disable and clear GPE status before interrupt is enabled. Some GPEs From b2201e5482bc2376ea5c049442850a260142ac40 Mon Sep 17 00:00:00 2001 From: Daniel Drake Date: Fri, 18 May 2012 22:59:41 +0200 Subject: [PATCH 027/178] rtc-cmos / PM: report wakeup event on ACPI RTC alarm When the ACPI-driven RTC alarm wakes the system, report it as a wakeup event. This allows userspace to determine that the reason for system wakeup was RTC alarm. Signed-off-by: Daniel Drake Signed-off-by: Rafael J. Wysocki --- drivers/rtc/rtc-cmos.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/rtc/rtc-cmos.c b/drivers/rtc/rtc-cmos.c index 7d5f56edb8ef..4267789ca995 100644 --- a/drivers/rtc/rtc-cmos.c +++ b/drivers/rtc/rtc-cmos.c @@ -910,14 +910,17 @@ static inline int cmos_poweroff(struct device *dev) static u32 rtc_handler(void *context) { + struct device *dev = context; + + pm_wakeup_event(dev, 0); acpi_clear_event(ACPI_EVENT_RTC); acpi_disable_event(ACPI_EVENT_RTC, 0); return ACPI_INTERRUPT_HANDLED; } -static inline void rtc_wake_setup(void) +static inline void rtc_wake_setup(struct device *dev) { - acpi_install_fixed_event_handler(ACPI_EVENT_RTC, rtc_handler, NULL); + acpi_install_fixed_event_handler(ACPI_EVENT_RTC, rtc_handler, dev); /* * After the RTC handler is installed, the Fixed_RTC event should * be disabled. Only when the RTC alarm is set will it be enabled. @@ -950,7 +953,7 @@ cmos_wake_setup(struct device *dev) if (acpi_disabled) return; - rtc_wake_setup(); + rtc_wake_setup(dev); acpi_rtc_info.wake_on = rtc_wake_on; acpi_rtc_info.wake_off = rtc_wake_off; From 63a1a765dffb1e59d82c7948638e56d5f4f2e3a1 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 20 May 2012 13:57:52 +0200 Subject: [PATCH 028/178] ACPI / PM: Fix error messages in drivers/acpi/bus.c After recent changes of the ACPI device low-power states definitions kernel messages in drivers/acpi/bus.c need to be updated so that they include the correct names of the states in question (currently is "D3" for D3hot and "D4" for D3cold, which is incorrect). Signed-off-by: Rafael J. 
Wysocki --- drivers/acpi/bus.c | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c index 3188da3df8da..a41be56c1cc0 100644 --- a/drivers/acpi/bus.c +++ b/drivers/acpi/bus.c @@ -182,6 +182,24 @@ EXPORT_SYMBOL(acpi_bus_get_private_data); Power Management -------------------------------------------------------------------------- */ +static const char *state_string(int state) +{ + switch (state) { + case ACPI_STATE_D0: + return "D0"; + case ACPI_STATE_D1: + return "D1"; + case ACPI_STATE_D2: + return "D2"; + case ACPI_STATE_D3_HOT: + return "D3hot"; + case ACPI_STATE_D3_COLD: + return "D3"; + default: + return "(unknown)"; + } +} + static int __acpi_bus_get_power(struct acpi_device *device, int *state) { int result = 0; @@ -215,8 +233,8 @@ static int __acpi_bus_get_power(struct acpi_device *device, int *state) device->parent->power.state : ACPI_STATE_D0; } - ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Device [%s] power state is D%d\n", - device->pnp.bus_id, *state)); + ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Device [%s] power state is %s\n", + device->pnp.bus_id, state_string(*state))); return 0; } @@ -234,13 +252,14 @@ static int __acpi_bus_set_power(struct acpi_device *device, int state) /* Make sure this is a valid target state */ if (state == device->power.state) { - ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Device is already at D%d\n", - state)); + ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Device is already at %s\n", + state_string(state))); return 0; } if (!device->power.states[state].flags.valid) { - printk(KERN_WARNING PREFIX "Device does not support D%d\n", state); + printk(KERN_WARNING PREFIX "Device does not support %s\n", + state_string(state)); return -ENODEV; } if (device->parent && (state < device->parent->power.state)) { @@ -294,13 +313,13 @@ static int __acpi_bus_set_power(struct acpi_device *device, int state) end: if (result) printk(KERN_WARNING PREFIX - "Device [%s] failed to transition to D%d\n", - device->pnp.bus_id, state); + "Device [%s] failed to transition to %s\n", + device->pnp.bus_id, state_string(state)); else { device->power.state = state; ACPI_DEBUG_PRINT((ACPI_DB_INFO, - "Device [%s] transitioned to D%d\n", - device->pnp.bus_id, state)); + "Device [%s] transitioned to %s\n", + device->pnp.bus_id, state_string(state))); } return result; From 38c92fff988d518fe80dc23d0d44d66bd7e47ddd Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 20 May 2012 13:58:00 +0200 Subject: [PATCH 029/178] ACPI / PM: Make __acpi_bus_get_power() cover D3cold correctly After recent changes of the ACPI device power states definitions, if power resources are not used for the device's power management, the state returned by __acpi_bus_get_power() cannot exceed D3hot, because the return values of _PSC are 0 through 3. However, if the _PR3 method is not present for the device and _PS3 returns 3, we have to assume that the device is in D3cold, so the value returned by __acpi_bus_get_power() in that case should be 4. Similarly, acpi_power_get_inferred_state() should take the power resources for the D3hot state into account in general, so that it can return 3 if those resources are "on" or 4 (D3cold) otherwise. Fix the the above two issues and make sure that if both _PSC and _PR3 are present for the device, the power resources listed by _PR3 will be used to determine if the number 3 returned by _PSC is meant to represent D3cold or D3hot. Signed-off-by: Rafael J. 
Wysocki --- drivers/acpi/bus.c | 51 +++++++++++++++++++++++++------------------- drivers/acpi/power.c | 2 +- 2 files changed, 30 insertions(+), 23 deletions(-) diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c index a41be56c1cc0..adceafda9c17 100644 --- a/drivers/acpi/bus.c +++ b/drivers/acpi/bus.c @@ -202,37 +202,44 @@ static const char *state_string(int state) static int __acpi_bus_get_power(struct acpi_device *device, int *state) { - int result = 0; - acpi_status status = 0; - unsigned long long psc = 0; + int result = ACPI_STATE_UNKNOWN; if (!device || !state) return -EINVAL; - *state = ACPI_STATE_UNKNOWN; - - if (device->flags.power_manageable) { - /* - * Get the device's power state either directly (via _PSC) or - * indirectly (via power resources). - */ - if (device->power.flags.power_resources) { - result = acpi_power_get_inferred_state(device, state); - if (result) - return result; - } else if (device->power.flags.explicit_get) { - status = acpi_evaluate_integer(device->handle, "_PSC", - NULL, &psc); - if (ACPI_FAILURE(status)) - return -ENODEV; - *state = (int)psc; - } - } else { + if (!device->flags.power_manageable) { /* TBD: Non-recursive algorithm for walking up hierarchy. */ *state = device->parent ? device->parent->power.state : ACPI_STATE_D0; + goto out; } + /* + * Get the device's power state either directly (via _PSC) or + * indirectly (via power resources). + */ + if (device->power.flags.explicit_get) { + unsigned long long psc; + acpi_status status = acpi_evaluate_integer(device->handle, + "_PSC", NULL, &psc); + if (ACPI_FAILURE(status)) + return -ENODEV; + + result = psc; + } + /* The test below covers ACPI_STATE_UNKNOWN too. */ + if (result <= ACPI_STATE_D2) { + ; /* Do nothing. */ + } else if (device->power.flags.power_resources) { + int error = acpi_power_get_inferred_state(device, &result); + if (error) + return error; + } else if (result == ACPI_STATE_D3_HOT) { + result = ACPI_STATE_D3; + } + *state = result; + + out: ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Device [%s] power state is %s\n", device->pnp.bus_id, state_string(*state))); diff --git a/drivers/acpi/power.c b/drivers/acpi/power.c index 0500f719f63e..dd6d6a3c6780 100644 --- a/drivers/acpi/power.c +++ b/drivers/acpi/power.c @@ -631,7 +631,7 @@ int acpi_power_get_inferred_state(struct acpi_device *device, int *state) * We know a device's inferred power state when all the resources * required for a given D-state are 'on'. */ - for (i = ACPI_STATE_D0; i < ACPI_STATE_D3_HOT; i++) { + for (i = ACPI_STATE_D0; i <= ACPI_STATE_D3_HOT; i++) { list = &device->power.states[i].resources; if (list->count < 1) continue; From dbe9a2edd17d843d80faf2b99f20a691c1853418 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 29 May 2012 21:21:07 +0200 Subject: [PATCH 030/178] ACPI / PM: Make acpi_pm_device_sleep_state() follow the specification The comparison between the system sleep state being entered and the lowest system sleep state the given device may wake up from in acpi_pm_device_sleep_state() is reversed, because the specification (ACPI 5.0) says that for wakeup to work: "The sleeping state being entered must be less than or equal to the power state declared in element 1 of the _PRW object." In other words, the state returned by _PRW is the deepest (lowest-power) system sleep state the device is capable of waking up the system from. 
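Reduced to essentials, the corrected capability test reads as follows (an illustrative sketch, not the exact kernel code; the real path goes on to evaluate _SxW):

    /*
     * _PRW element 1 names the deepest sleep state the device can
     * wake the system from, so wakeup works when the target state
     * is no deeper than that, i.e. target <= prw.
     */
    static bool can_wake_from(int prw_sleep_state, int target_sleep_state)
    {
            return target_sleep_state <= prw_sleep_state;
    }

For example, a device whose _PRW declares S4 can wake the system from S3 or S4 (3 <= 4, 4 <= 4), whereas the old, reversed comparison would have rejected S3.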
Moreover, acpi_pm_device_sleep_state() also should check if the wakeup capability is supported through ACPI, because in principle it may be done via native PCIe PME, for example, in which case _SxW should not be evaluated. Signed-off-by: Rafael J. Wysocki --- drivers/acpi/sleep.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c index a564fc3ffa1c..d8b381e94ee2 100644 --- a/drivers/acpi/sleep.c +++ b/drivers/acpi/sleep.c @@ -773,8 +773,8 @@ int acpi_pm_device_sleep_state(struct device *dev, int *d_min_p) * can wake the system. _S0W may be valid, too. */ if (acpi_target_sleep_state == ACPI_STATE_S0 || - (device_may_wakeup(dev) && - adev->wakeup.sleep_state <= acpi_target_sleep_state)) { + (device_may_wakeup(dev) && adev->wakeup.flags.valid && + adev->wakeup.sleep_state >= acpi_target_sleep_state)) { acpi_status status; acpi_method[3] = 'W'; From cd4fedf9cfe2d453148dce0f105a41725e5107a3 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Wed, 23 May 2012 21:02:07 +0530 Subject: [PATCH 031/178] RDMA/ocrdma: Correct queue free count math Correct queue free count math for SQ, RQ for all hardware type. Update user-kernel ABI interface. Signed-off-by: Parav Pandit Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ocrdma/ocrdma.h | 1 - drivers/infiniband/hw/ocrdma/ocrdma_abi.h | 5 +---- drivers/infiniband/hw/ocrdma/ocrdma_hw.c | 7 ------- drivers/infiniband/hw/ocrdma/ocrdma_verbs.c | 5 ----- 4 files changed, 1 insertion(+), 17 deletions(-) diff --git a/drivers/infiniband/hw/ocrdma/ocrdma.h b/drivers/infiniband/hw/ocrdma/ocrdma.h index 85a69c958559..037f5cea85bd 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma.h @@ -231,7 +231,6 @@ struct ocrdma_qp_hwq_info { u32 entry_size; u32 max_cnt; u32 max_wqe_idx; - u32 free_delta; u16 dbid; /* qid, where to ring the doorbell. 
*/ u32 len; dma_addr_t pa; diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_abi.h b/drivers/infiniband/hw/ocrdma/ocrdma_abi.h index a411a4e3193d..517ab20b727c 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_abi.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma_abi.h @@ -101,8 +101,6 @@ struct ocrdma_create_qp_uresp { u32 rsvd1; u32 num_wqe_allocated; u32 num_rqe_allocated; - u32 free_wqe_delta; - u32 free_rqe_delta; u32 db_sq_offset; u32 db_rq_offset; u32 db_shift; @@ -126,8 +124,7 @@ struct ocrdma_create_srq_uresp { u32 db_rq_offset; u32 db_shift; - u32 free_rqe_delta; - u32 rsvd2; + u64 rsvd2; u64 rsvd3; } __packed; diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c index 9b204b1ba336..f26314fd5f6e 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c @@ -1990,19 +1990,12 @@ static void ocrdma_get_create_qp_rsp(struct ocrdma_create_qp_rsp *rsp, max_wqe_allocated = 1 << max_wqe_allocated; max_rqe_allocated = 1 << ((u16)rsp->max_wqe_rqe); - if (qp->dev->nic_info.dev_family == OCRDMA_GEN2_FAMILY) { - qp->sq.free_delta = 0; - qp->rq.free_delta = 1; - } else - qp->sq.free_delta = 1; - qp->sq.max_cnt = max_wqe_allocated; qp->sq.max_wqe_idx = max_wqe_allocated - 1; if (!attrs->srq) { qp->rq.max_cnt = max_rqe_allocated; qp->rq.max_wqe_idx = max_rqe_allocated - 1; - qp->rq.free_delta = 1; } } diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c index e9f74d1b48f6..d16d172b6b6b 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c @@ -940,8 +940,6 @@ static int ocrdma_copy_qp_uresp(struct ocrdma_qp *qp, uresp.db_rq_offset = OCRDMA_DB_RQ_OFFSET; uresp.db_shift = 16; } - uresp.free_wqe_delta = qp->sq.free_delta; - uresp.free_rqe_delta = qp->rq.free_delta; if (qp->dpp_enabled) { uresp.dpp_credit = dpp_credit_lmt; @@ -1307,8 +1305,6 @@ static int ocrdma_hwq_free_cnt(struct ocrdma_qp_hwq_info *q) free_cnt = (q->max_cnt - q->head) + q->tail; else free_cnt = q->tail - q->head; - if (q->free_delta) - free_cnt -= q->free_delta; return free_cnt; } @@ -1501,7 +1497,6 @@ static int ocrdma_copy_srq_uresp(struct ocrdma_srq *srq, struct ib_udata *udata) (srq->pd->id * srq->dev->nic_info.db_page_size); uresp.db_page_size = srq->dev->nic_info.db_page_size; uresp.num_rqe_allocated = srq->rq.max_cnt; - uresp.free_rqe_delta = 1; if (srq->dev->nic_info.dev_family == OCRDMA_GEN2_FAMILY) { uresp.db_rq_offset = OCRDMA_DB_GEN2_RQ1_OFFSET; uresp.db_shift = 24; From 804eaf29bac4148aa265bedc62182ed41a4c6120 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Wed, 23 May 2012 21:11:17 +0530 Subject: [PATCH 032/178] RDMA/ocrdma: Fix signaled event for SRQ_LIMIT_REACHED Signed-off-by: Parav Pandit Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ocrdma/ocrdma_hw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c index f26314fd5f6e..9343a1522977 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c @@ -732,7 +732,7 @@ static void ocrdma_dispatch_ibevent(struct ocrdma_dev *dev, break; case OCRDMA_SRQ_LIMIT_EVENT: ib_evt.element.srq = &qp->srq->ibsrq; - ib_evt.event = IB_EVENT_QP_LAST_WQE_REACHED; + ib_evt.event = IB_EVENT_SRQ_LIMIT_REACHED; srq_event = 1; qp_event = 0; break; From 7ad5e449b96bd82f406ed4657a64c8f72a48896d Mon Sep 17 00:00:00 2001 From: Devendra Naga Date: Tue, 29 May 2012 
12:51:47 +0530 Subject: [PATCH 033/178] RDMA/ocrdma: Remove unnecessary version.h includes "make versioncheck" shows: drivers/infiniband/hw/ocrdma/ocrdma_main.c: 29 linux/version.h not needed. drivers/infiniband/hw/ocrdma/ocrdma_verbs.h: 31 linux/version.h not needed. Signed-off-by: Devendra Naga Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ocrdma/ocrdma_main.c | 1 - drivers/infiniband/hw/ocrdma/ocrdma_verbs.h | 1 - 2 files changed, 2 deletions(-) diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c index a20d16eaae71..04fef3de6d75 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_main.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c @@ -26,7 +26,6 @@ *******************************************************************/ #include -#include #include #include #include diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h index e6483439f25f..633f03d80274 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h @@ -28,7 +28,6 @@ #ifndef __OCRDMA_VERBS_H__ #define __OCRDMA_VERBS_H__ -#include int ocrdma_post_send(struct ib_qp *, struct ib_send_wr *, struct ib_send_wr **bad_wr); int ocrdma_post_recv(struct ib_qp *, struct ib_recv_wr *, From 9f646389aa7727a2fd8f9ae6337b92af9cfbc264 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 29 May 2012 16:39:09 +0200 Subject: [PATCH 034/178] sched/x86: Use cpu_llc_shared_mask(cpu) for coregroup_mask Commit commit 8e7fbcbc2 ("sched: Remove stale power aware scheduling remnants and dysfunctional knobs") made a boo-boo with removing the power aware scheduling muck from the x86 topology bits. We should unconditionally use the llc_shared mask for multi-core. Reported-and-tested-by: Mike Galbraith Signed-off-by: Peter Zijlstra Cc: Borislav Petkov Cc: Andreas Herrmann Link: http://lkml.kernel.org/n/tip-lsksc2kfyeveb13avh327p0d@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index f56f96da77f5..fd019d78b1f4 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -410,15 +410,7 @@ void __cpuinit set_cpu_sibling_map(int cpu) /* maps the cpu to the sched domain representing multi-core */ const struct cpumask *cpu_coregroup_mask(int cpu) { - struct cpuinfo_x86 *c = &cpu_data(cpu); - /* - * For perf, we return last level cache shared map. - * And for power savings, we return cpu_core_map - */ - if (!(cpu_has(c, X86_FEATURE_AMD_DCM))) - return cpu_core_mask(cpu); - else - return cpu_llc_shared_mask(cpu); + return cpu_llc_shared_mask(cpu); } static void impress_friends(void) From 5aaa0b7a2ed5b12692c9ffb5222182bd558d3146 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 17 May 2012 17:15:29 +0200 Subject: [PATCH 035/178] sched/nohz: Fix rq->cpu_load calculations some more Follow up on commit 556061b00 ("sched/nohz: Fix rq->cpu_load[] calculations") since while that fixed the busy case it regressed the mostly idle case. Add a callback from the nohz exit to also age the rq->cpu_load[] array. This closes the hole where either there was no nohz load balance pass during the nohz, or there was a 'significant' amount of idle time between the last nohz balance and the nohz exit. So we'll update unconditionally from the tick to not insert any accidental 0 load periods while busy, and we try and catch up from nohz idle balance and nohz exit. 
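In sketch form, the nohz-exit catch-up amounts to the following (condensed from the update_cpu_load_nohz() hunk below; locking elided):

    unsigned long curr = ACCESS_ONCE(jiffies);  /* one read, stable snapshot */
    unsigned long pending = curr - this_rq->last_load_update_tick;

    if (pending) {
            this_rq->last_load_update_tick = curr;
            /* we were idle, so the load over the missed ticks was 0 */
            __update_cpu_load(this_rq, 0, pending);
    }

The idle-balance path (update_idle_cpu_load()) does the same with the current load value.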
Both these are still prone to missing a jiffy, but that has always been the case. Signed-off-by: Peter Zijlstra Cc: pjt@google.com Cc: Venkatesh Pallipadi Link: http://lkml.kernel.org/n/tip-kt0trz0apodbf84ucjfdbr1a@git.kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + kernel/sched/core.c | 53 ++++++++++++++++++++++++++++++++-------- kernel/time/tick-sched.c | 1 + 3 files changed, 45 insertions(+), 10 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index f45c0b280b5d..d61e5977e517 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -145,6 +145,7 @@ extern unsigned long this_cpu_load(void); extern void calc_global_load(unsigned long ticks); +extern void update_cpu_load_nohz(void); extern unsigned long get_parent_ip(unsigned long addr); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 39eb6011bc38..75844a8f9aeb 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2517,25 +2517,32 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, sched_avg_update(this_rq); } +#ifdef CONFIG_NO_HZ +/* + * There is no sane way to deal with nohz on smp when using jiffies because the + * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading + * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. + * + * Therefore we cannot use the delta approach from the regular tick since that + * would seriously skew the load calculation. However we'll make do for those + * updates happening while idle (nohz_idle_balance) or coming out of idle + * (tick_nohz_idle_exit). + * + * This means we might still be one tick off for nohz periods. + */ + /* * Called from nohz_idle_balance() to update the load ratings before doing the * idle balance. */ void update_idle_cpu_load(struct rq *this_rq) { - unsigned long curr_jiffies = jiffies; + unsigned long curr_jiffies = ACCESS_ONCE(jiffies); unsigned long load = this_rq->load.weight; unsigned long pending_updates; /* - * Bloody broken means of dealing with nohz, but better than nothing.. - * jiffies is updated by one cpu, another cpu can drift wrt the jiffy - * update and see 0 difference the one time and 2 the next, even though - * we ticked at roughtly the same rate. - * - * Hence we only use this from nohz_idle_balance() and skip this - * nonsense when called from the scheduler_tick() since that's - * guaranteed a stable rate. + * bail if there's load or we're actually up-to-date. */ if (load || curr_jiffies == this_rq->last_load_update_tick) return; @@ -2546,13 +2553,39 @@ void update_idle_cpu_load(struct rq *this_rq) __update_cpu_load(this_rq, load, pending_updates); } +/* + * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. + */ +void update_cpu_load_nohz(void) +{ + struct rq *this_rq = this_rq(); + unsigned long curr_jiffies = ACCESS_ONCE(jiffies); + unsigned long pending_updates; + + if (curr_jiffies == this_rq->last_load_update_tick) + return; + + raw_spin_lock(&this_rq->lock); + pending_updates = curr_jiffies - this_rq->last_load_update_tick; + if (pending_updates) { + this_rq->last_load_update_tick = curr_jiffies; + /* + * We were idle, this means load 0, the current load might be + * !0 due to remote wakeups and the sort. + */ + __update_cpu_load(this_rq, 0, pending_updates); + } + raw_spin_unlock(&this_rq->lock); +} +#endif /* CONFIG_NO_HZ */ + /* * Called from scheduler_tick() */ static void update_cpu_load_active(struct rq *this_rq) { /* - * See the mess in update_idle_cpu_load(). 
+ * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). */ this_rq->last_load_update_tick = jiffies; __update_cpu_load(this_rq, this_rq->load.weight, 1); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 6a3a5b9ff561..0c927cd85345 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -576,6 +576,7 @@ void tick_nohz_idle_exit(void) /* Update jiffies first */ select_nohz_load_balancer(0); tick_do_update_jiffies64(now); + update_cpu_load_nohz(); #ifndef CONFIG_VIRT_CPU_ACCOUNTING /* From 2ea45800d8e1c3c51c45a233d6bd6289a297a386 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 25 May 2012 09:26:43 +0200 Subject: [PATCH 036/178] sched: Don't try allocating memory from offline nodes Allocators don't appreciate it when you try and allocate memory from offline nodes. Reported-and-tested-by: Tony Luck Reported-and-tested-by: Anton Blanchard Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-epfc1io9whb7o22bcujf31vn@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 75844a8f9aeb..55733616baaa 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6436,7 +6436,7 @@ static void sched_init_numa(void) return; for (j = 0; j < nr_node_ids; j++) { - struct cpumask *mask = kzalloc_node(cpumask_size(), GFP_KERNEL, j); + struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL); if (!mask) return; From 74a5ce20e6eeeb3751340b390e7ac1d1d07bbf55 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 23 May 2012 18:00:43 +0200 Subject: [PATCH 037/178] sched: Fix SD_OVERLAP SD_OVERLAP exists to allow overlapping groups, overlapping groups appear in NUMA topologies that aren't fully connected. The typical result of not fully connected NUMA is that each cpu (or rather node) will have different spans for a particular distance. However due to how sched domains are traversed -- only the first cpu in the mask goes one level up -- the next level only cares about the spans of the cpus that went up. Due to this two things were observed to be broken: - build_overlap_sched_groups() -- since its possible the cpu we're building the groups for exists in multiple (or all) groups, the selection criteria of the first group didn't ensure there was a cpu for which is was true that cpumask_first(span) == cpu. Thus load- balancing would terminate. - update_group_power() -- assumed that the cpu span of the first group of the domain was covered by all groups of the child domain. The above explains why this isn't true, so deal with it. 
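For concreteness, a hypothetical not-fully-connected topology shows why spans differ per node at the same distance, and hence why groups overlap:

    /*
     * Hypothetical 4-node line topology, nodes 0-1-2-3, hop cost 10:
     *
     *   distance(i,j):   0    1    2    3
     *              0:   10   20   30   40
     *              1:   20   10   20   30
     *              2:   30   20   10   20
     *              3:   40   30   20   10
     *
     * At the "distance <= 20" domain level, node 0 spans {0,1} while
     * node 1 spans {0,1,2}: the per-cpu groups overlap without being
     * equal, so no child-covers-parent assumption holds.
     */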
Signed-off-by: Peter Zijlstra Cc: David Rientjes Link: http://lkml.kernel.org/r/1337788843.9783.14.camel@laptop Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 7 +++++-- kernel/sched/fair.c | 25 ++++++++++++++++++++----- 2 files changed, 25 insertions(+), 7 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 55733616baaa..3a69374fb427 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6030,11 +6030,14 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) cpumask_or(covered, covered, sg_span); - sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span)); + sg->sgp = *per_cpu_ptr(sdd->sgp, i); atomic_inc(&sg->sgp->ref); - if (cpumask_test_cpu(cpu, sg_span)) + if ((!groups && cpumask_test_cpu(cpu, sg_span)) || + cpumask_first(sg_span) == cpu) { + WARN_ON_ONCE(!cpumask_test_cpu(cpu, sg_span)); groups = sg; + } if (!first) first = sg; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 940e6d17cf96..f0380d4987b3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3574,11 +3574,26 @@ void update_group_power(struct sched_domain *sd, int cpu) power = 0; - group = child->groups; - do { - power += group->sgp->power; - group = group->next; - } while (group != child->groups); + if (child->flags & SD_OVERLAP) { + /* + * SD_OVERLAP domains cannot assume that child groups + * span the current group. + */ + + for_each_cpu(cpu, sched_group_cpus(sdg)) + power += power_of(cpu); + } else { + /* + * !SD_OVERLAP domains can assume that child groups + * span the current group. + */ + + group = child->groups; + do { + power += group->sgp->power; + group = group->next; + } while (group != child->groups); + } sdg->sgp->power = power; } From b654f7de41b0e3903ee2b51d3b8db77fe52ce728 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 22 May 2012 14:04:28 +0200 Subject: [PATCH 038/178] sched: Make sure to not re-read variables after validation We could re-read rq->rt_avg after we validated it was smaller than total, invalidating the check and resulting in an unintended negative. Signed-off-by: Peter Zijlstra Cc: David Rientjes Link: http://lkml.kernel.org/r/1337688268.9698.29.camel@twins Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f0380d4987b3..2b449a762074 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3503,15 +3503,22 @@ unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) unsigned long scale_rt_power(int cpu) { struct rq *rq = cpu_rq(cpu); - u64 total, available; + u64 total, available, age_stamp, avg; - total = sched_avg_period() + (rq->clock - rq->age_stamp); + /* + * Since we're reading these variables without serialization make sure + * we read them once before doing sanity checks on them. 
+ */ + age_stamp = ACCESS_ONCE(rq->age_stamp); + avg = ACCESS_ONCE(rq->rt_avg); - if (unlikely(total < rq->rt_avg)) { + total = sched_avg_period() + (rq->clock - age_stamp); + + if (unlikely(total < avg)) { /* Ensures that power won't end up being negative */ available = 0; } else { - available = total - rq->rt_avg; + available = total - avg; } if (unlikely((s64)total < SCHED_POWER_SCALE)) From 29baa7478ba47d746e3625c91d3b2afbf46b4312 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 23 Apr 2012 12:11:21 +0200 Subject: [PATCH 039/178] sched: Move nr_cpus_allowed out of 'struct sched_rt_entity' Since nr_cpus_allowed is used outside of sched/rt.c and wants to be used outside of there more, move it to a more natural site. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-kr61f02y9brwzkh6x53pdptm@git.kernel.org Signed-off-by: Ingo Molnar --- arch/blackfin/kernel/process.c | 2 +- include/linux/init_task.h | 2 +- include/linux/sched.h | 2 +- kernel/sched/core.c | 2 +- kernel/sched/fair.c | 2 +- kernel/sched/rt.c | 36 ++++++++++++++++++++-------------- 6 files changed, 26 insertions(+), 20 deletions(-) diff --git a/arch/blackfin/kernel/process.c b/arch/blackfin/kernel/process.c index 2e3994b20169..62bcea7dcc6d 100644 --- a/arch/blackfin/kernel/process.c +++ b/arch/blackfin/kernel/process.c @@ -173,7 +173,7 @@ asmlinkage int bfin_clone(struct pt_regs *regs) unsigned long newsp; #ifdef __ARCH_SYNC_CORE_DCACHE - if (current->rt.nr_cpus_allowed == num_possible_cpus()) + if (current->nr_cpus_allowed == num_possible_cpus()) set_cpus_allowed_ptr(current, cpumask_of(smp_processor_id())); #endif diff --git a/include/linux/init_task.h b/include/linux/init_task.h index e4baff5f7ff4..9e65eff6af3b 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -149,6 +149,7 @@ extern struct cred init_cred; .normal_prio = MAX_PRIO-20, \ .policy = SCHED_NORMAL, \ .cpus_allowed = CPU_MASK_ALL, \ + .nr_cpus_allowed= NR_CPUS, \ .mm = NULL, \ .active_mm = &init_mm, \ .se = { \ @@ -157,7 +158,6 @@ extern struct cred init_cred; .rt = { \ .run_list = LIST_HEAD_INIT(tsk.rt.run_list), \ .time_slice = RR_TIMESLICE, \ - .nr_cpus_allowed = NR_CPUS, \ }, \ .tasks = LIST_HEAD_INIT(tsk.tasks), \ INIT_PUSHABLE_TASKS(tsk) \ diff --git a/include/linux/sched.h b/include/linux/sched.h index d61e5977e517..0f50e78f7f44 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1188,7 +1188,6 @@ struct sched_rt_entity { struct list_head run_list; unsigned long timeout; unsigned int time_slice; - int nr_cpus_allowed; struct sched_rt_entity *back; #ifdef CONFIG_RT_GROUP_SCHED @@ -1253,6 +1252,7 @@ struct task_struct { #endif unsigned int policy; + int nr_cpus_allowed; cpumask_t cpus_allowed; #ifdef CONFIG_PREEMPT_RCU diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3a69374fb427..70cc36a6073f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5015,7 +5015,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) p->sched_class->set_cpus_allowed(p, new_mask); cpumask_copy(&p->cpus_allowed, new_mask); - p->rt.nr_cpus_allowed = cpumask_weight(new_mask); + p->nr_cpus_allowed = cpumask_weight(new_mask); } /* diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2b449a762074..b2a2d236f27b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2703,7 +2703,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) int want_sd = 1; int sync = wake_flags & WF_SYNC; - if (p->rt.nr_cpus_allowed == 1) + if (p->nr_cpus_allowed 
== 1) return prev_cpu; if (sd_flag & SD_BALANCE_WAKE) { diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index c5565c3c515f..295da737b6fe 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -274,13 +274,16 @@ static void update_rt_migration(struct rt_rq *rt_rq) static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { + struct task_struct *p; + if (!rt_entity_is_task(rt_se)) return; + p = rt_task_of(rt_se); rt_rq = &rq_of_rt_rq(rt_rq)->rt; rt_rq->rt_nr_total++; - if (rt_se->nr_cpus_allowed > 1) + if (p->nr_cpus_allowed > 1) rt_rq->rt_nr_migratory++; update_rt_migration(rt_rq); @@ -288,13 +291,16 @@ static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { + struct task_struct *p; + if (!rt_entity_is_task(rt_se)) return; + p = rt_task_of(rt_se); rt_rq = &rq_of_rt_rq(rt_rq)->rt; rt_rq->rt_nr_total--; - if (rt_se->nr_cpus_allowed > 1) + if (p->nr_cpus_allowed > 1) rt_rq->rt_nr_migratory--; update_rt_migration(rt_rq); @@ -1161,7 +1167,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD); - if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) + if (!task_current(rq, p) && p->nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); inc_nr_running(rq); @@ -1225,7 +1231,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) cpu = task_cpu(p); - if (p->rt.nr_cpus_allowed == 1) + if (p->nr_cpus_allowed == 1) goto out; /* For anything but wake ups, just return the task_cpu */ @@ -1260,9 +1266,9 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) * will have to sort it out. */ if (curr && unlikely(rt_task(curr)) && - (curr->rt.nr_cpus_allowed < 2 || + (curr->nr_cpus_allowed < 2 || curr->prio <= p->prio) && - (p->rt.nr_cpus_allowed > 1)) { + (p->nr_cpus_allowed > 1)) { int target = find_lowest_rq(p); if (target != -1) @@ -1276,10 +1282,10 @@ out: static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) { - if (rq->curr->rt.nr_cpus_allowed == 1) + if (rq->curr->nr_cpus_allowed == 1) return; - if (p->rt.nr_cpus_allowed != 1 + if (p->nr_cpus_allowed != 1 && cpupri_find(&rq->rd->cpupri, p, NULL)) return; @@ -1395,7 +1401,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) * The previous task needs to be made eligible for pushing * if it is still active */ - if (on_rt_rq(&p->rt) && p->rt.nr_cpus_allowed > 1) + if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); } @@ -1408,7 +1414,7 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) { if (!task_running(rq, p) && (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) && - (p->rt.nr_cpus_allowed > 1)) + (p->nr_cpus_allowed > 1)) return 1; return 0; } @@ -1464,7 +1470,7 @@ static int find_lowest_rq(struct task_struct *task) if (unlikely(!lowest_mask)) return -1; - if (task->rt.nr_cpus_allowed == 1) + if (task->nr_cpus_allowed == 1) return -1; /* No other targets possible */ if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) @@ -1586,7 +1592,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq) BUG_ON(rq->cpu != task_cpu(p)); BUG_ON(task_current(rq, p)); - BUG_ON(p->rt.nr_cpus_allowed <= 1); + BUG_ON(p->nr_cpus_allowed <= 1); BUG_ON(!p->on_rq); BUG_ON(!rt_task(p)); @@ -1793,9 +1799,9 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) if (!task_running(rq, p) && !test_tsk_need_resched(rq->curr) && 
has_pushable_tasks(rq) && - p->rt.nr_cpus_allowed > 1 && + p->nr_cpus_allowed > 1 && rt_task(rq->curr) && - (rq->curr->rt.nr_cpus_allowed < 2 || + (rq->curr->nr_cpus_allowed < 2 || rq->curr->prio <= p->prio)) push_rt_tasks(rq); } @@ -1817,7 +1823,7 @@ static void set_cpus_allowed_rt(struct task_struct *p, * Only update if the process changes its state from whether it * can migrate or not. */ - if ((p->rt.nr_cpus_allowed > 1) == (weight > 1)) + if ((p->nr_cpus_allowed > 1) == (weight > 1)) return; rq = task_rq(p); From 454c79999f7eaedcdf4c15c449e43902980cbdf5 Mon Sep 17 00:00:00 2001 From: Colin Cross Date: Wed, 16 May 2012 21:34:23 -0700 Subject: [PATCH 040/178] sched/rt: Fix SCHED_RR across cgroups task_tick_rt() has an optimization to only reschedule SCHED_RR tasks if they were the only element on their rq. However, with cgroups a SCHED_RR task could be the only element on its per-cgroup rq but still be competing with other SCHED_RR tasks in its parent's cgroup. In this case, the SCHED_RR task in the child cgroup would never yield at the end of its timeslice. If the child cgroup rt_runtime_us was the same as the parent cgroup rt_runtime_us, the task in the parent cgroup would starve completely. Modify task_tick_rt() to check that the task is the only task on its rq, and that the each of the scheduling entities of its ancestors is also the only entity on its rq. Signed-off-by: Colin Cross Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1337229266-15798-1-git-send-email-ccross@android.com Signed-off-by: Ingo Molnar --- kernel/sched/rt.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 295da737b6fe..2a4e8dffbd6b 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1985,6 +1985,8 @@ static void watchdog(struct rq *rq, struct task_struct *p) static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) { + struct sched_rt_entity *rt_se = &p->rt; + update_curr_rt(rq); watchdog(rq, p); @@ -2002,12 +2004,15 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) p->rt.time_slice = RR_TIMESLICE; /* - * Requeue to the end of queue if we are not the only element - * on the queue: + * Requeue to the end of queue if we (and all of our ancestors) are the + * only element on the queue */ - if (p->rt.run_list.prev != p->rt.run_list.next) { - requeue_task_rt(rq, p, 0); - set_tsk_need_resched(p); + for_each_sched_rt_entity(rt_se) { + if (rt_se->run_list.prev != rt_se->run_list.next) { + requeue_task_rt(rq, p, 0); + set_tsk_need_resched(p); + return; + } } } From 1292531f6f27af909e713671dd9cc3bcab8114b7 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Fri, 25 May 2012 15:41:54 +0900 Subject: [PATCH 041/178] sched: Make sched_feat_names const The strings sched_feat_names are never changed. 
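For reference, both const qualifiers matter: the array of pointers and the strings they point at each become read-only. A minimal illustration (the two entries are merely sample feature names):

    static const char * const names[] = {
            "GENTLE_FAIR_SLEEPERS",
            "START_DEBIT",
    };

    /* names[0] = "other";  -- rejected: the pointer array is const */
    /* names[0][0] = 'g';   -- rejected: the strings are const      */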
Signed-off-by: Hiroshi Shimamoto Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/4FBF29B2.9030904@ct.jp.nec.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 70cc36a6073f..c1679a098fc7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -142,7 +142,7 @@ const_debug unsigned int sysctl_sched_features = #define SCHED_FEAT(name, enabled) \ #name , -static __read_mostly char *sched_feat_names[] = { +static const char * const sched_feat_names[] = { #include "features.h" NULL }; From 7997a456ef841bb78eb6f881d7cc2c17c2f9b35e Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Fri, 25 May 2012 15:42:47 +0900 Subject: [PATCH 042/178] sched: Remove the last NULL entry from sched_feat_names No need to have the last NULL entry. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/4FBF29E7.5020805@ct.jp.nec.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c1679a098fc7..94d598ac5e64 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -144,7 +144,6 @@ const_debug unsigned int sysctl_sched_features = static const char * const sched_feat_names[] = { #include "features.h" - NULL }; #undef SCHED_FEAT From 6a4c96eef42f835734a82c6b512abf9881b7c55d Mon Sep 17 00:00:00 2001 From: Kamalesh Babulal Date: Wed, 23 May 2012 14:44:11 +0530 Subject: [PATCH 043/178] sched: Remove NULL assignment of dattr_cur Remove explicit NULL assignment of static pointer dattr_cur from init_sched_domains(). Signed-off-by: Kamalesh Babulal Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120523091411.GG5005@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 94d598ac5e64..c46958e26121 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6726,7 +6726,6 @@ static int init_sched_domains(const struct cpumask *cpu_map) if (!doms_cur) doms_cur = &fallback_doms; cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); - dattr_cur = NULL; err = build_sched_domains(doms_cur[0], NULL); register_sched_domain_sysctl(); From 91557e847dbc715acf2d847a1ffec63f71b00b65 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 30 May 2012 12:25:52 -0300 Subject: [PATCH 044/178] perf report: Use the right symbol for annotation In non symbolic views, i.e. --sort without "symbol", as in: perf report --sort comm We're segfaulting in the --tui because we're testing the symbol resolved and then trying to use the symbol on the histogram entry where we're coalescing all hits for a COMM, and the first hist_entry for a comm may have a NULL symbol, i.e. the RIP didn't resolve to any symbol. In this case we're segfaulting, fix it by testing against the symbol in the histogram entry. 
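To see how the first entry for a comm can lack a symbol, consider two samples coalescing under --sort comm (addresses hypothetical):

    /*
     * sample 1: comm "gcc", ip 0x1234 -> no symbol (al->sym == NULL)
     * sample 2: comm "gcc", ip 0x5678 -> resolves to main()
     *
     * Both hits land in the single hist_entry keyed by "gcc".  If the
     * entry was created from sample 1, he->ms.sym stays NULL even when
     * a later sample's al->sym is valid, so the guard must test
     * he->ms.sym, the pointer annotation actually dereferences.
     */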
Reported-by: William Cohen Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Mike Galbraith Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-8ylwubbcmu27ucc9ffrku3yv@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-report.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 8c767c6bca91..2400e009f149 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -162,7 +162,7 @@ static int perf_evsel__add_hist_entry(struct perf_evsel *evsel, * so we don't allocated the extra space needed because the stdio * code will not use it. */ - if (al->sym != NULL && use_browser > 0) { + if (he->ms.sym != NULL && use_browser > 0) { struct annotation *notes = symbol__annotation(he->ms.sym); assert(evsel != NULL); From 107baecaca0b2843f1a0464701b253e51ef6f0e2 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 30 May 2012 12:31:44 -0300 Subject: [PATCH 045/178] perf annotate browser: Fix help window entry for navigating to hottest line Its 'H', not 'h'. The later is for getting to the help window. Reported-by: Ingo Molnar Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Mike Galbraith Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-7zvwphhm815y2zczoxgstzuf@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/ui/browsers/annotate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c index 4deea6aaf927..34b1c46eaf42 100644 --- a/tools/perf/ui/browsers/annotate.c +++ b/tools/perf/ui/browsers/annotate.c @@ -668,7 +668,7 @@ static int annotate_browser__run(struct annotate_browser *browser, int evidx, "q/ESC/CTRL+C Exit\n\n" "-> Go to target\n" "<- Exit\n" - "h Cycle thru hottest instructions\n" + "H Cycle thru hottest instructions\n" "j Toggle showing jump to target arrows\n" "J Toggle showing number of jump sources on targets\n" "n Search next string\n" From 79695e1bb65ba0e21488c360a1bed6e358354aaa Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 30 May 2012 13:53:54 -0300 Subject: [PATCH 046/178] perf stat: Initialize default events wrt exclude_{guest,host} When no event is specified the tools use perf_evlist__add_default(), that will call event_attr_init to initialize the KVM exclusion bits. When the change was made to the tools so that by default guest samples would be excluded, the changes were made just to the parsing routines and to perf_evlist__add_default(), not to perf_evlist__add_attrs, that is used so far just by perf stat to add multiple events, according to the level of detail specified. Recently the tools were changed to reconstruct the event name from all the details in perf_event_attr, not just from .type and .config, but taking into account all the feature bits (.exclude_{guest,host,user,kernel,etc}, .precise_ip, etc). That is when we noticed that the default for perf stat wasn't the one for the rest of the tools, i.e. the .exclude_guest bit wasn't being set. I.e. 
the default, that doesn't call event_attr_init was showing the :HG modifier: $ perf stat usleep 1 Performance counter stats for 'usleep 1': 0.942119 task-clock # 0.454 CPUs utilized 1 context-switches # 0.001 M/sec 0 CPU-migrations # 0.000 K/sec 126 page-faults # 0.134 M/sec 693,193 cycles:HG # 0.736 GHz [40.11%] 407,461 stalled-cycles-frontend:HG # 58.78% frontend cycles idle [72.29%] 365,403 stalled-cycles-backend:HG # 52.71% backend cycles idle 465,982 instructions:HG # 0.67 insns per cycle # 0.87 stalled cycles per insn 89,760 branches:HG # 95.275 M/sec 6,178 branch-misses:HG # 6.88% of all branches 0.002077228 seconds time elapsed While if one explicitely specifies the same events, which will make the parsing code to be called and thus event_attr_init is called: $ perf stat -e task-clock,context-switches,migrations,page-faults,cycles,stalled-cycles-frontend,stalled-cycles-backend,instructions,branches,branch-misses usleep 1 Performance counter stats for 'usleep 1': 1.040349 task-clock # 0.500 CPUs utilized 2 context-switches # 0.002 M/sec 0 CPU-migrations # 0.000 K/sec 127 page-faults # 0.122 M/sec 587,966 cycles # 0.565 GHz [13.18%] 459,167 stalled-cycles-frontend # 78.09% frontend cycles idle 390,249 stalled-cycles-backend # 66.37% backend cycles idle 504,006 instructions # 0.86 insns per cycle # 0.91 stalled cycles per insn 96,455 branches # 92.714 M/sec 6,522 branch-misses # 6.76% of all branches [96.12%] 0.002078681 seconds time elapsed Fix it by introducing a perf_evlist__add_default_attrs method that will call evlist_attr_init in all the perf_event_attr entries before adding the events. Reported-by: Ingo Molnar Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Mike Galbraith Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-4eysr236r0pgiyum9epwxw7s@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 8 ++++---- tools/perf/util/evlist.c | 11 +++++++++++ tools/perf/util/evlist.h | 4 ++++ 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 62ae30d34fa6..262589991ea4 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -1129,7 +1129,7 @@ static int add_default_attributes(void) return 0; if (!evsel_list->nr_entries) { - if (perf_evlist__add_attrs_array(evsel_list, default_attrs) < 0) + if (perf_evlist__add_default_attrs(evsel_list, default_attrs) < 0) return -1; } @@ -1139,21 +1139,21 @@ static int add_default_attributes(void) return 0; /* Append detailed run extra attributes: */ - if (perf_evlist__add_attrs_array(evsel_list, detailed_attrs) < 0) + if (perf_evlist__add_default_attrs(evsel_list, detailed_attrs) < 0) return -1; if (detailed_run < 2) return 0; /* Append very detailed run extra attributes: */ - if (perf_evlist__add_attrs_array(evsel_list, very_detailed_attrs) < 0) + if (perf_evlist__add_default_attrs(evsel_list, very_detailed_attrs) < 0) return -1; if (detailed_run < 3) return 0; /* Append very, very detailed run extra attributes: */ - return perf_evlist__add_attrs_array(evsel_list, very_very_detailed_attrs); + return perf_evlist__add_default_attrs(evsel_list, very_very_detailed_attrs); } int cmd_stat(int argc, const char **argv, const char *prefix __used) diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 4ac5f5ae4ce9..ed277e5627cf 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -159,6 +159,17 @@ out_delete_partial_list: return -1; } 
+int __perf_evlist__add_default_attrs(struct perf_evlist *evlist, + struct perf_event_attr *attrs, size_t nr_attrs) +{ + size_t i; + + for (i = 0; i < nr_attrs; i++) + event_attr_init(attrs + i); + + return perf_evlist__add_attrs(evlist, attrs, nr_attrs); +} + static int trace_event__id(const char *evname) { char *filename, *colon; diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index 58abb63ac13a..989bee9624c2 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -54,6 +54,8 @@ void perf_evlist__add(struct perf_evlist *evlist, struct perf_evsel *entry); int perf_evlist__add_default(struct perf_evlist *evlist); int perf_evlist__add_attrs(struct perf_evlist *evlist, struct perf_event_attr *attrs, size_t nr_attrs); +int __perf_evlist__add_default_attrs(struct perf_evlist *evlist, + struct perf_event_attr *attrs, size_t nr_attrs); int perf_evlist__add_tracepoints(struct perf_evlist *evlist, const char *tracepoints[], size_t nr_tracepoints); int perf_evlist__set_tracepoints_handlers(struct perf_evlist *evlist, @@ -62,6 +64,8 @@ int perf_evlist__set_tracepoints_handlers(struct perf_evlist *evlist, #define perf_evlist__add_attrs_array(evlist, array) \ perf_evlist__add_attrs(evlist, array, ARRAY_SIZE(array)) +#define perf_evlist__add_default_attrs(evlist, array) \ + __perf_evlist__add_default_attrs(evlist, array, ARRAY_SIZE(array)) #define perf_evlist__add_tracepoints_array(evlist, array) \ perf_evlist__add_tracepoints(evlist, array, ARRAY_SIZE(array)) From 52deff71bc2b2c24587ab71f588ff5e4c9279349 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 29 May 2012 22:58:26 -0600 Subject: [PATCH 047/178] perf script: Fix regression in callchain dso name $ perf script -i /tmp/perf.data ... gcc 13623 544315.062858: context-switches: ffffffff815f65c9 __schedule ([kernel.kallsyms]) ffffffff81087cea __cond_resched ([kernel.kallsyms]) ffffffff815f6b92 _cond_resched ([kernel.kallsyms]) ffffffff815fb87a do_page_fault ([kernel.kallsyms]) ffffffff815f8465 page_fault ([kernel.kallsyms]) 2b7a71ea0303 _dl_lookup_symbol_x ([kernel.kallsyms]) 2b7a71ea1eb5 _dl_relocate_object ([kernel.kallsyms]) 2b7a71e99b2e dl_main ([kernel.kallsyms]) 2b7a71eab7f4 _dl_sysdep_start ([kernel.kallsyms]) All DSO's in a callchain are printed as [kernel.kallsyms]. git bisect chased it to: 547a92e0aedb88129e7fbd804697a11949de2e5a is the first bad commit commit 547a92e0aedb88129e7fbd804697a11949de2e5a Author: Akihiro Nagai Date: Mon Jan 30 13:42:57 2012 +0900 perf script: Unify the expressions indicating "unknown" The perf script command uses various expressions to indicate "unknown". It is unfriendly for user scripts to parse it. So, this patch unifies the expressions to "[unknown]". Looks like a copy-paste in that the other references use al.map but this one should be node->map. With this patch you get: $ perf script -i /tmp/perf.data ... 
gcc 13623 544315.062858: context-switches: ffffffff815f65c9 __schedule ([kernel.kallsyms]) ffffffff81087cea __cond_resched ([kernel.kallsyms]) ffffffff815f6b92 _cond_resched ([kernel.kallsyms]) ffffffff815fb87a do_page_fault ([kernel.kallsyms]) ffffffff815f8465 page_fault ([kernel.kallsyms]) 2b7a71ea0303 _dl_lookup_symbol_x (/lib64/ld-2.14.90.so) 2b7a71ea1eb5 _dl_relocate_object (/lib64/ld-2.14.90.so) 2b7a71e99b2e dl_main (/lib64/ld-2.14.90.so) 2b7a71eab7f4 _dl_sysdep_start (/lib64/ld-2.14.90.so) Signed-off-by: David Ahern Cc: Akihiro Nagai Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1338353906-60706-1-git-send-email-dsahern@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/session.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 93d355d27109..48206144758e 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -1460,7 +1460,7 @@ void perf_event__print_ip(union perf_event *event, struct perf_sample *sample, } if (print_dso) { printf(" ("); - map__fprintf_dsoname(al.map, stdout); + map__fprintf_dsoname(node->map, stdout); printf(")"); } printf("\n"); From f1439c315b328bbb3f7f6fdd814a7057f9e130d5 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 30 May 2012 15:02:42 -0300 Subject: [PATCH 048/178] perf tools: Fix make tarballs The patch series that introduced the top level tools/ makefile and the libtraceevent broke this feature where files needed to build in a detached tarball were not included in the MANIFEST file and thus not included in the tarball. Fix it by adding the relevant files to the MANIFEST. Cc: Borislav Petkov Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Mike Galbraith Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Steven Rostedt Link: http://lkml.kernel.org/n/tip-z3mjj74927xvqwhlmu18kj80@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/MANIFEST | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/perf/MANIFEST b/tools/perf/MANIFEST index 5476bc0a1eac..b4b572e8c100 100644 --- a/tools/perf/MANIFEST +++ b/tools/perf/MANIFEST @@ -1,4 +1,6 @@ tools/perf +tools/scripts +tools/lib/traceevent include/linux/const.h include/linux/perf_event.h include/linux/rbtree.h From ea1b3ebac9a1c72c4362c784b4ed069a938a4ddb Mon Sep 17 00:00:00 2001 From: Avik Sil Date: Tue, 29 May 2012 16:05:25 +0530 Subject: [PATCH 049/178] perf tools: Fix pager on minimal-install embedded systems Some Distributions may lack "less" package being included by default, e.g., Linaro nano rootfs. In those cases use the portable "pager" command instead of "less". 
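The resulting selection order, as a standalone sketch (simplified; the real pager.c consults a perf-specific setting first and special-cases an empty value or "cat", as visible in the hunk below):

    #include <stdlib.h>
    #include <unistd.h>

    static const char *select_pager(void)
    {
            const char *pager = getenv("PAGER");

            if (!pager && !access("/usr/bin/pager", X_OK))
                    pager = "/usr/bin/pager"; /* Debian alternatives link */
            if (!pager)
                    pager = "less";
            return pager;
    }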
Signed-off-by: Avik Sil Acked-by: Ingo Molnar Cc: Paul Mackerras Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1338287725-26382-1-git-send-email-avik.sil@linaro.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/pager.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/perf/util/pager.c b/tools/perf/util/pager.c index 1915de20dcac..3322b8446e89 100644 --- a/tools/perf/util/pager.c +++ b/tools/perf/util/pager.c @@ -57,6 +57,10 @@ void setup_pager(void) } if (!pager) pager = getenv("PAGER"); + if (!pager) { + if (!access("/usr/bin/pager", X_OK)) + pager = "/usr/bin/pager"; + } if (!pager) pager = "less"; else if (!*pager || !strcmp(pager, "cat")) From 5f2a3d6191e49df5d56332d3b65d6636c355f635 Mon Sep 17 00:00:00 2001 From: Stefan Richter Date: Thu, 24 May 2012 22:07:35 +0200 Subject: [PATCH 050/178] sbp-target: rename a variable to avoid name clash 'int login_id' shadows 'static atomic_t login_id'. Seen as compilation warning on x86-32. Signed-off-by: Stefan Richter Acked-by: Chris Boot Signed-off-by: Nicholas Bellinger --- drivers/target/sbp/sbp_target.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/target/sbp/sbp_target.c b/drivers/target/sbp/sbp_target.c index 37c609898f84..7e6136e2ce81 100644 --- a/drivers/target/sbp/sbp_target.c +++ b/drivers/target/sbp/sbp_target.c @@ -587,14 +587,14 @@ static void sbp_management_request_logout( { struct sbp_tport *tport = agent->tport; struct sbp_tpg *tpg = tport->tpg; - int login_id; + int id; struct sbp_login_descriptor *login; - login_id = LOGOUT_ORB_LOGIN_ID(be32_to_cpu(req->orb.misc)); + id = LOGOUT_ORB_LOGIN_ID(be32_to_cpu(req->orb.misc)); - login = sbp_login_find_by_id(tpg, login_id); + login = sbp_login_find_by_id(tpg, id); if (!login) { - pr_warn("cannot find login: %d\n", login_id); + pr_warn("cannot find login: %d\n", id); req->status.status = cpu_to_be32( STATUS_BLOCK_RESP(STATUS_RESP_REQUEST_COMPLETE) | From aba336bd1d46d6b0404b06f6915ed76150739057 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 31 May 2012 15:39:11 +1000 Subject: [PATCH 051/178] md: raid1/raid10: fix problem with merge_bvec_fn The new merge_bvec_fn which calls the corresponding function in subsidiary devices requires that mddev->merge_check_needed be set if any child has a merge_bvec_fn. However were were only setting that when a device was hot-added, not when a device was present from the start. This bug was introduced in 3.4 so patch is suitable for 3.4.y kernels. However that are conflicts in raid10.c so a separate patch will be needed for 3.4.y. 
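Condensed, the fix makes the assembly-time loop mirror what the hot-add path already did (full hunks below):

    rdev_for_each(rdev, mddev) {
            struct request_queue *q = bdev_get_queue(rdev->bdev);

            if (q->merge_bvec_fn)
                    mddev->merge_check_needed = 1;
            /* ... existing per-device setup ... */
    }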
Cc: stable@vger.kernel.org Reported-by: Sebastian Riemer Signed-off-by: NeilBrown --- drivers/md/raid1.c | 4 ++++ drivers/md/raid10.c | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 835de7168cd3..a9c7981ddd24 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2550,6 +2550,7 @@ static struct r1conf *setup_conf(struct mddev *mddev) err = -EINVAL; spin_lock_init(&conf->device_lock); rdev_for_each(rdev, mddev) { + struct request_queue *q; int disk_idx = rdev->raid_disk; if (disk_idx >= mddev->raid_disks || disk_idx < 0) @@ -2562,6 +2563,9 @@ static struct r1conf *setup_conf(struct mddev *mddev) if (disk->rdev) goto abort; disk->rdev = rdev; + q = bdev_get_queue(rdev->bdev); + if (q->merge_bvec_fn) + mddev->merge_check_needed = 1; disk->head_position = 0; } diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 987db37cb875..99ae6068e456 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -3475,6 +3475,7 @@ static int run(struct mddev *mddev) rdev_for_each(rdev, mddev) { long long diff; + struct request_queue *q; disk_idx = rdev->raid_disk; if (disk_idx < 0) @@ -3493,6 +3494,9 @@ static int run(struct mddev *mddev) goto out_free_conf; disk->rdev = rdev; } + q = bdev_get_queue(rdev->bdev); + if (q->merge_bvec_fn) + mddev->merge_check_needed = 1; diff = (rdev->new_data_offset - rdev->data_offset); if (!mddev->reshape_backwards) diff = -diff; From 9e612a008fa7fe493a473454def56aa321479495 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 31 May 2012 13:08:53 +0100 Subject: [PATCH 052/178] drm/i915/crt: Do not rely upon the HPD presence pin Whilst most monitors do wire up the HPD presence pin, it seems quite a few KVMs do not. Therefore if we simply rely on the HPD pin being asserted to indicate a connected monitor we fail miserably, so fall back to performing a DDC query for the EDID. Reported-and-tested-by: Matthieu LAVIE Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=50501 Signed-off-by: Chris Wilson Signed-off-by: Daniel Vetter --- drivers/gpu/drm/i915/intel_crt.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_crt.c b/drivers/gpu/drm/i915/intel_crt.c index 75a70c46ef1b..844e93e1e395 100644 --- a/drivers/gpu/drm/i915/intel_crt.c +++ b/drivers/gpu/drm/i915/intel_crt.c @@ -453,13 +453,15 @@ intel_crt_detect(struct drm_connector *connector, bool force) struct intel_load_detect_pipe tmp; if (I915_HAS_HOTPLUG(dev)) { + /* We can not rely on the HPD pin always being correctly wired + * up, for example many KVM do not pass it through, and so + * only trust an assertion that the monitor is connected. + */ if (intel_crt_detect_hotplug(connector)) { DRM_DEBUG_KMS("CRT detected via hotplug\n"); return connector_status_connected; - } else { + } else DRM_DEBUG_KMS("CRT not detected via hotplug\n"); - return connector_status_disconnected; - } } if (intel_crt_detect_ddc(connector)) From 472606458f3e1ced5fe3cc5f04e90a6b5a4732cf Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 31 May 2012 14:43:26 +0900 Subject: [PATCH 053/178] perf callchain: Make callchain cursors TLS perf top -G has a race on the callchain cursor between the main thread and the display thread. Since the callchain cursors are only used locally, making them thread-local data solves the problem.
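For illustration, a minimal sketch (not part of the patch) of the __thread storage class used here: each thread gets its own instance of the variable, so concurrent users cannot race on shared cursor state. The pthread harness is an assumption for demonstration:

    #include <pthread.h>
    #include <stdio.h>

    static __thread int cursor_depth;            /* one instance per thread */

    static void *worker(void *arg)
    {
            cursor_depth = (int)(long)arg;       /* no lock needed: thread-local */
            printf("thread %ld sees depth %d\n", (long)arg, cursor_depth);
            return NULL;
    }

    int main(void)
    {
            pthread_t a, b;

            pthread_create(&a, NULL, worker, (void *)1L);
            pthread_create(&b, NULL, worker, (void *)2L);
            pthread_join(a, NULL);
            pthread_join(b, NULL);
            return 0;
    }

Build with -pthread; each thread prints the depth it wrote, untouched by the other.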
Signed-off-by: Namhyung Kim Reported-by: Sunjin Yang Suggested-by: Arnaldo Carvalho de Melo Cc: Ingo Molnar Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Sunjin Yang Link: http://lkml.kernel.org/r/1338443007-24857-1-git-send-email-namhyung.kim@lge.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-report.c | 2 +- tools/perf/builtin-top.c | 2 +- tools/perf/util/callchain.c | 2 ++ tools/perf/util/callchain.h | 2 ++ tools/perf/util/hist.c | 7 ++++--- tools/perf/util/hist.h | 2 -- tools/perf/util/session.c | 14 +++++++------- 7 files changed, 17 insertions(+), 14 deletions(-) diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 2400e009f149..25249f76329d 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -152,7 +152,7 @@ static int perf_evsel__add_hist_entry(struct perf_evsel *evsel, if (symbol_conf.use_callchain) { err = callchain_append(he->callchain, - &evsel->hists.callchain_cursor, + &callchain_cursor, sample->period); if (err) return err; diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 871b540293e1..6bb0277b7dfe 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -787,7 +787,7 @@ static void perf_event__process_sample(struct perf_tool *tool, } if (symbol_conf.use_callchain) { - err = callchain_append(he->callchain, &evsel->hists.callchain_cursor, + err = callchain_append(he->callchain, &callchain_cursor, sample->period); if (err) return; diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c index 9f7106a8d9a4..3a6bff47614f 100644 --- a/tools/perf/util/callchain.c +++ b/tools/perf/util/callchain.c @@ -18,6 +18,8 @@ #include "util.h" #include "callchain.h" +__thread struct callchain_cursor callchain_cursor; + bool ip_callchain__valid(struct ip_callchain *chain, const union perf_event *event) { diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h index 7f9c0f1ae3a9..3bdb407f9cd9 100644 --- a/tools/perf/util/callchain.h +++ b/tools/perf/util/callchain.h @@ -76,6 +76,8 @@ struct callchain_cursor { struct callchain_cursor_node *curr; }; +extern __thread struct callchain_cursor callchain_cursor; + static inline void callchain_init(struct callchain_root *root) { INIT_LIST_HEAD(&root->node.siblings); diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 1293b5ebea4d..514e2a4b367d 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -378,7 +378,7 @@ void hist_entry__free(struct hist_entry *he) * collapse the histogram */ -static bool hists__collapse_insert_entry(struct hists *hists, +static bool hists__collapse_insert_entry(struct hists *hists __used, struct rb_root *root, struct hist_entry *he) { @@ -397,8 +397,9 @@ static bool hists__collapse_insert_entry(struct hists *hists, iter->period += he->period; iter->nr_events += he->nr_events; if (symbol_conf.use_callchain) { - callchain_cursor_reset(&hists->callchain_cursor); - callchain_merge(&hists->callchain_cursor, iter->callchain, + callchain_cursor_reset(&callchain_cursor); + callchain_merge(&callchain_cursor, + iter->callchain, he->callchain); } hist_entry__free(he); diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index cfc64e293f90..34bb556d6219 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -67,8 +67,6 @@ struct hists { struct events_stats stats; u64 event_stream; u16 col_len[HISTC_NR_COLS]; - /* Best would be to reuse the session callchain cursor */ - struct callchain_cursor callchain_cursor; }; struct hist_entry 
*__hists__add_entry(struct hists *self, diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 48206144758e..3b6f8e460a31 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -288,7 +288,8 @@ struct branch_info *machine__resolve_bstack(struct machine *self, return bi; } -int machine__resolve_callchain(struct machine *self, struct perf_evsel *evsel, +int machine__resolve_callchain(struct machine *self, + struct perf_evsel *evsel __used, struct thread *thread, struct ip_callchain *chain, struct symbol **parent) @@ -297,7 +298,7 @@ int machine__resolve_callchain(struct machine *self, struct perf_evsel *evsel, unsigned int i; int err; - callchain_cursor_reset(&evsel->hists.callchain_cursor); + callchain_cursor_reset(&callchain_cursor); for (i = 0; i < chain->nr; i++) { u64 ip; @@ -333,7 +334,7 @@ int machine__resolve_callchain(struct machine *self, struct perf_evsel *evsel, break; } - err = callchain_cursor_append(&evsel->hists.callchain_cursor, + err = callchain_cursor_append(&callchain_cursor, ip, al.map, al.sym); if (err) return err; @@ -1428,7 +1429,6 @@ void perf_event__print_ip(union perf_event *event, struct perf_sample *sample, int print_sym, int print_dso, int print_symoffset) { struct addr_location al; - struct callchain_cursor *cursor = &evsel->hists.callchain_cursor; struct callchain_cursor_node *node; if (perf_event__preprocess_sample(event, machine, &al, sample, @@ -1446,10 +1446,10 @@ void perf_event__print_ip(union perf_event *event, struct perf_sample *sample, error("Failed to resolve callchain. Skipping\n"); return; } - callchain_cursor_commit(cursor); + callchain_cursor_commit(&callchain_cursor); while (1) { - node = callchain_cursor_current(cursor); + node = callchain_cursor_current(&callchain_cursor); if (!node) break; @@ -1465,7 +1465,7 @@ void perf_event__print_ip(union perf_event *event, struct perf_sample *sample, } printf("\n"); - callchain_cursor_advance(cursor); + callchain_cursor_advance(&callchain_cursor); } } else { From 114067b69e7b2c691faace0e33db2f04096f668d Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 31 May 2012 14:43:27 +0900 Subject: [PATCH 054/178] perf tools: Check if callchain is corrupted We faced a segmentation fault on perf top -G at a very high sampling rate due to a corrupted callchain. While the root cause was not revealed (I failed to figure it out), this patch tries to protect us from the segfault in such cases.
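For illustration only, a runnable sketch of the defensive pattern the patch applies: validate an untrusted length field against a hard bound before iterating. The struct and function names below are stand-ins for the perf ones:

    #include <stdio.h>

    #define MAX_STACK_DEPTH 255            /* mirrors PERF_MAX_STACK_DEPTH */

    struct chain { unsigned long nr; unsigned long ip[MAX_STACK_DEPTH]; };

    /* reject a corrupted chain instead of walking past the array */
    static int resolve(const struct chain *c)
    {
            unsigned long i;

            if (c->nr > MAX_STACK_DEPTH) {
                    fprintf(stderr, "corrupted callchain. skipping...\n");
                    return 0;              /* skip the sample, do not crash */
            }
            for (i = 0; i < c->nr; i++)
                    printf("ip[%lu] = %#lx\n", i, c->ip[i]);
            return 0;
    }

    int main(void)
    {
            struct chain good = { 2, { 0x1234, 0x5678 } };
            struct chain bad = { 1000000 };   /* nr too large: rejected */

            resolve(&good);
            resolve(&bad);
            return 0;
    }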
Reported-by: Arnaldo Carvalho de Melo Signed-off-by: Namhyung Kim Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Sunjin Yang Link: http://lkml.kernel.org/r/1338443007-24857-2-git-send-email-namhyung.kim@lge.com Signed-off-by: Arnaldo Carvalho de Melo --- include/linux/perf_event.h | 4 ++-- tools/perf/util/session.c | 14 +++++++++++++- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index f32578634d9d..1817d4015e5f 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -555,6 +555,8 @@ enum perf_event_type { PERF_RECORD_MAX, /* non-ABI */ }; +#define PERF_MAX_STACK_DEPTH 255 + enum perf_callchain_context { PERF_CONTEXT_HV = (__u64)-32, PERF_CONTEXT_KERNEL = (__u64)-128, @@ -609,8 +611,6 @@ struct perf_guest_info_callbacks { #include #include -#define PERF_MAX_STACK_DEPTH 255 - struct perf_callchain_entry { __u64 nr; __u64 ip[PERF_MAX_STACK_DEPTH]; diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 3b6f8e460a31..04d1e33f4592 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -300,6 +300,11 @@ int machine__resolve_callchain(struct machine *self, callchain_cursor_reset(&callchain_cursor); + if (chain->nr > PERF_MAX_STACK_DEPTH) { + pr_warning("corrupted callchain. skipping...\n"); + return 0; + } + for (i = 0; i < chain->nr; i++) { u64 ip; struct addr_location al; @@ -318,7 +323,14 @@ int machine__resolve_callchain(struct machine *self, case PERF_CONTEXT_USER: cpumode = PERF_RECORD_MISC_USER; break; default: - break; + pr_debug("invalid callchain context: " + "%"PRId64"\n", (s64) ip); + /* + * It seems the callchain is corrupted. + * Discard all. + */ + callchain_cursor_reset(&callchain_cursor); + return 0; } continue; } From aa5cdd308ddbb08959534be408eba8ef040b5989 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 31 May 2012 11:17:34 -0300 Subject: [PATCH 055/178] perf tools: Make --version show kernel version instead of pull req tag Before: $ perf --version perf version perf.urgent.for.mingo.5.g37da28 After: $ perf --version perf version 3.4.8941.g37da28.dirty Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Mike Galbraith Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-vc9b4e6023iegz9kabr3yvyv@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/PERF-VERSION-GEN | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/PERF-VERSION-GEN b/tools/perf/util/PERF-VERSION-GEN index ad73300f7bac..95264f304179 100755 --- a/tools/perf/util/PERF-VERSION-GEN +++ b/tools/perf/util/PERF-VERSION-GEN @@ -12,7 +12,7 @@ LF=' # First check if there is a .git to get the version from git describe # otherwise try to get the version from the kernel makefile if test -d ../../.git -o -f ../../.git && - VN=$(git describe --abbrev=4 HEAD 2>/dev/null) && + VN=$(git describe --match 'v[0-9].[0-9]*' --abbrev=4 HEAD 2>/dev/null) && case "$VN" in *$LF*) (exit 1) ;; v[0-9]*) From a59e64a13a927fb7530bef39e9f5e7de8268137e Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 31 May 2012 14:51:45 +0900 Subject: [PATCH 056/178] perf tools: Update ioctl documentation for PERF_IOC_FLAG_GROUP The ioctl interface of perf event fd receives 3 arguments to control event group behavior but it lacked documentation. 
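For illustration, a hedged sketch of the three-argument form being documented; the fds are assumed to come from perf_event_open(2) and error handling is omitted:

    #include <sys/ioctl.h>
    #include <linux/perf_event.h>

    /* enable or disable a single counter: the flag argument is 0 */
    static int counter_enable(int fd)
    {
            return ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
    }

    /* enable a whole group at once via its leader's fd */
    static int group_enable(int group_leader_fd)
    {
            return ioctl(group_leader_fd, PERF_EVENT_IOC_ENABLE,
                         PERF_IOC_FLAG_GROUP);
    }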
Signed-off-by: Namhyung Kim Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1338443506-25009-2-git-send-email-namhyung.kim@lge.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/design.txt | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tools/perf/design.txt b/tools/perf/design.txt index bd0bb1b1279b..67e5d0cace85 100644 --- a/tools/perf/design.txt +++ b/tools/perf/design.txt @@ -409,14 +409,15 @@ Counters can be enabled and disabled in two ways: via ioctl and via prctl. When a counter is disabled, it doesn't count or generate events but does continue to exist and maintain its count value. -An individual counter or counter group can be enabled with +An individual counter can be enabled with - ioctl(fd, PERF_EVENT_IOC_ENABLE); + ioctl(fd, PERF_EVENT_IOC_ENABLE, 0); or disabled with - ioctl(fd, PERF_EVENT_IOC_DISABLE); + ioctl(fd, PERF_EVENT_IOC_DISABLE, 0); +For a counter group, pass PERF_IOC_FLAG_GROUP as the third argument. Enabling or disabling the leader of a group enables or disables the whole group; that is, while the group leader is disabled, none of the counters in the group will count. Enabling or disabling a member of a From 55da80059de6c7533724fcd95f16c5d5618ecf4d Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 31 May 2012 14:51:46 +0900 Subject: [PATCH 057/178] perf evlist: Pass third argument to ioctl explicitly The ioctl on a perf event fd wants 3 arguments but we only passed 2. As the only user of the functions is perf record and it calls them for every event (regardless of group setting), just pass 0 for now. Signed-off-by: Namhyung Kim Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1338443506-25009-3-git-send-email-namhyung.kim@lge.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evlist.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index ed277e5627cf..7400fb3fc50c 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -274,7 +274,8 @@ void perf_evlist__disable(struct perf_evlist *evlist) for (cpu = 0; cpu < evlist->cpus->nr; cpu++) { list_for_each_entry(pos, &evlist->entries, node) { for (thread = 0; thread < evlist->threads->nr; thread++) - ioctl(FD(pos, cpu, thread), PERF_EVENT_IOC_DISABLE); + ioctl(FD(pos, cpu, thread), + PERF_EVENT_IOC_DISABLE, 0); } } } @@ -287,7 +288,8 @@ void perf_evlist__enable(struct perf_evlist *evlist) for (cpu = 0; cpu < evlist->cpus->nr; cpu++) { list_for_each_entry(pos, &evlist->entries, node) { for (thread = 0; thread < evlist->threads->nr; thread++) - ioctl(FD(pos, cpu, thread), PERF_EVENT_IOC_ENABLE); + ioctl(FD(pos, cpu, thread), + PERF_EVENT_IOC_ENABLE, 0); } } } From 8db4841fc72acc58254029f050226ea5f8103854 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 30 May 2012 14:23:42 +0200 Subject: [PATCH 058/178] perf symbols: Handle different endians properly during symbol load Currently we don't care about the file object's endianness. It's possible we read a buildid file object from a different architecture than the one we are currently running on. So we need to care about properly reading such an object's data - handle different endianness properly. Adding: a needs_swap DSO field; a dso__swap_init function to initialize DSO's needs_swap; a DSO__SWAP macro to read the data with proper swaps. Together with other endianity patches, this change fixes perf report discrepancies on origin and target systems as described in test 1 below, e.g.
following perf report diff: ... 0.12% ps [kernel.kallsyms] [k] clear_page - 0.12% awk bash [.] alloc_word_desc + 0.12% awk bash [.] yyparse 0.11% beah-rhts-task libpython2.6.so.1.0 [.] 0x5560e 0.10% perf libc-2.12.so [.] __ctype_toupper_loc - 0.09% rhts-test-runne bash [.] maybe_make_export_env + 0.09% rhts-test-runne bash [.] 0x385a0 0.09% ps [kernel.kallsyms] [k] page_fault ... Note, running following to test perf endianity handling: test 1) - origin system: # perf record -a -- sleep 10 (any perf record will do) # perf report > report.origin # perf archive perf.data - copy the perf.data, report.origin and perf.data.tar.bz2 to a target system and run: # tar xjvf perf.data.tar.bz2 -C ~/.debug # perf report > report.target # diff -u report.origin report.target - the diff should produce no output (besides some white space stuff and possibly different date/TZ output) test 2) - origin system: # perf record -ag -fo /tmp/perf.data -- sleep 1 - mount origin system root to the target system on /mnt/origin - target system: # perf script --symfs /mnt/origin -I -i /mnt/origin/tmp/perf.data \ --kallsyms /mnt/origin/proc/kallsyms - complete perf.data header is displayed Signed-off-by: Jiri Olsa Cc: Corey Ashford Cc: David Ahern Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Paul Mackerras Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1338380624-7443-2-git-send-email-jolsa@redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/symbol.c | 33 ++++++++++++++++++++++++++++++++- tools/perf/util/symbol.h | 30 ++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index e2ba8858f3e1..9d04dcd91815 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -323,6 +323,7 @@ struct dso *dso__new(const char *name) dso->sorted_by_name = 0; dso->has_build_id = 0; dso->kernel = DSO_TYPE_USER; + dso->needs_swap = DSO_SWAP__UNSET; INIT_LIST_HEAD(&dso->node); } @@ -1156,6 +1157,33 @@ static size_t elf_addr_to_index(Elf *elf, GElf_Addr addr) return -1; } +static int dso__swap_init(struct dso *dso, unsigned char eidata) +{ + static unsigned int const endian = 1; + + dso->needs_swap = DSO_SWAP__NO; + + switch (eidata) { + case ELFDATA2LSB: + /* We are big endian, DSO is little endian. */ + if (*(unsigned char const *)&endian != 1) + dso->needs_swap = DSO_SWAP__YES; + break; + + case ELFDATA2MSB: + /* We are little endian, DSO is big endian.
*/ + if (*(unsigned char const *)&endian != 0) + dso->needs_swap = DSO_SWAP__YES; + break; + + default: + pr_err("unrecognized DSO data encoding %d\n", eidata); + return -EINVAL; + } + + return 0; +} + static int dso__load_sym(struct dso *dso, struct map *map, const char *name, int fd, symbol_filter_t filter, int kmodule, int want_symtab) @@ -1187,6 +1215,9 @@ static int dso__load_sym(struct dso *dso, struct map *map, const char *name, goto out_elf_end; } + if (dso__swap_init(dso, ehdr.e_ident[EI_DATA])) + goto out_elf_end; + /* Always reject images with a mismatched build-id: */ if (dso->has_build_id) { u8 build_id[BUILD_ID_SIZE]; @@ -1272,7 +1303,7 @@ static int dso__load_sym(struct dso *dso, struct map *map, const char *name, if (opdsec && sym.st_shndx == opdidx) { u32 offset = sym.st_value - opdshdr.sh_addr; u64 *opd = opddata->d_buf + offset; - sym.st_value = *opd; + sym.st_value = DSO__SWAP(dso, u64, *opd); sym.st_shndx = elf_addr_to_index(elf, sym.st_value); } diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h index 5649d63798cb..af0752b1aca1 100644 --- a/tools/perf/util/symbol.h +++ b/tools/perf/util/symbol.h @@ -9,6 +9,7 @@ #include #include #include +#include #ifdef HAVE_CPLUS_DEMANGLE extern char *cplus_demangle(const char *, int); @@ -160,11 +161,18 @@ enum dso_kernel_type { DSO_TYPE_GUEST_KERNEL }; +enum dso_swap_type { + DSO_SWAP__UNSET, + DSO_SWAP__NO, + DSO_SWAP__YES, +}; + struct dso { struct list_head node; struct rb_root symbols[MAP__NR_TYPES]; struct rb_root symbol_names[MAP__NR_TYPES]; enum dso_kernel_type kernel; + enum dso_swap_type needs_swap; u8 adjust_symbols:1; u8 has_build_id:1; u8 hit:1; @@ -182,6 +190,28 @@ struct dso { char name[0]; }; +#define DSO__SWAP(dso, type, val) \ +({ \ + type ____r = val; \ + BUG_ON(dso->needs_swap == DSO_SWAP__UNSET); \ + if (dso->needs_swap == DSO_SWAP__YES) { \ + switch (sizeof(____r)) { \ + case 2: \ + ____r = bswap_16(val); \ + break; \ + case 4: \ + ____r = bswap_32(val); \ + break; \ + case 8: \ + ____r = bswap_64(val); \ + break; \ + default: \ + BUG_ON(1); \ + } \ + } \ + ____r; \ +}) + struct dso *dso__new(const char *name); void dso__delete(struct dso *dso); From 268fb20f832e1eb4afd5113ee31fef9332986b13 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 30 May 2012 14:23:43 +0200 Subject: [PATCH 059/178] perf session: Handle endianity swap on sample_id_all header data Adding endianity swapping for the event header attached via sample_id_all. Currently we don't do that and it's causing wrong data to be read when running report on an architecture with a different endianity than the record. perf is currently able to process 32-bit PPC samples on 32-bit and 64-bit x86. Together with other endianity patches, this change fixes perf report discrepancies on origin and target systems as described in test 1 below, e.g. following perf report diff: ... 0.12% ps [kernel.kallsyms] [k] clear_page - 0.12% awk bash [.] alloc_word_desc + 0.12% awk bash [.] yyparse 0.11% beah-rhts-task libpython2.6.so.1.0 [.] 0x5560e 0.10% perf libc-2.12.so [.] __ctype_toupper_loc - 0.09% rhts-test-runne bash [.] maybe_make_export_env + 0.09% rhts-test-runne bash [.] 0x385a0 0.09% ps [kernel.kallsyms] [k] page_fault ...
Note, running following to test perf endianity handling: test 1) - origin system: # perf record -a -- sleep 10 (any perf record will do) # perf report > report.origin # perf archive perf.data - copy the perf.data, report.origin and perf.data.tar.bz2 to a target system and run: # tar xjvf perf.data.tar.bz2 -C ~/.debug # perf report > report.target # diff -u report.origin report.target - the diff should produce no output (besides some white space stuff and possibly different date/TZ output) test 2) - origin system: # perf record -ag -fo /tmp/perf.data -- sleep 1 - mount origin system root to the target system on /mnt/origin - target system: # perf script --symfs /mnt/origin -I -i /mnt/origin/tmp/perf.data \ --kallsyms /mnt/origin/proc/kallsyms - complete perf.data header is displayed Signed-off-by: Jiri Olsa Reviewed-by: David Ahern Tested-by: David Ahern Cc: Corey Ashford Cc: David Ahern Cc: Frederic Weisbecker Cc: Paul Mackerras Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1338380624-7443-3-git-send-email-jolsa@redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/session.c | 67 ++++++++++++++++++++++++++++++++------- 1 file changed, 55 insertions(+), 12 deletions(-) diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 04d1e33f4592..2600916efa83 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -454,37 +454,65 @@ void mem_bswap_64(void *src, int byte_size) } } -static void perf_event__all64_swap(union perf_event *event) +static void swap_sample_id_all(union perf_event *event, void *data) +{ + void *end = (void *) event + event->header.size; + int size = end - data; + + BUG_ON(size % sizeof(u64)); + mem_bswap_64(data, size); +} + +static void perf_event__all64_swap(union perf_event *event, + bool sample_id_all __used) { struct perf_event_header *hdr = &event->header; mem_bswap_64(hdr + 1, event->header.size - sizeof(*hdr)); } -static void perf_event__comm_swap(union perf_event *event) +static void perf_event__comm_swap(union perf_event *event, bool sample_id_all) { event->comm.pid = bswap_32(event->comm.pid); event->comm.tid = bswap_32(event->comm.tid); + + if (sample_id_all) { + void *data = &event->comm.comm; + + data += ALIGN(strlen(data) + 1, sizeof(u64)); + swap_sample_id_all(event, data); + } } -static void perf_event__mmap_swap(union perf_event *event) +static void perf_event__mmap_swap(union perf_event *event, + bool sample_id_all) { event->mmap.pid = bswap_32(event->mmap.pid); event->mmap.tid = bswap_32(event->mmap.tid); event->mmap.start = bswap_64(event->mmap.start); event->mmap.len = bswap_64(event->mmap.len); event->mmap.pgoff = bswap_64(event->mmap.pgoff); + + if (sample_id_all) { + void *data = &event->mmap.filename; + + data += ALIGN(strlen(data) + 1, sizeof(u64)); + swap_sample_id_all(event, data); + } } -static void perf_event__task_swap(union perf_event *event) +static void perf_event__task_swap(union perf_event *event, bool sample_id_all) { event->fork.pid = bswap_32(event->fork.pid); event->fork.tid = bswap_32(event->fork.tid); event->fork.ppid = bswap_32(event->fork.ppid); event->fork.ptid = bswap_32(event->fork.ptid); event->fork.time = bswap_64(event->fork.time); + + if (sample_id_all) + swap_sample_id_all(event, &event->fork + 1); } -static void perf_event__read_swap(union perf_event *event) +static void perf_event__read_swap(union perf_event *event, bool sample_id_all) { event->read.pid = bswap_32(event->read.pid); event->read.tid = bswap_32(event->read.tid); @@ -492,6 +520,9 @@ static void 
perf_event__read_swap(union perf_event *event) event->read.time_enabled = bswap_64(event->read.time_enabled); event->read.time_running = bswap_64(event->read.time_running); event->read.id = bswap_64(event->read.id); + + if (sample_id_all) + swap_sample_id_all(event, &event->read + 1); } static u8 revbyte(u8 b) @@ -543,7 +574,8 @@ void perf_event__attr_swap(struct perf_event_attr *attr) swap_bitfield((u8 *) (&attr->read_format + 1), sizeof(u64)); } -static void perf_event__hdr_attr_swap(union perf_event *event) +static void perf_event__hdr_attr_swap(union perf_event *event, + bool sample_id_all __used) { size_t size; @@ -554,18 +586,21 @@ static void perf_event__hdr_attr_swap(union perf_event *event) mem_bswap_64(event->attr.id, size); } -static void perf_event__event_type_swap(union perf_event *event) +static void perf_event__event_type_swap(union perf_event *event, + bool sample_id_all __used) { event->event_type.event_type.event_id = bswap_64(event->event_type.event_type.event_id); } -static void perf_event__tracing_data_swap(union perf_event *event) +static void perf_event__tracing_data_swap(union perf_event *event, + bool sample_id_all __used) { event->tracing_data.size = bswap_32(event->tracing_data.size); } -typedef void (*perf_event__swap_op)(union perf_event *event); +typedef void (*perf_event__swap_op)(union perf_event *event, + bool sample_id_all); static perf_event__swap_op perf_event__swap_ops[] = { [PERF_RECORD_MMAP] = perf_event__mmap_swap, @@ -999,6 +1034,15 @@ static int perf_session__process_user_event(struct perf_session *session, union } } +static void event_swap(union perf_event *event, bool sample_id_all) +{ + perf_event__swap_op swap; + + swap = perf_event__swap_ops[event->header.type]; + if (swap) + swap(event, sample_id_all); +} + static int perf_session__process_event(struct perf_session *session, union perf_event *event, struct perf_tool *tool, @@ -1007,9 +1051,8 @@ static int perf_session__process_event(struct perf_session *session, struct perf_sample sample; int ret; - if (session->header.needs_swap && - perf_event__swap_ops[event->header.type]) - perf_event__swap_ops[event->header.type](event); + if (session->header.needs_swap) + event_swap(event, session->sample_id_all); if (event->header.type >= PERF_RECORD_HEADER_MAX) return -EINVAL; From 37073f9e449bc430e6c40b9cffc2558002a0256a Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 30 May 2012 14:23:44 +0200 Subject: [PATCH 060/178] perf evsel: Fix 32 bit values endianity swap for sample_id_all header We swap the sample_id_all header by u64 pointers. Some members of the header happen to be 32 bit values. We need to handle them separately. Together with other endianity patches, this change fixes perf report discrepancies on origin and target systems as described in test 1 below, e.g. following perf report diff: ... 0.12% ps [kernel.kallsyms] [k] clear_page - 0.12% awk bash [.] alloc_word_desc + 0.12% awk bash [.] yyparse 0.11% beah-rhts-task libpython2.6.so.1.0 [.] 0x5560e 0.10% perf libc-2.12.so [.] __ctype_toupper_loc - 0.09% rhts-test-runne bash [.] maybe_make_export_env + 0.09% rhts-test-runne bash [.] 0x385a0 0.09% ps [kernel.kallsyms] [k] page_fault ...
Note, running following to test perf endianity handling: test 1) - origin system: # perf record -a -- sleep 10 (any perf record will do) # perf report > report.origin # perf archive perf.data - copy the perf.data, report.origin and perf.data.tar.bz2 to a target system and run: # tar xjvf perf.data.tar.bz2 -C ~/.debug # perf report > report.target # diff -u report.origin report.target - the diff should produce no output (besides some white space stuff and possibly different date/TZ output) test 2) - origin system: # perf record -ag -fo /tmp/perf.data -- sleep 1 - mount origin system root to the target system on /mnt/origin - target system: # perf script --symfs /mnt/origin -I -i /mnt/origin/tmp/perf.data \ --kallsyms /mnt/origin/proc/kallsyms - complete perf.data header is displayed Signed-off-by: Jiri Olsa Reviewed-by: David Ahern Tested-by: David Ahern Cc: Corey Ashford Cc: David Ahern Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Paul Mackerras Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1338380624-7443-4-git-send-email-jolsa@redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evsel.c | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 91d19138f3ec..9f6cebd798ee 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -494,16 +494,24 @@ int perf_evsel__open_per_thread(struct perf_evsel *evsel, } static int perf_event__parse_id_sample(const union perf_event *event, u64 type, - struct perf_sample *sample) + struct perf_sample *sample, + bool swapped) { const u64 *array = event->sample.array; + union u64_swap u; array += ((event->header.size - sizeof(event->header)) / sizeof(u64)) - 1; if (type & PERF_SAMPLE_CPU) { - u32 *p = (u32 *)array; - sample->cpu = *p; + u.val64 = *array; + if (swapped) { + /* undo swap of u64, then swap on individual u32s */ + u.val64 = bswap_64(u.val64); + u.val32[0] = bswap_32(u.val32[0]); + } + + sample->cpu = u.val32[0]; array--; } @@ -523,9 +531,16 @@ static int perf_event__parse_id_sample(const union perf_event *event, u64 type, } if (type & PERF_SAMPLE_TID) { - u32 *p = (u32 *)array; - sample->pid = p[0]; - sample->tid = p[1]; + u.val64 = *array; + if (swapped) { + /* undo swap of u64, then swap on individual u32s */ + u.val64 = bswap_64(u.val64); + u.val32[0] = bswap_32(u.val32[0]); + u.val32[1] = bswap_32(u.val32[1]); + } + + sample->pid = u.val32[0]; + sample->tid = u.val32[1]; } return 0; @@ -562,7 +577,7 @@ int perf_event__parse_sample(const union perf_event *event, u64 type, if (event->header.type != PERF_RECORD_SAMPLE) { if (!sample_id_all) return 0; - return perf_event__parse_id_sample(event, type, data); + return perf_event__parse_id_sample(event, type, data, swapped); } array = event->sample.array; From 378474e4b23183002f1791b294a4440b6b7df36a Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Thu, 31 May 2012 17:16:56 +0530 Subject: [PATCH 061/178] perf symbols: Check for valid dso before creating map dso__new() can return NULL. Hence verify dso before creating a new map. 
Signed-off-by: Srikar Dronamraju Suggested-by: Arnaldo Carvalho de Melo Cc: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20120531114656.23691.54223.sendpatchset@srdronam.in.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/symbol.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index 9d04dcd91815..3e2e5ea0f03f 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -2817,8 +2817,11 @@ int machine__load_vmlinux_path(struct machine *machine, enum map_type type, struct map *dso__new_map(const char *name) { + struct map *map = NULL; struct dso *dso = dso__new(name); - struct map *map = map__new2(0, dso, MAP__FUNCTION); + + if (dso) + map = map__new2(0, dso, MAP__FUNCTION); return map; } From a23c4dc422cff7bd30f40f5dc7c9ac4a6339005a Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Thu, 31 May 2012 17:16:43 +0530 Subject: [PATCH 062/178] perf uprobes: Remove unnecessary check before strlist__delete Since strlist__delete() itself checks for a NULL list, the additional check before calling strlist__delete() is redundant. No functional change. Signed-off-by: Srikar Dronamraju Suggested-by: Arnaldo Carvalho de Melo Cc: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20120531114643.23691.38666.sendpatchset@srdronam.in.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/probe-event.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index 59dccc98b554..0dda25d82d06 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -2164,16 +2164,12 @@ int del_perf_probe_events(struct strlist *dellist) error: if (kfd >= 0) { - if (namelist) - strlist__delete(namelist); - + strlist__delete(namelist); close(kfd); } if (ufd >= 0) { - if (unamelist) - strlist__delete(unamelist); - + strlist__delete(unamelist); close(ufd); } From cb7225feec627e91d598198996429e9ee6804f8d Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 31 May 2012 14:51:44 +0900 Subject: [PATCH 063/178] perf: Remove duplicate invocation on perf_event_for_each The @func callback was invoked twice for the group leader when perf_event_for_each() was called. It seems the commit 75f937f24bd9 ("perf_counter: Fix ctx->mutex vs counter ->mutex inversion") made the mistake during the change.
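For illustration only, a runnable user-space analogue of the intended iteration after the fix; the names echo the kernel's helpers but the bodies are stand-ins:

    #include <stdio.h>

    struct event { const char *name; struct event *siblings[2]; };

    static void for_each_child(struct event *e, void (*func)(struct event *))
    {
            func(e);        /* in the kernel this also walks e's children */
    }

    /* the leader is visited exactly once, via for_each_child() only */
    static void for_each(struct event *leader, void (*func)(struct event *))
    {
            int i;

            for_each_child(leader, func);
            for (i = 0; leader->siblings[i]; i++)
                    for_each_child(leader->siblings[i], func);
            /* the bug: an extra func(leader) here fired the leader twice */
    }

    static void show(struct event *e) { printf("visit %s\n", e->name); }

    int main(void)
    {
            struct event s1 = { "sibling1", { NULL, NULL } };
            struct event leader = { "leader", { &s1, NULL } };

            for_each(&leader, show);
            return 0;
    }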
Signed-off-by: Namhyung Kim Acked-by: Peter Zijlstra Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1338443506-25009-1-git-send-email-namhyung.kim@lge.com Signed-off-by: Arnaldo Carvalho de Melo --- kernel/events/core.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 5b06cbbf6931..f85c0154b333 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3181,7 +3181,6 @@ static void perf_event_for_each(struct perf_event *event, event = event->group_leader; perf_event_for_each_child(event, func); - func(event); list_for_each_entry(sibling, &event->sibling_list, group_entry) perf_event_for_each_child(sibling, func); mutex_unlock(&ctx->mutex); From 0013fb4ca3171c64a4a5d3851fb591bb575e7f04 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Thu, 31 May 2012 13:03:26 +0400 Subject: [PATCH 064/178] CIFS: Fix possible wrong memory allocation when cifs_reconnect sets maxBuf to 0 and we try to calculate a size of memory we need to store locks. Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/file.c | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 253170dfa716..de8abb6f7b56 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -876,7 +876,7 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile) struct cifsLockInfo *li, *tmp; struct cifs_tcon *tcon; struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); - unsigned int num, max_num; + unsigned int num, max_num, max_buf; LOCKING_ANDX_RANGE *buf, *cur; int types[] = {LOCKING_ANDX_LARGE_FILES, LOCKING_ANDX_SHARED_LOCK | LOCKING_ANDX_LARGE_FILES}; @@ -892,8 +892,19 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile) return rc; } - max_num = (tcon->ses->server->maxBuf - sizeof(struct smb_hdr)) / - sizeof(LOCKING_ANDX_RANGE); + /* + * Accessing maxBuf is racy with cifs_reconnect - need to store value + * and check it for zero before using. + */ + max_buf = tcon->ses->server->maxBuf; + if (!max_buf) { + mutex_unlock(&cinode->lock_mutex); + FreeXid(xid); + return -EINVAL; + } + + max_num = (max_buf - sizeof(struct smb_hdr)) / + sizeof(LOCKING_ANDX_RANGE); buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL); if (!buf) { mutex_unlock(&cinode->lock_mutex); @@ -1218,7 +1229,7 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, int xid) int types[] = {LOCKING_ANDX_LARGE_FILES, LOCKING_ANDX_SHARED_LOCK | LOCKING_ANDX_LARGE_FILES}; unsigned int i; - unsigned int max_num, num; + unsigned int max_num, num, max_buf; LOCKING_ANDX_RANGE *buf, *cur; struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); @@ -1228,8 +1239,16 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, int xid) INIT_LIST_HEAD(&tmp_llist); - max_num = (tcon->ses->server->maxBuf - sizeof(struct smb_hdr)) / - sizeof(LOCKING_ANDX_RANGE); + /* + * Accessing maxBuf is racy with cifs_reconnect - need to store value + * and check it for zero before using. 
+ */ + max_buf = tcon->ses->server->maxBuf; + if (!max_buf) + return -EINVAL; + + max_num = (max_buf - sizeof(struct smb_hdr)) / + sizeof(LOCKING_ANDX_RANGE); buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL); if (!buf) return -ENOMEM; From ea319d57d3372a9dbee0b3807d75bb36b8d54adc Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Thu, 31 May 2012 13:59:36 +0400 Subject: [PATCH 065/178] CIFS: Improve indentation in cifs_unlock_range Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/file.c | 75 +++++++++++++++++++++++--------------------------- 1 file changed, 35 insertions(+), 40 deletions(-) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index de8abb6f7b56..513adbc211d7 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1266,46 +1266,7 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, int xid) continue; if (types[i] != li->type) continue; - if (!cinode->can_cache_brlcks) { - cur->Pid = cpu_to_le16(li->pid); - cur->LengthLow = cpu_to_le32((u32)li->length); - cur->LengthHigh = - cpu_to_le32((u32)(li->length>>32)); - cur->OffsetLow = cpu_to_le32((u32)li->offset); - cur->OffsetHigh = - cpu_to_le32((u32)(li->offset>>32)); - /* - * We need to save a lock here to let us add - * it again to the file's list if the unlock - * range request fails on the server. - */ - list_move(&li->llist, &tmp_llist); - if (++num == max_num) { - stored_rc = cifs_lockv(xid, tcon, - cfile->netfid, - li->type, num, - 0, buf); - if (stored_rc) { - /* - * We failed on the unlock range - * request - add all locks from - * the tmp list to the head of - * the file's list. - */ - cifs_move_llist(&tmp_llist, - &cfile->llist); - rc = stored_rc; - } else - /* - * The unlock range request - * succeed - free the tmp list. - */ - cifs_free_llist(&tmp_llist); - cur = buf; - num = 0; - } else - cur++; - } else { + if (cinode->can_cache_brlcks) { /* * We can cache brlock requests - simply remove * a lock from the file's list. @@ -1313,7 +1274,41 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, int xid) list_del(&li->llist); cifs_del_lock_waiters(li); kfree(li); + continue; } + cur->Pid = cpu_to_le16(li->pid); + cur->LengthLow = cpu_to_le32((u32)li->length); + cur->LengthHigh = cpu_to_le32((u32)(li->length>>32)); + cur->OffsetLow = cpu_to_le32((u32)li->offset); + cur->OffsetHigh = cpu_to_le32((u32)(li->offset>>32)); + /* + * We need to save a lock here to let us add it again to + * the file's list if the unlock range request fails on + * the server. + */ + list_move(&li->llist, &tmp_llist); + if (++num == max_num) { + stored_rc = cifs_lockv(xid, tcon, cfile->netfid, + li->type, num, 0, buf); + if (stored_rc) { + /* + * We failed on the unlock range + * request - add all locks from the tmp + * list to the head of the file's list. + */ + cifs_move_llist(&tmp_llist, + &cfile->llist); + rc = stored_rc; + } else + /* + * The unlock range request succeed - + * free the tmp list.
+ */ + cifs_free_llist(&tmp_llist); + cur = buf; + num = 0; + } else + cur++; } if (num) { stored_rc = cifs_lockv(xid, tcon, cfile->netfid, From 7f0adb53bcf5bdb92236cda8ec92ea5e40993028 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Mon, 28 May 2012 15:50:10 +0400 Subject: [PATCH 066/178] CIFS: Make accessing is_valid_oplock/dump_detail ops struct field safe Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/connect.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index ccafdedd0dbc..3647b1a4540b 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1058,13 +1058,15 @@ cifs_demultiplex_thread(void *p) if (mid_entry != NULL) { if (!mid_entry->multiRsp || mid_entry->multiEnd) mid_entry->callback(mid_entry); - } else if (!server->ops->is_oplock_break(buf, server)) { + } else if (!server->ops->is_oplock_break || + !server->ops->is_oplock_break(buf, server)) { cERROR(1, "No task to wake, unknown frame received! " "NumMids %d", atomic_read(&midCount)); cifs_dump_mem("Received Data is: ", buf, HEADER_SIZE(server)); #ifdef CONFIG_CIFS_DEBUG2 - server->ops->dump_detail(buf); + if (server->ops->dump_detail) + server->ops->dump_detail(buf); cifs_dump_mids(server); #endif /* CIFS_DEBUG2 */ From 88257360605f9362dc4d79326c268dd334f61c90 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Wed, 23 May 2012 14:01:59 +0400 Subject: [PATCH 067/178] CIFS: Move get_next_mid to ops struct Reviewed-by: Jeff Layton Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/cifsglob.h | 7 ++++ fs/cifs/cifsproto.h | 1 - fs/cifs/cifssmb.c | 8 ++-- fs/cifs/connect.c | 2 +- fs/cifs/misc.c | 89 +-------------------------------------------- fs/cifs/smb1ops.c | 89 +++++++++++++++++++++++++++++++++++++++++++++ fs/cifs/transport.c | 2 +- 7 files changed, 103 insertions(+), 95 deletions(-) diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 20350a93ed99..6df0cbe1cbc9 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -174,6 +174,7 @@ struct smb_version_operations { void (*add_credits)(struct TCP_Server_Info *, const unsigned int); void (*set_credits)(struct TCP_Server_Info *, const int); int * (*get_credits_field)(struct TCP_Server_Info *); + __u64 (*get_next_mid)(struct TCP_Server_Info *); /* data offset from read response message */ unsigned int (*read_data_offset)(char *); /* data length from read response message */ @@ -399,6 +400,12 @@ set_credits(struct TCP_Server_Info *server, const int val) server->ops->set_credits(server, val); } +static inline __u64 +get_next_mid(struct TCP_Server_Info *server) +{ + return server->ops->get_next_mid(server); +} + /* * Macros to allow the TCP_Server_Info->net field and related code to drop out * when CONFIG_NET_NS isn't set. 
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 5ec21ecf7980..0a6cbfe2761e 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -114,7 +114,6 @@ extern int small_smb_init_no_tc(const int smb_cmd, const int wct, void **request_buf); extern int CIFS_SessSetup(unsigned int xid, struct cifs_ses *ses, const struct nls_table *nls_cp); -extern __u64 GetNextMid(struct TCP_Server_Info *server); extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601); extern u64 cifs_UnixTimeToNT(struct timespec); extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index b5ad716b2642..5b400730c213 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -268,7 +268,7 @@ small_smb_init_no_tc(const int smb_command, const int wct, return rc; buffer = (struct smb_hdr *)*request_buf; - buffer->Mid = GetNextMid(ses->server); + buffer->Mid = get_next_mid(ses->server); if (ses->capabilities & CAP_UNICODE) buffer->Flags2 |= SMBFLG2_UNICODE; if (ses->capabilities & CAP_STATUS32) @@ -402,7 +402,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifs_ses *ses) cFYI(1, "secFlags 0x%x", secFlags); - pSMB->hdr.Mid = GetNextMid(server); + pSMB->hdr.Mid = get_next_mid(server); pSMB->hdr.Flags2 |= (SMBFLG2_UNICODE | SMBFLG2_ERR_STATUS); if ((secFlags & CIFSSEC_MUST_KRB5) == CIFSSEC_MUST_KRB5) @@ -782,7 +782,7 @@ CIFSSMBLogoff(const int xid, struct cifs_ses *ses) return rc; } - pSMB->hdr.Mid = GetNextMid(ses->server); + pSMB->hdr.Mid = get_next_mid(ses->server); if (ses->server->sec_mode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) @@ -4762,7 +4762,7 @@ getDFSRetry: /* server pointer checked in called function, but should never be null here anyway */ - pSMB->hdr.Mid = GetNextMid(ses->server); + pSMB->hdr.Mid = get_next_mid(ses->server); pSMB->hdr.Tid = ses->ipc_tid; pSMB->hdr.Uid = ses->Suid; if (ses->capabilities & CAP_STATUS32) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 3647b1a4540b..78db68a5cf44 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -3940,7 +3940,7 @@ CIFSTCon(unsigned int xid, struct cifs_ses *ses, header_assemble(smb_buffer, SMB_COM_TREE_CONNECT_ANDX, NULL /*no tid */ , 4 /*wct */ ); - smb_buffer->Mid = GetNextMid(ses->server); + smb_buffer->Mid = get_next_mid(ses->server); smb_buffer->Uid = ses->Suid; pSMB = (TCONX_REQ *) smb_buffer; pSMBr = (TCONX_RSP *) smb_buffer_response; diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index e2552d2b2e42..557506ae1e2a 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -212,93 +212,6 @@ cifs_small_buf_release(void *buf_to_free) return; } -/* - * Find a free multiplex id (SMB mid). Otherwise there could be - * mid collisions which might cause problems, demultiplexing the - * wrong response to this request. Multiplex ids could collide if - * one of a series requests takes much longer than the others, or - * if a very large number of long lived requests (byte range - * locks or FindNotify requests) are pending. No more than - * 64K-1 requests can be outstanding at one time. If no - * mids are available, return zero. A future optimization - * could make the combination of mids and uid the key we use - * to demultiplex on (rather than mid alone). - * In addition to the above check, the cifs demultiplex - * code already used the command code as a secondary - * check of the frame and if signing is negotiated the - * response would be discarded if the mid were the same - * but the signature was wrong. 
Since the mid is not put in the - * pending queue until later (when it is about to be dispatched) - * we do have to limit the number of outstanding requests - * to somewhat less than 64K-1 although it is hard to imagine - * so many threads being in the vfs at one time. - */ -__u64 GetNextMid(struct TCP_Server_Info *server) -{ - __u64 mid = 0; - __u16 last_mid, cur_mid; - bool collision; - - spin_lock(&GlobalMid_Lock); - - /* mid is 16 bit only for CIFS/SMB */ - cur_mid = (__u16)((server->CurrentMid) & 0xffff); - /* we do not want to loop forever */ - last_mid = cur_mid; - cur_mid++; - - /* - * This nested loop looks more expensive than it is. - * In practice the list of pending requests is short, - * fewer than 50, and the mids are likely to be unique - * on the first pass through the loop unless some request - * takes longer than the 64 thousand requests before it - * (and it would also have to have been a request that - * did not time out). - */ - while (cur_mid != last_mid) { - struct mid_q_entry *mid_entry; - unsigned int num_mids; - - collision = false; - if (cur_mid == 0) - cur_mid++; - - num_mids = 0; - list_for_each_entry(mid_entry, &server->pending_mid_q, qhead) { - ++num_mids; - if (mid_entry->mid == cur_mid && - mid_entry->mid_state == MID_REQUEST_SUBMITTED) { - /* This mid is in use, try a different one */ - collision = true; - break; - } - } - - /* - * if we have more than 32k mids in the list, then something - * is very wrong. Possibly a local user is trying to DoS the - * box by issuing long-running calls and SIGKILL'ing them. If - * we get to 2^16 mids then we're in big trouble as this - * function could loop forever. - * - * Go ahead and assign out the mid in this situation, but force - * an eventual reconnect to clean out the pending_mid_q. - */ - if (num_mids > 32768) - server->tcpStatus = CifsNeedReconnect; - - if (!collision) { - mid = (__u64)cur_mid; - server->CurrentMid = mid; - break; - } - cur_mid++; - } - spin_unlock(&GlobalMid_Lock); - return mid; -} - /* NB: MID can not be set if treeCon not passed in, in that case it is responsbility of caller to set the mid */ void @@ -334,7 +247,7 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ , /* Uid is not converted */ buffer->Uid = treeCon->ses->Suid; - buffer->Mid = GetNextMid(treeCon->ses->server); + buffer->Mid = get_next_mid(treeCon->ses->server); } if (treeCon->Flags & SMB_SHARE_IS_IN_DFS) buffer->Flags2 |= SMBFLG2_DFS; diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index d9d615fbed3f..6dec38f5522d 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -125,6 +125,94 @@ cifs_get_credits_field(struct TCP_Server_Info *server) return &server->credits; } +/* + * Find a free multiplex id (SMB mid). Otherwise there could be + * mid collisions which might cause problems, demultiplexing the + * wrong response to this request. Multiplex ids could collide if + * one of a series requests takes much longer than the others, or + * if a very large number of long lived requests (byte range + * locks or FindNotify requests) are pending. No more than + * 64K-1 requests can be outstanding at one time. If no + * mids are available, return zero. A future optimization + * could make the combination of mids and uid the key we use + * to demultiplex on (rather than mid alone). 
+ * In addition to the above check, the cifs demultiplex + * code already used the command code as a secondary + * check of the frame and if signing is negotiated the + * response would be discarded if the mid were the same + * but the signature was wrong. Since the mid is not put in the + * pending queue until later (when it is about to be dispatched) + * we do have to limit the number of outstanding requests + * to somewhat less than 64K-1 although it is hard to imagine + * so many threads being in the vfs at one time. + */ +static __u64 +cifs_get_next_mid(struct TCP_Server_Info *server) +{ + __u64 mid = 0; + __u16 last_mid, cur_mid; + bool collision; + + spin_lock(&GlobalMid_Lock); + + /* mid is 16 bit only for CIFS/SMB */ + cur_mid = (__u16)((server->CurrentMid) & 0xffff); + /* we do not want to loop forever */ + last_mid = cur_mid; + cur_mid++; + + /* + * This nested loop looks more expensive than it is. + * In practice the list of pending requests is short, + * fewer than 50, and the mids are likely to be unique + * on the first pass through the loop unless some request + * takes longer than the 64 thousand requests before it + * (and it would also have to have been a request that + * did not time out). + */ + while (cur_mid != last_mid) { + struct mid_q_entry *mid_entry; + unsigned int num_mids; + + collision = false; + if (cur_mid == 0) + cur_mid++; + + num_mids = 0; + list_for_each_entry(mid_entry, &server->pending_mid_q, qhead) { + ++num_mids; + if (mid_entry->mid == cur_mid && + mid_entry->mid_state == MID_REQUEST_SUBMITTED) { + /* This mid is in use, try a different one */ + collision = true; + break; + } + } + + /* + * if we have more than 32k mids in the list, then something + * is very wrong. Possibly a local user is trying to DoS the + * box by issuing long-running calls and SIGKILL'ing them. If + * we get to 2^16 mids then we're in big trouble as this + * function could loop forever. + * + * Go ahead and assign out the mid in this situation, but force + * an eventual reconnect to clean out the pending_mid_q. 
+ */ + if (num_mids > 32768) + server->tcpStatus = CifsNeedReconnect; + + if (!collision) { + mid = (__u64)cur_mid; + server->CurrentMid = mid; + break; + } + cur_mid++; + } + spin_unlock(&GlobalMid_Lock); + return mid; +} + struct smb_version_operations smb1_operations = { .send_cancel = send_nt_cancel, .compare_fids = cifs_compare_fids, @@ -133,6 +221,7 @@ struct smb_version_operations smb1_operations = { .add_credits = cifs_add_credits, .set_credits = cifs_set_credits, .get_credits_field = cifs_get_credits_field, + .get_next_mid = cifs_get_next_mid, .read_data_offset = cifs_read_data_offset, .read_data_length = cifs_read_data_length, .map_error = map_smb_to_linux_error, diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 1b36ffe6a47b..3097ee58fd7d 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -779,7 +779,7 @@ send_lock_cancel(const unsigned int xid, struct cifs_tcon *tcon, pSMB->LockType = LOCKING_ANDX_CANCEL_LOCK|LOCKING_ANDX_LARGE_FILES; pSMB->Timeout = 0; - pSMB->hdr.Mid = GetNextMid(ses->server); + pSMB->hdr.Mid = get_next_mid(ses->server); return SendReceive(xid, ses, in_buf, out_buf, &bytes_returned, 0); From cfb46f433a4da97c31780e08a259fac2cb6bd61f Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Wed, 25 Apr 2012 14:33:33 +0100 Subject: [PATCH 068/178] acpi_video: fix leaking PCI references Signed-off-by: Alan Cox Acked-by: Matthew Garrett Signed-off-by: Len Brown --- drivers/acpi/video.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/acpi/video.c b/drivers/acpi/video.c index 9577b6fa2650..66e8f7333e9b 100644 --- a/drivers/acpi/video.c +++ b/drivers/acpi/video.c @@ -1745,6 +1745,7 @@ static int acpi_video_bus_remove(struct acpi_device *device, int type) static int __init intel_opregion_present(void) { + int i915 = 0; #if defined(CONFIG_DRM_I915) || defined(CONFIG_DRM_I915_MODULE) struct pci_dev *dev = NULL; u32 address; @@ -1757,10 +1758,10 @@ static int __init intel_opregion_present(void) pci_read_config_dword(dev, 0xfc, &address); if (!address) continue; - return 1; + i915 = 1; } #endif - return 0; + return i915; } int acpi_video_register(void) From c6996bdd850fb53319918487d5f674203517fdc5 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Wed, 25 Apr 2012 14:33:48 +0100 Subject: [PATCH 069/178] acpi_video: Intel video is not always i915 Stop it poking at random registers on the i740 cards that may be out there still. As per Matthew's feedback remove the conditional checks and never enable the opregion handling unless an appropriate driver has been loaded. 
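For illustration only, a runnable sketch of the device-ID filter the next hunk adds (the IDs mirror the i740 check below; the main() harness is an assumption for demonstration):

    #include <stdio.h>

    /* i740 PCI device IDs that must not be probed for an opregion */
    static int is_i740(unsigned short device)
    {
            return device == 0x00D1 || device == 0x7000;
    }

    int main(void)
    {
            printf("0x00D1 -> %d, 0x1234 -> %d\n",
                   is_i740(0x00D1), is_i740(0x1234));
            return 0;
    }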
Signed-off-by: Alan Cox Signed-off-by: Len Brown --- drivers/acpi/video.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/drivers/acpi/video.c b/drivers/acpi/video.c index 66e8f7333e9b..609262dc1258 100644 --- a/drivers/acpi/video.c +++ b/drivers/acpi/video.c @@ -1743,10 +1743,18 @@ static int acpi_video_bus_remove(struct acpi_device *device, int type) return 0; } +static int __init is_i740(struct pci_dev *dev) +{ + if (dev->device == 0x00D1) + return 1; + if (dev->device == 0x7000) + return 1; + return 0; +} + static int __init intel_opregion_present(void) { - int i915 = 0; -#if defined(CONFIG_DRM_I915) || defined(CONFIG_DRM_I915_MODULE) + int opregion = 0; struct pci_dev *dev = NULL; u32 address; @@ -1755,13 +1763,15 @@ static int __init intel_opregion_present(void) continue; if (dev->vendor != PCI_VENDOR_ID_INTEL) continue; + /* We don't want to poke around undefined i740 registers */ + if (is_i740(dev)) + continue; pci_read_config_dword(dev, 0xfc, &address); if (!address) continue; - i915 = 1; + opregion = 1; } -#endif - return i915; + return opregion; } int acpi_video_register(void) From 155689defc782b486a7e6776a57ecc4ebb37ed52 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Wed, 25 Apr 2012 14:34:04 +0100 Subject: [PATCH 070/178] gma500: don't register the ACPI video bus We are not yet ready for this and it makes a mess on some devices. Signed-off-by: Alan Cox Signed-off-by: Len Brown --- drivers/gpu/drm/gma500/psb_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/gma500/psb_drv.c b/drivers/gpu/drm/gma500/psb_drv.c index c34adf9d910a..09af2ffbb307 100644 --- a/drivers/gpu/drm/gma500/psb_drv.c +++ b/drivers/gpu/drm/gma500/psb_drv.c @@ -349,7 +349,7 @@ static int psb_driver_load(struct drm_device *dev, unsigned long chipset) PSB_WSGX32(0x30000000, PSB_CR_BIF_3D_REQ_BASE); /* igd_opregion_init(&dev_priv->opregion_dev); */ - acpi_video_register(); +/* acpi_video_register(); */ if (dev_priv->lid_state) psb_lid_timer_init(dev_priv); From 301f33fbcf4ced53b3de114846ecece5d6aafeeb Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 4 Apr 2012 14:19:16 +0300 Subject: [PATCH 071/178] ACPI video: use after input_unregister_device() We can't use "input" anymore after calling input_unregister_device(). The call to input_free_device() is a double free. The normal way to deal with this is to make input_register_device() the last function called in the function. Signed-off-by: Dan Carpenter Acked-by: Dmitry Torokhov Signed-off-by: Len Brown --- drivers/acpi/video.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/acpi/video.c b/drivers/acpi/video.c index 609262dc1258..a576575617d7 100644 --- a/drivers/acpi/video.c +++ b/drivers/acpi/video.c @@ -1687,10 +1687,6 @@ static int acpi_video_bus_add(struct acpi_device *device) set_bit(KEY_BRIGHTNESS_ZERO, input->keybit); set_bit(KEY_DISPLAY_OFF, input->keybit); - error = input_register_device(input); - if (error) - goto err_stop_video; - printk(KERN_INFO PREFIX "%s [%s] (multi-head: %s rom: %s post: %s)\n", ACPI_VIDEO_DEVICE_NAME, acpi_device_bid(device), video->flags.multihead ? 
"yes" : "no", @@ -1701,12 +1697,16 @@ static int acpi_video_bus_add(struct acpi_device *device) video->pm_nb.priority = 0; error = register_pm_notifier(&video->pm_nb); if (error) - goto err_unregister_input_dev; + goto err_stop_video; + + error = input_register_device(input); + if (error) + goto err_unregister_pm_notifier; return 0; - err_unregister_input_dev: - input_unregister_device(input); + err_unregister_pm_notifier: + unregister_pm_notifier(&video->pm_nb); err_stop_video: acpi_video_bus_stop_devices(video); err_free_input_dev: From a4dff3043c231d57f982af635c9d2192ee40e5ae Mon Sep 17 00:00:00 2001 From: Nicholas Bellinger Date: Wed, 30 May 2012 16:25:41 -0700 Subject: [PATCH 072/178] target/file: Use O_DSYNC by default for FILEIO backends Convert to use O_DSYNC for all cases at FILEIO backend creation time to avoid the extra syncing of pure timestamp updates with legacy O_SYNC during default operation as recommended by hch. Continue to do this independently of Write Cache Enable (WCE) bit, as WCE=0 is currently the default for all backend devices and enabled by user on per device basis via attrib/emulate_write_cache. This patch drops the now unnecessary fd_buffered_io= token usage that was originally signalling when to explictly disable O_SYNC at backend creation time for buffered I/O operation. This can end up being dangerous for a number of reasons during physical node failure, so go ahead and drop this option for now when O_DSYNC is used as the default. Also allow explict FUA WRITEs -> vfs_fsync_range() call to function in fd_execute_cmd() independently of WCE bit setting. Reported-by: Christoph Hellwig Cc: Linus Torvalds Cc: Signed-off-by: Nicholas Bellinger --- drivers/target/target_core_file.c | 70 ++++++++----------------------- drivers/target/target_core_file.h | 1 - 2 files changed, 17 insertions(+), 54 deletions(-) diff --git a/drivers/target/target_core_file.c b/drivers/target/target_core_file.c index 686dba189f8e..9f99d0404908 100644 --- a/drivers/target/target_core_file.c +++ b/drivers/target/target_core_file.c @@ -133,16 +133,11 @@ static struct se_device *fd_create_virtdevice( ret = PTR_ERR(dev_p); goto fail; } - - /* O_DIRECT too? */ - flags = O_RDWR | O_CREAT | O_LARGEFILE; - /* - * If fd_buffered_io=1 has not been set explicitly (the default), - * use O_SYNC to force FILEIO writes to disk. + * Use O_DSYNC by default instead of O_SYNC to forgo syncing + * of pure timestamp updates. 
*/ - if (!(fd_dev->fbd_flags & FDBD_USE_BUFFERED_IO)) - flags |= O_SYNC; + flags = O_RDWR | O_CREAT | O_LARGEFILE | O_DSYNC; file = filp_open(dev_p, flags, 0600); if (IS_ERR(file)) { @@ -380,23 +375,6 @@ static void fd_emulate_sync_cache(struct se_cmd *cmd) } } -static void fd_emulate_write_fua(struct se_cmd *cmd) -{ - struct se_device *dev = cmd->se_dev; - struct fd_dev *fd_dev = dev->dev_ptr; - loff_t start = cmd->t_task_lba * - dev->se_sub_dev->se_dev_attrib.block_size; - loff_t end = start + cmd->data_length; - int ret; - - pr_debug("FILEIO: FUA WRITE LBA: %llu, bytes: %u\n", - cmd->t_task_lba, cmd->data_length); - - ret = vfs_fsync_range(fd_dev->fd_file, start, end, 1); - if (ret != 0) - pr_err("FILEIO: vfs_fsync_range() failed: %d\n", ret); -} - static int fd_execute_cmd(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents, enum dma_data_direction data_direction) { @@ -411,19 +389,21 @@ static int fd_execute_cmd(struct se_cmd *cmd, struct scatterlist *sgl, ret = fd_do_readv(cmd, sgl, sgl_nents); } else { ret = fd_do_writev(cmd, sgl, sgl_nents); - + /* + * Perform implict vfs_fsync_range() for fd_do_writev() ops + * for SCSI WRITEs with Forced Unit Access (FUA) set. + * Allow this to happen independent of WCE=0 setting. + */ if (ret > 0 && - dev->se_sub_dev->se_dev_attrib.emulate_write_cache > 0 && dev->se_sub_dev->se_dev_attrib.emulate_fua_write > 0 && (cmd->se_cmd_flags & SCF_FUA)) { - /* - * We might need to be a bit smarter here - * and return some sense data to let the initiator - * know the FUA WRITE cache sync failed..? - */ - fd_emulate_write_fua(cmd); - } + struct fd_dev *fd_dev = dev->dev_ptr; + loff_t start = cmd->t_task_lba * + dev->se_sub_dev->se_dev_attrib.block_size; + loff_t end = start + cmd->data_length; + vfs_fsync_range(fd_dev->fd_file, start, end, 1); + } } if (ret < 0) { @@ -442,7 +422,6 @@ enum { static match_table_t tokens = { {Opt_fd_dev_name, "fd_dev_name=%s"}, {Opt_fd_dev_size, "fd_dev_size=%s"}, - {Opt_fd_buffered_io, "fd_buffered_io=%d"}, {Opt_err, NULL} }; @@ -454,7 +433,7 @@ static ssize_t fd_set_configfs_dev_params( struct fd_dev *fd_dev = se_dev->se_dev_su_ptr; char *orig, *ptr, *arg_p, *opts; substring_t args[MAX_OPT_ARGS]; - int ret = 0, arg, token; + int ret = 0, token; opts = kstrdup(page, GFP_KERNEL); if (!opts) @@ -498,19 +477,6 @@ static ssize_t fd_set_configfs_dev_params( " bytes\n", fd_dev->fd_dev_size); fd_dev->fbd_flags |= FBDF_HAS_SIZE; break; - case Opt_fd_buffered_io: - match_int(args, &arg); - if (arg != 1) { - pr_err("bogus fd_buffered_io=%d value\n", arg); - ret = -EINVAL; - goto out; - } - - pr_debug("FILEIO: Using buffered I/O" - " operations for struct fd_dev\n"); - - fd_dev->fbd_flags |= FDBD_USE_BUFFERED_IO; - break; default: break; } @@ -542,10 +508,8 @@ static ssize_t fd_show_configfs_dev_params( ssize_t bl = 0; bl = sprintf(b + bl, "TCM FILEIO ID: %u", fd_dev->fd_dev_id); - bl += sprintf(b + bl, " File: %s Size: %llu Mode: %s\n", - fd_dev->fd_dev_name, fd_dev->fd_dev_size, - (fd_dev->fbd_flags & FDBD_USE_BUFFERED_IO) ? 
- "Buffered" : "Synchronous"); + bl += sprintf(b + bl, " File: %s Size: %llu Mode: O_DSYNC\n", + fd_dev->fd_dev_name, fd_dev->fd_dev_size); return bl; } diff --git a/drivers/target/target_core_file.h b/drivers/target/target_core_file.h index fbd59ef7d8be..70ce7fd7111d 100644 --- a/drivers/target/target_core_file.h +++ b/drivers/target/target_core_file.h @@ -14,7 +14,6 @@ #define FBDF_HAS_PATH 0x01 #define FBDF_HAS_SIZE 0x02 -#define FDBD_USE_BUFFERED_IO 0x04 struct fd_dev { u32 fbd_flags; From ca4620853a5b578725edec1fc8f1345db3a823ab Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Fri, 1 Jun 2012 23:28:23 -0700 Subject: [PATCH 073/178] MAINTAINERS: Update my e-mail address Maintainer activities moved to home server. Update e-mail address to reflect reality. Signed-off-by: Guenter Roeck Acked-by: Jean Delvare --- MAINTAINERS | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 55f0fda602ec..5b75b9706d2e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2270,7 +2270,7 @@ F: include/linux/device-mapper.h F: include/linux/dm-*.h DIOLAN U2C-12 I2C DRIVER -M: Guenter Roeck +M: Guenter Roeck L: linux-i2c@vger.kernel.org S: Maintained F: drivers/i2c/busses/i2c-diolan-u2c.c @@ -3138,7 +3138,7 @@ F: drivers/tty/hvc/ HARDWARE MONITORING M: Jean Delvare -M: Guenter Roeck +M: Guenter Roeck L: lm-sensors@lm-sensors.org W: http://www.lm-sensors.org/ T: quilt kernel.org/pub/linux/kernel/people/jdelvare/linux-2.6/jdelvare-hwmon/ @@ -4411,6 +4411,13 @@ S: Orphan F: drivers/video/matrox/matroxfb_* F: include/linux/matroxfb.h +MAX16065 HARDWARE MONITOR DRIVER +M: Guenter Roeck +L: lm-sensors@lm-sensors.org +S: Maintained +F: Documentation/hwmon/max16065 +F: drivers/hwmon/max16065.c + MAX6650 HARDWARE MONITOR AND FAN CONTROLLER DRIVER M: "Hans J. Koch" L: lm-sensors@lm-sensors.org @@ -5149,7 +5156,7 @@ F: drivers/leds/leds-pca9532.c F: include/linux/leds-pca9532.h PCA9541 I2C BUS MASTER SELECTOR DRIVER -M: Guenter Roeck +M: Guenter Roeck L: linux-i2c@vger.kernel.org S: Maintained F: drivers/i2c/muxes/i2c-mux-pca9541.c @@ -5299,7 +5306,7 @@ F: drivers/video/fb-puv3.c F: drivers/rtc/rtc-puv3.c PMBUS HARDWARE MONITORING DRIVERS -M: Guenter Roeck +M: Guenter Roeck L: lm-sensors@lm-sensors.org W: http://www.lm-sensors.org/ W: http://www.roeck-us.net/linux/drivers/ From 2f9d3df8aa1cc3c6db5cfa0bad3f0745e04cc27d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 3 Jun 2012 14:50:19 -0700 Subject: [PATCH 074/178] vfs: move inode stat information closer together The comment above it says "Stat data, not accessed from path walking", but in fact some of inode fields we use for the common stat data was way down at the end of the inode, causing unnecessary cache misses for the common stat operations. The inode structure is pretty big, and this can change padding depending on field width, but at least on the common 64-bit configurations this doesn't change the size. Some of our inode layout has historically been to tro to avoid unnecessary padding fields, but cache locality is at least as important for layout, if not more. Noticed by looking at kernel profiles, and noticing that the "i_blkbits" access stood out like a sore thumb. 
Signed-off-by: Linus Torvalds --- include/linux/fs.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index 51978ed43e97..17fd887c798f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -802,13 +802,14 @@ struct inode { unsigned int __i_nlink; }; dev_t i_rdev; + loff_t i_size; struct timespec i_atime; struct timespec i_mtime; struct timespec i_ctime; spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */ unsigned short i_bytes; + unsigned int i_blkbits; blkcnt_t i_blocks; - loff_t i_size; #ifdef __NEED_I_SIZE_ORDERED seqcount_t i_size_seqcount; @@ -828,9 +829,8 @@ struct inode { struct list_head i_dentry; struct rcu_head i_rcu; }; - atomic_t i_count; - unsigned int i_blkbits; u64 i_version; + atomic_t i_count; atomic_t i_dio_count; atomic_t i_writecount; const struct file_operations *i_fop; /* former ->i_op->default_file_ops */ From 752dc185dacba1edcba425e67fc6df3c7793a5c3 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 2 Jun 2012 00:27:47 -0700 Subject: [PATCH 075/178] mm: fix warning in __set_page_dirty_nobuffers New tmpfs use of !PageUptodate pages for fallocate() is triggering the WARNING: at mm/page-writeback.c:1990 when __set_page_dirty_nobuffers() is called from migrate_page_copy() for compaction. It is anomalous that migration should use __set_page_dirty_nobuffers() on an address_space that does not participate in dirty and writeback accounting; and this has also been observed to insert surprising dirty tags into a tmpfs radix_tree, despite tmpfs not using tags at all. We should probably give migrate_page_copy() a better way to preserve the tag and migrate accounting info, when mapping_cap_account_dirty(). But that needs some more work: so in the interim, avoid the warning by using a simple SetPageDirty on PageSwapBacked pages. Reported-and-tested-by: Dave Jones Signed-off-by: Hugh Dickins Signed-off-by: Linus Torvalds --- mm/migrate.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/migrate.c b/mm/migrate.c index ab81d482ae6f..be26d5cbe56b 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -436,7 +436,10 @@ void migrate_page_copy(struct page *newpage, struct page *page) * is actually a signal that all of the page has become dirty. * Whereas only part of our page may be dirty. */ - __set_page_dirty_nobuffers(newpage); + if (PageSwapBacked(page)) + SetPageDirty(newpage); + else + __set_page_dirty_nobuffers(newpage); } mlock_migrate_page(newpage, page); From 68e3e92620c323703bc7db75c2ba15239ee85c39 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 3 Jun 2012 20:05:57 -0700 Subject: [PATCH 076/178] Revert "mm: compaction: handle incorrect MIGRATE_UNMOVABLE type pageblocks" This reverts commit 5ceb9ce6fe9462a298bb2cd5c9f1ca6cb80a0199. That commit seems to be the cause of the mm compaction list corruption issues that Dave Jones reported. The locking (or rather, absence thereof) is dubious, as is the use of the 'page' variable once it has been found to be outside the pageblock range. So revert it for now; we can revisit this for 3.6. If we even need to: as Minchan Kim says, "The patch wasn't a bug fix and even test workload was very theoretical".
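Condensed from the migrate_page_copy() hunk above, the distinction the dirty-bit fix relies on, restated with the reasoning as comments:

	if (PageSwapBacked(page))
		/*
		 * shmem/tmpfs: the mapping does no dirty accounting and
		 * uses no radix-tree tags, so setting the flag bit on the
		 * new page is all that is needed.
		 */
		SetPageDirty(newpage);
	else
		/*
		 * Ordinary pagecache with a backing store: also tag the
		 * radix tree and update dirty accounting, which is what
		 * __set_page_dirty_nobuffers() does, and what trips the
		 * warning when called for tmpfs.
		 */
		__set_page_dirty_nobuffers(newpage);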
Reported-and-tested-by: Dave Jones Acked-by: Hugh Dickins Acked-by: KOSAKI Motohiro Acked-by: Minchan Kim Cc: Bartlomiej Zolnierkiewicz Cc: Kyungmin Park Cc: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 19 ----- mm/compaction.c | 142 ++++++------------------------------- mm/internal.h | 9 +-- mm/page_alloc.c | 8 +-- 4 files changed, 28 insertions(+), 150 deletions(-) diff --git a/include/linux/compaction.h b/include/linux/compaction.h index e988037abd2a..51a90b7f2d60 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -1,8 +1,6 @@ #ifndef _LINUX_COMPACTION_H #define _LINUX_COMPACTION_H -#include - /* Return values for compact_zone() and try_to_compact_pages() */ /* compaction didn't start as it was not possible or direct reclaim was more suitable */ #define COMPACT_SKIPPED 0 @@ -13,23 +11,6 @@ /* The full zone was compacted */ #define COMPACT_COMPLETE 3 -/* - * compaction supports three modes - * - * COMPACT_ASYNC_MOVABLE uses asynchronous migration and only scans - * MIGRATE_MOVABLE pageblocks as migration sources and targets. - * COMPACT_ASYNC_UNMOVABLE uses asynchronous migration and only scans - * MIGRATE_MOVABLE pageblocks as migration sources. - * MIGRATE_UNMOVABLE pageblocks are scanned as potential migration - * targets and convers them to MIGRATE_MOVABLE if possible - * COMPACT_SYNC uses synchronous migration and scans all pageblocks - */ -enum compact_mode { - COMPACT_ASYNC_MOVABLE, - COMPACT_ASYNC_UNMOVABLE, - COMPACT_SYNC, -}; - #ifdef CONFIG_COMPACTION extern int sysctl_compact_memory; extern int sysctl_compaction_handler(struct ctl_table *table, int write, diff --git a/mm/compaction.c b/mm/compaction.c index 4ac338af5120..7ea259d82a99 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -236,7 +236,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, */ while (unlikely(too_many_isolated(zone))) { /* async migration should just abort */ - if (cc->mode != COMPACT_SYNC) + if (!cc->sync) return 0; congestion_wait(BLK_RW_ASYNC, HZ/10); @@ -304,8 +304,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, * satisfies the allocation */ pageblock_nr = low_pfn >> pageblock_order; - if (cc->mode != COMPACT_SYNC && - last_pageblock_nr != pageblock_nr && + if (!cc->sync && last_pageblock_nr != pageblock_nr && !migrate_async_suitable(get_pageblock_migratetype(page))) { low_pfn += pageblock_nr_pages; low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1; @@ -326,7 +325,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, continue; } - if (cc->mode != COMPACT_SYNC) + if (!cc->sync) mode |= ISOLATE_ASYNC_MIGRATE; lruvec = mem_cgroup_page_lruvec(page, zone); @@ -361,90 +360,27 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, #endif /* CONFIG_COMPACTION || CONFIG_CMA */ #ifdef CONFIG_COMPACTION -/* - * Returns true if MIGRATE_UNMOVABLE pageblock was successfully - * converted to MIGRATE_MOVABLE type, false otherwise. 
- */ -static bool rescue_unmovable_pageblock(struct page *page) -{ - unsigned long pfn, start_pfn, end_pfn; - struct page *start_page, *end_page; - pfn = page_to_pfn(page); - start_pfn = pfn & ~(pageblock_nr_pages - 1); - end_pfn = start_pfn + pageblock_nr_pages; - - start_page = pfn_to_page(start_pfn); - end_page = pfn_to_page(end_pfn); - - /* Do not deal with pageblocks that overlap zones */ - if (page_zone(start_page) != page_zone(end_page)) - return false; - - for (page = start_page, pfn = start_pfn; page < end_page; pfn++, - page++) { - if (!pfn_valid_within(pfn)) - continue; - - if (PageBuddy(page)) { - int order = page_order(page); - - pfn += (1 << order) - 1; - page += (1 << order) - 1; - - continue; - } else if (page_count(page) == 0 || PageLRU(page)) - continue; - - return false; - } - - set_pageblock_migratetype(page, MIGRATE_MOVABLE); - move_freepages_block(page_zone(page), page, MIGRATE_MOVABLE); - return true; -} - -enum smt_result { - GOOD_AS_MIGRATION_TARGET, - FAIL_UNMOVABLE_TARGET, - FAIL_BAD_TARGET, -}; - -/* - * Returns GOOD_AS_MIGRATION_TARGET if the page is within a block - * suitable for migration to, FAIL_UNMOVABLE_TARGET if the page - * is within a MIGRATE_UNMOVABLE block, FAIL_BAD_TARGET otherwise. - */ -static enum smt_result suitable_migration_target(struct page *page, - struct compact_control *cc) +/* Returns true if the page is within a block suitable for migration to */ +static bool suitable_migration_target(struct page *page) { int migratetype = get_pageblock_migratetype(page); /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */ if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE) - return FAIL_BAD_TARGET; + return false; /* If the page is a large free page, then allow migration */ if (PageBuddy(page) && page_order(page) >= pageblock_order) - return GOOD_AS_MIGRATION_TARGET; + return true; /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ - if (cc->mode != COMPACT_ASYNC_UNMOVABLE && - migrate_async_suitable(migratetype)) - return GOOD_AS_MIGRATION_TARGET; - - if (cc->mode == COMPACT_ASYNC_MOVABLE && - migratetype == MIGRATE_UNMOVABLE) - return FAIL_UNMOVABLE_TARGET; - - if (cc->mode != COMPACT_ASYNC_MOVABLE && - migratetype == MIGRATE_UNMOVABLE && - rescue_unmovable_pageblock(page)) - return GOOD_AS_MIGRATION_TARGET; + if (migrate_async_suitable(migratetype)) + return true; /* Otherwise skip the block */ - return FAIL_BAD_TARGET; + return false; } /* @@ -477,13 +413,6 @@ static void isolate_freepages(struct zone *zone, zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; - /* - * isolate_freepages() may be called more than once during - * compact_zone_order() run and we want only the most recent - * count. - */ - cc->nr_pageblocks_skipped = 0; - /* * Isolate free pages until enough are available to migrate the * pages on cc->migratepages. We stop searching if the migrate @@ -492,7 +421,6 @@ static void isolate_freepages(struct zone *zone, for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages; pfn -= pageblock_nr_pages) { unsigned long isolated; - enum smt_result ret; if (!pfn_valid(pfn)) continue; @@ -509,12 +437,9 @@ static void isolate_freepages(struct zone *zone, continue; /* Check the block is suitable for migration */ - ret = suitable_migration_target(page, cc); - if (ret != GOOD_AS_MIGRATION_TARGET) { - if (ret == FAIL_UNMOVABLE_TARGET) - cc->nr_pageblocks_skipped++; + if (!suitable_migration_target(page)) continue; - } + /* * Found a block suitable for isolating free pages from. 
Now * we disabled interrupts, double check things are ok and @@ -523,14 +448,12 @@ static void isolate_freepages(struct zone *zone, */ isolated = 0; spin_lock_irqsave(&zone->lock, flags); - ret = suitable_migration_target(page, cc); - if (ret == GOOD_AS_MIGRATION_TARGET) { + if (suitable_migration_target(page)) { end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn); isolated = isolate_freepages_block(pfn, end_pfn, freelist, false); nr_freepages += isolated; - } else if (ret == FAIL_UNMOVABLE_TARGET) - cc->nr_pageblocks_skipped++; + } spin_unlock_irqrestore(&zone->lock, flags); /* @@ -762,9 +685,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) nr_migrate = cc->nr_migratepages; err = migrate_pages(&cc->migratepages, compaction_alloc, - (unsigned long)&cc->freepages, false, - (cc->mode == COMPACT_SYNC) ? MIGRATE_SYNC_LIGHT - : MIGRATE_ASYNC); + (unsigned long)cc, false, + cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC); update_nr_listpages(cc); nr_remaining = cc->nr_migratepages; @@ -793,8 +715,7 @@ out: static unsigned long compact_zone_order(struct zone *zone, int order, gfp_t gfp_mask, - enum compact_mode mode, - unsigned long *nr_pageblocks_skipped) + bool sync) { struct compact_control cc = { .nr_freepages = 0, @@ -802,17 +723,12 @@ static unsigned long compact_zone_order(struct zone *zone, .order = order, .migratetype = allocflags_to_migratetype(gfp_mask), .zone = zone, - .mode = mode, + .sync = sync, }; - unsigned long rc; - INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); - rc = compact_zone(zone, &cc); - *nr_pageblocks_skipped = cc.nr_pageblocks_skipped; - - return rc; + return compact_zone(zone, &cc); } int sysctl_extfrag_threshold = 500; @@ -837,8 +753,6 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, struct zoneref *z; struct zone *zone; int rc = COMPACT_SKIPPED; - unsigned long nr_pageblocks_skipped; - enum compact_mode mode; /* * Check whether it is worth even starting compaction. The order check is @@ -855,22 +769,12 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, nodemask) { int status; - mode = sync ? COMPACT_SYNC : COMPACT_ASYNC_MOVABLE; -retry: - status = compact_zone_order(zone, order, gfp_mask, mode, - &nr_pageblocks_skipped); + status = compact_zone_order(zone, order, gfp_mask, sync); rc = max(status, rc); /* If a normal allocation would succeed, stop compacting */ if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0)) break; - - if (rc == COMPACT_COMPLETE && mode == COMPACT_ASYNC_MOVABLE) { - if (nr_pageblocks_skipped) { - mode = COMPACT_ASYNC_UNMOVABLE; - goto retry; - } - } } return rc; @@ -904,7 +808,7 @@ static int __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) if (ok && cc->order > zone->compact_order_failed) zone->compact_order_failed = cc->order + 1; /* Currently async compaction is never deferred. 
*/ - else if (!ok && cc->mode == COMPACT_SYNC) + else if (!ok && cc->sync) defer_compaction(zone, cc->order); } @@ -919,7 +823,7 @@ int compact_pgdat(pg_data_t *pgdat, int order) { struct compact_control cc = { .order = order, - .mode = COMPACT_ASYNC_MOVABLE, + .sync = false, }; return __compact_pgdat(pgdat, &cc); @@ -929,7 +833,7 @@ static int compact_node(int nid) { struct compact_control cc = { .order = -1, - .mode = COMPACT_SYNC, + .sync = true, }; return __compact_pgdat(NODE_DATA(nid), &cc); diff --git a/mm/internal.h b/mm/internal.h index 5cbb78190041..2ba87fbfb75b 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -94,9 +94,6 @@ extern void putback_lru_page(struct page *page); /* * in mm/page_alloc.c */ -extern void set_pageblock_migratetype(struct page *page, int migratetype); -extern int move_freepages_block(struct zone *zone, struct page *page, - int migratetype); extern void __free_pages_bootmem(struct page *page, unsigned int order); extern void prep_compound_page(struct page *page, unsigned long order); #ifdef CONFIG_MEMORY_FAILURE @@ -104,7 +101,6 @@ extern bool is_free_buddy_page(struct page *page); #endif #if defined CONFIG_COMPACTION || defined CONFIG_CMA -#include /* * in mm/compaction.c @@ -123,14 +119,11 @@ struct compact_control { unsigned long nr_migratepages; /* Number of pages to migrate */ unsigned long free_pfn; /* isolate_freepages search base */ unsigned long migrate_pfn; /* isolate_migratepages search base */ - enum compact_mode mode; /* Compaction mode */ + bool sync; /* Synchronous migration */ int order; /* order a direct compactor needs */ int migratetype; /* MOVABLE, RECLAIMABLE etc */ struct zone *zone; - - /* Number of UNMOVABLE destination pageblocks skipped during scan */ - unsigned long nr_pageblocks_skipped; }; unsigned long diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6092f331b32e..44030096da63 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -219,7 +219,7 @@ EXPORT_SYMBOL(nr_online_nodes); int page_group_by_mobility_disabled __read_mostly; -void set_pageblock_migratetype(struct page *page, int migratetype) +static void set_pageblock_migratetype(struct page *page, int migratetype) { if (unlikely(page_group_by_mobility_disabled)) @@ -954,8 +954,8 @@ static int move_freepages(struct zone *zone, return pages_moved; } -int move_freepages_block(struct zone *zone, struct page *page, - int migratetype) +static int move_freepages_block(struct zone *zone, struct page *page, + int migratetype) { unsigned long start_pfn, end_pfn; struct page *start_page, *end_page; @@ -5651,7 +5651,7 @@ static int __alloc_contig_migrate_range(unsigned long start, unsigned long end) .nr_migratepages = 0, .order = -1, .zone = page_zone(pfn_to_page(start)), - .mode = COMPACT_SYNC, + .sync = true, }; INIT_LIST_HEAD(&cc.migratepages); From d15cf7c129fa4ec4b44c52521e49ffafb9749029 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Sun, 3 Jun 2012 23:24:00 -0400 Subject: [PATCH 077/178] tools/power turbostat: fix un-intended affinity of forked program Linux 3.4 included a modification to turbostat to lower cross-call overhead by using scheduler affinity: 15aaa34654831e98dd76f7738b6c7f5d05a66430 (tools turbostat: reduce measurement overhead due to IPIs) In the use-case where turbostat forks a child program, that change had the un-intended side-effect of binding the child to the last cpu in the system. This change removed the binding before forking the child. This is a back-port of a fix already included in turbostat v2. 
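The fix boils down to restoring a permissive affinity mask before fork(); condensed from the hunks below (cpu_present_set is built once at startup):

	get_counters(cnt_even);		/* pins us to the last CPU visited */

	/* undo the pinning so the child inherits a full mask */
	sched_setaffinity(0, cpu_present_setsize, cpu_present_set);

	child_pid = fork();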
Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 28 +++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index ab2f682fd44c..d5d6a3dc9bac 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -73,8 +73,8 @@ int backwards_count; char *progname; int num_cpus; -cpu_set_t *cpu_mask; -size_t cpu_mask_size; +cpu_set_t *cpu_present_set, *cpu_mask; +size_t cpu_present_setsize, cpu_mask_size; struct counters { unsigned long long tsc; /* per thread */ @@ -103,6 +103,12 @@ struct timeval tv_even; struct timeval tv_odd; struct timeval tv_delta; +int mark_cpu_present(int pkg, int core, int cpu) +{ + CPU_SET_S(cpu, cpu_present_setsize, cpu_present_set); + return 0; +} + /* * cpu_mask_init(ncpus) * @@ -118,6 +124,18 @@ void cpu_mask_init(int ncpus) } cpu_mask_size = CPU_ALLOC_SIZE(ncpus); CPU_ZERO_S(cpu_mask_size, cpu_mask); + + /* + * Allocate and initialize cpu_present_set + */ + cpu_present_set = CPU_ALLOC(ncpus); + if (cpu_present_set == NULL) { + perror("CPU_ALLOC"); + exit(3); + } + cpu_present_setsize = CPU_ALLOC_SIZE(ncpus); + CPU_ZERO_S(cpu_present_setsize, cpu_present_set); + for_all_cpus(mark_cpu_present); } void cpu_mask_uninit() @@ -125,6 +143,9 @@ void cpu_mask_uninit() CPU_FREE(cpu_mask); cpu_mask = NULL; cpu_mask_size = 0; + CPU_FREE(cpu_present_set); + cpu_present_set = NULL; + cpu_present_setsize = 0; } int cpu_migrate(int cpu) @@ -1047,6 +1068,9 @@ int fork_it(char **argv) int retval; pid_t child_pid; get_counters(cnt_even); + + /* clear affinity side-effect of get_counters() */ + sched_setaffinity(0, cpu_present_setsize, cpu_present_set); gettimeofday(&tv_even, (struct timezone *)NULL); child_pid = fork(); From 650a37f32d2bc16fa802075be579802bc4ec4132 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Sun, 3 Jun 2012 23:34:44 -0400 Subject: [PATCH 078/178] tools/power turbostat: fix IVB support Initial IVB support went into turbostat in Linux-3.1: 553575f1ae048aa44682b46b3c51929a0b3ad337 (tools turbostat: recognize and run properly on IVB). However, when running on IVB, turbostat would fail to report the new counters added with SNB: c7, pc2 and pc7. So in scenarios where these counters are non-zero on IVB, turbostat would report erroneous residency results. In particular c7 time would be added to c1 time, since c1 time is calculated as "that which is left over". Also, turbostat reports MHz capabilities when passed the "-v" option, and it would incorrectly report 133MHz bclk instead of 100MHz bclk for IVB, which would inflate GHz reported with that option. This patch is a backport of a fix already included in turbostat v2.
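The model numbers matched below come straight from CPUID leaf 1; as a sketch of the standard decoding (using GCC's <cpuid.h> helper, not turbostat's exact code):

	#include <cpuid.h>

	unsigned int fms, ebx, ecx, edx, family, model;

	__cpuid(1, fms, ebx, ecx, edx);
	family = (fms >> 8) & 0xf;
	model  = (fms >> 4) & 0xf;
	if (family == 6 || family == 0xf)
		model += ((fms >> 16) & 0xf) << 4;	/* extended model bits */
	/* IvyBridge reports family 6, model 0x3a, the new case below */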
Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index d5d6a3dc9bac..16de7ad4850f 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -933,6 +933,8 @@ int is_snb(unsigned int family, unsigned int model) switch (model) { case 0x2A: case 0x2D: + case 0x3A: /* IVB */ + case 0x3D: /* IVB Xeon */ return 1; } return 0; From 2f07a6134f670755e6ce657d019c26305bfcef89 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Thu, 5 Jan 2012 10:47:58 -0200 Subject: [PATCH 079/178] drivers: acpi: Fix dependency for ACPI_HOTPLUG_CPU Fix the following build warning: warning: (ACPI_HOTPLUG_CPU) selects ACPI_CONTAINER which has unmet direct dependencies (ACPI && EXPERIMENTAL) Signed-off-by: Fabio Estevam Signed-off-by: Len Brown --- drivers/acpi/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig index 47768ff87343..80998958cf45 100644 --- a/drivers/acpi/Kconfig +++ b/drivers/acpi/Kconfig @@ -208,7 +208,7 @@ config ACPI_IPMI config ACPI_HOTPLUG_CPU bool - depends on ACPI_PROCESSOR && HOTPLUG_CPU + depends on EXPERIMENTAL && ACPI_PROCESSOR && HOTPLUG_CPU select ACPI_CONTAINER default y From 5041caa4d5e6dae418963de0c8f8a83f35e35dcf Mon Sep 17 00:00:00 2001 From: Kukjin Kim Date: Mon, 4 Jun 2012 13:07:07 +0900 Subject: [PATCH 080/178] gpio/samsung: fix the typo 'exynos5_xxx' instead of 'exonys5_xxx' Should be 'exynos5_xxx' instead of 'exonys5_xxx'. It happened at the commit 30b842889eea ("Merge tag 'soc2' of git://git.kernel.org/pub/scm/linux/kernel/git/arm/arm-soc") during v3.5 merge window. Signed-off-by: Kukjin Kim [ My bad - Linus ] Signed-off-by: Linus Torvalds --- drivers/gpio/gpio-samsung.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpio/gpio-samsung.c b/drivers/gpio/gpio-samsung.c index 7bb00448e13d..b6453d0e44ad 100644 --- a/drivers/gpio/gpio-samsung.c +++ b/drivers/gpio/gpio-samsung.c @@ -2833,7 +2833,7 @@ static __init void exynos5_gpiolib_init(void) } /* need to set base address for gpc4 */ - exonys5_gpios_1[11].base = gpio_base1 + 0x2E0; + exynos5_gpios_1[11].base = gpio_base1 + 0x2E0; /* need to set base address for gpx */ chip = &exynos5_gpios_1[21]; From 7ae30986dc63d214cb075a40f2cf205f0a7806cd Mon Sep 17 00:00:00 2001 From: Len Brown Date: Mon, 4 Jun 2012 00:29:11 -0400 Subject: [PATCH 081/178] ACPI: fix acpi_bus.h build warnings when ACPI is not enabled introduced in Linux-3.5-rc1 by 66886d6f8c9bcdee3d7fce5796dcffd6b4bc0b48 (ACPI: Add stubs for (un)register_acpi_bus_type) Fix header file warnings when CONFIG_ACPI is not enabled: include/acpi/acpi_bus.h:443:42: warning: 'struct acpi_bus_type' declared inside parameter list include/acpi/acpi_bus.h:443:42: warning: its scope is only this definition or declaration, which is probably not include/acpi/acpi_bus.h:444:44: warning: 'struct acpi_bus_type' declared inside parameter list Signed-off-by: Len Brown --- include/acpi/acpi_bus.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index b0d62820ada1..9e6e1c6eb60a 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -440,8 +440,8 @@ static inline int acpi_pm_device_sleep_wake(struct device *dev, bool enable) #else /* CONFIG_ACPI */ -static int register_acpi_bus_type(struct acpi_bus_type *bus) { return 0; } -static int 
unregister_acpi_bus_type(struct acpi_bus_type *bus) { return 0; } +static inline int register_acpi_bus_type(void *bus) { return 0; } +static inline int unregister_acpi_bus_type(void *bus) { return 0; } #endif /* CONFIG_ACPI */ From 71b43fd573a60972b2175df4927c4ee10d757004 Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Thu, 17 May 2012 17:51:53 -0300 Subject: [PATCH 082/178] RDMA/cxgb4: Fix crash when peer address is 0.0.0.0 When using rping -c -a 0.0.0.0 with iw_cxgb4, the system crashes when rdma_connect() is called. ip_dev_find() will return NULL, but pdev is accessed anyway. Checking that pdev is NULL and returning -ENODEV prevents the system from crashing. Signed-off-by: Thadeu Lima de Souza Cascardo Acked-by: Steve Wise Signed-off-by: Roland Dreier --- drivers/infiniband/hw/cxgb4/cm.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index 55ab284e22f2..b18870c455ad 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -1593,6 +1593,10 @@ static int import_ep(struct c4iw_ep *ep, __be32 peer_ip, struct dst_entry *dst, struct net_device *pdev; pdev = ip_dev_find(&init_net, peer_ip); + if (!pdev) { + err = -ENODEV; + goto out; + } ep->l2t = cxgb4_l2t_get(cdev->rdev.lldi.l2t, n, pdev, 0); if (!ep->l2t) From f1ae98da8525c6b8b1c301c3a2b0bd2b6515cca2 Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Wed, 30 May 2012 10:48:29 +0200 Subject: [PATCH 083/178] ARM: dma-mapping: remove unconditional dependency on CMA CMA has been enabled unconditionally on all ARMv6+ systems to solve the long-standing issue of double kernel mappings for all dma coherent buffers. This however created a dependency on CONFIG_EXPERIMENTAL for the whole ARM architecture, which should really be avoided. This patch removes this dependency and lets one use the old, well-tested dma-mapping implementation on ARMv6+ systems as well, without the need to use EXPERIMENTAL stuff.
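The conversion below replaces runtime cpu_architecture() checks and #ifdef blocks with IS_ENABLED(), which kconfig.h guarantees evaluates to a compile-time 0 or 1 (true for both built-in and modular options); the idiom, for reference:

	/*
	 * Both arms are compiled and type-checked; the dead one is
	 * discarded by the compiler, so no #ifdef is needed around
	 * the call sites.
	 */
	if (IS_ENABLED(CONFIG_CMA) && !IS_ENABLED(CONFIG_ARM_DMA_USE_IOMMU))
		return 0;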
Reported-by: Russell King Signed-off-by: Marek Szyprowski --- arch/arm/Kconfig | 1 - arch/arm/mm/dma-mapping.c | 10 ++++------ 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index b649c5904a4f..84449dd8f031 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -7,7 +7,6 @@ config ARM select HAVE_IDE if PCI || ISA || PCMCIA select HAVE_DMA_ATTRS select HAVE_DMA_CONTIGUOUS if (CPU_V6 || CPU_V6K || CPU_V7) - select CMA if (CPU_V6 || CPU_V6K || CPU_V7) select HAVE_MEMBLOCK select RTC_LIB select SYS_SUPPORTS_APM_EMULATION diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c index ea6b43154090..106c4c0ebccd 100644 --- a/arch/arm/mm/dma-mapping.c +++ b/arch/arm/mm/dma-mapping.c @@ -268,10 +268,8 @@ static int __init consistent_init(void) unsigned long base = consistent_base; unsigned long num_ptes = (CONSISTENT_END - base) >> PMD_SHIFT; -#ifndef CONFIG_ARM_DMA_USE_IOMMU - if (cpu_architecture() >= CPU_ARCH_ARMv6) + if (IS_ENABLED(CONFIG_CMA) && !IS_ENABLED(CONFIG_ARM_DMA_USE_IOMMU)) return 0; -#endif consistent_pte = kmalloc(num_ptes * sizeof(pte_t), GFP_KERNEL); if (!consistent_pte) { @@ -342,7 +340,7 @@ static int __init coherent_init(void) struct page *page; void *ptr; - if (cpu_architecture() < CPU_ARCH_ARMv6) + if (!IS_ENABLED(CONFIG_CMA)) return 0; ptr = __alloc_from_contiguous(NULL, size, prot, &page); @@ -704,7 +702,7 @@ static void *__dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, if (arch_is_coherent() || nommu()) addr = __alloc_simple_buffer(dev, size, gfp, &page); - else if (cpu_architecture() < CPU_ARCH_ARMv6) + else if (!IS_ENABLED(CONFIG_CMA)) addr = __alloc_remap_buffer(dev, size, gfp, prot, &page, caller); else if (gfp & GFP_ATOMIC) addr = __alloc_from_pool(dev, size, &page, caller); @@ -773,7 +771,7 @@ void arm_dma_free(struct device *dev, size_t size, void *cpu_addr, if (arch_is_coherent() || nommu()) { __dma_free_buffer(page, size); - } else if (cpu_architecture() < CPU_ARCH_ARMv6) { + } else if (!IS_ENABLED(CONFIG_CMA)) { __dma_free_remap(cpu_addr, size); __dma_free_buffer(page, size); } else { From 3aac6ff16a2097be668975fd51084df2e27e4999 Mon Sep 17 00:00:00 2001 From: Shlomo Pongratz Date: Thu, 24 May 2012 16:08:07 +0300 Subject: [PATCH 084/178] IB/mlx4: Fix EQ deallocation in legacy mode Commit e605b743f33d ("IB/mlx4: Increase the number of vectors (EQs) available for ULPs") didn't handle correctly the case where there aren't enough MSI-X vectors to increase the number of EQs, so only the legacy EQs are allocated. This results in an attempt to memset() to zero the EQ table which was never allocated and a kernel crash. Fix this by checking in the teardown flow if the table of EQs was ever allocated. Also remove some unneeded setting to zero of the EQ related fields in struct mlx4_ib_dev. Signed-off-by: Shlomo Pongratz Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx4/main.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index ee1c577238f7..8afea12c8d44 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -1084,12 +1084,9 @@ static void mlx4_ib_alloc_eqs(struct mlx4_dev *dev, struct mlx4_ib_dev *ibdev) int total_eqs = 0; int i, j, eq; - /* Init eq table */ - ibdev->eq_table = NULL; - ibdev->eq_added = 0; - - /* Legacy mode? 
*/ - if (dev->caps.comp_pool == 0) + /* Legacy mode or comp_pool is not large enough */ + if (dev->caps.comp_pool == 0 || + dev->caps.num_ports > dev->caps.comp_pool) return; eq_per_port = rounddown_pow_of_two(dev->caps.comp_pool/ @@ -1135,7 +1132,10 @@ static void mlx4_ib_alloc_eqs(struct mlx4_dev *dev, struct mlx4_ib_dev *ibdev) static void mlx4_ib_free_eqs(struct mlx4_dev *dev, struct mlx4_ib_dev *ibdev) { int i; - int total_eqs; + + /* no additional eqs were added */ + if (!ibdev->eq_table) + return; /* Reset the advertised EQ number */ ibdev->ib_dev.num_comp_vectors = dev->caps.num_comp_vectors; @@ -1148,12 +1148,7 @@ static void mlx4_ib_free_eqs(struct mlx4_dev *dev, struct mlx4_ib_dev *ibdev) mlx4_release_eq(dev, ibdev->eq_table[i]); } - total_eqs = dev->caps.num_comp_vectors + ibdev->eq_added; - memset(ibdev->eq_table, 0, total_eqs * sizeof(int)); kfree(ibdev->eq_table); - - ibdev->eq_table = NULL; - ibdev->eq_added = 0; } static void *mlx4_ib_add(struct mlx4_dev *dev) From c1bf94ec1e12d76838ad485158aecf208ebd8fb9 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 31 May 2012 17:38:11 +0200 Subject: [PATCH 085/178] iommu/amd: Cache pdev pointer to root-bridge At some point pci_get_bus_and_slot started to enable interrupts. Since this function is used in the amd_iommu_resume path it will enable interrupts on resume which causes a warning. The fix will use a cached pointer to the root-bridge to re-enable the IOMMU in case the BIOS is broken. Cc: stable@vger.kernel.org Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu_init.c | 13 +++++-------- drivers/iommu/amd_iommu_types.h | 3 +++ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c index c56790375e0f..542024ba6dba 100644 --- a/drivers/iommu/amd_iommu_init.c +++ b/drivers/iommu/amd_iommu_init.c @@ -1029,6 +1029,9 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) if (!iommu->dev) return 1; + iommu->root_pdev = pci_get_bus_and_slot(iommu->dev->bus->number, + PCI_DEVFN(0, 0)); + iommu->cap_ptr = h->cap_ptr; iommu->pci_seg = h->pci_seg; iommu->mmio_phys = h->mmio_phys; @@ -1323,20 +1326,16 @@ static void iommu_apply_resume_quirks(struct amd_iommu *iommu) { int i, j; u32 ioc_feature_control; - struct pci_dev *pdev = NULL; + struct pci_dev *pdev = iommu->root_pdev; /* RD890 BIOSes may not have completely reconfigured the iommu */ - if (!is_rd890_iommu(iommu->dev)) + if (!is_rd890_iommu(iommu->dev) || !pdev) return; /* * First, we need to ensure that the iommu is enabled. 
This is * controlled by a register in the northbridge */ - pdev = pci_get_bus_and_slot(iommu->dev->bus->number, PCI_DEVFN(0, 0)); - - if (!pdev) - return; /* Select Northbridge indirect register 0x75 and enable writing */ pci_write_config_dword(pdev, 0x60, 0x75 | (1 << 7)); @@ -1346,8 +1345,6 @@ static void iommu_apply_resume_quirks(struct amd_iommu *iommu) if (!(ioc_feature_control & 0x1)) pci_write_config_dword(pdev, 0x64, ioc_feature_control | 1); - pci_dev_put(pdev); - /* Restore the iommu BAR */ pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4, iommu->stored_addr_lo); diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h index 2452f3b71736..24355559a2ad 100644 --- a/drivers/iommu/amd_iommu_types.h +++ b/drivers/iommu/amd_iommu_types.h @@ -481,6 +481,9 @@ struct amd_iommu { /* Pointer to PCI device of this IOMMU */ struct pci_dev *dev; + /* Cache pdev to root device for resume quirks */ + struct pci_dev *root_pdev; + /* physical address of MMIO space */ u64 mmio_phys; /* virtual address of MMIO space */ From eee53537c476c947bf7faa1c916d2f5a0ae8ec93 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 1 Jun 2012 15:20:23 +0200 Subject: [PATCH 086/178] iommu/amd: Fix deadlock in ppr-handling error path In the error path of the ppr_notifier it can happen that the iommu->lock is taken recursively. This patch fixes the problem by releasing the iommu->lock before any notifier is invoked. This also requires moving the erratum workaround for the ppr-log (interrupt may be faster than data in the log) one function up. Cc: stable@vger.kernel.org # v3.3, v3.4 Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 71 ++++++++++++++++++++++++--------------- 1 file changed, 44 insertions(+), 27 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index d90a421e9cac..a2e418cba0ff 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -547,26 +547,12 @@ static void iommu_poll_events(struct amd_iommu *iommu) spin_unlock_irqrestore(&iommu->lock, flags); } -static void iommu_handle_ppr_entry(struct amd_iommu *iommu, u32 head) +static void iommu_handle_ppr_entry(struct amd_iommu *iommu, u64 *raw) { struct amd_iommu_fault fault; - volatile u64 *raw; - int i; INC_STATS_COUNTER(pri_requests); - raw = (u64 *)(iommu->ppr_log + head); - - /* - * Hardware bug: Interrupt may arrive before the entry is written to - * memory. If this happens we need to wait for the entry to arrive. - */ - for (i = 0; i < LOOP_TIMEOUT; ++i) { - if (PPR_REQ_TYPE(raw[0]) != 0) - break; - udelay(1); - } - if (PPR_REQ_TYPE(raw[0]) != PPR_REQ_FAULT) { pr_err_ratelimited("AMD-Vi: Unknown PPR request received\n"); return; @@ -578,12 +564,6 @@ static void iommu_handle_ppr_entry(struct amd_iommu *iommu, u32 head) fault.tag = PPR_TAG(raw[0]); fault.flags = PPR_FLAGS(raw[0]); - /* - * To detect the hardware bug we need to clear the entry - * to back to zero.
- */ - raw[0] = raw[1] = 0; - atomic_notifier_call_chain(&ppr_notifier, 0, &fault); } @@ -595,25 +575,62 @@ static void iommu_poll_ppr_log(struct amd_iommu *iommu) if (iommu->ppr_log == NULL) return; + /* enable ppr interrupts again */ + writel(MMIO_STATUS_PPR_INT_MASK, iommu->mmio_base + MMIO_STATUS_OFFSET); + spin_lock_irqsave(&iommu->lock, flags); head = readl(iommu->mmio_base + MMIO_PPR_HEAD_OFFSET); tail = readl(iommu->mmio_base + MMIO_PPR_TAIL_OFFSET); while (head != tail) { + volatile u64 *raw; + u64 entry[2]; + int i; - /* Handle PPR entry */ - iommu_handle_ppr_entry(iommu, head); + raw = (u64 *)(iommu->ppr_log + head); - /* Update and refresh ring-buffer state*/ + /* + * Hardware bug: Interrupt may arrive before the entry is + * written to memory. If this happens we need to wait for the + * entry to arrive. + */ + for (i = 0; i < LOOP_TIMEOUT; ++i) { + if (PPR_REQ_TYPE(raw[0]) != 0) + break; + udelay(1); + } + + /* Avoid memcpy function-call overhead */ + entry[0] = raw[0]; + entry[1] = raw[1]; + + /* + * To detect the hardware bug we need to clear the entry + * back to zero. + */ + raw[0] = raw[1] = 0UL; + + /* Update head pointer of hardware ring-buffer */ head = (head + PPR_ENTRY_SIZE) % PPR_LOG_SIZE; writel(head, iommu->mmio_base + MMIO_PPR_HEAD_OFFSET); + + /* + * Release iommu->lock because ppr-handling might need to + * re-aquire it + */ + spin_unlock_irqrestore(&iommu->lock, flags); + + /* Handle PPR entry */ + iommu_handle_ppr_entry(iommu, entry); + + spin_lock_irqsave(&iommu->lock, flags); + + /* Refresh ring-buffer information */ + head = readl(iommu->mmio_base + MMIO_PPR_HEAD_OFFSET); tail = readl(iommu->mmio_base + MMIO_PPR_TAIL_OFFSET); } - /* enable ppr interrupts again */ - writel(MMIO_STATUS_PPR_INT_MASK, iommu->mmio_base + MMIO_STATUS_OFFSET); - spin_unlock_irqrestore(&iommu->lock, flags); } From ae58d1e406986f31d1e88b32f5ac601506c196d8 Mon Sep 17 00:00:00 2001 From: Stephen Warren Date: Fri, 18 May 2012 09:29:34 -0600 Subject: [PATCH 087/178] i2c: Add generic I2C multiplexer using pinctrl API This is useful for SoCs whose I2C module's signals can be routed to different sets of pins at run-time, using the pinctrl API. 
+-----+ +-----+ | dev | | dev | +------------------------+ +-----+ +-----+ | SoC | | | | /----|------+--------+ | +---+ +------+ | child bus A, on first set of pins | |I2C|---|Pinmux| | | +---+ +------+ | child bus B, on second set of pins | \----|------+--------+--------+ | | | | | +------------------------+ +-----+ +-----+ +-----+ | dev | | dev | | dev | +-----+ +-----+ +-----+ Signed-off-by: Stephen Warren Acked-by: Linus Walleij Acked-by: Rob Herring Signed-off-by: Wolfram Sang --- .../bindings/i2c/i2c-mux-pinctrl.txt | 93 ++++++ drivers/i2c/muxes/Kconfig | 12 + drivers/i2c/muxes/Makefile | 1 + drivers/i2c/muxes/i2c-mux-pinctrl.c | 279 ++++++++++++++++++ include/linux/i2c-mux-pinctrl.h | 41 +++ 5 files changed, 426 insertions(+) create mode 100644 Documentation/devicetree/bindings/i2c/i2c-mux-pinctrl.txt create mode 100644 drivers/i2c/muxes/i2c-mux-pinctrl.c create mode 100644 include/linux/i2c-mux-pinctrl.h diff --git a/Documentation/devicetree/bindings/i2c/i2c-mux-pinctrl.txt b/Documentation/devicetree/bindings/i2c/i2c-mux-pinctrl.txt new file mode 100644 index 000000000000..ae8af1694e95 --- /dev/null +++ b/Documentation/devicetree/bindings/i2c/i2c-mux-pinctrl.txt @@ -0,0 +1,93 @@ +Pinctrl-based I2C Bus Mux + +This binding describes an I2C bus multiplexer that uses pin multiplexing to +route the I2C signals, and represents the pin multiplexing configuration +using the pinctrl device tree bindings. + + +-----+ +-----+ + | dev | | dev | + +------------------------+ +-----+ +-----+ + | SoC | | | + | /----|------+--------+ + | +---+ +------+ | child bus A, on first set of pins + | |I2C|---|Pinmux| | + | +---+ +------+ | child bus B, on second set of pins + | \----|------+--------+--------+ + | | | | | + +------------------------+ +-----+ +-----+ +-----+ + | dev | | dev | | dev | + +-----+ +-----+ +-----+ + +Required properties: +- compatible: i2c-mux-pinctrl +- i2c-parent: The phandle of the I2C bus that this multiplexer's master-side + port is connected to. + +Also required are: + +* Standard pinctrl properties that specify the pin mux state for each child + bus. See ../pinctrl/pinctrl-bindings.txt. + +* Standard I2C mux properties. See mux.txt in this directory. + +* I2C child bus nodes. See mux.txt in this directory. + +For each named state defined in the pinctrl-names property, an I2C child bus +will be created. I2C child bus numbers are assigned based on the index into +the pinctrl-names property. + +The only exception is that no bus will be created for a state named "idle". If +such a state is defined, it must be the last entry in pinctrl-names. For +example: + + pinctrl-names = "ddc", "pta", "idle" -> ddc = bus 0, pta = bus 1 + pinctrl-names = "ddc", "idle", "pta" -> Invalid ("idle" not last) + pinctrl-names = "idle", "ddc", "pta" -> Invalid ("idle" not last) + +Whenever an access is made to a device on a child bus, the relevant pinctrl +state will be programmed into hardware. + +If an idle state is defined, whenever an access is not being made to a device +on a child bus, the idle pinctrl state will be programmed into hardware. + +If an idle state is not defined, the most recently used pinctrl state will be +left programmed into hardware whenever no access is being made of a device on +a child bus. 
+ +Example: + + i2cmux { + compatible = "i2c-mux-pinctrl"; + #address-cells = <1>; + #size-cells = <0>; + + i2c-parent = <&i2c1>; + + pinctrl-names = "ddc", "pta", "idle"; + pinctrl-0 = <&state_i2cmux_ddc>; + pinctrl-1 = <&state_i2cmux_pta>; + pinctrl-2 = <&state_i2cmux_idle>; + + i2c@0 { + reg = <0>; + #address-cells = <1>; + #size-cells = <0>; + + eeprom { + compatible = "eeprom"; + reg = <0x50>; + }; + }; + + i2c@1 { + reg = <1>; + #address-cells = <1>; + #size-cells = <0>; + + eeprom { + compatible = "eeprom"; + reg = <0x50>; + }; + }; + }; + diff --git a/drivers/i2c/muxes/Kconfig b/drivers/i2c/muxes/Kconfig index beb2491db274..a0edd9854218 100644 --- a/drivers/i2c/muxes/Kconfig +++ b/drivers/i2c/muxes/Kconfig @@ -37,4 +37,16 @@ config I2C_MUX_PCA954x This driver can also be built as a module. If so, the module will be called i2c-mux-pca954x. +config I2C_MUX_PINCTRL + tristate "pinctrl-based I2C multiplexer" + depends on PINCTRL + help + If you say yes to this option, support will be included for an I2C + multiplexer that uses the pinctrl subsystem, i.e. pin multiplexing. + This is useful for SoCs whose I2C module's signals can be routed to + different sets of pins at run-time. + + This driver can also be built as a module. If so, the module will be + called pinctrl-i2cmux. + endmenu diff --git a/drivers/i2c/muxes/Makefile b/drivers/i2c/muxes/Makefile index 5826249b29ca..76da8692afff 100644 --- a/drivers/i2c/muxes/Makefile +++ b/drivers/i2c/muxes/Makefile @@ -4,5 +4,6 @@ obj-$(CONFIG_I2C_MUX_GPIO) += i2c-mux-gpio.o obj-$(CONFIG_I2C_MUX_PCA9541) += i2c-mux-pca9541.o obj-$(CONFIG_I2C_MUX_PCA954x) += i2c-mux-pca954x.o +obj-$(CONFIG_I2C_MUX_PINCTRL) += i2c-mux-pinctrl.o ccflags-$(CONFIG_I2C_DEBUG_BUS) := -DDEBUG diff --git a/drivers/i2c/muxes/i2c-mux-pinctrl.c b/drivers/i2c/muxes/i2c-mux-pinctrl.c new file mode 100644 index 000000000000..46a669763476 --- /dev/null +++ b/drivers/i2c/muxes/i2c-mux-pinctrl.c @@ -0,0 +1,279 @@ +/* + * I2C multiplexer using pinctrl API + * + * Copyright (c) 2012, NVIDIA CORPORATION. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct i2c_mux_pinctrl { + struct device *dev; + struct i2c_mux_pinctrl_platform_data *pdata; + struct pinctrl *pinctrl; + struct pinctrl_state **states; + struct pinctrl_state *state_idle; + struct i2c_adapter *parent; + struct i2c_adapter **busses; +}; + +static int i2c_mux_pinctrl_select(struct i2c_adapter *adap, void *data, + u32 chan) +{ + struct i2c_mux_pinctrl *mux = data; + + return pinctrl_select_state(mux->pinctrl, mux->states[chan]); +} + +static int i2c_mux_pinctrl_deselect(struct i2c_adapter *adap, void *data, + u32 chan) +{ + struct i2c_mux_pinctrl *mux = data; + + return pinctrl_select_state(mux->pinctrl, mux->state_idle); +} + +#ifdef CONFIG_OF +static int i2c_mux_pinctrl_parse_dt(struct i2c_mux_pinctrl *mux, + struct platform_device *pdev) +{ + struct device_node *np = pdev->dev.of_node; + int num_names, i, ret; + struct device_node *adapter_np; + struct i2c_adapter *adapter; + + if (!np) + return 0; + + mux->pdata = devm_kzalloc(&pdev->dev, sizeof(*mux->pdata), GFP_KERNEL); + if (!mux->pdata) { + dev_err(mux->dev, + "Cannot allocate i2c_mux_pinctrl_platform_data\n"); + return -ENOMEM; + } + + num_names = of_property_count_strings(np, "pinctrl-names"); + if (num_names < 0) { + dev_err(mux->dev, "Cannot parse pinctrl-names: %d\n", + num_names); + return num_names; + } + + mux->pdata->pinctrl_states = devm_kzalloc(&pdev->dev, + sizeof(*mux->pdata->pinctrl_states) * num_names, + GFP_KERNEL); + if (!mux->pdata->pinctrl_states) { + dev_err(mux->dev, "Cannot allocate pinctrl_states\n"); + return -ENOMEM; + } + + for (i = 0; i < num_names; i++) { + ret = of_property_read_string_index(np, "pinctrl-names", i, + &mux->pdata->pinctrl_states[mux->pdata->bus_count]); + if (ret < 0) { + dev_err(mux->dev, "Cannot parse pinctrl-names: %d\n", + ret); + return ret; + } + if (!strcmp(mux->pdata->pinctrl_states[mux->pdata->bus_count], + "idle")) { + if (i != num_names - 1) { + dev_err(mux->dev, "idle state must be last\n"); + return -EINVAL; + } + mux->pdata->pinctrl_state_idle = "idle"; + } else { + mux->pdata->bus_count++; + } + } + + adapter_np = of_parse_phandle(np, "i2c-parent", 0); + if (!adapter_np) { + dev_err(mux->dev, "Cannot parse i2c-parent\n"); + return -ENODEV; + } + adapter = of_find_i2c_adapter_by_node(adapter_np); + if (!adapter) { + dev_err(mux->dev, "Cannot find parent bus\n"); + return -ENODEV; + } + mux->pdata->parent_bus_num = i2c_adapter_id(adapter); + put_device(&adapter->dev); + + return 0; +} +#else +static inline int i2c_mux_pinctrl_parse_dt(struct i2c_mux_pinctrl *mux, + struct platform_device *pdev) +{ + return 0; +} +#endif + +static int __devinit i2c_mux_pinctrl_probe(struct platform_device *pdev) +{ + struct i2c_mux_pinctrl *mux; + int (*deselect)(struct i2c_adapter *, void *, u32); + int i, ret; + + mux = devm_kzalloc(&pdev->dev, sizeof(*mux), GFP_KERNEL); + if (!mux) { + dev_err(&pdev->dev, "Cannot allocate i2c_mux_pinctrl\n"); + ret = -ENOMEM; + goto err; + } + platform_set_drvdata(pdev, mux); + + mux->dev = &pdev->dev; + + mux->pdata = pdev->dev.platform_data; + if (!mux->pdata) { + ret = i2c_mux_pinctrl_parse_dt(mux, pdev); + if (ret < 0) + goto err; + } + if (!mux->pdata) { + dev_err(&pdev->dev, "Missing platform data\n"); + ret = -ENODEV; + goto err; + } + + mux->states = devm_kzalloc(&pdev->dev, + sizeof(*mux->states) * mux->pdata->bus_count, + GFP_KERNEL); + if (!mux->states) { + dev_err(&pdev->dev, "Cannot allocate states\n"); + ret = -ENOMEM; + goto 
err; + } + + mux->busses = devm_kzalloc(&pdev->dev, + sizeof(mux->busses) * mux->pdata->bus_count, + GFP_KERNEL); + if (!mux->states) { + dev_err(&pdev->dev, "Cannot allocate busses\n"); + ret = -ENOMEM; + goto err; + } + + mux->pinctrl = devm_pinctrl_get(&pdev->dev); + if (IS_ERR(mux->pinctrl)) { + ret = PTR_ERR(mux->pinctrl); + dev_err(&pdev->dev, "Cannot get pinctrl: %d\n", ret); + goto err; + } + for (i = 0; i < mux->pdata->bus_count; i++) { + mux->states[i] = pinctrl_lookup_state(mux->pinctrl, + mux->pdata->pinctrl_states[i]); + if (IS_ERR(mux->states[i])) { + ret = PTR_ERR(mux->states[i]); + dev_err(&pdev->dev, + "Cannot look up pinctrl state %s: %d\n", + mux->pdata->pinctrl_states[i], ret); + goto err; + } + } + if (mux->pdata->pinctrl_state_idle) { + mux->state_idle = pinctrl_lookup_state(mux->pinctrl, + mux->pdata->pinctrl_state_idle); + if (IS_ERR(mux->state_idle)) { + ret = PTR_ERR(mux->state_idle); + dev_err(&pdev->dev, + "Cannot look up pinctrl state %s: %d\n", + mux->pdata->pinctrl_state_idle, ret); + goto err; + } + + deselect = i2c_mux_pinctrl_deselect; + } else { + deselect = NULL; + } + + mux->parent = i2c_get_adapter(mux->pdata->parent_bus_num); + if (!mux->parent) { + dev_err(&pdev->dev, "Parent adapter (%d) not found\n", + mux->pdata->parent_bus_num); + ret = -ENODEV; + goto err; + } + + for (i = 0; i < mux->pdata->bus_count; i++) { + u32 bus = mux->pdata->base_bus_num ? + (mux->pdata->base_bus_num + i) : 0; + + mux->busses[i] = i2c_add_mux_adapter(mux->parent, &pdev->dev, + mux, bus, i, + i2c_mux_pinctrl_select, + deselect); + if (!mux->busses[i]) { + ret = -ENODEV; + dev_err(&pdev->dev, "Failed to add adapter %d\n", i); + goto err_del_adapter; + } + } + + return 0; + +err_del_adapter: + for (; i > 0; i--) + i2c_del_mux_adapter(mux->busses[i - 1]); + i2c_put_adapter(mux->parent); +err: + return ret; +} + +static int __devexit i2c_mux_pinctrl_remove(struct platform_device *pdev) +{ + struct i2c_mux_pinctrl *mux = platform_get_drvdata(pdev); + int i; + + for (i = 0; i < mux->pdata->bus_count; i++) + i2c_del_mux_adapter(mux->busses[i]); + + i2c_put_adapter(mux->parent); + + return 0; +} + +#ifdef CONFIG_OF +static const struct of_device_id i2c_mux_pinctrl_of_match[] __devinitconst = { + { .compatible = "i2c-mux-pinctrl", }, + {}, +}; +MODULE_DEVICE_TABLE(of, i2c_mux_pinctrl_of_match); +#endif + +static struct platform_driver i2c_mux_pinctrl_driver = { + .driver = { + .name = "i2c-mux-pinctrl", + .owner = THIS_MODULE, + .of_match_table = of_match_ptr(i2c_mux_pinctrl_of_match), + }, + .probe = i2c_mux_pinctrl_probe, + .remove = __devexit_p(i2c_mux_pinctrl_remove), +}; +module_platform_driver(i2c_mux_pinctrl_driver); + +MODULE_DESCRIPTION("pinctrl-based I2C multiplexer driver"); +MODULE_AUTHOR("Stephen Warren "); +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS("platform:i2c-mux-pinctrl"); diff --git a/include/linux/i2c-mux-pinctrl.h b/include/linux/i2c-mux-pinctrl.h new file mode 100644 index 000000000000..a65c86429e84 --- /dev/null +++ b/include/linux/i2c-mux-pinctrl.h @@ -0,0 +1,41 @@ +/* + * i2c-mux-pinctrl platform data + * + * Copyright (c) 2012, NVIDIA CORPORATION. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. 
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _LINUX_I2C_MUX_PINCTRL_H
+#define _LINUX_I2C_MUX_PINCTRL_H
+
+/**
+ * struct i2c_mux_pinctrl_platform_data - Platform data for i2c-mux-pinctrl
+ * @parent_bus_num: Parent I2C bus number
+ * @base_bus_num: Base I2C bus number for the child busses. 0 for dynamic.
+ * @bus_count: Number of child busses. Also the number of elements in
+ *	@pinctrl_states
+ * @pinctrl_states: The names of the pinctrl states to select for each child
+ *	bus
+ * @pinctrl_state_idle: The pinctrl state to select when no child bus is being
+ *	accessed. If NULL, the most recently used pinctrl state will be left
+ *	selected.
+ */
+struct i2c_mux_pinctrl_platform_data {
+	int parent_bus_num;
+	int base_bus_num;
+	int bus_count;
+	const char **pinctrl_states;
+	const char *pinctrl_state_idle;
+};
+
+#endif

From 0640113be25d283e0ff77a9f041e1242182387f0 Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Mon, 4 Jun 2012 11:00:45 -0700
Subject: [PATCH 088/178] vfs: Fix /proc/<pid>/fdinfo/<fd> file handling

Cyrill Gorcunov reports that I broke the fdinfo files with commit
30a08bf2d31d ("proc: move fd symlink i_mode calculations into
tid_fd_revalidate()"), and he's quite right.

The tid_fd_revalidate() function is not just used for the <pid>/fd
symlinks, it's also used for the <pid>/fdinfo/<fd> files, and the
permission model for those is different. So do the dynamic symlink
permission handling just for symlinks, making the fdinfo files once
more appear as the proper regular files they are.

Of course, Al Viro argued (probably correctly) that we shouldn't do the
symlink permission games at all, and make the symlinks always just be
the normal 'lrwxrwxrwx'. That would have avoided this issue too, but
since somebody noticed that the permissions had changed (which was the
reason for that original commit 30a08bf2d31d in the first place),
people do apparently use this feature.

[ Basically, you can use the symlink permission data as a cheap
  "fdinfo" replacement, since you see whether the file is open for
  reading and/or writing by just looking at st_mode of the symlink.
  So the feature does make sense, even if the pain it has caused means
  we probably shouldn't have done it to begin with.
]

Reported-and-tested-by: Cyrill Gorcunov
Signed-off-by: Linus Torvalds
---
 fs/proc/base.c | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 616f41a7cde6..437195f204e1 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1803,7 +1803,7 @@ static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd)
 	rcu_read_lock();
 	file = fcheck_files(files, fd);
 	if (file) {
-		unsigned i_mode, f_mode = file->f_mode;
+		unsigned f_mode = file->f_mode;
 
 		rcu_read_unlock();
 		put_files_struct(files);
@@ -1819,12 +1819,14 @@ static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd)
 			inode->i_gid = GLOBAL_ROOT_GID;
 		}
 
-		i_mode = S_IFLNK;
-		if (f_mode & FMODE_READ)
-			i_mode |= S_IRUSR | S_IXUSR;
-		if (f_mode & FMODE_WRITE)
-			i_mode |= S_IWUSR | S_IXUSR;
-		inode->i_mode = i_mode;
+		if (S_ISLNK(inode->i_mode)) {
+			unsigned i_mode = S_IFLNK;
+			if (f_mode & FMODE_READ)
+				i_mode |= S_IRUSR | S_IXUSR;
+			if (f_mode & FMODE_WRITE)
+				i_mode |= S_IWUSR | S_IXUSR;
+			inode->i_mode = i_mode;
+		}
 
 		security_task_to_inode(task, inode);
 		put_task_struct(task);
@@ -1859,6 +1861,7 @@ static struct dentry *proc_fd_instantiate(struct inode *dir,
 	ei = PROC_I(inode);
 	ei->fd = fd;
 
+	inode->i_mode = S_IFLNK;
 	inode->i_op = &proc_pid_link_inode_operations;
 	inode->i_size = 64;
 	ei->op.proc_get_link = proc_fd_link;

From 3eef8918ff440837f6af791942d8dd07e1a268ee Mon Sep 17 00:00:00 2001
From: Chris Wilson
Date: Mon, 4 Jun 2012 17:05:40 +0100
Subject: [PATCH 089/178] drm/i915: Mark the ringbuffers as being in the GTT
 domain

By correctly describing the ringbuffers as being in the GTT domain, it
appears that we are more careful with the management of the CPU cache
upon resume and so prevent some coherency issues when submitting
commands to the GPU later.

A secondary effect is that the debug logs are then consistent with the
actual usage (i.e. they no longer describe the ringbuffers as being in
the CPU write domain when we are accessing them through a WC
iomapping).

Reported-and-tested-by: Daniel Gnoutcheff
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=41092
Signed-off-by: Chris Wilson
Cc: stable@vger.kernel.org
Signed-off-by: Daniel Vetter
---
 drivers/gpu/drm/i915/intel_ringbuffer.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 490df551e312..9fbad086cb4b 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -988,6 +988,10 @@ static int intel_init_ring_buffer(struct drm_device *dev,
 	if (ret)
 		goto err_unref;
 
+	ret = i915_gem_object_set_to_gtt_domain(obj, true);
+	if (ret)
+		goto err_unpin;
+
 	ring->virtual_start = ioremap_wc(dev->agp->base + obj->gtt_offset,
 					 ring->size);
 	if (ring->virtual_start == NULL) {

From b7884eb45ec98c0d34c7f49005ae9d4b4b4e38f6 Mon Sep 17 00:00:00 2001
From: Daniel Vetter
Date: Mon, 4 Jun 2012 11:18:15 +0200
Subject: [PATCH 090/178] drm/i915: hold forcewake around ring hw init

Empirical evidence suggests that we need to: on at least one ivb
machine, when running the hangman i-g-t test the rings don't
initialize properly - the RING_START registers seem to be stuck at all
zeros.

Holding forcewake around this register init sequence makes chip reset
reliable again.
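As a rough sketch of the bracketing pattern (illustrative only, not
part of the patch itself; the actual diff below is authoritative), the
init path becomes:

	if (HAS_FORCE_WAKE(dev))
		gen6_gt_force_wake_get(dev_priv);

	/* ... write RING_START, RING_CTL and friends ... */

	if (HAS_FORCE_WAKE(dev))
		gen6_gt_force_wake_put(dev_priv);

so that every register write in the init sequence is issued while the
GT is guaranteed to be awake.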
Note that this is not the first such issue: commit f01db988ef6f6c70a6cc36ee71e4a98a68901229 Author: Sean Paul Date: Fri Mar 16 12:43:22 2012 -0400 drm/i915: Add wait_for in init_ring_common added delay loops to make RING_START and RING_CTL initialization reliable on the blt ring at boot-up. So I guess it won't hurt if we do this unconditionally for all force_wake needing gpus. To avoid copy&pasting of the HAS_FORCE_WAKE check I've added a new intel_info bit for that. v2: Fixup missing commas in static struct and properly handling the error case in init_ring_common, both noticed by Jani Nikula. Cc: stable@vger.kernel.org Reported-and-tested-by: Yang Guang Reviewed-by: Eugeni Dodonov Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=50522 Signed-Off-by: Daniel Vetter --- drivers/gpu/drm/i915/i915_drv.c | 13 +++++++++---- drivers/gpu/drm/i915/i915_drv.h | 3 +++ drivers/gpu/drm/i915/intel_ringbuffer.c | 16 +++++++++++++--- 3 files changed, 25 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c index 238a52165833..9fe9ebe52a7a 100644 --- a/drivers/gpu/drm/i915/i915_drv.c +++ b/drivers/gpu/drm/i915/i915_drv.c @@ -233,6 +233,7 @@ static const struct intel_device_info intel_sandybridge_d_info = { .has_blt_ring = 1, .has_llc = 1, .has_pch_split = 1, + .has_force_wake = 1, }; static const struct intel_device_info intel_sandybridge_m_info = { @@ -243,6 +244,7 @@ static const struct intel_device_info intel_sandybridge_m_info = { .has_blt_ring = 1, .has_llc = 1, .has_pch_split = 1, + .has_force_wake = 1, }; static const struct intel_device_info intel_ivybridge_d_info = { @@ -252,6 +254,7 @@ static const struct intel_device_info intel_ivybridge_d_info = { .has_blt_ring = 1, .has_llc = 1, .has_pch_split = 1, + .has_force_wake = 1, }; static const struct intel_device_info intel_ivybridge_m_info = { @@ -262,6 +265,7 @@ static const struct intel_device_info intel_ivybridge_m_info = { .has_blt_ring = 1, .has_llc = 1, .has_pch_split = 1, + .has_force_wake = 1, }; static const struct intel_device_info intel_valleyview_m_info = { @@ -289,6 +293,7 @@ static const struct intel_device_info intel_haswell_d_info = { .has_blt_ring = 1, .has_llc = 1, .has_pch_split = 1, + .has_force_wake = 1, }; static const struct intel_device_info intel_haswell_m_info = { @@ -298,6 +303,7 @@ static const struct intel_device_info intel_haswell_m_info = { .has_blt_ring = 1, .has_llc = 1, .has_pch_split = 1, + .has_force_wake = 1, }; static const struct pci_device_id pciidlist[] = { /* aka */ @@ -1139,10 +1145,9 @@ MODULE_LICENSE("GPL and additional rights"); /* We give fast paths for the really cool registers */ #define NEEDS_FORCE_WAKE(dev_priv, reg) \ - (((dev_priv)->info->gen >= 6) && \ - ((reg) < 0x40000) && \ - ((reg) != FORCEWAKE)) && \ - (!IS_VALLEYVIEW((dev_priv)->dev)) + ((HAS_FORCE_WAKE((dev_priv)->dev)) && \ + ((reg) < 0x40000) && \ + ((reg) != FORCEWAKE)) #define __i915_read(x, y) \ u##x i915_read##x(struct drm_i915_private *dev_priv, u32 reg) { \ diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 377c21f531e4..cff630757650 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -285,6 +285,7 @@ struct intel_device_info { u8 is_ivybridge:1; u8 is_valleyview:1; u8 has_pch_split:1; + u8 has_force_wake:1; u8 is_haswell:1; u8 has_fbc:1; u8 has_pipe_cxsr:1; @@ -1098,6 +1099,8 @@ struct drm_i915_file_private { #define HAS_PCH_CPT(dev) (INTEL_PCH_TYPE(dev) == PCH_CPT) #define HAS_PCH_IBX(dev) 
(INTEL_PCH_TYPE(dev) == PCH_IBX) +#define HAS_FORCE_WAKE(dev) (INTEL_INFO(dev)->has_force_wake) + #include "i915_trace.h" /** diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c index 9fbad086cb4b..e5b84ff89ca5 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.c +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c @@ -266,10 +266,15 @@ u32 intel_ring_get_active_head(struct intel_ring_buffer *ring) static int init_ring_common(struct intel_ring_buffer *ring) { - drm_i915_private_t *dev_priv = ring->dev->dev_private; + struct drm_device *dev = ring->dev; + drm_i915_private_t *dev_priv = dev->dev_private; struct drm_i915_gem_object *obj = ring->obj; + int ret = 0; u32 head; + if (HAS_FORCE_WAKE(dev)) + gen6_gt_force_wake_get(dev_priv); + /* Stop the ring if it's running. */ I915_WRITE_CTL(ring, 0); I915_WRITE_HEAD(ring, 0); @@ -317,7 +322,8 @@ static int init_ring_common(struct intel_ring_buffer *ring) I915_READ_HEAD(ring), I915_READ_TAIL(ring), I915_READ_START(ring)); - return -EIO; + ret = -EIO; + goto out; } if (!drm_core_check_feature(ring->dev, DRIVER_MODESET)) @@ -329,7 +335,11 @@ static int init_ring_common(struct intel_ring_buffer *ring) ring->last_retired_head = -1; } - return 0; +out: + if (HAS_FORCE_WAKE(dev)) + gen6_gt_force_wake_put(dev_priv); + + return ret; } static int From fad0c66c4bb836d57a5f125ecd38bed653ca863a Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 30 May 2012 10:54:57 -0700 Subject: [PATCH 091/178] timekeeping: Fix CLOCK_MONOTONIC inconsistency during leapsecond Commit 6b43ae8a61 (ntp: Fix leap-second hrtimer livelock) broke the leapsecond update of CLOCK_MONOTONIC. The missing leapsecond update to wall_to_monotonic causes discontinuities in CLOCK_MONOTONIC. Adjust wall_to_monotonic when NTP inserted a leapsecond. Reported-by: Richard Cochran Signed-off-by: John Stultz Tested-by: Richard Cochran Cc: stable@kernel.org Link: http://lkml.kernel.org/r/1338400497-12420-1-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 6e46cacf5969..6f46a00a1e8a 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -962,6 +962,7 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift) timekeeper.xtime.tv_sec++; leap = second_overflow(timekeeper.xtime.tv_sec); timekeeper.xtime.tv_sec += leap; + timekeeper.wall_to_monotonic.tv_sec -= leap; } /* Accumulate raw time */ @@ -1077,6 +1078,7 @@ static void update_wall_time(void) timekeeper.xtime.tv_sec++; leap = second_overflow(timekeeper.xtime.tv_sec); timekeeper.xtime.tv_sec += leap; + timekeeper.wall_to_monotonic.tv_sec -= leap; } timekeeping_update(false); From ad1ed2937eaa30adf9996ef5f21e0c1895f906b0 Mon Sep 17 00:00:00 2001 From: Greg Ungerer Date: Mon, 4 Jun 2012 14:29:59 +1000 Subject: [PATCH 092/178] nommu: fix compilation of nommu.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Compiling 3.5-rc1 for nommu targets gives: CC mm/nommu.o mm/nommu.c: In function ‘sys_mmap_pgoff’: mm/nommu.c:1489:2: error: ‘ret’ undeclared (first use in this function) mm/nommu.c:1489:2: note: each undeclared identifier is reported only once for each function it appears in It is trivially fixed by replacing 'ret' with the local variable that is already defined for the return value 'retval'. 
Signed-off-by: Greg Ungerer Signed-off-by: Al Viro --- mm/nommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/nommu.c b/mm/nommu.c index c4acfbc09972..d4b0c10872de 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1486,7 +1486,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - ret = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); + retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); if (file) fput(file); From 03240b279dbbda41f2fc55ff4424acf651e25bef Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 4 Jun 2012 17:47:34 -0400 Subject: [PATCH 093/178] fixups for signal breakage Obvious brainos spotted by Geert. Signed-off-by: Al Viro --- arch/avr32/kernel/signal.c | 2 +- arch/xtensa/include/asm/syscall.h | 4 ++-- arch/xtensa/kernel/signal.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/avr32/kernel/signal.c b/arch/avr32/kernel/signal.c index c140f9b41dce..d552a854dacc 100644 --- a/arch/avr32/kernel/signal.c +++ b/arch/avr32/kernel/signal.c @@ -300,7 +300,7 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, struct thread_info *ti) if ((sysreg_read(SR) & MODE_MASK) == MODE_SUPERVISOR) syscall = 1; - if (ti->flags & _TIF_SIGPENDING)) + if (ti->flags & _TIF_SIGPENDING) do_signal(regs, syscall); if (ti->flags & _TIF_NOTIFY_RESUME) { diff --git a/arch/xtensa/include/asm/syscall.h b/arch/xtensa/include/asm/syscall.h index 0b9f2e13c781..c1dacca312f3 100644 --- a/arch/xtensa/include/asm/syscall.h +++ b/arch/xtensa/include/asm/syscall.h @@ -31,5 +31,5 @@ asmlinkage long sys_pselect6(int n, fd_set __user *inp, fd_set __user *outp, asmlinkage long sys_ppoll(struct pollfd __user *ufds, unsigned int nfds, struct timespec __user *tsp, const sigset_t __user *sigmask, size_t sigsetsize); - - +asmlinkage long sys_rt_sigsuspend(sigset_t __user *unewset, + size_t sigsetsize); diff --git a/arch/xtensa/kernel/signal.c b/arch/xtensa/kernel/signal.c index b9f8e5850d3a..efe4e854b3cd 100644 --- a/arch/xtensa/kernel/signal.c +++ b/arch/xtensa/kernel/signal.c @@ -493,7 +493,7 @@ static void do_signal(struct pt_regs *regs) if (ret) return; - signal_delivered(signr, info, ka, regs, 0); + signal_delivered(signr, &info, &ka, regs, 0); if (current->ptrace & PT_SINGLESTEP) task_pt_regs(current)->icountlevel = 1; From 363b06aaa59fc20d0a9c5a5a9ce1fa2c45946700 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Mon, 14 May 2012 11:08:51 +0900 Subject: [PATCH 094/178] drm/exynos: Use DRM_FORMAT_{NV12, YUV420} instead of DRM_FORMAT_{NV12M, YUV420M} MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The NV12M/YUV420M formats are identical to the already existing standard NV12/YUV420 formats. The M variants will be removed, so convert the driver to use the standard names. 
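For background, the buffer count per format follows directly from the
plane layout. A minimal sketch of the mapping (mirroring the
exynos_drm_format_num_buffers() logic in the diff below; the layout
comments are added here purely for illustration):

	switch (format) {
	case DRM_FORMAT_NV12:	/* plane 0: Y, plane 1: interleaved CbCr */
	case DRM_FORMAT_NV12MT:	/* tiled variant, same two planes */
		return 2;
	case DRM_FORMAT_YUV420:	/* three separate planes: Y, Cb, Cr */
		return 3;
	default:		/* packed formats need a single buffer */
		return 1;
	}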
Signed-off-by: Ville Syrjälä Signed-off-by: Inki Dae Signed-off-by: Kyungmin Park --- drivers/gpu/drm/exynos/exynos_drm_fb.h | 4 ++-- drivers/gpu/drm/exynos/exynos_mixer.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/exynos/exynos_drm_fb.h b/drivers/gpu/drm/exynos/exynos_drm_fb.h index 3ecb30d93552..50823756cdea 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_fb.h +++ b/drivers/gpu/drm/exynos/exynos_drm_fb.h @@ -31,10 +31,10 @@ static inline int exynos_drm_format_num_buffers(uint32_t format) { switch (format) { - case DRM_FORMAT_NV12M: + case DRM_FORMAT_NV12: case DRM_FORMAT_NV12MT: return 2; - case DRM_FORMAT_YUV420M: + case DRM_FORMAT_YUV420: return 3; default: return 1; diff --git a/drivers/gpu/drm/exynos/exynos_mixer.c b/drivers/gpu/drm/exynos/exynos_mixer.c index 68ef01028375..5a46e583c5b5 100644 --- a/drivers/gpu/drm/exynos/exynos_mixer.c +++ b/drivers/gpu/drm/exynos/exynos_mixer.c @@ -365,7 +365,7 @@ static void vp_video_buffer(struct mixer_context *ctx, int win) switch (win_data->pixel_format) { case DRM_FORMAT_NV12MT: tiled_mode = true; - case DRM_FORMAT_NV12M: + case DRM_FORMAT_NV12: crcb_mode = false; buf_num = 2; break; From 13b87b27421e12f82ebbaac018cea30f82e5c33e Mon Sep 17 00:00:00 2001 From: Inki Dae Date: Mon, 14 May 2012 20:04:38 +0900 Subject: [PATCH 095/178] drm/exynos: fixed size type. size type of drm_exynos_gem_mmap struct is changed to uint64_t and it adds pad for the struct to be aligned as 64bit. Signed-off-by: Inki Dae Signed-off-by: Kyungmin Park --- include/drm/exynos_drm.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/drm/exynos_drm.h b/include/drm/exynos_drm.h index b6d7ce92eadd..68733587e700 100644 --- a/include/drm/exynos_drm.h +++ b/include/drm/exynos_drm.h @@ -64,6 +64,7 @@ struct drm_exynos_gem_map_off { * A structure for mapping buffer. * * @handle: a handle to gem object created. + * @pad: just padding to be 64-bit aligned. * @size: memory size to be mapped. * @mapped: having user virtual address mmaped. * - this variable would be filled by exynos gem module @@ -72,7 +73,8 @@ struct drm_exynos_gem_map_off { */ struct drm_exynos_gem_mmap { unsigned int handle; - unsigned int size; + unsigned int pad; + uint64_t size; uint64_t mapped; }; From 293a1c128ecc523e9a74252ca64220d8081be759 Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Wed, 16 May 2012 17:08:53 +0200 Subject: [PATCH 096/178] drm/exynos: DRIVER_BUS_PLATFORM is not a driver feature DRIVER_BUS_PLATFORM is a bus type used internally in the DRM core, not a flag for the drm_driver::driver_features field. 
Signed-off-by: Laurent Pinchart Signed-off-by: Inki Dae Signed-off-by: Kyungmin Park --- drivers/gpu/drm/exynos/exynos_drm_drv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/exynos/exynos_drm_drv.c b/drivers/gpu/drm/exynos/exynos_drm_drv.c index 420953197d0a..d6de2e07fa03 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_drv.c +++ b/drivers/gpu/drm/exynos/exynos_drm_drv.c @@ -244,8 +244,8 @@ static const struct file_operations exynos_drm_driver_fops = { }; static struct drm_driver exynos_drm_driver = { - .driver_features = DRIVER_HAVE_IRQ | DRIVER_BUS_PLATFORM | - DRIVER_MODESET | DRIVER_GEM | DRIVER_PRIME, + .driver_features = DRIVER_HAVE_IRQ | DRIVER_MODESET | + DRIVER_GEM | DRIVER_PRIME, .load = exynos_drm_load, .unload = exynos_drm_unload, .open = exynos_drm_open, From 6037bafa2e676162a86e4f4dee366e394565a0ee Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Wed, 16 May 2012 17:08:55 +0200 Subject: [PATCH 097/178] drm/exynos: Don't cast GEM object to Exynos GEM object when not needed The exynos_drm_gem_dumb_map_offset() doesn't need to access any Exynos-specific GEM object fields, don't cast the GEM object. Signed-off-by: Laurent Pinchart Signed-off-by: Inki Dae Signed-off-by: Kyungmin Park --- drivers/gpu/drm/exynos/exynos_drm_gem.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/exynos/exynos_drm_gem.c b/drivers/gpu/drm/exynos/exynos_drm_gem.c index fc91293c4560..5c8b683029ea 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_gem.c +++ b/drivers/gpu/drm/exynos/exynos_drm_gem.c @@ -689,7 +689,6 @@ int exynos_drm_gem_dumb_map_offset(struct drm_file *file_priv, struct drm_device *dev, uint32_t handle, uint64_t *offset) { - struct exynos_drm_gem_obj *exynos_gem_obj; struct drm_gem_object *obj; int ret = 0; @@ -710,15 +709,13 @@ int exynos_drm_gem_dumb_map_offset(struct drm_file *file_priv, goto unlock; } - exynos_gem_obj = to_exynos_gem_obj(obj); - - if (!exynos_gem_obj->base.map_list.map) { - ret = drm_gem_create_mmap_offset(&exynos_gem_obj->base); + if (!obj->map_list.map) { + ret = drm_gem_create_mmap_offset(obj); if (ret) goto out; } - *offset = (u64)exynos_gem_obj->base.map_list.hash.key << PAGE_SHIFT; + *offset = (u64)obj->map_list.hash.key << PAGE_SHIFT; DRM_DEBUG_KMS("offset = 0x%lx\n", (unsigned long)*offset); out: From 07b6835f2c6bc3101ed7cf471f566d53319a6d50 Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Wed, 16 May 2012 17:08:56 +0200 Subject: [PATCH 098/178] drm/exynos: Keep a reference to frame buffer GEM objects GEM objects used by frame buffers must be referenced for the whole life of the frame buffer. Release the references in the frame buffer destructor instead of its constructor. 
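The intended lifetime rule, sketched as illustrative pseudocode of what
the patch enforces (not the literal driver source):

	/* fb creation: the lookup took a reference, keep it */
	obj = drm_gem_object_lookup(dev, file_priv, handle);
	exynos_fb->exynos_gem_obj[i] = to_exynos_gem_obj(obj);

	/* fb destruction: only now is it safe to drop the reference */
	drm_gem_object_unreference_unlocked(obj);

Dropping the reference in the constructor, as the old code did, allows
the GEM object to be freed while the frame buffer still points at it.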
Signed-off-by: Laurent Pinchart
Signed-off-by: Inki Dae
Signed-off-by: Kyungmin Park
---
 drivers/gpu/drm/exynos/exynos_drm_fb.c | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/exynos/exynos_drm_fb.c b/drivers/gpu/drm/exynos/exynos_drm_fb.c
index f82a299553fb..4ccfe4328fab 100644
--- a/drivers/gpu/drm/exynos/exynos_drm_fb.c
+++ b/drivers/gpu/drm/exynos/exynos_drm_fb.c
@@ -51,11 +51,22 @@ struct exynos_drm_fb {
 static void exynos_drm_fb_destroy(struct drm_framebuffer *fb)
 {
 	struct exynos_drm_fb *exynos_fb = to_exynos_fb(fb);
+	unsigned int i;
 
 	DRM_DEBUG_KMS("%s\n", __FILE__);
 
 	drm_framebuffer_cleanup(fb);
 
+	for (i = 0; i < ARRAY_SIZE(exynos_fb->exynos_gem_obj); i++) {
+		struct drm_gem_object *obj;
+
+		if (exynos_fb->exynos_gem_obj[i] == NULL)
+			continue;
+
+		obj = &exynos_fb->exynos_gem_obj[i]->base;
+		drm_gem_object_unreference_unlocked(obj);
+	}
+
 	kfree(exynos_fb);
 	exynos_fb = NULL;
 }
@@ -134,11 +145,11 @@ exynos_user_fb_create(struct drm_device *dev, struct drm_file *file_priv,
 		return ERR_PTR(-ENOENT);
 	}
 
-	drm_gem_object_unreference_unlocked(obj);
-
 	fb = exynos_drm_framebuffer_init(dev, mode_cmd, obj);
-	if (IS_ERR(fb))
+	if (IS_ERR(fb)) {
+		drm_gem_object_unreference_unlocked(obj);
 		return fb;
+	}
 
 	exynos_fb = to_exynos_fb(fb);
 	nr = exynos_drm_format_num_buffers(fb->pixel_format);
@@ -152,8 +163,6 @@ exynos_user_fb_create(struct drm_device *dev, struct drm_file *file_priv,
 			return ERR_PTR(-ENOENT);
 		}
 
-		drm_gem_object_unreference_unlocked(obj);
-
 		exynos_fb->exynos_gem_obj[i] = to_exynos_gem_obj(obj);
 	}

From f56fdcef4d8991b0906461fec6494d7f9d401ef3 Mon Sep 17 00:00:00 2001
From: Laurent Pinchart
Date: Wed, 16 May 2012 17:08:54 +0200
Subject: [PATCH 099/178] drm/exynos: Remove dummy encoder get_crtc operation
 implementation

The encoder get_crtc operation is called to retrieve a pointer to the
CRTC the encoder is currently connected to, right after setting the
encoder::crtc field to the new CRTC. The implementation of this
operation returns the pointer to the new CRTC, which is then
pointlessly compared to itself. As the operation is not mandatory,
don't implement it.

Signed-off-by: Laurent Pinchart
Signed-off-by: Inki Dae
Signed-off-by: Kyungmin Park
---
 drivers/gpu/drm/exynos/exynos_drm_encoder.c | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/drivers/gpu/drm/exynos/exynos_drm_encoder.c b/drivers/gpu/drm/exynos/exynos_drm_encoder.c
index 6e9ac7bd1dcf..23d5ad379f86 100644
--- a/drivers/gpu/drm/exynos/exynos_drm_encoder.c
+++ b/drivers/gpu/drm/exynos/exynos_drm_encoder.c
@@ -172,19 +172,12 @@ static void exynos_drm_encoder_commit(struct drm_encoder *encoder)
 	manager_ops->commit(manager->dev);
 }
 
-static struct drm_crtc *
-exynos_drm_encoder_get_crtc(struct drm_encoder *encoder)
-{
-	return encoder->crtc;
-}
-
 static struct drm_encoder_helper_funcs exynos_encoder_helper_funcs = {
 	.dpms = exynos_drm_encoder_dpms,
 	.mode_fixup = exynos_drm_encoder_mode_fixup,
 	.mode_set = exynos_drm_encoder_mode_set,
 	.prepare = exynos_drm_encoder_prepare,
 	.commit = exynos_drm_encoder_commit,
-	.get_crtc = exynos_drm_encoder_get_crtc,
 };
 
 static void exynos_drm_encoder_destroy(struct drm_encoder *encoder)

From 5736603bef2383b6bb07f88596ccc8c387d91121 Mon Sep 17 00:00:00 2001
From: Seung-Woo Kim
Date: Tue, 15 May 2012 17:22:08 +0900
Subject: [PATCH 100/178] drm/exynos: fixed blending for hdmi graphic layer

Blending for graphic layer 0 of the hdmi mixer was not set, so the
video layer could not be shown if graphic layer 0 was enabled.
This patch fixes the blending values to support blending between
graphic layer 0 and the video layer.

Signed-off-by: Seung-Woo Kim
Signed-off-by: Kyungmin Park
Signed-off-by: Inki Dae
---
 drivers/gpu/drm/exynos/exynos_mixer.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/exynos/exynos_mixer.c b/drivers/gpu/drm/exynos/exynos_mixer.c
index 5a46e583c5b5..e2147a2ddcec 100644
--- a/drivers/gpu/drm/exynos/exynos_mixer.c
+++ b/drivers/gpu/drm/exynos/exynos_mixer.c
@@ -601,18 +601,20 @@ static void mixer_win_reset(struct mixer_context *ctx)
 	mixer_reg_write(res, MXR_BG_COLOR2, 0x008080);
 
 	/* setting graphical layers */
-
 	val  = MXR_GRP_CFG_COLOR_KEY_DISABLE; /* no blank key */
 	val |= MXR_GRP_CFG_WIN_BLEND_EN;
+	val |= MXR_GRP_CFG_BLEND_PRE_MUL;
+	val |= MXR_GRP_CFG_PIXEL_BLEND_EN;
 	val |= MXR_GRP_CFG_ALPHA_VAL(0xff); /* non-transparent alpha */
 
 	/* the same configuration for both layers */
 	mixer_reg_write(res, MXR_GRAPHIC_CFG(0), val);
-
-	val |= MXR_GRP_CFG_BLEND_PRE_MUL;
-	val |= MXR_GRP_CFG_PIXEL_BLEND_EN;
 	mixer_reg_write(res, MXR_GRAPHIC_CFG(1), val);
 
+	/* setting video layers */
+	val = MXR_GRP_CFG_ALPHA_VAL(0);
+	mixer_reg_write(res, MXR_VIDEO_CFG, val);
+
 	/* configuration of Video Processor Registers */
 	vp_win_reset(ctx);
 	vp_default_filter(res);

From f1ea8b66e5bf85802ae59961981f5e0d61510b18 Mon Sep 17 00:00:00 2001
From: James Bottomley
Date: Sun, 3 Jun 2012 01:49:32 -0700
Subject: [PATCH 101/178] [PARISC] fix missing TAINT_WARN problem

Al Viro broke us with commit

    commit edd63a2763bdae0daa4f0a4d4c5d61d1154352a5
    Author: Al Viro
    Date:   Fri Apr 27 13:42:45 2012 -0400

        set_restore_sigmask() is never called without SIGPENDING (and
        never should be)

Although it's pretty much our fault since parisc's asm/bug.h uses
BUGFLAG_TAINT but doesn't include the file that defines it. Fix that.

Signed-off-by: James Bottomley
---
 arch/parisc/include/asm/bug.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/parisc/include/asm/bug.h b/arch/parisc/include/asm/bug.h
index 72cfdb0cfdd1..62a33338549c 100644
--- a/arch/parisc/include/asm/bug.h
+++ b/arch/parisc/include/asm/bug.h
@@ -1,6 +1,8 @@
 #ifndef _PARISC_BUG_H
 #define _PARISC_BUG_H
 
+#include <linux/kernel.h>	/* for BUGFLAG_TAINT */
+
 /*
  * Tell the user there is some problem.
  * The offending file and line are encoded in the __bug_table section.

From 731455624fe5af906877aae80fb107daccaa9599 Mon Sep 17 00:00:00 2001
From: James Bottomley
Date: Tue, 5 Jun 2012 13:53:04 +0900
Subject: [PATCH 102/178] [PARISC] fix compile break in use of
 lib/strncpy_from_user.c

Linus broke us with commit

    commit 36126f8f2ed8168eb13aa0662b9b9585cba100a9
    Author: Linus Torvalds
    Date:   Sat May 26 10:43:17 2012 -0700

        word-at-a-time: make the interfaces truly generic

by moving functions defined in lib/strncpy_from_user.c into the
asm-generic version of word-at-a-time.h. Sparc and OpenRISC were fixed
to use this, but not parisc.
Fix by adding to generic-y in asm/Kbuild Signed-off-by: James Bottomley --- arch/parisc/include/asm/Kbuild | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/parisc/include/asm/Kbuild b/arch/parisc/include/asm/Kbuild index 19a434f55059..4383707d9801 100644 --- a/arch/parisc/include/asm/Kbuild +++ b/arch/parisc/include/asm/Kbuild @@ -1,3 +1,4 @@ include include/asm-generic/Kbuild.asm header-y += pdc.h +generic-y += word-at-a-time.h From 4c01acc01d77e8df5727247aa5ef5912c00256bf Mon Sep 17 00:00:00 2001 From: James Bottomley Date: Tue, 5 Jun 2012 13:54:09 +0900 Subject: [PATCH 103/178] [PARISC] fix code to find libgcc Sam broke this with commit 1f2bfbd00e466ff3489b2ca5cc75b1cccd14c123 Author: Sam Ravnborg Date: Sat May 5 10:18:41 2012 +0200 kbuild: link of vmlinux moved to a script But we should be deriving the location of libgcc in the same way as all the other archs, so fix by adding a LIBGCC variable which is evaluated in the makefile Signed-off-by: James Bottomley --- arch/parisc/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/parisc/Makefile b/arch/parisc/Makefile index dbc3850b1d0d..5707f1a62341 100644 --- a/arch/parisc/Makefile +++ b/arch/parisc/Makefile @@ -21,6 +21,7 @@ KBUILD_DEFCONFIG := default_defconfig NM = sh $(srctree)/arch/parisc/nm CHECKFLAGS += -D__hppa__=1 +LIBGCC = $(shell $(CC) $(KBUILD_CFLAGS) -print-libgcc-file-name) MACHINE := $(shell uname -m) ifeq ($(MACHINE),parisc*) @@ -79,7 +80,7 @@ kernel-y := mm/ kernel/ math-emu/ kernel-$(CONFIG_HPUX) += hpux/ core-y += $(addprefix arch/parisc/, $(kernel-y)) -libs-y += arch/parisc/lib/ `$(CC) -print-libgcc-file-name` +libs-y += arch/parisc/lib/ $(LIBGCC) drivers-$(CONFIG_OPROFILE) += arch/parisc/oprofile/ From d4e30ef05c9e0fad9782de34f0acd039e238fd43 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Mon, 4 Jun 2012 17:18:51 -0400 Subject: [PATCH 104/178] drm/radeon: make audio_init consistent across asics MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Call it in the asic startup callback on all asics. Previously r600 and rv770 called it in the startup and resume callbacks while all the other asics called it in the startup callback. 
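After this change every asic performs the same step at the end of its
startup callback. A sketch of the common pattern (the per-asic diffs
below are authoritative; rs600/rs690 report the error through dev_err()
instead):

	r = r600_audio_init(rdev);
	if (r) {
		DRM_ERROR("radeon: audio init failed\n");
		return r;
	}

Since the resume callbacks reach this code through the startup
callback, the separate audio init calls in the r600 and rv770 resume
paths can simply be dropped.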
Signed-off-by: Alex Deucher Reviewed-by: Rafał Miłecki Signed-off-by: Dave Airlie --- drivers/gpu/drm/radeon/r600.c | 15 ++++++--------- drivers/gpu/drm/radeon/rs600.c | 12 ++++++------ drivers/gpu/drm/radeon/rs690.c | 12 ++++++------ drivers/gpu/drm/radeon/rv770.c | 18 ++++++------------ 4 files changed, 24 insertions(+), 33 deletions(-) diff --git a/drivers/gpu/drm/radeon/r600.c b/drivers/gpu/drm/radeon/r600.c index 45cfcea63507..f30dc95f83b1 100644 --- a/drivers/gpu/drm/radeon/r600.c +++ b/drivers/gpu/drm/radeon/r600.c @@ -2426,6 +2426,12 @@ int r600_startup(struct radeon_device *rdev) if (r) return r; + r = r600_audio_init(rdev); + if (r) { + DRM_ERROR("radeon: audio init failed\n"); + return r; + } + return 0; } @@ -2462,12 +2468,6 @@ int r600_resume(struct radeon_device *rdev) return r; } - r = r600_audio_init(rdev); - if (r) { - DRM_ERROR("radeon: audio resume failed\n"); - return r; - } - return r; } @@ -2577,9 +2577,6 @@ int r600_init(struct radeon_device *rdev) rdev->accel_working = false; } - r = r600_audio_init(rdev); - if (r) - return r; /* TODO error handling */ return 0; } diff --git a/drivers/gpu/drm/radeon/rs600.c b/drivers/gpu/drm/radeon/rs600.c index 25f9eef12c42..e95c5e61d4e2 100644 --- a/drivers/gpu/drm/radeon/rs600.c +++ b/drivers/gpu/drm/radeon/rs600.c @@ -908,12 +908,6 @@ static int rs600_startup(struct radeon_device *rdev) return r; } - r = r600_audio_init(rdev); - if (r) { - dev_err(rdev->dev, "failed initializing audio\n"); - return r; - } - r = radeon_ib_pool_start(rdev); if (r) return r; @@ -922,6 +916,12 @@ static int rs600_startup(struct radeon_device *rdev) if (r) return r; + r = r600_audio_init(rdev); + if (r) { + dev_err(rdev->dev, "failed initializing audio\n"); + return r; + } + return 0; } diff --git a/drivers/gpu/drm/radeon/rs690.c b/drivers/gpu/drm/radeon/rs690.c index 3277ddecfe9f..159b6a43fda0 100644 --- a/drivers/gpu/drm/radeon/rs690.c +++ b/drivers/gpu/drm/radeon/rs690.c @@ -637,12 +637,6 @@ static int rs690_startup(struct radeon_device *rdev) return r; } - r = r600_audio_init(rdev); - if (r) { - dev_err(rdev->dev, "failed initializing audio\n"); - return r; - } - r = radeon_ib_pool_start(rdev); if (r) return r; @@ -651,6 +645,12 @@ static int rs690_startup(struct radeon_device *rdev) if (r) return r; + r = r600_audio_init(rdev); + if (r) { + dev_err(rdev->dev, "failed initializing audio\n"); + return r; + } + return 0; } diff --git a/drivers/gpu/drm/radeon/rv770.c b/drivers/gpu/drm/radeon/rv770.c index 04ddc365a908..4ad0281fdc37 100644 --- a/drivers/gpu/drm/radeon/rv770.c +++ b/drivers/gpu/drm/radeon/rv770.c @@ -956,6 +956,12 @@ static int rv770_startup(struct radeon_device *rdev) if (r) return r; + r = r600_audio_init(rdev); + if (r) { + DRM_ERROR("radeon: audio init failed\n"); + return r; + } + return 0; } @@ -978,12 +984,6 @@ int rv770_resume(struct radeon_device *rdev) return r; } - r = r600_audio_init(rdev); - if (r) { - dev_err(rdev->dev, "radeon: audio init failed\n"); - return r; - } - return r; } @@ -1092,12 +1092,6 @@ int rv770_init(struct radeon_device *rdev) rdev->accel_working = false; } - r = r600_audio_init(rdev); - if (r) { - dev_err(rdev->dev, "radeon: audio init failed\n"); - return r; - } - return 0; } From 0aecb5a4ba1ea4167f31d1790eec027f1e658f2d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= Date: Mon, 4 Jun 2012 18:36:58 +0200 Subject: [PATCH 105/178] drm/radeon/audio: don't hardcode CRTC id MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is based on info 
released by AMD; it should allow using audio in many more cases.

Signed-off-by: Rafał Miłecki
Reviewed-by: Alex Deucher
Cc: stable@vger.kernel.org
Signed-off-by: Dave Airlie
---
 drivers/gpu/drm/radeon/r600_audio.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/radeon/r600_audio.c b/drivers/gpu/drm/radeon/r600_audio.c
index 7c4fa77f018f..7479a5c503e4 100644
--- a/drivers/gpu/drm/radeon/r600_audio.c
+++ b/drivers/gpu/drm/radeon/r600_audio.c
@@ -192,6 +192,7 @@ void r600_audio_set_clock(struct drm_encoder *encoder, int clock)
 	struct radeon_device *rdev = dev->dev_private;
 	struct radeon_encoder *radeon_encoder = to_radeon_encoder(encoder);
 	struct radeon_encoder_atom_dig *dig = radeon_encoder->enc_priv;
+	struct radeon_crtc *radeon_crtc = to_radeon_crtc(encoder->crtc);
 	int base_rate = 48000;
 
 	switch (radeon_encoder->encoder_id) {
@@ -217,8 +218,8 @@ void r600_audio_set_clock(struct drm_encoder *encoder, int clock)
 		WREG32(EVERGREEN_AUDIO_PLL1_DIV, clock * 10);
 		WREG32(EVERGREEN_AUDIO_PLL1_UNK, 0x00000071);
 
-		/* Some magic trigger or src sel? */
-		WREG32_P(0x5ac, 0x01, ~0x77);
+		/* Select DTO source */
+		WREG32(0x5ac, radeon_crtc->crtc_id);
 	} else {
 		switch (dig->dig_encoder) {
 		case 0:

From 7838e05a0d29f27f4509290d866e5fc14e642c4c Mon Sep 17 00:00:00 2001
From: Rafał Miłecki
Date: Mon, 4 Jun 2012 20:18:03 +0200
Subject: [PATCH 106/178] drm/radeon/hdmi: don't set SEND_MAX_PACKETS bit
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Many TVs and A/V receivers don't work with this bit set. The problem
was confirmed using: Onkyo TX-SR605, Sony BRAVIA KDL-52X3500, and Sony
BRAVIA KDL-40S40xx.

In theory this bit shouldn't affect the audio engine when feeding it
with data; however, it seems it does. The fglrx driver doesn't set this
bit in any of the above cases.

This fixes a regression introduced in 3.5-rc1.

Signed-off-by: Rafał Miłecki
Reviewed-by: Alex Deucher
Signed-off-by: Dave Airlie
---
 drivers/gpu/drm/radeon/r600_hdmi.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/gpu/drm/radeon/r600_hdmi.c b/drivers/gpu/drm/radeon/r600_hdmi.c
index 226379e00ac1..969c27529dfe 100644
--- a/drivers/gpu/drm/radeon/r600_hdmi.c
+++ b/drivers/gpu/drm/radeon/r600_hdmi.c
@@ -348,7 +348,6 @@ void r600_hdmi_setmode(struct drm_encoder *encoder, struct drm_display_mode *mod
 	WREG32(HDMI0_AUDIO_PACKET_CONTROL + offset,
 	       HDMI0_AUDIO_SAMPLE_SEND | /* send audio packets */
 	       HDMI0_AUDIO_DELAY_EN(1) | /* default audio delay */
-	       HDMI0_AUDIO_SEND_MAX_PACKETS | /* send NULL packets if no audio is available */
 	       HDMI0_AUDIO_PACKETS_PER_LINE(3) | /* should be suffient for all audio modes and small enough for all hblanks */
 	       HDMI0_60958_CS_UPDATE); /* allow 60958 channel status fields to be updated */
 }

From 1a8ca7502c22f7a3802f9a207bf80db5439b11c2 Mon Sep 17 00:00:00 2001
From: Alex Deucher
Date: Fri, 1 Jun 2012 18:58:22 -0400
Subject: [PATCH 107/178] drm/radeon: fix gpu_init on si

- Properly set up the RBs
- Properly set up the SPI
- Properly set up gb_addr_config

This should fix rendering issues on certain cards.
Signed-off-by: Alex Deucher Signed-off-by: Dave Airlie --- drivers/gpu/drm/radeon/radeon.h | 5 +- drivers/gpu/drm/radeon/radeon_kms.c | 2 +- drivers/gpu/drm/radeon/si.c | 481 ++++++++++------------------ drivers/gpu/drm/radeon/sid.h | 19 ++ 4 files changed, 186 insertions(+), 321 deletions(-) diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h index 85dac33e3cce..fefcca55c1eb 100644 --- a/drivers/gpu/drm/radeon/radeon.h +++ b/drivers/gpu/drm/radeon/radeon.h @@ -1374,9 +1374,9 @@ struct cayman_asic { struct si_asic { unsigned max_shader_engines; - unsigned max_pipes_per_simd; unsigned max_tile_pipes; - unsigned max_simds_per_se; + unsigned max_cu_per_sh; + unsigned max_sh_per_se; unsigned max_backends_per_se; unsigned max_texture_channel_caches; unsigned max_gprs; @@ -1387,7 +1387,6 @@ struct si_asic { unsigned sc_hiz_tile_fifo_size; unsigned sc_earlyz_tile_fifo_size; - unsigned num_shader_engines; unsigned num_tile_pipes; unsigned num_backends_per_se; unsigned backend_disable_mask_per_asic; diff --git a/drivers/gpu/drm/radeon/radeon_kms.c b/drivers/gpu/drm/radeon/radeon_kms.c index f1016a5820d1..5c58d7d90cb2 100644 --- a/drivers/gpu/drm/radeon/radeon_kms.c +++ b/drivers/gpu/drm/radeon/radeon_kms.c @@ -273,7 +273,7 @@ int radeon_info_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) break; case RADEON_INFO_MAX_PIPES: if (rdev->family >= CHIP_TAHITI) - value = rdev->config.si.max_pipes_per_simd; + value = rdev->config.si.max_cu_per_sh; else if (rdev->family >= CHIP_CAYMAN) value = rdev->config.cayman.max_pipes_per_simd; else if (rdev->family >= CHIP_CEDAR) diff --git a/drivers/gpu/drm/radeon/si.c b/drivers/gpu/drm/radeon/si.c index 549732e56ca9..c7b61f16ecfd 100644 --- a/drivers/gpu/drm/radeon/si.c +++ b/drivers/gpu/drm/radeon/si.c @@ -867,200 +867,6 @@ void dce6_bandwidth_update(struct radeon_device *rdev) /* * Core functions */ -static u32 si_get_tile_pipe_to_backend_map(struct radeon_device *rdev, - u32 num_tile_pipes, - u32 num_backends_per_asic, - u32 *backend_disable_mask_per_asic, - u32 num_shader_engines) -{ - u32 backend_map = 0; - u32 enabled_backends_mask = 0; - u32 enabled_backends_count = 0; - u32 num_backends_per_se; - u32 cur_pipe; - u32 swizzle_pipe[SI_MAX_PIPES]; - u32 cur_backend = 0; - u32 i; - bool force_no_swizzle; - - /* force legal values */ - if (num_tile_pipes < 1) - num_tile_pipes = 1; - if (num_tile_pipes > rdev->config.si.max_tile_pipes) - num_tile_pipes = rdev->config.si.max_tile_pipes; - if (num_shader_engines < 1) - num_shader_engines = 1; - if (num_shader_engines > rdev->config.si.max_shader_engines) - num_shader_engines = rdev->config.si.max_shader_engines; - if (num_backends_per_asic < num_shader_engines) - num_backends_per_asic = num_shader_engines; - if (num_backends_per_asic > (rdev->config.si.max_backends_per_se * num_shader_engines)) - num_backends_per_asic = rdev->config.si.max_backends_per_se * num_shader_engines; - - /* make sure we have the same number of backends per se */ - num_backends_per_asic = ALIGN(num_backends_per_asic, num_shader_engines); - /* set up the number of backends per se */ - num_backends_per_se = num_backends_per_asic / num_shader_engines; - if (num_backends_per_se > rdev->config.si.max_backends_per_se) { - num_backends_per_se = rdev->config.si.max_backends_per_se; - num_backends_per_asic = num_backends_per_se * num_shader_engines; - } - - /* create enable mask and count for enabled backends */ - for (i = 0; i < SI_MAX_BACKENDS; ++i) { - if (((*backend_disable_mask_per_asic >> i) & 1) == 
0) { - enabled_backends_mask |= (1 << i); - ++enabled_backends_count; - } - if (enabled_backends_count == num_backends_per_asic) - break; - } - - /* force the backends mask to match the current number of backends */ - if (enabled_backends_count != num_backends_per_asic) { - u32 this_backend_enabled; - u32 shader_engine; - u32 backend_per_se; - - enabled_backends_mask = 0; - enabled_backends_count = 0; - *backend_disable_mask_per_asic = SI_MAX_BACKENDS_MASK; - for (i = 0; i < SI_MAX_BACKENDS; ++i) { - /* calc the current se */ - shader_engine = i / rdev->config.si.max_backends_per_se; - /* calc the backend per se */ - backend_per_se = i % rdev->config.si.max_backends_per_se; - /* default to not enabled */ - this_backend_enabled = 0; - if ((shader_engine < num_shader_engines) && - (backend_per_se < num_backends_per_se)) - this_backend_enabled = 1; - if (this_backend_enabled) { - enabled_backends_mask |= (1 << i); - *backend_disable_mask_per_asic &= ~(1 << i); - ++enabled_backends_count; - } - } - } - - - memset((uint8_t *)&swizzle_pipe[0], 0, sizeof(u32) * SI_MAX_PIPES); - switch (rdev->family) { - case CHIP_TAHITI: - case CHIP_PITCAIRN: - case CHIP_VERDE: - force_no_swizzle = true; - break; - default: - force_no_swizzle = false; - break; - } - if (force_no_swizzle) { - bool last_backend_enabled = false; - - force_no_swizzle = false; - for (i = 0; i < SI_MAX_BACKENDS; ++i) { - if (((enabled_backends_mask >> i) & 1) == 1) { - if (last_backend_enabled) - force_no_swizzle = true; - last_backend_enabled = true; - } else - last_backend_enabled = false; - } - } - - switch (num_tile_pipes) { - case 1: - case 3: - case 5: - case 7: - DRM_ERROR("odd number of pipes!\n"); - break; - case 2: - swizzle_pipe[0] = 0; - swizzle_pipe[1] = 1; - break; - case 4: - if (force_no_swizzle) { - swizzle_pipe[0] = 0; - swizzle_pipe[1] = 1; - swizzle_pipe[2] = 2; - swizzle_pipe[3] = 3; - } else { - swizzle_pipe[0] = 0; - swizzle_pipe[1] = 2; - swizzle_pipe[2] = 1; - swizzle_pipe[3] = 3; - } - break; - case 6: - if (force_no_swizzle) { - swizzle_pipe[0] = 0; - swizzle_pipe[1] = 1; - swizzle_pipe[2] = 2; - swizzle_pipe[3] = 3; - swizzle_pipe[4] = 4; - swizzle_pipe[5] = 5; - } else { - swizzle_pipe[0] = 0; - swizzle_pipe[1] = 2; - swizzle_pipe[2] = 4; - swizzle_pipe[3] = 1; - swizzle_pipe[4] = 3; - swizzle_pipe[5] = 5; - } - break; - case 8: - if (force_no_swizzle) { - swizzle_pipe[0] = 0; - swizzle_pipe[1] = 1; - swizzle_pipe[2] = 2; - swizzle_pipe[3] = 3; - swizzle_pipe[4] = 4; - swizzle_pipe[5] = 5; - swizzle_pipe[6] = 6; - swizzle_pipe[7] = 7; - } else { - swizzle_pipe[0] = 0; - swizzle_pipe[1] = 2; - swizzle_pipe[2] = 4; - swizzle_pipe[3] = 6; - swizzle_pipe[4] = 1; - swizzle_pipe[5] = 3; - swizzle_pipe[6] = 5; - swizzle_pipe[7] = 7; - } - break; - } - - for (cur_pipe = 0; cur_pipe < num_tile_pipes; ++cur_pipe) { - while (((1 << cur_backend) & enabled_backends_mask) == 0) - cur_backend = (cur_backend + 1) % SI_MAX_BACKENDS; - - backend_map |= (((cur_backend & 0xf) << (swizzle_pipe[cur_pipe] * 4))); - - cur_backend = (cur_backend + 1) % SI_MAX_BACKENDS; - } - - return backend_map; -} - -static u32 si_get_disable_mask_per_asic(struct radeon_device *rdev, - u32 disable_mask_per_se, - u32 max_disable_mask_per_se, - u32 num_shader_engines) -{ - u32 disable_field_width_per_se = r600_count_pipe_bits(disable_mask_per_se); - u32 disable_mask_per_asic = disable_mask_per_se & max_disable_mask_per_se; - - if (num_shader_engines == 1) - return disable_mask_per_asic; - else if (num_shader_engines == 2) - return 
disable_mask_per_asic | (disable_mask_per_asic << disable_field_width_per_se); - else - return 0xffffffff; -} - static void si_tiling_mode_table_init(struct radeon_device *rdev) { const u32 num_tile_mode_states = 32; @@ -1562,18 +1368,151 @@ static void si_tiling_mode_table_init(struct radeon_device *rdev) DRM_ERROR("unknown asic: 0x%x\n", rdev->family); } +static void si_select_se_sh(struct radeon_device *rdev, + u32 se_num, u32 sh_num) +{ + u32 data = INSTANCE_BROADCAST_WRITES; + + if ((se_num == 0xffffffff) && (sh_num == 0xffffffff)) + data = SH_BROADCAST_WRITES | SE_BROADCAST_WRITES; + else if (se_num == 0xffffffff) + data |= SE_BROADCAST_WRITES | SH_INDEX(sh_num); + else if (sh_num == 0xffffffff) + data |= SH_BROADCAST_WRITES | SE_INDEX(se_num); + else + data |= SH_INDEX(sh_num) | SE_INDEX(se_num); + WREG32(GRBM_GFX_INDEX, data); +} + +static u32 si_create_bitmask(u32 bit_width) +{ + u32 i, mask = 0; + + for (i = 0; i < bit_width; i++) { + mask <<= 1; + mask |= 1; + } + return mask; +} + +static u32 si_get_cu_enabled(struct radeon_device *rdev, u32 cu_per_sh) +{ + u32 data, mask; + + data = RREG32(CC_GC_SHADER_ARRAY_CONFIG); + if (data & 1) + data &= INACTIVE_CUS_MASK; + else + data = 0; + data |= RREG32(GC_USER_SHADER_ARRAY_CONFIG); + + data >>= INACTIVE_CUS_SHIFT; + + mask = si_create_bitmask(cu_per_sh); + + return ~data & mask; +} + +static void si_setup_spi(struct radeon_device *rdev, + u32 se_num, u32 sh_per_se, + u32 cu_per_sh) +{ + int i, j, k; + u32 data, mask, active_cu; + + for (i = 0; i < se_num; i++) { + for (j = 0; j < sh_per_se; j++) { + si_select_se_sh(rdev, i, j); + data = RREG32(SPI_STATIC_THREAD_MGMT_3); + active_cu = si_get_cu_enabled(rdev, cu_per_sh); + + mask = 1; + for (k = 0; k < 16; k++) { + mask <<= k; + if (active_cu & mask) { + data &= ~mask; + WREG32(SPI_STATIC_THREAD_MGMT_3, data); + break; + } + } + } + } + si_select_se_sh(rdev, 0xffffffff, 0xffffffff); +} + +static u32 si_get_rb_disabled(struct radeon_device *rdev, + u32 max_rb_num, u32 se_num, + u32 sh_per_se) +{ + u32 data, mask; + + data = RREG32(CC_RB_BACKEND_DISABLE); + if (data & 1) + data &= BACKEND_DISABLE_MASK; + else + data = 0; + data |= RREG32(GC_USER_RB_BACKEND_DISABLE); + + data >>= BACKEND_DISABLE_SHIFT; + + mask = si_create_bitmask(max_rb_num / se_num / sh_per_se); + + return data & mask; +} + +static void si_setup_rb(struct radeon_device *rdev, + u32 se_num, u32 sh_per_se, + u32 max_rb_num) +{ + int i, j; + u32 data, mask; + u32 disabled_rbs = 0; + u32 enabled_rbs = 0; + + for (i = 0; i < se_num; i++) { + for (j = 0; j < sh_per_se; j++) { + si_select_se_sh(rdev, i, j); + data = si_get_rb_disabled(rdev, max_rb_num, se_num, sh_per_se); + disabled_rbs |= data << ((i * sh_per_se + j) * TAHITI_RB_BITMAP_WIDTH_PER_SH); + } + } + si_select_se_sh(rdev, 0xffffffff, 0xffffffff); + + mask = 1; + for (i = 0; i < max_rb_num; i++) { + if (!(disabled_rbs & mask)) + enabled_rbs |= mask; + mask <<= 1; + } + + for (i = 0; i < se_num; i++) { + si_select_se_sh(rdev, i, 0xffffffff); + data = 0; + for (j = 0; j < sh_per_se; j++) { + switch (enabled_rbs & 3) { + case 1: + data |= (RASTER_CONFIG_RB_MAP_0 << (i * sh_per_se + j) * 2); + break; + case 2: + data |= (RASTER_CONFIG_RB_MAP_3 << (i * sh_per_se + j) * 2); + break; + case 3: + default: + data |= (RASTER_CONFIG_RB_MAP_2 << (i * sh_per_se + j) * 2); + break; + } + enabled_rbs >>= 2; + } + WREG32(PA_SC_RASTER_CONFIG, data); + } + si_select_se_sh(rdev, 0xffffffff, 0xffffffff); +} + static void si_gpu_init(struct radeon_device *rdev) { - u32 
cc_rb_backend_disable = 0; - u32 cc_gc_shader_array_config; u32 gb_addr_config = 0; u32 mc_shared_chmap, mc_arb_ramcfg; - u32 gb_backend_map; - u32 cgts_tcc_disable; u32 sx_debug_1; - u32 gc_user_shader_array_config; - u32 gc_user_rb_backend_disable; - u32 cgts_user_tcc_disable; u32 hdp_host_path_cntl; u32 tmp; int i, j; @@ -1581,9 +1520,9 @@ static void si_gpu_init(struct radeon_device *rdev) switch (rdev->family) { case CHIP_TAHITI: rdev->config.si.max_shader_engines = 2; - rdev->config.si.max_pipes_per_simd = 4; rdev->config.si.max_tile_pipes = 12; - rdev->config.si.max_simds_per_se = 8; + rdev->config.si.max_cu_per_sh = 8; + rdev->config.si.max_sh_per_se = 2; rdev->config.si.max_backends_per_se = 4; rdev->config.si.max_texture_channel_caches = 12; rdev->config.si.max_gprs = 256; @@ -1594,12 +1533,13 @@ static void si_gpu_init(struct radeon_device *rdev) rdev->config.si.sc_prim_fifo_size_backend = 0x100; rdev->config.si.sc_hiz_tile_fifo_size = 0x30; rdev->config.si.sc_earlyz_tile_fifo_size = 0x130; + gb_addr_config = TAHITI_GB_ADDR_CONFIG_GOLDEN; break; case CHIP_PITCAIRN: rdev->config.si.max_shader_engines = 2; - rdev->config.si.max_pipes_per_simd = 4; rdev->config.si.max_tile_pipes = 8; - rdev->config.si.max_simds_per_se = 5; + rdev->config.si.max_cu_per_sh = 5; + rdev->config.si.max_sh_per_se = 2; rdev->config.si.max_backends_per_se = 4; rdev->config.si.max_texture_channel_caches = 8; rdev->config.si.max_gprs = 256; @@ -1610,13 +1550,14 @@ static void si_gpu_init(struct radeon_device *rdev) rdev->config.si.sc_prim_fifo_size_backend = 0x100; rdev->config.si.sc_hiz_tile_fifo_size = 0x30; rdev->config.si.sc_earlyz_tile_fifo_size = 0x130; + gb_addr_config = TAHITI_GB_ADDR_CONFIG_GOLDEN; break; case CHIP_VERDE: default: rdev->config.si.max_shader_engines = 1; - rdev->config.si.max_pipes_per_simd = 4; rdev->config.si.max_tile_pipes = 4; - rdev->config.si.max_simds_per_se = 2; + rdev->config.si.max_cu_per_sh = 2; + rdev->config.si.max_sh_per_se = 2; rdev->config.si.max_backends_per_se = 4; rdev->config.si.max_texture_channel_caches = 4; rdev->config.si.max_gprs = 256; @@ -1627,6 +1568,7 @@ static void si_gpu_init(struct radeon_device *rdev) rdev->config.si.sc_prim_fifo_size_backend = 0x40; rdev->config.si.sc_hiz_tile_fifo_size = 0x30; rdev->config.si.sc_earlyz_tile_fifo_size = 0x130; + gb_addr_config = VERDE_GB_ADDR_CONFIG_GOLDEN; break; } @@ -1648,31 +1590,7 @@ static void si_gpu_init(struct radeon_device *rdev) mc_shared_chmap = RREG32(MC_SHARED_CHMAP); mc_arb_ramcfg = RREG32(MC_ARB_RAMCFG); - cc_rb_backend_disable = RREG32(CC_RB_BACKEND_DISABLE); - cc_gc_shader_array_config = RREG32(CC_GC_SHADER_ARRAY_CONFIG); - cgts_tcc_disable = 0xffff0000; - for (i = 0; i < rdev->config.si.max_texture_channel_caches; i++) - cgts_tcc_disable &= ~(1 << (16 + i)); - gc_user_rb_backend_disable = RREG32(GC_USER_RB_BACKEND_DISABLE); - gc_user_shader_array_config = RREG32(GC_USER_SHADER_ARRAY_CONFIG); - cgts_user_tcc_disable = RREG32(CGTS_USER_TCC_DISABLE); - - rdev->config.si.num_shader_engines = rdev->config.si.max_shader_engines; rdev->config.si.num_tile_pipes = rdev->config.si.max_tile_pipes; - tmp = ((~gc_user_rb_backend_disable) & BACKEND_DISABLE_MASK) >> BACKEND_DISABLE_SHIFT; - rdev->config.si.num_backends_per_se = r600_count_pipe_bits(tmp); - tmp = (gc_user_rb_backend_disable & BACKEND_DISABLE_MASK) >> BACKEND_DISABLE_SHIFT; - rdev->config.si.backend_disable_mask_per_asic = - si_get_disable_mask_per_asic(rdev, tmp, SI_MAX_BACKENDS_PER_SE_MASK, - rdev->config.si.num_shader_engines); - 
rdev->config.si.backend_map = - si_get_tile_pipe_to_backend_map(rdev, rdev->config.si.num_tile_pipes, - rdev->config.si.num_backends_per_se * - rdev->config.si.num_shader_engines, - &rdev->config.si.backend_disable_mask_per_asic, - rdev->config.si.num_shader_engines); - tmp = ((~cgts_user_tcc_disable) & TCC_DISABLE_MASK) >> TCC_DISABLE_SHIFT; - rdev->config.si.num_texture_channel_caches = r600_count_pipe_bits(tmp); rdev->config.si.mem_max_burst_length_bytes = 256; tmp = (mc_arb_ramcfg & NOOFCOLS_MASK) >> NOOFCOLS_SHIFT; rdev->config.si.mem_row_size_in_kb = (4 * (1 << (8 + tmp))) / 1024; @@ -1683,55 +1601,8 @@ static void si_gpu_init(struct radeon_device *rdev) rdev->config.si.num_gpus = 1; rdev->config.si.multi_gpu_tile_size = 64; - gb_addr_config = 0; - switch (rdev->config.si.num_tile_pipes) { - case 1: - gb_addr_config |= NUM_PIPES(0); - break; - case 2: - gb_addr_config |= NUM_PIPES(1); - break; - case 4: - gb_addr_config |= NUM_PIPES(2); - break; - case 8: - default: - gb_addr_config |= NUM_PIPES(3); - break; - } - - tmp = (rdev->config.si.mem_max_burst_length_bytes / 256) - 1; - gb_addr_config |= PIPE_INTERLEAVE_SIZE(tmp); - gb_addr_config |= NUM_SHADER_ENGINES(rdev->config.si.num_shader_engines - 1); - tmp = (rdev->config.si.shader_engine_tile_size / 16) - 1; - gb_addr_config |= SHADER_ENGINE_TILE_SIZE(tmp); - switch (rdev->config.si.num_gpus) { - case 1: - default: - gb_addr_config |= NUM_GPUS(0); - break; - case 2: - gb_addr_config |= NUM_GPUS(1); - break; - case 4: - gb_addr_config |= NUM_GPUS(2); - break; - } - switch (rdev->config.si.multi_gpu_tile_size) { - case 16: - gb_addr_config |= MULTI_GPU_TILE_SIZE(0); - break; - case 32: - default: - gb_addr_config |= MULTI_GPU_TILE_SIZE(1); - break; - case 64: - gb_addr_config |= MULTI_GPU_TILE_SIZE(2); - break; - case 128: - gb_addr_config |= MULTI_GPU_TILE_SIZE(3); - break; - } + /* fix up row size */ + gb_addr_config &= ~ROW_SIZE_MASK; switch (rdev->config.si.mem_row_size_in_kb) { case 1: default: @@ -1745,26 +1616,6 @@ static void si_gpu_init(struct radeon_device *rdev) break; } - tmp = (gb_addr_config & NUM_PIPES_MASK) >> NUM_PIPES_SHIFT; - rdev->config.si.num_tile_pipes = (1 << tmp); - tmp = (gb_addr_config & PIPE_INTERLEAVE_SIZE_MASK) >> PIPE_INTERLEAVE_SIZE_SHIFT; - rdev->config.si.mem_max_burst_length_bytes = (tmp + 1) * 256; - tmp = (gb_addr_config & NUM_SHADER_ENGINES_MASK) >> NUM_SHADER_ENGINES_SHIFT; - rdev->config.si.num_shader_engines = tmp + 1; - tmp = (gb_addr_config & NUM_GPUS_MASK) >> NUM_GPUS_SHIFT; - rdev->config.si.num_gpus = tmp + 1; - tmp = (gb_addr_config & MULTI_GPU_TILE_SIZE_MASK) >> MULTI_GPU_TILE_SIZE_SHIFT; - rdev->config.si.multi_gpu_tile_size = 1 << tmp; - tmp = (gb_addr_config & ROW_SIZE_MASK) >> ROW_SIZE_SHIFT; - rdev->config.si.mem_row_size_in_kb = 1 << tmp; - - gb_backend_map = - si_get_tile_pipe_to_backend_map(rdev, rdev->config.si.num_tile_pipes, - rdev->config.si.num_backends_per_se * - rdev->config.si.num_shader_engines, - &rdev->config.si.backend_disable_mask_per_asic, - rdev->config.si.num_shader_engines); - /* setup tiling info dword. gb_addr_config is not adequate since it does * not have bank info, so create a custom tiling dword. 
* bits 3:0 num_pipes @@ -1789,34 +1640,30 @@ static void si_gpu_init(struct radeon_device *rdev) rdev->config.si.tile_config |= (3 << 0); break; } - rdev->config.si.tile_config |= - ((mc_arb_ramcfg & NOOFBANK_MASK) >> NOOFBANK_SHIFT) << 4; + if ((mc_arb_ramcfg & NOOFBANK_MASK) >> NOOFBANK_SHIFT) + rdev->config.si.tile_config |= 1 << 4; + else + rdev->config.si.tile_config |= 0 << 4; rdev->config.si.tile_config |= ((gb_addr_config & PIPE_INTERLEAVE_SIZE_MASK) >> PIPE_INTERLEAVE_SIZE_SHIFT) << 8; rdev->config.si.tile_config |= ((gb_addr_config & ROW_SIZE_MASK) >> ROW_SIZE_SHIFT) << 12; - rdev->config.si.backend_map = gb_backend_map; WREG32(GB_ADDR_CONFIG, gb_addr_config); WREG32(DMIF_ADDR_CONFIG, gb_addr_config); WREG32(HDP_ADDR_CONFIG, gb_addr_config); - /* primary versions */ - WREG32(CC_RB_BACKEND_DISABLE, cc_rb_backend_disable); - WREG32(CC_SYS_RB_BACKEND_DISABLE, cc_rb_backend_disable); - WREG32(CC_GC_SHADER_ARRAY_CONFIG, cc_gc_shader_array_config); - - WREG32(CGTS_TCC_DISABLE, cgts_tcc_disable); - - /* user versions */ - WREG32(GC_USER_RB_BACKEND_DISABLE, cc_rb_backend_disable); - WREG32(GC_USER_SYS_RB_BACKEND_DISABLE, cc_rb_backend_disable); - WREG32(GC_USER_SHADER_ARRAY_CONFIG, cc_gc_shader_array_config); - - WREG32(CGTS_USER_TCC_DISABLE, cgts_tcc_disable); - si_tiling_mode_table_init(rdev); + si_setup_rb(rdev, rdev->config.si.max_shader_engines, + rdev->config.si.max_sh_per_se, + rdev->config.si.max_backends_per_se); + + si_setup_spi(rdev, rdev->config.si.max_shader_engines, + rdev->config.si.max_sh_per_se, + rdev->config.si.max_cu_per_sh); + + /* set HW defaults for 3D engine */ WREG32(CP_QUEUE_THRESHOLDS, (ROQ_IB1_START(0x16) | ROQ_IB2_START(0x2b))); diff --git a/drivers/gpu/drm/radeon/sid.h b/drivers/gpu/drm/radeon/sid.h index 53ea2c42dbd6..db4067962868 100644 --- a/drivers/gpu/drm/radeon/sid.h +++ b/drivers/gpu/drm/radeon/sid.h @@ -24,6 +24,11 @@ #ifndef SI_H #define SI_H +#define TAHITI_RB_BITMAP_WIDTH_PER_SH 2 + +#define TAHITI_GB_ADDR_CONFIG_GOLDEN 0x12011003 +#define VERDE_GB_ADDR_CONFIG_GOLDEN 0x12010002 + #define CG_MULT_THERMAL_STATUS 0x714 #define ASIC_MAX_TEMP(x) ((x) << 0) #define ASIC_MAX_TEMP_MASK 0x000001ff @@ -408,6 +413,12 @@ #define SOFT_RESET_IA (1 << 15) #define GRBM_GFX_INDEX 0x802C +#define INSTANCE_INDEX(x) ((x) << 0) +#define SH_INDEX(x) ((x) << 8) +#define SE_INDEX(x) ((x) << 16) +#define SH_BROADCAST_WRITES (1 << 29) +#define INSTANCE_BROADCAST_WRITES (1 << 30) +#define SE_BROADCAST_WRITES (1 << 31) #define GRBM_INT_CNTL 0x8060 # define RDERR_INT_ENABLE (1 << 0) @@ -480,6 +491,8 @@ #define VGT_TF_MEMORY_BASE 0x89B8 #define CC_GC_SHADER_ARRAY_CONFIG 0x89bc +#define INACTIVE_CUS_MASK 0xFFFF0000 +#define INACTIVE_CUS_SHIFT 16 #define GC_USER_SHADER_ARRAY_CONFIG 0x89c0 #define PA_CL_ENHANCE 0x8A14 @@ -688,6 +701,12 @@ #define RLC_MC_CNTL 0xC344 #define RLC_UCODE_CNTL 0xC348 +#define PA_SC_RASTER_CONFIG 0x28350 +# define RASTER_CONFIG_RB_MAP_0 0 +# define RASTER_CONFIG_RB_MAP_1 1 +# define RASTER_CONFIG_RB_MAP_2 2 +# define RASTER_CONFIG_RB_MAP_3 3 + #define VGT_EVENT_INITIATOR 0x28a90 # define SAMPLE_STREAMOUTSTATS1 (1 << 0) # define SAMPLE_STREAMOUTSTATS2 (2 << 0) From bb4091558228ff4a3e02328c931e683fc7f08722 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Sun, 3 Jun 2012 16:09:43 +0200 Subject: [PATCH 108/178] drm/radeon: fix vm deadlocks on cayman MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Locking mutex in different orders just screams for deadlocks, and some testing showed that it is 
actually quite easy to trigger them. Signed-off-by: Christian König Reviewed-by: Jerome Glisse Cc: stable@vger.kernel.org Signed-off-by: Dave Airlie --- drivers/gpu/drm/radeon/radeon_gart.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/radeon/radeon_gart.c b/drivers/gpu/drm/radeon/radeon_gart.c index 79db56e6c2ac..59d44937dd9f 100644 --- a/drivers/gpu/drm/radeon/radeon_gart.c +++ b/drivers/gpu/drm/radeon/radeon_gart.c @@ -476,12 +476,18 @@ int radeon_vm_bo_add(struct radeon_device *rdev, mutex_lock(&vm->mutex); if (last_pfn > vm->last_pfn) { - /* grow va space 32M by 32M */ - unsigned align = ((32 << 20) >> 12) - 1; + /* release mutex and lock in right order */ + mutex_unlock(&vm->mutex); radeon_mutex_lock(&rdev->cs_mutex); - radeon_vm_unbind_locked(rdev, vm); + mutex_lock(&vm->mutex); + /* and check again */ + if (last_pfn > vm->last_pfn) { + /* grow va space 32M by 32M */ + unsigned align = ((32 << 20) >> 12) - 1; + radeon_vm_unbind_locked(rdev, vm); + vm->last_pfn = (last_pfn + align) & ~align; + } radeon_mutex_unlock(&rdev->cs_mutex); - vm->last_pfn = (last_pfn + align) & ~align; } head = &vm->va; last_offset = 0; @@ -595,8 +601,8 @@ int radeon_vm_bo_rmv(struct radeon_device *rdev, if (bo_va == NULL) return 0; - mutex_lock(&vm->mutex); radeon_mutex_lock(&rdev->cs_mutex); + mutex_lock(&vm->mutex); radeon_vm_bo_update_pte(rdev, vm, bo, NULL); radeon_mutex_unlock(&rdev->cs_mutex); list_del(&bo_va->vm_list); @@ -641,9 +647,8 @@ void radeon_vm_fini(struct radeon_device *rdev, struct radeon_vm *vm) struct radeon_bo_va *bo_va, *tmp; int r; - mutex_lock(&vm->mutex); - radeon_mutex_lock(&rdev->cs_mutex); + radeon_mutex_lock(&rdev->cs_mutex); + mutex_lock(&vm->mutex); radeon_vm_unbind_locked(rdev, vm); radeon_mutex_unlock(&rdev->cs_mutex); From f2ebd422f71cda9c791f76f85d2ca102ae34a1ed Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 22 Apr 2012 17:02:11 +0300 Subject: [PATCH 109/178] KVM: Fix buffer overflow in kvm_set_irq() kvm_set_irq() has an internal buffer of three irq routing entries, allowing connecting a GSI to three IRQ chips or one MSI. However setup_routing_entry() does not properly enforce this, allowing three irqchip routes followed by an MSI route to overflow the buffer. Fix by ensuring that an MSI entry is added to an empty list.
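The invariant the fix enforces can be sketched in isolation as follows (a standalone illustration with made-up names, not the kernel's code): an MSI route may only be added to an empty list for a GSI, and at most one route per irqchip is allowed, so with three irqchips the per-GSI list can never exceed the three entries kvm_set_irq() has room for.

#include <stdbool.h>

enum route_type { ROUTE_IRQCHIP, ROUTE_MSI };

/* Called for each entry already on the GSI's list; adding the new
 * route is only allowed if no existing entry conflicts with it. */
static bool conflicts(enum route_type existing, enum route_type new_type,
		      int existing_chip, int new_chip)
{
	return existing == ROUTE_MSI ||	  /* list already holds an MSI */
	       new_type == ROUTE_MSI ||	  /* the added check: MSI needs an empty list */
	       existing_chip == new_chip; /* at most one route per irqchip */
}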
Signed-off-by: Avi Kivity --- virt/kvm/irq_comm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index a6a0365475ed..5afb43114020 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c @@ -332,6 +332,7 @@ static int setup_routing_entry(struct kvm_irq_routing_table *rt, */ hlist_for_each_entry(ei, n, &rt->map[ue->gsi], link) if (ei->type == KVM_IRQ_ROUTING_MSI || + ue->type == KVM_IRQ_ROUTING_MSI || ue->u.irqchip.irqchip == ei->irqchip.irqchip) return r; From d430f7dbf7bd6aaaa40c0660b3204df8cf07b22b Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 5 Jun 2012 09:50:28 -0400 Subject: [PATCH 110/178] drm/radeon/kms: add new Trinity PCI ids Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org Signed-off-by: Dave Airlie --- drivers/gpu/drm/radeon/ni.c | 21 +++++++++++++++++---- include/drm/drm_pciids.h | 8 ++++++++ 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/radeon/ni.c b/drivers/gpu/drm/radeon/ni.c index 3df4efa11942..3186522a4458 100644 --- a/drivers/gpu/drm/radeon/ni.c +++ b/drivers/gpu/drm/radeon/ni.c @@ -460,15 +460,28 @@ static void cayman_gpu_init(struct radeon_device *rdev) rdev->config.cayman.max_pipes_per_simd = 4; rdev->config.cayman.max_tile_pipes = 2; if ((rdev->pdev->device == 0x9900) || - (rdev->pdev->device == 0x9901)) { + (rdev->pdev->device == 0x9901) || + (rdev->pdev->device == 0x9905) || + (rdev->pdev->device == 0x9906) || + (rdev->pdev->device == 0x9907) || + (rdev->pdev->device == 0x9908) || + (rdev->pdev->device == 0x9909) || + (rdev->pdev->device == 0x9910) || + (rdev->pdev->device == 0x9917)) { rdev->config.cayman.max_simds_per_se = 6; rdev->config.cayman.max_backends_per_se = 2; } else if ((rdev->pdev->device == 0x9903) || - (rdev->pdev->device == 0x9904)) { + (rdev->pdev->device == 0x9904) || + (rdev->pdev->device == 0x990A) || + (rdev->pdev->device == 0x9913) || + (rdev->pdev->device == 0x9918)) { rdev->config.cayman.max_simds_per_se = 4; rdev->config.cayman.max_backends_per_se = 2; - } else if ((rdev->pdev->device == 0x9990) || - (rdev->pdev->device == 0x9991)) { + } else if ((rdev->pdev->device == 0x9919) || + (rdev->pdev->device == 0x9990) || + (rdev->pdev->device == 0x9991) || + (rdev->pdev->device == 0x9994) || + (rdev->pdev->device == 0x99A0)) { rdev->config.cayman.max_simds_per_se = 3; rdev->config.cayman.max_backends_per_se = 1; } else { diff --git a/include/drm/drm_pciids.h b/include/drm/drm_pciids.h index 58d0bdab68dd..961dae0d26e1 100644 --- a/include/drm/drm_pciids.h +++ b/include/drm/drm_pciids.h @@ -561,11 +561,19 @@ {0x1002, 0x9909, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARUBA|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ {0x1002, 0x990A, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARUBA|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ {0x1002, 0x990F, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARUBA|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ + {0x1002, 0x9910, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARUBA|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ + {0x1002, 0x9913, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARUBA|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ + {0x1002, 0x9917, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARUBA|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ + {0x1002, 0x9918, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARUBA|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ + {0x1002, 0x9919, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARUBA|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ {0x1002, 0x9990, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARUBA|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ {0x1002, 
0x9991, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARUBA|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ {0x1002, 0x9992, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARUBA|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ {0x1002, 0x9993, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARUBA|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ {0x1002, 0x9994, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARUBA|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ + {0x1002, 0x99A0, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARUBA|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ + {0x1002, 0x99A2, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARUBA|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ + {0x1002, 0x99A4, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARUBA|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ {0, 0, 0} #define r128_PCI_IDS \ From 4a6991cc1fad514745b79181df3ace72d561e7aa Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 5 Jun 2012 09:50:29 -0400 Subject: [PATCH 111/178] drm/radeon/kms: add new Palm, Sumo PCI ids Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org Signed-off-by: Dave Airlie --- include/drm/drm_pciids.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/drm/drm_pciids.h b/include/drm/drm_pciids.h index 961dae0d26e1..c5b0d8cd056f 100644 --- a/include/drm/drm_pciids.h +++ b/include/drm/drm_pciids.h @@ -531,6 +531,7 @@ {0x1002, 0x9645, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_SUMO2|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ {0x1002, 0x9647, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_SUMO|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP|RADEON_IS_IGP},\ {0x1002, 0x9648, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_SUMO|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP|RADEON_IS_IGP},\ + {0x1002, 0x9649, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_SUMO|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP|RADEON_IS_IGP},\ {0x1002, 0x964a, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_SUMO|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ {0x1002, 0x964b, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_SUMO|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ {0x1002, 0x964c, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_SUMO|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ @@ -550,6 +551,7 @@ {0x1002, 0x9807, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_PALM|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ {0x1002, 0x9808, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_PALM|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ {0x1002, 0x9809, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_PALM|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ + {0x1002, 0x980A, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_PALM|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ {0x1002, 0x9900, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARUBA|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ {0x1002, 0x9901, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARUBA|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ {0x1002, 0x9903, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARUBA|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ From a2bef8ce826dd1e787fd8ad9b6e0566ba59dab43 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 5 Jun 2012 09:50:30 -0400 Subject: [PATCH 112/178] drm/radeon/kms: add new BTC PCI ids Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org Signed-off-by: Dave Airlie --- include/drm/drm_pciids.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/drm/drm_pciids.h b/include/drm/drm_pciids.h index c5b0d8cd056f..86c4cf916396 100644 --- a/include/drm/drm_pciids.h +++ b/include/drm/drm_pciids.h @@ -181,6 +181,7 @@ {0x1002, 0x6747, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_TURKS|RADEON_NEW_MEMMAP}, \ {0x1002, 0x6748, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_TURKS|RADEON_NEW_MEMMAP}, \ {0x1002, 0x6749, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_TURKS|RADEON_NEW_MEMMAP}, \ + {0x1002, 0x674A, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 
CHIP_TURKS|RADEON_NEW_MEMMAP}, \ {0x1002, 0x6750, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_TURKS|RADEON_NEW_MEMMAP}, \ {0x1002, 0x6751, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_TURKS|RADEON_NEW_MEMMAP}, \ {0x1002, 0x6758, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_TURKS|RADEON_NEW_MEMMAP}, \ @@ -198,6 +199,7 @@ {0x1002, 0x6767, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_CAICOS|RADEON_NEW_MEMMAP}, \ {0x1002, 0x6768, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_CAICOS|RADEON_NEW_MEMMAP}, \ {0x1002, 0x6770, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_CAICOS|RADEON_NEW_MEMMAP}, \ + {0x1002, 0x6771, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_CAICOS|RADEON_NEW_MEMMAP}, \ {0x1002, 0x6772, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_CAICOS|RADEON_NEW_MEMMAP}, \ {0x1002, 0x6778, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_CAICOS|RADEON_NEW_MEMMAP}, \ {0x1002, 0x6779, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_CAICOS|RADEON_NEW_MEMMAP}, \ From 7aaa61b3476462b69f1ac7669fcca8d608ce3cb5 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 5 Jun 2012 09:50:31 -0400 Subject: [PATCH 113/178] drm/radeon/kms: add new SI PCI ids Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org Signed-off-by: Dave Airlie --- include/drm/drm_pciids.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/include/drm/drm_pciids.h b/include/drm/drm_pciids.h index 86c4cf916396..81368ab6c611 100644 --- a/include/drm/drm_pciids.h +++ b/include/drm/drm_pciids.h @@ -231,10 +231,11 @@ {0x1002, 0x6827, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_VERDE|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP}, \ {0x1002, 0x6828, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_VERDE|RADEON_NEW_MEMMAP}, \ {0x1002, 0x6829, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_VERDE|RADEON_NEW_MEMMAP}, \ + {0x1002, 0x682B, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_VERDE|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP}, \ {0x1002, 0x682D, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_VERDE|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP}, \ {0x1002, 0x682F, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_VERDE|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP}, \ - {0x1002, 0x6830, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_VERDE|RADEON_NEW_MEMMAP}, \ - {0x1002, 0x6831, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_VERDE|RADEON_NEW_MEMMAP}, \ + {0x1002, 0x6830, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_VERDE|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP}, \ + {0x1002, 0x6831, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_VERDE|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP}, \ {0x1002, 0x6837, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_VERDE|RADEON_NEW_MEMMAP}, \ {0x1002, 0x6838, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_VERDE|RADEON_NEW_MEMMAP}, \ {0x1002, 0x6839, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_VERDE|RADEON_NEW_MEMMAP}, \ From c2238f10e0c34a85a2a555c8a197316d1ca3fb7e Mon Sep 17 00:00:00 2001 From: Chen Gong Date: Tue, 5 Jun 2012 10:35:02 +0800 Subject: [PATCH 114/178] x86/mce: Fix the MCE poll timer logic In commit 82f7af09 (x86/mce: Cleanup timer mess), Thomas just forgot the "/ 2" there while cleaning up. 
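The intent of the restored "/ 2" can be modeled in isolation as follows (a minimal sketch assuming HZ=1000 and the driver's five-minute base interval; the real code also rounds the upper bound via round_jiffies_relative()): the poll interval halves while errors are being seen and doubles back off while the machine is quiet, clamped between HZ/100 and the base interval.

#include <stdio.h>

#define HZ 1000UL			/* assumed for illustration */
#define CHECK_INTERVAL (5 * 60UL)	/* assumed default, in seconds */

static unsigned long next_interval(unsigned long iv, int saw_event)
{
	if (saw_event) {
		iv /= 2;		/* the "/ 2" this patch restores */
		return iv > HZ / 100 ? iv : HZ / 100;
	}
	iv *= 2;			/* quiet: back off */
	return iv < CHECK_INTERVAL * HZ ? iv : CHECK_INTERVAL * HZ;
}

int main(void)
{
	unsigned long iv = CHECK_INTERVAL * HZ;

	iv = next_interval(iv, 1);	/* an event: poll twice as fast */
	printf("%lu\n", iv);		/* prints 150000 with these assumptions */
	return 0;
}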
Signed-off-by: Chen Gong Acked-by: Thomas Gleixner Signed-off-by: Tony Luck --- arch/x86/kernel/cpu/mcheck/mce.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 98003bfc5556..d6b18a4d0b95 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1266,7 +1266,7 @@ static void mce_timer_fn(unsigned long data) */ iv = __this_cpu_read(mce_next_interval); if (mce_notify_irq()) - iv = max(iv, (unsigned long) HZ/100); + iv = max(iv / 2, (unsigned long) HZ/100); else iv = min(iv * 2, round_jiffies_relative(check_interval * HZ)); __this_cpu_write(mce_next_interval, iv); From fffaee365fded09f9ebf2db19066065fa54323c3 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Tue, 5 Jun 2012 21:36:33 +0400 Subject: [PATCH 115/178] radix-tree: fix contiguous iterator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch fixes a bug in the macro radix_tree_for_each_contig(). If radix_tree_next_slot() sees NULL in the next slot it returns NULL, but the following radix_tree_next_chunk() switches iteration into the next chunk. As a result, iteration becomes non-contiguous and breaks vfs "splice" and all its users. Signed-off-by: Konstantin Khlebnikov Reported-and-bisected-by: Hans de Bruin Reported-and-bisected-by: Ondrej Zary Reported-bisected-and-tested-by: Toralf Förster Link: https://lkml.org/lkml/2012/6/5/64 Cc: stable # 3.4.x Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 5 ++++- lib/radix-tree.c | 3 +++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 0d04cd69ab9b..ffc444c38b0a 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -368,8 +368,11 @@ radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags) iter->index++; if (likely(*slot)) return slot; - if (flags & RADIX_TREE_ITER_CONTIG) + if (flags & RADIX_TREE_ITER_CONTIG) { + /* forbid switching to the next chunk */ + iter->next_index = 0; break; + } } } return NULL; diff --git a/lib/radix-tree.c b/lib/radix-tree.c index d7c878cc006c..e7964296fd50 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -686,6 +686,9 @@ void **radix_tree_next_chunk(struct radix_tree_root *root, * during iterating; it can be zero only at the beginning. * And we cannot overflow iter->next_index in a single step, * because RADIX_TREE_MAP_SHIFT < BITS_PER_LONG. + * + * This condition is also used by radix_tree_next_slot() to stop + * contiguous iteration, and forbid switching to the next chunk.
*/ index = iter->next_index; if (!index && iter->index) From 256cccbeca6e53c833c98a3651d7bac38ced4cec Mon Sep 17 00:00:00 2001 From: Karel Zak Date: Fri, 1 Jun 2012 10:13:09 +0200 Subject: [PATCH 116/178] MAINTAINERS: update util-linux info Signed-off-by: Karel Zak Signed-off-by: Linus Torvalds --- MAINTAINERS | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 6a52bb4a4fc7..f55cf8f084d5 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7298,11 +7298,11 @@ F: Documentation/DocBook/uio-howto.tmpl F: drivers/uio/ F: include/linux/uio*.h -UTIL-LINUX-NG PACKAGE +UTIL-LINUX PACKAGE M: Karel Zak -L: util-linux-ng@vger.kernel.org -W: http://kernel.org/~kzak/util-linux-ng/ -T: git git://git.kernel.org/pub/scm/utils/util-linux-ng/util-linux-ng.git +L: util-linux@vger.kernel.org +W: http://en.wikipedia.org/wiki/Util-linux +T: git git://git.kernel.org/pub/scm/utils/util-linux/util-linux.git S: Maintained UVESAFB DRIVER From cb05d8dedefa3066bf5d74ef88c6ca6cf4bd1c87 Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Wed, 23 May 2012 14:02:00 +0200 Subject: [PATCH 117/178] drm/i915: fix up ivb plane 3 pageflips Or at least plug another gaping hole. Apparently hw designers only moved the bit field, but did not bother to re-enumerate the planes when adding support for a 3rd pipe. Discovered by i-g-t/flip_test. This may or may not fix the reference bugzilla, because that one smells like we have still larger fish to fry. v2: Fixup the impossible case to catch programming errors, noticed by Chris Wilson. References: https://bugs.freedesktop.org/show_bug.cgi?id=50069 Acked-by: Chris Wilson Tested-by: Eugeni Dodonov Cc: stable@vger.kernel.org Signed-off-by: Daniel Vetter --- drivers/gpu/drm/i915/i915_reg.h | 8 ++++++++ drivers/gpu/drm/i915/intel_display.c | 19 ++++++++++++++++++- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h index 2d49b9507ed0..76bc2756d7c4 100644 --- a/drivers/gpu/drm/i915/i915_reg.h +++ b/drivers/gpu/drm/i915/i915_reg.h @@ -210,6 +210,14 @@ #define MI_DISPLAY_FLIP MI_INSTR(0x14, 2) #define MI_DISPLAY_FLIP_I915 MI_INSTR(0x14, 1) #define MI_DISPLAY_FLIP_PLANE(n) ((n) << 20) +/* IVB has funny definitions for which plane to flip.
*/ +#define MI_DISPLAY_FLIP_IVB_PLANE_A (0 << 19) +#define MI_DISPLAY_FLIP_IVB_PLANE_B (1 << 19) +#define MI_DISPLAY_FLIP_IVB_SPRITE_A (2 << 19) +#define MI_DISPLAY_FLIP_IVB_SPRITE_B (3 << 19) +#define MI_DISPLAY_FLIP_IVB_PLANE_C (4 << 19) +#define MI_DISPLAY_FLIP_IVB_SPRITE_C (5 << 19) + #define MI_SET_CONTEXT MI_INSTR(0x18, 0) #define MI_MM_SPACE_GTT (1<<8) #define MI_MM_SPACE_PHYSICAL (0<<8) diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c index 914789420906..e0aa064def31 100644 --- a/drivers/gpu/drm/i915/intel_display.c +++ b/drivers/gpu/drm/i915/intel_display.c @@ -6158,17 +6158,34 @@ static int intel_gen7_queue_flip(struct drm_device *dev, struct drm_i915_private *dev_priv = dev->dev_private; struct intel_crtc *intel_crtc = to_intel_crtc(crtc); struct intel_ring_buffer *ring = &dev_priv->ring[BCS]; + uint32_t plane_bit = 0; int ret; ret = intel_pin_and_fence_fb_obj(dev, obj, ring); if (ret) goto err; + switch(intel_crtc->plane) { + case PLANE_A: + plane_bit = MI_DISPLAY_FLIP_IVB_PLANE_A; + break; + case PLANE_B: + plane_bit = MI_DISPLAY_FLIP_IVB_PLANE_B; + break; + case PLANE_C: + plane_bit = MI_DISPLAY_FLIP_IVB_PLANE_C; + break; + default: + WARN_ONCE(1, "unknown plane in flip command\n"); + ret = -ENODEV; + goto err; + } + ret = intel_ring_begin(ring, 4); if (ret) goto err_unpin; - intel_ring_emit(ring, MI_DISPLAY_FLIP_I915 | (intel_crtc->plane << 19)); + intel_ring_emit(ring, MI_DISPLAY_FLIP_I915 | plane_bit); intel_ring_emit(ring, (fb->pitches[0] | obj->tiling_mode)); intel_ring_emit(ring, (obj->gtt_offset)); intel_ring_emit(ring, (MI_NOOP)); From aa69cb8c1e72b027548f9751e6377a7a7e8bb8fd Mon Sep 17 00:00:00 2001 From: Bryan Wu Date: Thu, 31 May 2012 19:51:37 +0800 Subject: [PATCH 118/178] MAINTAINERS: add linux-leds mailing list address and git URL Signed-off-by: Linus Torvalds --- MAINTAINERS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index f55cf8f084d5..f935a0cef404 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4103,6 +4103,8 @@ F: drivers/scsi/53c700* LED SUBSYSTEM M: Bryan Wu M: Richard Purdie +L: linux-leds@vger.kernel.org +T: git git://git.kernel.org/pub/scm/linux/kernel/git/cooloney/linux-leds.git S: Maintained F: drivers/leds/ F: include/linux/leds.h From 958fb3c51295764599d6abce87e1a01ace897a3e Mon Sep 17 00:00:00 2001 From: Chen Gong Date: Tue, 5 Jun 2012 10:35:02 +0800 Subject: [PATCH 119/178] x86/mce: Fix the MCE poll timer logic In commit 82f7af09 ("x86/mce: Cleanup timer mess"), Thomas just forgot the "/ 2" there while cleaning up.
Signed-off-by: Chen Gong Acked-by: Thomas Gleixner Cc: bp@amd64.org Cc: tony.luck@intel.com Link: http://lkml.kernel.org/r/1338863702-9245-1-git-send-email-gong.chen@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 0a687fd185e6..a97f3c4a3946 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1274,7 +1274,7 @@ static void mce_timer_fn(unsigned long data) */ iv = __this_cpu_read(mce_next_interval); if (mce_notify_irq()) - iv = max(iv, (unsigned long) HZ/100); + iv = max(iv / 2, (unsigned long) HZ/100); else iv = min(iv * 2, round_jiffies_relative(check_interval * HZ)); __this_cpu_write(mce_next_interval, iv); From 436d03faf6961b30e13b2d0967aea9d772d6cf44 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 5 Jun 2012 00:09:11 +0900 Subject: [PATCH 120/178] x86/decoder: Fix bsr/bsf/jmpe decoding with operand-size prefix Fix the x86 instruction decoder to decode bsr/bsf/jmpe with an operand-size prefix (66h). This fixes the test case failure reported by Linus, attached below. bsf/bsr/jmpe have a special encoding. The opcode map in the Intel Software Developer's Manual vol. 2 says they have TZCNT/LZCNT variants if they carry an F3h prefix. However, there is no information about the other 66h or F2h prefixes. The current instruction decoder assumes those are bad instructions, but the CPU actually accepts at least operand-size prefixes. H. Peter Anvin further explains: " TZCNT/LZCNT are F3 + BSF/BSR exactly because the F2 and F3 prefixes have historically been no-ops with most instructions. This allows software to unconditionally use the prefixed versions and get TZCNT/LZCNT on the processors that have them if they don't care about the difference. " This fixes errors reported by test_get_len: Warning: arch/x86/tools/test_get_len found difference at :ffffffff81036d87 Warning: ffffffff81036de5: 66 0f bc c2 bsf %dx,%ax Warning: objdump says 4 bytes, but insn_get_length() says 3 Warning: arch/x86/tools/test_get_len found difference at :ffffffff81036ea6 Warning: ffffffff81036f04: 66 0f bd c2 bsr %dx,%ax Warning: objdump says 4 bytes, but insn_get_length() says 3 Warning: decoded and checked 13298882 instructions with 2 warnings Reported-by: Linus Torvalds Reported-by: Pekka Enberg Signed-off-by: Masami Hiramatsu Cc: "H.
Peter Anvin" Cc: Link: http://lkml.kernel.org/r/20120604150911.22338.43296.stgit@localhost.localdomain Signed-off-by: Ingo Molnar --- arch/x86/lib/x86-opcode-map.txt | 8 ++++---- arch/x86/tools/gen-insn-attr-x86.awk | 14 +++++++++----- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt index 819137904428..5d7e51f3fd28 100644 --- a/arch/x86/lib/x86-opcode-map.txt +++ b/arch/x86/lib/x86-opcode-map.txt @@ -28,7 +28,7 @@ # - (66): the last prefix is 0x66 # - (F3): the last prefix is 0xF3 # - (F2): the last prefix is 0xF2 -# +# - (!F3) : the last prefix is not 0xF3 (including non-last prefix case) Table: one byte opcode Referrer: @@ -515,12 +515,12 @@ b4: LFS Gv,Mp b5: LGS Gv,Mp b6: MOVZX Gv,Eb b7: MOVZX Gv,Ew -b8: JMPE | POPCNT Gv,Ev (F3) +b8: JMPE (!F3) | POPCNT Gv,Ev (F3) b9: Grp10 (1A) ba: Grp8 Ev,Ib (1A) bb: BTC Ev,Gv -bc: BSF Gv,Ev | TZCNT Gv,Ev (F3) -bd: BSR Gv,Ev | LZCNT Gv,Ev (F3) +bc: BSF Gv,Ev (!F3) | TZCNT Gv,Ev (F3) +bd: BSR Gv,Ev (!F3) | LZCNT Gv,Ev (F3) be: MOVSX Gv,Eb bf: MOVSX Gv,Ew # 0x0f 0xc0-0xcf diff --git a/arch/x86/tools/gen-insn-attr-x86.awk b/arch/x86/tools/gen-insn-attr-x86.awk index 5f6a5b6c3a15..ddcf39b1a18d 100644 --- a/arch/x86/tools/gen-insn-attr-x86.awk +++ b/arch/x86/tools/gen-insn-attr-x86.awk @@ -66,9 +66,10 @@ BEGIN { rex_expr = "^REX(\\.[XRWB]+)*" fpu_expr = "^ESC" # TODO - lprefix1_expr = "\\(66\\)" + lprefix1_expr = "\\((66|!F3)\\)" lprefix2_expr = "\\(F3\\)" - lprefix3_expr = "\\(F2\\)" + lprefix3_expr = "\\((F2|!F3)\\)" + lprefix_expr = "\\((66|F2|F3)\\)" max_lprefix = 4 # All opcodes starting with lower-case 'v' or with (v1) superscript @@ -333,13 +334,16 @@ function convert_operands(count,opnd, i,j,imm,mod) if (match(ext, lprefix1_expr)) { lptable1[idx] = add_flags(lptable1[idx],flags) variant = "INAT_VARIANT" - } else if (match(ext, lprefix2_expr)) { + } + if (match(ext, lprefix2_expr)) { lptable2[idx] = add_flags(lptable2[idx],flags) variant = "INAT_VARIANT" - } else if (match(ext, lprefix3_expr)) { + } + if (match(ext, lprefix3_expr)) { lptable3[idx] = add_flags(lptable3[idx],flags) variant = "INAT_VARIANT" - } else { + } + if (!match(ext, lprefix_expr)){ table[idx] = add_flags(table[idx],flags) } } From 1a87fc1ec7b05b9bc60df9dc52297d4c225d7f1a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 6 Jun 2012 11:33:21 +0200 Subject: [PATCH 121/178] x86: mce: Add the dropped timer interval init back commit 82f7af09 ("x86/mce: Cleanup timer mess) dropped the initialization of the per cpu timer interval. Duh :( Restore the previous behaviour. 
Reported-by: Chen Gong Cc: bp@amd64.org Cc: tony.luck@intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/mcheck/mce.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index a97f3c4a3946..da27c5d2168a 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1557,7 +1557,7 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) static void __mcheck_cpu_init_timer(void) { struct timer_list *t = &__get_cpu_var(mce_timer); - unsigned long iv = __this_cpu_read(mce_next_interval); + unsigned long iv = check_interval * HZ; setup_timer(t, mce_timer_fn, smp_processor_id()); From aff5a62d52ff03956ff6992b9fe4b561fd855804 Mon Sep 17 00:00:00 2001 From: Xiaotian Feng Date: Tue, 5 Jun 2012 15:00:31 -0400 Subject: [PATCH 122/178] x86/gart: Fix kmemleak warning aperture_64.c now is using memblock, the previous kmemleak_ignore() for alloc_bootmem() should be removed then. Otherwise, with kmemleak enabled, kernel will throw warnings like: [ 0.000000] kmemleak: Trying to color unknown object at 0xffff8800c4000000 as Black [ 0.000000] Pid: 0, comm: swapper/0 Not tainted 3.5.0-rc1-next-20120605+ #130 [ 0.000000] Call Trace: [ 0.000000] [] paint_ptr+0x66/0xc0 [ 0.000000] [] kmemleak_ignore+0x2b/0x60 [ 0.000000] [] kmemleak_init+0x217/0x2c1 [ 0.000000] [] start_kernel+0x32d/0x3eb [ 0.000000] [] ? repair_env_string+0x5a/0x5a [ 0.000000] [] x86_64_start_reservations+0x131/0x135 [ 0.000000] [] ? early_idt_handlers+0x120/0x120 [ 0.000000] [] x86_64_start_kernel+0x102/0x111 [ 0.000000] kmemleak: Early log backtrace: [ 0.000000] [] kmemleak_ignore+0x4b/0x60 [ 0.000000] [] gart_iommu_hole_init+0x3e7/0x547 [ 0.000000] [] pci_iommu_alloc+0x44/0x6f [ 0.000000] [] mem_init+0x19/0xec [ 0.000000] [] start_kernel+0x1ea/0x3eb [ 0.000000] [] x86_64_start_reservations+0x131/0x135 [ 0.000000] [] x86_64_start_kernel+0x102/0x111 [ 0.000000] [] 0xffffffffffffffff Signed-off-by: Xiaotian Feng Cc: Xiaotian Feng Cc: Tejun Heo Link: http://lkml.kernel.org/r/1338922831-2847-1-git-send-email-xtfeng@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/aperture_64.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 6e76c191a835..d5fd66f0d4cd 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include @@ -95,11 +94,6 @@ static u32 __init allocate_aperture(void) return 0; } memblock_reserve(addr, aper_size); - /* - * Kmemleak should not scan this block as it may not be mapped via the - * kernel direct mapping. - */ - kmemleak_ignore(phys_to_virt(addr)); printk(KERN_INFO "Mapping aperture over %d KB of RAM @ %lx\n", aper_size >> 10, addr); insert_aperture_resource((u32)addr, aper_size); From 4af463d28f1a026e25c0b879fac2a0d2b7bff599 Mon Sep 17 00:00:00 2001 From: Yasuaki Ishimatsu Date: Mon, 4 Jun 2012 11:42:32 +0900 Subject: [PATCH 123/178] x86/numa: Set numa_nodes_parsed at acpi_numa_memory_affinity_init() When hot-adding a CPU, the system outputs following messages since node_to_cpumask_map[2] was not allocated memory. Booting Node 2 Processor 32 APIC 0xc0 node_to_cpumask_map[2] NULL Pid: 0, comm: swapper/32 Tainted: G A 3.3.5-acd #21 Call Trace: [] debug_cpumask_set_cpu+0x155/0x160 [] ? 
add_timer_on+0xaa/0x120 [] numa_add_cpu+0x1e/0x22 [] identify_cpu+0x1df/0x1e4 [] identify_secondary_cpu+0x16/0x1d [] smp_store_cpu_info+0x3c/0x3e [] smp_callin+0x139/0x1be [] start_secondary+0x13/0xeb The reason is that the bit for node 2 was not set in numa_nodes_parsed. numa_nodes_parsed is set only by acpi_numa_processor_affinity_init / acpi_numa_x2apic_affinity_init. Thus even if hot-added memory with the same PXM as a hot-added CPU is described in the ACPI SRAT table, numa_nodes_parsed is not set when the hot-added CPU itself is not described there. But the ACPI Spec Rev 5.0 says the following about the ACPI SRAT table: This optional table provides information that allows OSPM to associate processors and memory ranges, including ranges of memory provided by hot-added memory devices, with system localities / proximity domains and clock domains. This means the ACPI SRAT table provides information only for CPUs present at boot time, but for memory it includes hot-added memory. So hot-added memory is described in the ACPI SRAT table, but a hot-added CPU is not. Thus numa_nodes_parsed should be set not only by acpi_numa_processor_affinity_init / acpi_numa_x2apic_affinity_init but also by acpi_numa_memory_affinity_init to cover this case. Additionally, if the system has a CPU-less memory node, acpi_numa_processor_affinity_init / acpi_numa_x2apic_affinity_init cannot set numa_nodes_parsed since these functions cannot find a cpu description for the node. In this case, numa_nodes_parsed needs to be set by acpi_numa_memory_affinity_init. Signed-off-by: Yasuaki Ishimatsu Acked-by: David Rientjes Acked-by: KOSAKI Motohiro Cc: liuj97@gmail.com Cc: kosaki.motohiro@gmail.com Link: http://lkml.kernel.org/r/4FCC2098.4030007@jp.fujitsu.com [ merged it ] Signed-off-by: Ingo Molnar --- arch/x86/mm/srat.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c index 732af3a96183..4599c3e8bcb6 100644 --- a/arch/x86/mm/srat.c +++ b/arch/x86/mm/srat.c @@ -176,6 +176,8 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) return; } + node_set(node, numa_nodes_parsed); + printk(KERN_INFO "SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]\n", node, pxm, (unsigned long long) start, (unsigned long long) end - 1); From 7071f6b2889bb41bea61891d8a3e6e70517ef5e6 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 31 May 2012 23:20:25 +0200 Subject: [PATCH 124/178] x86/intel/moorestown: Change intel_scu_devices_create() to __devinit The allmodconfig hits: WARNING: vmlinux.o(.text+0x6553d): Section mismatch in reference from the function intel_scu_devices_create() to the function .devinit.text: spi_register_board_info() [...] This patch marks intel_scu_devices_create() as devinit because it only calls a devinit function, spi_register_board_info(). Signed-off-by: Sebastian Andrzej Siewior Cc: Alan Cox Cc: Kirill A.
Shutemov Cc: Mika Westerberg Cc: Samuel Ortiz Cc: Feng Tang Link: http://lkml.kernel.org/r/20120531212025.GA8519@breakpoint.cc Signed-off-by: Ingo Molnar --- arch/x86/platform/mrst/mrst.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c index e31bcd8f2eee..fd41a9262d65 100644 --- a/arch/x86/platform/mrst/mrst.c +++ b/arch/x86/platform/mrst/mrst.c @@ -782,7 +782,7 @@ BLOCKING_NOTIFIER_HEAD(intel_scu_notifier); EXPORT_SYMBOL_GPL(intel_scu_notifier); /* Called by IPC driver */ -void intel_scu_devices_create(void) +void __devinit intel_scu_devices_create(void) { int i; From 55c844a4dd16a4d1fdc0cf2a283ec631a02ec448 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Wed, 30 May 2012 23:15:41 +0800 Subject: [PATCH 125/178] x86/reboot: Fix a warning message triggered by stop_other_cpus() When rebooting our 24 CPU Westmere servers with 3.4-rc6, we always see this warning message: Restarting system. machine restart ------------[ cut here ]------------ WARNING: at arch/x86/kernel/smp.c:125 native_smp_send_reschedule+0x74/0xa7() Hardware name: X8DTN Modules linked in: igb [last unloaded: scsi_wait_scan] Pid: 1, comm: systemd-shutdow Not tainted 3.4.0-rc6+ #22 Call Trace: [] warn_slowpath_common+0x7e/0x96 [] warn_slowpath_null+0x15/0x17 [] native_smp_send_reschedule+0x74/0xa7 [] trigger_load_balance+0x279/0x2a6 [] scheduler_tick+0xe0/0xe9 [] update_process_times+0x60/0x70 [] tick_sched_timer+0x68/0x92 [] __run_hrtimer+0xb3/0x13c [] ? tick_nohz_handler+0xd0/0xd0 [] hrtimer_interrupt+0xdb/0x198 [] smp_apic_timer_interrupt+0x81/0x94 [] apic_timer_interrupt+0x67/0x70 [] ? default_send_IPI_mask_allbutself_phys+0xb4/0xc4 [] physflat_send_IPI_allbutself+0x12/0x14 [] native_nmi_stop_other_cpus+0x8a/0xd6 [] native_machine_shutdown+0x50/0x67 [] machine_shutdown+0xa/0xc [] native_machine_restart+0x20/0x32 [] machine_restart+0xa/0xc [] kernel_restart+0x47/0x4c [] sys_reboot+0x13e/0x17c [] ? _raw_spin_unlock_bh+0x10/0x12 [] ? bdi_queue_work+0xcf/0xd8 [] ? __bdi_start_writeback+0xae/0xb7 [] ? iterate_supers+0xa3/0xb7 [] system_call_fastpath+0x16/0x1b ---[ end trace 320af5cb1cb60c5b ]--- The root cause seems to be that default_send_IPI_mask_allbutself_phys() takes quite some time (I measured it could be several ms) to complete sending NMIs to all the other 23 CPUs, and for an HZ=250/1000 system the time is long enough for a timer interrupt to happen, which in turn tries to kick load balancing on an already stopped CPU and causes this warning in native_smp_send_reschedule(). So disabling the local irq before stop_other_cpus() can fix this problem (tested with 25 successful reboots), and it is fine as nobody should care about the timer interrupt at this stage of the reboot. The latest 3.4 kernel slightly changes this behavior by sending REBOOT_VECTOR first and only sends NMI_VECTOR if the REBOOT_VECTOR fails, and this patch is still needed to prevent the problem. Signed-off-by: Feng Tang Acked-by: Don Zickus Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120530231541.4c13433a@feng-i7 Signed-off-by: Ingo Molnar --- arch/x86/kernel/reboot.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 79c45af81604..25b48edb847c 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -639,9 +639,11 @@ void native_machine_shutdown(void) set_cpus_allowed_ptr(current, cpumask_of(reboot_cpu_id)); /* - * O.K Now that I'm on the appropriate processor, - * stop all of the others.
+ * O.K Now that I'm on the appropriate processor, stop all of the + * others. Also disable the local irq to not receive the per-cpu + * timer interrupt which may trigger scheduler's load balance. */ + local_irq_disable(); stop_other_cpus(); #endif From f6175f5bfb4c9f2ed32758c95f765b529b1a7f15 Mon Sep 17 00:00:00 2001 From: Tomoki Sekiyama Date: Mon, 28 May 2012 18:09:18 +0900 Subject: [PATCH 126/178] x86/ioapic: Fix NULL pointer dereference on CPU hotplug after disabling irqs In current Linux, the percpu variable `vector_irq' is not cleared on offlined cpus when devices' irqs are disabled. If a cpu that still has the disabled irqs in vector_irq is hotplugged, __setup_vector_irq() hits an invalid irq vector and may crash. This bug can be reproduced as follows: # echo 0 > /sys/devices/system/cpu/cpu7/online # modprobe -r some_driver_using_interrupts # vector_irq@cpu7 uncleared # echo 1 > /sys/devices/system/cpu/cpu7/online # kernel may crash This patch fixes this bug by clearing vector_irq in __clear_irq_vector() even if the cpu is offlined. Signed-off-by: Tomoki Sekiyama Acked-by: Thomas Gleixner Cc: yrl.pp-manager.tt@hitachi.com Cc: ltc-kernel@ml.yrl.intra.hitachi.co.jp Cc: Suresh Siddha Cc: Yinghai Lu Cc: Alexander Gordeev Link: http://lkml.kernel.org/r/4FC340BE.7080101@hitachi.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index ac96561d1a99..5f0ff597437c 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1195,7 +1195,7 @@ static void __clear_irq_vector(int irq, struct irq_cfg *cfg) BUG_ON(!cfg->vector); vector = cfg->vector; - for_each_cpu_and(cpu, cfg->domain, cpu_online_mask) + for_each_cpu(cpu, cfg->domain) per_cpu(vector_irq, cpu)[vector] = -1; cfg->vector = 0; @@ -1203,7 +1203,7 @@ static void __clear_irq_vector(int irq, struct irq_cfg *cfg) if (likely(!cfg->move_in_progress)) return; - for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) { + for_each_cpu(cpu, cfg->old_domain) { for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { if (per_cpu(vector_irq, cpu)[vector] != irq) From ceb1cbac8eda66cf0f889def226b4e82f8ff857b Mon Sep 17 00:00:00 2001 From: Kamalesh Babulal Date: Thu, 31 May 2012 13:07:38 +0530 Subject: [PATCH 127/178] sched/x86: Calculate booted cores after construction of sibling_mask Commit 316ad248307fb ("sched/x86: Rewrite set_cpu_sibling_map()") broke the booted_cores accounting. The problem is that the booted_cores accounting needs all the sibling links set up. So restore the second loop and add a comment as to why it's needed.
On qemu booted with -smp sockets=1,cores=2,threads=2; Before: $ grep cores /proc/cpuinfo cpu cores : 2 cpu cores : 1 cpu cores : 4 cpu cores : 3 With the patch: $ grep cores /proc/cpuinfo cpu cores : 2 cpu cores : 2 cpu cores : 2 cpu cores : 2 Reported-by: Prarit Bhargava Reported-by: Borislav Petkov Signed-off-by: Kamalesh Babulal Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120531073738.GH7511@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index fd019d78b1f4..3fab55bea29b 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -382,6 +382,15 @@ void __cpuinit set_cpu_sibling_map(int cpu) if ((i == cpu) || (has_mc && match_llc(c, o))) link_mask(llc_shared, cpu, i); + } + + /* + * This needs a separate iteration over the cpus because we rely on all + * cpu_sibling_mask links to be set-up. + */ + for_each_cpu(i, cpu_sibling_setup_mask) { + o = &cpu_data(i); + if ((i == cpu) || (has_mc && match_mc(c, o))) { link_mask(core, cpu, i); From 10717dcde10d09f9fcee53a12a4236af1a82b484 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Wed, 6 Jun 2012 14:52:51 +0800 Subject: [PATCH 128/178] sched/numa: Load balance between remote nodes Commit cb83b629b ("sched/numa: Rewrite the CONFIG_NUMA sched domain support") removed the NODE sched domain and started checking whether the node distance in the SLIT table is farther than REMOTE_DISTANCE; if so, the load balance chance at exec/fork/wake_affine points is lost. But actually, even when the node distance is farther than REMOTE_DISTANCE, modern CPUs have QPI-like interconnects, which ensure that memory access is not too slow between nodes. So the above change in behavior on NUMA machines causes a performance regression on various benchmarks: hackbench, tbench, netperf, oltp, etc. This patch restores the old scheduler behavior on all my Intel platforms: NHM EP/EX, WSM EP, SNB EP/EP4S, and thus fixes the performance regressions. (all of them have just two distance values, 10 and 21) Signed-off-by: Alex Shi Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1338965571-9812-1-git-send-email-alex.shi@intel.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c46958e26121..6546083af3e0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6321,7 +6321,7 @@ static int sched_domains_curr_level; static inline int sd_local_flags(int level) { - if (sched_domains_numa_distance[level] > REMOTE_DISTANCE) + if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE) return 0; return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE; From 7f1b43936f0ecad14770634c021cf4a929aec74d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 17 May 2012 21:19:46 +0200 Subject: [PATCH 129/178] sched/rt: Fix lockdep annotation within find_lock_lowest_rq() Roland Dreier reported spurious, hard to trigger lockdep warnings within the scheduler - without any real lockup. This bit gives us the right clue: > [89945.640512] [] double_lock_balance+0x5a/0x90 > [89945.640568] [] push_rt_task+0xc6/0x290 If you look at that code you'll find the double_lock_balance() in question is the one in find_lock_lowest_rq() [yay for inlining]. Now find_lock_lowest_rq() has a bug:
it fails to use double_unlock_balance() in one exit path, if this results in a retry in push_rt_task() we'll call double_lock_balance() again, at which point we'll run into said lockdep confusion. Reported-by: Roland Dreier Acked-by: Steven Rostedt Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1337282386.4281.77.camel@twins Signed-off-by: Ingo Molnar --- kernel/sched/rt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 2a4e8dffbd6b..573e1ca01102 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1562,7 +1562,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) task_running(rq, task) || !task->on_rq)) { - raw_spin_unlock(&lowest_rq->lock); + double_unlock_balance(rq, lowest_rq); lowest_rq = NULL; break; } From c1174876874dcf8986806e4dad3d7d07af20b439 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 31 May 2012 14:47:33 +0200 Subject: [PATCH 130/178] sched: Fix domain iteration Weird topologies can lead to asymmetric domain setups. This needs further consideration since these setups are typically non-minimal too. For now, make it work by adding an extra mask selecting which CPUs are allowed to iterate up. The topology that triggered it is the one from David Rientjes: 10 20 20 30 20 10 20 20 20 20 10 20 30 20 20 10 resulting in boxes that wouldn't even boot. Reported-by: David Rientjes Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-3p86l9cuaqnxz7uxsojmz5rm@git.kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 11 ++++++++ kernel/sched/core.c | 64 +++++++++++++++++++++++++++++++++++++------ kernel/sched/fair.c | 5 ++-- kernel/sched/sched.h | 2 ++ 4 files changed, 72 insertions(+), 10 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 6029d8c54476..ac321d753470 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -876,6 +876,8 @@ struct sched_group_power { * Number of busy cpus in this group. */ atomic_t nr_busy_cpus; + + unsigned long cpumask[0]; /* iteration mask */ }; struct sched_group { @@ -900,6 +902,15 @@ static inline struct cpumask *sched_group_cpus(struct sched_group *sg) return to_cpumask(sg->cpumask); } +/* + * cpumask masking which cpus in the group are allowed to iterate up the domain + * tree. + */ +static inline struct cpumask *sched_group_mask(struct sched_group *sg) +{ + return to_cpumask(sg->sgp->cpumask); +} + /** * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. * @group: The group whose first cpu is to be returned. diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6546083af3e0..781acb91a50a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5994,6 +5994,44 @@ struct sched_domain_topology_level { struct sd_data data; }; +/* + * Build an iteration mask that can exclude certain CPUs from the upwards + * domain traversal. + * + * Asymmetric node setups can result in situations where the domain tree is of + * unequal depth, make sure to skip domains that already cover the entire + * range. + * + * In that case build_sched_domains() will have terminated the iteration early + * and our sibling sd spans will be empty. Domains should always include the + * cpu they're built on, so check that. 
+ * + */ +static void build_group_mask(struct sched_domain *sd, struct sched_group *sg) +{ + const struct cpumask *span = sched_domain_span(sd); + struct sd_data *sdd = sd->private; + struct sched_domain *sibling; + int i; + + for_each_cpu(i, span) { + sibling = *per_cpu_ptr(sdd->sd, i); + if (!cpumask_test_cpu(i, sched_domain_span(sibling))) + continue; + + cpumask_set_cpu(i, sched_group_mask(sg)); + } +} + +/* + * Return the canonical balance cpu for this group, this is the first cpu + * of this group that's also in the iteration mask. + */ +int group_balance_cpu(struct sched_group *sg) +{ + return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg)); +} + static int build_overlap_sched_groups(struct sched_domain *sd, int cpu) { @@ -6012,6 +6050,12 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) if (cpumask_test_cpu(i, covered)) continue; + child = *per_cpu_ptr(sdd->sd, i); + + /* See the comment near build_group_mask(). */ + if (!cpumask_test_cpu(i, sched_domain_span(child))) + continue; + sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), GFP_KERNEL, cpu_to_node(cpu)); @@ -6019,8 +6063,6 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) goto fail; sg_span = sched_group_cpus(sg); - - child = *per_cpu_ptr(sdd->sd, i); if (child->child) { child = child->child; cpumask_copy(sg_span, sched_domain_span(child)); @@ -6030,13 +6072,18 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) cpumask_or(covered, covered, sg_span); sg->sgp = *per_cpu_ptr(sdd->sgp, i); - atomic_inc(&sg->sgp->ref); + if (atomic_inc_return(&sg->sgp->ref) == 1) + build_group_mask(sd, sg); + + /* + * Make sure the first group of this domain contains the + * canonical balance cpu. Otherwise the sched_domain iteration + * breaks. See update_sg_lb_stats(). 
+ */ if ((!groups && cpumask_test_cpu(cpu, sg_span)) || - cpumask_first(sg_span) == cpu) { - WARN_ON_ONCE(!cpumask_test_cpu(cpu, sg_span)); + group_balance_cpu(sg) == cpu) groups = sg; - } if (!first) first = sg; @@ -6109,6 +6156,7 @@ build_sched_groups(struct sched_domain *sd, int cpu) cpumask_clear(sched_group_cpus(sg)); sg->sgp->power = 0; + cpumask_setall(sched_group_mask(sg)); for_each_cpu(j, span) { if (get_group(j, sdd, NULL) != group) @@ -6150,7 +6198,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) sg = sg->next; } while (sg != sd->groups); - if (cpu != group_first_cpu(sg)) + if (cpu != group_balance_cpu(sg)) return; update_group_power(sd, cpu); @@ -6525,7 +6573,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map) *per_cpu_ptr(sdd->sg, j) = sg; - sgp = kzalloc_node(sizeof(struct sched_group_power), + sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(), GFP_KERNEL, cpu_to_node(j)); if (!sgp) return -ENOMEM; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b2a2d236f27b..54cbaa4e7b37 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3652,7 +3652,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, int i; if (local_group) - balance_cpu = group_first_cpu(group); + balance_cpu = group_balance_cpu(group); /* Tally up the load of all CPUs in the group */ max_cpu_load = 0; @@ -3667,7 +3667,8 @@ static inline void update_sg_lb_stats(struct lb_env *env, /* Bias balancing toward cpus of our domain */ if (local_group) { - if (idle_cpu(i) && !first_idle_cpu) { + if (idle_cpu(i) && !first_idle_cpu && + cpumask_test_cpu(i, sched_group_mask(group))) { first_idle_cpu = 1; balance_cpu = i; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ba9dccfd24ce..6d52cea7f33d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -526,6 +526,8 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag) DECLARE_PER_CPU(struct sched_domain *, sd_llc); DECLARE_PER_CPU(int, sd_llc_id); +extern int group_balance_cpu(struct sched_group *sg); + #endif /* CONFIG_SMP */ #include "stats.h" From c3decf0dfbc95736b7c0ab68fa4e5854c4734da9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 31 May 2012 12:05:32 +0200 Subject: [PATCH 131/178] sched: Always initialize cpu-power Often when we run into mis-shapen topologies the balance iteration fails to update the cpu power properly and we'll end up in /0 traps. Always initialize the cpu-power to a semi-sane value so that we can at least boot the machine, even if the load-balancer might not function correctly. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-3lbhyj25sr169ha7z3qht5na@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 13 ++++++++++++- kernel/sched/fair.c | 2 +- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 781acb91a50a..725ee7c1c8cf 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5604,7 +5604,12 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, break; } - if (!group->sgp->power) { + /* + * Even though we initialize ->power to something semi-sane, + * we leave power_orig unset. This allows us to detect if + * domain iteration is still funny without causing /0 traps. 
+ */ + if (!group->sgp->power_orig) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: domain->cpu_power not " "set\n"); @@ -6075,6 +6080,12 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) if (atomic_inc_return(&sg->sgp->ref) == 1) build_group_mask(sd, sg); + /* + * Initialize sgp->power such that even if we mess up the + * domains and no possible iteration will get us here, we won't + * die on a /0 trap. + */ + sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span); /* * Make sure the first group of this domain contains the diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 54cbaa4e7b37..c9fd6d673d05 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3602,7 +3602,7 @@ void update_group_power(struct sched_domain *sd, int cpu) } while (group != child->groups); } - sdg->sgp->power = power; + sdg->sgp->power_orig = sdg->sgp->power = power; } /* From d039ac60800fe8ed8522ec3b9ca796aaf748c18b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 31 May 2012 21:20:16 +0200 Subject: [PATCH 132/178] sched: Validate assumptions in sched_init_numa() Add some code to validate the assumptions we're making and output warnings if they do not hold. If this triggers, we want to know about it. Signed-off-by: Peter Zijlstra Cc: Alex Shi Link: http://lkml.kernel.org/n/tip-6uc3wk5s9udxtdl9cnku0vtt@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 99 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 80 insertions(+), 19 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 725ee7c1c8cf..2bdd17616437 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5556,15 +5556,20 @@ static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ #ifdef CONFIG_SCHED_DEBUG -static __read_mostly int sched_domain_debug_enabled; +static __read_mostly int sched_debug_enabled; -static int __init sched_domain_debug_setup(char *str) +static int __init sched_debug_setup(char *str) { - sched_domain_debug_enabled = 1; + sched_debug_enabled = 1; return 0; } -early_param("sched_debug", sched_domain_debug_setup); +early_param("sched_debug", sched_debug_setup); + +static inline bool sched_debug(void) +{ + return sched_debug_enabled; +} static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, struct cpumask *groupmask) @@ -5657,7 +5662,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) { int level = 0; - if (!sched_domain_debug_enabled) + if (!sched_debug_enabled) return; if (!sd) { @@ -5678,6 +5683,10 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) } #else /* !CONFIG_SCHED_DEBUG */ # define sched_domain_debug(sd, cpu) do { } while (0) +static inline bool sched_debug(void) +{ + return false; +} #endif /* CONFIG_SCHED_DEBUG */ static int sd_degenerate(struct sched_domain *sd) @@ -6373,7 +6382,6 @@ static struct sched_domain_topology_level *sched_domain_topology = default_topol #ifdef CONFIG_NUMA static int sched_domains_numa_levels; -static int sched_domains_numa_scale; static int *sched_domains_numa_distance; static struct cpumask ***sched_domains_numa_masks; static int sched_domains_curr_level; @@ -6438,6 +6446,42 @@ static const struct cpumask *sd_numa_mask(int cpu) return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; } +static void sched_numa_warn(const char *str) +{ + static int done = false; + int i,j; + + if (done) + return; + + done = true; + + printk(KERN_WARNING "ERROR: %s\n\n", str); + + for (i = 0; i < nr_node_ids; i++) { + printk(KERN_WARNING " 
"); + for (j = 0; j < nr_node_ids; j++) + printk(KERN_CONT "%02d ", node_distance(i,j)); + printk(KERN_CONT "\n"); + } + printk(KERN_WARNING "\n"); +} + +static bool find_numa_distance(int distance) +{ + int i; + + if (distance == node_distance(0, 0)) + return true; + + for (i = 0; i < sched_domains_numa_levels; i++) { + if (sched_domains_numa_distance[i] == distance) + return true; + } + + return false; +} + static void sched_init_numa(void) { int next_distance, curr_distance = node_distance(0, 0); @@ -6445,7 +6489,6 @@ static void sched_init_numa(void) int level = 0; int i, j, k; - sched_domains_numa_scale = curr_distance; sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL); if (!sched_domains_numa_distance) return; @@ -6456,23 +6499,41 @@ static void sched_init_numa(void) * * Assumes node_distance(0,j) includes all distances in * node_distance(i,j) in order to avoid cubic time. - * - * XXX: could be optimized to O(n log n) by using sort() */ next_distance = curr_distance; for (i = 0; i < nr_node_ids; i++) { for (j = 0; j < nr_node_ids; j++) { - int distance = node_distance(0, j); - if (distance > curr_distance && - (distance < next_distance || - next_distance == curr_distance)) - next_distance = distance; + for (k = 0; k < nr_node_ids; k++) { + int distance = node_distance(i, k); + + if (distance > curr_distance && + (distance < next_distance || + next_distance == curr_distance)) + next_distance = distance; + + /* + * While not a strong assumption it would be nice to know + * about cases where if node A is connected to B, B is not + * equally connected to A. + */ + if (sched_debug() && node_distance(k, i) != distance) + sched_numa_warn("Node-distance not symmetric"); + + if (sched_debug() && i && !find_numa_distance(distance)) + sched_numa_warn("Node-0 not representative"); + } + if (next_distance != curr_distance) { + sched_domains_numa_distance[level++] = next_distance; + sched_domains_numa_levels = level; + curr_distance = next_distance; + } else break; } - if (next_distance != curr_distance) { - sched_domains_numa_distance[level++] = next_distance; - sched_domains_numa_levels = level; - curr_distance = next_distance; - } else break; + + /* + * In case of sched_debug() we verify the above assumption. + */ + if (!sched_debug()) + break; } /* * 'level' contains the number of unique distances, excluding the From b430f7c4706aeba4270c7ab7744fc504b9315e1c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Jun 2012 15:30:31 +0200 Subject: [PATCH 133/178] perf/x86: Fix Intel shared extra MSR allocation Zheng Yan reported that event group validation can wreck event state when Intel extra_reg allocation changes event state. Validation shouldn't change any persistent state. Cloning events in validate_{event,group}() isn't really pretty either, so add a few special cases to avoid modifying the event state. The code is restructured to minimize the special case impact. 
Reported-by: Zheng Yan Acked-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1338903031.28282.175.camel@twins Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 1 + arch/x86/kernel/cpu/perf_event.h | 1 + arch/x86/kernel/cpu/perf_event_intel.c | 92 ++++++++++++++++++-------- 3 files changed, 66 insertions(+), 28 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index e049d6da0183..cb608383e4f6 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1496,6 +1496,7 @@ static struct cpu_hw_events *allocate_fake_cpuc(void) if (!cpuc->shared_regs) goto error; } + cpuc->is_fake = 1; return cpuc; error: free_fake_cpuc(cpuc); diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 6638aaf54493..83794d8e6af0 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -117,6 +117,7 @@ struct cpu_hw_events { struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */ unsigned int group_flag; + int is_fake; /* * Intel DebugStore bits diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 166546ec6aef..965baa2fa790 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1119,27 +1119,33 @@ intel_bts_constraints(struct perf_event *event) return NULL; } -static bool intel_try_alt_er(struct perf_event *event, int orig_idx) +static int intel_alt_er(int idx) { if (!(x86_pmu.er_flags & ERF_HAS_RSP_1)) - return false; + return idx; - if (event->hw.extra_reg.idx == EXTRA_REG_RSP_0) { - event->hw.config &= ~INTEL_ARCH_EVENT_MASK; - event->hw.config |= 0x01bb; - event->hw.extra_reg.idx = EXTRA_REG_RSP_1; - event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1; - } else if (event->hw.extra_reg.idx == EXTRA_REG_RSP_1) { + if (idx == EXTRA_REG_RSP_0) + return EXTRA_REG_RSP_1; + + if (idx == EXTRA_REG_RSP_1) + return EXTRA_REG_RSP_0; + + return idx; +} + +static void intel_fixup_er(struct perf_event *event, int idx) +{ + event->hw.extra_reg.idx = idx; + + if (idx == EXTRA_REG_RSP_0) { event->hw.config &= ~INTEL_ARCH_EVENT_MASK; event->hw.config |= 0x01b7; - event->hw.extra_reg.idx = EXTRA_REG_RSP_0; event->hw.extra_reg.reg = MSR_OFFCORE_RSP_0; + } else if (idx == EXTRA_REG_RSP_1) { + event->hw.config &= ~INTEL_ARCH_EVENT_MASK; + event->hw.config |= 0x01bb; + event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1; } - - if (event->hw.extra_reg.idx == orig_idx) - return false; - - return true; } /* @@ -1157,14 +1163,18 @@ __intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc, struct event_constraint *c = &emptyconstraint; struct er_account *era; unsigned long flags; - int orig_idx = reg->idx; + int idx = reg->idx; - /* already allocated shared msr */ - if (reg->alloc) + /* + * reg->alloc can be set due to existing state, so for fake cpuc we + * need to ignore this, otherwise we might fail to allocate proper fake + * state for this extra reg constraint. Also see the comment below. 
+ */ + if (reg->alloc && !cpuc->is_fake) return NULL; /* call x86_get_event_constraint() */ again: - era = &cpuc->shared_regs->regs[reg->idx]; + era = &cpuc->shared_regs->regs[idx]; /* * we use spin_lock_irqsave() to avoid lockdep issues when * passing a fake cpuc @@ -1173,6 +1183,29 @@ again: if (!atomic_read(&era->ref) || era->config == reg->config) { + /* + * If its a fake cpuc -- as per validate_{group,event}() we + * shouldn't touch event state and we can avoid doing so + * since both will only call get_event_constraints() once + * on each event, this avoids the need for reg->alloc. + * + * Not doing the ER fixup will only result in era->reg being + * wrong, but since we won't actually try and program hardware + * this isn't a problem either. + */ + if (!cpuc->is_fake) { + if (idx != reg->idx) + intel_fixup_er(event, idx); + + /* + * x86_schedule_events() can call get_event_constraints() + * multiple times on events in the case of incremental + * scheduling(). reg->alloc ensures we only do the ER + * allocation once. + */ + reg->alloc = 1; + } + /* lock in msr value */ era->config = reg->config; era->reg = reg->reg; @@ -1180,17 +1213,17 @@ again: /* one more user */ atomic_inc(&era->ref); - /* no need to reallocate during incremental event scheduling */ - reg->alloc = 1; - /* * need to call x86_get_event_constraint() * to check if associated event has constraints */ c = NULL; - } else if (intel_try_alt_er(event, orig_idx)) { - raw_spin_unlock_irqrestore(&era->lock, flags); - goto again; + } else { + idx = intel_alt_er(idx); + if (idx != reg->idx) { + raw_spin_unlock_irqrestore(&era->lock, flags); + goto again; + } } raw_spin_unlock_irqrestore(&era->lock, flags); @@ -1204,11 +1237,14 @@ __intel_shared_reg_put_constraints(struct cpu_hw_events *cpuc, struct er_account *era; /* - * only put constraint if extra reg was actually - * allocated. Also takes care of event which do - * not use an extra shared reg + * Only put constraint if extra reg was actually allocated. Also takes + * care of event which do not use an extra shared reg. + * + * Also, if this is a fake cpuc we shouldn't touch any event state + * (reg->alloc) and we don't care about leaving inconsistent cpuc state + * either since it'll be thrown out. */ - if (!reg->alloc) + if (!reg->alloc || cpuc->is_fake) return; era = &cpuc->shared_regs->regs[reg->idx]; From cccb9ba9e4ee0d750265f53de9258df69655c40b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Jun 2012 10:26:43 +0200 Subject: [PATCH 134/178] perf/x86: Implement cycles:p for SNB/IVB Now that there's finally a chip with working PEBS (IvyBridge), we can enable the hardware and implement cycles:p for SNB/IVB. 
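At the user-visible level, "cycles:p" corresponds to a hardware cycles event requested with a nonzero precise_ip; a minimal userspace sketch of such a request (error handling omitted):

#include <linux/perf_event.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Open a precise cycles counter ("cycles:p") for the calling thread. */
static int open_precise_cycles(void)
{
        struct perf_event_attr attr;

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = PERF_TYPE_HARDWARE;
        attr.config = PERF_COUNT_HW_CPU_CYCLES;
        attr.precise_ip = 1;    /* this is what triggers the PEBS alias */

        return syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
}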
Cc: Stephane Eranian Requested-and-tested-by: Linus Torvalds Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1338884803.28282.153.camel@twins Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.h | 1 + arch/x86/kernel/cpu/perf_event_intel.c | 50 +++++++++++++++++++++----- 2 files changed, 43 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 83794d8e6af0..7241e2fc3c17 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -365,6 +365,7 @@ struct x86_pmu { int pebs_record_size; void (*drain_pebs)(struct pt_regs *regs); struct event_constraint *pebs_constraints; + void (*pebs_aliases)(struct perf_event *event); /* * Intel LBR diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 965baa2fa790..2312c1ff1b19 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1336,15 +1336,9 @@ static void intel_put_event_constraints(struct cpu_hw_events *cpuc, intel_put_shared_regs_event_constraints(cpuc, event); } -static int intel_pmu_hw_config(struct perf_event *event) +static void intel_pebs_aliases_core2(struct perf_event *event) { - int ret = x86_pmu_hw_config(event); - - if (ret) - return ret; - - if (event->attr.precise_ip && - (event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) { + if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) { /* * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P * (0x003c) so that we can use it with PEBS. @@ -1365,10 +1359,48 @@ static int intel_pmu_hw_config(struct perf_event *event) */ u64 alt_config = X86_CONFIG(.event=0xc0, .inv=1, .cmask=16); + alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK); + event->hw.config = alt_config; + } +} + +static void intel_pebs_aliases_snb(struct perf_event *event) +{ + if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) { + /* + * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P + * (0x003c) so that we can use it with PEBS. + * + * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't + * PEBS capable. However we can use UOPS_RETIRED.ALL + * (0x01c2), which is a PEBS capable event, to get the same + * count. + * + * UOPS_RETIRED.ALL counts the number of cycles that retires + * CNTMASK micro-ops. By setting CNTMASK to a value (16) + * larger than the maximum number of micro-ops that can be + * retired per cycle (4) and then inverting the condition, we + * count all cycles that retire 16 or less micro-ops, which + * is every cycle. + * + * Thereby we gain a PEBS capable cycle counter. 
+ */ + u64 alt_config = X86_CONFIG(.event=0xc2, .umask=0x01, .inv=1, .cmask=16); alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK); event->hw.config = alt_config; } +} + +static int intel_pmu_hw_config(struct perf_event *event) +{ + int ret = x86_pmu_hw_config(event); + + if (ret) + return ret; + + if (event->attr.precise_ip && x86_pmu.pebs_aliases) + x86_pmu.pebs_aliases(event); if (intel_pmu_needs_lbr_smpl(event)) { ret = intel_pmu_setup_lbr_filter(event); @@ -1643,6 +1675,7 @@ static __initconst const struct x86_pmu intel_pmu = { .max_period = (1ULL << 31) - 1, .get_event_constraints = intel_get_event_constraints, .put_event_constraints = intel_put_event_constraints, + .pebs_aliases = intel_pebs_aliases_core2, .format_attrs = intel_arch3_formats_attr, @@ -1885,6 +1918,7 @@ __init int intel_pmu_init(void) x86_pmu.event_constraints = intel_snb_event_constraints; x86_pmu.pebs_constraints = intel_snb_pebs_event_constraints; + x86_pmu.pebs_aliases = intel_pebs_aliases_snb; x86_pmu.extra_regs = intel_snb_extra_regs; /* all extra regs are per-cpu when HT is on */ x86_pmu.er_flags |= ERF_HAS_RSP_1; From b6db437ba8322f5cee0bd355ad2ef9f73c413754 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Jun 2012 10:26:43 +0200 Subject: [PATCH 135/178] perf/x86: Enable/Add IvyBridge hardware support Implement rudimentary IVB perf support. The SDM states it's identical to SNB with the exception of the exact event tables, but a quick look suggests they're similar enough. Also mark SNB-EP as broken for now. Requested-and-tested-by: Linus Torvalds Cc: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1338884803.28282.153.camel@twins Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 2312c1ff1b19..187c294bc658 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1909,8 +1909,9 @@ __init int intel_pmu_init(void) break; case 42: /* SandyBridge */ - x86_add_quirk(intel_sandybridge_quirk); case 45: /* SandyBridge, "Romely-EP" */ + x86_add_quirk(intel_sandybridge_quirk); + case 58: /* IvyBridge */ memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids)); From 8440ccb43fc0ecffcf1acee0273d766e6a8cd51d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Jun 2012 10:26:43 +0200 Subject: [PATCH 136/178] perf/x86: Update SNB PEBS constraints Afaict there's no need to (incompletely) iterate the MEM_UOPS_RETIRED.* umask state. 
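Assuming the usual constraint-macro semantics (match on the event code alone, schedulable on counters 0-3 per the 0xf counter mask), a single event-level entry of the form below stands in for the eight per-umask entries removed in the diff:

INTEL_EVENT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.*, any umask */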
Signed-off-by: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1338884803.28282.153.camel@twins Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_ds.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 5a3edc27f6e5..35e2192df9f4 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -400,14 +400,7 @@ struct event_constraint intel_snb_pebs_event_constraints[] = { INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */ INTEL_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */ INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.* */ - INTEL_UEVENT_CONSTRAINT(0x11d0, 0xf), /* MEM_UOP_RETIRED.STLB_MISS_LOADS */ - INTEL_UEVENT_CONSTRAINT(0x12d0, 0xf), /* MEM_UOP_RETIRED.STLB_MISS_STORES */ - INTEL_UEVENT_CONSTRAINT(0x21d0, 0xf), /* MEM_UOP_RETIRED.LOCK_LOADS */ - INTEL_UEVENT_CONSTRAINT(0x22d0, 0xf), /* MEM_UOP_RETIRED.LOCK_STORES */ - INTEL_UEVENT_CONSTRAINT(0x41d0, 0xf), /* MEM_UOP_RETIRED.SPLIT_LOADS */ - INTEL_UEVENT_CONSTRAINT(0x42d0, 0xf), /* MEM_UOP_RETIRED.SPLIT_STORES */ - INTEL_UEVENT_CONSTRAINT(0x81d0, 0xf), /* MEM_UOP_RETIRED.ANY_LOADS */ - INTEL_UEVENT_CONSTRAINT(0x82d0, 0xf), /* MEM_UOP_RETIRED.ANY_STORES */ + INTEL_EVENT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */ INTEL_EVENT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ INTEL_EVENT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ INTEL_UEVENT_CONSTRAINT(0x02d4, 0xf), /* MEM_LOAD_UOPS_MISC_RETIRED.LLC_MISS */ From 67384fe3fd450536342330f684ea1f7dcaef8130 Mon Sep 17 00:00:00 2001 From: Eugeni Dodonov Date: Wed, 6 Jun 2012 11:59:06 -0300 Subject: [PATCH 137/178] char/agp: add another Ironlake host bridge This seems to come on Gigabyte H55M-S2V and was discovered through the https://bugs.freedesktop.org/show_bug.cgi?id=50381 debugging. 
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=50381 Signed-off-by: Eugeni Dodonov Cc: stable@vger.kernel.org Signed-off-by: Daniel Vetter --- drivers/char/agp/intel-agp.c | 1 + drivers/char/agp/intel-agp.h | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/char/agp/intel-agp.c b/drivers/char/agp/intel-agp.c index 764f70c5e690..0a4185279417 100644 --- a/drivers/char/agp/intel-agp.c +++ b/drivers/char/agp/intel-agp.c @@ -898,6 +898,7 @@ static struct pci_device_id agp_intel_pci_table[] = { ID(PCI_DEVICE_ID_INTEL_B43_HB), ID(PCI_DEVICE_ID_INTEL_B43_1_HB), ID(PCI_DEVICE_ID_INTEL_IRONLAKE_D_HB), + ID(PCI_DEVICE_ID_INTEL_IRONLAKE_D2_HB), ID(PCI_DEVICE_ID_INTEL_IRONLAKE_M_HB), ID(PCI_DEVICE_ID_INTEL_IRONLAKE_MA_HB), ID(PCI_DEVICE_ID_INTEL_IRONLAKE_MC2_HB), diff --git a/drivers/char/agp/intel-agp.h b/drivers/char/agp/intel-agp.h index c0091753a0d1..8e2d9140f300 100644 --- a/drivers/char/agp/intel-agp.h +++ b/drivers/char/agp/intel-agp.h @@ -212,6 +212,7 @@ #define PCI_DEVICE_ID_INTEL_G41_HB 0x2E30 #define PCI_DEVICE_ID_INTEL_G41_IG 0x2E32 #define PCI_DEVICE_ID_INTEL_IRONLAKE_D_HB 0x0040 +#define PCI_DEVICE_ID_INTEL_IRONLAKE_D2_HB 0x0069 #define PCI_DEVICE_ID_INTEL_IRONLAKE_D_IG 0x0042 #define PCI_DEVICE_ID_INTEL_IRONLAKE_M_HB 0x0044 #define PCI_DEVICE_ID_INTEL_IRONLAKE_MA_HB 0x0062 From a841f8cef4bb124f0f5563314d0beaf2e1249d72 Mon Sep 17 00:00:00 2001 From: Dimitri Sivanich Date: Tue, 5 Jun 2012 13:44:36 -0500 Subject: [PATCH 138/178] sched: Fix the relax_domain_level boot parameter It does not get processed because sched_domain_level_max is 0 at the time that setup_relax_domain_level() is run. Simply accept the value as it is, as we don't know the value of sched_domain_level_max until sched domain construction is completed. Fix sched_relax_domain_level in cpuset. The build_sched_domain() routine calls the set_domain_attribute() routine prior to setting the sd->level, however, the set_domain_attribute() routine relies on the sd->level to decide whether idle load balancing will be off/on. 
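A simplified sketch of the dependency (names abbreviated, and the in-tree function handles more cases): set_domain_attribute() compares the requested level against sd->level, so it only behaves correctly once sd->level has been assigned, which is what the reordering below ensures.

static void set_domain_attribute_sketch(struct sched_domain *sd,
                                        struct sched_domain_attr *attr)
{
        int request = attr ? attr->relax_domain_level
                           : default_relax_domain_level;

        /* sd->level must already be valid at this point */
        if (request >= 0 && request < sd->level)
                sd->flags &= ~(SD_BALANCE_WAKE | SD_BALANCE_NEWIDLE);
}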
Signed-off-by: Dimitri Sivanich Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120605184436.GA15668@sgi.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2bdd17616437..d5594a4268d4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6268,11 +6268,8 @@ int sched_domain_level_max; static int __init setup_relax_domain_level(char *str) { - unsigned long val; - - val = simple_strtoul(str, NULL, 0); - if (val < sched_domain_level_max) - default_relax_domain_level = val; + if (kstrtoint(str, 0, &default_relax_domain_level)) + pr_warn("Unable to set relax_domain_level\n"); return 1; } @@ -6698,7 +6695,6 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, if (!sd) return child; - set_domain_attribute(sd, attr); cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); if (child) { sd->level = child->level + 1; @@ -6706,6 +6702,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, child->parent = sd; } sd->child = child; + set_domain_attribute(sd, attr); return sd; } From 302fa4b58ac754a6da13f4f5546f710fecc3b945 Mon Sep 17 00:00:00 2001 From: Arun Sharma Date: Fri, 20 Apr 2012 15:41:33 -0700 Subject: [PATCH 139/178] perf/x86: Allow multiple stacks Without this patch, applications with two different stack regions (eg: native stack vs JIT stack) get truncated callchains even when RBP chaining is present. GDB shows proper stack traces and the frame pointer chaining is intact. This patch disables the (fp < RSP) check, hoping that other checks in the code save the day for us. In our limited testing, this didn't seem to break anything. In the long term, we could potentially have userspace advise the kernel on the range of valid stack addresses, so we don't spend a lot of time unwinding from bogus addresses. Signed-off-by: Arun Sharma CC: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Mike Galbraith Cc: Paul Mackerras Cc: Stephane Eranian Cc: Namhyung Kim Cc: Tom Zanussi Cc: linux-kernel@vger.kernel.org Cc: linux-perf-users@vger.kernel.org Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1334961696-19580-2-git-send-email-asharma@fb.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index cb608383e4f6..e78bc256aea8 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1781,9 +1781,6 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) if (bytes != sizeof(frame)) break; - if (fp < compat_ptr(regs->sp)) - break; - perf_callchain_store(entry, frame.return_address); fp = compat_ptr(frame.next_frame); } @@ -1827,9 +1824,6 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) if (bytes != sizeof(frame)) break; - if ((unsigned long)fp < regs->sp) - break; - perf_callchain_store(entry, frame.return_address); fp = frame.next_frame; } From 0b0d9cf6ec7bab91977da2d71c09157f110f7c2e Mon Sep 17 00:00:00 2001 From: Arun Sharma Date: Fri, 20 Apr 2012 15:41:34 -0700 Subject: [PATCH 140/178] perf: Limit callchains to 127 Stack depth of 255 seems excessive, given that copy_from_user_nmi() could be slow. 
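Rough arithmetic behind the new limit, assuming one u64 per callchain slot as in struct perf_callchain_entry:

/*
 * Worst-case bytes copied per sampled callchain, in NMI context:
 *   255 slots * sizeof(u64) = 2040 bytes
 *   127 slots * sizeof(u64) = 1016 bytes  -> roughly half the work
 */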
Signed-off-by: Arun Sharma Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1334961696-19580-3-git-send-email-asharma@fb.com Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 1817d4015e5f..45db49f64bb4 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -555,7 +555,7 @@ enum perf_event_type { PERF_RECORD_MAX, /* non-ABI */ }; -#define PERF_MAX_STACK_DEPTH 255 +#define PERF_MAX_STACK_DEPTH 127 enum perf_callchain_context { PERF_CONTEXT_HV = (__u64)-32, From bc6ca7b342d5ae15c3ba3081fd40271b8039fb25 Mon Sep 17 00:00:00 2001 From: Arun Sharma Date: Fri, 20 Apr 2012 15:41:35 -0700 Subject: [PATCH 141/178] perf/x86: Check if user fp is valid Signed-off-by: Arun Sharma Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1334961696-19580-4-git-send-email-asharma@fb.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uaccess.h | 12 ++++++------ arch/x86/kernel/cpu/perf_event.c | 12 ++++++++++++ 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 04cd6882308e..e1f3a17034fc 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -33,9 +33,8 @@ #define segment_eq(a, b) ((a).seg == (b).seg) #define user_addr_max() (current_thread_info()->addr_limit.seg) -#define __addr_ok(addr) \ - ((unsigned long __force)(addr) < \ - (current_thread_info()->addr_limit.seg)) +#define __addr_ok(addr) \ + ((unsigned long __force)(addr) < user_addr_max()) /* * Test whether a block of memory is a valid user space address. @@ -47,14 +46,14 @@ * This needs 33-bit (65-bit for x86_64) arithmetic. We have a carry... */ -#define __range_not_ok(addr, size) \ +#define __range_not_ok(addr, size, limit) \ ({ \ unsigned long flag, roksum; \ __chk_user_ptr(addr); \ asm("add %3,%1 ; sbb %0,%0 ; cmp %1,%4 ; sbb $0,%0" \ : "=&r" (flag), "=r" (roksum) \ : "1" (addr), "g" ((long)(size)), \ - "rm" (current_thread_info()->addr_limit.seg)); \ + "rm" (limit)); \ flag; \ }) @@ -77,7 +76,8 @@ * checks that the pointer is in the user space range - after calling * this function, memory access functions may still return -EFAULT. 
*/ -#define access_ok(type, addr, size) (likely(__range_not_ok(addr, size) == 0)) +#define access_ok(type, addr, size) \ + (likely(__range_not_ok(addr, size, user_addr_max()) == 0)) /* * The exception table consists of pairs of addresses relative to the diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index e78bc256aea8..c4706cf9c011 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1757,6 +1757,12 @@ perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry); } +static inline int +valid_user_frame(const void __user *fp, unsigned long size) +{ + return (__range_not_ok(fp, size, TASK_SIZE) == 0); +} + #ifdef CONFIG_COMPAT #include @@ -1781,6 +1787,9 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) if (bytes != sizeof(frame)) break; + if (!valid_user_frame(fp, sizeof(frame))) + break; + perf_callchain_store(entry, frame.return_address); fp = compat_ptr(frame.next_frame); } @@ -1824,6 +1833,9 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) if (bytes != sizeof(frame)) break; + if (!valid_user_frame(fp, sizeof(frame))) + break; + perf_callchain_store(entry, frame.return_address); fp = frame.next_frame; } From db0dc75d6403b6663c0eab4c6ccb672eb9b2ed72 Mon Sep 17 00:00:00 2001 From: Arun Sharma Date: Fri, 20 Apr 2012 15:41:36 -0700 Subject: [PATCH 142/178] perf/x86: Check user address explicitly in copy_from_user_nmi() Signed-off-by: Arun Sharma Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1334961696-19580-5-git-send-email-asharma@fb.com Signed-off-by: Ingo Molnar --- arch/x86/lib/usercopy.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c index f61ee67ec00f..677b1ed184c9 100644 --- a/arch/x86/lib/usercopy.c +++ b/arch/x86/lib/usercopy.c @@ -8,6 +8,7 @@ #include #include +#include /* * best effort, GUP based copy_from_user() that is NMI-safe @@ -21,6 +22,9 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n) void *map; int ret; + if (__range_not_ok(from, n, TASK_SIZE) == 0) + return len; + do { ret = __get_user_pages_fast(addr, 1, 0, &page); if (!ret) From 10db9e009af5c3647b9ee742dcb14f6ff447a2a5 Mon Sep 17 00:00:00 2001 From: Chris Metcalf Date: Wed, 6 Jun 2012 11:21:44 -0400 Subject: [PATCH 143/178] tile: remove cpu_idle_on_new_stack This routine isn't used unless CONFIG_HOMECACHE is enabled, which isn't even available as a public configuration option yet. Since it no longer links correctly in 3.4, just remove it for now. Signed-off-by: Chris Metcalf --- arch/tile/include/asm/thread_info.h | 5 ----- arch/tile/kernel/entry.S | 14 -------------- 2 files changed, 19 deletions(-) diff --git a/arch/tile/include/asm/thread_info.h b/arch/tile/include/asm/thread_info.h index 7e1fef36bde6..e9c670d7a7fe 100644 --- a/arch/tile/include/asm/thread_info.h +++ b/arch/tile/include/asm/thread_info.h @@ -91,11 +91,6 @@ extern void smp_nap(void); /* Enable interrupts racelessly and nap forever: helper for cpu_idle(). */ extern void _cpu_idle(void); -/* Switch boot idle thread to a freshly-allocated stack and free old stack. 
*/ -extern void cpu_idle_on_new_stack(struct thread_info *old_ti, - unsigned long new_sp, - unsigned long new_ss10); - #else /* __ASSEMBLY__ */ /* diff --git a/arch/tile/kernel/entry.S b/arch/tile/kernel/entry.S index 133c4b56a99e..c31637baff28 100644 --- a/arch/tile/kernel/entry.S +++ b/arch/tile/kernel/entry.S @@ -68,20 +68,6 @@ STD_ENTRY(KBacktraceIterator_init_current) jrp lr /* keep backtracer happy */ STD_ENDPROC(KBacktraceIterator_init_current) -/* - * Reset our stack to r1/r2 (sp and ksp0+cpu respectively), then - * free the old stack (passed in r0) and re-invoke cpu_idle(). - * We update sp and ksp0 simultaneously to avoid backtracer warnings. - */ -STD_ENTRY(cpu_idle_on_new_stack) - { - move sp, r1 - mtspr SPR_SYSTEM_SAVE_K_0, r2 - } - jal free_thread_info - j cpu_idle - STD_ENDPROC(cpu_idle_on_new_stack) - /* Loop forever on a nap during SMP boot. */ STD_ENTRY(smp_nap) nap From 2ded5c2484d7375860bdb5f3df1954a346a2da26 Mon Sep 17 00:00:00 2001 From: Chris Metcalf Date: Wed, 6 Jun 2012 11:23:19 -0400 Subject: [PATCH 144/178] tile: add #include to unbreak build after generic init_task conversion Some code was moved from init_task.c to setup.c but the appropriate header needed to be moved as well. Signed-off-by: Chris Metcalf --- arch/tile/kernel/setup.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/tile/kernel/setup.c b/arch/tile/kernel/setup.c index 6098ccc59be2..dd87f3420390 100644 --- a/arch/tile/kernel/setup.c +++ b/arch/tile/kernel/setup.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include From edc4a67e15e34d2b3a2ab968625fd157751125d8 Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Thu, 24 May 2012 16:08:09 +0300 Subject: [PATCH 145/178] mlx4_core: Fix setting VL_cap in mlx4_SET_PORT wrapper flow Commit 096335b3f983 ("mlx4_core: Allow dynamic MTU configuration for IB ports") modifies the port VL setting. This exposes a bug in mlx4_common_set_port(), where the VL cap value passed in (inside the command mailbox) is incorrectly zeroed-out: mlx4_SET_PORT modifies the VL_cap field (byte 3 of the mailbox). Since the SET_PORT command is paravirtualized on the master as well as on the slaves, mlx4_SET_PORT_wrapper() is invoked on the master. This calls mlx4_common_set_port() where mailbox byte 3 gets overwritten by code which should only set a single bit in that byte (for the reset qkey counter flag) -- but instead overwrites the entire byte. The result is that when running in SR-IOV mode, the VL_cap will be set to zero -- fix this. 
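A toy illustration of the clobber, outside the driver (reset_qkey_viols stands in for the wrapper's flag; byte values are made up):

u8 byte3 = 0x04;                   /* pretend the VL_cap bits are already set */
int reset_qkey_viols = 1;

/* bug:  byte3 = !!reset_qkey_viols << 6;   yields 0x40, VL_cap lost */
byte3 |= !!reset_qkey_viols << 6;  /* fix: yields 0x44, VL_cap kept  */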
Signed-off-by: Jack Morgenstein Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier --- drivers/net/ethernet/mellanox/mlx4/port.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/port.c b/drivers/net/ethernet/mellanox/mlx4/port.c index 1fe2c7a8b40c..a8fb52992c64 100644 --- a/drivers/net/ethernet/mellanox/mlx4/port.c +++ b/drivers/net/ethernet/mellanox/mlx4/port.c @@ -697,10 +697,10 @@ static int mlx4_common_set_port(struct mlx4_dev *dev, int slave, u32 in_mod, if (slave != dev->caps.function) memset(inbox->buf, 0, 256); if (dev->flags & MLX4_FLAG_OLD_PORT_CMDS) { - *(u8 *) inbox->buf = !!reset_qkey_viols << 6; + *(u8 *) inbox->buf |= !!reset_qkey_viols << 6; ((__be32 *) inbox->buf)[2] = agg_cap_mask; } else { - ((u8 *) inbox->buf)[3] = !!reset_qkey_viols; + ((u8 *) inbox->buf)[3] |= !!reset_qkey_viols; ((__be32 *) inbox->buf)[1] = agg_cap_mask; } From fc2d004419abebcfd740f46c32dd8201ce1b33c9 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 24 May 2012 16:08:08 +0300 Subject: [PATCH 146/178] IB/mlx4: Fix max_wqe capacity reported from query device 1. Limit the max number of WQEs per QP reported when querying the device, so that ib_create_qp() will not fail for a QP size that the device claimed to support due to additional headroom WQEs being allocated. 2. Limit qp resources accepted for ib_create_qp() to the limits reported in ib_query_device(). In kernel space, make sure that the limits returned to the caller following qp creation also lie within the reported device limits. For userspace, report as before, and do adjustment in libmlx4 (so as not to break ABI). Signed-off-by: Jack Morgenstein Signed-off-by: Sagi Grimberg Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx4/main.c | 2 +- drivers/infiniband/hw/mlx4/mlx4_ib.h | 8 ++++++++ drivers/infiniband/hw/mlx4/qp.c | 21 +++++++++++++++------ 3 files changed, 24 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 8afea12c8d44..3530c41fcd1f 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -140,7 +140,7 @@ static int mlx4_ib_query_device(struct ib_device *ibdev, props->max_mr_size = ~0ull; props->page_size_cap = dev->dev->caps.page_size_cap; props->max_qp = dev->dev->caps.num_qps - dev->dev->caps.reserved_qps; - props->max_qp_wr = dev->dev->caps.max_wqes; + props->max_qp_wr = dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE; props->max_sge = min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg); props->max_cq = dev->dev->caps.num_cqs - dev->dev->caps.reserved_cqs; diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index e62297cc77cc..ff36655d23d3 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -44,6 +44,14 @@ #include #include +enum { + MLX4_IB_SQ_MIN_WQE_SHIFT = 6, + MLX4_IB_MAX_HEADROOM = 2048 +}; + +#define MLX4_IB_SQ_HEADROOM(shift) ((MLX4_IB_MAX_HEADROOM >> (shift)) + 1) +#define MLX4_IB_SQ_MAX_SPARE (MLX4_IB_SQ_HEADROOM(MLX4_IB_SQ_MIN_WQE_SHIFT)) + struct mlx4_ib_ucontext { struct ib_ucontext ibucontext; struct mlx4_uar uar; diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index ceb33327091a..8d4ed24aef93 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -310,8 +310,8 @@ static int set_rq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap, int is_user, int has_rq, struct mlx4_ib_qp 
*qp) { /* Sanity check RQ size before proceeding */ - if (cap->max_recv_wr > dev->dev->caps.max_wqes || - cap->max_recv_sge > dev->dev->caps.max_rq_sg) + if (cap->max_recv_wr > dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE || + cap->max_recv_sge > min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg)) return -EINVAL; if (!has_rq) { @@ -329,8 +329,17 @@ static int set_rq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap, qp->rq.wqe_shift = ilog2(qp->rq.max_gs * sizeof (struct mlx4_wqe_data_seg)); } - cap->max_recv_wr = qp->rq.max_post = qp->rq.wqe_cnt; - cap->max_recv_sge = qp->rq.max_gs; + /* leave userspace return values as they were, so as not to break ABI */ + if (is_user) { + cap->max_recv_wr = qp->rq.max_post = qp->rq.wqe_cnt; + cap->max_recv_sge = qp->rq.max_gs; + } else { + cap->max_recv_wr = qp->rq.max_post = + min(dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE, qp->rq.wqe_cnt); + cap->max_recv_sge = min(qp->rq.max_gs, + min(dev->dev->caps.max_sq_sg, + dev->dev->caps.max_rq_sg)); + } return 0; } @@ -341,8 +350,8 @@ static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap, int s; /* Sanity check SQ size before proceeding */ - if (cap->max_send_wr > dev->dev->caps.max_wqes || - cap->max_send_sge > dev->dev->caps.max_sq_sg || + if (cap->max_send_wr > (dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE) || + cap->max_send_sge > min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg) || cap->max_inline_data + send_wqe_overhead(type, qp->flags) + sizeof (struct mlx4_wqe_inline_seg) > dev->dev->caps.max_sq_desc_sz) return -EINVAL; From 23e81d691a813839020f6e516b398d0f9369fe8b Mon Sep 17 00:00:00 2001 From: Adam Jackson Date: Wed, 6 Jun 2012 15:45:44 -0400 Subject: [PATCH 147/178] drm/i915: pch_irq_handler -> {ibx, cpt}_irq_handler Cougar/Panther Point redefine the bits in SDEIIR pretty completely. This function is just debugging, but if we're debugging we probably want to be told accurate things instead of lies. I'm told Lynx Point changes this yet more, but I have no idea how... Note from Eugeni's review: "For the record and for future enabling efforts, for LPT, bits 28-31 and 1-14 are gone since CPT/PPT (e.g., those must be zero). And there is the bit 15 as a new addition, but we are not using it yet and probably won't be using in foreseeable future." 
Signed-off-by: Adam Jackson Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=35103 Reviewed-by: Eugeni Dodonov Signed-off-by: Daniel Vetter --- drivers/gpu/drm/i915/i915_irq.c | 38 ++++++++++++++++++++++++++++++--- drivers/gpu/drm/i915/i915_reg.h | 35 +++++++++++++++++++++++++++--- 2 files changed, 67 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c index 1417660a93ec..b1fe0edda955 100644 --- a/drivers/gpu/drm/i915/i915_irq.c +++ b/drivers/gpu/drm/i915/i915_irq.c @@ -510,7 +510,7 @@ out: return ret; } -static void pch_irq_handler(struct drm_device *dev, u32 pch_iir) +static void ibx_irq_handler(struct drm_device *dev, u32 pch_iir) { drm_i915_private_t *dev_priv = (drm_i915_private_t *) dev->dev_private; int pipe; @@ -550,6 +550,35 @@ static void pch_irq_handler(struct drm_device *dev, u32 pch_iir) DRM_DEBUG_DRIVER("PCH transcoder A underrun interrupt\n"); } +static void cpt_irq_handler(struct drm_device *dev, u32 pch_iir) +{ + drm_i915_private_t *dev_priv = (drm_i915_private_t *) dev->dev_private; + int pipe; + + if (pch_iir & SDE_AUDIO_POWER_MASK_CPT) + DRM_DEBUG_DRIVER("PCH audio power change on port %d\n", + (pch_iir & SDE_AUDIO_POWER_MASK_CPT) >> + SDE_AUDIO_POWER_SHIFT_CPT); + + if (pch_iir & SDE_AUX_MASK_CPT) + DRM_DEBUG_DRIVER("AUX channel interrupt\n"); + + if (pch_iir & SDE_GMBUS_CPT) + DRM_DEBUG_DRIVER("PCH GMBUS interrupt\n"); + + if (pch_iir & SDE_AUDIO_CP_REQ_CPT) + DRM_DEBUG_DRIVER("Audio CP request interrupt\n"); + + if (pch_iir & SDE_AUDIO_CP_CHG_CPT) + DRM_DEBUG_DRIVER("Audio CP change interrupt\n"); + + if (pch_iir & SDE_FDI_MASK_CPT) + for_each_pipe(pipe) + DRM_DEBUG_DRIVER(" pipe %c FDI IIR: 0x%08x\n", + pipe_name(pipe), + I915_READ(FDI_RX_IIR(pipe))); +} + static irqreturn_t ivybridge_irq_handler(DRM_IRQ_ARGS) { struct drm_device *dev = (struct drm_device *) arg; @@ -591,7 +620,7 @@ static irqreturn_t ivybridge_irq_handler(DRM_IRQ_ARGS) if (pch_iir & SDE_HOTPLUG_MASK_CPT) queue_work(dev_priv->wq, &dev_priv->hotplug_work); - pch_irq_handler(dev, pch_iir); + cpt_irq_handler(dev, pch_iir); /* clear PCH hotplug event before clear CPU irq */ I915_WRITE(SDEIIR, pch_iir); @@ -684,7 +713,10 @@ static irqreturn_t ironlake_irq_handler(DRM_IRQ_ARGS) if (de_iir & DE_PCH_EVENT) { if (pch_iir & hotplug_mask) queue_work(dev_priv->wq, &dev_priv->hotplug_work); - pch_irq_handler(dev, pch_iir); + if (HAS_PCH_CPT(dev)) + cpt_irq_handler(dev, pch_iir); + else + ibx_irq_handler(dev, pch_iir); } if (de_iir & DE_PCU_EVENT) { diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h index 76bc2756d7c4..48d5e8e051cf 100644 --- a/drivers/gpu/drm/i915/i915_reg.h +++ b/drivers/gpu/drm/i915/i915_reg.h @@ -3321,7 +3321,7 @@ /* PCH */ -/* south display engine interrupt */ +/* south display engine interrupt: IBX */ #define SDE_AUDIO_POWER_D (1 << 27) #define SDE_AUDIO_POWER_C (1 << 26) #define SDE_AUDIO_POWER_B (1 << 25) @@ -3357,15 +3357,44 @@ #define SDE_TRANSA_CRC_ERR (1 << 1) #define SDE_TRANSA_FIFO_UNDER (1 << 0) #define SDE_TRANS_MASK (0x3f) -/* CPT */ -#define SDE_CRT_HOTPLUG_CPT (1 << 19) + +/* south display engine interrupt: CPT/PPT */ +#define SDE_AUDIO_POWER_D_CPT (1 << 31) +#define SDE_AUDIO_POWER_C_CPT (1 << 30) +#define SDE_AUDIO_POWER_B_CPT (1 << 29) +#define SDE_AUDIO_POWER_SHIFT_CPT 29 +#define SDE_AUDIO_POWER_MASK_CPT (7 << 29) +#define SDE_AUXD_CPT (1 << 27) +#define SDE_AUXC_CPT (1 << 26) +#define SDE_AUXB_CPT (1 << 25) +#define SDE_AUX_MASK_CPT (7 << 25) #define SDE_PORTD_HOTPLUG_CPT (1 << 23) 
#define SDE_PORTC_HOTPLUG_CPT (1 << 22) #define SDE_PORTB_HOTPLUG_CPT (1 << 21) +#define SDE_CRT_HOTPLUG_CPT (1 << 19) #define SDE_HOTPLUG_MASK_CPT (SDE_CRT_HOTPLUG_CPT | \ SDE_PORTD_HOTPLUG_CPT | \ SDE_PORTC_HOTPLUG_CPT | \ SDE_PORTB_HOTPLUG_CPT) +#define SDE_GMBUS_CPT (1 << 17) +#define SDE_AUDIO_CP_REQ_C_CPT (1 << 10) +#define SDE_AUDIO_CP_CHG_C_CPT (1 << 9) +#define SDE_FDI_RXC_CPT (1 << 8) +#define SDE_AUDIO_CP_REQ_B_CPT (1 << 6) +#define SDE_AUDIO_CP_CHG_B_CPT (1 << 5) +#define SDE_FDI_RXB_CPT (1 << 4) +#define SDE_AUDIO_CP_REQ_A_CPT (1 << 2) +#define SDE_AUDIO_CP_CHG_A_CPT (1 << 1) +#define SDE_FDI_RXA_CPT (1 << 0) +#define SDE_AUDIO_CP_REQ_CPT (SDE_AUDIO_CP_REQ_C_CPT | \ + SDE_AUDIO_CP_REQ_B_CPT | \ + SDE_AUDIO_CP_REQ_A_CPT) +#define SDE_AUDIO_CP_CHG_CPT (SDE_AUDIO_CP_CHG_C_CPT | \ + SDE_AUDIO_CP_CHG_B_CPT | \ + SDE_AUDIO_CP_CHG_A_CPT) +#define SDE_FDI_MASK_CPT (SDE_FDI_RXC_CPT | \ + SDE_FDI_RXB_CPT | \ + SDE_FDI_RXA_CPT) #define SDEISR 0xc4000 #define SDEIMR 0xc4004 From e9b4cf2094a65a05a831f070e46c554260632330 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Wed, 6 Jun 2012 15:22:41 +0300 Subject: [PATCH 148/178] UBI: fix debugfs-less systems support Commit "aa44d1d UBI: remove Kconfig debugging option" broke UBI and it refuses to initialize if debugfs (CONFIG_DEBUG_FS) is disabled. I incorrectly assumed that the debugfs file-creation functions would return success if debugfs is disabled, but they actually return -ENODEV. This patch fixes the issue. Reported-by: Paul Parsons Signed-off-by: Artem Bityutskiy Tested-by: Paul Parsons --- drivers/mtd/ubi/debug.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/mtd/ubi/debug.c b/drivers/mtd/ubi/debug.c index 9f957c2d48e9..09d4f8d9d592 100644 --- a/drivers/mtd/ubi/debug.c +++ b/drivers/mtd/ubi/debug.c @@ -264,6 +264,9 @@ static struct dentry *dfs_rootdir; */ int ubi_debugfs_init(void) { + if (!IS_ENABLED(DEBUG_FS)) + return 0; + dfs_rootdir = debugfs_create_dir("ubi", NULL); if (IS_ERR_OR_NULL(dfs_rootdir)) { int err = dfs_rootdir ? -ENODEV : PTR_ERR(dfs_rootdir); @@ -281,7 +284,8 @@ int ubi_debugfs_init(void) */ void ubi_debugfs_exit(void) { - debugfs_remove(dfs_rootdir); + if (IS_ENABLED(DEBUG_FS)) + debugfs_remove(dfs_rootdir); } /* Read an UBI debugfs file */ @@ -403,6 +407,9 @@ int ubi_debugfs_init_dev(struct ubi_device *ubi) struct dentry *dent; struct ubi_debug_info *d = ubi->dbg; + if (!IS_ENABLED(DEBUG_FS)) + return 0; + n = snprintf(d->dfs_dir_name, UBI_DFS_DIR_LEN + 1, UBI_DFS_DIR_NAME, ubi->ubi_num); if (n == UBI_DFS_DIR_LEN) { @@ -470,5 +477,6 @@ out: */ void ubi_debugfs_exit_dev(struct ubi_device *ubi) { - debugfs_remove_recursive(ubi->dbg->dfs_dir); + if (IS_ENABLED(DEBUG_FS)) + debugfs_remove_recursive(ubi->dbg->dfs_dir); } From 818039c7d597db3b1d30964a8f9489ac42c0642d Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Wed, 6 Jun 2012 16:03:10 +0300 Subject: [PATCH 149/178] UBIFS: fix debugfs-less systems support Commit "f70b7e5 UBIFS: remove Kconfig debugging option" broke UBIFS and it refuses to initialize if debugfs (CONFIG_DEBUG_FS) is disabled. I incorrectly assumed that the debugfs file-creation functions would return success if debugfs is disabled, but they actually return -ENODEV. This patch fixes the issue. 
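The shape of the guard, as a standalone sketch; note this sketch spells out the full CONFIG_DEBUG_FS symbol, which is how IS_ENABLED() is normally invoked (it folds to a compile-time 0 or 1, letting the compiler discard the dead branch):

static struct dentry *dfs_rootdir;

int debugfs_init_sketch(void)
{
        if (!IS_ENABLED(CONFIG_DEBUG_FS))
                return 0;       /* debugfs compiled out: succeed, create nothing */

        dfs_rootdir = debugfs_create_dir("ubifs", NULL);
        if (IS_ERR_OR_NULL(dfs_rootdir))
                return dfs_rootdir ? PTR_ERR(dfs_rootdir) : -ENODEV;
        return 0;
}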
Reported-by: Paul Parsons Signed-off-by: Artem Bityutskiy Tested-by: Paul Parsons --- fs/ubifs/debug.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 685a83756b2b..84a7e6f3c046 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -2918,6 +2918,9 @@ int dbg_debugfs_init_fs(struct ubifs_info *c) struct dentry *dent; struct ubifs_debug_info *d = c->dbg; + if (!IS_ENABLED(DEBUG_FS)) + return 0; + n = snprintf(d->dfs_dir_name, UBIFS_DFS_DIR_LEN + 1, UBIFS_DFS_DIR_NAME, c->vi.ubi_num, c->vi.vol_id); if (n == UBIFS_DFS_DIR_LEN) { @@ -3010,7 +3013,8 @@ out: */ void dbg_debugfs_exit_fs(struct ubifs_info *c) { - debugfs_remove_recursive(c->dbg->dfs_dir); + if (IS_ENABLED(DEBUG_FS)) + debugfs_remove_recursive(c->dbg->dfs_dir); } struct ubifs_global_debug_info ubifs_dbg; @@ -3095,6 +3099,9 @@ int dbg_debugfs_init(void) const char *fname; struct dentry *dent; + if (!IS_ENABLED(DEBUG_FS)) + return 0; + fname = "ubifs"; dent = debugfs_create_dir(fname, NULL); if (IS_ERR_OR_NULL(dent)) @@ -3159,7 +3166,8 @@ out: */ void dbg_debugfs_exit(void) { - debugfs_remove_recursive(dfs_rootdir); + if (IS_ENABLED(DEBUG_FS)) + debugfs_remove_recursive(dfs_rootdir); } /** From 12027f1b3fd69a4e9017e6b13c72547a99c6cf54 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Thu, 7 Jun 2012 15:15:30 +0300 Subject: [PATCH 150/178] UBI: correct ubi_wl_flush locking Commit "62f38455 UBI: modify ubi_wl_flush function to clear work queue for a lnum" takes the 'work_sem' semaphore in write mode for the entire loop, which is not very good because it will block other workers for a potentially long time. We do not need to have it in write mode - read mode is enough, and we do not need to hold it over the entire loop. So this patch changes the locking: it takes 'work_sem' in read mode and pushes it down into the loop. Signed-off-by: Artem Bityutskiy --- drivers/mtd/ubi/wl.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/drivers/mtd/ubi/wl.c b/drivers/mtd/ubi/wl.c index 9df100a4ec38..b6be644e7b85 100644 --- a/drivers/mtd/ubi/wl.c +++ b/drivers/mtd/ubi/wl.c @@ -1262,11 +1262,11 @@ int ubi_wl_flush(struct ubi_device *ubi, int vol_id, int lnum) dbg_wl("flush pending work for LEB %d:%d (%d pending works)", vol_id, lnum, ubi->works_count); - down_write(&ubi->work_sem); while (found) { struct ubi_work *wrk; found = 0; + down_read(&ubi->work_sem); spin_lock(&ubi->wl_lock); list_for_each_entry(wrk, &ubi->works, list) { if ((vol_id == UBI_ALL || wrk->vol_id == vol_id) && @@ -1277,18 +1277,27 @@ int ubi_wl_flush(struct ubi_device *ubi, int vol_id, int lnum) spin_unlock(&ubi->wl_lock); err = wrk->func(ubi, wrk, 0); - if (err) - goto out; + if (err) { + up_read(&ubi->work_sem); + return err; + } + spin_lock(&ubi->wl_lock); found = 1; break; } } spin_unlock(&ubi->wl_lock); + up_read(&ubi->work_sem); } -out: + /* + * Make sure all the works which have been done in parallel are + * finished. + */ + down_write(&ubi->work_sem); up_write(&ubi->work_sem); + return err; } From 743628e868c5992354fc80b4d1e9a6143da1c0e6 Mon Sep 17 00:00:00 2001 From: Jordan Justen Date: Thu, 7 Jun 2012 09:05:21 -0700 Subject: [PATCH 151/178] x86, efi stub: Add .reloc section back into image Some UEFI firmware will not load a .efi with a .reloc section with a size of 0. Therefore, we create a .efi image with 4 main areas and 3 sections. 1. PE/COFF file header 2. .setup section (covers all setup code following the first sector) 3. 
.reloc section (contains 1 dummy reloc entry, created in build.c) 4. .text section (covers the remaining kernel image) To make room for the new .setup section data, the header bugger_off_msg had to be shortened. Reported-by: Henrik Rydberg Signed-off-by: Jordan Justen Link: http://lkml.kernel.org/r/1339085121-12760-1-git-send-email-jordan.l.justen@intel.com Tested-by: Lee G Rosenbaum Tested-by: Henrik Rydberg Cc: Matt Fleming Signed-off-by: H. Peter Anvin --- arch/x86/boot/header.S | 42 ++++++--- arch/x86/boot/tools/build.c | 172 +++++++++++++++++++++++------------- 2 files changed, 140 insertions(+), 74 deletions(-) diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 8bbea6aa40d9..efe5acfc79c3 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -94,10 +94,10 @@ bs_die: .section ".bsdata", "a" bugger_off_msg: - .ascii "Direct booting from floppy is no longer supported.\r\n" - .ascii "Please use a boot loader program instead.\r\n" + .ascii "Direct floppy boot is not supported. " + .ascii "Use a boot loader program instead.\r\n" .ascii "\n" - .ascii "Remove disk and press any key to reboot . . .\r\n" + .ascii "Remove disk and press any key to reboot ...\r\n" .byte 0 #ifdef CONFIG_EFI_STUB @@ -111,7 +111,7 @@ coff_header: #else .word 0x8664 # x86-64 #endif - .word 2 # nr_sections + .word 3 # nr_sections .long 0 # TimeDateStamp .long 0 # PointerToSymbolTable .long 1 # NumberOfSymbols @@ -158,8 +158,8 @@ extra_header_fields: #else .quad 0 # ImageBase #endif - .long 0x1000 # SectionAlignment - .long 0x200 # FileAlignment + .long 0x20 # SectionAlignment + .long 0x20 # FileAlignment .word 0 # MajorOperatingSystemVersion .word 0 # MinorOperatingSystemVersion .word 0 # MajorImageVersion @@ -200,8 +200,10 @@ extra_header_fields: # Section table section_table: - .ascii ".text" - .byte 0 + # + # The offset & size fields are filled in by build.c. + # + .ascii ".setup" .byte 0 .byte 0 .long 0 @@ -217,9 +219,8 @@ section_table: # # The EFI application loader requires a relocation section - # because EFI applications must be relocatable. But since - # we don't need the loader to fixup any relocs for us, we - # just create an empty (zero-length) .reloc section header. + # because EFI applications must be relocatable. The .reloc + # offset & size fields are filled in by build.c. # .ascii ".reloc" .byte 0 @@ -233,6 +234,25 @@ section_table: .word 0 # NumberOfRelocations .word 0 # NumberOfLineNumbers .long 0x42100040 # Characteristics (section flags) + + # + # The offset & size fields are filled in by build.c. + # + .ascii ".text" + .byte 0 + .byte 0 + .byte 0 + .long 0 + .long 0x0 # startup_{32,64} + .long 0 # Size of initialized data + # on disk + .long 0x0 # startup_{32,64} + .long 0 # PointerToRelocations + .long 0 # PointerToLineNumbers + .word 0 # NumberOfRelocations + .word 0 # NumberOfLineNumbers + .long 0x60500020 # Characteristics (section flags) + #endif /* CONFIG_EFI_STUB */ # Kernel attributes; used by setup. 
This is part 1 of the diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c index 3f61f6e2b46f..4b8e165ee572 100644 --- a/arch/x86/boot/tools/build.c +++ b/arch/x86/boot/tools/build.c @@ -50,6 +50,8 @@ typedef unsigned int u32; u8 buf[SETUP_SECT_MAX*512]; int is_big_kernel; +#define PECOFF_RELOC_RESERVE 0x20 + /*----------------------------------------------------------------------*/ static const u32 crctab32[] = { @@ -133,11 +135,103 @@ static void usage(void) die("Usage: build setup system [> image]"); } +#ifdef CONFIG_EFI_STUB + +static void update_pecoff_section_header(char *section_name, u32 offset, u32 size) +{ + unsigned int pe_header; + unsigned short num_sections; + u8 *section; + + pe_header = get_unaligned_le32(&buf[0x3c]); + num_sections = get_unaligned_le16(&buf[pe_header + 6]); + +#ifdef CONFIG_X86_32 + section = &buf[pe_header + 0xa8]; +#else + section = &buf[pe_header + 0xb8]; +#endif + + while (num_sections > 0) { + if (strncmp((char*)section, section_name, 8) == 0) { + /* section header size field */ + put_unaligned_le32(size, section + 0x8); + + /* section header vma field */ + put_unaligned_le32(offset, section + 0xc); + + /* section header 'size of initialised data' field */ + put_unaligned_le32(size, section + 0x10); + + /* section header 'file offset' field */ + put_unaligned_le32(offset, section + 0x14); + + break; + } + section += 0x28; + num_sections--; + } +} + +static void update_pecoff_setup_and_reloc(unsigned int size) +{ + u32 setup_offset = 0x200; + u32 reloc_offset = size - PECOFF_RELOC_RESERVE; + u32 setup_size = reloc_offset - setup_offset; + + update_pecoff_section_header(".setup", setup_offset, setup_size); + update_pecoff_section_header(".reloc", reloc_offset, PECOFF_RELOC_RESERVE); + + /* + * Modify .reloc section contents with a single entry. The + * relocation is applied to offset 10 of the relocation section. + */ + put_unaligned_le32(reloc_offset + 10, &buf[reloc_offset]); + put_unaligned_le32(10, &buf[reloc_offset + 4]); +} + +static void update_pecoff_text(unsigned int text_start, unsigned int file_sz) +{ + unsigned int pe_header; + unsigned int text_sz = file_sz - text_start; + + pe_header = get_unaligned_le32(&buf[0x3c]); + + /* Size of image */ + put_unaligned_le32(file_sz, &buf[pe_header + 0x50]); + + /* + * Size of code: Subtract the size of the first sector (512 bytes) + * which includes the header. + */ + put_unaligned_le32(file_sz - 512, &buf[pe_header + 0x1c]); + +#ifdef CONFIG_X86_32 + /* + * Address of entry point. + * + * The EFI stub entry point is +16 bytes from the start of + * the .text section. + */ + put_unaligned_le32(text_start + 16, &buf[pe_header + 0x28]); +#else + /* + * Address of entry point. startup_32 is at the beginning and + * the 64-bit entry point (startup_64) is always 512 bytes + * after. 
The EFI stub entry point is 16 bytes after that, as + * the first instruction allows legacy loaders to jump over + * the EFI stub initialisation + */ + put_unaligned_le32(text_start + 528, &buf[pe_header + 0x28]); +#endif /* CONFIG_X86_32 */ + + update_pecoff_section_header(".text", text_start, text_sz); +} + +#endif /* CONFIG_EFI_STUB */ + int main(int argc, char ** argv) { -#ifdef CONFIG_EFI_STUB - unsigned int file_sz, pe_header; -#endif unsigned int i, sz, setup_sectors; int c; u32 sys_size; @@ -163,6 +257,12 @@ int main(int argc, char ** argv) die("Boot block hasn't got boot flag (0xAA55)"); fclose(file); +#ifdef CONFIG_EFI_STUB + /* Reserve 0x20 bytes for .reloc section */ + memset(buf+c, 0, PECOFF_RELOC_RESERVE); + c += PECOFF_RELOC_RESERVE; +#endif + /* Pad unused space with zeros */ setup_sectors = (c + 511) / 512; if (setup_sectors < SETUP_SECT_MIN) @@ -170,6 +270,10 @@ int main(int argc, char ** argv) i = setup_sectors*512; memset(buf+c, 0, i-c); +#ifdef CONFIG_EFI_STUB + update_pecoff_setup_and_reloc(i); +#endif + /* Set the default root device */ put_unaligned_le16(DEFAULT_ROOT_DEV, &buf[508]); @@ -194,66 +298,8 @@ int main(int argc, char ** argv) put_unaligned_le32(sys_size, &buf[0x1f4]); #ifdef CONFIG_EFI_STUB - file_sz = sz + i + ((sys_size * 16) - sz); - - pe_header = get_unaligned_le32(&buf[0x3c]); - - /* Size of image */ - put_unaligned_le32(file_sz, &buf[pe_header + 0x50]); - - /* - * Subtract the size of the first section (512 bytes) which - * includes the header and .reloc section. The remaining size - * is that of the .text section. - */ - file_sz -= 512; - - /* Size of code */ - put_unaligned_le32(file_sz, &buf[pe_header + 0x1c]); - -#ifdef CONFIG_X86_32 - /* - * Address of entry point. - * - * The EFI stub entry point is +16 bytes from the start of - * the .text section. - */ - put_unaligned_le32(i + 16, &buf[pe_header + 0x28]); - - /* .text size */ - put_unaligned_le32(file_sz, &buf[pe_header + 0xb0]); - - /* .text vma */ - put_unaligned_le32(0x200, &buf[pe_header + 0xb4]); - - /* .text size of initialised data */ - put_unaligned_le32(file_sz, &buf[pe_header + 0xb8]); - - /* .text file offset */ - put_unaligned_le32(0x200, &buf[pe_header + 0xbc]); -#else - /* - * Address of entry point. startup_32 is at the beginning and - * the 64-bit entry point (startup_64) is always 512 bytes - * after. The EFI stub entry point is 16 bytes after that, as - * the first instruction allows legacy loaders to jump over - * the EFI stub initialisation - */ - put_unaligned_le32(i + 528, &buf[pe_header + 0x28]); - - /* .text size */ - put_unaligned_le32(file_sz, &buf[pe_header + 0xc0]); - - /* .text vma */ - put_unaligned_le32(0x200, &buf[pe_header + 0xc4]); - - /* .text size of initialised data */ - put_unaligned_le32(file_sz, &buf[pe_header + 0xc8]); - - /* .text file offset */ - put_unaligned_le32(0x200, &buf[pe_header + 0xcc]); -#endif /* CONFIG_X86_32 */ -#endif /* CONFIG_EFI_STUB */ + update_pecoff_text(setup_sectors * 512, sz + i + ((sys_size * 16) - sz)); +#endif crc = partial_crc32(buf, i, crc); if (fwrite(buf, 1, i, stdout) != i) From 0142ef6cdca5f9784eb0762ac50fe378d98d71d4 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 7 Jun 2012 14:21:09 -0700 Subject: [PATCH 152/178] shmem: replace_page must flush_dcache and others Commit bde05d1ccd51 ("shmem: replace page if mapping excludes its zone") is not at all likely to break for anyone, but it was an earlier version from before review feedback was incorporated. Fix that up now. 
* shmem_replace_page must flush_dcache_page after copy_highpage [akpm] * Expand comment on why shmem_unuse_inode needs page_swapcount [akpm] * Remove excess of VM_BUG_ONs from shmem_replace_page [wangcong] * Check page_private matches swap before calling shmem_replace_page [hughd] * shmem_replace_page allow for unexpected race in radix_tree lookup [hughd] Signed-off-by: Hugh Dickins Cc: Cong Wang Cc: Christoph Hellwig Cc: KAMEZAWA Hiroyuki Cc: Alan Cox Cc: Stephane Marchesin Cc: Andi Kleen Cc: Dave Airlie Cc: Daniel Vetter Cc: Rob Clark Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 57 +++++++++++++++++++++++++++++++++++------------------- 1 file changed, 37 insertions(+), 20 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 585bd220a21e..a15a466d0d1d 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -683,10 +683,21 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, mutex_lock(&shmem_swaplist_mutex); /* * We needed to drop mutex to make that restrictive page - * allocation; but the inode might already be freed by now, - * and we cannot refer to inode or mapping or info to check. - * However, we do hold page lock on the PageSwapCache page, - * so can check if that still has our reference remaining. + * allocation, but the inode might have been freed while we + * dropped it: although a racing shmem_evict_inode() cannot + * complete without emptying the radix_tree, our page lock + * on this swapcache page is not enough to prevent that - + * free_swap_and_cache() of our swap entry will only + * trylock_page(), removing swap from radix_tree whatever. + * + * We must not proceed to shmem_add_to_page_cache() if the + * inode has been freed, but of course we cannot rely on + * inode or mapping or info to check that. However, we can + * safely check if our swap entry is still in use (and here + * it can't have got reused for another page): if it's still + * in use, then the inode cannot have been freed yet, and we + * can safely proceed (if it's no longer in use, that tells + * nothing about the inode, but we don't need to unuse swap). */ if (!page_swapcount(*pagep)) error = -ENOENT; @@ -730,9 +741,9 @@ int shmem_unuse(swp_entry_t swap, struct page *page) /* * There's a faint possibility that swap page was replaced before - * caller locked it: it will come back later with the right page. + * caller locked it: caller will come back later with the right page. 
*/ - if (unlikely(!PageSwapCache(page))) + if (unlikely(!PageSwapCache(page) || page_private(page) != swap.val)) goto out; /* @@ -995,21 +1006,15 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, newpage = shmem_alloc_page(gfp, info, index); if (!newpage) return -ENOMEM; - VM_BUG_ON(shmem_should_replace_page(newpage, gfp)); - *pagep = newpage; page_cache_get(newpage); copy_highpage(newpage, oldpage); + flush_dcache_page(newpage); - VM_BUG_ON(!PageLocked(oldpage)); __set_page_locked(newpage); - VM_BUG_ON(!PageUptodate(oldpage)); SetPageUptodate(newpage); - VM_BUG_ON(!PageSwapBacked(oldpage)); SetPageSwapBacked(newpage); - VM_BUG_ON(!swap_index); set_page_private(newpage, swap_index); - VM_BUG_ON(!PageSwapCache(oldpage)); SetPageSwapCache(newpage); /* @@ -1019,13 +1024,24 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, spin_lock_irq(&swap_mapping->tree_lock); error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage, newpage); - __inc_zone_page_state(newpage, NR_FILE_PAGES); - __dec_zone_page_state(oldpage, NR_FILE_PAGES); + if (!error) { + __inc_zone_page_state(newpage, NR_FILE_PAGES); + __dec_zone_page_state(oldpage, NR_FILE_PAGES); + } spin_unlock_irq(&swap_mapping->tree_lock); - BUG_ON(error); - mem_cgroup_replace_page_cache(oldpage, newpage); - lru_cache_add_anon(newpage); + if (unlikely(error)) { + /* + * Is this possible? I think not, now that our callers check + * both PageSwapCache and page_private after getting page lock; + * but be defensive. Reverse old to newpage for clear and free. + */ + oldpage = newpage; + } else { + mem_cgroup_replace_page_cache(oldpage, newpage); + lru_cache_add_anon(newpage); + *pagep = newpage; + } ClearPageSwapCache(oldpage); set_page_private(oldpage, 0); @@ -1033,7 +1049,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, unlock_page(oldpage); page_cache_release(oldpage); page_cache_release(oldpage); - return 0; + return error; } /* @@ -1107,7 +1123,8 @@ repeat: /* We have to do this with page locked to prevent races */ lock_page(page); - if (!PageSwapCache(page) || page->mapping) { + if (!PageSwapCache(page) || page_private(page) != swap.val || + page->mapping) { error = -EEXIST; /* try again */ goto failed; } From 6305902c2f871fd6db60af367bd7120fa977fa74 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 7 Jun 2012 14:21:10 -0700 Subject: [PATCH 153/178] MAINTAINERS: whitespace fixes Remove trailing spaces at EOL. 
Always use a tab after the type : Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index dafcba7e2312..14bc7071f9df 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1077,7 +1077,7 @@ F: drivers/media/video/s5p-fimc/ ARM/SAMSUNG S5P SERIES Multi Format Codec (MFC) SUPPORT M: Kyungmin Park M: Kamil Debski -M: Jeongtae Park +M: Jeongtae Park L: linux-arm-kernel@lists.infradead.org L: linux-media@vger.kernel.org S: Maintained @@ -1743,10 +1743,10 @@ F: include/linux/can/platform/ CAPABILITIES M: Serge Hallyn L: linux-security-module@vger.kernel.org -S: Supported +S: Supported F: include/linux/capability.h F: security/capability.c -F: security/commoncap.c +F: security/commoncap.c F: kernel/capability.c CELL BROADBAND ENGINE ARCHITECTURE @@ -2146,11 +2146,11 @@ S: Orphan F: drivers/net/wan/pc300* CYTTSP TOUCHSCREEN DRIVER -M: Javier Martinez Canillas -L: linux-input@vger.kernel.org -S: Maintained -F: drivers/input/touchscreen/cyttsp* -F: include/linux/input/cyttsp.h +M: Javier Martinez Canillas +L: linux-input@vger.kernel.org +S: Maintained +F: drivers/input/touchscreen/cyttsp* +F: include/linux/input/cyttsp.h DAMA SLAVE for AX.25 M: Joerg Reuter @@ -5185,7 +5185,7 @@ S: Maintained F: drivers/firmware/pcdp.* PCI ERROR RECOVERY -M: Linas Vepstas +M: Linas Vepstas L: linux-pci@vger.kernel.org S: Supported F: Documentation/PCI/pci-error-recovery.txt From bafb282df29c1524b1617019adebd6d0c3eb7a47 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 7 Jun 2012 14:21:11 -0700 Subject: [PATCH 154/178] c/r: prctl: update prctl_set_mm_exe_file() after mm->num_exe_file_vmas removal A fix for commit b32dfe377102 ("c/r: prctl: add ability to set new mm_struct::exe_file"). After removing mm->num_exe_file_vmas kernel keeps mm->exe_file until final mmput(), it never becomes NULL while task is alive. We can check for other mapped files in mm instead of checking mm->num_exe_file_vmas, and mark mm with flag MMF_EXE_FILE_CHANGED in order to forbid second changing of mm->exe_file. Signed-off-by: Konstantin Khlebnikov Reviewed-by: Cyrill Gorcunov Cc: Oleg Nesterov Cc: Matt Helsley Cc: Kees Cook Cc: KOSAKI Motohiro Cc: Tejun Heo Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 1 + kernel/sys.c | 31 +++++++++++++++++++------------ 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 6029d8c54476..c688d4cc2e40 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -439,6 +439,7 @@ extern int get_dumpable(struct mm_struct *mm); /* leave room for more dump flags */ #define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */ #define MMF_VM_HUGEPAGE 17 /* set when VM_HUGEPAGE is set on vma */ +#define MMF_EXE_FILE_CHANGED 18 /* see prctl_set_mm_exe_file() */ #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK) diff --git a/kernel/sys.c b/kernel/sys.c index 9ff89cb9657a..54f20fdee93c 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1796,17 +1796,11 @@ static bool vma_flags_mismatch(struct vm_area_struct *vma, static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) { + struct vm_area_struct *vma; struct file *exe_file; struct dentry *dentry; int err; - /* - * Setting new mm::exe_file is only allowed when no VM_EXECUTABLE vma's - * remain. So perform a quick test first. 
- */ - if (mm->num_exe_file_vmas) - return -EBUSY; - exe_file = fget(fd); if (!exe_file) return -EBADF; @@ -1827,17 +1821,30 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) if (err) goto exit; + down_write(&mm->mmap_sem); + + /* + * Forbid mm->exe_file change if there are mapped other files. + */ + err = -EBUSY; + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (vma->vm_file && !path_equal(&vma->vm_file->f_path, + &exe_file->f_path)) + goto exit_unlock; + } + /* * The symlink can be changed only once, just to disallow arbitrary * transitions malicious software might bring in. This means one * could make a snapshot over all processes running and monitor * /proc/pid/exe changes to notice unusual activity if needed. */ - down_write(&mm->mmap_sem); - if (likely(!mm->exe_file)) - set_mm_exe_file(mm, exe_file); - else - err = -EBUSY; + err = -EPERM; + if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags)) + goto exit_unlock; + + set_mm_exe_file(mm, exe_file); +exit_unlock: up_write(&mm->mmap_sem); exit: From 1ad75b9e16280ca4e2501a629a225319cf2eef2e Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 7 Jun 2012 14:21:11 -0700 Subject: [PATCH 155/178] c/r: prctl: add minimal address test to PR_SET_MM Make sure the address being set is greater than mmap_min_addr (as suggested by Kees Cook). Signed-off-by: Cyrill Gorcunov Acked-by: Kees Cook Cc: Serge Hallyn Cc: Tejun Heo Cc: Pavel Emelyanov Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sys.c b/kernel/sys.c index 54f20fdee93c..19a2c7139960 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1869,7 +1869,7 @@ static int prctl_set_mm(int opt, unsigned long addr, if (opt == PR_SET_MM_EXE_FILE) return prctl_set_mm_exe_file(mm, (unsigned int)addr); - if (addr >= TASK_SIZE) + if (addr >= TASK_SIZE || addr < mmap_min_addr) return -EINVAL; error = -EINVAL; From 300f786b2683f8bb1ec0afb6e1851183a479c86d Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 7 Jun 2012 14:21:12 -0700 Subject: [PATCH 156/178] c/r: prctl: add ability to get clear_tid_address Zero is written at clear_tid_address when the process exits. This functionality is used by pthread_join(). We already have sys_set_tid_address() to change this address for the current task but there is no way to obtain it from user space. Without the ability to find this address and dump it we can't restore pthread'ed apps which call pthread_join() once they have been restored. This patch introduces the PR_GET_TID_ADDRESS prctl option which allows the current process to obtain its own clear_tid_address. This feature is available only if CONFIG_CHECKPOINT_RESTORE is set.
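[editor's note: an illustrative userspace sketch, not part of the patch. It assumes a kernel carrying this patch with CONFIG_CHECKPOINT_RESTORE=y; the fallback define covers older <linux/prctl.h> headers that lack PR_GET_TID_ADDRESS.]

#include <stdio.h>
#include <sys/prctl.h>

#ifndef PR_GET_TID_ADDRESS
#define PR_GET_TID_ADDRESS 40
#endif

int main(void)
{
	int *tid_addr = NULL;

	/* The kernel put_user()s current->clear_child_tid into *arg2. */
	if (prctl(PR_GET_TID_ADDRESS, (unsigned long)&tid_addr, 0, 0, 0))
		perror("prctl");	/* EINVAL without CONFIG_CHECKPOINT_RESTORE */
	else
		printf("clear_tid_address: %p\n", (void *)tid_addr);
	return 0;
}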
[akpm@linux-foundation.org: fix prctl numbering] Signed-off-by: Andrew Vagin Signed-off-by: Cyrill Gorcunov Cc: Pedro Alves Cc: Oleg Nesterov Cc: Pavel Emelyanov Cc: Tejun Heo Acked-by: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/prctl.h | 10 ++++++---- kernel/sys.c | 13 +++++++++++++ 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/include/linux/prctl.h b/include/linux/prctl.h index 711e0a30aacc..3988012255dc 100644 --- a/include/linux/prctl.h +++ b/include/linux/prctl.h @@ -127,8 +127,8 @@ #define PR_SET_PTRACER 0x59616d61 # define PR_SET_PTRACER_ANY ((unsigned long)-1) -#define PR_SET_CHILD_SUBREAPER 36 -#define PR_GET_CHILD_SUBREAPER 37 +#define PR_SET_CHILD_SUBREAPER 36 +#define PR_GET_CHILD_SUBREAPER 37 /* * If no_new_privs is set, then operations that grant new privileges (i.e. @@ -142,7 +142,9 @@ * asking selinux for a specific new context (e.g. with runcon) will result * in execve returning -EPERM. */ -#define PR_SET_NO_NEW_PRIVS 38 -#define PR_GET_NO_NEW_PRIVS 39 +#define PR_SET_NO_NEW_PRIVS 38 +#define PR_GET_NO_NEW_PRIVS 39 + +#define PR_GET_TID_ADDRESS 40 #endif /* _LINUX_PRCTL_H */ diff --git a/kernel/sys.c b/kernel/sys.c index 19a2c7139960..0ec1942ba7ea 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1988,12 +1988,22 @@ out: up_read(&mm->mmap_sem); return error; } + +static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr) +{ + return put_user(me->clear_child_tid, tid_addr); +} + #else /* CONFIG_CHECKPOINT_RESTORE */ static int prctl_set_mm(int opt, unsigned long addr, unsigned long arg4, unsigned long arg5) { return -EINVAL; } +static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr) +{ + return -EINVAL; +} #endif SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, @@ -2131,6 +2141,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, else return -EINVAL; break; + case PR_GET_TID_ADDRESS: + error = prctl_get_tid_address(me, (int __user **)arg2); + break; default: return -EINVAL; } From 736f24d5e59d699c6e300c5da7e3bb882eddda67 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 7 Jun 2012 14:21:12 -0700 Subject: [PATCH 157/178] c/r: prctl: drop VMA flags test on PR_SET_MM_ stack data assignment In commit b76437579d13 ("procfs: mark thread stack correctly in proc/<pid>/maps") the stack allocated via clone() is marked in /proc/<pid>/maps as [stack:%d], and thus it might lie outside the former mm->start_stack/end_stack values (and even have some custom VMA flags set). So, to be able to restore mm->start_stack/end_stack, drop the VMA flags test, but still require the underlying VMA to exist. As always, note this feature is under CONFIG_CHECKPOINT_RESTORE and requires CAP_SYS_RESOURCE to be granted.
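[editor's note: an illustrative userspace sketch, not part of the patch. The prctl values match the 3.5-era <linux/prctl.h>; the helper name and saved address are hypothetical. The call still requires CONFIG_CHECKPOINT_RESTORE, CAP_SYS_RESOURCE, and an address inside an existing VMA -- only the VM_GROWSDOWN/VM_GROWSUP flags test is gone.]

#include <sys/prctl.h>

#ifndef PR_SET_MM
#define PR_SET_MM		35
#define PR_SET_MM_START_STACK	5
#endif

/* Re-point mm->start_stack at a clone()d stack marked [stack:%d]. */
static int restore_stack_start(unsigned long saved_start_stack)
{
	return prctl(PR_SET_MM, PR_SET_MM_START_STACK,
		     saved_start_stack, 0, 0);
}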
Signed-off-by: Cyrill Gorcunov Cc: Oleg Nesterov Acked-by: Kees Cook Cc: Pavel Emelyanov Cc: Serge Hallyn Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/kernel/sys.c b/kernel/sys.c index 0ec1942ba7ea..f0ec44dcd415 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1786,14 +1786,6 @@ SYSCALL_DEFINE1(umask, int, mask) } #ifdef CONFIG_CHECKPOINT_RESTORE -static bool vma_flags_mismatch(struct vm_area_struct *vma, - unsigned long required, - unsigned long banned) -{ - return (vma->vm_flags & required) != required || - (vma->vm_flags & banned); -} - static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) { struct vm_area_struct *vma; @@ -1931,12 +1923,6 @@ static int prctl_set_mm(int opt, unsigned long addr, error = -EFAULT; goto out; } -#ifdef CONFIG_STACK_GROWSUP - if (vma_flags_mismatch(vma, VM_READ | VM_WRITE | VM_GROWSUP, 0)) -#else - if (vma_flags_mismatch(vma, VM_READ | VM_WRITE | VM_GROWSDOWN, 0)) -#endif - goto out; if (opt == PR_SET_MM_START_STACK) mm->start_stack = addr; else if (opt == PR_SET_MM_ARG_START) From 4e791c98ae7ff889121ca93b7bd97206e4a8d793 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 7 Jun 2012 14:21:12 -0700 Subject: [PATCH 158/178] drivers/platform/x86/acerhdf.c: correct Boris' mail address Correct mail address reference to a mail account which I actually read. Signed-off-by: Borislav Petkov Cc: Peter Feuerer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/platform/x86/acerhdf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/platform/x86/acerhdf.c b/drivers/platform/x86/acerhdf.c index 639db4d0aa76..2fd9d36acd15 100644 --- a/drivers/platform/x86/acerhdf.c +++ b/drivers/platform/x86/acerhdf.c @@ -5,7 +5,7 @@ * * (C) 2009 - Peter Feuerer peter (a) piie.net * http://piie.net - * 2009 Borislav Petkov + * 2009 Borislav Petkov bp (a) alien8.de * * Inspired by and many thanks to: * o acerfand - Rachel Greenham From 7d8a45695cc8f9fcdf4121fcbd897ecb63f758e4 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 7 Jun 2012 14:21:13 -0700 Subject: [PATCH 159/178] ipc: shm: restore MADV_REMOVE functionality on shared memory segments Commit 17cf28afea2a ("mm/fs: remove truncate_range") removed the truncate_range inode operation in favour of the fallocate file operation. When using SYSV IPC shared memory segments, calling madvise with the MADV_REMOVE advice on an area of shared memory will attempt to invoke the .fallocate function for the shm_file_operations, which is NULL and therefore returns -EOPNOTSUPP to userspace. The previous behaviour would inherit the inode_operations from the underlying tmpfs file and invoke truncate_range there. This patch restores the previous behaviour by wrapping the underlying fallocate function in shm_fallocate, as we do for fsync. 
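[editor's note: an illustrative userspace sketch, not part of the patch -- the sequence whose behaviour is restored here. Error handling is abbreviated.]

#include <stdio.h>
#include <sys/ipc.h>
#include <sys/mman.h>
#include <sys/shm.h>

int main(void)
{
	size_t len = 4 << 20;
	int id = shmget(IPC_PRIVATE, len, IPC_CREAT | 0600);
	char *p = shmat(id, NULL, 0);

	if (id < 0 || p == (void *)-1)
		return 1;

	p[0] = 1;	/* fault a page in */

	/* Fails with EOPNOTSUPP while shm_file_operations lacks
	 * .fallocate; works again once shm_fallocate() forwards the
	 * call to the underlying tmpfs file, as shm_fsync() does. */
	if (madvise(p, len, MADV_REMOVE))
		perror("madvise(MADV_REMOVE)");

	shmdt(p);
	shmctl(id, IPC_RMID, NULL);
	return 0;
}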
[hughd@google.com: use -ENOTSUPP in shm_fallocate()] Signed-off-by: Will Deacon Acked-by: Hugh Dickins Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/shm.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/ipc/shm.c b/ipc/shm.c index 5e2cbfdab6fc..41c1285d697a 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -393,6 +393,16 @@ static int shm_fsync(struct file *file, loff_t start, loff_t end, int datasync) return sfd->file->f_op->fsync(sfd->file, start, end, datasync); } +static long shm_fallocate(struct file *file, int mode, loff_t offset, + loff_t len) +{ + struct shm_file_data *sfd = shm_file_data(file); + + if (!sfd->file->f_op->fallocate) + return -EOPNOTSUPP; + return sfd->file->f_op->fallocate(file, mode, offset, len); +} + static unsigned long shm_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) @@ -410,6 +420,7 @@ static const struct file_operations shm_file_operations = { .get_unmapped_area = shm_get_unmapped_area, #endif .llseek = noop_llseek, + .fallocate = shm_fallocate, }; static const struct file_operations shm_file_operations_huge = { @@ -418,6 +429,7 @@ static const struct file_operations shm_file_operations_huge = { .release = shm_release, .get_unmapped_area = shm_get_unmapped_area, .llseek = noop_llseek, + .fallocate = shm_fallocate, }; int is_file_shm_hugepages(struct file *file) From cbf8ae32f66a9ceb8907ad9e16663c2a29e48990 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Thu, 7 Jun 2012 14:21:13 -0700 Subject: [PATCH 160/178] btree: fix tree corruption in btree_get_prev() The memory the parameter __key points to is used as an iterator in btree_get_prev(), so if we save off a bkey() pointer in retry_key and then assign that to __key, we'll end up corrupting the btree internals when we do e.g. longcpy(__key, bkey(geo, node, i), geo->keylen); to return the key value. What we should do instead is use longcpy() to copy the key value that retry_key points to into __key. This can cause a btree to get corrupted by seemingly read-only operations such as btree_for_each_safe. [akpm@linux-foundation.org: avoid the double longcpy()] Signed-off-by: Roland Dreier Acked-by: Joern Engel Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/btree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/btree.c b/lib/btree.c index e5ec1e9c1aa5..5cf9e74ec3f3 100644 --- a/lib/btree.c +++ b/lib/btree.c @@ -319,8 +319,8 @@ void *btree_get_prev(struct btree_head *head, struct btree_geo *geo, if (head->height == 0) return NULL; -retry: longcpy(key, __key, geo->keylen); +retry: dec_key(geo, key); node = head->node; @@ -351,7 +351,7 @@ retry: } miss: if (retry_key) { - __key = retry_key; + longcpy(key, retry_key, geo->keylen); retry_key = NULL; goto retry; } From 39caa0916ef27cf1da5026eb708a2b8413156f75 Mon Sep 17 00:00:00 2001 From: Joern Engel Date: Thu, 7 Jun 2012 14:21:14 -0700 Subject: [PATCH 161/178] btree: catch NULL value before it does harm Storing NULL values in the btree is illegal and can lead to memory corruption and possibly other fun as well. Catch it on insert, instead of waiting for the inevitable.
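[editor's note: an illustrative in-kernel sketch, not part of the patch; the tree and helper are hypothetical. After this change a NULL value trips the BUG_ON() at insert time instead of silently corrupting later lookups.]

#include <linux/btree.h>
#include <linux/slab.h>

static struct btree_head demo_tree;

static int __init demo_btree_store(void)
{
	unsigned long key = 42;
	void *val;
	int err;

	err = btree_init(&demo_tree);
	if (err)
		return err;

	val = kmalloc(16, GFP_KERNEL);
	if (!val)		/* never hand NULL to btree_insert() */
		return -ENOMEM;

	return btree_insert(&demo_tree, &btree_geo64, &key, val, GFP_KERNEL);
}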
Signed-off-by: Joern Engel Signed-off-by: Roland Dreier Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/btree.c | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/btree.c b/lib/btree.c index 5cf9e74ec3f3..f9a484676cb6 100644 --- a/lib/btree.c +++ b/lib/btree.c @@ -509,6 +509,7 @@ retry: int btree_insert(struct btree_head *head, struct btree_geo *geo, unsigned long *key, void *val, gfp_t gfp) { + BUG_ON(!val); return btree_insert_level(head, geo, key, val, 1, gfp); } EXPORT_SYMBOL_GPL(btree_insert); From 40af1bbdca47e5c8a2044039bb78ca8fd8b20f94 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 7 Jun 2012 14:21:14 -0700 Subject: [PATCH 162/178] mm: correctly synchronize rss-counters at exit/exec mm->rss_stat counters have per-task delta: task->rss_stat. Before changing task->mm pointer the kernel must flush this delta with sync_mm_rss(). do_exit() already calls sync_mm_rss() to flush the rss-counters before committing the rss statistics into task->signal->maxrss, taskstats, audit and other stuff. Unfortunately the kernel does this before calling mm_release(), which can call put_user() for processing task->clear_child_tid. So at this point we can trigger page-faults and task->rss_stat becomes non-zero again. As a result mm->rss_stat becomes inconsistent and check_mm() will print something like this: | BUG: Bad rss-counter state mm:ffff88020813c380 idx:1 val:-1 | BUG: Bad rss-counter state mm:ffff88020813c380 idx:2 val:1 This patch moves sync_mm_rss() into mm_release(), and moves mm_release() out of do_exit() and calls it earlier. After mm_release() there should be no pagefaults. [akpm@linux-foundation.org: tweak comment] Signed-off-by: Konstantin Khlebnikov Reported-by: Markus Trippelsdorf Cc: Hugh Dickins Cc: KAMEZAWA Hiroyuki Cc: Oleg Nesterov Cc: [3.4.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 1 - kernel/exit.c | 13 ++++++++----- kernel/fork.c | 8 ++++++++ 3 files changed, 16 insertions(+), 6 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index a79786a8d2c8..b926ed19301e 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -819,7 +819,6 @@ static int exec_mmap(struct mm_struct *mm) /* Notify parent that we're no longer interested in the old VM */ tsk = current; old_mm = current->mm; - sync_mm_rss(old_mm); mm_release(tsk, old_mm); if (old_mm) { diff --git a/kernel/exit.c b/kernel/exit.c index 34867cc5b42a..804fb6bb8161 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -423,6 +423,7 @@ void daemonize(const char *name, ...) * user space pages. We don't need them, and if we didn't close them * they would be locked into memory. 
*/ + mm_release(current, current->mm); exit_mm(current); /* * We don't want to get frozen, in case system-wide hibernation @@ -640,7 +641,6 @@ static void exit_mm(struct task_struct * tsk) struct mm_struct *mm = tsk->mm; struct core_state *core_state; - mm_release(tsk, mm); if (!mm) return; /* @@ -960,9 +960,13 @@ void do_exit(long code) preempt_count()); acct_update_integrals(tsk); - /* sync mm's RSS info before statistics gathering */ - if (tsk->mm) - sync_mm_rss(tsk->mm); + + /* Set exit_code before complete_vfork_done() in mm_release() */ + tsk->exit_code = code; + + /* Release mm and sync mm's RSS info before statistics gathering */ + mm_release(tsk, tsk->mm); + group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { hrtimer_cancel(&tsk->signal->real_timer); @@ -975,7 +979,6 @@ void do_exit(long code) tty_audit_exit(); audit_free(tsk); - tsk->exit_code = code; taskstats_exit(tsk, group_dead); exit_mm(tsk); diff --git a/kernel/fork.c b/kernel/fork.c index ab5211b9e622..0560781c6904 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -619,6 +619,14 @@ void mmput(struct mm_struct *mm) module_put(mm->binfmt->module); mmdrop(mm); } + + /* + * Final rss-counter synchronization. After this point there must be + * no pagefaults into this mm from the current context. Otherwise + * mm->rss_stat will be inconsistent. + */ + if (mm) + sync_mm_rss(mm); } EXPORT_SYMBOL_GPL(mmput); From b0dd6b70f0fda17ae9762fbb72d98e40a4f66556 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 7 Jun 2012 18:56:06 -0400 Subject: [PATCH 163/178] ext4: fix the free blocks calculation for ext3 file systems w/ uninit_bg Ext3 filesystems that are converted to use as many ext4 file system features as possible will enable uninit_bg to speed up e2fsck times. These file systems will have a native ext3 layout of inode tables and block allocation bitmaps (as opposed to ext4's flex_bg layout). Unfortunately, in these cases, when first allocating a block in an uninitialized block group, ext4 would incorrectly calculate the number of free blocks in that block group, and then erroneously report that the file system was corrupt: EXT4-fs error (device vdd): ext4_mb_generate_buddy:741: group 30, 32254 clusters in bitmap, 32258 in gd This problem can be reproduced via: mke2fs -q -t ext4 -O ^flex_bg /dev/vdd 5g mount -t ext4 /dev/vdd /mnt fallocate -l 4600m /mnt/test The problem was caused by a bone-headed mistake in the check to see if a particular metadata block was part of the block group. Many thanks to Kees Cook for finding and bisecting the buggy commit which introduced this bug (commit fd034a84e1, present since v3.2). Reported-by: Sander Eikelenboom Reported-by: Kees Cook Signed-off-by: "Theodore Ts'o" Tested-by: Kees Cook Cc: stable@kernel.org --- fs/ext4/balloc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 99b6324290db..cee7812cc3cf 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -90,8 +90,8 @@ unsigned ext4_num_overhead_clusters(struct super_block *sb, * unusual file system layouts.
*/ if (ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp), block_group)) { - block_cluster = EXT4_B2C(sbi, (start - - ext4_block_bitmap(sb, gdp))); + block_cluster = EXT4_B2C(sbi, + ext4_block_bitmap(sb, gdp) - start); if (block_cluster < num_clusters) block_cluster = -1; else if (block_cluster == num_clusters) { @@ -102,7 +102,7 @@ if (ext4_block_in_group(sb, ext4_inode_bitmap(sb, gdp), block_group)) { inode_cluster = EXT4_B2C(sbi, - start - ext4_inode_bitmap(sb, gdp)); + ext4_inode_bitmap(sb, gdp) - start); if (inode_cluster < num_clusters) inode_cluster = -1; else if (inode_cluster == num_clusters) { @@ -114,7 +114,7 @@ itbl_blk = ext4_inode_table(sb, gdp); for (i = 0; i < sbi->s_itb_per_group; i++) { if (ext4_block_in_group(sb, itbl_blk + i, block_group)) { - c = EXT4_B2C(sbi, start - itbl_blk + i); + c = EXT4_B2C(sbi, itbl_blk + i - start); if ((c < num_clusters) || (c == inode_cluster) || (c == block_cluster) || (c == itbl_cluster)) continue; From b22b1f178f6799278d3178d894f37facb2085765 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Thu, 7 Jun 2012 19:04:19 -0400 Subject: [PATCH 164/178] ext4: don't set i_flags in EXT4_IOC_SETFLAGS Commit 7990696 uses the ext4_{set,clear}_inode_flags() functions to change the i_flags automatically but fails to remove the erroneous direct setting of i_flags. So we still have the problem of trashing state flags. Fix this by removing the assignment. Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org --- fs/ext4/ioctl.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 8ad112ae0ade..e34deac3f366 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -123,7 +123,6 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) else ext4_clear_inode_flag(inode, i); } - ei->i_flags = flags; ext4_set_inode_flags(inode); inode->i_ctime = ext4_current_time(inode); From 48d212a2eecaca2e1875925837ad27b2f43f48a3 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 7 Jun 2012 17:54:07 -0700 Subject: [PATCH 165/178] Revert "mm: correctly synchronize rss-counters at exit/exec" This reverts commit 40af1bbdca47e5c8a2044039bb78ca8fd8b20f94. It's horribly and utterly broken for at least the following reasons: - calling sync_mm_rss() from mmput() is fundamentally wrong, because there's absolutely no reason to believe that the task that does the mmput() always does it on its own VM. Example: fork, ptrace, /proc - you name it. - calling it *after* having done mmdrop() on it is doubly insane, since the mm struct may well be gone now. - testing mm against NULL before you call it is insane too, since a NULL mm there would have caused oopses long before. .. and those are just the three bugs I found before I decided to give up looking for more and revert it asap. I should have caught it before I even took it, but I trusted Andrew too much.
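[editor's note: an illustrative in-kernel sketch, not part of the revert -- the foreign-mm pattern the first bullet alludes to. Callers that take a reference via get_task_mm(), as /proc does, may perform the final mmput() on an mm that was never their own, so flushing current's per-task rss delta from mmput() targets the wrong mm.]

#include <linux/mm.h>
#include <linux/sched.h>

static void demo_inspect_mm(struct task_struct *task)
{
	struct mm_struct *mm = get_task_mm(task);	/* often not current's mm */

	if (!mm)
		return;
	/* ... read statistics out of mm ... */
	mmput(mm);	/* may drop the final reference here */
}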
Cc: Konstantin Khlebnikov Cc: Markus Trippelsdorf Cc: Hugh Dickins Cc: KAMEZAWA Hiroyuki Cc: Oleg Nesterov Cc: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 1 + kernel/exit.c | 13 +++++-------- kernel/fork.c | 8 -------- 3 files changed, 6 insertions(+), 16 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index b926ed19301e..a79786a8d2c8 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -819,6 +819,7 @@ static int exec_mmap(struct mm_struct *mm) /* Notify parent that we're no longer interested in the old VM */ tsk = current; old_mm = current->mm; + sync_mm_rss(old_mm); mm_release(tsk, old_mm); if (old_mm) { diff --git a/kernel/exit.c b/kernel/exit.c index 804fb6bb8161..34867cc5b42a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -423,7 +423,6 @@ void daemonize(const char *name, ...) * user space pages. We don't need them, and if we didn't close them * they would be locked into memory. */ - mm_release(current, current->mm); exit_mm(current); /* * We don't want to get frozen, in case system-wide hibernation @@ -641,6 +640,7 @@ static void exit_mm(struct task_struct * tsk) struct mm_struct *mm = tsk->mm; struct core_state *core_state; + mm_release(tsk, mm); if (!mm) return; /* @@ -960,13 +960,9 @@ void do_exit(long code) preempt_count()); acct_update_integrals(tsk); - - /* Set exit_code before complete_vfork_done() in mm_release() */ - tsk->exit_code = code; - - /* Release mm and sync mm's RSS info before statistics gathering */ - mm_release(tsk, tsk->mm); - + /* sync mm's RSS info before statistics gathering */ + if (tsk->mm) + sync_mm_rss(tsk->mm); group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { hrtimer_cancel(&tsk->signal->real_timer); @@ -979,6 +975,7 @@ void do_exit(long code) tty_audit_exit(); audit_free(tsk); + tsk->exit_code = code; taskstats_exit(tsk, group_dead); exit_mm(tsk); diff --git a/kernel/fork.c b/kernel/fork.c index 0560781c6904..ab5211b9e622 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -619,14 +619,6 @@ void mmput(struct mm_struct *mm) module_put(mm->binfmt->module); mmdrop(mm); } - - /* - * Final rss-counter synchronization. After this point there must be - * no pagefaults into this mm from the current context. Otherwise - * mm->rss_stat will be inconsistent. - */ - if (mm) - sync_mm_rss(mm); } EXPORT_SYMBOL_GPL(mmput); From 860aed25a1f0936d4852ab936252b47cd1e630f1 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 1 Jun 2012 18:13:43 +1000 Subject: [PATCH 166/178] powerpc/time: Sanity check of decrementer expiration is necessary This reverts 68568add2c ("powerpc/time: Remove unnecessary sanity check of decrementer expiration"). We do need to check whether we have reached the expiration time of the next event, because we sometimes get an early decrementer interrupt, most notably when we set the decrementer to 1 in arch_irq_work_raise(). The effect of not having the sanity check is that if timer_interrupt() gets called early, we leave the decrementer set to its maximum value, which means we then don't get any more decrementer interrupts for about 4 seconds (or longer, depending on timebase frequency). I saw these pauses as a consequence of getting a stray hypervisor decrementer interrupt left over from exiting a KVM guest. This isn't quite a straight revert because of changes to the surrounding code, but it restores the same algorithm as was previously used. 
Cc: stable@vger.kernel.org Acked-by: Anton Blanchard Acked-by: Benjamin Herrenschmidt Signed-off-by: Paul Mackerras --- arch/powerpc/kernel/time.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 99a995c2a3f2..be171ee73bf8 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -475,6 +475,7 @@ void timer_interrupt(struct pt_regs * regs) struct pt_regs *old_regs; u64 *next_tb = &__get_cpu_var(decrementers_next_tb); struct clock_event_device *evt = &__get_cpu_var(decrementers); + u64 now; /* Ensure a positive value is written to the decrementer, or else * some CPUs will continue to take decrementer exceptions. @@ -509,9 +510,16 @@ void timer_interrupt(struct pt_regs * regs) irq_work_run(); } - *next_tb = ~(u64)0; - if (evt->event_handler) - evt->event_handler(evt); + now = get_tb_or_rtc(); + if (now >= *next_tb) { + *next_tb = ~(u64)0; + if (evt->event_handler) + evt->event_handler(evt); + } else { + now = *next_tb - now; + if (now <= DECREMENTER_MAX) + set_dec((int)now); + } #ifdef CONFIG_PPC64 /* collect purr register values often, for accurate calculations */ From ae82fdb1406ad41d68f07027fe31f2d35ba22a90 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 8 Jun 2012 14:58:13 +0930 Subject: [PATCH 167/178] module_param: stop double-calling parameters. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 026cee0086fe1df4cf74691cf273062cc769617d "params: _initcall-like kernel parameters" set old-style module parameters to level 0. And we call those level 0 calls where we used to, early in start_kernel(). We also loop through the initcall levels and call the levelled module_params before the corresponding initcall. Unfortunately level 0 is early_init(), so we call the standard module_param calls twice. (Turns out most things don't care, but at least ubi.mtd does). Change the level to -1 for standard module_param calls. Reported-by: Benoît Thébaudeau Signed-off-by: Rusty Russell Cc: stable@kernel.org --- include/linux/moduleparam.h | 10 +++++----- init/main.c | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index 1b14d25162cb..d6a58065c09c 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -128,7 +128,7 @@ struct kparam_array * The ops can have NULL set or get functions. */ #define module_param_cb(name, ops, arg, perm) \ - __module_param_call(MODULE_PARAM_PREFIX, name, ops, arg, perm, 0) + __module_param_call(MODULE_PARAM_PREFIX, name, ops, arg, perm, -1) /** * _param_cb - general callback for a module/cmdline parameter @@ -192,7 +192,7 @@ struct kparam_array { (void *)set, (void *)get }; \ __module_param_call(MODULE_PARAM_PREFIX, \ name, &__param_ops_##name, arg, \ - (perm) + sizeof(__check_old_set_param(set))*0, 0) + (perm) + sizeof(__check_old_set_param(set))*0, -1) /* We don't get oldget: it's often a new-style param_get_uint, etc. 
*/ static inline int @@ -272,7 +272,7 @@ static inline void __kernel_param_unlock(void) */ #define core_param(name, var, type, perm) \ param_check_##type(name, &(var)); \ - __module_param_call("", name, &param_ops_##type, &var, perm, 0) + __module_param_call("", name, &param_ops_##type, &var, perm, -1) #endif /* !MODULE */ /** @@ -290,7 +290,7 @@ static inline void __kernel_param_unlock(void) = { len, string }; \ __module_param_call(MODULE_PARAM_PREFIX, name, \ &param_ops_string, \ - .str = &__param_string_##name, perm, 0); \ + .str = &__param_string_##name, perm, -1); \ __MODULE_PARM_TYPE(name, "string") /** @@ -432,7 +432,7 @@ extern int param_set_bint(const char *val, const struct kernel_param *kp); __module_param_call(MODULE_PARAM_PREFIX, name, \ &param_array_ops, \ .arr = &__param_arr_##name, \ - perm, 0); \ + perm, -1); \ __MODULE_PARM_TYPE(name, "array of " #type) extern struct kernel_param_ops param_array_ops; diff --git a/init/main.c b/init/main.c index 1ca6b32c4828..37e12098eac1 100644 --- a/init/main.c +++ b/init/main.c @@ -508,7 +508,7 @@ asmlinkage void __init start_kernel(void) parse_early_param(); parse_args("Booting kernel", static_command_line, __start___param, __stop___param - __start___param, - 0, 0, &unknown_bootoption); + -1, -1, &unknown_bootoption); jump_label_init(); From 19efb72fdc3c3bbfb929d91e34312b0494f14409 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 1 Jun 2012 18:56:00 +0200 Subject: [PATCH 168/178] init: Drop initcall level output 9fb48c744ba6a ("params: add 3rd arg to option handler callback signature") added similar lines to dmesg: initlevel:0=early, 4 registered initcalls initlevel:1=core, 31 registered initcalls initlevel:2=postcore, 11 registered initcalls initlevel:3=arch, 7 registered initcalls initlevel:4=subsys, 40 registered initcalls initlevel:5=fs, 30 registered initcalls initlevel:6=device, 250 registered initcalls initlevel:7=late, 35 registered initcalls but they don't contain any info for the general user staring at dmesg. I'm very doubtful the count of initcalls registered per level helps anyone so drop that output completely. Cc: Jim Cromie Cc: Rusty Russell Cc: Jason Baron Signed-off-by: Borislav Petkov Signed-off-by: Rusty Russell --- init/main.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/init/main.c b/init/main.c index 37e12098eac1..b5cc0a7c4708 100644 --- a/init/main.c +++ b/init/main.c @@ -755,13 +755,8 @@ static void __init do_initcalls(void) { int level; - for (level = 0; level < ARRAY_SIZE(initcall_levels) - 1; level++) { - pr_info("initlevel:%d=%s, %d registered initcalls\n", - level, initcall_level_names[level], - (int) (initcall_levels[level+1] - - initcall_levels[level])); + for (level = 0; level < ARRAY_SIZE(initcall_levels) - 1; level++) do_initcall_level(level); - } } /* From bd2753b2dda7bb43c7468826de75f49c6a7e8965 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 6 Jun 2012 10:55:40 -0700 Subject: [PATCH 169/178] x86/mm: Only add extra pages count for the first memory range during pre-allocation early page table space Robin found this regression: | I just tried to boot an 8TB system. It fails very early in boot with: | Kernel panic - not syncing: Cannot find space for the kernel page tables git bisect pointed to commit 722bc6b16771ed80871e1fd81c86d3627dda2ac8. A git revert of that commit does boot past that point on the 8TB configuration. That commit added up extra pages for every memory range, even those above 4g. Limit that extra page count to the first entry only.
Bisected-by: Robin Holt Tested-by: Robin Holt Signed-off-by: Yinghai Lu Cc: WANG Cong Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/CAE9FiQUj3wyzQxtq9yzBNc9u220p8JZ1FYHG7t%3DMOzJ%3D9BZMYA@mail.gmail.com Signed-off-by: Ingo Molnar --- arch/x86/mm/init.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 97141c26a13a..bc4e9d84157f 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -62,7 +62,8 @@ static void __init find_early_table_space(struct map_range *mr, unsigned long en extra += PMD_SIZE; #endif /* The first 2/4M doesn't use large pages. */ - extra += mr->end - mr->start; + if (mr->start < PMD_SIZE) + extra += mr->end - mr->start; ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; } else From d5d2d2eea84b0d8450b082edbc3dbde41fb8bfd8 Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Thu, 7 Jun 2012 08:31:40 -0500 Subject: [PATCH 170/178] x86/uv: Fix UV2 BAU legacy mode The SGI Altix UV2 BAU (Broadcast Assist Unit), as used for tlb-shootdown (selective broadcast mode), always uses the UV2 broadcast descriptor format. There is no need to clear the 'legacy' (UV1) mode, because the hardware always uses UV2 mode for selective broadcast. But the BIOS uses general broadcast and legacy mode, and the hardware pays attention to the legacy mode bit for general broadcast. So the kernel must not clear that mode bit. Signed-off-by: Cliff Wickman Cc: Link: http://lkml.kernel.org/r/E1SccoO-0002Lh-Cb@eag09.americas.sgi.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/uv_bau.h | 1 - arch/x86/platform/uv/tlb_uv.c | 1 - 2 files changed, 2 deletions(-) diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index becf47b81735..6149b476d9df 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -149,7 +149,6 @@ /* 4 bits of software ack period */ #define UV2_ACK_MASK 0x7UL #define UV2_ACK_UNITS_SHFT 3 -#define UV2_LEG_SHFT UV2H_LB_BAU_MISC_CONTROL_USE_LEGACY_DESCRIPTOR_FORMATS_SHFT #define UV2_EXT_SHFT UV2H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_SHFT /* diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 3ae0e61abd23..59880afa851f 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -1295,7 +1295,6 @@ static void __init enable_timeouts(void) */ mmr_image |= (1L << SOFTACK_MSHIFT); if (is_uv2_hub()) { - mmr_image &= ~(1L << UV2_LEG_SHFT); mmr_image |= (1L << UV2_EXT_SHFT); } write_mmr_misc_control(pnode, mmr_image); From 3c75296562f43e6fbc6cddd3de948a7b3e4e9bcf Mon Sep 17 00:00:00 2001 From: Steffen Rumler Date: Wed, 6 Jun 2012 16:37:17 +0200 Subject: [PATCH 171/178] powerpc: Fix kernel panic during kernel module load This fixes a problem which can cause kernel oopses while loading a kernel module. According to the PowerPC EABI specification, GPR r11 is assigned the dedicated function to point to the previous stack frame. In the powerpc-specific kernel module loader, do_plt_call() (in arch/powerpc/kernel/module_32.c), GPR r11 is also used to generate trampoline code. This combination crashes the kernel, in the case where the compiler chooses to use a helper function for saving GPRs on entry, and the module loader has placed the .init.text section far away from the .text section, meaning that it has to generate a trampoline for functions in the .init.text section to call the GPR save helper.
Because the trampoline trashes r11, references to the stack frame using r11 can cause an oops. The fix just uses GPR r12 instead of GPR r11 for generating the trampoline code. According to the statements from Freescale, this is safe from an EABI perspective. I've tested the fix for kernel 2.6.33 on MPC8541. Cc: stable@vger.kernel.org Signed-off-by: Steffen Rumler [paulus@samba.org: reworded the description] Signed-off-by: Paul Mackerras --- arch/powerpc/kernel/module_32.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/kernel/module_32.c b/arch/powerpc/kernel/module_32.c index 0b6d79617d7b..2e3200ca485f 100644 --- a/arch/powerpc/kernel/module_32.c +++ b/arch/powerpc/kernel/module_32.c @@ -176,8 +176,8 @@ int module_frob_arch_sections(Elf32_Ehdr *hdr, static inline int entry_matches(struct ppc_plt_entry *entry, Elf32_Addr val) { - if (entry->jump[0] == 0x3d600000 + ((val + 0x8000) >> 16) - && entry->jump[1] == 0x396b0000 + (val & 0xffff)) + if (entry->jump[0] == 0x3d800000 + ((val + 0x8000) >> 16) + && entry->jump[1] == 0x398c0000 + (val & 0xffff)) return 1; return 0; } @@ -204,10 +204,9 @@ static uint32_t do_plt_call(void *location, entry++; } - /* Stolen from Paul Mackerras as well... */ - entry->jump[0] = 0x3d600000+((val+0x8000)>>16); /* lis r11,sym@ha */ - entry->jump[1] = 0x396b0000 + (val&0xffff); /* addi r11,r11,sym@l*/ - entry->jump[2] = 0x7d6903a6; /* mtctr r11 */ + entry->jump[0] = 0x3d800000+((val+0x8000)>>16); /* lis r12,sym@ha */ + entry->jump[1] = 0x398c0000 + (val&0xffff); /* addi r12,r12,sym@l*/ + entry->jump[2] = 0x7d8903a6; /* mtctr r12 */ entry->jump[3] = 0x4e800420; /* bctr */ DEBUGP("Initialized plt for 0x%x at %p\n", val, entry); From eeaaa96a3a2134a174100afd129bb0891d05f4b2 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Wed, 6 Jun 2012 10:05:42 -0400 Subject: [PATCH 172/178] x86/nmi: Fix section mismatch warnings on 32-bit It was reported that compiling for 32-bit caused a bunch of section mismatch warnings: VDSOSYM arch/x86/vdso/vdso32-syms.lds LD arch/x86/vdso/built-in.o LD arch/x86/built-in.o WARNING: arch/x86/built-in.o(.data+0x5af0): Section mismatch in reference from the variable test_nmi_ipi_callback_na.10451 to the function .init.text:test_nmi_ipi_callback() [...] WARNING: arch/x86/built-in.o(.data+0x5b04): Section mismatch in reference from the variable nmi_unk_cb_na.10399 to the function .init.text:nmi_unk_cb() The variable nmi_unk_cb_na.10399 references the function __init nmi_unk_cb() [...] Both of these are attributed to the internal representation of the nmiaction struct created during register_nmi_handler. The reason for this is that those structs are not defined in the init section whereas the rest of the code in nmi_selftest.c is. To resolve this, I created a new #define, register_nmi_handler_initonly, that tags the struct as __initdata to resolve the mismatch. This #define should only be used in rare situations where the register/unregister is called during init of the kernel. Big thanks to Jan Beulich for decoding this for me as I didn't have a clue what was going on. 
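[editor's note: an illustrative sketch, not part of the patch; the handler and name are hypothetical. It shows the pairing the new macro is meant for -- registration and unregistration both confined to init code, since the backing struct nmiaction is __initdata.]

#include <linux/init.h>
#include <asm/nmi.h>

static int __init demo_nmi_handler(unsigned int val, struct pt_regs *regs)
{
	return NMI_HANDLED;	/* claim the NMI */
}

static void __init demo_nmi_selfcheck(void)
{
	register_nmi_handler_initonly(NMI_LOCAL, demo_nmi_handler,
				      NMI_FLAG_FIRST, "demo_nmi");
	/* ... raise and observe the NMI ... */
	unregister_nmi_handler(NMI_LOCAL, "demo_nmi");
}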
Reported-by: Witold Baryluk Tested-by: Witold Baryluk Cc: Jan Beulich Signed-off-by: Don Zickus Link: http://lkml.kernel.org/r/1338991542-23000-1-git-send-email-dzickus@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/nmi.h | 14 ++++++++++++++ arch/x86/kernel/nmi_selftest.c | 4 ++-- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index 0e3793b821ef..dc580c42851c 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h @@ -54,6 +54,20 @@ struct nmiaction { __register_nmi_handler((t), &fn##_na); \ }) +/* + * For special handlers that register/unregister in the + * init section only. This should be considered rare. + */ +#define register_nmi_handler_initonly(t, fn, fg, n) \ +({ \ + static struct nmiaction fn##_na __initdata = { \ + .handler = (fn), \ + .name = (n), \ + .flags = (fg), \ + }; \ + __register_nmi_handler((t), &fn##_na); \ +}) + int __register_nmi_handler(unsigned int, struct nmiaction *); void unregister_nmi_handler(unsigned int, const char *); diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c index e31bf8d5c4d2..149b8d9c6ad4 100644 --- a/arch/x86/kernel/nmi_selftest.c +++ b/arch/x86/kernel/nmi_selftest.c @@ -42,7 +42,7 @@ static int __init nmi_unk_cb(unsigned int val, struct pt_regs *regs) static void __init init_nmi_testsuite(void) { /* trap all the unknown NMIs we may generate */ - register_nmi_handler(NMI_UNKNOWN, nmi_unk_cb, 0, "nmi_selftest_unk"); + register_nmi_handler_initonly(NMI_UNKNOWN, nmi_unk_cb, 0, "nmi_selftest_unk"); } static void __init cleanup_nmi_testsuite(void) @@ -64,7 +64,7 @@ static void __init test_nmi_ipi(struct cpumask *mask) { unsigned long timeout; - if (register_nmi_handler(NMI_LOCAL, test_nmi_ipi_callback, + if (register_nmi_handler_initonly(NMI_LOCAL, test_nmi_ipi_callback, NMI_FLAG_FIRST, "nmi_selftest")) { nmi_fail = FAILURE; return; From 32ba9c3fcab960f0b0d332c86ebcd2c4870d9bb8 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 8 Jun 2012 10:34:03 -0700 Subject: [PATCH 173/178] Revert "vfs: stop d_splice_alias creating directory aliases" This reverts commit 7732a557b1342c6e6966efb5f07effcf99f56167 (and commit 3f50fff4dace23d3cfeb195d5cd4ee813cee68b7, which was a follow-up cleanup). We're chasing an elusive bug that Dave Jones can apparently reproduce using his system call fuzzer tool, and that looks like some kind of locking ordering problem on the directory i_mutex chain. Our i_mutex locking is rather complex, and depends on the topological ordering of the directories, which is why we have been very wary of splicing directory entries around. Of course, we really don't want to ever see aliased unconnected directories anyway, so none of this should ever happen, but this revert aims to basically get us back to a known older state. Bruce points to some of the previous discussion at http://marc.info/?i=<20110310105821.GE22723@ZenIV.linux.org.uk> and in particular a long post from Neil: http://marc.info/?i=<20110311150749.2fa2be66@notabene.brown> It should be noted that it's possible that Dave's problems come from other changes altogether, including possibly just the fact that Dave constantly is teaching his fuzzer new tricks. So what appears to be a new bug could in fact be an old one that just gets newly triggered, but reverting these patches as "still under heavy discussion" is the right thing regardless. Requested-by: Al Viro Acked-by: J.
Bruce Fields Signed-off-by: Linus Torvalds --- fs/dcache.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 85c9e2bff8e6..40469044088d 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -683,6 +683,8 @@ EXPORT_SYMBOL(dget_parent); /** * d_find_alias - grab a hashed alias of inode * @inode: inode in question + * @want_discon: flag, used by d_splice_alias, to request + * that only a DISCONNECTED alias be returned. * * If inode has a hashed alias, or is a directory and has any alias, * acquire the reference to alias and return it. Otherwise return NULL. * of a filesystem. * * If the inode has an IS_ROOT, DCACHE_DISCONNECTED alias, then prefer - * any other hashed alias over that. + * any other hashed alias over that one unless @want_discon is set, + * in which case only return an IS_ROOT, DCACHE_DISCONNECTED alias. */ -static struct dentry *__d_find_alias(struct inode *inode) +static struct dentry *__d_find_alias(struct inode *inode, int want_discon) { struct dentry *alias, *discon_alias; @@ -705,7 +708,7 @@ again: if (IS_ROOT(alias) && (alias->d_flags & DCACHE_DISCONNECTED)) { discon_alias = alias; - } else { + } else if (!want_discon) { __dget_dlock(alias); spin_unlock(&alias->d_lock); return alias; @@ -736,7 +739,7 @@ struct dentry *d_find_alias(struct inode *inode) if (!list_empty(&inode->i_dentry)) { spin_lock(&inode->i_lock); - de = __d_find_alias(inode); + de = __d_find_alias(inode, 0); spin_unlock(&inode->i_lock); } return de; @@ -1647,8 +1650,9 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) if (inode && S_ISDIR(inode->i_mode)) { spin_lock(&inode->i_lock); - new = __d_find_any_alias(inode); + new = __d_find_alias(inode, 1); if (new) { + BUG_ON(!(new->d_flags & DCACHE_DISCONNECTED)); spin_unlock(&inode->i_lock); security_d_instantiate(new, inode); d_move(new, dentry); @@ -2478,7 +2482,7 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode) struct dentry *alias; /* Does an aliased dentry already exist? */ - alias = __d_find_alias(inode); + alias = __d_find_alias(inode, 0); if (alias) { actual = alias; write_seqlock(&rename_lock); From 8f53369b753f5f4c7684c2eb0b592152abb1dd00 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 8 Jun 2012 14:53:06 -0700 Subject: [PATCH 174/178] Revert "drm/i915/crt: Do not rely upon the HPD presence pin" This reverts commit 9e612a008fa7fe493a473454def56aa321479495. It incorrectly finds VGA connectors where none are attached, apparently not noticing that nothing replied to the EDID queries, and happily using the default EDID modes that have nothing to do with actual hardware. That in turn then causes X to fall down to the lowest common denominator, which is usually the default 1024x768 mode that is in the default EDID (and pretty much anything supports). I'd suggest that if not relying on the HPD pin, the code should at least check whether it gets valid EDID data back, rather than just assume there's something on the VGA connector.
Cc: Dave Airlie Cc: Chris Wilson Cc: Daniel Vetter Signed-off-by: Linus Torvalds --- drivers/gpu/drm/i915/intel_crt.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_crt.c b/drivers/gpu/drm/i915/intel_crt.c index 844e93e1e395..75a70c46ef1b 100644 --- a/drivers/gpu/drm/i915/intel_crt.c +++ b/drivers/gpu/drm/i915/intel_crt.c @@ -453,15 +453,13 @@ intel_crt_detect(struct drm_connector *connector, bool force) struct intel_load_detect_pipe tmp; if (I915_HAS_HOTPLUG(dev)) { - /* We can not rely on the HPD pin always being correctly wired - * up, for example many KVM do not pass it through, and so - * only trust an assertion that the monitor is connected. - */ if (intel_crt_detect_hotplug(connector)) { DRM_DEBUG_KMS("CRT detected via hotplug\n"); return connector_status_connected; - } else + } else { DRM_DEBUG_KMS("CRT not detected via hotplug\n"); + return connector_status_disconnected; + } } if (intel_crt_detect_ddc(connector)) From cd96891d48a945ca2011fbeceda73813d6286195 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 8 Jun 2012 13:18:33 -0700 Subject: [PATCH 175/178] sched/fair: fix lots of kernel-doc warnings Fix lots of new kernel-doc warnings in kernel/sched/fair.c: Warning(kernel/sched/fair.c:3625): No description found for parameter 'env' Warning(kernel/sched/fair.c:3625): Excess function parameter 'sd' description in 'update_sg_lb_stats' Warning(kernel/sched/fair.c:3735): No description found for parameter 'env' Warning(kernel/sched/fair.c:3735): Excess function parameter 'sd' description in 'update_sd_pick_busiest' Warning(kernel/sched/fair.c:3735): Excess function parameter 'this_cpu' description in 'update_sd_pick_busiest' .. more warnings Signed-off-by: Randy Dunlap Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Linus Torvalds --- kernel/sched/fair.c | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b2a2d236f27b..d5583f9588e7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3632,7 +3632,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) /** * update_sg_lb_stats - Update sched_group's statistics for load balancing. - * @sd: The sched_domain whose statistics are to be updated. + * @env: The load balancing environment. * @group: sched_group whose statistics are to be updated. * @load_idx: Load index of sched_domain of this_cpu for load calc. * @local_group: Does group contain this_cpu. @@ -3741,11 +3741,10 @@ static inline void update_sg_lb_stats(struct lb_env *env, /** * update_sd_pick_busiest - return 1 on busiest group - * @sd: sched_domain whose statistics are to be checked + * @env: The load balancing environment. * @sds: sched_domain statistics * @sg: sched_group candidate to be checked for being the busiest * @sgs: sched_group statistics - * @this_cpu: the current cpu * * Determine if @sg is a busier group than the previously selected * busiest group. @@ -3783,9 +3782,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, /** * update_sd_lb_stats - Update sched_domain's statistics for load balancing. - * @sd: sched_domain whose statistics are to be updated. - * @this_cpu: Cpu for which load balance is currently performed. - * @idle: Idle status of this_cpu + * @env: The load balancing environment. * @cpus: Set of cpus considered for load balancing. * @balance: Should we balance. * @sds: variable to hold the statistics for this sched_domain. 
@@ -3874,10 +3871,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, * Returns 1 when packing is required and a task should be moved to * this CPU. The amount of the imbalance is returned in *imbalance. * - * @sd: The sched_domain whose packing is to be checked. + * @env: The load balancing environment. * @sds: Statistics of the sched_domain which is to be packed - * @this_cpu: The cpu at whose sched_domain we're performing load-balance. - * @imbalance: returns amount of imbalanced due to packing. */ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) { @@ -3903,9 +3898,8 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) * fix_small_imbalance - Calculate the minor imbalance that exists * amongst the groups of a sched_domain, during * load balancing. + * @env: The load balancing environment. * @sds: Statistics of the sched_domain whose imbalance is to be calculated. - * @this_cpu: The cpu at whose sched_domain we're performing load-balance. - * @imbalance: Variable to store the imbalance. */ static inline void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) @@ -4048,11 +4042,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * Also calculates the amount of weighted load which should be moved * to restore balance. * - * @sd: The sched_domain whose busiest group is to be returned. - * @this_cpu: The cpu for which load balancing is currently being performed. - * @imbalance: Variable which stores amount of weighted load which should - * be moved to restore balance/put a group to idle. - * @idle: The idle status of this_cpu. + * @env: The load balancing environment. * @cpus: The set of CPUs under consideration for load-balancing. * @balance: Pointer to a variable indicating if this_cpu * is the appropriate cpu to perform load balancing at this_level. From 1e11ad8dc42975d5c2bab7d478f6cd875602eda4 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Fri, 8 Jun 2012 13:21:26 -0700 Subject: [PATCH 176/178] mm, oom: fix badness score underflow If the privileges given to root threads (3% of allowable memory) or a negative value of /proc/pid/oom_score_adj happen to exceed the amount of rss of a thread, its badness score overflows as a result of commit a7f638f999ff ("mm, oom: normalize oom scores to oom_score_adj scale only for userspace"). Fix this by making the type signed and return 1, meaning the thread is still eligible for kill, if the value is negative. Reported-by: Dave Jones Acked-by: Oleg Nesterov Signed-off-by: David Rientjes Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index ed0e19677360..416637f0e924 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -183,7 +183,7 @@ static bool oom_unkillable_task(struct task_struct *p, unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, const nodemask_t *nodemask, unsigned long totalpages) { - unsigned long points; + long points; if (oom_unkillable_task(p, memcg, nodemask)) return 0; @@ -223,7 +223,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, * Never return 0 for an eligible task regardless of the root bonus and * oom_score_adj (oom_score_adj can't be OOM_SCORE_ADJ_MIN here). */ - return points ? points : 1; + return points > 0 ? 
points : 1; } /* From cfaf025112d3856637ff34a767ef785ef5cf2ca9 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 8 Jun 2012 18:40:09 -0700 Subject: [PATCH 177/178] Linux 3.5-rc2 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 0d718ede9ea5..d845c2a1aa68 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ VERSION = 3 PATCHLEVEL = 5 SUBLEVEL = 0 -EXTRAVERSION = -rc1 +EXTRAVERSION = -rc2 NAME = Saber-toothed Squirrel # *DOCUMENTATION* From f242e50eee1ec7692c4854d94e8cd543991cce71 Mon Sep 17 00:00:00 2001 From: Ola Lilja Date: Thu, 7 Jun 2012 14:00:46 +0200 Subject: [PATCH 178/178] mfd/ab8500: Move platform-data for ab8500-codec into mfd-driver The platform-data used by the Ux500 ASoC-driver is moved from the machine-driver context into the codec-driver context. This means adding the platform-data for 'ab8500-codec' into the main AB8500 platform-data. Signed-off-by: Ola Lilja Signed-off-by: Mark Brown --- arch/arm/mach-ux500/board-mop500.c | 14 +++++++ include/linux/mfd/abx500/ab8500-codec.h | 52 +++++++++++++++++++++++++ include/linux/mfd/abx500/ab8500.h | 2 + 3 files changed, 68 insertions(+) create mode 100644 include/linux/mfd/abx500/ab8500-codec.h diff --git a/arch/arm/mach-ux500/board-mop500.c b/arch/arm/mach-ux500/board-mop500.c index 9c74ac545849..c8a8fde777bb 100644 --- a/arch/arm/mach-ux500/board-mop500.c +++ b/arch/arm/mach-ux500/board-mop500.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -97,6 +98,18 @@ static struct ab8500_gpio_platform_data ab8500_gpio_pdata = { 0x7A, 0x00, 0x00}, }; +/* ab8500-codec */ +static struct ab8500_codec_platform_data ab8500_codec_pdata = { + .amics = { + .mic1_type = AMIC_TYPE_DIFFERENTIAL, + .mic2_type = AMIC_TYPE_DIFFERENTIAL, + .mic1a_micbias = AMIC_MICBIAS_VAMIC1, + .mic1b_micbias = AMIC_MICBIAS_VAMIC1, + .mic2_micbias = AMIC_MICBIAS_VAMIC2 + }, + .ear_cmv = EAR_CMV_0_95V +}; + static struct gpio_keys_button snowball_key_array[] = { { .gpio = 32, @@ -195,6 +208,7 @@ static struct ab8500_platform_data ab8500_platdata = { .regulator = ab8500_regulators, .num_regulator = ARRAY_SIZE(ab8500_regulators), .gpio = &ab8500_gpio_pdata, + .codec = &ab8500_codec_pdata, }; static struct resource ab8500_resources[] = { diff --git a/include/linux/mfd/abx500/ab8500-codec.h b/include/linux/mfd/abx500/ab8500-codec.h new file mode 100644 index 000000000000..dc6529202cdd --- /dev/null +++ b/include/linux/mfd/abx500/ab8500-codec.h @@ -0,0 +1,52 @@ +/* + * Copyright (C) ST-Ericsson SA 2012 + * + * Author: Ola Lilja + * for ST-Ericsson. + * + * License terms: + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. 
+ */ + +#ifndef AB8500_CORE_CODEC_H +#define AB8500_CORE_CODEC_H + +/* Mic-types */ +enum amic_type { + AMIC_TYPE_SINGLE_ENDED, + AMIC_TYPE_DIFFERENTIAL +}; + +/* Mic-biases */ +enum amic_micbias { + AMIC_MICBIAS_VAMIC1, + AMIC_MICBIAS_VAMIC2 +}; + +/* Bias-voltage */ +enum ear_cm_voltage { + EAR_CMV_0_95V, + EAR_CMV_1_10V, + EAR_CMV_1_27V, + EAR_CMV_1_58V +}; + +/* Analog microphone settings */ +struct amic_settings { + enum amic_type mic1_type; + enum amic_type mic2_type; + enum amic_micbias mic1a_micbias; + enum amic_micbias mic1b_micbias; + enum amic_micbias mic2_micbias; +}; + +/* Platform data structure for the audio-parts of the AB8500 */ +struct ab8500_codec_platform_data { + struct amic_settings amics; + enum ear_cm_voltage ear_cmv; +}; + +#endif diff --git a/include/linux/mfd/abx500/ab8500.h b/include/linux/mfd/abx500/ab8500.h index 91dd3ef63e99..bc9b84b60ec6 100644 --- a/include/linux/mfd/abx500/ab8500.h +++ b/include/linux/mfd/abx500/ab8500.h @@ -266,6 +266,7 @@ struct ab8500 { struct regulator_reg_init; struct regulator_init_data; struct ab8500_gpio_platform_data; +struct ab8500_codec_platform_data; /** * struct ab8500_platform_data - AB8500 platform data @@ -284,6 +285,7 @@ struct ab8500_platform_data { int num_regulator; struct regulator_init_data *regulator; struct ab8500_gpio_platform_data *gpio; + struct ab8500_codec_platform_data *codec; }; extern int __devinit ab8500_init(struct ab8500 *ab8500,