From bb93d802ae5c1949977cc6da247b218240677f11 Mon Sep 17 00:00:00 2001 From: David Brownell Date: Sat, 18 Oct 2008 20:27:47 -0700 Subject: [PATCH 01/15] rtc-cmos: export second NVRAM bank Teach rtc-cmos about the second bank of registers found on most modern x86 systems, giving access to 128 bytes more NVRAM. This version only sees that extra NVRAM when both register banks are provided as part of *one* PNP resource. Since BIOS on some systems presents them using two IO resources, and nothing merges them, this can't always show all the NVRAM. (We're supposed to be able to use PNP id PNP0b01 too, but BIOS tables don't often seem to use that particular option.) Signed-off-by: David Brownell Cc: torvalds@linux-foundation.org Cc: Ingo Molnar Cc: Bjorn Helgaas Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- drivers/rtc/rtc-cmos.c | 70 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 63 insertions(+), 7 deletions(-) diff --git a/drivers/rtc/rtc-cmos.c b/drivers/rtc/rtc-cmos.c index 963ad0b6a4e9..f1695d7fa0fa 100644 --- a/drivers/rtc/rtc-cmos.c +++ b/drivers/rtc/rtc-cmos.c @@ -143,6 +143,43 @@ static inline int hpet_unregister_irq_handler(irq_handler_t handler) /*----------------------------------------------------------------*/ +#ifdef RTC_PORT + +/* Most newer x86 systems have two register banks, the first used + * for RTC and NVRAM and the second only for NVRAM. Caller must + * own rtc_lock ... and we won't worry about access during NMI. + */ +#define can_bank2 true + +static inline unsigned char cmos_read_bank2(unsigned char addr) +{ + outb(addr, RTC_PORT(2)); + return inb(RTC_PORT(3)); +} + +static inline void cmos_write_bank2(unsigned char val, unsigned char addr) +{ + outb(addr, RTC_PORT(2)); + outb(val, RTC_PORT(3)); +} + +#else + +#define can_bank2 false + +static inline unsigned char cmos_read_bank2(unsigned char addr) +{ + return 0; +} + +static inline void cmos_write_bank2(unsigned char val, unsigned char addr) +{ +} + +#endif + +/*----------------------------------------------------------------*/ + static int cmos_read_time(struct device *dev, struct rtc_time *t) { /* REVISIT: if the clock has a "century" register, use @@ -491,12 +528,21 @@ cmos_nvram_read(struct kobject *kobj, struct bin_attribute *attr, if (unlikely(off >= attr->size)) return 0; + if (unlikely(off < 0)) + return -EINVAL; if ((off + count) > attr->size) count = attr->size - off; + off += NVRAM_OFFSET; spin_lock_irq(&rtc_lock); - for (retval = 0, off += NVRAM_OFFSET; count--; retval++, off++) - *buf++ = CMOS_READ(off); + for (retval = 0; count; count--, off++, retval++) { + if (off < 128) + *buf++ = CMOS_READ(off); + else if (can_bank2) + *buf++ = cmos_read_bank2(off); + else + break; + } spin_unlock_irq(&rtc_lock); return retval; @@ -512,6 +558,8 @@ cmos_nvram_write(struct kobject *kobj, struct bin_attribute *attr, cmos = dev_get_drvdata(container_of(kobj, struct device, kobj)); if (unlikely(off >= attr->size)) return -EFBIG; + if (unlikely(off < 0)) + return -EINVAL; if ((off + count) > attr->size) count = attr->size - off; @@ -520,15 +568,20 @@ cmos_nvram_write(struct kobject *kobj, struct bin_attribute *attr, * here. If userspace is smart enough to know what fields of * NVRAM to update, updating checksums is also part of its job.
*/ + off += NVRAM_OFFSET; spin_lock_irq(&rtc_lock); - for (retval = 0, off += NVRAM_OFFSET; count--; retval++, off++) { + for (retval = 0; count; count--, off++, retval++) { /* don't trash RTC registers */ if (off == cmos->day_alrm || off == cmos->mon_alrm || off == cmos->century) buf++; - else + else if (off < 128) CMOS_WRITE(*buf++, off); + else if (can_bank2) + cmos_write_bank2(*buf++, off); + else + break; } spin_unlock_irq(&rtc_lock); @@ -631,8 +684,8 @@ cmos_do_probe(struct device *dev, struct resource *ports, int rtc_irq) /* Heuristic to deduce NVRAM size ... do what the legacy NVRAM * driver did, but don't reject unknown configs. Old hardware - * won't address 128 bytes, and for now we ignore the way newer - * chips can address 256 bytes (using two more i/o ports). + * won't address 128 bytes. Newer chips have multiple banks, + * though they may not be listed in one I/O resource. */ #if defined(CONFIG_ATARI) address_space = 64; @@ -642,6 +695,8 @@ cmos_do_probe(struct device *dev, struct resource *ports, int rtc_irq) #warning Assuming 128 bytes of RTC+NVRAM address space, not 64 bytes. address_space = 128; #endif + if (can_bank2 && ports->end > (ports->start + 1)) + address_space = 256; /* For ACPI systems extension info comes from the FADT. On others, * board specific setup provides it as appropriate. Systems where @@ -740,7 +795,7 @@ cmos_do_probe(struct device *dev, struct resource *ports, int rtc_irq) goto cleanup2; } - pr_info("%s: alarms up to one %s%s%s\n", + pr_info("%s: alarms up to one %s%s, %zd bytes nvram%s\n", cmos_rtc.rtc->dev.bus_id, is_valid_irq(rtc_irq) ? (cmos_rtc.mon_alrm ? "year" : (cmos_rtc.day_alrm ? "month" : "day")) : "no", cmos_rtc.century ? ", y3k" : "", + nvram.size, is_hpet_enabled() ? ", hpet irqs" : ""); return 0; From e45f2c07742d447597df001c878bc4a8aafcde37 Mon Sep 17 00:00:00 2001 From: "Denis V. Lunev" Date: Mon, 24 Nov 2008 11:28:36 +0300 Subject: [PATCH 02/15] x86: correct link to HPET timer specification Impact: update documentation / help text Original link is dead. Signed-off-by: Denis V. Lunev Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 +- drivers/char/hpet.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ac22bb7719f7..19f0d97829ee 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -482,7 +482,7 @@ config HPET_TIMER The HPET provides a stable time base on SMP systems, unlike the TSC, but it is more expensive to access, as it is off-chip. You can find the HPET spec at - <http://www.intel.com/hardwaredesign/hpetspec.htm>. + <http://www.intel.com/hardwaredesign/hpetspec_1.pdf>. You can safely choose Y here. However, HPET will only be activated if the platform and the BIOS support this feature. diff --git a/drivers/char/hpet.c b/drivers/char/hpet.c index 53fdc7ff3870..32b8bbf5003e 100644 --- a/drivers/char/hpet.c +++ b/drivers/char/hpet.c @@ -46,7 +46,7 @@ /* * The High Precision Event Timer driver. * This driver is closely modelled after the rtc.c driver. - * http://www.intel.com/hardwaredesign/hpetspec.htm + * http://www.intel.com/hardwaredesign/hpetspec_1.pdf */ #define HPET_USER_FREQ (64) #define HPET_DRIFT (500) From ca109491f612aab5c8152207631c0444f63da97f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 25 Nov 2008 12:43:51 +0100 Subject: [PATCH 03/15] hrtimer: removing all ur callback modes Impact: cleanup, move all hrtimer processing into hardirq context This is an attempt at removing some of the hrtimer complexity by reducing the number of callback modes to 1.
This means that all hrtimer callback functions will be run from HARD-irq context. I went through all 30-odd hrtimer callback functions in the kernel and saw only one that I'm not quite sure of, which is the one in net/can/bcm.c - hence I'm CC-ing the folks responsible for that code. Furthermore, the hrtimer core now calls callbacks directly with IRQs disabled in case you try to enqueue an expired timer. If this timer is a periodic timer (which should use hrtimer_forward() to advance its time) then it might be possible to end up in an infinite recursive loop due to the fact that hrtimer_forward() doesn't round up to the next timer granularity, and therefore keeps on calling the callback - obviously this needs a fix. Aside from that, this seems to compile and actually boot on my dual-core test box - although I'm sure there are some bugs in it; me not hitting any makes me certain :-) Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- drivers/input/touchscreen/ads7846.c | 4 +- include/linux/hrtimer.h | 34 +--- include/linux/interrupt.h | 3 - kernel/hrtimer.c | 280 +++------------------------- kernel/sched.c | 2 - kernel/time/ntp.c | 4 +- kernel/time/tick-sched.c | 1 - kernel/trace/trace_sysprof.c | 1 - sound/drivers/pcsp/pcsp.c | 1 - 9 files changed, 37 insertions(+), 293 deletions(-) diff --git a/drivers/input/touchscreen/ads7846.c b/drivers/input/touchscreen/ads7846.c index b9b7fc6ff1eb..e1ece89fe922 100644 --- a/drivers/input/touchscreen/ads7846.c +++ b/drivers/input/touchscreen/ads7846.c @@ -697,7 +697,7 @@ static enum hrtimer_restart ads7846_timer(struct hrtimer *handle) struct ads7846 *ts = container_of(handle, struct ads7846, timer); int status = 0; - spin_lock_irq(&ts->lock); + spin_lock(&ts->lock); if (unlikely(!get_pendown_state(ts) || device_suspended(&ts->spi->dev))) { @@ -728,7 +728,7 @@ static enum hrtimer_restart ads7846_timer(struct hrtimer *handle) dev_err(&ts->spi->dev, "spi_async --> %d\n", status); } - spin_unlock_irq(&ts->lock); + spin_unlock(&ts->lock); return HRTIMER_NORESTART; } diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 3eba43878dcb..bd37078c2d7d 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -42,26 +42,6 @@ enum hrtimer_restart { HRTIMER_RESTART, /* Timer must be restarted */ }; -/* - * hrtimer callback modes: - * - * HRTIMER_CB_SOFTIRQ: Callback must run in softirq context - * HRTIMER_CB_IRQSAFE_PERCPU: Callback must run in hardirq context - * Special mode for tick emulation and - * scheduler timer. Such timers are per - * cpu and not allowed to be migrated on - * cpu unplug.
- * HRTIMER_CB_IRQSAFE_UNLOCKED: Callback should run in hardirq context - * with timer->base lock unlocked - * used for timers which call wakeup to - * avoid lock order problems with rq->lock - */ -enum hrtimer_cb_mode { - HRTIMER_CB_SOFTIRQ, - HRTIMER_CB_IRQSAFE_PERCPU, - HRTIMER_CB_IRQSAFE_UNLOCKED, -}; - /* * Values to track state of the timer * @@ -70,7 +50,6 @@ enum hrtimer_cb_mode { * 0x00 inactive * 0x01 enqueued into rbtree * 0x02 callback function running - * 0x04 callback pending (high resolution mode) * * Special cases: * 0x03 callback function running and enqueued @@ -92,8 +71,7 @@ enum hrtimer_cb_mode { #define HRTIMER_STATE_INACTIVE 0x00 #define HRTIMER_STATE_ENQUEUED 0x01 #define HRTIMER_STATE_CALLBACK 0x02 -#define HRTIMER_STATE_PENDING 0x04 -#define HRTIMER_STATE_MIGRATE 0x08 +#define HRTIMER_STATE_MIGRATE 0x04 /** * struct hrtimer - the basic hrtimer structure @@ -109,8 +87,6 @@ enum hrtimer_cb_mode { * @function: timer expiry callback function * @base: pointer to the timer base (per cpu and per clock) * @state: state information (See bit values above) - * @cb_mode: high resolution timer feature to select the callback execution - * mode * @cb_entry: list head to enqueue an expired timer into the callback list * @start_site: timer statistics field to store the site where the timer * was started @@ -129,7 +105,6 @@ struct hrtimer { struct hrtimer_clock_base *base; unsigned long state; struct list_head cb_entry; - enum hrtimer_cb_mode cb_mode; #ifdef CONFIG_TIMER_STATS int start_pid; void *start_site; @@ -188,15 +163,11 @@ struct hrtimer_clock_base { * @check_clocks: Indictator, when set evaluate time source and clock * event devices whether high resolution mode can be * activated. - * @cb_pending: Expired timers are moved from the rbtree to this - * list in the timer interrupt. The list is processed - * in the softirq. 
* @nr_events: Total number of timer interrupt events */ struct hrtimer_cpu_base { spinlock_t lock; struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES]; - struct list_head cb_pending; #ifdef CONFIG_HIGH_RES_TIMERS ktime_t expires_next; int hres_active; @@ -404,8 +375,7 @@ static inline int hrtimer_active(const struct hrtimer *timer) */ static inline int hrtimer_is_queued(struct hrtimer *timer) { - return timer->state & - (HRTIMER_STATE_ENQUEUED | HRTIMER_STATE_PENDING); + return timer->state & HRTIMER_STATE_ENQUEUED; } /* diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index f58a0cf8929a..d6210a97a8ca 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -251,9 +251,6 @@ enum BLOCK_SOFTIRQ, TASKLET_SOFTIRQ, SCHED_SOFTIRQ, -#ifdef CONFIG_HIGH_RES_TIMERS - HRTIMER_SOFTIRQ, -#endif RCU_SOFTIRQ, /* Preferable RCU should always be the last softirq */ NR_SOFTIRQS diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 47e63349d1b2..efd6f41e1c16 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -442,22 +442,6 @@ static inline void debug_hrtimer_activate(struct hrtimer *timer) { } static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { } #endif -/* - * Check, whether the timer is on the callback pending list - */ -static inline int hrtimer_cb_pending(const struct hrtimer *timer) -{ - return timer->state & HRTIMER_STATE_PENDING; -} - -/* - * Remove a timer from the callback pending list - */ -static inline void hrtimer_remove_cb_pending(struct hrtimer *timer) -{ - list_del_init(&timer->cb_entry); -} - /* High resolution timer related functions */ #ifdef CONFIG_HIGH_RES_TIMERS @@ -651,6 +635,8 @@ static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { } +static void __run_hrtimer(struct hrtimer *timer); + /* * When High resolution timers are active, try to reprogram. Note, that in case * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry @@ -661,31 +647,14 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, struct hrtimer_clock_base *base) { if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) { - - /* Timer is expired, act upon the callback mode */ - switch(timer->cb_mode) { - case HRTIMER_CB_IRQSAFE_PERCPU: - case HRTIMER_CB_IRQSAFE_UNLOCKED: - /* - * This is solely for the sched tick emulation with - * dynamic tick support to ensure that we do not - * restart the tick right on the edge and end up with - * the tick timer in the softirq ! The calling site - * takes care of this. Also used for hrtimer sleeper ! - */ - debug_hrtimer_deactivate(timer); - return 1; - case HRTIMER_CB_SOFTIRQ: - /* - * Move everything else into the softirq pending list ! - */ - list_add_tail(&timer->cb_entry, - &base->cpu_base->cb_pending); - timer->state = HRTIMER_STATE_PENDING; - return 1; - default: - BUG(); - } + /* + * XXX: recursion check? 
+ * hrtimer_forward() should round up with timer granularity + * so that we never get into inf recursion here, + * it doesn't do that though + */ + __run_hrtimer(timer); + return 1; } return 0; } @@ -724,11 +693,6 @@ static int hrtimer_switch_to_hres(void) return 1; } -static inline void hrtimer_raise_softirq(void) -{ - raise_softirq(HRTIMER_SOFTIRQ); -} - #else static inline int hrtimer_hres_active(void) { return 0; } @@ -747,7 +711,6 @@ static inline int hrtimer_reprogram(struct hrtimer *timer, { return 0; } -static inline void hrtimer_raise_softirq(void) { } #endif /* CONFIG_HIGH_RES_TIMERS */ @@ -890,10 +853,7 @@ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, unsigned long newstate, int reprogram) { - /* High res. callback list. NOP for !HIGHRES */ - if (hrtimer_cb_pending(timer)) - hrtimer_remove_cb_pending(timer); - else { + if (timer->state & HRTIMER_STATE_ENQUEUED) { /* * Remove the timer from the rbtree and replace the * first entry pointer if necessary. @@ -953,7 +913,7 @@ hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_n { struct hrtimer_clock_base *base, *new_base; unsigned long flags; - int ret, raise; + int ret; base = lock_hrtimer_base(timer, &flags); @@ -988,26 +948,8 @@ hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_n enqueue_hrtimer(timer, new_base, new_base->cpu_base == &__get_cpu_var(hrtimer_bases)); - /* - * The timer may be expired and moved to the cb_pending - * list. We can not raise the softirq with base lock held due - * to a possible deadlock with runqueue lock. - */ - raise = timer->state == HRTIMER_STATE_PENDING; - - /* - * We use preempt_disable to prevent this task from migrating after - * setting up the softirq and raising it. Otherwise, if me migrate - * we will raise the softirq on the wrong CPU. - */ - preempt_disable(); - unlock_hrtimer_base(timer, &flags); - if (raise) - hrtimer_raise_softirq(); - preempt_enable(); - return ret; } EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); @@ -1192,75 +1134,6 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) } EXPORT_SYMBOL_GPL(hrtimer_get_res); -static void run_hrtimer_pending(struct hrtimer_cpu_base *cpu_base) -{ - spin_lock_irq(&cpu_base->lock); - - while (!list_empty(&cpu_base->cb_pending)) { - enum hrtimer_restart (*fn)(struct hrtimer *); - struct hrtimer *timer; - int restart; - int emulate_hardirq_ctx = 0; - - timer = list_entry(cpu_base->cb_pending.next, - struct hrtimer, cb_entry); - - debug_hrtimer_deactivate(timer); - timer_stats_account_hrtimer(timer); - - fn = timer->function; - /* - * A timer might have been added to the cb_pending list - * when it was migrated during a cpu-offline operation. - * Emulate hardirq context for such timers. 
- */ - if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU || - timer->cb_mode == HRTIMER_CB_IRQSAFE_UNLOCKED) - emulate_hardirq_ctx = 1; - - __remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0); - spin_unlock_irq(&cpu_base->lock); - - if (unlikely(emulate_hardirq_ctx)) { - local_irq_disable(); - restart = fn(timer); - local_irq_enable(); - } else - restart = fn(timer); - - spin_lock_irq(&cpu_base->lock); - - timer->state &= ~HRTIMER_STATE_CALLBACK; - if (restart == HRTIMER_RESTART) { - BUG_ON(hrtimer_active(timer)); - /* - * Enqueue the timer, allow reprogramming of the event - * device - */ - enqueue_hrtimer(timer, timer->base, 1); - } else if (hrtimer_active(timer)) { - /* - * If the timer was rearmed on another CPU, reprogram - * the event device. - */ - struct hrtimer_clock_base *base = timer->base; - - if (base->first == &timer->node && - hrtimer_reprogram(timer, base)) { - /* - * Timer is expired. Thus move it from tree to - * pending list again. - */ - __remove_hrtimer(timer, base, - HRTIMER_STATE_PENDING, 0); - list_add_tail(&timer->cb_entry, - &base->cpu_base->cb_pending); - } - } - } - spin_unlock_irq(&cpu_base->lock); -} - static void __run_hrtimer(struct hrtimer *timer) { struct hrtimer_clock_base *base = timer->base; @@ -1268,25 +1141,21 @@ static void __run_hrtimer(struct hrtimer *timer) enum hrtimer_restart (*fn)(struct hrtimer *); int restart; + WARN_ON(!irqs_disabled()); + debug_hrtimer_deactivate(timer); __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0); timer_stats_account_hrtimer(timer); - fn = timer->function; - if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU || - timer->cb_mode == HRTIMER_CB_IRQSAFE_UNLOCKED) { - /* - * Used for scheduler timers, avoid lock inversion with - * rq->lock and tasklist_lock. - * - * These timers are required to deal with enqueue expiry - * themselves and are not allowed to migrate. - */ - spin_unlock(&cpu_base->lock); - restart = fn(timer); - spin_lock(&cpu_base->lock); - } else - restart = fn(timer); + + /* + * Because we run timers from hardirq context, there is no chance + * they get migrated to another cpu, therefore its safe to unlock + * the timer base. + */ + spin_unlock(&cpu_base->lock); + restart = fn(timer); + spin_lock(&cpu_base->lock); /* * Note: We clear the CALLBACK bit after enqueue_hrtimer to avoid @@ -1311,7 +1180,7 @@ void hrtimer_interrupt(struct clock_event_device *dev) struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); struct hrtimer_clock_base *base; ktime_t expires_next, now; - int i, raise = 0; + int i; BUG_ON(!cpu_base->hres_active); cpu_base->nr_events++; @@ -1360,16 +1229,6 @@ void hrtimer_interrupt(struct clock_event_device *dev) break; } - /* Move softirq callbacks to the pending list */ - if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) { - __remove_hrtimer(timer, base, - HRTIMER_STATE_PENDING, 0); - list_add_tail(&timer->cb_entry, - &base->cpu_base->cb_pending); - raise = 1; - continue; - } - __run_hrtimer(timer); } spin_unlock(&cpu_base->lock); @@ -1383,10 +1242,6 @@ void hrtimer_interrupt(struct clock_event_device *dev) if (tick_program_event(expires_next, 0)) goto retry; } - - /* Raise softirq ? 
*/ - if (raise) - raise_softirq(HRTIMER_SOFTIRQ); } /** @@ -1413,11 +1268,6 @@ void hrtimer_peek_ahead_timers(void) local_irq_restore(flags); } -static void run_hrtimer_softirq(struct softirq_action *h) -{ - run_hrtimer_pending(&__get_cpu_var(hrtimer_bases)); -} - #endif /* CONFIG_HIGH_RES_TIMERS */ /* @@ -1429,8 +1279,6 @@ static void run_hrtimer_softirq(struct softirq_action *h) */ void hrtimer_run_pending(void) { - struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); - if (hrtimer_hres_active()) return; @@ -1444,8 +1292,6 @@ void hrtimer_run_pending(void) */ if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) hrtimer_switch_to_hres(); - - run_hrtimer_pending(cpu_base); } /* @@ -1482,14 +1328,6 @@ void hrtimer_run_queues(void) hrtimer_get_expires_tv64(timer)) break; - if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) { - __remove_hrtimer(timer, base, - HRTIMER_STATE_PENDING, 0); - list_add_tail(&timer->cb_entry, - &base->cpu_base->cb_pending); - continue; - } - __run_hrtimer(timer); } spin_unlock(&cpu_base->lock); @@ -1516,9 +1354,6 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task) { sl->timer.function = hrtimer_wakeup; sl->task = task; -#ifdef CONFIG_HIGH_RES_TIMERS - sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED; -#endif } static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode) @@ -1655,36 +1490,22 @@ static void __cpuinit init_hrtimers_cpu(int cpu) for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) cpu_base->clock_base[i].cpu_base = cpu_base; - INIT_LIST_HEAD(&cpu_base->cb_pending); hrtimer_init_hres(cpu_base); } #ifdef CONFIG_HOTPLUG_CPU -static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base, - struct hrtimer_clock_base *new_base, int dcpu) +static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, + struct hrtimer_clock_base *new_base, int dcpu) { struct hrtimer *timer; struct rb_node *node; - int raise = 0; while ((node = rb_first(&old_base->active))) { timer = rb_entry(node, struct hrtimer, node); BUG_ON(hrtimer_callback_running(timer)); debug_hrtimer_deactivate(timer); - /* - * Should not happen. Per CPU timers should be - * canceled _before_ the migration code is called - */ - if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU) { - __remove_hrtimer(timer, old_base, - HRTIMER_STATE_INACTIVE, 0); - WARN(1, "hrtimer (%p %p)active but cpu %d dead\n", - timer, timer->function, dcpu); - continue; - } - /* * Mark it as STATE_MIGRATE not INACTIVE otherwise the * timer could be seen as !active and just vanish away @@ -1708,48 +1529,19 @@ static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base, * otherwise we end up with a stale timer. 
*/ if (timer->state == HRTIMER_STATE_MIGRATE) { - timer->state = HRTIMER_STATE_PENDING; - list_add_tail(&timer->cb_entry, - &new_base->cpu_base->cb_pending); - raise = 1; + /* XXX: running on offline cpu */ + __run_hrtimer(timer); } #endif /* Clear the migration state bit */ timer->state &= ~HRTIMER_STATE_MIGRATE; } - return raise; } -#ifdef CONFIG_HIGH_RES_TIMERS -static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base, - struct hrtimer_cpu_base *new_base) -{ - struct hrtimer *timer; - int raise = 0; - - while (!list_empty(&old_base->cb_pending)) { - timer = list_entry(old_base->cb_pending.next, - struct hrtimer, cb_entry); - - __remove_hrtimer(timer, timer->base, HRTIMER_STATE_PENDING, 0); - timer->base = &new_base->clock_base[timer->base->index]; - list_add_tail(&timer->cb_entry, &new_base->cb_pending); - raise = 1; - } - return raise; -} -#else -static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base, - struct hrtimer_cpu_base *new_base) -{ - return 0; -} -#endif - static void migrate_hrtimers(int cpu) { struct hrtimer_cpu_base *old_base, *new_base; - int i, raise = 0; + int i; BUG_ON(cpu_online(cpu)); old_base = &per_cpu(hrtimer_bases, cpu); @@ -1764,20 +1556,13 @@ static void migrate_hrtimers(int cpu) spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { - if (migrate_hrtimer_list(&old_base->clock_base[i], - &new_base->clock_base[i], cpu)) - raise = 1; + migrate_hrtimer_list(&old_base->clock_base[i], + &new_base->clock_base[i], cpu); } - if (migrate_hrtimer_pending(old_base, new_base)) - raise = 1; - spin_unlock(&old_base->lock); spin_unlock_irq(&new_base->lock); put_cpu_var(hrtimer_bases); - - if (raise) - hrtimer_raise_softirq(); } #endif /* CONFIG_HOTPLUG_CPU */ @@ -1817,9 +1602,6 @@ void __init hrtimers_init(void) hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, (void *)(long)smp_processor_id()); register_cpu_notifier(&hrtimers_nb); -#ifdef CONFIG_HIGH_RES_TIMERS - open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq); -#endif } /** diff --git a/kernel/sched.c b/kernel/sched.c index 9b1e79371c20..5ac5e9536168 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -203,7 +203,6 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); rt_b->rt_period_timer.function = sched_rt_period_timer; - rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED; } static inline int rt_bandwidth_enabled(void) @@ -1139,7 +1138,6 @@ static void init_rq_hrtick(struct rq *rq) hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); rq->hrtick_timer.function = hrtick; - rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU; } #else /* CONFIG_SCHED_HRTICK */ static inline void hrtick_clear(struct rq *rq) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 8ff15e5d486b..f5f793d92415 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -131,7 +131,7 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer) { enum hrtimer_restart res = HRTIMER_NORESTART; - write_seqlock_irq(&xtime_lock); + write_seqlock(&xtime_lock); switch (time_state) { case TIME_OK: @@ -164,7 +164,7 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer) } update_vsyscall(&xtime, clock); - write_sequnlock_irq(&xtime_lock); + write_sequnlock(&xtime_lock); return res; } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 342fc9ccab46..502a81e2639b 100644 --- a/kernel/time/tick-sched.c +++ 
b/kernel/time/tick-sched.c @@ -681,7 +681,6 @@ void tick_setup_sched_timer(void) */ hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); ts->sched_timer.function = tick_sched_timer; - ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU; /* Get the next period (per cpu) */ hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index 9587d3bcba55..ae542e2e38d5 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -202,7 +202,6 @@ static void start_stack_timer(int cpu) hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hrtimer->function = stack_trace_timer_fn; - hrtimer->cb_mode = HRTIMER_CB_IRQSAFE_PERCPU; hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL); } diff --git a/sound/drivers/pcsp/pcsp.c b/sound/drivers/pcsp/pcsp.c index 1899cf0685bc..8e52b2a8a13a 100644 --- a/sound/drivers/pcsp/pcsp.c +++ b/sound/drivers/pcsp/pcsp.c @@ -96,7 +96,6 @@ static int __devinit snd_card_pcsp_probe(int devnum, struct device *dev) return -EINVAL; hrtimer_init(&pcsp_chip.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - pcsp_chip.timer.cb_mode = HRTIMER_CB_SOFTIRQ; pcsp_chip.timer.function = pcsp_do_timer; card = snd_card_new(index, id, THIS_MODULE, 0); From 37810659ea7d9572c5ac284ade272f806ef8f788 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 4 Dec 2008 11:17:10 +0100 Subject: [PATCH 04/15] hrtimer: removing all ur callback modes, fix hotplug Impact: fix hrtimer locking (reported by lockdep) in the CPU hotplug case This addition fixes the hotplug locking issue on my machine Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/hrtimer.c | 65 +++++++++++++++++++++++++++--------------------- 1 file changed, 37 insertions(+), 28 deletions(-) diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index efd6f41e1c16..b09c7a27631d 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1496,7 +1496,7 @@ static void __cpuinit init_hrtimers_cpu(int cpu) #ifdef CONFIG_HOTPLUG_CPU static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, - struct hrtimer_clock_base *new_base, int dcpu) + struct hrtimer_clock_base *new_base) { struct hrtimer *timer; struct rb_node *node; @@ -1514,40 +1514,34 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0); timer->base = new_base; /* - * Enqueue the timer. Allow reprogramming of the event device + * Enqueue the timers on the new cpu, but do not reprogram + * the timer as that would enable a deadlock between + * hrtimer_enqueue_reprogramm() running the timer and us still + * holding a nested base lock. + * + * Instead we tickle the hrtimer interrupt after the migration + * is done, which will run all expired timers and re-programm + * the timer device. */ - enqueue_hrtimer(timer, new_base, 1); + enqueue_hrtimer(timer, new_base, 0); -#ifdef CONFIG_HIGH_RES_TIMERS - /* - * Happens with high res enabled when the timer was - * already expired and the callback mode is - * HRTIMER_CB_IRQSAFE_UNLOCKED (hrtimer_sleeper). The - * enqueue code does not move them to the soft irq - * pending list for performance/latency reasons, but - * in the migration state, we need to do that - * otherwise we end up with a stale timer. 
- */ - if (timer->state == HRTIMER_STATE_MIGRATE) { - /* XXX: running on offline cpu */ - __run_hrtimer(timer); - } -#endif /* Clear the migration state bit */ timer->state &= ~HRTIMER_STATE_MIGRATE; } } -static void migrate_hrtimers(int cpu) +static int migrate_hrtimers(int scpu) { struct hrtimer_cpu_base *old_base, *new_base; - int i; + int dcpu, i; - BUG_ON(cpu_online(cpu)); - old_base = &per_cpu(hrtimer_bases, cpu); + BUG_ON(cpu_online(scpu)); + old_base = &per_cpu(hrtimer_bases, scpu); new_base = &get_cpu_var(hrtimer_bases); - tick_cancel_sched_timer(cpu); + dcpu = smp_processor_id(); + + tick_cancel_sched_timer(scpu); /* * The caller is globally serialized and nobody else * takes two locks at once, deadlock is not possible. @@ -1557,32 +1551,47 @@ static void migrate_hrtimers(int cpu) for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { migrate_hrtimer_list(&old_base->clock_base[i], - &new_base->clock_base[i], cpu); + &new_base->clock_base[i]); } spin_unlock(&old_base->lock); spin_unlock_irq(&new_base->lock); put_cpu_var(hrtimer_bases); + + return dcpu; } + +static void tickle_timers(void *arg) +{ + hrtimer_peek_ahead_timers(); +} + #endif /* CONFIG_HOTPLUG_CPU */ static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { - unsigned int cpu = (long)hcpu; + int dcpu = -1, scpu = (long)hcpu; switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: - init_hrtimers_cpu(cpu); + init_hrtimers_cpu(scpu); break; #ifdef CONFIG_HOTPLUG_CPU case CPU_DEAD: case CPU_DEAD_FROZEN: - clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &cpu); - migrate_hrtimers(cpu); + clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &scpu); + dcpu = migrate_hrtimers(scpu); + break; + + case CPU_POST_DEAD: + if (dcpu == -1) + break; + + smp_call_function_single(dcpu, tickle_timers, NULL, 0); break; #endif From a0a99b227da57f81319dd239bc4de811b0f530ec Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 8 Dec 2008 17:13:02 +0100 Subject: [PATCH 05/15] hrtimer: removing all ur callback modes, fix > Ingo, this addition fixes the hotplug issue on my machine And because we're all human... Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/hrtimer.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index b09c7a27631d..b741f850426e 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1571,7 +1571,7 @@ static void tickle_timers(void *arg) static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { - int dcpu = -1, scpu = (long)hcpu; + int dcpu, scpu = (long)hcpu; switch (action) { @@ -1585,12 +1585,6 @@ static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self, case CPU_DEAD_FROZEN: clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &scpu); dcpu = migrate_hrtimers(scpu); - break; - - case CPU_POST_DEAD: - if (dcpu == -1) - break; - smp_call_function_single(dcpu, tickle_timers, NULL, 0); break; #endif From fa116ea35ec7f40e890972324409e99eed008d56 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 11 Dec 2008 17:04:11 +0100 Subject: [PATCH 06/15] nohz: no softirq pending warnings for offline cpus Impact: remove false positive warning After a cpu was taken down during cpu hotplug (read: disabled for interrupts) it still might have pending softirqs. However take_cpu_down makes sure that the idle task will run next instead of ksoftirqd on the taken down cpu. 
The idle task will call tick_nohz_stop_sched_tick which might warn about pending softirqs just before the cpu kills itself completely. However the pending softirqs on the dead cpu aren't a problem because they will be moved to an online cpu during CPU_DEAD handling. So make sure we warn only for online cpus. Signed-off-by: Heiko Carstens Signed-off-by: Ingo Molnar --- kernel/time/tick-sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 342fc9ccab46..dc17ffcf1919 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -247,7 +247,7 @@ void tick_nohz_stop_sched_tick(int inidle) if (need_resched()) goto end; - if (unlikely(local_softirq_pending())) { + if (unlikely(local_softirq_pending() && cpu_online(cpu))) { static int ratelimit; if (ratelimit < 10) { From 0a57b783018a77ca16097198844438bdff4d012e Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Mon, 1 Dec 2008 14:18:12 -0800 Subject: [PATCH 07/15] clocksource, acpi_pm.c: put acpi_pm_read_slow() under CONFIG_PCI acpi_pm_read_slow() is only used when CONFIG_PCI=y, so move the definition inside the ifdef. Otherwise this causes a "defined but not used" warning when building with CONFIG_ACPI=y and CONFIG_PCI=n (that's not supported yet, but it could be). Signed-off-by: Bjorn Helgaas Cc: mingo@elte.hu Cc: johnstul@us.ibm.com Cc: akpm@linux-foundation.org Cc: bjorn.helgaas@hp.com Cc: lenb@kernel.org Cc: linux@dominikbrodowski.net Cc: tglx@linutronix.de Signed-off-by: Thomas Gleixner Cc: Dominik Brodowski Cc: Thomas Gleixner Cc: Len Brown Signed-off-by: Andrew Morton --- drivers/clocksource/acpi_pm.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/clocksource/acpi_pm.c b/drivers/clocksource/acpi_pm.c index c20171078d1d..e1129fad96dd 100644 --- a/drivers/clocksource/acpi_pm.c +++ b/drivers/clocksource/acpi_pm.c @@ -57,11 +57,6 @@ u32 acpi_pm_read_verified(void) return v2; } -static cycle_t acpi_pm_read_slow(void) -{ - return (cycle_t)acpi_pm_read_verified(); -} - static cycle_t acpi_pm_read(void) { return (cycle_t)read_pmtmr(); @@ -88,6 +83,11 @@ static int __init acpi_pm_good_setup(char *__str) } __setup("acpi_pm_good", acpi_pm_good_setup); +static cycle_t acpi_pm_read_slow(void) +{ + return (cycle_t)acpi_pm_read_verified(); +} + static inline void acpi_pm_need_workaround(void) { clocksource_acpi_pm.read = acpi_pm_read_slow; From 001474491fabeca233168a8598f721c808040f90 Mon Sep 17 00:00:00 2001 From: "Woodruff, Richard" Date: Mon, 1 Dec 2008 14:18:11 -0800 Subject: [PATCH 08/15] nohz: suppress needless timer reprogramming In my device I get many interrupts from a high speed USB device in a very short period of time. The system spends a lot of time reprogramming the hardware timer which is in a slower timing domain as compared to the CPU. This results in the CPU spending a huge amount of time waiting for the timer posting to be done. All of this reprogramming is useless as the wake up time has not changed. As measured using ETM trace this drops my reprogramming penalty from almost 60% CPU load down to 15% during high interrupt rate. I can send traces to show this. Suppress setting of duplicate timer event when timer already stopped. Timer programming can be very costly and can result in long cpu stall/wait times. 
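The core of the change is a single early-out, condensed here from the tick_nohz_stop_sched_tick() hunk below: the expiry of the next timer-wheel timer is computed first, and if the tick is already stopped and the clock event device is already programmed for exactly that expiry, the costly reprogramming path is skipped entirely.

	/* expiry of the next timer wheel timer (condensed from the hunk below) */
	expires = ktime_add_ns(last_update, tick_period.tv64 * delta_jiffies);

	/* skip reprogramming the event device if the expiry hasn't changed */
	if (ts->tick_stopped && ktime_equal(expires, dev->next_event))
		goto out;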
[akpm@linux-foundation.org: coding-style fixes] [tglx@linutronix.de: move the check to the right place and avoid raising the softirq for nothing] Signed-off-by: Richard Woodruff Cc: johnstul@us.ibm.com Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- kernel/time/tick-sched.c | 41 +++++++++++++++++++++++----------------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index dc17ffcf1919..87fc34f21db2 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -282,8 +282,31 @@ void tick_nohz_stop_sched_tick(int inidle) /* Schedule the tick, if we are at least one jiffie off */ if ((long)delta_jiffies >= 1) { + /* + * calculate the expiry time for the next timer wheel + * timer + */ + expires = ktime_add_ns(last_update, tick_period.tv64 * + delta_jiffies); + + /* + * If this cpu is the one which updates jiffies, then + * give up the assignment and let it be taken by the + * cpu which runs the tick timer next, which might be + * this cpu as well. If we don't drop this here the + * jiffies might be stale and do_timer() never + * invoked. + */ + if (cpu == tick_do_timer_cpu) + tick_do_timer_cpu = TICK_DO_TIMER_NONE; + if (delta_jiffies > 1) cpu_set(cpu, nohz_cpu_mask); + + /* Skip reprogram of event if its not changed */ + if (ts->tick_stopped && ktime_equal(expires, dev->next_event)) + goto out; + /* * nohz_stop_sched_tick can be called several times before * the nohz_restart_sched_tick is called. This happens when @@ -306,17 +329,6 @@ void tick_nohz_stop_sched_tick(int inidle) rcu_enter_nohz(); } - /* - * If this cpu is the one which updates jiffies, then - * give up the assignment and let it be taken by the - * cpu which runs the tick timer next, which might be - * this cpu as well. If we don't drop this here the - * jiffies might be stale and do_timer() never - * invoked. - */ - if (cpu == tick_do_timer_cpu) - tick_do_timer_cpu = TICK_DO_TIMER_NONE; - ts->idle_sleeps++; /* @@ -332,12 +344,7 @@ void tick_nohz_stop_sched_tick(int inidle) goto out; } - /* - * calculate the expiry time for the next timer wheel - * timer - */ - expires = ktime_add_ns(last_update, tick_period.tv64 * - delta_jiffies); + /* Mark expiries */ ts->idle_expires = expires; if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { From 27af4245b6ce99e08c6a7c38825383bf51119cc9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 1 Dec 2008 14:18:13 -0800 Subject: [PATCH 09/15] posix-timers: use "struct pid*" instead of "struct task_struct*" Impact: restructure, clean up code k_itimer holds the ref to the ->it_process until sys_timer_delete(). This allows pinning up to RLIMIT_SIGPENDING dead task_struct's. Change the code to use "struct pid *" instead. The patch doesn't kill ->it_process, it places ->it_pid into the union. ->it_process is still used by do_cpu_nanosleep() as before. It would be trivial to change the nanosleep code as well, but since it uses it_process in a special way I think it is better to keep this field for grep. The patch bloats the kernel by 104 bytes and it also adds the new pointer, ->it_signal, to k_itimer. It is used by lock_timer() to verify that the found timer was not created by another process. It is not clear why we use the global database (and thus the global idr_lock) for posix timers. We still need the signal_struct->posix_timers which contains all usable timers, perhaps it is better to use some form of per-process array instead.
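Concretely, holding a struct pid instead of a task_struct means the delivery path must resolve the pid back to a task under RCU each time the timer fires; if the target task has already exited, pid_task() returns NULL and the signal is quietly dropped. Condensed from the posix_timer_event() hunk below:

	rcu_read_lock();
	task = pid_task(timr->it_pid, PIDTYPE_PID);
	if (task) {
		shared = !(timr->it_sigev_notify & SIGEV_THREAD_ID);
		ret = send_sigqueue(timr->sigq, task, shared);
	}
	rcu_read_unlock();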
Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- include/linux/posix-timers.h | 6 ++++- kernel/posix-timers.c | 43 +++++++++++++++++++----------------- 2 files changed, 28 insertions(+), 21 deletions(-) diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index a7c721355549..4f71bf4e628c 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -45,7 +45,11 @@ struct k_itimer { int it_requeue_pending; /* waiting to requeue this timer */ #define REQUEUE_PENDING 1 int it_sigev_notify; /* notify word of sigevent struct */ - struct task_struct *it_process; /* process to send signal to */ + struct signal_struct *it_signal; + union { + struct pid *it_pid; /* pid of process to send signal to */ + struct task_struct *it_process; /* for clock_nanosleep */ + }; struct sigqueue *sigq; /* signal queue entry. */ union { struct { diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 5e79c662294b..42a39afd694a 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -116,7 +116,7 @@ static DEFINE_SPINLOCK(idr_lock); * must supply functions here, even if the function just returns * ENOSYS. The standard POSIX timer management code assumes the * following: 1.) The k_itimer struct (sched.h) is used for the - * timer. 2.) The list, it_lock, it_clock, it_id and it_process + * timer. 2.) The list, it_lock, it_clock, it_id and it_pid * fields are not modified by timer code. * * At this time all functions EXCEPT clock_nanosleep can be @@ -313,7 +313,8 @@ void do_schedule_next_timer(struct siginfo *info) int posix_timer_event(struct k_itimer *timr, int si_private) { - int shared, ret; + struct task_struct *task; + int shared, ret = -1; /* * FIXME: if ->sigq is queued we can race with * dequeue_signal()->do_schedule_next_timer(). @@ -327,8 +328,13 @@ int posix_timer_event(struct k_itimer *timr, int si_private) */ timr->sigq->info.si_sys_private = si_private; - shared = !(timr->it_sigev_notify & SIGEV_THREAD_ID); - ret = send_sigqueue(timr->sigq, timr->it_process, shared); + rcu_read_lock(); + task = pid_task(timr->it_pid, PIDTYPE_PID); + if (task) { + shared = !(timr->it_sigev_notify & SIGEV_THREAD_ID); + ret = send_sigqueue(timr->sigq, task, shared); + } + rcu_read_unlock(); /* If we failed to send the signal the timer stops. 
*/ return ret > 0; } @@ -405,7 +411,7 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) return ret; } -static struct task_struct * good_sigevent(sigevent_t * event) +static struct pid *good_sigevent(sigevent_t * event) { struct task_struct *rtn = current->group_leader; @@ -419,7 +425,7 @@ static struct task_struct * good_sigevent(sigevent_t * event) ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX))) return NULL; - return rtn; + return task_pid(rtn); } void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock) @@ -471,7 +477,7 @@ sys_timer_create(const clockid_t which_clock, { struct k_itimer *new_timer; int error, new_timer_id; - struct task_struct *process; + struct pid *it_pid; sigevent_t event; int it_id_set = IT_ID_NOT_SET; @@ -525,11 +531,9 @@ sys_timer_create(const clockid_t which_clock, goto out; } rcu_read_lock(); - process = good_sigevent(&event); - if (process) - get_task_struct(process); + it_pid = get_pid(good_sigevent(&event)); rcu_read_unlock(); - if (!process) { + if (!it_pid) { error = -EINVAL; goto out; } @@ -537,8 +541,7 @@ sys_timer_create(const clockid_t which_clock, event.sigev_notify = SIGEV_SIGNAL; event.sigev_signo = SIGALRM; event.sigev_value.sival_int = new_timer->it_id; - process = current->group_leader; - get_task_struct(process); + it_pid = get_pid(task_tgid(current)); } new_timer->it_sigev_notify = event.sigev_notify; @@ -548,7 +551,8 @@ sys_timer_create(const clockid_t which_clock, new_timer->sigq->info.si_code = SI_TIMER; spin_lock_irq(&current->sighand->siglock); - new_timer->it_process = process; + new_timer->it_pid = it_pid; + new_timer->it_signal = current->signal; list_add(&new_timer->list, &current->signal->posix_timers); spin_unlock_irq(&current->sighand->siglock); @@ -583,8 +587,7 @@ static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags) timr = idr_find(&posix_timers_id, (int)timer_id); if (timr) { spin_lock(&timr->it_lock); - if (timr->it_process && - same_thread_group(timr->it_process, current)) { + if (timr->it_pid && timr->it_signal == current->signal) { spin_unlock(&idr_lock); return timr; } @@ -831,8 +834,8 @@ retry_delete: * This keeps any tasks waiting on the spin lock from thinking * they got something (see the lock code above). */ - put_task_struct(timer->it_process); - timer->it_process = NULL; + put_pid(timer->it_pid); + timer->it_pid = NULL; unlock_timer(timer, flags); release_posix_timer(timer, IT_ID_SET); @@ -858,8 +861,8 @@ retry_delete: * This keeps any tasks waiting on the spin lock from thinking * they got something (see the lock code above). */ - put_task_struct(timer->it_process); - timer->it_process = NULL; + put_pid(timer->it_pid); + timer->it_pid = NULL; unlock_timer(timer, flags); release_posix_timer(timer, IT_ID_SET); From 899921025b406a71a8aeb2d7855658ea0cf0ed23 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 1 Dec 2008 14:18:15 -0800 Subject: [PATCH 10/15] posix-timers: check ->it_signal instead of ->it_pid to validate the timer Impact: clean up, speed up ->it_pid (was ->it_process) also has a special meaning: if it is NULL, the timer is under deletion or it wasn't initialized yet.
We can check ->it_signal != NULL instead, this way we can - simplify sys_timer_create() a bit - remove yet another check from lock_timer() - move put_pid(->it_pid) into release_posix_timer() which runs outside of ->it_lock Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- kernel/posix-timers.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 42a39afd694a..aa922bbb6403 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -464,6 +464,7 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set) idr_remove(&posix_timers_id, tmr->it_id); spin_unlock_irqrestore(&idr_lock, flags); } + put_pid(tmr->it_pid); sigqueue_free(tmr->sigq); kmem_cache_free(posix_timers_cache, tmr); } @@ -477,7 +478,6 @@ sys_timer_create(const clockid_t which_clock, { struct k_itimer *new_timer; int error, new_timer_id; - struct pid *it_pid; sigevent_t event; int it_id_set = IT_ID_NOT_SET; @@ -531,9 +531,9 @@ sys_timer_create(const clockid_t which_clock, goto out; } rcu_read_lock(); - it_pid = get_pid(good_sigevent(&event)); + new_timer->it_pid = get_pid(good_sigevent(&event)); rcu_read_unlock(); - if (!it_pid) { + if (!new_timer->it_pid) { error = -EINVAL; goto out; } @@ -541,7 +541,7 @@ sys_timer_create(const clockid_t which_clock, event.sigev_notify = SIGEV_SIGNAL; event.sigev_signo = SIGALRM; event.sigev_value.sival_int = new_timer->it_id; - it_pid = get_pid(task_tgid(current)); + new_timer->it_pid = get_pid(task_tgid(current)); } new_timer->it_sigev_notify = event.sigev_notify; @@ -551,7 +551,6 @@ sys_timer_create(const clockid_t which_clock, new_timer->sigq->info.si_code = SI_TIMER; spin_lock_irq(&current->sighand->siglock); - new_timer->it_pid = it_pid; new_timer->it_signal = current->signal; list_add(&new_timer->list, &current->signal->posix_timers); spin_unlock_irq(&current->sighand->siglock); @@ -587,7 +586,7 @@ static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags) timr = idr_find(&posix_timers_id, (int)timer_id); if (timr) { spin_lock(&timr->it_lock); - if (timr->it_pid && timr->it_signal == current->signal) { + if (timr->it_signal == current->signal) { spin_unlock(&idr_lock); return timr; } @@ -834,8 +833,7 @@ retry_delete: * This keeps any tasks waiting on the spin lock from thinking * they got something (see the lock code above). */ - put_pid(timer->it_pid); - timer->it_pid = NULL; + timer->it_signal = NULL; unlock_timer(timer, flags); release_posix_timer(timer, IT_ID_SET); @@ -861,8 +859,7 @@ retry_delete: * This keeps any tasks waiting on the spin lock from thinking * they got something (see the lock code above). */ - put_pid(timer->it_pid); - timer->it_pid = NULL; + timer->it_signal = NULL; unlock_timer(timer, flags); release_posix_timer(timer, IT_ID_SET); From 8187926bdae98648db24db3391c4efd21ec669b1 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 1 Dec 2008 14:18:16 -0800 Subject: [PATCH 11/15] posix-timers: simplify de_thread()->exit_itimers() path Impact: simplify code de_thread() postpones release_task(leader) until after exit_itimers(). This was needed because !SIGEV_THREAD_ID timers could use ->group_leader without get_task_struct(). With the recent changes we can release the leader earlier and simplify the code.
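With timers now holding a struct pid reference rather than using ->group_leader directly, the zombie leader can be released right where it is marked dead, inside the !thread_group_leader() branch. Condensed from the de_thread() hunk below:

	BUG_ON(leader->exit_state != EXIT_ZOMBIE);
	leader->exit_state = EXIT_DEAD;
	write_unlock_irq(&tasklist_lock);

	release_task(leader);	/* no longer deferred past exit_itimers() */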
Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- fs/exec.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index ec5df9a38313..b4e5b8a9216a 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -772,7 +772,6 @@ static int de_thread(struct task_struct *tsk) struct signal_struct *sig = tsk->signal; struct sighand_struct *oldsighand = tsk->sighand; spinlock_t *lock = &oldsighand->siglock; - struct task_struct *leader = NULL; int count; if (thread_group_empty(tsk)) @@ -810,7 +809,7 @@ static int de_thread(struct task_struct *tsk) * and to assume its PID: */ if (!thread_group_leader(tsk)) { - leader = tsk->group_leader; + struct task_struct *leader = tsk->group_leader; sig->notify_count = -1; /* for exit_notify() */ for (;;) { @@ -862,8 +861,9 @@ static int de_thread(struct task_struct *tsk) BUG_ON(leader->exit_state != EXIT_ZOMBIE); leader->exit_state = EXIT_DEAD; - write_unlock_irq(&tasklist_lock); + + release_task(leader); } sig->group_exit_task = NULL; @@ -872,8 +872,6 @@ static int de_thread(struct task_struct *tsk) no_thread_group: exit_itimers(sig); flush_itimer_signals(); - if (leader) - release_task(leader); if (atomic_read(&oldsighand->count) != 1) { struct sighand_struct *newsighand; From c29541b24fb2c6301021637229ae5347c877330a Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Mon, 1 Dec 2008 14:18:11 -0800 Subject: [PATCH 12/15] linux/timex.h: cleanup for userspace Impact: fix user-space exported use Move all the kernel-specific defines and includes into the __KERNEL__ section so that they don't get leaked into userspace. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Mike Frysinger Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- include/linux/timex.h | 73 ++++++++++++++++++++++--------------------- 1 file changed, 37 insertions(+), 36 deletions(-) diff --git a/include/linux/timex.h b/include/linux/timex.h index 9007313b5b71..998a55d80acf 100644 --- a/include/linux/timex.h +++ b/include/linux/timex.h @@ -53,46 +53,10 @@ #ifndef _LINUX_TIMEX_H #define _LINUX_TIMEX_H -#include #include -#include - #define NTP_API 4 /* NTP API version */ -/* - * SHIFT_KG and SHIFT_KF establish the damping of the PLL and are chosen - * for a slightly underdamped convergence characteristic. SHIFT_KH - * establishes the damping of the FLL and is chosen by wisdom and black - * art. - * - * MAXTC establishes the maximum time constant of the PLL. With the - * SHIFT_KG and SHIFT_KF values given and a time constant range from - * zero to MAXTC, the PLL will converge in 15 minutes to 16 hours, - * respectively. - */ -#define SHIFT_PLL 4 /* PLL frequency factor (shift) */ -#define SHIFT_FLL 2 /* FLL frequency factor (shift) */ -#define MAXTC 10 /* maximum time constant (shift) */ - -/* - * SHIFT_USEC defines the scaling (shift) of the time_freq and - * time_tolerance variables, which represent the current frequency - * offset and maximum frequency tolerance. 
- */ -#define SHIFT_USEC 16 /* frequency offset scale (shift) */ -#define PPM_SCALE (NSEC_PER_USEC << (NTP_SCALE_SHIFT - SHIFT_USEC)) -#define PPM_SCALE_INV_SHIFT 19 -#define PPM_SCALE_INV ((1ll << (PPM_SCALE_INV_SHIFT + NTP_SCALE_SHIFT)) / \ - PPM_SCALE + 1) - -#define MAXPHASE 500000000l /* max phase error (ns) */ -#define MAXFREQ 500000 /* max frequency error (ns/s) */ -#define MAXFREQ_SCALED ((s64)MAXFREQ << NTP_SCALE_SHIFT) -#define MINSEC 256 /* min interval between updates (s) */ -#define MAXSEC 2048 /* max interval between updates (s) */ -#define NTP_PHASE_LIMIT ((MAXPHASE / NSEC_PER_USEC) << 5) /* beyond max. dispersion */ - /* * syscall interface - used (mainly by NTP daemon) * to discipline kernel clock oscillator @@ -199,8 +163,45 @@ struct timex { #define TIME_BAD TIME_ERROR /* bw compat */ #ifdef __KERNEL__ +#include +#include +#include + #include +/* + * SHIFT_KG and SHIFT_KF establish the damping of the PLL and are chosen + * for a slightly underdamped convergence characteristic. SHIFT_KH + * establishes the damping of the FLL and is chosen by wisdom and black + * art. + * + * MAXTC establishes the maximum time constant of the PLL. With the + * SHIFT_KG and SHIFT_KF values given and a time constant range from + * zero to MAXTC, the PLL will converge in 15 minutes to 16 hours, + * respectively. + */ +#define SHIFT_PLL 4 /* PLL frequency factor (shift) */ +#define SHIFT_FLL 2 /* FLL frequency factor (shift) */ +#define MAXTC 10 /* maximum time constant (shift) */ + +/* + * SHIFT_USEC defines the scaling (shift) of the time_freq and + * time_tolerance variables, which represent the current frequency + * offset and maximum frequency tolerance. + */ +#define SHIFT_USEC 16 /* frequency offset scale (shift) */ +#define PPM_SCALE (NSEC_PER_USEC << (NTP_SCALE_SHIFT - SHIFT_USEC)) +#define PPM_SCALE_INV_SHIFT 19 +#define PPM_SCALE_INV ((1ll << (PPM_SCALE_INV_SHIFT + NTP_SCALE_SHIFT)) / \ + PPM_SCALE + 1) + +#define MAXPHASE 500000000l /* max phase error (ns) */ +#define MAXFREQ 500000 /* max frequency error (ns/s) */ +#define MAXFREQ_SCALED ((s64)MAXFREQ << NTP_SCALE_SHIFT) +#define MINSEC 256 /* min interval between updates (s) */ +#define MAXSEC 2048 /* max interval between updates (s) */ +#define NTP_PHASE_LIMIT ((MAXPHASE / NSEC_PER_USEC) << 5) /* beyond max. dispersion */ + /* * kernel variables * Note: maximum error = NTP synch distance = dispersion + delay / 2; From bacbe9994541c70aa3abd1a013ac738e58d4bfb2 Mon Sep 17 00:00:00 2001 From: Janne Kulmala Date: Tue, 16 Dec 2008 13:39:57 +0200 Subject: [PATCH 13/15] x86: enable HPET on Fujitsu u9200 Impact: auto-enable HPET on Fujitsu u9200 The HPET timer is listed in the ACPI table, but needs a quirk entry in order to work. Unfortunately, the quirk code runs after the first hpet_enable() call, which has already determined that the timer doesn't work (reads 0xFFFFFFFF). This patch allows hpet_enable() to be called again after running the quirk code.
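The retry works because hpet_late_init() runs after the PCI quirks have had a chance to set force_hpet_address. With out_nohpet now clearing hpet_address instead of setting boot_hpet_disable, the late path can try again; roughly (a sketch of that era's hpet_late_init(), whose context lines appear in the next patch's hunk):

	if (boot_hpet_disable)
		return -ENODEV;

	if (!hpet_address) {
		if (!force_hpet_address)
			return -ENODEV;

		/* the quirk found a working HPET; map and enable it this time */
		hpet_address = force_hpet_address;
		hpet_enable();
	}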
Signed-off-by: Janne Kulmala Signed-off-by: Ingo Molnar --- arch/x86/kernel/hpet.c | 2 +- arch/x86/kernel/quirks.c | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 067d8de913f6..84089dc8fd1d 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -811,7 +811,7 @@ int __init hpet_enable(void) out_nohpet: hpet_clear_mapping(); - boot_hpet_disable = 1; + hpet_address = 0; return 0; } diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 67465ed89310..309949e9e1c1 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -168,6 +168,8 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_31, ich_force_enable_hpet); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_1, ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_4, + ich_force_enable_hpet); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH9_7, ich_force_enable_hpet); From 39c04b55240342d0742ac48538d3d8c71bfc0a94 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 16 Dec 2008 12:32:23 -0800 Subject: [PATCH 14/15] x86: make sure we really have an hpet mapping before using it Impact: prepare the hpet code for Xen dom0 booting When booting in Xen dom0, the hpet isn't really accessible, so make sure the mapping is non-NULL before use. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/kernel/hpet.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 84089dc8fd1d..a1f6ed5e1a05 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -834,10 +834,11 @@ static __init int hpet_late_init(void) hpet_address = force_hpet_address; hpet_enable(); - if (!hpet_virt_address) - return -ENODEV; } + if (!hpet_virt_address) + return -ENODEV; + hpet_reserve_platform_timers(hpet_readl(HPET_ID)); for_each_online_cpu(cpu) { From b2e3c0adec918ea22b6c9d7c76193dd3aaba9bd4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 19 Dec 2008 00:48:27 +0100 Subject: [PATCH 15/15] hrtimers: fix warning in kernel/hrtimer.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit this warning: kernel/hrtimer.c: In function ‘hrtimer_cpu_notify’: kernel/hrtimer.c:1574: warning: unused variable ‘dcpu’ is caused because 'dcpu' is only used in the CONFIG_HOTPLUG_CPU case. Signed-off-by: Ingo Molnar --- kernel/hrtimer.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index b741f850426e..bda9cb924276 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1571,7 +1571,7 @@ static void tickle_timers(void *arg) static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { - int dcpu, scpu = (long)hcpu; + int scpu = (long)hcpu; switch (action) { @@ -1583,10 +1583,14 @@ static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self, #ifdef CONFIG_HOTPLUG_CPU case CPU_DEAD: case CPU_DEAD_FROZEN: + { + int dcpu; + clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &scpu); dcpu = migrate_hrtimers(scpu); smp_call_function_single(dcpu, tickle_timers, NULL, 0); break; + } #endif default: