diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c
index debd7dbb4158..a39280b4dd3a 100644
--- a/arch/i386/kernel/tsc.c
+++ b/arch/i386/kernel/tsc.c
@@ -292,7 +292,6 @@ static struct clocksource clocksource_tsc = {
 
 void mark_tsc_unstable(char *reason)
 {
-	sched_clock_unstable_event();
 	if (!tsc_unstable) {
 		tsc_unstable = 1;
 		tsc_enabled = 0;
diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index a8634a0655fc..d9b8af763e1e 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -63,6 +63,7 @@ ACPI_MODULE_NAME("processor_idle");
 #define ACPI_PROCESSOR_FILE_POWER	"power"
 #define US_TO_PM_TIMER_TICKS(t)		((t * (PM_TIMER_FREQUENCY/1000)) / 1000)
+#define PM_TIMER_TICK_NS		(1000000000ULL/PM_TIMER_FREQUENCY)
 #define C2_OVERHEAD			4	/* 1us (3.579 ticks per us) */
 #define C3_OVERHEAD			4	/* 1us (3.579 ticks per us) */
 
 static void (*pm_idle_save) (void) __read_mostly;
@@ -462,6 +463,9 @@ static void acpi_processor_idle(void)
		 * TBD: Can't get time duration while in C1, as resumes
		 * go to an ISR rather than here.  Need to instrument
		 * base interrupt handler.
+		 *
+		 * Note: the TSC better not stop in C1, sched_clock() will
+		 *       skew otherwise.
		 */
		sleep_ticks = 0xFFFFFFFF;
		break;
@@ -469,6 +473,8 @@ static void acpi_processor_idle(void)
	case ACPI_STATE_C2:
		/* Get start time (ticks) */
		t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+		/* Tell the scheduler that we are going deep-idle: */
+		sched_clock_idle_sleep_event();
		/* Invoke C2 */
		acpi_state_timer_broadcast(pr, cx, 1);
		acpi_cstate_enter(cx);
@@ -479,17 +485,22 @@ static void acpi_processor_idle(void)
			/* TSC halts in C2, so notify users */
			mark_tsc_unstable("possible TSC halt in C2");
 #endif
+		/* Compute time (ticks) that we were actually asleep */
+		sleep_ticks = ticks_elapsed(t1, t2);
+
+		/* Tell the scheduler how much we idled: */
+		sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS);
+
		/* Re-enable interrupts */
		local_irq_enable();
+		/* Do not account our idle-switching overhead: */
+		sleep_ticks -= cx->latency_ticks + C2_OVERHEAD;
+
		current_thread_info()->status |= TS_POLLING;
-		/* Compute time (ticks) that we were actually asleep */
-		sleep_ticks =
-		    ticks_elapsed(t1, t2) - cx->latency_ticks - C2_OVERHEAD;
		acpi_state_timer_broadcast(pr, cx, 0);
		break;
 
	case ACPI_STATE_C3:
-
		/*
		 * disable bus master
		 * bm_check implies we need ARB_DIS
@@ -518,6 +529,8 @@ static void acpi_processor_idle(void)
		t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
		/* Invoke C3 */
		acpi_state_timer_broadcast(pr, cx, 1);
+		/* Tell the scheduler that we are going deep-idle: */
+		sched_clock_idle_sleep_event();
		acpi_cstate_enter(cx);
		/* Get end time (ticks) */
		t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
@@ -531,12 +544,17 @@ static void acpi_processor_idle(void)
			/* TSC halts in C3, so notify users */
			mark_tsc_unstable("TSC halts in C3");
 #endif
+		/* Compute time (ticks) that we were actually asleep */
+		sleep_ticks = ticks_elapsed(t1, t2);
+		/* Tell the scheduler how much we idled: */
+		sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS);
+
		/* Re-enable interrupts */
		local_irq_enable();
+		/* Do not account our idle-switching overhead: */
+		sleep_ticks -= cx->latency_ticks + C3_OVERHEAD;
+
		current_thread_info()->status |= TS_POLLING;
-		/* Compute time (ticks) that we were actually asleep */
-		sleep_ticks =
-		    ticks_elapsed(t1, t2) - cx->latency_ticks - C3_OVERHEAD;
		acpi_state_timer_broadcast(pr, cx, 0);
		break;
 
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 965625a0977d..ee4814dd98f9 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -320,7 +320,21 @@ int proc_pid_status(struct task_struct *task, char *buffer)
	return buffer - orig;
 }
 
-static clock_t task_utime(struct task_struct *p)
+/*
+ * Use precise platform statistics if available:
+ */
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+static cputime_t task_utime(struct task_struct *p)
+{
+	return p->utime;
+}
+
+static cputime_t task_stime(struct task_struct *p)
+{
+	return p->stime;
+}
+#else
+static cputime_t task_utime(struct task_struct *p)
 {
	clock_t utime = cputime_to_clock_t(p->utime),
		total = utime + cputime_to_clock_t(p->stime);
@@ -337,10 +351,10 @@ static clock_t task_utime(struct task_struct *p)
	}
	utime = (clock_t)temp;
 
-	return utime;
+	return clock_t_to_cputime(utime);
 }
 
-static clock_t task_stime(struct task_struct *p)
+static cputime_t task_stime(struct task_struct *p)
 {
	clock_t stime;
 
@@ -349,10 +363,12 @@ static clock_t task_stime(struct task_struct *p)
	 * the total, to make sure the total observed by userspace
	 * grows monotonically - apps rely on that):
	 */
-	stime = nsec_to_clock_t(p->se.sum_exec_runtime) - task_utime(p);
+	stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
+			cputime_to_clock_t(task_utime(p));
 
-	return stime;
+	return clock_t_to_cputime(stime);
 }
+#endif
 
 static int do_task_stat(struct task_struct *task, char *buffer, int whole)
 {
@@ -368,8 +384,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole)
	unsigned long long start_time;
	unsigned long cmin_flt = 0, cmaj_flt = 0;
	unsigned long min_flt = 0, maj_flt = 0;
-	cputime_t cutime, cstime;
-	clock_t utime, stime;
+	cputime_t cutime, cstime, utime, stime;
	unsigned long rsslim = 0;
	char tcomm[sizeof(task->comm)];
	unsigned long flags;
@@ -387,8 +402,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole)
 
	sigemptyset(&sigign);
	sigemptyset(&sigcatch);
-	cutime = cstime = cputime_zero;
-	utime = stime = 0;
+	cutime = cstime = utime = stime = cputime_zero;
 
	rcu_read_lock();
	if (lock_task_sighand(task, &flags)) {
@@ -414,15 +428,15 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole)
			do {
				min_flt += t->min_flt;
				maj_flt += t->maj_flt;
-				utime += task_utime(t);
-				stime += task_stime(t);
+				utime = cputime_add(utime, task_utime(t));
+				stime = cputime_add(stime, task_stime(t));
				t = next_thread(t);
			} while (t != task);
 
			min_flt += sig->min_flt;
			maj_flt += sig->maj_flt;
-			utime += cputime_to_clock_t(sig->utime);
-			stime += cputime_to_clock_t(sig->stime);
+			utime = cputime_add(utime, sig->utime);
+			stime = cputime_add(stime, sig->stime);
		}
 
		sid = signal_session(sig);
@@ -471,8 +485,8 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole)
		cmin_flt,
		maj_flt,
		cmaj_flt,
-		utime,
-		stime,
+		cputime_to_clock_t(utime),
+		cputime_to_clock_t(stime),
		cputime_to_clock_t(cutime),
		cputime_to_clock_t(cstime),
		priority,
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 682ef87da6eb..ba78807eab91 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -681,7 +681,7 @@ enum cpu_idle_type {
 #define SCHED_LOAD_SHIFT	10
 #define SCHED_LOAD_SCALE	(1L << SCHED_LOAD_SHIFT)
 
-#define SCHED_LOAD_SCALE_FUZZ	(SCHED_LOAD_SCALE >> 1)
+#define SCHED_LOAD_SCALE_FUZZ	SCHED_LOAD_SCALE
 
 #ifdef CONFIG_SMP
 #define SD_LOAD_BALANCE		1	/* Do load balancing on this domain. */
@@ -1388,7 +1388,8 @@ extern void sched_exec(void);
 #define sched_exec()   {}
 #endif
 
-extern void sched_clock_unstable_event(void);
+extern void sched_clock_idle_sleep_event(void);
+extern void sched_clock_idle_wakeup_event(u64 delta_ns);
 
 #ifdef CONFIG_HOTPLUG_CPU
 extern void idle_task_exit(void);
diff --git a/kernel/sched.c b/kernel/sched.c
index 45e17b83b7f1..96e9b82246d2 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -262,7 +262,8 @@ struct rq {
	s64 clock_max_delta;
 
	unsigned int clock_warps, clock_overflows;
-	unsigned int clock_unstable_events;
+	u64 idle_clock;
+	unsigned int clock_deep_idle_events;
	u64 tick_timestamp;
 
	atomic_t nr_iowait;
@@ -556,18 +557,40 @@ static inline struct rq *this_rq_lock(void)
 }
 
 /*
- * CPU frequency is/was unstable - start new by setting prev_clock_raw:
+ * We are going deep-idle (irqs are disabled):
  */
-void sched_clock_unstable_event(void)
+void sched_clock_idle_sleep_event(void)
 {
-	unsigned long flags;
-	struct rq *rq;
+	struct rq *rq = cpu_rq(smp_processor_id());
 
-	rq = task_rq_lock(current, &flags);
-	rq->prev_clock_raw = sched_clock();
-	rq->clock_unstable_events++;
-	task_rq_unlock(rq, &flags);
+	spin_lock(&rq->lock);
+	__update_rq_clock(rq);
+	spin_unlock(&rq->lock);
+	rq->clock_deep_idle_events++;
 }
+EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
+
+/*
+ * We just idled delta nanoseconds (called with irqs disabled):
+ */
+void sched_clock_idle_wakeup_event(u64 delta_ns)
+{
+	struct rq *rq = cpu_rq(smp_processor_id());
+	u64 now = sched_clock();
+
+	rq->idle_clock += delta_ns;
+	/*
+	 * Override the previous timestamp and ignore all
+	 * sched_clock() deltas that occurred while we idled,
+	 * and use the PM-provided delta_ns to advance the
+	 * rq clock:
+	 */
+	spin_lock(&rq->lock);
+	rq->prev_clock_raw = now;
+	rq->clock += delta_ns;
+	spin_unlock(&rq->lock);
+}
+EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
 
 /*
  * resched_task - mark a task 'to be rescheduled now'.
@@ -2494,7 +2517,7 @@ group_next:
	 * a think about bumping its value to force at least one task to be
	 * moved
	 */
-	if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task/2) {
+	if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task) {
		unsigned long tmp, pwr_now, pwr_move;
		unsigned int imbn;
 
@@ -3020,6 +3043,7 @@ static inline void rebalance_domains(int cpu, enum cpu_idle_type idle)
	struct sched_domain *sd;
	/* Earliest time when we have to do rebalance again */
	unsigned long next_balance = jiffies + 60*HZ;
+	int update_next_balance = 0;
 
	for_each_domain(cpu, sd) {
		if (!(sd->flags & SD_LOAD_BALANCE))
@@ -3056,8 +3080,10 @@ static inline void rebalance_domains(int cpu, enum cpu_idle_type idle)
		if (sd->flags & SD_SERIALIZE)
			spin_unlock(&balancing);
 out:
-		if (time_after(next_balance, sd->last_balance + interval))
+		if (time_after(next_balance, sd->last_balance + interval)) {
			next_balance = sd->last_balance + interval;
+			update_next_balance = 1;
+		}
 
		/*
		 * Stop the load balance at this level. There is another
@@ -3067,7 +3093,14 @@ out:
		if (!balance)
			break;
	}
-	rq->next_balance = next_balance;
+
+	/*
+	 * next_balance will be updated only when there is a need.
+	 * When the cpu is attached to null domain for ex, it will not be
+	 * updated.
+	 */
+	if (likely(update_next_balance))
+		rq->next_balance = next_balance;
 }
 
 /*
@@ -4890,7 +4923,7 @@ static inline void sched_init_granularity(void)
	if (sysctl_sched_granularity > gran_limit)
		sysctl_sched_granularity = gran_limit;
 
-	sysctl_sched_runtime_limit = sysctl_sched_granularity * 4;
+	sysctl_sched_runtime_limit = sysctl_sched_granularity * 8;
	sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2;
 }
 
@@ -5234,15 +5267,16 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
 static struct ctl_table sd_ctl_dir[] = {
	{
		.procname	= "sched_domain",
-		.mode		= 0755,
+		.mode		= 0555,
	},
	{0,},
 };
 
 static struct ctl_table sd_ctl_root[] = {
	{
+		.ctl_name	= CTL_KERN,
		.procname	= "kernel",
-		.mode		= 0755,
+		.mode		= 0555,
		.child		= sd_ctl_dir,
	},
	{0,},
@@ -5318,7 +5352,7 @@ static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
	for_each_domain(cpu, sd) {
		snprintf(buf, 32, "domain%d", i);
		entry->procname = kstrdup(buf, GFP_KERNEL);
-		entry->mode = 0755;
+		entry->mode = 0555;
		entry->child = sd_alloc_ctl_domain_table(sd);
		entry++;
		i++;
@@ -5338,7 +5372,7 @@ static void init_sched_domain_sysctl(void)
	for (i = 0; i < cpu_num; i++, entry++) {
		snprintf(buf, 32, "cpu%d", i);
		entry->procname = kstrdup(buf, GFP_KERNEL);
-		entry->mode = 0755;
+		entry->mode = 0555;
		entry->child = sd_alloc_ctl_cpu_table(i);
	}
	sd_sysctl_header = register_sysctl_table(sd_ctl_root);
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 87e524762b85..ab18f45f2ab2 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -154,10 +154,11 @@ static void print_cpu(struct seq_file *m, int cpu)
	P(next_balance);
	P(curr->pid);
	P(clock);
+	P(idle_clock);
	P(prev_clock_raw);
	P(clock_warps);
	P(clock_overflows);
-	P(clock_unstable_events);
+	P(clock_deep_idle_events);
	P(clock_max_delta);
	P(cpu_load[0]);
	P(cpu_load[1]);
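
For reference, the calling convention of the two new hooks, as used by the ACPI C2/C3 paths above, is roughly the following. This is a minimal sketch rather than code from the patch: my_read_pm_timer() and my_enter_deep_idle() are hypothetical placeholders for the platform-specific parts, the plain tick subtraction stands in for ticks_elapsed() (which in processor_idle.c also handles 24/32-bit PM-timer wraparound), and PM_TIMER_TICK_NS (about 279 ns per tick of the 3.579545 MHz ACPI PM timer) converts the externally measured ticks into the nanoseconds that sched_clock_idle_wakeup_event() expects. Both hooks assume interrupts are disabled.

/*
 * Sketch only (not part of the patch): bracket a deep C-state with the
 * new sched_clock idle hooks, mirroring the ACPI changes above.
 */
static void my_deep_idle(void)
{
	u32 t1, t2;

	local_irq_disable();

	t1 = my_read_pm_timer();	/* start time, in PM-timer ticks */
	sched_clock_idle_sleep_event();	/* sync the rq clock before sleeping */

	my_enter_deep_idle();		/* the TSC may halt here */

	t2 = my_read_pm_timer();	/* end time, in PM-timer ticks */

	/* Feed the externally measured idle time back into the rq clock: */
	sched_clock_idle_wakeup_event((u64)(t2 - t1) * PM_TIMER_TICK_NS);

	local_irq_enable();
}

The point of the pair is that sched_clock() deltas accumulated across the deep-idle period are thrown away and replaced by the PM-timer-derived delta, so the per-rq clock neither skews nor warps when the TSC stops.
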
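On the fs/proc/array.c side, do_task_stat() now accumulates utime/stime as cputime_t and converts to clock_t only at output time, so the CONFIG_VIRT_CPU_ACCOUNTING and CFS-based task_utime()/task_stime() variants share one code path. A condensed, illustrative sketch of that pattern (not the exact kernel code; the real loop runs under lock_task_sighand()):

/*
 * Accumulate per-thread times as cputime_t via cputime_add(); callers
 * convert with cputime_to_clock_t() only when formatting /proc output.
 */
static void sum_thread_times(struct task_struct *task,
			     cputime_t *utime, cputime_t *stime)
{
	struct task_struct *t = task;

	*utime = cputime_zero;
	*stime = cputime_zero;
	do {
		*utime = cputime_add(*utime, task_utime(t));
		*stime = cputime_add(*stime, task_stime(t));
		t = next_thread(t);
	} while (t != task);
}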