From 2aa44d0567ed21b47b87d68819415d48194cb923 Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Thu, 23 Aug 2007 15:18:02 +0200
Subject: [PATCH 1/6] sched: sched_clock_idle_[sleep|wakeup]_event()

construct a more or less wall-clock time out of sched_clock(), by
using ACPI-idle's existing knowledge about how much time we spent
idling. This allows the rq clock to work around TSC-stops-in-C2,
TSC-gets-corrupted-in-C3 type of problems.

( Besides the scheduler's statistics, this also benefits blktrace
  and printk timestamps. )

Furthermore, the precise before-C2/C3-sleep and after-C2/C3-wakeup
callbacks allow the scheduler to get the most out of the period where
the CPU has a reliable TSC. This results in slightly more precise
task statistics.

the ACPI bits were acked by Len.
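As an illustration (not part of the patch - the driver context is
simplified, only the hook names and PM_TIMER_TICK_NS come from this
change), the intended calling pattern around a deep-idle entry looks
roughly like this:

	u32 t1, t2;

	t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
	/* last reliable TSC reading - snapshot the rq clock: */
	sched_clock_idle_sleep_event();

	/* ... enter C2/C3, the TSC may stop or get corrupted ... */

	t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
	/*
	 * The ACPI PM timer runs at 3.579545 MHz, so one tick is
	 * roughly 279 ns. Advance the rq clock by the PM-timer
	 * measured sleep time instead of trusting the TSC delta:
	 */
	sched_clock_idle_wakeup_event(ticks_elapsed(t1, t2) *
				      PM_TIMER_TICK_NS);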
Signed-off-by: Ingo Molnar
Acked-by: Len Brown
---
 arch/i386/kernel/tsc.c        |  1 -
 drivers/acpi/processor_idle.c | 32 +++++++++++++++++++++------
 include/linux/sched.h         |  3 ++-
 kernel/sched.c                | 41 +++++++++++++++++++++++++++--------
 kernel/sched_debug.c          |  3 ++-
 5 files changed, 61 insertions(+), 19 deletions(-)

diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c
index debd7dbb4158..a39280b4dd3a 100644
--- a/arch/i386/kernel/tsc.c
+++ b/arch/i386/kernel/tsc.c
@@ -292,7 +292,6 @@ static struct clocksource clocksource_tsc = {
 
 void mark_tsc_unstable(char *reason)
 {
-	sched_clock_unstable_event();
 	if (!tsc_unstable) {
 		tsc_unstable = 1;
 		tsc_enabled = 0;
diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index a8634a0655fc..d9b8af763e1e 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -63,6 +63,7 @@ ACPI_MODULE_NAME("processor_idle");
 #define ACPI_PROCESSOR_FILE_POWER	"power"
 #define US_TO_PM_TIMER_TICKS(t)		((t * (PM_TIMER_FREQUENCY/1000)) / 1000)
+#define PM_TIMER_TICK_NS		(1000000000ULL/PM_TIMER_FREQUENCY)
 #define C2_OVERHEAD			4	/* 1us (3.579 ticks per us) */
 #define C3_OVERHEAD			4	/* 1us (3.579 ticks per us) */
 
 static void (*pm_idle_save) (void) __read_mostly;
@@ -462,6 +463,9 @@ static void acpi_processor_idle(void)
		 * TBD: Can't get time duration while in C1, as resumes
		 *	go to an ISR rather than here. Need to instrument
		 *	base interrupt handler.
+		 *
+		 * Note: the TSC better not stop in C1, sched_clock() will
+		 *	 skew otherwise.
		 */
		sleep_ticks = 0xFFFFFFFF;
		break;
@@ -469,6 +473,8 @@
	case ACPI_STATE_C2:
		/* Get start time (ticks) */
		t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+		/* Tell the scheduler that we are going deep-idle: */
+		sched_clock_idle_sleep_event();
		/* Invoke C2 */
		acpi_state_timer_broadcast(pr, cx, 1);
		acpi_cstate_enter(cx);
@@ -479,17 +485,22 @@
		/* TSC halts in C2, so notify users */
		mark_tsc_unstable("possible TSC halt in C2");
 #endif
+		/* Compute time (ticks) that we were actually asleep */
+		sleep_ticks = ticks_elapsed(t1, t2);
+
+		/* Tell the scheduler how much we idled: */
+		sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS);
+
		/* Re-enable interrupts */
		local_irq_enable();
+		/* Do not account our idle-switching overhead: */
+		sleep_ticks -= cx->latency_ticks + C2_OVERHEAD;
+
		current_thread_info()->status |= TS_POLLING;
-		/* Compute time (ticks) that we were actually asleep */
-		sleep_ticks =
-		    ticks_elapsed(t1, t2) - cx->latency_ticks - C2_OVERHEAD;
		acpi_state_timer_broadcast(pr, cx, 0);
		break;

	case ACPI_STATE_C3:
-
		/*
		 * disable bus master
		 * bm_check implies we need ARB_DIS
@@ -518,6 +529,8 @@
		t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
		/* Invoke C3 */
		acpi_state_timer_broadcast(pr, cx, 1);
+		/* Tell the scheduler that we are going deep-idle: */
+		sched_clock_idle_sleep_event();
		acpi_cstate_enter(cx);
		/* Get end time (ticks) */
		t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
@@ -531,12 +544,17 @@
		/* TSC halts in C3, so notify users */
		mark_tsc_unstable("TSC halts in C3");
 #endif
+		/* Compute time (ticks) that we were actually asleep */
+		sleep_ticks = ticks_elapsed(t1, t2);
+		/* Tell the scheduler how much we idled: */
+		sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS);
+
		/* Re-enable interrupts */
		local_irq_enable();
+		/* Do not account our idle-switching overhead: */
+		sleep_ticks -= cx->latency_ticks + C3_OVERHEAD;
+
		current_thread_info()->status |= TS_POLLING;
-		/* Compute time (ticks) that we were actually asleep */
-		sleep_ticks =
-		    ticks_elapsed(t1, t2) - cx->latency_ticks - C3_OVERHEAD;
		acpi_state_timer_broadcast(pr, cx, 0);
		break;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 682ef87da6eb..1845b2e99a87 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1388,7 +1388,8 @@ extern void sched_exec(void);
 #define sched_exec()			{}
 #endif
 
-extern void sched_clock_unstable_event(void);
+extern void sched_clock_idle_sleep_event(void);
+extern void sched_clock_idle_wakeup_event(u64 delta_ns);
 
 #ifdef CONFIG_HOTPLUG_CPU
 extern void idle_task_exit(void);
diff --git a/kernel/sched.c b/kernel/sched.c
index 45e17b83b7f1..48e7586168ef 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -262,7 +262,8 @@ struct rq {
	s64 clock_max_delta;
 
	unsigned int clock_warps, clock_overflows;
-	unsigned int clock_unstable_events;
+	u64 idle_clock;
+	unsigned int clock_deep_idle_events;
	u64 tick_timestamp;
 
	atomic_t nr_iowait;
@@ -556,18 +557,40 @@ static inline struct rq *this_rq_lock(void)
 }
 
 /*
- * CPU frequency is/was unstable - start new by setting prev_clock_raw:
+ * We are going deep-idle (irqs are disabled):
  */
-void sched_clock_unstable_event(void)
+void sched_clock_idle_sleep_event(void)
 {
-	unsigned long flags;
-	struct rq *rq;
+	struct rq *rq = cpu_rq(smp_processor_id());
 
-	rq = task_rq_lock(current, &flags);
-	rq->prev_clock_raw = sched_clock();
-	rq->clock_unstable_events++;
-	task_rq_unlock(rq, &flags);
+	spin_lock(&rq->lock);
+	__update_rq_clock(rq);
+	spin_unlock(&rq->lock);
+	rq->clock_deep_idle_events++;
 }
+EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
+
+/*
+ * We just idled delta nanoseconds (called with irqs disabled):
+ */
+void sched_clock_idle_wakeup_event(u64 delta_ns)
+{
+	struct rq *rq = cpu_rq(smp_processor_id());
+	u64 now = sched_clock();
+
+	rq->idle_clock += delta_ns;
+	/*
+	 * Override the previous timestamp and ignore all
+	 * sched_clock() deltas that occurred while we idled,
+	 * and use the PM-provided delta_ns to advance the
+	 * rq clock:
+	 */
+	spin_lock(&rq->lock);
+	rq->prev_clock_raw = now;
+	rq->clock += delta_ns;
+	spin_unlock(&rq->lock);
+}
+EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
 
 /*
  * resched_task - mark a task 'to be rescheduled now'.
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 87e524762b85..ab18f45f2ab2 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -154,10 +154,11 @@ static void print_cpu(struct seq_file *m, int cpu)
	P(next_balance);
	P(curr->pid);
	P(clock);
+	P(idle_clock);
	P(prev_clock_raw);
	P(clock_warps);
	P(clock_overflows);
-	P(clock_unstable_events);
+	P(clock_deep_idle_events);
	P(clock_max_delta);
	P(cpu_load[0]);
	P(cpu_load[1]);

From c57baf1e1e24b004b57d282267542baab802753c Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman"
Date: Thu, 23 Aug 2007 15:18:02 +0200
Subject: [PATCH 2/6] sched: fix sysctl directory permissions

There are two remaining gotchas:

- The directories have impossible permissions (writeable).

- The ctl_name for the kernel directory is inconsistent with
  everything else. It should be CTL_KERN.

Signed-off-by: Eric W. Biederman
Signed-off-by: Ingo Molnar
---
 kernel/sched.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index 48e7586168ef..5fecbbba12ac 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5257,15 +5257,16 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
 static struct ctl_table sd_ctl_dir[] = {
	{
		.procname	= "sched_domain",
-		.mode		= 0755,
+		.mode		= 0555,
	},
	{0,},
 };
 
 static struct ctl_table sd_ctl_root[] = {
	{
+		.ctl_name	= CTL_KERN,
		.procname	= "kernel",
-		.mode		= 0755,
+		.mode		= 0555,
		.child		= sd_ctl_dir,
	},
	{0,},
@@ -5341,7 +5342,7 @@ static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
	for_each_domain(cpu, sd) {
		snprintf(buf, 32, "domain%d", i);
		entry->procname = kstrdup(buf, GFP_KERNEL);
-		entry->mode = 0755;
+		entry->mode = 0555;
		entry->child = sd_alloc_ctl_domain_table(sd);
		entry++;
		i++;
	}
@@ -5361,7 +5362,7 @@ static void init_sched_domain_sysctl(void)
	for (i = 0; i < cpu_num; i++, entry++) {
		snprintf(buf, 32, "cpu%d", i);
		entry->procname = kstrdup(buf, GFP_KERNEL);
-		entry->mode = 0755;
+		entry->mode = 0555;
		entry->child = sd_alloc_ctl_cpu_table(i);
	}
	sd_sysctl_header = register_sysctl_table(sd_ctl_root);

From efe567fc8281661524ffa75477a7c4ca9b466c63 Mon Sep 17 00:00:00 2001
From: Christian Borntraeger
Date: Thu, 23 Aug 2007 15:18:02 +0200
Subject: [PATCH 3/6] sched: accounting regression since rc1

Fix the accounting regression for CONFIG_VIRT_CPU_ACCOUNTING. It
reverts parts of commit b27f03d4bdc145a09fb7b0c0e004b29f1ee555fa by
converting fs/proc/array.c back to cputime_t. The new functions
task_utime() and task_stime() now return cputime_t instead of clock_t.
If CONFIG_VIRT_CPU_ACCOUNTING is set, task->utime and task->stime are
returned directly instead of using sum_exec_runtime.

The patch is tested on s390x with and without CONFIG_VIRT_CPU_ACCOUNTING
as well as on i386.

[ mingo@elte.hu: cleanups, comments. ]
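For reference, the scaling done by the !CONFIG_VIRT_CPU_ACCOUNTING
variant of task_utime() boils down to this (standalone sketch with
invented numbers; the kernel version uses do_div() for the 64-bit
division):

	static unsigned long example_utime(unsigned long runtime, /* nsec_to_clock_t(sum_exec_runtime) */
					   unsigned long utime,   /* cputime_to_clock_t(p->utime)      */
					   unsigned long stime)   /* cputime_to_clock_t(p->stime)      */
	{
		unsigned long total = utime + stime;

		if (!total)
			return runtime;	/* no samples yet: count it all as user time */
		/*
		 * Keep the precise sum_exec_runtime total; only the
		 * user/system split comes from the sampled values.
		 * E.g. runtime 1000 with a 300/100 split gives
		 * utime 750 - and task_stime() then reports
		 * stime = 1000 - 750 = 250, keeping the total
		 * monotonic for userspace.
		 */
		return runtime * utime / total;
	}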
Signed-off-by: Christian Borntraeger
Signed-off-by: Andrew Morton
Signed-off-by: Ingo Molnar
---
 fs/proc/array.c | 44 +++++++++++++++++++++++++++++---------------
 1 file changed, 29 insertions(+), 15 deletions(-)

diff --git a/fs/proc/array.c b/fs/proc/array.c
index 965625a0977d..ee4814dd98f9 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -320,7 +320,21 @@ int proc_pid_status(struct task_struct *task, char *buffer)
	return buffer - orig;
 }
 
-static clock_t task_utime(struct task_struct *p)
+/*
+ * Use precise platform statistics if available:
+ */
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+static cputime_t task_utime(struct task_struct *p)
+{
+	return p->utime;
+}
+
+static cputime_t task_stime(struct task_struct *p)
+{
+	return p->stime;
+}
+#else
+static cputime_t task_utime(struct task_struct *p)
 {
	clock_t utime = cputime_to_clock_t(p->utime),
		total = utime + cputime_to_clock_t(p->stime);
@@ -337,10 +351,10 @@
	}
	utime = (clock_t)temp;
 
-	return utime;
+	return clock_t_to_cputime(utime);
 }
 
-static clock_t task_stime(struct task_struct *p)
+static cputime_t task_stime(struct task_struct *p)
 {
	clock_t stime;
 
@@ -349,10 +363,12 @@
	 * the total, to make sure the total observed by userspace
	 * grows monotonically - apps rely on that):
	 */
-	stime = nsec_to_clock_t(p->se.sum_exec_runtime) - task_utime(p);
+	stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
+			cputime_to_clock_t(task_utime(p));
 
-	return stime;
+	return clock_t_to_cputime(stime);
 }
+#endif
 
 static int do_task_stat(struct task_struct *task, char *buffer, int whole)
 {
@@ -368,8 +384,7 @@
	unsigned long long start_time;
	unsigned long cmin_flt = 0, cmaj_flt = 0;
	unsigned long min_flt = 0, maj_flt = 0;
-	cputime_t cutime, cstime;
-	clock_t utime, stime;
+	cputime_t cutime, cstime, utime, stime;
	unsigned long rsslim = 0;
	char tcomm[sizeof(task->comm)];
	unsigned long flags;
@@ -387,8 +402,7 @@
	sigemptyset(&sigign);
	sigemptyset(&sigcatch);
 
-	cutime = cstime = cputime_zero;
-	utime = stime = 0;
+	cutime = cstime = utime = stime = cputime_zero;
 
	rcu_read_lock();
	if (lock_task_sighand(task, &flags)) {
@@ -414,15 +428,15 @@
			do {
				min_flt += t->min_flt;
				maj_flt += t->maj_flt;
-				utime += task_utime(t);
-				stime += task_stime(t);
+				utime = cputime_add(utime, task_utime(t));
+				stime = cputime_add(stime, task_stime(t));
				t = next_thread(t);
			} while (t != task);
 
			min_flt += sig->min_flt;
			maj_flt += sig->maj_flt;
-			utime += cputime_to_clock_t(sig->utime);
-			stime += cputime_to_clock_t(sig->stime);
+			utime = cputime_add(utime, sig->utime);
+			stime = cputime_add(stime, sig->stime);
		}
 
		sid = signal_session(sig);
@@ -471,8 +485,8 @@
		cmin_flt,
		maj_flt,
		cmaj_flt,
-		utime,
-		stime,
+		cputime_to_clock_t(utime),
+		cputime_to_clock_t(stime),
		cputime_to_clock_t(cutime),
		cputime_to_clock_t(cstime),
		priority,

From f8700df7c419781efb34696de7e7f49717f8ede7 Mon Sep 17 00:00:00 2001
From: Suresh Siddha
Date: Thu, 23 Aug 2007 15:18:02 +0200
Subject: [PATCH 4/6] sched: fix broken SMT/MC optimizations

On a four-package system with HT, the HT load-balancing optimizations
were broken. For example, if two tasks end up running on two logical
threads of one of the packages, the scheduler is not able to pull one
of the tasks to a completely idle package.

In this scenario, for nice-0 tasks, the imbalance calculated by the
scheduler will be 512 and find_busiest_queue() will return 0 (as each
cpu's load is 1024 > imbalance and each cpu has only one task running).

The MC scheduler optimizations are fixed by this patch in the same way.

[ mingo@elte.hu: restored fair balancing by increasing the fuzz and
  adding it back to the power decision, without the /2 factor. ]
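Back-of-the-envelope, with the values from this changelog (illustrative
numbers only, not code lifted from find_busiest_group()):

	#define SCHED_LOAD_SHIFT	10
	#define SCHED_LOAD_SCALE	(1L << SCHED_LOAD_SHIFT) /* 1024: one nice-0 task  */
	#define SCHED_LOAD_SCALE_FUZZ	SCHED_LOAD_SCALE	 /* was SCHED_LOAD_SCALE/2 */

	/*
	 * Two tasks on two siblings of one package, another package idle:
	 *
	 *	per-cpu load of the busy siblings	= 1024
	 *	calculated imbalance			=  512 (half a task)
	 *
	 * find_busiest_queue() refuses a runqueue whose single task's
	 * load (1024) exceeds the imbalance (512), since half a task
	 * cannot be migrated. With the fuzz raised to a full
	 * SCHED_LOAD_SCALE, the small-imbalance fixup rounds the
	 * imbalance up to one whole task, so the idle package can
	 * pull one of the two tasks over.
	 */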
Signed-off-by: Suresh Siddha
Signed-off-by: Ingo Molnar
---
 include/linux/sched.h | 2 +-
 kernel/sched.c        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1845b2e99a87..ba78807eab91 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -681,7 +681,7 @@ enum cpu_idle_type {
 #define SCHED_LOAD_SHIFT	10
 #define SCHED_LOAD_SCALE	(1L << SCHED_LOAD_SHIFT)
 
-#define SCHED_LOAD_SCALE_FUZZ	(SCHED_LOAD_SCALE >> 1)
+#define SCHED_LOAD_SCALE_FUZZ	SCHED_LOAD_SCALE
 
 #ifdef CONFIG_SMP
 #define SD_LOAD_BALANCE		1	/* Do load balancing on this domain. */
diff --git a/kernel/sched.c b/kernel/sched.c
index 5fecbbba12ac..d96030db8ff7 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2517,7 +2517,7 @@ group_next:
	 * a think about bumping its value to force at least one task to be
	 * moved
	 */
-	if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task/2) {
+	if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task) {
		unsigned long tmp, pwr_now, pwr_move;
		unsigned int imbn;

From f549da848eca595abca14ebc5e1bf00fd72aa53d Mon Sep 17 00:00:00 2001
From: Suresh Siddha
Date: Thu, 23 Aug 2007 15:18:02 +0200
Subject: [PATCH 5/6] sched: skip updating rq's next_balance under null SD

Was playing with sched_smt_power_savings/sched_mc_power_savings and
found out that while the scheduler domains are reconstructed when the
sysfs settings change, rebalance_domains() can get triggered with a
null domain on other cpus, setting next_balance to jiffies + 60*HZ
and resulting in no idle/busy balancing for 60 seconds. Fix this.
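The failure mode, sketched (simplified from rebalance_domains(); the
update_next_balance guard flag is what this patch introduces):

	unsigned long next_balance = jiffies + 60*HZ;	/* pessimistic default */
	int update_next_balance = 0;

	for_each_domain(cpu, sd) {
		/* zero iterations when the cpu sits on a null domain */

		/* ( load-balancing work elided ) */

		if (time_after(next_balance, sd->last_balance + interval)) {
			next_balance = sd->last_balance + interval;
			update_next_balance = 1;
		}
	}

	/* only commit the value if some domain actually refined it: */
	if (likely(update_next_balance))
		rq->next_balance = next_balance;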
Signed-off-by: Suresh Siddha
Signed-off-by: Ingo Molnar
---
 kernel/sched.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index d96030db8ff7..a4b22d93e00d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3043,6 +3043,7 @@ static inline void rebalance_domains(int cpu, enum cpu_idle_type idle)
	struct sched_domain *sd;
	/* Earliest time when we have to do rebalance again */
	unsigned long next_balance = jiffies + 60*HZ;
+	int update_next_balance = 0;
 
	for_each_domain(cpu, sd) {
		if (!(sd->flags & SD_LOAD_BALANCE))
@@ -3079,8 +3080,10 @@
		if (sd->flags & SD_SERIALIZE)
			spin_unlock(&balancing);
 out:
-		if (time_after(next_balance, sd->last_balance + interval))
+		if (time_after(next_balance, sd->last_balance + interval)) {
			next_balance = sd->last_balance + interval;
+			update_next_balance = 1;
+		}
 
		/*
		 * Stop the load balance at this level. There is another
@@ -3090,7 +3093,14 @@ out:
		if (!balance)
			break;
	}
-	rq->next_balance = next_balance;
+
+	/*
+	 * next_balance will be updated only when there is a need.
+	 * When the cpu is attached to the null domain, for example,
+	 * it will not be updated.
+	 */
+	if (likely(update_next_balance))
+		rq->next_balance = next_balance;
 }
 
 /*

From 505c0efd58031923ae01deac16d896607cafa70e Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Thu, 23 Aug 2007 15:18:02 +0200
Subject: [PATCH 6/6] sched: tweak the sched_runtime_limit tunable

Michael Gerdau reported reniced task CPU usage weirdnesses.
Such symptoms can be caused by limit underruns, so double the
sched_runtime_limit.

Signed-off-by: Ingo Molnar
---
 kernel/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index a4b22d93e00d..96e9b82246d2 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4923,7 +4923,7 @@ static inline void sched_init_granularity(void)
	if (sysctl_sched_granularity > gran_limit)
		sysctl_sched_granularity = gran_limit;
 
-	sysctl_sched_runtime_limit = sysctl_sched_granularity * 4;
+	sysctl_sched_runtime_limit = sysctl_sched_granularity * 8;
	sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2;
 }