From 0b1007c3578569469a6fab6ae5cca918ccdc3ee1 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 2 Jun 2011 11:13:59 +0200
Subject: [PATCH 01/39] ptrace: remove silly wait_trap variable from
 ptrace_attach()

Remove local variable wait_trap which determines whether to wait for
!TRAPPING or not and simply wait for it if attach was successful.

-v2: Oleg pointed out wait should happen iff attach was successful.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
 kernel/ptrace.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 2df115790cd9..4f689cb739a3 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -184,7 +184,6 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode)
 
 static int ptrace_attach(struct task_struct *task)
 {
-	bool wait_trap = false;
 	int retval;
 
 	audit_ptrace(task);
@@ -246,7 +245,6 @@ static int ptrace_attach(struct task_struct *task)
 	if (task_is_stopped(task)) {
 		task->group_stop |= GROUP_STOP_PENDING | GROUP_STOP_TRAPPING;
 		signal_wake_up(task, 1);
-		wait_trap = true;
 	}
 
 	spin_unlock(&task->sighand->siglock);
@@ -257,7 +255,7 @@ unlock_tasklist:
 unlock_creds:
 	mutex_unlock(&task->signal->cred_guard_mutex);
 out:
-	if (wait_trap)
+	if (!retval)
 		wait_event(current->signal->wait_chldexit,
 			   !(task->group_stop & GROUP_STOP_TRAPPING));
 	return retval;

From a8f072c1d624a627b67f2ace2f0c25d856ef4e54 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 2 Jun 2011 11:13:59 +0200
Subject: [PATCH 02/39] job control: rename signal->group_stop and flags to
 jobctl and update them

signal->group_stop currently hosts mostly group stop related flags;
however, it's gonna be used for wider purposes and the GROUP_STOP_
flag prefix becomes confusing.  Rename signal->group_stop to
signal->jobctl and rename all GROUP_STOP_* flags to JOBCTL_*.

Bit position macros JOBCTL_*_BIT are defined and JOBCTL_* flags are
defined in terms of them to allow using bitops later.

While at it, reassign JOBCTL_TRAPPING to bit 22 to better accomodate
future additions.

This doesn't cause any functional change.

-v2: JOBCTL_*_BIT macros added as suggested by Linus.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
 fs/exec.c             |  2 +-
 include/linux/sched.h | 22 +++++++----
 kernel/ptrace.c       | 12 +++---
 kernel/signal.c       | 91 ++++++++++++++++++++++---------------------
 4 files changed, 67 insertions(+), 60 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index ea5f748906a8..8986bb0f9dc2 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1772,7 +1772,7 @@ static int zap_process(struct task_struct *start, int exit_code)
 
 	t = start;
 	do {
-		task_clear_group_stop_pending(t);
+		task_clear_jobctl_stop_pending(t);
 		if (t != current && t->mm) {
 			sigaddset(&t->pending.signal, SIGKILL);
 			signal_wake_up(t, 1);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2a8621c4be1e..b0dd064eb4fc 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1282,7 +1282,7 @@ struct task_struct {
 	int exit_state;
 	int exit_code, exit_signal;
 	int pdeath_signal;  /*  The signal sent when the parent dies  */
-	unsigned int group_stop;	/* GROUP_STOP_*, siglock protected */
+	unsigned int jobctl;	/* JOBCTL_*, siglock protected */
 	/* ??? */
 	unsigned int personality;
 	unsigned did_exec:1;
@@ -1803,15 +1803,21 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *
 #define used_math() tsk_used_math(current)
 
 /*
- * task->group_stop flags
+ * task->jobctl flags
  */
-#define GROUP_STOP_SIGMASK	0xffff    /* signr of the last group stop */
-#define GROUP_STOP_PENDING	(1 << 16) /* task should stop for group stop */
-#define GROUP_STOP_CONSUME	(1 << 17) /* consume group stop count */
-#define GROUP_STOP_TRAPPING	(1 << 18) /* switching from STOPPED to TRACED */
-#define GROUP_STOP_DEQUEUED	(1 << 19) /* stop signal dequeued */
+#define JOBCTL_STOP_SIGMASK	0xffff	/* signr of the last group stop */
 
-extern void task_clear_group_stop_pending(struct task_struct *task);
+#define JOBCTL_STOP_DEQUEUED_BIT 16	/* stop signal dequeued */
+#define JOBCTL_STOP_PENDING_BIT	17	/* task should stop for group stop */
+#define JOBCTL_STOP_CONSUME_BIT	18	/* consume group stop count */
+#define JOBCTL_TRAPPING_BIT	21	/* switching to TRACED */
+
+#define JOBCTL_STOP_DEQUEUED	(1 << JOBCTL_STOP_DEQUEUED_BIT)
+#define JOBCTL_STOP_PENDING	(1 << JOBCTL_STOP_PENDING_BIT)
+#define JOBCTL_STOP_CONSUME	(1 << JOBCTL_STOP_CONSUME_BIT)
+#define JOBCTL_TRAPPING		(1 << JOBCTL_TRAPPING_BIT)
+
+extern void task_clear_jobctl_stop_pending(struct task_struct *task);
 
 #ifdef CONFIG_PREEMPT_RCU
 
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 4f689cb739a3..134f34cb142b 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -77,13 +77,13 @@ void __ptrace_unlink(struct task_struct *child)
 	spin_lock(&child->sighand->siglock);
 
 	/*
-	 * Reinstate GROUP_STOP_PENDING if group stop is in effect and
+	 * Reinstate JOBCTL_STOP_PENDING if group stop is in effect and
 	 * @child isn't dead.
 	 */
 	if (!(child->flags & PF_EXITING) &&
 	    (child->signal->flags & SIGNAL_STOP_STOPPED ||
 	     child->signal->group_stop_count))
-		child->group_stop |= GROUP_STOP_PENDING;
+		child->jobctl |= JOBCTL_STOP_PENDING;
 
 	/*
 	 * If transition to TASK_STOPPED is pending or in TASK_TRACED, kick
@@ -91,7 +91,7 @@ void __ptrace_unlink(struct task_struct *child)
 	 * is in TASK_TRACED; otherwise, we might unduly disrupt
 	 * TASK_KILLABLE sleeps.
 	 */
-	if (child->group_stop & GROUP_STOP_PENDING || task_is_traced(child))
+	if (child->jobctl & JOBCTL_STOP_PENDING || task_is_traced(child))
 		signal_wake_up(child, task_is_traced(child));
 
 	spin_unlock(&child->sighand->siglock);
@@ -226,7 +226,7 @@ static int ptrace_attach(struct task_struct *task)
 	spin_lock(&task->sighand->siglock);
 
 	/*
-	 * If the task is already STOPPED, set GROUP_STOP_PENDING and
+	 * If the task is already STOPPED, set JOBCTL_STOP_PENDING and
 	 * TRAPPING, and kick it so that it transits to TRACED.  TRAPPING
 	 * will be cleared if the child completes the transition or any
 	 * event which clears the group stop states happens.  We'll wait
@@ -243,7 +243,7 @@ static int ptrace_attach(struct task_struct *task)
 	 * in and out of STOPPED are protected by siglock.
 	 */
 	if (task_is_stopped(task)) {
-		task->group_stop |= GROUP_STOP_PENDING | GROUP_STOP_TRAPPING;
+		task->jobctl |= JOBCTL_STOP_PENDING | JOBCTL_TRAPPING;
 		signal_wake_up(task, 1);
 	}
 
@@ -257,7 +257,7 @@ unlock_creds:
 out:
 	if (!retval)
 		wait_event(current->signal->wait_chldexit,
-			   !(task->group_stop & GROUP_STOP_TRAPPING));
+			   !(task->jobctl & JOBCTL_TRAPPING));
 	return retval;
 }
 
diff --git a/kernel/signal.c b/kernel/signal.c
index 86c32b884f8e..ab6851c06461 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -124,7 +124,7 @@ static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked)
 
 static int recalc_sigpending_tsk(struct task_struct *t)
 {
-	if ((t->group_stop & GROUP_STOP_PENDING) ||
+	if ((t->jobctl & JOBCTL_STOP_PENDING) ||
 	    PENDING(&t->pending, &t->blocked) ||
 	    PENDING(&t->signal->shared_pending, &t->blocked)) {
 		set_tsk_thread_flag(t, TIF_SIGPENDING);
@@ -224,27 +224,28 @@ static inline void print_dropped_signal(int sig)
 }
 
 /**
- * task_clear_group_stop_trapping - clear group stop trapping bit
+ * task_clear_jobctl_trapping - clear jobctl trapping bit
  * @task: target task
  *
- * If GROUP_STOP_TRAPPING is set, a ptracer is waiting for us.  Clear it
- * and wake up the ptracer.  Note that we don't need any further locking.
- * @task->siglock guarantees that @task->parent points to the ptracer.
+ * If JOBCTL_TRAPPING is set, a ptracer is waiting for us to enter TRACED.
+ * Clear it and wake up the ptracer.  Note that we don't need any further
+ * locking.  @task->siglock guarantees that @task->parent points to the
+ * ptracer.
  *
  * CONTEXT:
  * Must be called with @task->sighand->siglock held.
  */
-static void task_clear_group_stop_trapping(struct task_struct *task)
+static void task_clear_jobctl_trapping(struct task_struct *task)
 {
-	if (unlikely(task->group_stop & GROUP_STOP_TRAPPING)) {
-		task->group_stop &= ~GROUP_STOP_TRAPPING;
+	if (unlikely(task->jobctl & JOBCTL_TRAPPING)) {
+		task->jobctl &= ~JOBCTL_TRAPPING;
 		__wake_up_sync_key(&task->parent->signal->wait_chldexit,
 				   TASK_UNINTERRUPTIBLE, 1, task);
 	}
 }
 
 /**
- * task_clear_group_stop_pending - clear pending group stop
+ * task_clear_jobctl_stop_pending - clear pending group stop
  * @task: target task
  *
  * Clear group stop states for @task.
@@ -252,19 +253,19 @@ static void task_clear_group_stop_trapping(struct task_struct *task)
  * CONTEXT:
  * Must be called with @task->sighand->siglock held.
  */
-void task_clear_group_stop_pending(struct task_struct *task)
+void task_clear_jobctl_stop_pending(struct task_struct *task)
 {
-	task->group_stop &= ~(GROUP_STOP_PENDING | GROUP_STOP_CONSUME |
-			      GROUP_STOP_DEQUEUED);
+	task->jobctl &= ~(JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME |
+			  JOBCTL_STOP_DEQUEUED);
 }
 
 /**
  * task_participate_group_stop - participate in a group stop
  * @task: task participating in a group stop
  *
- * @task has GROUP_STOP_PENDING set and is participating in a group stop.
+ * @task has %JOBCTL_STOP_PENDING set and is participating in a group stop.
  * Group stop states are cleared and the group stop count is consumed if
- * %GROUP_STOP_CONSUME was set.  If the consumption completes the group
+ * %JOBCTL_STOP_CONSUME was set.  If the consumption completes the group
  * stop, the appropriate %SIGNAL_* flags are set.
  *
  * CONTEXT:
@@ -277,11 +278,11 @@ void task_clear_group_stop_pending(struct task_struct *task)
 static bool task_participate_group_stop(struct task_struct *task)
 {
 	struct signal_struct *sig = task->signal;
-	bool consume = task->group_stop & GROUP_STOP_CONSUME;
+	bool consume = task->jobctl & JOBCTL_STOP_CONSUME;
 
-	WARN_ON_ONCE(!(task->group_stop & GROUP_STOP_PENDING));
+	WARN_ON_ONCE(!(task->jobctl & JOBCTL_STOP_PENDING));
 
-	task_clear_group_stop_pending(task);
+	task_clear_jobctl_stop_pending(task);
 
 	if (!consume)
 		return false;
@@ -604,7 +605,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
 		 * is to alert stop-signal processing code when another
 		 * processor has come along and cleared the flag.
 		 */
-		current->group_stop |= GROUP_STOP_DEQUEUED;
+		current->jobctl |= JOBCTL_STOP_DEQUEUED;
 	}
 	if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) {
 		/*
@@ -809,7 +810,7 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns)
 		rm_from_queue(SIG_KERNEL_STOP_MASK, &signal->shared_pending);
 		t = p;
 		do {
-			task_clear_group_stop_pending(t);
+			task_clear_jobctl_stop_pending(t);
 			rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending);
 			wake_up_state(t, __TASK_STOPPED);
 		} while_each_thread(p, t);
@@ -925,7 +926,7 @@ static void complete_signal(int sig, struct task_struct *p, int group)
 			signal->group_stop_count = 0;
 			t = p;
 			do {
-				task_clear_group_stop_pending(t);
+				task_clear_jobctl_stop_pending(t);
 				sigaddset(&t->pending.signal, SIGKILL);
 				signal_wake_up(t, 1);
 			} while_each_thread(p, t);
@@ -1160,7 +1161,7 @@ int zap_other_threads(struct task_struct *p)
 	p->signal->group_stop_count = 0;
 
 	while_each_thread(p, t) {
-		task_clear_group_stop_pending(t);
+		task_clear_jobctl_stop_pending(t);
 		count++;
 
 		/* Don't bother with already dead threads */
@@ -1738,7 +1739,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
 	 * clear now.  We act as if SIGCONT is received after TASK_TRACED
 	 * is entered - ignore it.
 	 */
-	if (why == CLD_STOPPED && (current->group_stop & GROUP_STOP_PENDING))
+	if (why == CLD_STOPPED && (current->jobctl & JOBCTL_STOP_PENDING))
 		gstop_done = task_participate_group_stop(current);
 
 	current->last_siginfo = info;
@@ -1751,12 +1752,12 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
 	set_current_state(TASK_TRACED);
 
 	/*
-	 * We're committing to trapping.  Clearing GROUP_STOP_TRAPPING and
+	 * We're committing to trapping.  Clearing JOBCTL_TRAPPING and
 	 * transition to TASK_TRACED should be atomic with respect to
-	 * siglock.  This hsould be done after the arch hook as siglock is
+	 * siglock.  This should be done after the arch hook as siglock is
 	 * released and regrabbed across it.
 	 */
-	task_clear_group_stop_trapping(current);
+	task_clear_jobctl_trapping(current);
 
 	spin_unlock_irq(&current->sighand->siglock);
 	read_lock(&tasklist_lock);
@@ -1792,9 +1793,9 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
 		 *
 		 * If @gstop_done, the ptracer went away between group stop
 		 * completion and here.  During detach, it would have set
-		 * GROUP_STOP_PENDING on us and we'll re-enter TASK_STOPPED
-		 * in do_signal_stop() on return, so notifying the real
-		 * parent of the group stop completion is enough.
+		 * JOBCTL_STOP_PENDING on us and we'll re-enter
+		 * TASK_STOPPED in do_signal_stop() on return, so notifying
+		 * the real parent of the group stop completion is enough.
 		 */
 		if (gstop_done)
 			do_notify_parent_cldstop(current, false, why);
@@ -1856,14 +1857,14 @@ static int do_signal_stop(int signr)
 {
 	struct signal_struct *sig = current->signal;
 
-	if (!(current->group_stop & GROUP_STOP_PENDING)) {
-		unsigned int gstop = GROUP_STOP_PENDING | GROUP_STOP_CONSUME;
+	if (!(current->jobctl & JOBCTL_STOP_PENDING)) {
+		unsigned int gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME;
 		struct task_struct *t;
 
-		/* signr will be recorded in task->group_stop for retries */
-		WARN_ON_ONCE(signr & ~GROUP_STOP_SIGMASK);
+		/* signr will be recorded in task->jobctl for retries */
+		WARN_ON_ONCE(signr & ~JOBCTL_STOP_SIGMASK);
 
-		if (!likely(current->group_stop & GROUP_STOP_DEQUEUED) ||
+		if (!likely(current->jobctl & JOBCTL_STOP_DEQUEUED) ||
 		    unlikely(signal_group_exit(sig)))
 			return 0;
 		/*
@@ -1890,19 +1891,19 @@ static int do_signal_stop(int signr)
 		else
 			WARN_ON_ONCE(!task_ptrace(current));
 
-		current->group_stop &= ~GROUP_STOP_SIGMASK;
-		current->group_stop |= signr | gstop;
+		current->jobctl &= ~JOBCTL_STOP_SIGMASK;
+		current->jobctl |= signr | gstop;
 		sig->group_stop_count = 1;
 		for (t = next_thread(current); t != current;
 		     t = next_thread(t)) {
-			t->group_stop &= ~GROUP_STOP_SIGMASK;
+			t->jobctl &= ~JOBCTL_STOP_SIGMASK;
 			/*
 			 * Setting state to TASK_STOPPED for a group
 			 * stop is always done with the siglock held,
 			 * so this check has no races.
 			 */
 			if (!(t->flags & PF_EXITING) && !task_is_stopped(t)) {
-				t->group_stop |= signr | gstop;
+				t->jobctl |= signr | gstop;
 				sig->group_stop_count++;
 				signal_wake_up(t, 0);
 			}
@@ -1943,23 +1944,23 @@ retry:
 
 		spin_lock_irq(&current->sighand->siglock);
 	} else {
-		ptrace_stop(current->group_stop & GROUP_STOP_SIGMASK,
+		ptrace_stop(current->jobctl & JOBCTL_STOP_SIGMASK,
 			    CLD_STOPPED, 0, NULL);
 		current->exit_code = 0;
 	}
 
 	/*
-	 * GROUP_STOP_PENDING could be set if another group stop has
+	 * JOBCTL_STOP_PENDING could be set if another group stop has
 	 * started since being woken up or ptrace wants us to transit
 	 * between TASK_STOPPED and TRACED.  Retry group stop.
 	 */
-	if (current->group_stop & GROUP_STOP_PENDING) {
-		WARN_ON_ONCE(!(current->group_stop & GROUP_STOP_SIGMASK));
+	if (current->jobctl & JOBCTL_STOP_PENDING) {
+		WARN_ON_ONCE(!(current->jobctl & JOBCTL_STOP_SIGMASK));
 		goto retry;
 	}
 
 	/* PTRACE_ATTACH might have raced with task killing, clear trapping */
-	task_clear_group_stop_trapping(current);
+	task_clear_jobctl_trapping(current);
 
 	spin_unlock_irq(&current->sighand->siglock);
 
@@ -2078,8 +2079,8 @@ relock:
 		if (unlikely(signr != 0))
 			ka = return_ka;
 		else {
-			if (unlikely(current->group_stop &
-				     GROUP_STOP_PENDING) && do_signal_stop(0))
+			if (unlikely(current->jobctl & JOBCTL_STOP_PENDING) &&
+			    do_signal_stop(0))
 				goto relock;
 
 			signr = dequeue_signal(current, &current->blocked,
@@ -2253,7 +2254,7 @@ void exit_signals(struct task_struct *tsk)
 	signotset(&unblocked);
 	retarget_shared_pending(tsk, &unblocked);
 
-	if (unlikely(tsk->group_stop & GROUP_STOP_PENDING) &&
+	if (unlikely(tsk->jobctl & JOBCTL_STOP_PENDING) &&
 	    task_participate_group_stop(tsk))
 		group_stop = CLD_STOPPED;
 out:

From 755e276b3326f300585435d2f3876e66e248c476 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 2 Jun 2011 11:13:59 +0200
Subject: [PATCH 03/39] ptrace: ptrace_check_attach(): rename @kill to
 @ignore_state and add comments

PTRACE_INTERRUPT is going to be added which should also skip
task_is_traced() check in ptrace_check_attach().  Rename @kill to
@ignore_state and make it bool.  Add function comment while at it.

This patch doesn't introduce any behavior difference.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
 include/linux/ptrace.h |  2 +-
 kernel/ptrace.c        | 24 +++++++++++++++++++-----
 2 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index 9178d5cc0b01..e93ef1a54fc7 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -105,7 +105,7 @@ extern long arch_ptrace(struct task_struct *child, long request,
 extern int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len);
 extern int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long dst, int len);
 extern void ptrace_disable(struct task_struct *);
-extern int ptrace_check_attach(struct task_struct *task, int kill);
+extern int ptrace_check_attach(struct task_struct *task, bool ignore_state);
 extern int ptrace_request(struct task_struct *child, long request,
 			  unsigned long addr, unsigned long data);
 extern void ptrace_notify(int exit_code);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 134f34cb142b..eb191116edf7 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -97,10 +97,24 @@ void __ptrace_unlink(struct task_struct *child)
 	spin_unlock(&child->sighand->siglock);
 }
 
-/*
- * Check that we have indeed attached to the thing..
+/**
+ * ptrace_check_attach - check whether ptracee is ready for ptrace operation
+ * @child: ptracee to check for
+ * @ignore_state: don't check whether @child is currently %TASK_TRACED
+ *
+ * Check whether @child is being ptraced by %current and ready for further
+ * ptrace operations.  If @ignore_state is %false, @child also should be in
+ * %TASK_TRACED state and on return the child is guaranteed to be traced
+ * and not executing.  If @ignore_state is %true, @child can be in any
+ * state.
+ *
+ * CONTEXT:
+ * Grabs and releases tasklist_lock and @child->sighand->siglock.
+ *
+ * RETURNS:
+ * 0 on success, -ESRCH if %child is not ready.
  */
-int ptrace_check_attach(struct task_struct *child, int kill)
+int ptrace_check_attach(struct task_struct *child, bool ignore_state)
 {
 	int ret = -ESRCH;
 
@@ -119,13 +133,13 @@ int ptrace_check_attach(struct task_struct *child, int kill)
 		 */
 		spin_lock_irq(&child->sighand->siglock);
 		WARN_ON_ONCE(task_is_stopped(child));
-		if (task_is_traced(child) || kill)
+		if (task_is_traced(child) || ignore_state)
 			ret = 0;
 		spin_unlock_irq(&child->sighand->siglock);
 	}
 	read_unlock(&tasklist_lock);
 
-	if (!ret && !kill)
+	if (!ret && !ignore_state)
 		ret = wait_task_inactive(child, TASK_TRACED) ? 0 : -ESRCH;
 
 	/* All systems go.. */

From 81be24b8cdeb69e62f9d1b6b425fd9ffdd37f581 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 2 Jun 2011 11:13:59 +0200
Subject: [PATCH 04/39] ptrace: relocate set_current_state(TASK_TRACED) in
 ptrace_stop()

In ptrace_stop(), after arch hook is done, the task state and jobctl
bits are updated while holding siglock.  The ordering requirement
there is that TASK_TRACED is set before JOBCTL_TRAPPING is cleared to
prevent ptracer waiting on TRAPPING doesn't end up waking up TRACED is
actually set and sees TASK_RUNNING in wait(2).

Move set_current_state(TASK_TRACED) to the top of the block and
reorganize comments.  This makes the ordering more obvious
(TASK_TRACED before other updates) and helps future updates to group
stop participation.

This patch doesn't cause any functional change.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
 kernel/signal.c | 28 +++++++++++++---------------
 1 file changed, 13 insertions(+), 15 deletions(-)

diff --git a/kernel/signal.c b/kernel/signal.c
index ab6851c06461..62a6c3bb9f0d 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1732,6 +1732,18 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
 			return;
 	}
 
+	/*
+	 * We're committing to trapping.  TRACED should be visible before
+	 * TRAPPING is cleared; otherwise, the tracer might fail do_wait().
+	 * Also, transition to TRACED and updates to ->jobctl should be
+	 * atomic with respect to siglock and should be done after the arch
+	 * hook as siglock is released and regrabbed across it.
+	 */
+	set_current_state(TASK_TRACED);
+
+	current->last_siginfo = info;
+	current->exit_code = exit_code;
+
 	/*
 	 * If @why is CLD_STOPPED, we're trapping to participate in a group
 	 * stop.  Do the bookkeeping.  Note that if SIGCONT was delievered
@@ -1742,21 +1754,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
 	if (why == CLD_STOPPED && (current->jobctl & JOBCTL_STOP_PENDING))
 		gstop_done = task_participate_group_stop(current);
 
-	current->last_siginfo = info;
-	current->exit_code = exit_code;
-
-	/*
-	 * TRACED should be visible before TRAPPING is cleared; otherwise,
-	 * the tracer might fail do_wait().
-	 */
-	set_current_state(TASK_TRACED);
-
-	/*
-	 * We're committing to trapping.  Clearing JOBCTL_TRAPPING and
-	 * transition to TASK_TRACED should be atomic with respect to
-	 * siglock.  This should be done after the arch hook as siglock is
-	 * released and regrabbed across it.
-	 */
+	/* entering a trap, clear TRAPPING */
 	task_clear_jobctl_trapping(current);
 
 	spin_unlock_irq(&current->sighand->siglock);

From 3759a0d94c18764247b66511d1038f2b93aa95de Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 2 Jun 2011 11:14:00 +0200
Subject: [PATCH 05/39] job control: introduce JOBCTL_PENDING_MASK and
 task_clear_jobctl_pending()

This patch introduces JOBCTL_PENDING_MASK and replaces
task_clear_jobctl_stop_pending() with task_clear_jobctl_pending()
which takes an extra @mask argument.

JOBCTL_PENDING_MASK is currently equal to JOBCTL_STOP_PENDING but
future patches will add more bits.  recalc_sigpending_tsk() is updated
to use JOBCTL_PENDING_MASK instead.

task_clear_jobctl_pending() takes @mask which in subset of
JOBCTL_PENDING_MASK and clears the relevant jobctl bits.  If
JOBCTL_STOP_PENDING is set, other STOP bits are cleared together.  All
task_clear_jobctl_stop_pending() users are updated to call
task_clear_jobctl_pending() with JOBCTL_STOP_PENDING which is
functionally identical to task_clear_jobctl_stop_pending().

This patch doesn't cause any functional change.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
 fs/exec.c             |  2 +-
 include/linux/sched.h |  5 ++++-
 kernel/signal.c       | 27 +++++++++++++++++----------
 3 files changed, 22 insertions(+), 12 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index 8986bb0f9dc2..4402105287cb 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1772,7 +1772,7 @@ static int zap_process(struct task_struct *start, int exit_code)
 
 	t = start;
 	do {
-		task_clear_jobctl_stop_pending(t);
+		task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING);
 		if (t != current && t->mm) {
 			sigaddset(&t->pending.signal, SIGKILL);
 			signal_wake_up(t, 1);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b0dd064eb4fc..5a958b17f9fe 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1817,7 +1817,10 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *
 #define JOBCTL_STOP_CONSUME	(1 << JOBCTL_STOP_CONSUME_BIT)
 #define JOBCTL_TRAPPING		(1 << JOBCTL_TRAPPING_BIT)
 
-extern void task_clear_jobctl_stop_pending(struct task_struct *task);
+#define JOBCTL_PENDING_MASK	JOBCTL_STOP_PENDING
+
+extern void task_clear_jobctl_pending(struct task_struct *task,
+				      unsigned int mask);
 
 #ifdef CONFIG_PREEMPT_RCU
 
diff --git a/kernel/signal.c b/kernel/signal.c
index 62a6c3bb9f0d..288d952fa3b8 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -124,7 +124,7 @@ static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked)
 
 static int recalc_sigpending_tsk(struct task_struct *t)
 {
-	if ((t->jobctl & JOBCTL_STOP_PENDING) ||
+	if ((t->jobctl & JOBCTL_PENDING_MASK) ||
 	    PENDING(&t->pending, &t->blocked) ||
 	    PENDING(&t->signal->shared_pending, &t->blocked)) {
 		set_tsk_thread_flag(t, TIF_SIGPENDING);
@@ -245,18 +245,25 @@ static void task_clear_jobctl_trapping(struct task_struct *task)
 }
 
 /**
- * task_clear_jobctl_stop_pending - clear pending group stop
+ * task_clear_jobctl_pending - clear jobctl pending bits
  * @task: target task
+ * @mask: pending bits to clear
  *
- * Clear group stop states for @task.
+ * Clear @mask from @task->jobctl.  @mask must be subset of
+ * %JOBCTL_PENDING_MASK.  If %JOBCTL_STOP_PENDING is being cleared, other
+ * STOP bits are cleared together.
  *
  * CONTEXT:
  * Must be called with @task->sighand->siglock held.
  */
-void task_clear_jobctl_stop_pending(struct task_struct *task)
+void task_clear_jobctl_pending(struct task_struct *task, unsigned int mask)
 {
-	task->jobctl &= ~(JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME |
-			  JOBCTL_STOP_DEQUEUED);
+	BUG_ON(mask & ~JOBCTL_PENDING_MASK);
+
+	if (mask & JOBCTL_STOP_PENDING)
+		mask |= JOBCTL_STOP_CONSUME | JOBCTL_STOP_DEQUEUED;
+
+	task->jobctl &= ~mask;
 }
 
 /**
@@ -282,7 +289,7 @@ static bool task_participate_group_stop(struct task_struct *task)
 
 	WARN_ON_ONCE(!(task->jobctl & JOBCTL_STOP_PENDING));
 
-	task_clear_jobctl_stop_pending(task);
+	task_clear_jobctl_pending(task, JOBCTL_STOP_PENDING);
 
 	if (!consume)
 		return false;
@@ -810,7 +817,7 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns)
 		rm_from_queue(SIG_KERNEL_STOP_MASK, &signal->shared_pending);
 		t = p;
 		do {
-			task_clear_jobctl_stop_pending(t);
+			task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING);
 			rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending);
 			wake_up_state(t, __TASK_STOPPED);
 		} while_each_thread(p, t);
@@ -926,7 +933,7 @@ static void complete_signal(int sig, struct task_struct *p, int group)
 			signal->group_stop_count = 0;
 			t = p;
 			do {
-				task_clear_jobctl_stop_pending(t);
+				task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING);
 				sigaddset(&t->pending.signal, SIGKILL);
 				signal_wake_up(t, 1);
 			} while_each_thread(p, t);
@@ -1161,7 +1168,7 @@ int zap_other_threads(struct task_struct *p)
 	p->signal->group_stop_count = 0;
 
 	while_each_thread(p, t) {
-		task_clear_jobctl_stop_pending(t);
+		task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING);
 		count++;
 
 		/* Don't bother with already dead threads */

From 6dfca32984237a8a011b5bf367e53341a265b2a4 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 2 Jun 2011 11:14:00 +0200
Subject: [PATCH 06/39] job control: make task_clear_jobctl_pending() clear
 TRAPPING automatically

JOBCTL_TRAPPING indicates that ptracer is waiting for tracee to
(re)transit into TRACED.  task_clear_jobctl_pending() must be called
when either tracee enters TRACED or the transition is cancelled for
some reason.  The former is achieved by explicitly calling
task_clear_jobctl_pending() in ptrace_stop() and the latter by calling
it at the end of do_signal_stop().

Calling task_clear_jobctl_trapping() at the end of do_signal_stop()
limits the scope TRAPPING can be used and is fragile in that seemingly
unrelated changes to tracee's control flow can lead to stuck TRAPPING.

We already have task_clear_jobctl_pending() calls on those cancelling
events to clear JOBCTL_STOP_PENDING.  Cancellations can be handled by
making those call sites use JOBCTL_PENDING_MASK instead and updating
task_clear_jobctl_pending() such that task_clear_jobctl_trapping() is
called automatically if no stop/trap is pending.

This patch makes the above changes and removes the fallback
task_clear_jobctl_trapping() call from do_signal_stop().

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
 fs/exec.c       |  2 +-
 kernel/signal.c | 13 ++++++++-----
 2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index 4402105287cb..a9f2b3631bdb 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1772,7 +1772,7 @@ static int zap_process(struct task_struct *start, int exit_code)
 
 	t = start;
 	do {
-		task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING);
+		task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
 		if (t != current && t->mm) {
 			sigaddset(&t->pending.signal, SIGKILL);
 			signal_wake_up(t, 1);
diff --git a/kernel/signal.c b/kernel/signal.c
index 288d952fa3b8..637a171b65b6 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -253,6 +253,9 @@ static void task_clear_jobctl_trapping(struct task_struct *task)
  * %JOBCTL_PENDING_MASK.  If %JOBCTL_STOP_PENDING is being cleared, other
  * STOP bits are cleared together.
  *
+ * If clearing of @mask leaves no stop or trap pending, this function calls
+ * task_clear_jobctl_trapping().
+ *
  * CONTEXT:
  * Must be called with @task->sighand->siglock held.
  */
@@ -264,6 +267,9 @@ void task_clear_jobctl_pending(struct task_struct *task, unsigned int mask)
 		mask |= JOBCTL_STOP_CONSUME | JOBCTL_STOP_DEQUEUED;
 
 	task->jobctl &= ~mask;
+
+	if (!(task->jobctl & JOBCTL_PENDING_MASK))
+		task_clear_jobctl_trapping(task);
 }
 
 /**
@@ -933,7 +939,7 @@ static void complete_signal(int sig, struct task_struct *p, int group)
 			signal->group_stop_count = 0;
 			t = p;
 			do {
-				task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING);
+				task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
 				sigaddset(&t->pending.signal, SIGKILL);
 				signal_wake_up(t, 1);
 			} while_each_thread(p, t);
@@ -1168,7 +1174,7 @@ int zap_other_threads(struct task_struct *p)
 	p->signal->group_stop_count = 0;
 
 	while_each_thread(p, t) {
-		task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING);
+		task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
 		count++;
 
 		/* Don't bother with already dead threads */
@@ -1964,9 +1970,6 @@ retry:
 		goto retry;
 	}
 
-	/* PTRACE_ATTACH might have raced with task killing, clear trapping */
-	task_clear_jobctl_trapping(current);
-
 	spin_unlock_irq(&current->sighand->siglock);
 
 	tracehook_finish_jctl();

From 7dd3db54e77d21eb95e145f19ba53f68250d0e73 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 2 Jun 2011 11:14:00 +0200
Subject: [PATCH 07/39] job control: introduce task_set_jobctl_pending()

task->jobctl currently hosts JOBCTL_STOP_PENDING and will host TRAP
pending bits too.  Setting pending conditions on a dying task may make
the task unkillable.  Currently, each setting site is responsible for
checking for the condition but with to-be-added job control traps this
becomes too fragile.

This patch adds task_set_jobctl_pending() which should be used when
setting task->jobctl bits to schedule a stop or trap.  The function
performs the followings to ease setting pending bits.

* Sanity checks.

* If fatal signal is pending or PF_EXITING is set, no bit is set.

* STOP_SIGMASK is automatically cleared if new value is being set.

do_signal_stop() and ptrace_attach() are updated to use
task_set_jobctl_pending() instead of setting STOP_PENDING explicitly.
The surrounding structures around setting are changed to fit
task_set_jobctl_pending() better but there should be no userland
visible behavior difference.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
 include/linux/sched.h |  2 ++
 kernel/ptrace.c       |  6 +++---
 kernel/signal.c       | 46 +++++++++++++++++++++++++++++++++++++------
 3 files changed, 45 insertions(+), 9 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5a958b17f9fe..5157bd9eee37 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1819,6 +1819,8 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *
 
 #define JOBCTL_PENDING_MASK	JOBCTL_STOP_PENDING
 
+extern bool task_set_jobctl_pending(struct task_struct *task,
+				    unsigned int mask);
 extern void task_clear_jobctl_pending(struct task_struct *task,
 				      unsigned int mask);
 
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index eb191116edf7..0c37d999c8b8 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -256,10 +256,10 @@ static int ptrace_attach(struct task_struct *task)
 	 * The following task_is_stopped() test is safe as both transitions
 	 * in and out of STOPPED are protected by siglock.
 	 */
-	if (task_is_stopped(task)) {
-		task->jobctl |= JOBCTL_STOP_PENDING | JOBCTL_TRAPPING;
+	if (task_is_stopped(task) &&
+	    task_set_jobctl_pending(task,
+				    JOBCTL_STOP_PENDING | JOBCTL_TRAPPING))
 		signal_wake_up(task, 1);
-	}
 
 	spin_unlock(&task->sighand->siglock);
 
diff --git a/kernel/signal.c b/kernel/signal.c
index 637a171b65b6..9ab91c516c3f 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -223,6 +223,39 @@ static inline void print_dropped_signal(int sig)
 				current->comm, current->pid, sig);
 }
 
+/**
+ * task_set_jobctl_pending - set jobctl pending bits
+ * @task: target task
+ * @mask: pending bits to set
+ *
+ * Clear @mask from @task->jobctl.  @mask must be subset of
+ * %JOBCTL_PENDING_MASK | %JOBCTL_STOP_CONSUME | %JOBCTL_STOP_SIGMASK |
+ * %JOBCTL_TRAPPING.  If stop signo is being set, the existing signo is
+ * cleared.  If @task is already being killed or exiting, this function
+ * becomes noop.
+ *
+ * CONTEXT:
+ * Must be called with @task->sighand->siglock held.
+ *
+ * RETURNS:
+ * %true if @mask is set, %false if made noop because @task was dying.
+ */
+bool task_set_jobctl_pending(struct task_struct *task, unsigned int mask)
+{
+	BUG_ON(mask & ~(JOBCTL_PENDING_MASK | JOBCTL_STOP_CONSUME |
+			JOBCTL_STOP_SIGMASK | JOBCTL_TRAPPING));
+	BUG_ON((mask & JOBCTL_TRAPPING) && !(mask & JOBCTL_PENDING_MASK));
+
+	if (unlikely(fatal_signal_pending(task) || (task->flags & PF_EXITING)))
+		return false;
+
+	if (mask & JOBCTL_STOP_SIGMASK)
+		task->jobctl &= ~JOBCTL_STOP_SIGMASK;
+
+	task->jobctl |= mask;
+	return true;
+}
+
 /**
  * task_clear_jobctl_trapping - clear jobctl trapping bit
  * @task: target task
@@ -1902,19 +1935,20 @@ static int do_signal_stop(int signr)
 		else
 			WARN_ON_ONCE(!task_ptrace(current));
 
-		current->jobctl &= ~JOBCTL_STOP_SIGMASK;
-		current->jobctl |= signr | gstop;
-		sig->group_stop_count = 1;
+		sig->group_stop_count = 0;
+
+		if (task_set_jobctl_pending(current, signr | gstop))
+			sig->group_stop_count++;
+
 		for (t = next_thread(current); t != current;
 		     t = next_thread(t)) {
-			t->jobctl &= ~JOBCTL_STOP_SIGMASK;
 			/*
 			 * Setting state to TASK_STOPPED for a group
 			 * stop is always done with the siglock held,
 			 * so this check has no races.
 			 */
-			if (!(t->flags & PF_EXITING) && !task_is_stopped(t)) {
-				t->jobctl |= signr | gstop;
+			if (!task_is_stopped(t) &&
+			    task_set_jobctl_pending(t, signr | gstop)) {
 				sig->group_stop_count++;
 				signal_wake_up(t, 0);
 			}

From 62c124ff3bcdb414af635c2bf822c9e4f2a5abfa Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 2 Jun 2011 11:14:00 +0200
Subject: [PATCH 08/39] ptrace: use bit_waitqueue for TRAPPING instead of
 wait_chldexit

ptracer->signal->wait_chldexit was used to wait for TRAPPING; however,
->wait_chldexit was already complicated with waker-side filtering
without adding TRAPPING wait on top of it.  Also, it unnecessarily
made TRAPPING clearing depend on the current ptrace relationship - if
the ptracee is detached, wakeup is lost.

There is no reason to use signal->wait_chldexit here.  We're just
waiting for JOBCTL_TRAPPING bit to clear and given the relatively
infrequent use of ptrace, bit_waitqueue can serve it perfectly.

This patch makes JOBCTL_TRAPPING wait use bit_waitqueue instead of
signal->wait_chldexit.

-v2: Use JOBCTL_*_BIT macros instead of ilog2() as suggested by Linus.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
 kernel/ptrace.c | 10 ++++++++--
 kernel/signal.c |  3 +--
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 0c37d999c8b8..7f05f3a1267b 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -25,6 +25,12 @@
 #include <linux/hw_breakpoint.h>
 
 
+static int ptrace_trapping_sleep_fn(void *flags)
+{
+	schedule();
+	return 0;
+}
+
 /*
  * ptrace a task: make the debugger its new parent and
  * move it to the ptrace list.
@@ -270,8 +276,8 @@ unlock_creds:
 	mutex_unlock(&task->signal->cred_guard_mutex);
 out:
 	if (!retval)
-		wait_event(current->signal->wait_chldexit,
-			   !(task->jobctl & JOBCTL_TRAPPING));
+		wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT,
+			    ptrace_trapping_sleep_fn, TASK_UNINTERRUPTIBLE);
 	return retval;
 }
 
diff --git a/kernel/signal.c b/kernel/signal.c
index 9ab91c516c3f..172a4c79f12c 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -272,8 +272,7 @@ static void task_clear_jobctl_trapping(struct task_struct *task)
 {
 	if (unlikely(task->jobctl & JOBCTL_TRAPPING)) {
 		task->jobctl &= ~JOBCTL_TRAPPING;
-		__wake_up_sync_key(&task->parent->signal->wait_chldexit,
-				   TASK_UNINTERRUPTIBLE, 1, task);
+		wake_up_bit(&task->jobctl, JOBCTL_TRAPPING_BIT);
 	}
 }
 

From dd1d6772692316fe35094085c5e4d9a370ad3462 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 2 Jun 2011 11:14:00 +0200
Subject: [PATCH 09/39] signal: remove three noop tracehooks

Remove the following three noop tracehooks in signals.c.

* tracehook_force_sigpending()
* tracehook_get_signal()
* tracehook_finish_jctl()

The code area is about to be updated and these hooks don't do anything
other than obfuscating the logic.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
 include/linux/tracehook.h | 52 ---------------------------------------
 kernel/signal.c           | 44 +++++++++++----------------------
 2 files changed, 14 insertions(+), 82 deletions(-)

diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index e95f5236611f..15745cdd32ce 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -425,58 +425,6 @@ static inline int tracehook_consider_fatal_signal(struct task_struct *task,
 	return (task_ptrace(task) & PT_PTRACED) != 0;
 }
 
-/**
- * tracehook_force_sigpending - let tracing force signal_pending(current) on
- *
- * Called when recomputing our signal_pending() flag.  Return nonzero
- * to force the signal_pending() flag on, so that tracehook_get_signal()
- * will be called before the next return to user mode.
- *
- * Called with @current->sighand->siglock held.
- */
-static inline int tracehook_force_sigpending(void)
-{
-	return 0;
-}
-
-/**
- * tracehook_get_signal - deliver synthetic signal to traced task
- * @task:		@current
- * @regs:		task_pt_regs(@current)
- * @info:		details of synthetic signal
- * @return_ka:		sigaction for synthetic signal
- *
- * Return zero to check for a real pending signal normally.
- * Return -1 after releasing the siglock to repeat the check.
- * Return a signal number to induce an artificial signal delivery,
- * setting *@info and *@return_ka to specify its details and behavior.
- *
- * The @return_ka->sa_handler value controls the disposition of the
- * signal, no matter the signal number.  For %SIG_DFL, the return value
- * is a representative signal to indicate the behavior (e.g. %SIGTERM
- * for death, %SIGQUIT for core dump, %SIGSTOP for job control stop,
- * %SIGTSTP for stop unless in an orphaned pgrp), but the signal number
- * reported will be @info->si_signo instead.
- *
- * Called with @task->sighand->siglock held, before dequeuing pending signals.
- */
-static inline int tracehook_get_signal(struct task_struct *task,
-				       struct pt_regs *regs,
-				       siginfo_t *info,
-				       struct k_sigaction *return_ka)
-{
-	return 0;
-}
-
-/**
- * tracehook_finish_jctl - report about return from job control stop
- *
- * This is called by do_signal_stop() after wakeup.
- */
-static inline void tracehook_finish_jctl(void)
-{
-}
-
 #define DEATH_REAP			-1
 #define DEATH_DELAYED_GROUP_LEADER	-2
 
diff --git a/kernel/signal.c b/kernel/signal.c
index 172a4c79f12c..c99b8b5c0be7 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -150,9 +150,7 @@ void recalc_sigpending_and_wake(struct task_struct *t)
 
 void recalc_sigpending(void)
 {
-	if (unlikely(tracehook_force_sigpending()))
-		set_thread_flag(TIF_SIGPENDING);
-	else if (!recalc_sigpending_tsk(current) && !freezing(current))
+	if (!recalc_sigpending_tsk(current) && !freezing(current))
 		clear_thread_flag(TIF_SIGPENDING);
 
 }
@@ -2005,8 +2003,6 @@ retry:
 
 	spin_unlock_irq(&current->sighand->siglock);
 
-	tracehook_finish_jctl();
-
 	return 1;
 }
 
@@ -2109,37 +2105,25 @@ relock:
 
 	for (;;) {
 		struct k_sigaction *ka;
-		/*
-		 * Tracing can induce an artificial signal and choose sigaction.
-		 * The return value in @signr determines the default action,
-		 * but @info->si_signo is the signal number we will report.
-		 */
-		signr = tracehook_get_signal(current, regs, info, return_ka);
-		if (unlikely(signr < 0))
+
+		if (unlikely(current->jobctl & JOBCTL_STOP_PENDING) &&
+		    do_signal_stop(0))
 			goto relock;
-		if (unlikely(signr != 0))
-			ka = return_ka;
-		else {
-			if (unlikely(current->jobctl & JOBCTL_STOP_PENDING) &&
-			    do_signal_stop(0))
-				goto relock;
 
-			signr = dequeue_signal(current, &current->blocked,
-					       info);
+		signr = dequeue_signal(current, &current->blocked, info);
 
+		if (!signr)
+			break; /* will return 0 */
+
+		if (signr != SIGKILL) {
+			signr = ptrace_signal(signr, info,
+					      regs, cookie);
 			if (!signr)
-				break; /* will return 0 */
-
-			if (signr != SIGKILL) {
-				signr = ptrace_signal(signr, info,
-						      regs, cookie);
-				if (!signr)
-					continue;
-			}
-
-			ka = &sighand->action[signr-1];
+				continue;
 		}
 
+		ka = &sighand->action[signr-1];
+
 		/* Trace actually delivered signals. */
 		trace_signal_deliver(signr, info, ka);
 

From 73ddff2bee159ffb580bd24faf625cd5e628f5ec Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 14 Jun 2011 11:20:14 +0200
Subject: [PATCH 10/39] job control: introduce JOBCTL_TRAP_STOP and use it for
 group stop trap

do_signal_stop() implemented both normal group stop and trap for group
stop while ptraced.  This approach has been enough but scheduled
changes require trap mechanism which can be used in more generic
manner and using group stop trap for generic trap site simplifies both
userland visible interface and implementation.

This patch adds a new jobctl flag - JOBCTL_TRAP_STOP.  When set, it
triggers a trap site, which behaves like group stop trap, in
get_signal_to_deliver() after checking for pending signals.  While
ptraced, do_signal_stop() doesn't stop itself.  It initiates group
stop if requested and schedules JOBCTL_TRAP_STOP and returns.  The
caller - get_signal_to_deliver() - is responsible for checking whether
TRAP_STOP is pending afterwards and handling it.

ptrace_attach() is updated to use JOBCTL_TRAP_STOP instead of
JOBCTL_STOP_PENDING and __ptrace_unlink() to clear all pending trap
bits and TRAPPING so that TRAP_STOP and future trap bits don't linger
after detach.

While at it, add proper function comment to do_signal_stop() and make
it return bool.

-v2: __ptrace_unlink() updated to clear JOBCTL_TRAP_MASK and TRAPPING
     instead of JOBCTL_PENDING_MASK.  This avoids accidentally
     clearing JOBCTL_STOP_CONSUME.  Spotted by Oleg.

-v3: do_signal_stop() updated to return %false without dropping
     siglock while ptraced and TRAP_STOP check moved inside for(;;)
     loop after group stop participation.  This avoids unnecessary
     relocking and also will help avoiding unnecessary traps by
     consuming group stop before handling pending traps.

-v4: Jobctl trap handling moved into a separate function -
     do_jobctl_trap().

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Oleg Nesterov <oleg@redhat.com>
---
 include/linux/sched.h |  6 ++-
 kernel/ptrace.c       | 12 ++++--
 kernel/signal.c       | 90 +++++++++++++++++++++++++++++--------------
 3 files changed, 75 insertions(+), 33 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5157bd9eee37..8bd84b83a35b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1810,17 +1810,21 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *
 #define JOBCTL_STOP_DEQUEUED_BIT 16	/* stop signal dequeued */
 #define JOBCTL_STOP_PENDING_BIT	17	/* task should stop for group stop */
 #define JOBCTL_STOP_CONSUME_BIT	18	/* consume group stop count */
+#define JOBCTL_TRAP_STOP_BIT	19	/* trap for STOP */
 #define JOBCTL_TRAPPING_BIT	21	/* switching to TRACED */
 
 #define JOBCTL_STOP_DEQUEUED	(1 << JOBCTL_STOP_DEQUEUED_BIT)
 #define JOBCTL_STOP_PENDING	(1 << JOBCTL_STOP_PENDING_BIT)
 #define JOBCTL_STOP_CONSUME	(1 << JOBCTL_STOP_CONSUME_BIT)
+#define JOBCTL_TRAP_STOP	(1 << JOBCTL_TRAP_STOP_BIT)
 #define JOBCTL_TRAPPING		(1 << JOBCTL_TRAPPING_BIT)
 
-#define JOBCTL_PENDING_MASK	JOBCTL_STOP_PENDING
+#define JOBCTL_TRAP_MASK	JOBCTL_TRAP_STOP
+#define JOBCTL_PENDING_MASK	(JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK)
 
 extern bool task_set_jobctl_pending(struct task_struct *task,
 				    unsigned int mask);
+extern void task_clear_jobctl_trapping(struct task_struct *task);
 extern void task_clear_jobctl_pending(struct task_struct *task,
 				      unsigned int mask);
 
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 7f05f3a1267b..45a8a4c5d8b2 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -82,6 +82,13 @@ void __ptrace_unlink(struct task_struct *child)
 
 	spin_lock(&child->sighand->siglock);
 
+	/*
+	 * Clear all pending traps and TRAPPING.  TRAPPING should be
+	 * cleared regardless of JOBCTL_STOP_PENDING.  Do it explicitly.
+	 */
+	task_clear_jobctl_pending(child, JOBCTL_TRAP_MASK);
+	task_clear_jobctl_trapping(child);
+
 	/*
 	 * Reinstate JOBCTL_STOP_PENDING if group stop is in effect and
 	 * @child isn't dead.
@@ -246,7 +253,7 @@ static int ptrace_attach(struct task_struct *task)
 	spin_lock(&task->sighand->siglock);
 
 	/*
-	 * If the task is already STOPPED, set JOBCTL_STOP_PENDING and
+	 * If the task is already STOPPED, set JOBCTL_TRAP_STOP and
 	 * TRAPPING, and kick it so that it transits to TRACED.  TRAPPING
 	 * will be cleared if the child completes the transition or any
 	 * event which clears the group stop states happens.  We'll wait
@@ -263,8 +270,7 @@ static int ptrace_attach(struct task_struct *task)
 	 * in and out of STOPPED are protected by siglock.
 	 */
 	if (task_is_stopped(task) &&
-	    task_set_jobctl_pending(task,
-				    JOBCTL_STOP_PENDING | JOBCTL_TRAPPING))
+	    task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING))
 		signal_wake_up(task, 1);
 
 	spin_unlock(&task->sighand->siglock);
diff --git a/kernel/signal.c b/kernel/signal.c
index c99b8b5c0be7..b5f55ca1f43f 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -266,7 +266,7 @@ bool task_set_jobctl_pending(struct task_struct *task, unsigned int mask)
  * CONTEXT:
  * Must be called with @task->sighand->siglock held.
  */
-static void task_clear_jobctl_trapping(struct task_struct *task)
+void task_clear_jobctl_trapping(struct task_struct *task)
 {
 	if (unlikely(task->jobctl & JOBCTL_TRAPPING)) {
 		task->jobctl &= ~JOBCTL_TRAPPING;
@@ -1790,13 +1790,16 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
 	/*
 	 * If @why is CLD_STOPPED, we're trapping to participate in a group
 	 * stop.  Do the bookkeeping.  Note that if SIGCONT was delievered
-	 * while siglock was released for the arch hook, PENDING could be
-	 * clear now.  We act as if SIGCONT is received after TASK_TRACED
-	 * is entered - ignore it.
+	 * across siglock relocks since INTERRUPT was scheduled, PENDING
+	 * could be clear now.  We act as if SIGCONT is received after
+	 * TASK_TRACED is entered - ignore it.
 	 */
 	if (why == CLD_STOPPED && (current->jobctl & JOBCTL_STOP_PENDING))
 		gstop_done = task_participate_group_stop(current);
 
+	/* any trap clears pending STOP trap */
+	task_clear_jobctl_pending(current, JOBCTL_TRAP_STOP);
+
 	/* entering a trap, clear TRAPPING */
 	task_clear_jobctl_trapping(current);
 
@@ -1888,13 +1891,30 @@ void ptrace_notify(int exit_code)
 	spin_unlock_irq(&current->sighand->siglock);
 }
 
-/*
- * This performs the stopping for SIGSTOP and other stop signals.
- * We have to stop all threads in the thread group.
- * Returns non-zero if we've actually stopped and released the siglock.
- * Returns zero if we didn't stop and still hold the siglock.
+/**
+ * do_signal_stop - handle group stop for SIGSTOP and other stop signals
+ * @signr: signr causing group stop if initiating
+ *
+ * If %JOBCTL_STOP_PENDING is not set yet, initiate group stop with @signr
+ * and participate in it.  If already set, participate in the existing
+ * group stop.  If participated in a group stop (and thus slept), %true is
+ * returned with siglock released.
+ *
+ * If ptraced, this function doesn't handle stop itself.  Instead,
+ * %JOBCTL_TRAP_STOP is scheduled and %false is returned with siglock
+ * untouched.  The caller must ensure that INTERRUPT trap handling takes
+ * places afterwards.
+ *
+ * CONTEXT:
+ * Must be called with @current->sighand->siglock held, which is released
+ * on %true return.
+ *
+ * RETURNS:
+ * %false if group stop is already cancelled or ptrace trap is scheduled.
+ * %true if participated in group stop.
  */
-static int do_signal_stop(int signr)
+static bool do_signal_stop(int signr)
+	__releases(&current->sighand->siglock)
 {
 	struct signal_struct *sig = current->signal;
 
@@ -1907,7 +1927,7 @@ static int do_signal_stop(int signr)
 
 		if (!likely(current->jobctl & JOBCTL_STOP_DEQUEUED) ||
 		    unlikely(signal_group_exit(sig)))
-			return 0;
+			return false;
 		/*
 		 * There is no group stop already in progress.  We must
 		 * initiate one now.
@@ -1951,7 +1971,7 @@ static int do_signal_stop(int signr)
 			}
 		}
 	}
-retry:
+
 	if (likely(!task_ptrace(current))) {
 		int notify = 0;
 
@@ -1983,27 +2003,33 @@ retry:
 
 		/* Now we don't run again until woken by SIGCONT or SIGKILL */
 		schedule();
-
-		spin_lock_irq(&current->sighand->siglock);
+		return true;
 	} else {
-		ptrace_stop(current->jobctl & JOBCTL_STOP_SIGMASK,
-			    CLD_STOPPED, 0, NULL);
-		current->exit_code = 0;
+		/*
+		 * While ptraced, group stop is handled by STOP trap.
+		 * Schedule it and let the caller deal with it.
+		 */
+		task_set_jobctl_pending(current, JOBCTL_TRAP_STOP);
+		return false;
 	}
+}
 
-	/*
-	 * JOBCTL_STOP_PENDING could be set if another group stop has
-	 * started since being woken up or ptrace wants us to transit
-	 * between TASK_STOPPED and TRACED.  Retry group stop.
-	 */
-	if (current->jobctl & JOBCTL_STOP_PENDING) {
-		WARN_ON_ONCE(!(current->jobctl & JOBCTL_STOP_SIGMASK));
-		goto retry;
-	}
+/**
+ * do_jobctl_trap - take care of ptrace jobctl traps
+ *
+ * It is currently used only to trap for group stop while ptraced.
+ *
+ * CONTEXT:
+ * Must be called with @current->sighand->siglock held, which may be
+ * released and re-acquired before returning with intervening sleep.
+ */
+static void do_jobctl_trap(void)
+{
+	int signr = current->jobctl & JOBCTL_STOP_SIGMASK;
 
-	spin_unlock_irq(&current->sighand->siglock);
-
-	return 1;
+	WARN_ON_ONCE(!signr);
+	ptrace_stop(signr, CLD_STOPPED, 0, NULL);
+	current->exit_code = 0;
 }
 
 static int ptrace_signal(int signr, siginfo_t *info,
@@ -2110,6 +2136,12 @@ relock:
 		    do_signal_stop(0))
 			goto relock;
 
+		if (unlikely(current->jobctl & JOBCTL_TRAP_MASK)) {
+			do_jobctl_trap();
+			spin_unlock_irq(&sighand->siglock);
+			goto relock;
+		}
+
 		signr = dequeue_signal(current, &current->blocked, info);
 
 		if (!signr)

From 3544d72a0e10d0aa1c1bd59ed77a53a59cdc12f7 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 14 Jun 2011 11:20:15 +0200
Subject: [PATCH 11/39] ptrace: implement PTRACE_SEIZE

PTRACE_ATTACH implicitly issues SIGSTOP on attach which has side
effects on tracee signal and job control states.  This patch
implements a new ptrace request PTRACE_SEIZE which attaches a tracee
without trapping it or affecting its signal and job control states.

The usage is the same with PTRACE_ATTACH but it takes PTRACE_SEIZE_*
flags in @data.  Currently, the only defined flag is
PTRACE_SEIZE_DEVEL which is a temporary flag to enable PTRACE_SEIZE.
PTRACE_SEIZE will change ptrace behaviors outside of attach itself.
The changes will be implemented gradually and the DEVEL flag is to
prevent programs which expect full SEIZE behavior from using it before
all the behavior modifications are complete while allowing unit
testing.  The flag will be removed once SEIZE behaviors are completely
implemented.

* PTRACE_SEIZE, unlike ATTACH, doesn't force tracee to trap.  After
  attaching tracee continues to run unless a trap condition occurs.

* PTRACE_SEIZE doesn't affect signal or group stop state.

* If PTRACE_SEIZE'd, group stop uses PTRACE_EVENT_STOP trap which uses
  exit_code of (signr | PTRACE_EVENT_STOP << 8) where signr is one of
  the stopping signals if group stop is in effect or SIGTRAP
  otherwise, and returns usual trap siginfo on PTRACE_GETSIGINFO
  instead of NULL.

Seizing sets PT_SEIZED in ->ptrace of the tracee.  This flag will be
used to determine whether new SEIZE behaviors should be enabled.

Test program follows.

  #define PTRACE_SEIZE		0x4206
  #define PTRACE_SEIZE_DEVEL	0x80000000

  static const struct timespec ts100ms = { .tv_nsec = 100000000 };
  static const struct timespec ts1s = { .tv_sec = 1 };
  static const struct timespec ts3s = { .tv_sec = 3 };

  int main(int argc, char **argv)
  {
	  pid_t tracee;

	  tracee = fork();
	  if (tracee == 0) {
		  nanosleep(&ts100ms, NULL);
		  while (1) {
			  printf("tracee: alive\n");
			  nanosleep(&ts1s, NULL);
		  }
	  }

	  if (argc > 1)
		  kill(tracee, SIGSTOP);

	  nanosleep(&ts100ms, NULL);

	  ptrace(PTRACE_SEIZE, tracee, NULL,
		 (void *)(unsigned long)PTRACE_SEIZE_DEVEL);
	  if (argc > 1) {
		  waitid(P_PID, tracee, NULL, WSTOPPED);
		  ptrace(PTRACE_CONT, tracee, NULL, NULL);
	  }
	  nanosleep(&ts3s, NULL);
	  printf("tracer: exiting\n");
	  return 0;
  }

When the above program is called w/o argument, tracee is seized while
running and remains running.  When tracer exits, tracee continues to
run and print out messages.

  # ./test-seize-simple
  tracee: alive
  tracee: alive
  tracee: alive
  tracer: exiting
  tracee: alive
  tracee: alive

When called with an argument, tracee is seized from stopped state and
continued, and returns to stopped state when tracer exits.

  # ./test-seize
  tracee: alive
  tracee: alive
  tracee: alive
  tracer: exiting
  # ps -el|grep test-seize
  1 T     0  4720     1  0  80   0 -   941 signal ttyS0    00:00:00 test-seize

-v2: SEIZE doesn't schedule TRAP_STOP and leaves tracee running as Jan
     suggested.

-v3: PTRACE_EVENT_STOP traps now report group stop state by signr.  If
     group stop is in effect the stop signal number is returned as
     part of exit_code; otherwise, SIGTRAP.  This was suggested by
     Denys and Oleg.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jan Kratochvil <jan.kratochvil@redhat.com>
Cc: Denys Vlasenko <vda.linux@googlemail.com>
Cc: Oleg Nesterov <oleg@redhat.com>
---
 include/linux/ptrace.h |  7 +++++++
 kernel/ptrace.c        | 35 +++++++++++++++++++++++++++++------
 kernel/signal.c        | 39 ++++++++++++++++++++++++++++++---------
 3 files changed, 66 insertions(+), 15 deletions(-)

diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index e93ef1a54fc7..67ad3f152329 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -47,6 +47,11 @@
 #define PTRACE_GETREGSET	0x4204
 #define PTRACE_SETREGSET	0x4205
 
+#define PTRACE_SEIZE		0x4206
+
+/* flags in @data for PTRACE_SEIZE */
+#define PTRACE_SEIZE_DEVEL	0x80000000 /* temp flag for development */
+
 /* options set using PTRACE_SETOPTIONS */
 #define PTRACE_O_TRACESYSGOOD	0x00000001
 #define PTRACE_O_TRACEFORK	0x00000002
@@ -65,6 +70,7 @@
 #define PTRACE_EVENT_EXEC	4
 #define PTRACE_EVENT_VFORK_DONE	5
 #define PTRACE_EVENT_EXIT	6
+#define PTRACE_EVENT_STOP	7
 
 #include <asm/ptrace.h>
 
@@ -77,6 +83,7 @@
  * flags.  When the a task is stopped the ptracer owns task->ptrace.
  */
 
+#define PT_SEIZED	0x00010000	/* SEIZE used, enable new behavior */
 #define PT_PTRACED	0x00000001
 #define PT_DTRACE	0x00000002	/* delayed trace (used on m68k, i386) */
 #define PT_TRACESYSGOOD	0x00000004
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 45a8a4c5d8b2..dcf9f974198c 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -209,10 +209,28 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode)
 	return !err;
 }
 
-static int ptrace_attach(struct task_struct *task)
+static int ptrace_attach(struct task_struct *task, long request,
+			 unsigned long flags)
 {
+	bool seize = (request == PTRACE_SEIZE);
 	int retval;
 
+	/*
+	 * SEIZE will enable new ptrace behaviors which will be implemented
+	 * gradually.  SEIZE_DEVEL is used to prevent applications
+	 * expecting full SEIZE behaviors trapping on kernel commits which
+	 * are still in the process of implementing them.
+	 *
+	 * Only test programs for new ptrace behaviors being implemented
+	 * should set SEIZE_DEVEL.  If unset, SEIZE will fail with -EIO.
+	 *
+	 * Once SEIZE behaviors are completely implemented, this flag and
+	 * the following test will be removed.
+	 */
+	retval = -EIO;
+	if (seize && !(flags & PTRACE_SEIZE_DEVEL))
+		goto out;
+
 	audit_ptrace(task);
 
 	retval = -EPERM;
@@ -244,11 +262,16 @@ static int ptrace_attach(struct task_struct *task)
 		goto unlock_tasklist;
 
 	task->ptrace = PT_PTRACED;
+	if (seize)
+		task->ptrace |= PT_SEIZED;
 	if (task_ns_capable(task, CAP_SYS_PTRACE))
 		task->ptrace |= PT_PTRACE_CAP;
 
 	__ptrace_link(task, current);
-	send_sig_info(SIGSTOP, SEND_SIG_FORCED, task);
+
+	/* SEIZE doesn't trap tracee on attach */
+	if (!seize)
+		send_sig_info(SIGSTOP, SEND_SIG_FORCED, task);
 
 	spin_lock(&task->sighand->siglock);
 
@@ -785,8 +808,8 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
 		goto out;
 	}
 
-	if (request == PTRACE_ATTACH) {
-		ret = ptrace_attach(child);
+	if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) {
+		ret = ptrace_attach(child, request, data);
 		/*
 		 * Some architectures need to do book-keeping after
 		 * a ptrace attach.
@@ -927,8 +950,8 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
 		goto out;
 	}
 
-	if (request == PTRACE_ATTACH) {
-		ret = ptrace_attach(child);
+	if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) {
+		ret = ptrace_attach(child, request, data);
 		/*
 		 * Some architectures need to do book-keeping after
 		 * a ptrace attach.
diff --git a/kernel/signal.c b/kernel/signal.c
index b5f55ca1f43f..589292f38530 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1873,21 +1873,26 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
 	recalc_sigpending_tsk(current);
 }
 
-void ptrace_notify(int exit_code)
+static void ptrace_do_notify(int signr, int exit_code, int why)
 {
 	siginfo_t info;
 
-	BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP);
-
 	memset(&info, 0, sizeof info);
-	info.si_signo = SIGTRAP;
+	info.si_signo = signr;
 	info.si_code = exit_code;
 	info.si_pid = task_pid_vnr(current);
 	info.si_uid = current_uid();
 
 	/* Let the debugger run.  */
+	ptrace_stop(exit_code, why, 1, &info);
+}
+
+void ptrace_notify(int exit_code)
+{
+	BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP);
+
 	spin_lock_irq(&current->sighand->siglock);
-	ptrace_stop(exit_code, CLD_TRAPPED, 1, &info);
+	ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED);
 	spin_unlock_irq(&current->sighand->siglock);
 }
 
@@ -2017,7 +2022,13 @@ static bool do_signal_stop(int signr)
 /**
  * do_jobctl_trap - take care of ptrace jobctl traps
  *
- * It is currently used only to trap for group stop while ptraced.
+ * When PT_SEIZED, it's used for both group stop and explicit
+ * SEIZE/INTERRUPT traps.  Both generate PTRACE_EVENT_STOP trap with
+ * accompanying siginfo.  If stopped, lower eight bits of exit_code contain
+ * the stop signal; otherwise, %SIGTRAP.
+ *
+ * When !PT_SEIZED, it's used only for group stop trap with stop signal
+ * number as exit_code and no siginfo.
  *
  * CONTEXT:
  * Must be called with @current->sighand->siglock held, which may be
@@ -2025,11 +2036,21 @@ static bool do_signal_stop(int signr)
  */
 static void do_jobctl_trap(void)
 {
+	struct signal_struct *signal = current->signal;
 	int signr = current->jobctl & JOBCTL_STOP_SIGMASK;
 
-	WARN_ON_ONCE(!signr);
-	ptrace_stop(signr, CLD_STOPPED, 0, NULL);
-	current->exit_code = 0;
+	if (current->ptrace & PT_SEIZED) {
+		if (!signal->group_stop_count &&
+		    !(signal->flags & SIGNAL_STOP_STOPPED))
+			signr = SIGTRAP;
+		WARN_ON_ONCE(!signr);
+		ptrace_do_notify(signr, signr | (PTRACE_EVENT_STOP << 8),
+				 CLD_STOPPED);
+	} else {
+		WARN_ON_ONCE(!signr);
+		ptrace_stop(signr, CLD_STOPPED, 0, NULL);
+		current->exit_code = 0;
+	}
 }
 
 static int ptrace_signal(int signr, siginfo_t *info,

From fca26f260c528ee51a2e451b5b200aeb528f3e09 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 14 Jun 2011 11:20:16 +0200
Subject: [PATCH 12/39] ptrace: implement PTRACE_INTERRUPT

Currently, there's no way to trap a running ptracee short of sending a
signal which has various side effects.  This patch implements
PTRACE_INTERRUPT which traps ptracee without any signal or job control
related side effect.

The implementation is almost trivial.  It uses the group stop trap -
SIGTRAP | PTRACE_EVENT_STOP << 8.  A new trap flag
JOBCTL_TRAP_INTERRUPT is added, which is set on PTRACE_INTERRUPT and
cleared when any trap happens.  As INTERRUPT should be useable
regardless of the current state of tracee, task_is_traced() test in
ptrace_check_attach() is skipped for INTERRUPT.

PTRACE_INTERRUPT is available iff tracee is attached with
PTRACE_SEIZE.

Test program follows.

  #define PTRACE_SEIZE		0x4206
  #define PTRACE_INTERRUPT	0x4207

  #define PTRACE_SEIZE_DEVEL	0x80000000

  static const struct timespec ts100ms = { .tv_nsec = 100000000 };
  static const struct timespec ts1s = { .tv_sec = 1 };
  static const struct timespec ts3s = { .tv_sec = 3 };

  int main(int argc, char **argv)
  {
	  pid_t tracee;

	  tracee = fork();
	  if (tracee == 0) {
		  nanosleep(&ts100ms, NULL);
		  while (1) {
			  printf("tracee: alive pid=%d\n", getpid());
			  nanosleep(&ts1s, NULL);
		  }
	  }

	  if (argc > 1)
		  kill(tracee, SIGSTOP);

	  nanosleep(&ts100ms, NULL);

	  ptrace(PTRACE_SEIZE, tracee, NULL,
		 (void *)(unsigned long)PTRACE_SEIZE_DEVEL);
	  if (argc > 1) {
		  waitid(P_PID, tracee, NULL, WSTOPPED);
		  ptrace(PTRACE_CONT, tracee, NULL, NULL);
	  }
	  nanosleep(&ts3s, NULL);

	  printf("tracer: INTERRUPT and DETACH\n");
	  ptrace(PTRACE_INTERRUPT, tracee, NULL, NULL);
	  waitid(P_PID, tracee, NULL, WSTOPPED);
	  ptrace(PTRACE_DETACH, tracee, NULL, NULL);
	  nanosleep(&ts3s, NULL);

	  printf("tracer: exiting\n");
	  kill(tracee, SIGKILL);
	  return 0;
  }

When called without argument, tracee is seized from running state,
interrupted and then detached back to running state.

  # ./test-interrupt
  tracee: alive pid=4546
  tracee: alive pid=4546
  tracee: alive pid=4546
  tracer: INTERRUPT and DETACH
  tracee: alive pid=4546
  tracee: alive pid=4546
  tracee: alive pid=4546
  tracer: exiting

When called with argument, tracee is seized from stopped state,
continued, interrupted and then detached back to stopped state.

  # ./test-interrupt  1
  tracee: alive pid=4548
  tracee: alive pid=4548
  tracee: alive pid=4548
  tracer: INTERRUPT and DETACH
  tracer: exiting

Before PTRACE_INTERRUPT, once the tracee was running, there was no way
to trap tracee and do PTRACE_DETACH without causing side effect.

-v2: Updated to use task_set_jobctl_pending() so that it doesn't end
     up scheduling TRAP_STOP if child is dying which may make the
     child unkillable.  Spotted by Oleg.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Oleg Nesterov <oleg@redhat.com>
---
 include/linux/ptrace.h |  1 +
 kernel/ptrace.c        | 29 +++++++++++++++++++++++++++--
 2 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index 67ad3f152329..ad754d1e0b13 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -48,6 +48,7 @@
 #define PTRACE_SETREGSET	0x4205
 
 #define PTRACE_SEIZE		0x4206
+#define PTRACE_INTERRUPT	0x4207
 
 /* flags in @data for PTRACE_SEIZE */
 #define PTRACE_SEIZE_DEVEL	0x80000000 /* temp flag for development */
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index dcf9f974198c..6852c0f4a916 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -658,10 +658,12 @@ static int ptrace_regset(struct task_struct *task, int req, unsigned int type,
 int ptrace_request(struct task_struct *child, long request,
 		   unsigned long addr, unsigned long data)
 {
+	bool seized = child->ptrace & PT_SEIZED;
 	int ret = -EIO;
 	siginfo_t siginfo;
 	void __user *datavp = (void __user *) data;
 	unsigned long __user *datalp = datavp;
+	unsigned long flags;
 
 	switch (request) {
 	case PTRACE_PEEKTEXT:
@@ -694,6 +696,27 @@ int ptrace_request(struct task_struct *child, long request,
 			ret = ptrace_setsiginfo(child, &siginfo);
 		break;
 
+	case PTRACE_INTERRUPT:
+		/*
+		 * Stop tracee without any side-effect on signal or job
+		 * control.  At least one trap is guaranteed to happen
+		 * after this request.  If @child is already trapped, the
+		 * current trap is not disturbed and another trap will
+		 * happen after the current trap is ended with PTRACE_CONT.
+		 *
+		 * The actual trap might not be PTRACE_EVENT_STOP trap but
+		 * the pending condition is cleared regardless.
+		 */
+		if (unlikely(!seized || !lock_task_sighand(child, &flags)))
+			break;
+
+		if (likely(task_set_jobctl_pending(child, JOBCTL_TRAP_STOP)))
+			signal_wake_up(child, 0);
+
+		unlock_task_sighand(child, &flags);
+		ret = 0;
+		break;
+
 	case PTRACE_DETACH:	 /* detach a process that was attached. */
 		ret = ptrace_detach(child, data);
 		break;
@@ -819,7 +842,8 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
 		goto out_put_task_struct;
 	}
 
-	ret = ptrace_check_attach(child, request == PTRACE_KILL);
+	ret = ptrace_check_attach(child, request == PTRACE_KILL ||
+				  request == PTRACE_INTERRUPT);
 	if (ret < 0)
 		goto out_put_task_struct;
 
@@ -961,7 +985,8 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
 		goto out_put_task_struct;
 	}
 
-	ret = ptrace_check_attach(child, request == PTRACE_KILL);
+	ret = ptrace_check_attach(child, request == PTRACE_KILL ||
+				  request == PTRACE_INTERRUPT);
 	if (!ret)
 		ret = compat_arch_ptrace(child, request, addr, data);
 

From fb1d910c178ba0c5bc32d3e5a9e82e05b7aad3cd Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 14 Jun 2011 11:20:17 +0200
Subject: [PATCH 13/39] ptrace: implement TRAP_NOTIFY and use it for group stop
 events

Currently there's no way for ptracer to find out whether group stop
finished other than polling with INTERRUPT - GETSIGINFO - CONT
sequence.  This patch implements group stop notification for ptracer
using STOP traps.

When group stop state of a seized tracee changes, JOBCTL_TRAP_NOTIFY
is set, which schedules a STOP trap which is sticky - it isn't cleared
by other traps and at least one STOP trap will happen eventually.
STOP trap is synchronization point for event notification and the
tracer can determine the current group stop state by looking at the
signal number portion of exit code (si_status from waitid(2) or
si_code from PTRACE_GETSIGINFO).

Notifications are generated both on start and end of group stops but,
because group stop participation always happens before STOP trap, this
doesn't cause an extra trap while tracee is participating in group
stop.  The symmetry will be useful later.

Note that this notification works iff tracee is not trapped.
Currently there is no way to be notified of group stop state changes
while tracee is trapped.  This will be addressed by a later patch.

An example program follows.

  #define PTRACE_SEIZE		0x4206
  #define PTRACE_INTERRUPT	0x4207

  #define PTRACE_SEIZE_DEVEL	0x80000000

  static const struct timespec ts1s = { .tv_sec = 1 };

  int main(int argc, char **argv)
  {
	  pid_t tracee, tracer;
	  int i;

	  tracee = fork();
	  if (!tracee)
		  while (1)
			  pause();

	  tracer = fork();
	  if (!tracer) {
		  siginfo_t si;

		  ptrace(PTRACE_SEIZE, tracee, NULL,
			 (void *)(unsigned long)PTRACE_SEIZE_DEVEL);
		  ptrace(PTRACE_INTERRUPT, tracee, NULL, NULL);
	  repeat:
		  waitid(P_PID, tracee, NULL, WSTOPPED);

		  ptrace(PTRACE_GETSIGINFO, tracee, NULL, &si);
		  if (!si.si_code) {
			  printf("tracer: SIG %d\n", si.si_signo);
			  ptrace(PTRACE_CONT, tracee, NULL,
				 (void *)(unsigned long)si.si_signo);
			  goto repeat;
		  }
		  printf("tracer: stopped=%d signo=%d\n",
			 si.si_signo != SIGTRAP, si.si_signo);
		  ptrace(PTRACE_CONT, tracee, NULL, NULL);
		  goto repeat;
	  }

	  for (i = 0; i < 3; i++) {
		  nanosleep(&ts1s, NULL);
		  printf("mother: SIGSTOP\n");
		  kill(tracee, SIGSTOP);
		  nanosleep(&ts1s, NULL);
		  printf("mother: SIGCONT\n");
		  kill(tracee, SIGCONT);
	  }
	  nanosleep(&ts1s, NULL);

	  kill(tracer, SIGKILL);
	  kill(tracee, SIGKILL);
	  return 0;
  }

In the above program, tracer keeps tracee running and gets
notification of each group stop state changes.

  # ./test-notify
  tracer: stopped=0 signo=5
  mother: SIGSTOP
  tracer: SIG 19
  tracer: stopped=1 signo=19
  mother: SIGCONT
  tracer: stopped=0 signo=5
  tracer: SIG 18
  mother: SIGSTOP
  tracer: SIG 19
  tracer: stopped=1 signo=19
  mother: SIGCONT
  tracer: stopped=0 signo=5
  tracer: SIG 18
  mother: SIGSTOP
  tracer: SIG 19
  tracer: stopped=1 signo=19
  mother: SIGCONT
  tracer: stopped=0 signo=5
  tracer: SIG 18

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Oleg Nesterov <oleg@redhat.com>
---
 include/linux/sched.h |  4 +++-
 kernel/signal.c       | 38 +++++++++++++++++++++++++++++++++++---
 2 files changed, 38 insertions(+), 4 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8bd84b83a35b..1854def284f5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1811,15 +1811,17 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *
 #define JOBCTL_STOP_PENDING_BIT	17	/* task should stop for group stop */
 #define JOBCTL_STOP_CONSUME_BIT	18	/* consume group stop count */
 #define JOBCTL_TRAP_STOP_BIT	19	/* trap for STOP */
+#define JOBCTL_TRAP_NOTIFY_BIT	20	/* trap for NOTIFY */
 #define JOBCTL_TRAPPING_BIT	21	/* switching to TRACED */
 
 #define JOBCTL_STOP_DEQUEUED	(1 << JOBCTL_STOP_DEQUEUED_BIT)
 #define JOBCTL_STOP_PENDING	(1 << JOBCTL_STOP_PENDING_BIT)
 #define JOBCTL_STOP_CONSUME	(1 << JOBCTL_STOP_CONSUME_BIT)
 #define JOBCTL_TRAP_STOP	(1 << JOBCTL_TRAP_STOP_BIT)
+#define JOBCTL_TRAP_NOTIFY	(1 << JOBCTL_TRAP_NOTIFY_BIT)
 #define JOBCTL_TRAPPING		(1 << JOBCTL_TRAPPING_BIT)
 
-#define JOBCTL_TRAP_MASK	JOBCTL_TRAP_STOP
+#define JOBCTL_TRAP_MASK	(JOBCTL_TRAP_STOP | JOBCTL_TRAP_NOTIFY)
 #define JOBCTL_PENDING_MASK	(JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK)
 
 extern bool task_set_jobctl_pending(struct task_struct *task,
diff --git a/kernel/signal.c b/kernel/signal.c
index 589292f38530..06177e2b3917 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -817,6 +817,30 @@ static int check_kill_permission(int sig, struct siginfo *info,
 	return security_task_kill(t, info, sig, 0);
 }
 
+/**
+ * ptrace_trap_notify - schedule trap to notify ptracer
+ * @t: tracee wanting to notify tracer
+ *
+ * This function schedules sticky ptrace trap which is cleared on the next
+ * TRAP_STOP to notify ptracer of an event.  @t must have been seized by
+ * ptracer.
+ *
+ * If @t is running, STOP trap will be taken.  If already trapped, STOP
+ * trap will be eventually taken without returning to userland after the
+ * existing traps are finished by PTRACE_CONT.
+ *
+ * CONTEXT:
+ * Must be called with @task->sighand->siglock held.
+ */
+static void ptrace_trap_notify(struct task_struct *t)
+{
+	WARN_ON_ONCE(!(t->ptrace & PT_SEIZED));
+	assert_spin_locked(&t->sighand->siglock);
+
+	task_set_jobctl_pending(t, JOBCTL_TRAP_NOTIFY);
+	signal_wake_up(t, 0);
+}
+
 /*
  * Handle magic process-wide effects of stop/continue signals. Unlike
  * the signal actions, these happen immediately at signal-generation
@@ -855,7 +879,10 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns)
 		do {
 			task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING);
 			rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending);
-			wake_up_state(t, __TASK_STOPPED);
+			if (likely(!(t->ptrace & PT_SEIZED)))
+				wake_up_state(t, __TASK_STOPPED);
+			else
+				ptrace_trap_notify(t);
 		} while_each_thread(p, t);
 
 		/*
@@ -1797,8 +1824,10 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
 	if (why == CLD_STOPPED && (current->jobctl & JOBCTL_STOP_PENDING))
 		gstop_done = task_participate_group_stop(current);
 
-	/* any trap clears pending STOP trap */
+	/* any trap clears pending STOP trap, STOP trap clears NOTIFY */
 	task_clear_jobctl_pending(current, JOBCTL_TRAP_STOP);
+	if (info && info->si_code >> 8 == PTRACE_EVENT_STOP)
+		task_clear_jobctl_pending(current, JOBCTL_TRAP_NOTIFY);
 
 	/* entering a trap, clear TRAPPING */
 	task_clear_jobctl_trapping(current);
@@ -1972,7 +2001,10 @@ static bool do_signal_stop(int signr)
 			if (!task_is_stopped(t) &&
 			    task_set_jobctl_pending(t, signr | gstop)) {
 				sig->group_stop_count++;
-				signal_wake_up(t, 0);
+				if (likely(!(t->ptrace & PT_SEIZED)))
+					signal_wake_up(t, 0);
+				else
+					ptrace_trap_notify(t);
 			}
 		}
 	}

From 544b2c91a9f14f9565af1972203438b7f49afd48 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 14 Jun 2011 11:20:18 +0200
Subject: [PATCH 14/39] ptrace: implement PTRACE_LISTEN

The previous patch implemented async notification for ptrace but it
only worked while trace is running.  This patch introduces
PTRACE_LISTEN which is suggested by Oleg Nestrov.

It's allowed iff tracee is in STOP trap and puts tracee into
quasi-running state - tracee never really runs but wait(2) and
ptrace(2) consider it to be running.  While ptracer is listening,
tracee is allowed to re-enter STOP to notify an async event.
Listening state is cleared on the first notification.  Ptracer can
also clear it by issuing INTERRUPT - tracee will re-trap into STOP
with listening state cleared.

This allows ptracer to monitor group stop state without running tracee
- use INTERRUPT to put tracee into STOP trap, issue LISTEN and then
wait(2) to wait for the next group stop event.  When it happens,
PTRACE_GETSIGINFO provides information to determine the current state.

Test program follows.

  #define PTRACE_SEIZE		0x4206
  #define PTRACE_INTERRUPT	0x4207
  #define PTRACE_LISTEN		0x4208

  #define PTRACE_SEIZE_DEVEL	0x80000000

  static const struct timespec ts1s = { .tv_sec = 1 };

  int main(int argc, char **argv)
  {
	  pid_t tracee, tracer;
	  int i;

	  tracee = fork();
	  if (!tracee)
		  while (1)
			  pause();

	  tracer = fork();
	  if (!tracer) {
		  siginfo_t si;

		  ptrace(PTRACE_SEIZE, tracee, NULL,
			 (void *)(unsigned long)PTRACE_SEIZE_DEVEL);
		  ptrace(PTRACE_INTERRUPT, tracee, NULL, NULL);
	  repeat:
		  waitid(P_PID, tracee, NULL, WSTOPPED);

		  ptrace(PTRACE_GETSIGINFO, tracee, NULL, &si);
		  if (!si.si_code) {
			  printf("tracer: SIG %d\n", si.si_signo);
			  ptrace(PTRACE_CONT, tracee, NULL,
				 (void *)(unsigned long)si.si_signo);
			  goto repeat;
		  }
		  printf("tracer: stopped=%d signo=%d\n",
			 si.si_signo != SIGTRAP, si.si_signo);
		  if (si.si_signo != SIGTRAP)
			  ptrace(PTRACE_LISTEN, tracee, NULL, NULL);
		  else
			  ptrace(PTRACE_CONT, tracee, NULL, NULL);
		  goto repeat;
	  }

	  for (i = 0; i < 3; i++) {
		  nanosleep(&ts1s, NULL);
		  printf("mother: SIGSTOP\n");
		  kill(tracee, SIGSTOP);
		  nanosleep(&ts1s, NULL);
		  printf("mother: SIGCONT\n");
		  kill(tracee, SIGCONT);
	  }
	  nanosleep(&ts1s, NULL);

	  kill(tracer, SIGKILL);
	  kill(tracee, SIGKILL);
	  return 0;
  }

This is identical to the program to test TRAP_NOTIFY except that
tracee is PTRACE_LISTEN'd instead of PTRACE_CONT'd when group stopped.
This allows ptracer to monitor when group stop ends without running
tracee.

  # ./test-listen
  tracer: stopped=0 signo=5
  mother: SIGSTOP
  tracer: SIG 19
  tracer: stopped=1 signo=19
  mother: SIGCONT
  tracer: stopped=0 signo=5
  tracer: SIG 18
  mother: SIGSTOP
  tracer: SIG 19
  tracer: stopped=1 signo=19
  mother: SIGCONT
  tracer: stopped=0 signo=5
  tracer: SIG 18
  mother: SIGSTOP
  tracer: SIG 19
  tracer: stopped=1 signo=19
  mother: SIGCONT
  tracer: stopped=0 signo=5
  tracer: SIG 18

-v2: Moved JOBCTL_LISTENING check in wait_task_stopped() into
     task_stopped_code() as suggested by Oleg.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Oleg Nesterov <oleg@redhat.com>
---
 include/linux/ptrace.h |  1 +
 include/linux/sched.h  |  2 ++
 kernel/exit.c          |  3 ++-
 kernel/ptrace.c        | 42 +++++++++++++++++++++++++++++++++++++++---
 kernel/signal.c        | 13 +++++++++----
 5 files changed, 53 insertions(+), 8 deletions(-)

diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index ad754d1e0b13..4f224f169524 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -49,6 +49,7 @@
 
 #define PTRACE_SEIZE		0x4206
 #define PTRACE_INTERRUPT	0x4207
+#define PTRACE_LISTEN		0x4208
 
 /* flags in @data for PTRACE_SEIZE */
 #define PTRACE_SEIZE_DEVEL	0x80000000 /* temp flag for development */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1854def284f5..87f7ca7ed6f6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1813,6 +1813,7 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *
 #define JOBCTL_TRAP_STOP_BIT	19	/* trap for STOP */
 #define JOBCTL_TRAP_NOTIFY_BIT	20	/* trap for NOTIFY */
 #define JOBCTL_TRAPPING_BIT	21	/* switching to TRACED */
+#define JOBCTL_LISTENING_BIT	22	/* ptracer is listening for events */
 
 #define JOBCTL_STOP_DEQUEUED	(1 << JOBCTL_STOP_DEQUEUED_BIT)
 #define JOBCTL_STOP_PENDING	(1 << JOBCTL_STOP_PENDING_BIT)
@@ -1820,6 +1821,7 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *
 #define JOBCTL_TRAP_STOP	(1 << JOBCTL_TRAP_STOP_BIT)
 #define JOBCTL_TRAP_NOTIFY	(1 << JOBCTL_TRAP_NOTIFY_BIT)
 #define JOBCTL_TRAPPING		(1 << JOBCTL_TRAPPING_BIT)
+#define JOBCTL_LISTENING	(1 << JOBCTL_LISTENING_BIT)
 
 #define JOBCTL_TRAP_MASK	(JOBCTL_TRAP_STOP | JOBCTL_TRAP_NOTIFY)
 #define JOBCTL_PENDING_MASK	(JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK)
diff --git a/kernel/exit.c b/kernel/exit.c
index 20a406471525..289f59d686bf 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1368,7 +1368,8 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
 static int *task_stopped_code(struct task_struct *p, bool ptrace)
 {
 	if (ptrace) {
-		if (task_is_stopped_or_traced(p))
+		if (task_is_stopped_or_traced(p) &&
+		    !(p->jobctl & JOBCTL_LISTENING))
 			return &p->exit_code;
 	} else {
 		if (p->signal->flags & SIGNAL_STOP_STOPPED)
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 6852c0f4a916..e18966c1c0da 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -146,7 +146,8 @@ int ptrace_check_attach(struct task_struct *child, bool ignore_state)
 		 */
 		spin_lock_irq(&child->sighand->siglock);
 		WARN_ON_ONCE(task_is_stopped(child));
-		if (task_is_traced(child) || ignore_state)
+		if (ignore_state || (task_is_traced(child) &&
+				     !(child->jobctl & JOBCTL_LISTENING)))
 			ret = 0;
 		spin_unlock_irq(&child->sighand->siglock);
 	}
@@ -660,7 +661,7 @@ int ptrace_request(struct task_struct *child, long request,
 {
 	bool seized = child->ptrace & PT_SEIZED;
 	int ret = -EIO;
-	siginfo_t siginfo;
+	siginfo_t siginfo, *si;
 	void __user *datavp = (void __user *) data;
 	unsigned long __user *datalp = datavp;
 	unsigned long flags;
@@ -710,8 +711,43 @@ int ptrace_request(struct task_struct *child, long request,
 		if (unlikely(!seized || !lock_task_sighand(child, &flags)))
 			break;
 
+		/*
+		 * INTERRUPT doesn't disturb existing trap sans one
+		 * exception.  If ptracer issued LISTEN for the current
+		 * STOP, this INTERRUPT should clear LISTEN and re-trap
+		 * tracee into STOP.
+		 */
 		if (likely(task_set_jobctl_pending(child, JOBCTL_TRAP_STOP)))
-			signal_wake_up(child, 0);
+			signal_wake_up(child, child->jobctl & JOBCTL_LISTENING);
+
+		unlock_task_sighand(child, &flags);
+		ret = 0;
+		break;
+
+	case PTRACE_LISTEN:
+		/*
+		 * Listen for events.  Tracee must be in STOP.  It's not
+		 * resumed per-se but is not considered to be in TRACED by
+		 * wait(2) or ptrace(2).  If an async event (e.g. group
+		 * stop state change) happens, tracee will enter STOP trap
+		 * again.  Alternatively, ptracer can issue INTERRUPT to
+		 * finish listening and re-trap tracee into STOP.
+		 */
+		if (unlikely(!seized || !lock_task_sighand(child, &flags)))
+			break;
+
+		si = child->last_siginfo;
+		if (unlikely(!si || si->si_code >> 8 != PTRACE_EVENT_STOP))
+			break;
+
+		child->jobctl |= JOBCTL_LISTENING;
+
+		/*
+		 * If NOTIFY is set, it means event happened between start
+		 * of this trap and now.  Trigger re-trap immediately.
+		 */
+		if (child->jobctl & JOBCTL_TRAP_NOTIFY)
+			signal_wake_up(child, true);
 
 		unlock_task_sighand(child, &flags);
 		ret = 0;
diff --git a/kernel/signal.c b/kernel/signal.c
index 06177e2b3917..97e575a3387e 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -825,9 +825,11 @@ static int check_kill_permission(int sig, struct siginfo *info,
  * TRAP_STOP to notify ptracer of an event.  @t must have been seized by
  * ptracer.
  *
- * If @t is running, STOP trap will be taken.  If already trapped, STOP
- * trap will be eventually taken without returning to userland after the
- * existing traps are finished by PTRACE_CONT.
+ * If @t is running, STOP trap will be taken.  If trapped for STOP and
+ * ptracer is listening for events, tracee is woken up so that it can
+ * re-trap for the new event.  If trapped otherwise, STOP trap will be
+ * eventually taken without returning to userland after the existing traps
+ * are finished by PTRACE_CONT.
  *
  * CONTEXT:
  * Must be called with @task->sighand->siglock held.
@@ -838,7 +840,7 @@ static void ptrace_trap_notify(struct task_struct *t)
 	assert_spin_locked(&t->sighand->siglock);
 
 	task_set_jobctl_pending(t, JOBCTL_TRAP_NOTIFY);
-	signal_wake_up(t, 0);
+	signal_wake_up(t, t->jobctl & JOBCTL_LISTENING);
 }
 
 /*
@@ -1894,6 +1896,9 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
 	spin_lock_irq(&current->sighand->siglock);
 	current->last_siginfo = NULL;
 
+	/* LISTENING can be set only during STOP traps, clear it */
+	current->jobctl &= ~JOBCTL_LISTENING;
+
 	/*
 	 * Queued signals ignored us while we were stopped for tracing.
 	 * So check for any that we should take before resuming user mode.

From d21142ece414ce1088cfcae760689aa60d6fee80 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 17 Jun 2011 16:50:34 +0200
Subject: [PATCH 15/39] ptrace: kill task_ptrace()

task_ptrace(task) simply dereferences task->ptrace and isn't even used
consistently only adding confusion.  Kill it and directly access
->ptrace instead.

This doesn't introduce any behavior change.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
 include/linux/ptrace.h    | 11 -----------
 include/linux/tracehook.h | 16 ++++++++--------
 kernel/exit.c             |  8 ++++----
 kernel/signal.c           | 14 +++++++-------
 mm/oom_kill.c             |  3 +--
 5 files changed, 20 insertions(+), 32 deletions(-)

diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index 4f224f169524..3ff20b322598 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -145,17 +145,6 @@ int generic_ptrace_peekdata(struct task_struct *tsk, unsigned long addr,
 int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr,
 			    unsigned long data);
 
-/**
- * task_ptrace - return %PT_* flags that apply to a task
- * @task:	pointer to &task_struct in question
- *
- * Returns the %PT_* flags that apply to @task.
- */
-static inline int task_ptrace(struct task_struct *task)
-{
-	return task->ptrace;
-}
-
 /**
  * ptrace_event - possibly stop for a ptrace event notification
  * @mask:	%PT_* bit to check in @current->ptrace
diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index 15745cdd32ce..a3e838784f43 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -63,7 +63,7 @@ struct linux_binprm;
  */
 static inline int tracehook_expect_breakpoints(struct task_struct *task)
 {
-	return (task_ptrace(task) & PT_PTRACED) != 0;
+	return (task->ptrace & PT_PTRACED) != 0;
 }
 
 /*
@@ -71,7 +71,7 @@ static inline int tracehook_expect_breakpoints(struct task_struct *task)
  */
 static inline void ptrace_report_syscall(struct pt_regs *regs)
 {
-	int ptrace = task_ptrace(current);
+	int ptrace = current->ptrace;
 
 	if (!(ptrace & PT_PTRACED))
 		return;
@@ -155,7 +155,7 @@ static inline void tracehook_report_syscall_exit(struct pt_regs *regs, int step)
 static inline int tracehook_unsafe_exec(struct task_struct *task)
 {
 	int unsafe = 0;
-	int ptrace = task_ptrace(task);
+	int ptrace = task->ptrace;
 	if (ptrace & PT_PTRACED) {
 		if (ptrace & PT_PTRACE_CAP)
 			unsafe |= LSM_UNSAFE_PTRACE_CAP;
@@ -178,7 +178,7 @@ static inline int tracehook_unsafe_exec(struct task_struct *task)
  */
 static inline struct task_struct *tracehook_tracer_task(struct task_struct *tsk)
 {
-	if (task_ptrace(tsk) & PT_PTRACED)
+	if (tsk->ptrace & PT_PTRACED)
 		return rcu_dereference(tsk->parent);
 	return NULL;
 }
@@ -202,7 +202,7 @@ static inline void tracehook_report_exec(struct linux_binfmt *fmt,
 					 struct pt_regs *regs)
 {
 	if (!ptrace_event(PT_TRACE_EXEC, PTRACE_EVENT_EXEC, 0) &&
-	    unlikely(task_ptrace(current) & PT_PTRACED))
+	    unlikely(current->ptrace & PT_PTRACED))
 		send_sig(SIGTRAP, current, 0);
 }
 
@@ -285,7 +285,7 @@ static inline void tracehook_report_clone(struct pt_regs *regs,
 					  unsigned long clone_flags,
 					  pid_t pid, struct task_struct *child)
 {
-	if (unlikely(task_ptrace(child))) {
+	if (unlikely(child->ptrace)) {
 		/*
 		 * It doesn't matter who attached/attaching to this
 		 * task, the pending SIGSTOP is right in any case.
@@ -403,7 +403,7 @@ static inline void tracehook_signal_handler(int sig, siginfo_t *info,
 static inline int tracehook_consider_ignored_signal(struct task_struct *task,
 						    int sig)
 {
-	return (task_ptrace(task) & PT_PTRACED) != 0;
+	return (task->ptrace & PT_PTRACED) != 0;
 }
 
 /**
@@ -422,7 +422,7 @@ static inline int tracehook_consider_ignored_signal(struct task_struct *task,
 static inline int tracehook_consider_fatal_signal(struct task_struct *task,
 						  int sig)
 {
-	return (task_ptrace(task) & PT_PTRACED) != 0;
+	return (task->ptrace & PT_PTRACED) != 0;
 }
 
 #define DEATH_REAP			-1
diff --git a/kernel/exit.c b/kernel/exit.c
index 289f59d686bf..e5cc05644609 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -765,7 +765,7 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p,
 	p->exit_signal = SIGCHLD;
 
 	/* If it has exited notify the new parent about this child's death. */
-	if (!task_ptrace(p) &&
+	if (!p->ptrace &&
 	    p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
 		do_notify_parent(p, p->exit_signal);
 		if (task_detached(p)) {
@@ -795,7 +795,7 @@ static void forget_original_parent(struct task_struct *father)
 		do {
 			t->real_parent = reaper;
 			if (t->parent == father) {
-				BUG_ON(task_ptrace(t));
+				BUG_ON(t->ptrace);
 				t->parent = t->real_parent;
 			}
 			if (t->pdeath_signal)
@@ -1565,7 +1565,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
 		 * Notification and reaping will be cascaded to the real
 		 * parent when the ptracer detaches.
 		 */
-		if (likely(!ptrace) && unlikely(task_ptrace(p))) {
+		if (likely(!ptrace) && unlikely(p->ptrace)) {
 			/* it will become visible, clear notask_error */
 			wo->notask_error = 0;
 			return 0;
@@ -1608,7 +1608,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
 		 * own children, it should create a separate process which
 		 * takes the role of real parent.
 		 */
-		if (likely(!ptrace) && task_ptrace(p) &&
+		if (likely(!ptrace) && p->ptrace &&
 		    same_thread_group(p->parent, p->real_parent))
 			return 0;
 
diff --git a/kernel/signal.c b/kernel/signal.c
index 97e575a3387e..0f3370872506 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1592,7 +1592,7 @@ int do_notify_parent(struct task_struct *tsk, int sig)
  	/* do_notify_parent_cldstop should have been called instead.  */
  	BUG_ON(task_is_stopped_or_traced(tsk));
 
-	BUG_ON(!task_ptrace(tsk) &&
+	BUG_ON(!tsk->ptrace &&
 	       (tsk->group_leader != tsk || !thread_group_empty(tsk)));
 
 	info.si_signo = sig;
@@ -1631,7 +1631,7 @@ int do_notify_parent(struct task_struct *tsk, int sig)
 
 	psig = tsk->parent->sighand;
 	spin_lock_irqsave(&psig->siglock, flags);
-	if (!task_ptrace(tsk) && sig == SIGCHLD &&
+	if (!tsk->ptrace && sig == SIGCHLD &&
 	    (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN ||
 	     (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) {
 		/*
@@ -1731,7 +1731,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
 
 static inline int may_ptrace_stop(void)
 {
-	if (!likely(task_ptrace(current)))
+	if (!likely(current->ptrace))
 		return 0;
 	/*
 	 * Are we in the middle of do_coredump?
@@ -1989,7 +1989,7 @@ static bool do_signal_stop(int signr)
 		if (!(sig->flags & SIGNAL_STOP_STOPPED))
 			sig->group_exit_code = signr;
 		else
-			WARN_ON_ONCE(!task_ptrace(current));
+			WARN_ON_ONCE(!current->ptrace);
 
 		sig->group_stop_count = 0;
 
@@ -2014,7 +2014,7 @@ static bool do_signal_stop(int signr)
 		}
 	}
 
-	if (likely(!task_ptrace(current))) {
+	if (likely(!current->ptrace)) {
 		int notify = 0;
 
 		/*
@@ -2093,7 +2093,7 @@ static void do_jobctl_trap(void)
 static int ptrace_signal(int signr, siginfo_t *info,
 			 struct pt_regs *regs, void *cookie)
 {
-	if (!task_ptrace(current))
+	if (!current->ptrace)
 		return signr;
 
 	ptrace_signal_deliver(regs, cookie);
@@ -2179,7 +2179,7 @@ relock:
 		do_notify_parent_cldstop(current, false, why);
 
 		leader = current->group_leader;
-		if (task_ptrace(leader) && !real_parent_is_ptracer(leader))
+		if (leader->ptrace && !real_parent_is_ptracer(leader))
 			do_notify_parent_cldstop(leader, true, why);
 
 		read_unlock(&tasklist_lock);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index e4b0991ca351..b0be989d4365 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -339,8 +339,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
 				 * then wait for it to finish before killing
 				 * some other task unnecessarily.
 				 */
-				if (!(task_ptrace(p->group_leader) &
-							PT_TRACE_EXIT))
+				if (!(p->group_leader->ptrace & PT_TRACE_EXIT))
 					return ERR_PTR(-1UL);
 			}
 		}

From 643ad8388e189dfd14ef76972cf7dc394b3cbebd Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 17 Jun 2011 16:50:35 +0200
Subject: [PATCH 16/39] ptrace: introduce ptrace_event_enabled() and simplify
 ptrace_event() and tracehook_prepare_clone()

This patch implements ptrace_event_enabled() which tests whether a
given PTRACE_EVENT_* is enabled and use it to simplify ptrace_event()
and tracehook_prepare_clone().

PT_EVENT_FLAG() macro is added which calculates PT_TRACE_* flag from
PTRACE_EVENT_*.  This is used to define PT_TRACE_* flags and by
ptrace_event_enabled() to find the matching flag.

This is used to make ptrace_event() and tracehook_prepare_clone()
simpler.

* ptrace_event() callers were responsible for providing mask to test
  whether the event was enabled.  This patch implements
  ptrace_event_enabled() and make ptrace_event() drop @mask and
  determine whether the event is enabled from @event.  Note that
  @event is constant and this conversion doesn't add runtime overhead.

  All conversions except tracehook_report_clone_complete() are
  trivial.  tracehook_report_clone_complete() used to use 0 for @mask
  (always enabled) but now tests whether the specified event is
  enabled.  This doesn't cause any behavior difference as it's
  guaranteed that the event specified by @trace is enabled.

* tracehook_prepare_clone() now only determines which event is
  applicable and use ptrace_event_enabled() for enable test.

This doesn't introduce any behavior change.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
 include/linux/ptrace.h    | 46 +++++++++++++++++++++++++++------------
 include/linux/tracehook.h | 26 +++++++++++-----------
 2 files changed, 45 insertions(+), 27 deletions(-)

diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index 3ff20b322598..18feac6f441e 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -90,12 +90,17 @@
 #define PT_DTRACE	0x00000002	/* delayed trace (used on m68k, i386) */
 #define PT_TRACESYSGOOD	0x00000004
 #define PT_PTRACE_CAP	0x00000008	/* ptracer can follow suid-exec */
-#define PT_TRACE_FORK	0x00000010
-#define PT_TRACE_VFORK	0x00000020
-#define PT_TRACE_CLONE	0x00000040
-#define PT_TRACE_EXEC	0x00000080
-#define PT_TRACE_VFORK_DONE	0x00000100
-#define PT_TRACE_EXIT	0x00000200
+
+/* PT_TRACE_* event enable flags */
+#define PT_EVENT_FLAG_SHIFT	4
+#define PT_EVENT_FLAG(event)	(1 << (PT_EVENT_FLAG_SHIFT + (event) - 1))
+
+#define PT_TRACE_FORK		PT_EVENT_FLAG(PTRACE_EVENT_FORK)
+#define PT_TRACE_VFORK		PT_EVENT_FLAG(PTRACE_EVENT_VFORK)
+#define PT_TRACE_CLONE		PT_EVENT_FLAG(PTRACE_EVENT_CLONE)
+#define PT_TRACE_EXEC		PT_EVENT_FLAG(PTRACE_EVENT_EXEC)
+#define PT_TRACE_VFORK_DONE	PT_EVENT_FLAG(PTRACE_EVENT_VFORK_DONE)
+#define PT_TRACE_EXIT		PT_EVENT_FLAG(PTRACE_EVENT_EXIT)
 
 #define PT_TRACE_MASK	0x000003f4
 
@@ -145,26 +150,39 @@ int generic_ptrace_peekdata(struct task_struct *tsk, unsigned long addr,
 int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr,
 			    unsigned long data);
 
+/**
+ * ptrace_event_enabled - test whether a ptrace event is enabled
+ * @task: ptracee of interest
+ * @event: %PTRACE_EVENT_* to test
+ *
+ * Test whether @event is enabled for ptracee @task.
+ *
+ * Returns %true if @event is enabled, %false otherwise.
+ */
+static inline bool ptrace_event_enabled(struct task_struct *task, int event)
+{
+	return task->ptrace & PT_EVENT_FLAG(event);
+}
+
 /**
  * ptrace_event - possibly stop for a ptrace event notification
- * @mask:	%PT_* bit to check in @current->ptrace
- * @event:	%PTRACE_EVENT_* value to report if @mask is set
+ * @event:	%PTRACE_EVENT_* value to report
  * @message:	value for %PTRACE_GETEVENTMSG to return
  *
- * This checks the @mask bit to see if ptrace wants stops for this event.
- * If so we stop, reporting @event and @message to the ptrace parent.
+ * Check whether @event is enabled and, if so, report @event and @message
+ * to the ptrace parent.
  *
  * Returns nonzero if we did a ptrace notification, zero if not.
  *
  * Called without locks.
  */
-static inline int ptrace_event(int mask, int event, unsigned long message)
+static inline int ptrace_event(int event, unsigned long message)
 {
-	if (mask && likely(!(current->ptrace & mask)))
-		return 0;
+	if (likely(!ptrace_event_enabled(current, event)))
+		return false;
 	current->ptrace_message = message;
 	ptrace_notify((event << 8) | SIGTRAP);
-	return 1;
+	return true;
 }
 
 /**
diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index a3e838784f43..7d38571b0c05 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -201,7 +201,7 @@ static inline void tracehook_report_exec(struct linux_binfmt *fmt,
 					 struct linux_binprm *bprm,
 					 struct pt_regs *regs)
 {
-	if (!ptrace_event(PT_TRACE_EXEC, PTRACE_EVENT_EXEC, 0) &&
+	if (!ptrace_event(PTRACE_EVENT_EXEC, 0) &&
 	    unlikely(current->ptrace & PT_PTRACED))
 		send_sig(SIGTRAP, current, 0);
 }
@@ -218,7 +218,7 @@ static inline void tracehook_report_exec(struct linux_binfmt *fmt,
  */
 static inline void tracehook_report_exit(long *exit_code)
 {
-	ptrace_event(PT_TRACE_EXIT, PTRACE_EVENT_EXIT, *exit_code);
+	ptrace_event(PTRACE_EVENT_EXIT, *exit_code);
 }
 
 /**
@@ -232,19 +232,19 @@ static inline void tracehook_report_exit(long *exit_code)
  */
 static inline int tracehook_prepare_clone(unsigned clone_flags)
 {
+	int event = 0;
+
 	if (clone_flags & CLONE_UNTRACED)
 		return 0;
 
-	if (clone_flags & CLONE_VFORK) {
-		if (current->ptrace & PT_TRACE_VFORK)
-			return PTRACE_EVENT_VFORK;
-	} else if ((clone_flags & CSIGNAL) != SIGCHLD) {
-		if (current->ptrace & PT_TRACE_CLONE)
-			return PTRACE_EVENT_CLONE;
-	} else if (current->ptrace & PT_TRACE_FORK)
-		return PTRACE_EVENT_FORK;
+	if (clone_flags & CLONE_VFORK)
+		event = PTRACE_EVENT_VFORK;
+	else if ((clone_flags & CSIGNAL) != SIGCHLD)
+		event = PTRACE_EVENT_CLONE;
+	else
+		event = PTRACE_EVENT_FORK;
 
-	return 0;
+	return ptrace_event_enabled(current, event) ? event : 0;
 }
 
 /**
@@ -318,7 +318,7 @@ static inline void tracehook_report_clone_complete(int trace,
 						   struct task_struct *child)
 {
 	if (unlikely(trace))
-		ptrace_event(0, trace, pid);
+		ptrace_event(trace, pid);
 }
 
 /**
@@ -336,7 +336,7 @@ static inline void tracehook_report_clone_complete(int trace,
 static inline void tracehook_report_vfork_done(struct task_struct *child,
 					       pid_t pid)
 {
-	ptrace_event(PT_TRACE_VFORK_DONE, PTRACE_EVENT_VFORK_DONE, pid);
+	ptrace_event(PTRACE_EVENT_VFORK_DONE, pid);
 }
 
 /**

From f3c04b934d429b1ace21866f011b66de328c0dc9 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 17 Jun 2011 16:50:36 +0200
Subject: [PATCH 17/39] ptrace: move SIGTRAP on exec(2) logic to ptrace_event()

Move SIGTRAP on exec(2) logic from tracehook_report_exec() to
ptrace_event().  This is part of changes to make ptrace_event()
smarter and handle ptrace event related details in one place.

This doesn't introduce any behavior change.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
 include/linux/ptrace.h    | 16 ++++++++--------
 include/linux/tracehook.h |  4 +---
 2 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index 18feac6f441e..b546fd6c3506 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -172,17 +172,17 @@ static inline bool ptrace_event_enabled(struct task_struct *task, int event)
  * Check whether @event is enabled and, if so, report @event and @message
  * to the ptrace parent.
  *
- * Returns nonzero if we did a ptrace notification, zero if not.
- *
  * Called without locks.
  */
-static inline int ptrace_event(int event, unsigned long message)
+static inline void ptrace_event(int event, unsigned long message)
 {
-	if (likely(!ptrace_event_enabled(current, event)))
-		return false;
-	current->ptrace_message = message;
-	ptrace_notify((event << 8) | SIGTRAP);
-	return true;
+	if (unlikely(ptrace_event_enabled(current, event))) {
+		current->ptrace_message = message;
+		ptrace_notify((event << 8) | SIGTRAP);
+	} else if (event == PTRACE_EVENT_EXEC && unlikely(current->ptrace)) {
+		/* legacy EXEC report via SIGTRAP */
+		send_sig(SIGTRAP, current, 0);
+	}
 }
 
 /**
diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index 7d38571b0c05..3b68aa842a92 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -201,9 +201,7 @@ static inline void tracehook_report_exec(struct linux_binfmt *fmt,
 					 struct linux_binprm *bprm,
 					 struct pt_regs *regs)
 {
-	if (!ptrace_event(PTRACE_EVENT_EXEC, 0) &&
-	    unlikely(current->ptrace & PT_PTRACED))
-		send_sig(SIGTRAP, current, 0);
+	ptrace_event(PTRACE_EVENT_EXEC, 0);
 }
 
 /**

From a288eecce5253cc1565d400a52b9b476a157e040 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 17 Jun 2011 16:50:37 +0200
Subject: [PATCH 18/39] ptrace: kill trivial tracehooks

At this point, tracehooks aren't useful to mainline kernel and mostly
just add an extra layer of obfuscation.  Although they have comments,
without actual in-kernel users, it is difficult to tell what are their
assumptions and they're actually trying to achieve.  To mainline
kernel, they just aren't worth keeping around.

This patch kills the following trivial tracehooks.

* Ones testing whether task is ptraced.  Replace with ->ptrace test.

	tracehook_expect_breakpoints()
	tracehook_consider_ignored_signal()
	tracehook_consider_fatal_signal()

* ptrace_event() wrappers.  Call directly.

	tracehook_report_exec()
	tracehook_report_exit()
	tracehook_report_vfork_done()

* ptrace_release_task() wrapper.  Call directly.

	tracehook_finish_release_task()

* noop

	tracehook_prepare_release_task()
	tracehook_report_death()

This doesn't introduce any behavior change.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
 arch/s390/kernel/traps.c  |   4 +-
 fs/exec.c                 |   2 +-
 include/linux/tracehook.h | 156 --------------------------------------
 kernel/exit.c             |   7 +-
 kernel/fork.c             |   2 +-
 kernel/signal.c           |   8 +-
 mm/nommu.c                |   3 +-
 7 files changed, 11 insertions(+), 171 deletions(-)

diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c
index a65d2e82f61d..a63d34c3611e 100644
--- a/arch/s390/kernel/traps.c
+++ b/arch/s390/kernel/traps.c
@@ -331,7 +331,7 @@ void __kprobes do_per_trap(struct pt_regs *regs)
 {
 	if (notify_die(DIE_SSTEP, "sstep", regs, 0, 0, SIGTRAP) == NOTIFY_STOP)
 		return;
-	if (tracehook_consider_fatal_signal(current, SIGTRAP))
+	if (current->ptrace)
 		force_sig(SIGTRAP, current);
 }
 
@@ -425,7 +425,7 @@ static void __kprobes illegal_op(struct pt_regs *regs, long pgm_int_code,
 		if (get_user(*((__u16 *) opcode), (__u16 __user *) location))
 			return;
 		if (*((__u16 *) opcode) == S390_BREAKPOINT_U16) {
-			if (tracehook_consider_fatal_signal(current, SIGTRAP))
+			if (current->ptrace)
 				force_sig(SIGTRAP, current);
 			else
 				signal = SIGILL;
diff --git a/fs/exec.c b/fs/exec.c
index a9f2b3631bdb..b37030d0a50b 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1384,7 +1384,7 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
 			bprm->recursion_depth = depth;
 			if (retval >= 0) {
 				if (depth == 0)
-					tracehook_report_exec(fmt, bprm, regs);
+					ptrace_event(PTRACE_EVENT_EXEC, 0);
 				put_binfmt(fmt);
 				allow_write_access(bprm->file);
 				if (bprm->file)
diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index 3b68aa842a92..8b06d4f2b814 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -51,21 +51,6 @@
 #include <linux/security.h>
 struct linux_binprm;
 
-/**
- * tracehook_expect_breakpoints - guess if task memory might be touched
- * @task:		current task, making a new mapping
- *
- * Return nonzero if @task is expected to want breakpoint insertion in
- * its memory at some point.  A zero return is no guarantee it won't
- * be done, but this is a hint that it's known to be likely.
- *
- * May be called with @task->mm->mmap_sem held for writing.
- */
-static inline int tracehook_expect_breakpoints(struct task_struct *task)
-{
-	return (task->ptrace & PT_PTRACED) != 0;
-}
-
 /*
  * ptrace report for syscall entry and exit looks identical.
  */
@@ -183,42 +168,6 @@ static inline struct task_struct *tracehook_tracer_task(struct task_struct *tsk)
 	return NULL;
 }
 
-/**
- * tracehook_report_exec - a successful exec was completed
- * @fmt:		&struct linux_binfmt that performed the exec
- * @bprm:		&struct linux_binprm containing exec details
- * @regs:		user-mode register state
- *
- * An exec just completed, we are shortly going to return to user mode.
- * The freshly initialized register state can be seen and changed in @regs.
- * The name, file and other pointers in @bprm are still on hand to be
- * inspected, but will be freed as soon as this returns.
- *
- * Called with no locks, but with some kernel resources held live
- * and a reference on @fmt->module.
- */
-static inline void tracehook_report_exec(struct linux_binfmt *fmt,
-					 struct linux_binprm *bprm,
-					 struct pt_regs *regs)
-{
-	ptrace_event(PTRACE_EVENT_EXEC, 0);
-}
-
-/**
- * tracehook_report_exit - task has begun to exit
- * @exit_code:		pointer to value destined for @current->exit_code
- *
- * @exit_code points to the value passed to do_exit(), which tracing
- * might change here.  This is almost the first thing in do_exit(),
- * before freeing any resources or setting the %PF_EXITING flag.
- *
- * Called with no locks held.
- */
-static inline void tracehook_report_exit(long *exit_code)
-{
-	ptrace_event(PTRACE_EVENT_EXIT, *exit_code);
-}
-
 /**
  * tracehook_prepare_clone - prepare for new child to be cloned
  * @clone_flags:	%CLONE_* flags from clone/fork/vfork system call
@@ -319,52 +268,6 @@ static inline void tracehook_report_clone_complete(int trace,
 		ptrace_event(trace, pid);
 }
 
-/**
- * tracehook_report_vfork_done - vfork parent's child has exited or exec'd
- * @child:		child task, already running
- * @pid:		new child's PID in the parent's namespace
- *
- * Called after a %CLONE_VFORK parent has waited for the child to complete.
- * The clone/vfork system call will return immediately after this.
- * The @child pointer may be invalid if a self-reaping child died and
- * tracehook_report_clone() took no action to prevent it from self-reaping.
- *
- * Called with no locks held.
- */
-static inline void tracehook_report_vfork_done(struct task_struct *child,
-					       pid_t pid)
-{
-	ptrace_event(PTRACE_EVENT_VFORK_DONE, pid);
-}
-
-/**
- * tracehook_prepare_release_task - task is being reaped, clean up tracing
- * @task:		task in %EXIT_DEAD state
- *
- * This is called in release_task() just before @task gets finally reaped
- * and freed.  This would be the ideal place to remove and clean up any
- * tracing-related state for @task.
- *
- * Called with no locks held.
- */
-static inline void tracehook_prepare_release_task(struct task_struct *task)
-{
-}
-
-/**
- * tracehook_finish_release_task - final tracing clean-up
- * @task:		task in %EXIT_DEAD state
- *
- * This is called in release_task() when @task is being in the middle of
- * being reaped.  After this, there must be no tracing entanglements.
- *
- * Called with write_lock_irq(&tasklist_lock) held.
- */
-static inline void tracehook_finish_release_task(struct task_struct *task)
-{
-	ptrace_release_task(task);
-}
-
 /**
  * tracehook_signal_handler - signal handler setup is complete
  * @sig:		number of signal being delivered
@@ -388,41 +291,6 @@ static inline void tracehook_signal_handler(int sig, siginfo_t *info,
 		ptrace_notify(SIGTRAP);
 }
 
-/**
- * tracehook_consider_ignored_signal - suppress short-circuit of ignored signal
- * @task:		task receiving the signal
- * @sig:		signal number being sent
- *
- * Return zero iff tracing doesn't care to examine this ignored signal,
- * so it can short-circuit normal delivery and never even get queued.
- *
- * Called with @task->sighand->siglock held.
- */
-static inline int tracehook_consider_ignored_signal(struct task_struct *task,
-						    int sig)
-{
-	return (task->ptrace & PT_PTRACED) != 0;
-}
-
-/**
- * tracehook_consider_fatal_signal - suppress special handling of fatal signal
- * @task:		task receiving the signal
- * @sig:		signal number being sent
- *
- * Return nonzero to prevent special handling of this termination signal.
- * Normally handler for signal is %SIG_DFL.  It can be %SIG_IGN if @sig is
- * ignored, in which case force_sig() is about to reset it to %SIG_DFL.
- * When this returns zero, this signal might cause a quick termination
- * that does not give the debugger a chance to intercept the signal.
- *
- * Called with or without @task->sighand->siglock held.
- */
-static inline int tracehook_consider_fatal_signal(struct task_struct *task,
-						  int sig)
-{
-	return (task->ptrace & PT_PTRACED) != 0;
-}
-
 #define DEATH_REAP			-1
 #define DEATH_DELAYED_GROUP_LEADER	-2
 
@@ -457,30 +325,6 @@ static inline int tracehook_notify_death(struct task_struct *task,
 	return task->ptrace ? SIGCHLD : DEATH_DELAYED_GROUP_LEADER;
 }
 
-/**
- * tracehook_report_death - task is dead and ready to be reaped
- * @task:		@current task now exiting
- * @signal:		return value from tracheook_notify_death()
- * @death_cookie:	value passed back from tracehook_notify_death()
- * @group_dead:		nonzero if this was the last thread in the group to die
- *
- * Thread has just become a zombie or is about to self-reap.  If positive,
- * @signal is the signal number just sent to the parent (usually %SIGCHLD).
- * If @signal is %DEATH_REAP, this thread will self-reap.  If @signal is
- * %DEATH_DELAYED_GROUP_LEADER, this is a delayed_group_leader() zombie.
- * The @death_cookie was passed back by tracehook_notify_death().
- *
- * If normal reaping is not inhibited, @task->exit_state might be changing
- * in parallel.
- *
- * Called without locks.
- */
-static inline void tracehook_report_death(struct task_struct *task,
-					  int signal, void *death_cookie,
-					  int group_dead)
-{
-}
-
 #ifdef TIF_NOTIFY_RESUME
 /**
  * set_notify_resume - cause tracehook_notify_resume() to be called
diff --git a/kernel/exit.c b/kernel/exit.c
index e5cc05644609..d49134a7f250 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -169,7 +169,6 @@ void release_task(struct task_struct * p)
 	struct task_struct *leader;
 	int zap_leader;
 repeat:
-	tracehook_prepare_release_task(p);
 	/* don't need to get the RCU readlock here - the process is dead and
 	 * can't be modifying its own credentials. But shut RCU-lockdep up */
 	rcu_read_lock();
@@ -179,7 +178,7 @@ repeat:
 	proc_flush_task(p);
 
 	write_lock_irq(&tasklist_lock);
-	tracehook_finish_release_task(p);
+	ptrace_release_task(p);
 	__exit_signal(p);
 
 	/*
@@ -868,8 +867,6 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
 		wake_up_process(tsk->signal->group_exit_task);
 	write_unlock_irq(&tasklist_lock);
 
-	tracehook_report_death(tsk, signal, cookie, group_dead);
-
 	/* If the process is dead, release it - nobody will wait for it */
 	if (signal == DEATH_REAP)
 		release_task(tsk);
@@ -924,7 +921,7 @@ NORET_TYPE void do_exit(long code)
 	 */
 	set_fs(USER_DS);
 
-	tracehook_report_exit(&code);
+	ptrace_event(PTRACE_EVENT_EXIT, code);
 
 	validate_creds_for_do_exit(tsk);
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 0276c30401a0..d4f0dff9d617 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1527,7 +1527,7 @@ long do_fork(unsigned long clone_flags,
 			freezer_do_not_count();
 			wait_for_completion(&vfork);
 			freezer_count();
-			tracehook_report_vfork_done(p, nr);
+			ptrace_event(PTRACE_EVENT_VFORK_DONE, nr);
 		}
 	} else {
 		nr = PTR_ERR(p);
diff --git a/kernel/signal.c b/kernel/signal.c
index 0f3370872506..1550aee34f42 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -87,7 +87,7 @@ static int sig_ignored(struct task_struct *t, int sig, int from_ancestor_ns)
 	/*
 	 * Tracers may want to know about even ignored signals.
 	 */
-	return !tracehook_consider_ignored_signal(t, sig);
+	return !t->ptrace;
 }
 
 /*
@@ -493,7 +493,8 @@ int unhandled_signal(struct task_struct *tsk, int sig)
 		return 1;
 	if (handler != SIG_IGN && handler != SIG_DFL)
 		return 0;
-	return !tracehook_consider_fatal_signal(tsk, sig);
+	/* if ptraced, let the tracer determine */
+	return !tsk->ptrace;
 }
 
 /*
@@ -981,8 +982,7 @@ static void complete_signal(int sig, struct task_struct *p, int group)
 	if (sig_fatal(p, sig) &&
 	    !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) &&
 	    !sigismember(&t->real_blocked, sig) &&
-	    (sig == SIGKILL ||
-	     !tracehook_consider_fatal_signal(t, sig))) {
+	    (sig == SIGKILL || !t->ptrace)) {
 		/*
 		 * This signal will be fatal to the whole group.
 		 */
diff --git a/mm/nommu.c b/mm/nommu.c
index 1fd0c51b10a6..54ae707bdae8 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -22,7 +22,6 @@
 #include <linux/pagemap.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
-#include <linux/tracehook.h>
 #include <linux/blkdev.h>
 #include <linux/backing-dev.h>
 #include <linux/mount.h>
@@ -1087,7 +1086,7 @@ static unsigned long determine_vm_flags(struct file *file,
 	 * it's being traced - otherwise breakpoints set in it may interfere
 	 * with another untraced process
 	 */
-	if ((flags & MAP_PRIVATE) && tracehook_expect_breakpoints(current))
+	if ((flags & MAP_PRIVATE) && current->ptrace)
 		vm_flags &= ~VM_MAYSHARE;
 
 	return vm_flags;

From 4b9d33e6d83cc05a8005a8f9a8b9677fa0f53626 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 17 Jun 2011 16:50:38 +0200
Subject: [PATCH 19/39] ptrace: kill clone/exec tracehooks

At this point, tracehooks aren't useful to mainline kernel and mostly
just add an extra layer of obfuscation.  Although they have comments,
without actual in-kernel users, it is difficult to tell what are their
assumptions and they're actually trying to achieve.  To mainline
kernel, they just aren't worth keeping around.

This patch kills the following clone and exec related tracehooks.

	tracehook_prepare_clone()
	tracehook_finish_clone()
	tracehook_report_clone()
	tracehook_report_clone_complete()
	tracehook_unsafe_exec()

The changes are mostly trivial - logic is moved to the caller and
comments are merged and adjusted appropriately.

The only exception is in check_unsafe_exec() where LSM_UNSAFE_PTRACE*
are OR'd to bprm->unsafe instead of setting it, which produces the
same result as the field is always zero on entry.  It also tests
p->ptrace instead of (p->ptrace & PT_PTRACED) for consistency, which
also gives the same result.

This doesn't introduce any behavior change.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
 fs/exec.c                 |   7 ++-
 include/linux/tracehook.h | 121 --------------------------------------
 kernel/fork.c             |  41 ++++++++++---
 3 files changed, 38 insertions(+), 131 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index b37030d0a50b..8dca45b0dae8 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1224,7 +1224,12 @@ int check_unsafe_exec(struct linux_binprm *bprm)
 	unsigned n_fs;
 	int res = 0;
 
-	bprm->unsafe = tracehook_unsafe_exec(p);
+	if (p->ptrace) {
+		if (p->ptrace & PT_PTRACE_CAP)
+			bprm->unsafe |= LSM_UNSAFE_PTRACE_CAP;
+		else
+			bprm->unsafe |= LSM_UNSAFE_PTRACE;
+	}
 
 	n_fs = 1;
 	spin_lock(&p->fs->lock);
diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index 8b06d4f2b814..bcc4ca762aee 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -129,27 +129,6 @@ static inline void tracehook_report_syscall_exit(struct pt_regs *regs, int step)
 	ptrace_report_syscall(regs);
 }
 
-/**
- * tracehook_unsafe_exec - check for exec declared unsafe due to tracing
- * @task:		current task doing exec
- *
- * Return %LSM_UNSAFE_* bits applied to an exec because of tracing.
- *
- * @task->signal->cred_guard_mutex is held by the caller through the do_execve().
- */
-static inline int tracehook_unsafe_exec(struct task_struct *task)
-{
-	int unsafe = 0;
-	int ptrace = task->ptrace;
-	if (ptrace & PT_PTRACED) {
-		if (ptrace & PT_PTRACE_CAP)
-			unsafe |= LSM_UNSAFE_PTRACE_CAP;
-		else
-			unsafe |= LSM_UNSAFE_PTRACE;
-	}
-	return unsafe;
-}
-
 /**
  * tracehook_tracer_task - return the task that is tracing the given task
  * @tsk:		task to consider
@@ -168,106 +147,6 @@ static inline struct task_struct *tracehook_tracer_task(struct task_struct *tsk)
 	return NULL;
 }
 
-/**
- * tracehook_prepare_clone - prepare for new child to be cloned
- * @clone_flags:	%CLONE_* flags from clone/fork/vfork system call
- *
- * This is called before a new user task is to be cloned.
- * Its return value will be passed to tracehook_finish_clone().
- *
- * Called with no locks held.
- */
-static inline int tracehook_prepare_clone(unsigned clone_flags)
-{
-	int event = 0;
-
-	if (clone_flags & CLONE_UNTRACED)
-		return 0;
-
-	if (clone_flags & CLONE_VFORK)
-		event = PTRACE_EVENT_VFORK;
-	else if ((clone_flags & CSIGNAL) != SIGCHLD)
-		event = PTRACE_EVENT_CLONE;
-	else
-		event = PTRACE_EVENT_FORK;
-
-	return ptrace_event_enabled(current, event) ? event : 0;
-}
-
-/**
- * tracehook_finish_clone - new child created and being attached
- * @child:		new child task
- * @clone_flags:	%CLONE_* flags from clone/fork/vfork system call
- * @trace:		return value from tracehook_prepare_clone()
- *
- * This is called immediately after adding @child to its parent's children list.
- * The @trace value is that returned by tracehook_prepare_clone().
- *
- * Called with current's siglock and write_lock_irq(&tasklist_lock) held.
- */
-static inline void tracehook_finish_clone(struct task_struct *child,
-					  unsigned long clone_flags, int trace)
-{
-	ptrace_init_task(child, (clone_flags & CLONE_PTRACE) || trace);
-}
-
-/**
- * tracehook_report_clone - in parent, new child is about to start running
- * @regs:		parent's user register state
- * @clone_flags:	flags from parent's system call
- * @pid:		new child's PID in the parent's namespace
- * @child:		new child task
- *
- * Called after a child is set up, but before it has been started running.
- * This is not a good place to block, because the child has not started
- * yet.  Suspend the child here if desired, and then block in
- * tracehook_report_clone_complete().  This must prevent the child from
- * self-reaping if tracehook_report_clone_complete() uses the @child
- * pointer; otherwise it might have died and been released by the time
- * tracehook_report_clone_complete() is called.
- *
- * Called with no locks held, but the child cannot run until this returns.
- */
-static inline void tracehook_report_clone(struct pt_regs *regs,
-					  unsigned long clone_flags,
-					  pid_t pid, struct task_struct *child)
-{
-	if (unlikely(child->ptrace)) {
-		/*
-		 * It doesn't matter who attached/attaching to this
-		 * task, the pending SIGSTOP is right in any case.
-		 */
-		sigaddset(&child->pending.signal, SIGSTOP);
-		set_tsk_thread_flag(child, TIF_SIGPENDING);
-	}
-}
-
-/**
- * tracehook_report_clone_complete - new child is running
- * @trace:		return value from tracehook_prepare_clone()
- * @regs:		parent's user register state
- * @clone_flags:	flags from parent's system call
- * @pid:		new child's PID in the parent's namespace
- * @child:		child task, already running
- *
- * This is called just after the child has started running.  This is
- * just before the clone/fork syscall returns, or blocks for vfork
- * child completion if @clone_flags has the %CLONE_VFORK bit set.
- * The @child pointer may be invalid if a self-reaping child died and
- * tracehook_report_clone() took no action to prevent it from self-reaping.
- *
- * Called with no locks held.
- */
-static inline void tracehook_report_clone_complete(int trace,
-						   struct pt_regs *regs,
-						   unsigned long clone_flags,
-						   pid_t pid,
-						   struct task_struct *child)
-{
-	if (unlikely(trace))
-		ptrace_event(trace, pid);
-}
-
 /**
  * tracehook_signal_handler - signal handler setup is complete
  * @sig:		number of signal being delivered
diff --git a/kernel/fork.c b/kernel/fork.c
index d4f0dff9d617..3c72a5b321a7 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1340,7 +1340,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	}
 
 	if (likely(p->pid)) {
-		tracehook_finish_clone(p, clone_flags, trace);
+		ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
 
 		if (thread_group_leader(p)) {
 			if (is_child_reaper(pid))
@@ -1481,10 +1481,22 @@ long do_fork(unsigned long clone_flags,
 	}
 
 	/*
-	 * When called from kernel_thread, don't do user tracing stuff.
+	 * Determine whether and which event to report to ptracer.  When
+	 * called from kernel_thread or CLONE_UNTRACED is explicitly
+	 * requested, no event is reported; otherwise, report if the event
+	 * for the type of forking is enabled.
 	 */
-	if (likely(user_mode(regs)))
-		trace = tracehook_prepare_clone(clone_flags);
+	if (likely(user_mode(regs)) && !(clone_flags & CLONE_UNTRACED)) {
+		if (clone_flags & CLONE_VFORK)
+			trace = PTRACE_EVENT_VFORK;
+		else if ((clone_flags & CSIGNAL) != SIGCHLD)
+			trace = PTRACE_EVENT_CLONE;
+		else
+			trace = PTRACE_EVENT_FORK;
+
+		if (likely(!ptrace_event_enabled(current, trace)))
+			trace = 0;
+	}
 
 	p = copy_process(clone_flags, stack_start, regs, stack_size,
 			 child_tidptr, NULL, trace);
@@ -1508,20 +1520,31 @@ long do_fork(unsigned long clone_flags,
 		}
 
 		audit_finish_fork(p);
-		tracehook_report_clone(regs, clone_flags, nr, p);
+
+		/*
+		 * Child is ready but hasn't started running yet.  Queue
+		 * SIGSTOP if it's gonna be ptraced - it doesn't matter who
+		 * attached/attaching to this task, the pending SIGSTOP is
+		 * right in any case.
+		 */
+		if (unlikely(p->ptrace)) {
+			sigaddset(&p->pending.signal, SIGSTOP);
+			set_tsk_thread_flag(p, TIF_SIGPENDING);
+		}
 
 		/*
 		 * We set PF_STARTING at creation in case tracing wants to
 		 * use this to distinguish a fully live task from one that
-		 * hasn't gotten to tracehook_report_clone() yet.  Now we
-		 * clear it and set the child going.
+		 * hasn't finished SIGSTOP raising yet.  Now we clear it
+		 * and set the child going.
 		 */
 		p->flags &= ~PF_STARTING;
 
 		wake_up_new_task(p);
 
-		tracehook_report_clone_complete(trace, regs,
-						clone_flags, nr, p);
+		/* forking complete and child started to run, tell ptracer */
+		if (unlikely(trace))
+			ptrace_event(trace, nr);
 
 		if (clone_flags & CLONE_VFORK) {
 			freezer_do_not_count();

From 06d984737bac0545fe20bb5447ee488b95adb531 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 17 Jun 2011 16:50:40 +0200
Subject: [PATCH 20/39] ptrace: s/tracehook_tracer_task()/ptrace_parent()/

tracehook.h is on the way out.  Rename tracehook_tracer_task() to
ptrace_parent() and move it from tracehook.h to ptrace.h.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: John Johansen <john.johansen@canonical.com>
Cc: Stephen Smalley <sds@tycho.nsa.gov>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
 fs/proc/array.c            |  2 +-
 fs/proc/base.c             |  2 +-
 include/linux/ptrace.h     | 18 ++++++++++++++++++
 include/linux/tracehook.h  | 18 ------------------
 security/apparmor/domain.c |  2 +-
 security/selinux/hooks.c   |  4 ++--
 6 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/fs/proc/array.c b/fs/proc/array.c
index 9b45ee84fbcc..3a1dafd228d1 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -172,7 +172,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
 		task_tgid_nr_ns(rcu_dereference(p->real_parent), ns) : 0;
 	tpid = 0;
 	if (pid_alive(p)) {
-		struct task_struct *tracer = tracehook_tracer_task(p);
+		struct task_struct *tracer = ptrace_parent(p);
 		if (tracer)
 			tpid = task_pid_nr_ns(tracer, ns);
 	}
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 14def991d9dd..c883dad74b9a 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -216,7 +216,7 @@ static struct mm_struct *__check_mem_permission(struct task_struct *task)
 	if (task_is_stopped_or_traced(task)) {
 		int match;
 		rcu_read_lock();
-		match = (tracehook_tracer_task(task) == current);
+		match = (ptrace_parent(task) == current);
 		rcu_read_unlock();
 		if (match && ptrace_may_access(task, PTRACE_MODE_ATTACH))
 			return mm;
diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index b546fd6c3506..bb157bdd0c55 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -150,6 +150,24 @@ int generic_ptrace_peekdata(struct task_struct *tsk, unsigned long addr,
 int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr,
 			    unsigned long data);
 
+/**
+ * ptrace_parent - return the task that is tracing the given task
+ * @task: task to consider
+ *
+ * Returns %NULL if no one is tracing @task, or the &struct task_struct
+ * pointer to its tracer.
+ *
+ * Must called under rcu_read_lock().  The pointer returned might be kept
+ * live only by RCU.  During exec, this may be called with task_lock() held
+ * on @task, still held from when check_unsafe_exec() was called.
+ */
+static inline struct task_struct *ptrace_parent(struct task_struct *task)
+{
+	if (unlikely(task->ptrace))
+		return rcu_dereference(task->parent);
+	return NULL;
+}
+
 /**
  * ptrace_event_enabled - test whether a ptrace event is enabled
  * @task: ptracee of interest
diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index bcc4ca762aee..7a1bd12aeffa 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -129,24 +129,6 @@ static inline void tracehook_report_syscall_exit(struct pt_regs *regs, int step)
 	ptrace_report_syscall(regs);
 }
 
-/**
- * tracehook_tracer_task - return the task that is tracing the given task
- * @tsk:		task to consider
- *
- * Returns NULL if no one is tracing @task, or the &struct task_struct
- * pointer to its tracer.
- *
- * Must called under rcu_read_lock().  The pointer returned might be kept
- * live only by RCU.  During exec, this may be called with task_lock()
- * held on @task, still held from when tracehook_unsafe_exec() was called.
- */
-static inline struct task_struct *tracehook_tracer_task(struct task_struct *tsk)
-{
-	if (tsk->ptrace & PT_PTRACED)
-		return rcu_dereference(tsk->parent);
-	return NULL;
-}
-
 /**
  * tracehook_signal_handler - signal handler setup is complete
  * @sig:		number of signal being delivered
diff --git a/security/apparmor/domain.c b/security/apparmor/domain.c
index c825c6e0b636..7312bf9f7afc 100644
--- a/security/apparmor/domain.c
+++ b/security/apparmor/domain.c
@@ -67,7 +67,7 @@ static int may_change_ptraced_domain(struct task_struct *task,
 	int error = 0;
 
 	rcu_read_lock();
-	tracer = tracehook_tracer_task(task);
+	tracer = ptrace_parent(task);
 	if (tracer) {
 		/* released below */
 		cred = get_task_cred(tracer);
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index a0d38459d650..fc07d18ed6fc 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -2048,7 +2048,7 @@ static int selinux_bprm_set_creds(struct linux_binprm *bprm)
 			u32 ptsid = 0;
 
 			rcu_read_lock();
-			tracer = tracehook_tracer_task(current);
+			tracer = ptrace_parent(current);
 			if (likely(tracer != NULL)) {
 				sec = __task_cred(tracer)->security;
 				ptsid = sec->sid;
@@ -5314,7 +5314,7 @@ static int selinux_setprocattr(struct task_struct *p,
 		   Otherwise, leave SID unchanged and fail. */
 		ptsid = 0;
 		task_lock(p);
-		tracer = tracehook_tracer_task(p);
+		tracer = ptrace_parent(p);
 		if (tracer)
 			ptsid = task_sid(tracer);
 		task_unlock(p);

From 53c8f9f199b239668e6b1a907735ee323a0d1ccd Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Wed, 22 Jun 2011 23:08:18 +0200
Subject: [PATCH 21/39] make do_notify_parent() return bool

- change do_notify_parent() to return a boolean, true if the task should
  be reaped because its parent ignores SIGCHLD.

- update the only caller which checks the returned value, exit_notify().

This temporary uglifies exit_notify() even more, will be cleanuped by
the next change.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
---
 include/linux/sched.h |  4 ++--
 kernel/exit.c         |  9 ++++++---
 kernel/signal.c       | 17 +++++++++--------
 3 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 87f7ca7ed6f6..0df7231d9ee0 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2145,7 +2145,7 @@ static inline int dequeue_signal_lock(struct task_struct *tsk, sigset_t *mask, s
 	spin_unlock_irqrestore(&tsk->sighand->siglock, flags);
 
 	return ret;
-}	
+}
 
 extern void block_all_signals(int (*notifier)(void *priv), void *priv,
 			      sigset_t *mask);
@@ -2160,7 +2160,7 @@ extern int kill_pid_info_as_uid(int, struct siginfo *, struct pid *, uid_t, uid_
 extern int kill_pgrp(struct pid *pid, int sig, int priv);
 extern int kill_pid(struct pid *pid, int sig, int priv);
 extern int kill_proc_info(int, struct siginfo *, pid_t);
-extern int do_notify_parent(struct task_struct *, int);
+extern bool do_notify_parent(struct task_struct *, int);
 extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent);
 extern void force_sig(int, struct task_struct *);
 extern int send_sig(int, struct task_struct *, int);
diff --git a/kernel/exit.c b/kernel/exit.c
index d49134a7f250..34d135f4fccc 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -820,6 +820,7 @@ static void forget_original_parent(struct task_struct *father)
 static void exit_notify(struct task_struct *tsk, int group_dead)
 {
 	int signal;
+	bool autoreap;
 	void *cookie;
 
 	/*
@@ -858,9 +859,11 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
 
 	signal = tracehook_notify_death(tsk, &cookie, group_dead);
 	if (signal >= 0)
-		signal = do_notify_parent(tsk, signal);
+		autoreap = do_notify_parent(tsk, signal);
+	else
+		autoreap = (signal == DEATH_REAP);
 
-	tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE;
+	tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE;
 
 	/* mt-exec, de_thread() is waiting for group leader */
 	if (unlikely(tsk->signal->notify_count < 0))
@@ -868,7 +871,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
 	write_unlock_irq(&tasklist_lock);
 
 	/* If the process is dead, release it - nobody will wait for it */
-	if (signal == DEATH_REAP)
+	if (autoreap)
 		release_task(tsk);
 }
 
diff --git a/kernel/signal.c b/kernel/signal.c
index 1550aee34f42..d52e82cd62bb 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1577,15 +1577,15 @@ ret:
  * Let a parent know about the death of a child.
  * For a stopped/continued status change, use do_notify_parent_cldstop instead.
  *
- * Returns -1 if our parent ignored us and so we've switched to
- * self-reaping, or else @sig.
+ * Returns true if our parent ignored us and so we've switched to
+ * self-reaping.
  */
-int do_notify_parent(struct task_struct *tsk, int sig)
+bool do_notify_parent(struct task_struct *tsk, int sig)
 {
 	struct siginfo info;
 	unsigned long flags;
 	struct sighand_struct *psig;
-	int ret = sig;
+	bool autoreap = false;
 
 	BUG_ON(sig == -1);
 
@@ -1649,16 +1649,17 @@ int do_notify_parent(struct task_struct *tsk, int sig)
 		 * is implementation-defined: we do (if you don't want
 		 * it, just use SIG_IGN instead).
 		 */
-		ret = tsk->exit_signal = -1;
+		autoreap = true;
+		tsk->exit_signal = -1;
 		if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN)
-			sig = -1;
+			sig = 0;
 	}
-	if (valid_signal(sig) && sig > 0)
+	if (valid_signal(sig) && sig)
 		__group_send_sig_info(sig, &info, tsk->parent);
 	__wake_up_parent(tsk, tsk->parent);
 	spin_unlock_irqrestore(&psig->siglock, flags);
 
-	return ret;
+	return autoreap;
 }
 
 /**

From 45cdf5cc0703c537194588c63d53bad1f2539d36 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 23 Jun 2011 19:06:50 +0200
Subject: [PATCH 22/39] kill tracehook_notify_death()

Kill tracehook_notify_death(), reimplement the logic in its caller,
exit_notify().

Also, change the exec_id's check to use thread_group_leader() instead
of task_detached(), this is more clear. This logic only applies to
the exiting leader, a sub-thread must never change its exit_signal.

Note: when the traced group leader exits the exit_signal-or-SIGCHLD
logic looks really strange:

	- we notify the tracer even if !thread_group_empty() but
	   do_wait(WEXITED) can't work until all threads exit

	- if the tracer is real_parent, it is not clear why can't
	  we use ->exit_signal event if !thread_group_empty()

-v2: do not try to fix the 2nd oddity to avoid the subtle behavior
     change mixed with reorganization, suggested by Tejun.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Tejun Heo <tj@kernel.org>
---
 include/linux/tracehook.h | 34 ----------------------------------
 kernel/exit.c             | 21 +++++++++++++--------
 2 files changed, 13 insertions(+), 42 deletions(-)

diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index 7a1bd12aeffa..a71a2927a6a0 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -152,40 +152,6 @@ static inline void tracehook_signal_handler(int sig, siginfo_t *info,
 		ptrace_notify(SIGTRAP);
 }
 
-#define DEATH_REAP			-1
-#define DEATH_DELAYED_GROUP_LEADER	-2
-
-/**
- * tracehook_notify_death - task is dead, ready to notify parent
- * @task:		@current task now exiting
- * @death_cookie:	value to pass to tracehook_report_death()
- * @group_dead:		nonzero if this was the last thread in the group to die
- *
- * A return value >= 0 means call do_notify_parent() with that signal
- * number.  Negative return value can be %DEATH_REAP to self-reap right
- * now, or %DEATH_DELAYED_GROUP_LEADER to a zombie without notifying our
- * parent.  Note that a return value of 0 means a do_notify_parent() call
- * that sends no signal, but still wakes up a parent blocked in wait*().
- *
- * Called with write_lock_irq(&tasklist_lock) held.
- */
-static inline int tracehook_notify_death(struct task_struct *task,
-					 void **death_cookie, int group_dead)
-{
-	if (task_detached(task))
-		return task->ptrace ? SIGCHLD : DEATH_REAP;
-
-	/*
-	 * If something other than our normal parent is ptracing us, then
-	 * send it a SIGCHLD instead of honoring exit_signal.  exit_signal
-	 * only has special meaning to our real parent.
-	 */
-	if (thread_group_empty(task) && !ptrace_reparented(task))
-		return task->exit_signal;
-
-	return task->ptrace ? SIGCHLD : DEATH_DELAYED_GROUP_LEADER;
-}
-
 #ifdef TIF_NOTIFY_RESUME
 /**
  * set_notify_resume - cause tracehook_notify_resume() to be called
diff --git a/kernel/exit.c b/kernel/exit.c
index 34d135f4fccc..bb08e938ca74 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -819,9 +819,7 @@ static void forget_original_parent(struct task_struct *father)
  */
 static void exit_notify(struct task_struct *tsk, int group_dead)
 {
-	int signal;
 	bool autoreap;
-	void *cookie;
 
 	/*
 	 * This does two things:
@@ -852,16 +850,23 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
 	 * we have changed execution domain as these two values started
 	 * the same after a fork.
 	 */
-	if (tsk->exit_signal != SIGCHLD && !task_detached(tsk) &&
+	if (thread_group_leader(tsk) && tsk->exit_signal != SIGCHLD &&
 	    (tsk->parent_exec_id != tsk->real_parent->self_exec_id ||
 	     tsk->self_exec_id != tsk->parent_exec_id))
 		tsk->exit_signal = SIGCHLD;
 
-	signal = tracehook_notify_death(tsk, &cookie, group_dead);
-	if (signal >= 0)
-		autoreap = do_notify_parent(tsk, signal);
-	else
-		autoreap = (signal == DEATH_REAP);
+	if (unlikely(tsk->ptrace)) {
+		int sig = thread_group_leader(tsk) &&
+				thread_group_empty(tsk) &&
+				!ptrace_reparented(tsk) ?
+			tsk->exit_signal : SIGCHLD;
+		autoreap = do_notify_parent(tsk, sig);
+	} else if (thread_group_leader(tsk)) {
+		autoreap = thread_group_empty(tsk) &&
+			do_notify_parent(tsk, tsk->exit_signal);
+	} else {
+		autoreap = true;
+	}
 
 	tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE;
 

From 9843a1e977977986d0a4c1000f2229b032572534 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Wed, 22 Jun 2011 23:08:53 +0200
Subject: [PATCH 23/39] __ptrace_detach: avoid task_detached(), check
 do_notify_parent()

__ptrace_detach() relies on the current obscure behaviour of
do_notify_parent(tsk) which changes tsk->exit_signal if this child
should be silently reaped. That is why we check task_detached(), it
is true if the task is sub-thread, or it is the group_leader but
its exit_signal was changed by do_notify_parent().

This is confusing, change the code to rely on !thread_group_leader()
or the value returned by do_notify_parent().

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
---
 kernel/ptrace.c | 33 ++++++++++++++++++---------------
 1 file changed, 18 insertions(+), 15 deletions(-)

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index e18966c1c0da..66a28bd71ef6 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -370,25 +370,28 @@ static int ignoring_children(struct sighand_struct *sigh)
  */
 static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p)
 {
+	bool dead;
+
 	__ptrace_unlink(p);
 
-	if (p->exit_state == EXIT_ZOMBIE) {
-		if (!task_detached(p) && thread_group_empty(p)) {
-			if (!same_thread_group(p->real_parent, tracer))
-				do_notify_parent(p, p->exit_signal);
-			else if (ignoring_children(tracer->sighand)) {
-				__wake_up_parent(p, tracer);
-				p->exit_signal = -1;
-			}
-		}
-		if (task_detached(p)) {
-			/* Mark it as in the process of being reaped. */
-			p->exit_state = EXIT_DEAD;
-			return true;
+	if (p->exit_state != EXIT_ZOMBIE)
+		return false;
+
+	dead = !thread_group_leader(p);
+
+	if (!dead && thread_group_empty(p)) {
+		if (!same_thread_group(p->real_parent, tracer))
+			dead = do_notify_parent(p, p->exit_signal);
+		else if (ignoring_children(tracer->sighand)) {
+			__wake_up_parent(p, tracer);
+			p->exit_signal = -1;
+			dead = true;
 		}
 	}
-
-	return false;
+	/* Mark it as in the process of being reaped. */
+	if (dead)
+		p->exit_state = EXIT_DEAD;
+	return dead;
 }
 
 static int ptrace_detach(struct task_struct *child, unsigned int data)

From 8677347378044ab564470bced2275520efb3670d Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Wed, 22 Jun 2011 23:09:09 +0200
Subject: [PATCH 24/39] make do_notify_parent() __must_check, update the
 callers

Change other callers of do_notify_parent() to check the value it
returns, this makes the subsequent task_detached() unnecessary.
Mark do_notify_parent() as __must_check.

Use thread_group_leader() instead of !task_detached() to check
if we need to notify the real parent in wait_task_zombie().

Remove the stale comment in release_task(). "just for sanity" is
no longer true, we have to set EXIT_DEAD to avoid the races with
do_wait().

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
---
 include/linux/sched.h |  2 +-
 kernel/exit.c         | 29 ++++++++---------------------
 2 files changed, 9 insertions(+), 22 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0df7231d9ee0..0cb4f097f76c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2160,7 +2160,7 @@ extern int kill_pid_info_as_uid(int, struct siginfo *, struct pid *, uid_t, uid_
 extern int kill_pgrp(struct pid *pid, int sig, int priv);
 extern int kill_pid(struct pid *pid, int sig, int priv);
 extern int kill_proc_info(int, struct siginfo *, pid_t);
-extern bool do_notify_parent(struct task_struct *, int);
+extern __must_check bool do_notify_parent(struct task_struct *, int);
 extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent);
 extern void force_sig(int, struct task_struct *);
 extern int send_sig(int, struct task_struct *, int);
diff --git a/kernel/exit.c b/kernel/exit.c
index bb08e938ca74..f68d137ffeb4 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -190,21 +190,12 @@ repeat:
 	leader = p->group_leader;
 	if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) {
 		BUG_ON(task_detached(leader));
-		do_notify_parent(leader, leader->exit_signal);
 		/*
 		 * If we were the last child thread and the leader has
 		 * exited already, and the leader's parent ignores SIGCHLD,
 		 * then we are the one who should release the leader.
-		 *
-		 * do_notify_parent() will have marked it self-reaping in
-		 * that case.
-		 */
-		zap_leader = task_detached(leader);
-
-		/*
-		 * This maintains the invariant that release_task()
-		 * only runs on a task in EXIT_DEAD, just for sanity.
 		 */
+		zap_leader = do_notify_parent(leader, leader->exit_signal);
 		if (zap_leader)
 			leader->exit_state = EXIT_DEAD;
 	}
@@ -766,8 +757,7 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p,
 	/* If it has exited notify the new parent about this child's death. */
 	if (!p->ptrace &&
 	    p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
-		do_notify_parent(p, p->exit_signal);
-		if (task_detached(p)) {
+		if (do_notify_parent(p, p->exit_signal)) {
 			p->exit_state = EXIT_DEAD;
 			list_move_tail(&p->sibling, dead);
 		}
@@ -1351,16 +1341,13 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
 		/* We dropped tasklist, ptracer could die and untrace */
 		ptrace_unlink(p);
 		/*
-		 * If this is not a detached task, notify the parent.
-		 * If it's still not detached after that, don't release
-		 * it now.
+		 * If this is not a sub-thread, notify the parent.
+		 * If parent wants a zombie, don't release it now.
 		 */
-		if (!task_detached(p)) {
-			do_notify_parent(p, p->exit_signal);
-			if (!task_detached(p)) {
-				p->exit_state = EXIT_ZOMBIE;
-				p = NULL;
-			}
+		if (thread_group_leader(p) &&
+		    !do_notify_parent(p, p->exit_signal)) {
+			p->exit_state = EXIT_ZOMBIE;
+			p = NULL;
 		}
 		write_unlock_irq(&tasklist_lock);
 	}

From 0976a03e5ce8ec346e985f21046d7a75bb7fdffd Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Wed, 22 Jun 2011 23:09:39 +0200
Subject: [PATCH 25/39] reparent_leader: check EXIT_DEAD instead of
 task_detached()

Change reparent_leader() to check ->exit_state instead of ->exit_signal,
this matches the similar EXIT_DEAD check in wait_consider_task() and
allows us to cleanup the do_notify_parent/task_detached logic.

task_detached() was really needed during reparenting before 9cd80bbb
"do_wait() optimization: do not place sub-threads on ->children list"
to filter out the sub-threads. After this change task_detached(p) can
only be true if p is the dead group_leader and its parent ignores
SIGCHLD, in this case the caller of do_notify_parent() is going to
reap this task and it should set EXIT_DEAD.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Tejun Heo <tj@kernel.org>
---
 kernel/exit.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/exit.c b/kernel/exit.c
index f68d137ffeb4..2b1ba8048a14 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -742,7 +742,7 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p,
 {
 	list_move_tail(&p->sibling, &p->real_parent->children);
 
-	if (task_detached(p))
+	if (p->exit_state == EXIT_DEAD)
 		return;
 	/*
 	 * If this is a threaded reparent there is no need to

From e550f14dc6322e794d4e70825f63c9c99177ae8b Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Wed, 22 Jun 2011 23:09:54 +0200
Subject: [PATCH 26/39] kill task_detached()

Upadate the last user of task_detached(), wait_task_zombie(), to
use thread_group_leader() and kill task_detached().

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Tejun Heo <tj@kernel.org>
---
 include/linux/sched.h | 5 -----
 kernel/exit.c         | 5 ++---
 2 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0cb4f097f76c..39acee2c8929 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2318,11 +2318,6 @@ static inline int thread_group_empty(struct task_struct *p)
 #define delay_group_leader(p) \
 		(thread_group_leader(p) && !thread_group_empty(p))
 
-static inline int task_detached(struct task_struct *p)
-{
-	return p->exit_signal == -1;
-}
-
 /*
  * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring
  * subscriptions and synchronises with wait4().  Also used in procfs.  Also
diff --git a/kernel/exit.c b/kernel/exit.c
index 2b1ba8048a14..9fa99702645d 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -189,7 +189,6 @@ repeat:
 	zap_leader = 0;
 	leader = p->group_leader;
 	if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) {
-		BUG_ON(task_detached(leader));
 		/*
 		 * If we were the last child thread and the leader has
 		 * exited already, and the leader's parent ignores SIGCHLD,
@@ -1231,9 +1230,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
 	traced = ptrace_reparented(p);
 	/*
 	 * It can be ptraced but not reparented, check
-	 * !task_detached() to filter out sub-threads.
+	 * thread_group_leader() to filter out sub-threads.
 	 */
-	if (likely(!traced) && likely(!task_detached(p))) {
+	if (likely(!traced) && thread_group_leader(p)) {
 		struct signal_struct *psig;
 		struct signal_struct *sig;
 		unsigned long maxrss;

From d4f7c511c1c2a67eb287987cf1ce9554149030e6 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Wed, 22 Jun 2011 23:10:11 +0200
Subject: [PATCH 27/39] do not change dead_task->exit_signal

__ptrace_detach() and do_notify_parent() set task->exit_signal = -1
to mark the task dead. This is no longer needed, nobody checks
exit_signal to detect the EXIT_DEAD task.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Tejun Heo <tj@kernel.org>
---
 kernel/ptrace.c | 1 -
 kernel/signal.c | 1 -
 2 files changed, 2 deletions(-)

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 66a28bd71ef6..d7ccc79454f5 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -384,7 +384,6 @@ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p)
 			dead = do_notify_parent(p, p->exit_signal);
 		else if (ignoring_children(tracer->sighand)) {
 			__wake_up_parent(p, tracer);
-			p->exit_signal = -1;
 			dead = true;
 		}
 	}
diff --git a/kernel/signal.c b/kernel/signal.c
index d52e82cd62bb..4c4ad34caf77 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1650,7 +1650,6 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
 		 * it, just use SIG_IGN instead).
 		 */
 		autoreap = true;
-		tsk->exit_signal = -1;
 		if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN)
 			sig = 0;
 	}

From 087806b1281563e4ae7a5bce3155f894af5f4118 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Wed, 22 Jun 2011 23:10:26 +0200
Subject: [PATCH 28/39] redefine thread_group_leader() as exit_signal >= 0

Change de_thread() to set old_leader->exit_signal = -1. This is
good for the consistency, it is no longer the leader and all
sub-threads have exit_signal = -1 set by copy_process(CLONE_THREAD).

And this allows us to micro-optimize thread_group_leader(), it can
simply check exit_signal >= 0. This also makes sense because we
should move ->group_leader from task_struct to signal_struct.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Tejun Heo <tj@kernel.org>
---
 fs/exec.c             | 1 +
 include/linux/sched.h | 6 ++++--
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index 8dca45b0dae8..c3d517bfdd27 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -963,6 +963,7 @@ static int de_thread(struct task_struct *tsk)
 		leader->group_leader = tsk;
 
 		tsk->exit_signal = SIGCHLD;
+		leader->exit_signal = -1;
 
 		BUG_ON(leader->exit_state != EXIT_ZOMBIE);
 		leader->exit_state = EXIT_DEAD;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 39acee2c8929..b38ed51d5c64 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2284,8 +2284,10 @@ static inline int get_nr_threads(struct task_struct *tsk)
 	return tsk->signal->nr_threads;
 }
 
-/* de_thread depends on thread_group_leader not being a pid based check */
-#define thread_group_leader(p)	(p == p->group_leader)
+static inline bool thread_group_leader(struct task_struct *p)
+{
+	return p->exit_signal >= 0;
+}
 
 /* Do to the insanities of de_thread it is possible for a process
  * to have the pid of the thread group leader without actually being

From 0347e17739095c58c0194fed6a61aced3536d258 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Fri, 24 Jun 2011 17:34:06 +0200
Subject: [PATCH 29/39] ptrace: ptrace_reparented() should check
 same_thread_group()

ptrace_reparented() naively does parent != real_parent, this means
it returns true even if the tracer _is_ the real parent. This is per
process thing, not per-thread. The only reason ->real_parent can
point to the non-leader thread is that we have __WNOTHREAD.

Change it to check !same_thread_group(parent, real_parent).

It has two callers, and in both cases the current check does not
look right.

exit_notify: we should respect ->exit_signal if the exiting leader
is traced by any thread from the parent thread group. It is the
child of the whole group, and we are going to send the signal to
the whole group.

wait_task_zombie: without __WNOTHREAD do_wait() should do the same
for any thread, only sys_ptrace() is "bound" to the single thread.
However do_wait(WEXITED) succeeds but does not release a traced
natural child unless the caller is the tracer.

Test-case:

	void *tfunc(void *arg)
	{
		assert(ptrace(PTRACE_ATTACH, (long)arg, 0,0) == 0);
		pause();
		return NULL;
	}

	int main(void)
	{
		pthread_t thr;
		pid_t pid, stat, ret;

		pid = fork();
		if (!pid) {
			pause();
			assert(0);
		}

		assert(pthread_create(&thr, NULL, tfunc, (void*)(long)pid) == 0);

		assert(waitpid(-1, &stat, 0) == pid);
		assert(WIFSTOPPED(stat));

		kill(pid, SIGKILL);

		assert(waitpid(-1, &stat, 0) == pid);
		assert(WIFSIGNALED(stat) && WTERMSIG(stat) == SIGKILL);

		ret = waitpid(pid, &stat, 0);
		if (ret < 0)
			return 0;

		printf("WTF? %d is dead, but: wait=%d stat=%x\n",
				pid, ret, stat);

		return 1;
	}

Note that the main thread simply does

	pid = fork();
	kill(pid, SIGKILL);

and then without the patch wait4(WEXITED) succeeds twice and reports
WTERMSIG(stat) == SIGKILL.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
---
 include/linux/ptrace.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index bb157bdd0c55..eae381d584f9 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -136,7 +136,7 @@ extern bool ptrace_may_access(struct task_struct *task, unsigned int mode);
 
 static inline int ptrace_reparented(struct task_struct *child)
 {
-	return child->real_parent != child->parent;
+	return !same_thread_group(child->real_parent, child->parent);
 }
 
 static inline void ptrace_unlink(struct task_struct *child)

From bb3696da89743d580f869142d0a6e6ba9b7fe89a Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Fri, 24 Jun 2011 17:34:23 +0200
Subject: [PATCH 30/39] ptrace: kill real_parent_is_ptracer() in in favor of
 ptrace_reparented()

Kill real_parent_is_ptracer() and update the callers to use
ptrace_reparented(), after the previous patch they do the same.

Remove the unnecessary ->ptrace != 0 check in get_signal_to_deliver(),
if ptrace_reparented() == T then the task must be ptraced.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
---
 kernel/signal.c | 20 ++++----------------
 1 file changed, 4 insertions(+), 16 deletions(-)

diff --git a/kernel/signal.c b/kernel/signal.c
index 4c4ad34caf77..0a1bf2c8bdcd 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1759,15 +1759,6 @@ static int sigkill_pending(struct task_struct *tsk)
 		sigismember(&tsk->signal->shared_pending.signal, SIGKILL);
 }
 
-/*
- * Test whether the target task of the usual cldstop notification - the
- * real_parent of @child - is in the same group as the ptracer.
- */
-static bool real_parent_is_ptracer(struct task_struct *child)
-{
-	return same_thread_group(child->parent, child->real_parent);
-}
-
 /*
  * This must be called with current->sighand->siglock held.
  *
@@ -1848,7 +1839,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
 		 * separately unless they're gonna be duplicates.
 		 */
 		do_notify_parent_cldstop(current, true, why);
-		if (gstop_done && !real_parent_is_ptracer(current))
+		if (gstop_done && ptrace_reparented(current))
 			do_notify_parent_cldstop(current, false, why);
 
 		/*
@@ -2154,7 +2145,6 @@ relock:
 	 * the CLD_ si_code into SIGNAL_CLD_MASK bits.
 	 */
 	if (unlikely(signal->flags & SIGNAL_CLD_MASK)) {
-		struct task_struct *leader;
 		int why;
 
 		if (signal->flags & SIGNAL_CLD_CONTINUED)
@@ -2175,13 +2165,11 @@ relock:
 		 * a duplicate.
 		 */
 		read_lock(&tasklist_lock);
-
 		do_notify_parent_cldstop(current, false, why);
 
-		leader = current->group_leader;
-		if (leader->ptrace && !real_parent_is_ptracer(leader))
-			do_notify_parent_cldstop(leader, true, why);
-
+		if (ptrace_reparented(current->group_leader))
+			do_notify_parent_cldstop(current->group_leader,
+						true, why);
 		read_unlock(&tasklist_lock);
 
 		goto relock;

From 479bf98c1c29b40d86e40a4e6e4944c2f03d9493 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Fri, 24 Jun 2011 17:34:39 +0200
Subject: [PATCH 31/39] ptrace: wait_consider_task:
 s/same_thread_group/ptrace_reparented/

wait_consider_task() checks same_thread_group(parent, real_parent),
this is the open-coded ptrace_reparented().

__ptrace_detach() remains the only function which has to check this by
hand, although we could reorganize the code to delay __ptrace_unlink.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
---
 kernel/exit.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/kernel/exit.c b/kernel/exit.c
index 9fa99702645d..b8d3b47bb881 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1599,8 +1599,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
 		 * own children, it should create a separate process which
 		 * takes the role of real parent.
 		 */
-		if (likely(!ptrace) && p->ptrace &&
-		    same_thread_group(p->parent, p->real_parent))
+		if (likely(!ptrace) && p->ptrace && !ptrace_reparented(p))
 			return 0;
 
 		/*

From bb188d7e64deb0e9cf13a99f44ae0065de5352d6 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko <vda.linux@googlemail.com>
Date: Wed, 29 Jun 2011 04:13:39 +0200
Subject: [PATCH 32/39] ptrace: make former thread ID available via
 PTRACE_GETEVENTMSG after PTRACE_EVENT_EXEC stop

When multithreaded program execs under ptrace,
all traced threads report WIFEXITED status, except for
thread group leader and the thread which execs.

Unless tracer tracks thread group relationship between tracees,
which is a nontrivial task, it will not detect that
execed thread no longer exists.

This patch allows tracer to figure out which thread
performed this exec, by requesting PTRACE_GETEVENTMSG
in PTRACE_EVENT_EXEC stop.

Another, samller problem which is solved by this patch
is that tracer now can figure out which of the several
concurrent execs in multithreaded program succeeded.

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
 fs/exec.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/fs/exec.c b/fs/exec.c
index c3d517bfdd27..b08367abf30e 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1358,6 +1358,7 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
 	unsigned int depth = bprm->recursion_depth;
 	int try,retval;
 	struct linux_binfmt *fmt;
+	pid_t old_pid;
 
 	retval = security_bprm_check(bprm);
 	if (retval)
@@ -1371,6 +1372,11 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
 	if (retval)
 		return retval;
 
+	/* Need to fetch pid before load_binary changes it */
+	rcu_read_lock();
+	old_pid = task_pid_nr_ns(current, task_active_pid_ns(current->parent));
+	rcu_read_unlock();
+
 	retval = -ENOENT;
 	for (try=0; try<2; try++) {
 		read_lock(&binfmt_lock);
@@ -1390,7 +1396,8 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
 			bprm->recursion_depth = depth;
 			if (retval >= 0) {
 				if (depth == 0)
-					ptrace_event(PTRACE_EVENT_EXEC, 0);
+					ptrace_event(PTRACE_EVENT_EXEC,
+							old_pid);
 				put_binfmt(fmt);
 				allow_write_access(bprm->file);
 				if (bprm->file)

From 961c4675c75112717705fa5c0c53cb9664051479 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 7 Jul 2011 21:33:54 +0200
Subject: [PATCH 33/39] has_stopped_jobs:
 s/task_is_stopped/SIGNAL_STOP_STOPPED/

has_stopped_jobs() naively checks task_is_stopped(group_leader). This
was always wrong even without ptrace, group_leader can be dead. And
given that ptrace can change the state to TRACED this is wrong even
in the single-threaded case.

Change the code to check SIGNAL_STOP_STOPPED and simplify the code,
retval + break/continue doesn't make this trivial code more readable.

We could probably add the usual "|| signal->group_stop_count" check
but I don't think this makes sense, the task can start the group-stop
right after the check anyway.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
---
 kernel/exit.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/kernel/exit.c b/kernel/exit.c
index b8d3b47bb881..6c7fbbe7d86f 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -266,18 +266,16 @@ int is_current_pgrp_orphaned(void)
 	return retval;
 }
 
-static int has_stopped_jobs(struct pid *pgrp)
+static bool has_stopped_jobs(struct pid *pgrp)
 {
-	int retval = 0;
 	struct task_struct *p;
 
 	do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
-		if (!task_is_stopped(p))
-			continue;
-		retval = 1;
-		break;
+		if (p->signal->flags & SIGNAL_STOP_STOPPED)
+			return true;
 	} while_each_pid_task(pgrp, PIDTYPE_PGID, p);
-	return retval;
+
+	return false;
 }
 
 /*

From 6634ae1033ceaeca5877dd75723210f8c2648c17 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Fri, 8 Jul 2011 19:13:39 +0200
Subject: [PATCH 34/39] ptrace_init_task: initialize child->jobctl explicitly

new_child->jobctl is not initialized during the fork, it is copied
from parent->jobctl. Currently this is harmless, the forking task
is running and copy_process() can't succeed if signal_pending() is
true, so only JOBCTL_STOP_DEQUEUED can be copied. Still this is a
bit fragile, it would be more clean to set ->jobctl = 0 explicitly.

Also, check ->ptrace != 0 instead of PT_PTRACED, move the
CONFIG_HAVE_HW_BREAKPOINT code up.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
---
 include/linux/ptrace.h | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index eae381d584f9..fd8669fc339f 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -217,16 +217,17 @@ static inline void ptrace_init_task(struct task_struct *child, bool ptrace)
 {
 	INIT_LIST_HEAD(&child->ptrace_entry);
 	INIT_LIST_HEAD(&child->ptraced);
-	child->parent = child->real_parent;
-	child->ptrace = 0;
-	if (unlikely(ptrace) && (current->ptrace & PT_PTRACED)) {
-		child->ptrace = current->ptrace;
-		__ptrace_link(child, current->parent);
-	}
-
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
 	atomic_set(&child->ptrace_bp_refcnt, 1);
 #endif
+	child->jobctl = 0;
+	child->ptrace = 0;
+	child->parent = child->real_parent;
+
+	if (unlikely(ptrace) && current->ptrace) {
+		child->ptrace = current->ptrace;
+		__ptrace_link(child, current->parent);
+	}
 }
 
 /**

From dcace06cc29df927a74a6bc0e57b9bef87704377 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Fri, 8 Jul 2011 19:13:54 +0200
Subject: [PATCH 35/39] ptrace: mv send-SIGSTOP from do_fork() to
 ptrace_init_task()

If the new child is traced, do_fork() adds the pending SIGSTOP.
It assumes that either it is traced because of auto-attach or the
tracer attached later, in both cases sigaddset/set_thread_flag is
correct even if SIGSTOP is already pending.

Now that we have PTRACE_SEIZE this is no longer right in the latter
case. If the tracer does PTRACE_SEIZE after copy_process() makes the
child visible the queued SIGSTOP is wrong.

We could check PT_SEIZED bit and change ptrace_attach() to set both
PT_PTRACED and PT_SEIZED bits simultaneously but see the next patch,
we need to know whether this child was auto-attached or not anyway.

So this patch simply moves this code to ptrace_init_task(), this
way we can never race with ptrace_attach().

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
---
 include/linux/ptrace.h |  3 +++
 kernel/fork.c          | 12 ------------
 2 files changed, 3 insertions(+), 12 deletions(-)

diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index fd8669fc339f..9b5d2c901d06 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -227,6 +227,9 @@ static inline void ptrace_init_task(struct task_struct *child, bool ptrace)
 	if (unlikely(ptrace) && current->ptrace) {
 		child->ptrace = current->ptrace;
 		__ptrace_link(child, current->parent);
+
+		sigaddset(&child->pending.signal, SIGSTOP);
+		set_tsk_thread_flag(child, TIF_SIGPENDING);
 	}
 }
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 3c72a5b321a7..4d4117e01504 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -37,7 +37,6 @@
 #include <linux/swap.h>
 #include <linux/syscalls.h>
 #include <linux/jiffies.h>
-#include <linux/tracehook.h>
 #include <linux/futex.h>
 #include <linux/compat.h>
 #include <linux/kthread.h>
@@ -1521,17 +1520,6 @@ long do_fork(unsigned long clone_flags,
 
 		audit_finish_fork(p);
 
-		/*
-		 * Child is ready but hasn't started running yet.  Queue
-		 * SIGSTOP if it's gonna be ptraced - it doesn't matter who
-		 * attached/attaching to this task, the pending SIGSTOP is
-		 * right in any case.
-		 */
-		if (unlikely(p->ptrace)) {
-			sigaddset(&p->pending.signal, SIGSTOP);
-			set_tsk_thread_flag(p, TIF_SIGPENDING);
-		}
-
 		/*
 		 * We set PF_STARTING at creation in case tracing wants to
 		 * use this to distinguish a fully live task from one that

From d184d6eb1dc3c9869e25a8e422be5c55ab0db4ac Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Fri, 8 Jul 2011 19:14:17 +0200
Subject: [PATCH 36/39] ptrace: dont send SIGSTOP on auto-attach if PT_SEIZED

The fake SIGSTOP during attach has numerous problems. PTRACE_SEIZE
is already fine, but we have basically the same problems is SIGSTOP
is sent on auto-attach, the tracer can't know if this signal signal
should be cancelled or not.

Change ptrace_event() to set JOBCTL_TRAP_STOP if the new child is
PT_SEIZED, this triggers the PTRACE_EVENT_STOP report.

Thereafter a PT_SEIZED task can never report the bogus SIGSTOP.

Test-case:

	#define PTRACE_SEIZE		0x4206
	#define PTRACE_SEIZE_DEVEL	0x80000000
	#define PTRACE_EVENT_STOP	7
	#define WEVENT(s)		((s & 0xFF0000) >> 16)

	int main(void)
	{
		int child, grand_child, status;
		long message;

		child = fork();
		if (!child) {
			kill(getpid(), SIGSTOP);
			fork();
			assert(0);
			return 0x23;
		}

		assert(ptrace(PTRACE_SEIZE, child, 0,PTRACE_SEIZE_DEVEL) == 0);
		assert(wait(&status) == child);
		assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP);

		assert(ptrace(PTRACE_SETOPTIONS, child, 0, PTRACE_O_TRACEFORK) == 0);

		assert(ptrace(PTRACE_CONT, child, 0,0) == 0);
		assert(waitpid(child, &status, 0) == child);
		assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGTRAP);
		assert(WEVENT(status) == PTRACE_EVENT_FORK);

		assert(ptrace(PTRACE_GETEVENTMSG, child, 0, &message) == 0);
		grand_child = message;

		assert(waitpid(grand_child, &status, 0) == grand_child);
		assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGTRAP);
		assert(WEVENT(status) == PTRACE_EVENT_STOP);

		kill(child, SIGKILL);
		kill(grand_child, SIGKILL);
		return 0;
	}

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
---
 include/linux/ptrace.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index 9b5d2c901d06..800f113bea66 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -228,7 +228,11 @@ static inline void ptrace_init_task(struct task_struct *child, bool ptrace)
 		child->ptrace = current->ptrace;
 		__ptrace_link(child, current->parent);
 
-		sigaddset(&child->pending.signal, SIGSTOP);
+		if (child->ptrace & PT_SEIZED)
+			task_set_jobctl_pending(child, JOBCTL_TRAP_STOP);
+		else
+			sigaddset(&child->pending.signal, SIGSTOP);
+
 		set_tsk_thread_flag(child, TIF_SIGPENDING);
 	}
 }

From f701e5b73a1a79ea62ffd45d9e2bed4c7d5c1fd2 Mon Sep 17 00:00:00 2001
From: Vladimir Zapolskiy <vzapolskiy@gmail.com>
Date: Fri, 15 Jul 2011 20:45:18 +0300
Subject: [PATCH 37/39] connector: add an event for monitoring process tracers

This change adds a procfs connector event, which is emitted on every
successful process tracer attach or detach.

If some process connects to other one, kernelspace connector reports
process id and thread group id of both these involved processes. On
disconnection null process id is returned.

Such an event allows to create a simple automated userspace mechanism
to be aware about processes connecting to others, therefore predefined
process policies can be applied to them if needed.

Note, a detach signal is emitted only in case, if a tracer process
explicitly executes PTRACE_DETACH request. In other cases like tracee
or tracer exit detach event from proc connector is not reported.

Signed-off-by: Vladimir Zapolskiy <vzapolskiy@gmail.com>
Acked-by: Evgeniy Polyakov <zbr@ioremap.net>
Cc: David S. Miller <davem@davemloft.net>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
 drivers/connector/cn_proc.c | 35 +++++++++++++++++++++++++++++++++++
 include/linux/cn_proc.h     | 13 +++++++++++++
 kernel/ptrace.c             |  7 ++++++-
 3 files changed, 54 insertions(+), 1 deletion(-)

diff --git a/drivers/connector/cn_proc.c b/drivers/connector/cn_proc.c
index 2b46a7efa0ac..281902d3f7ec 100644
--- a/drivers/connector/cn_proc.c
+++ b/drivers/connector/cn_proc.c
@@ -28,6 +28,7 @@
 #include <linux/init.h>
 #include <linux/connector.h>
 #include <linux/gfp.h>
+#include <linux/ptrace.h>
 #include <asm/atomic.h>
 #include <asm/unaligned.h>
 
@@ -166,6 +167,40 @@ void proc_sid_connector(struct task_struct *task)
 	cn_netlink_send(msg, CN_IDX_PROC, GFP_KERNEL);
 }
 
+void proc_ptrace_connector(struct task_struct *task, int ptrace_id)
+{
+	struct cn_msg *msg;
+	struct proc_event *ev;
+	struct timespec ts;
+	__u8 buffer[CN_PROC_MSG_SIZE];
+	struct task_struct *tracer;
+
+	if (atomic_read(&proc_event_num_listeners) < 1)
+		return;
+
+	msg = (struct cn_msg *)buffer;
+	ev = (struct proc_event *)msg->data;
+	get_seq(&msg->seq, &ev->cpu);
+	ktime_get_ts(&ts); /* get high res monotonic timestamp */
+	put_unaligned(timespec_to_ns(&ts), (__u64 *)&ev->timestamp_ns);
+	ev->what = PROC_EVENT_PTRACE;
+	ev->event_data.ptrace.process_pid  = task->pid;
+	ev->event_data.ptrace.process_tgid = task->tgid;
+	if (ptrace_id == PTRACE_ATTACH) {
+		ev->event_data.ptrace.tracer_pid  = current->pid;
+		ev->event_data.ptrace.tracer_tgid = current->tgid;
+	} else if (ptrace_id == PTRACE_DETACH) {
+		ev->event_data.ptrace.tracer_pid  = 0;
+		ev->event_data.ptrace.tracer_tgid = 0;
+	} else
+		return;
+
+	memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id));
+	msg->ack = 0; /* not used */
+	msg->len = sizeof(*ev);
+	cn_netlink_send(msg, CN_IDX_PROC, GFP_KERNEL);
+}
+
 void proc_exit_connector(struct task_struct *task)
 {
 	struct cn_msg *msg;
diff --git a/include/linux/cn_proc.h b/include/linux/cn_proc.h
index 47dac5ea8d3a..12c517b51ca2 100644
--- a/include/linux/cn_proc.h
+++ b/include/linux/cn_proc.h
@@ -53,6 +53,7 @@ struct proc_event {
 		PROC_EVENT_UID  = 0x00000004,
 		PROC_EVENT_GID  = 0x00000040,
 		PROC_EVENT_SID  = 0x00000080,
+		PROC_EVENT_PTRACE = 0x00000100,
 		/* "next" should be 0x00000400 */
 		/* "last" is the last process event: exit */
 		PROC_EVENT_EXIT = 0x80000000
@@ -95,6 +96,13 @@ struct proc_event {
 			__kernel_pid_t process_tgid;
 		} sid;
 
+		struct ptrace_proc_event {
+			__kernel_pid_t process_pid;
+			__kernel_pid_t process_tgid;
+			__kernel_pid_t tracer_pid;
+			__kernel_pid_t tracer_tgid;
+		} ptrace;
+
 		struct exit_proc_event {
 			__kernel_pid_t process_pid;
 			__kernel_pid_t process_tgid;
@@ -109,6 +117,7 @@ void proc_fork_connector(struct task_struct *task);
 void proc_exec_connector(struct task_struct *task);
 void proc_id_connector(struct task_struct *task, int which_id);
 void proc_sid_connector(struct task_struct *task);
+void proc_ptrace_connector(struct task_struct *task, int which_id);
 void proc_exit_connector(struct task_struct *task);
 #else
 static inline void proc_fork_connector(struct task_struct *task)
@@ -124,6 +133,10 @@ static inline void proc_id_connector(struct task_struct *task,
 static inline void proc_sid_connector(struct task_struct *task)
 {}
 
+static inline void proc_ptrace_connector(struct task_struct *task,
+					 int ptrace_id)
+{}
+
 static inline void proc_exit_connector(struct task_struct *task)
 {}
 #endif	/* CONFIG_PROC_EVENTS */
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index d7ccc79454f5..9de3ecfd20f9 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -23,6 +23,7 @@
 #include <linux/uaccess.h>
 #include <linux/regset.h>
 #include <linux/hw_breakpoint.h>
+#include <linux/cn_proc.h>
 
 
 static int ptrace_trapping_sleep_fn(void *flags)
@@ -305,9 +306,12 @@ unlock_tasklist:
 unlock_creds:
 	mutex_unlock(&task->signal->cred_guard_mutex);
 out:
-	if (!retval)
+	if (!retval) {
 		wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT,
 			    ptrace_trapping_sleep_fn, TASK_UNINTERRUPTIBLE);
+		proc_ptrace_connector(task, PTRACE_ATTACH);
+	}
+
 	return retval;
 }
 
@@ -415,6 +419,7 @@ static int ptrace_detach(struct task_struct *child, unsigned int data)
 	}
 	write_unlock_irq(&tasklist_lock);
 
+	proc_ptrace_connector(child, PTRACE_DETACH);
 	if (unlikely(dead))
 		release_task(child);
 

From 8a35241803eeb0e9fd3fe27835d6b2775c73b641 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 21 Jul 2011 17:06:53 +0200
Subject: [PATCH 38/39] ptrace: fix ptrace_signal() && STOP_DEQUEUED
 interaction

Simple test-case,

	int main(void)
	{
		int pid, status;

		pid = fork();
		if (!pid) {
			pause();
			assert(0);
			return 0x23;
		}

		assert(ptrace(PTRACE_ATTACH, pid, 0,0) == 0);
		assert(wait(&status) == pid);
		assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP);

		kill(pid, SIGCONT);	// <--- also clears STOP_DEQUEUD

		assert(ptrace(PTRACE_CONT, pid, 0,0) == 0);
		assert(wait(&status) == pid);
		assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGCONT);

		assert(ptrace(PTRACE_CONT, pid, 0, SIGSTOP) == 0);
		assert(wait(&status) == pid);
		assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP);

		kill(pid, SIGKILL);
		return 0;
	}

Without the patch it hangs. After the patch SIGSTOP "injected" by the
tracer is not ignored and stops the tracee.

Note also that if this test-case uses, say, SIGWINCH instead of SIGCONT,
everything works without the patch. This can't be right, and this is
confusing.

The problem is that SIGSTOP (or any other sig_kernel_stop() signal) has
no effect without JOBCTL_STOP_DEQUEUED. This means it is simply ignored
after PTRACE_CONT unless JOBCTL_STOP_DEQUEUED was set "by accident", say
it wasn't cleared after initial SIGSTOP sent by PTRACE_ATTACH.

At first glance we could change ptrace_signal() to add STOP_DEQUEUED
after return from ptrace_stop(), but this is not right in case when the
tracer does not change the reported SIGSTOP and SIGCONT comes in between.
This is even more wrong with PT_SEIZED, SIGCONT adds JOBCTL_TRAP_NOTIFY
which will be "lost" during the TRAP_STOP | TRAP_NOTIFY report.

So lets add STOP_DEQUEUED _before_ we report the signal. It has no effect
unless sig_kernel_stop() == T after the tracer resumes us, and in the
latter case the pending STOP_DEQUEUED means no SIGCONT in between, we
should stop.

Note also that if SIGCONT was sent, PT_SEIZED tracee will correctly
report PTRACE_EVENT_STOP/SIGTRAP and thus the tracer can notice the fact
SIGSTOP was cancelled.

Also, move the current->ptrace check from ptrace_signal() to its caller,
get_signal_to_deliver(), this looks more natural.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
---
 kernel/signal.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/kernel/signal.c b/kernel/signal.c
index 0a1bf2c8bdcd..c34f8f899b76 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2084,12 +2084,17 @@ static void do_jobctl_trap(void)
 static int ptrace_signal(int signr, siginfo_t *info,
 			 struct pt_regs *regs, void *cookie)
 {
-	if (!current->ptrace)
-		return signr;
-
 	ptrace_signal_deliver(regs, cookie);
-
-	/* Let the debugger run.  */
+	/*
+	 * We do not check sig_kernel_stop(signr) but set this marker
+	 * unconditionally because we do not know whether debugger will
+	 * change signr. This flag has no meaning unless we are going
+	 * to stop after return from ptrace_stop(). In this case it will
+	 * be checked in do_signal_stop(), we should only stop if it was
+	 * not cleared by SIGCONT while we were sleeping. See also the
+	 * comment in dequeue_signal().
+	 */
+	current->jobctl |= JOBCTL_STOP_DEQUEUED;
 	ptrace_stop(signr, CLD_TRAPPED, 0, info);
 
 	/* We're back.  Did the debugger cancel the sig?  */
@@ -2193,7 +2198,7 @@ relock:
 		if (!signr)
 			break; /* will return 0 */
 
-		if (signr != SIGKILL) {
+		if (unlikely(current->ptrace) && signr != SIGKILL) {
 			signr = ptrace_signal(signr, info,
 					      regs, cookie);
 			if (!signr)

From eac1b5e57d7abc836e78fd3fbcf77dbeed01edc9 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 21 Jul 2011 20:00:43 +0200
Subject: [PATCH 39/39] ptrace: do_wait(traced_leader_killed_by_mt_exec) can
 block forever

Test-case:

	void *tfunc(void *arg)
	{
		execvp("true", NULL);
		return NULL;
	}

	int main(void)
	{
		int pid;

		if (fork()) {
			pthread_t t;

			kill(getpid(), SIGSTOP);

			pthread_create(&t, NULL, tfunc, NULL);

			for (;;)
				pause();
		}

		pid = getppid();
		assert(ptrace(PTRACE_ATTACH, pid, 0,0) == 0);

		while (wait(NULL) > 0)
			ptrace(PTRACE_CONT, pid, 0,0);

		return 0;
	}

It is racy, exit_notify() does __wake_up_parent() too. But in the
likely case it triggers the problem: de_thread() does release_task()
and the old leader goes away without the notification, the tracer
sleeps in do_wait() without children/tracees.

Change de_thread() to do __wake_up_parent(traced_leader->parent).
Since it is already EXIT_DEAD we can do this without ptrace_unlink(),
EXIT_DEAD threads do not exist from do_wait's pov.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
---
 fs/exec.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/fs/exec.c b/fs/exec.c
index b08367abf30e..d219541db06c 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -967,6 +967,14 @@ static int de_thread(struct task_struct *tsk)
 
 		BUG_ON(leader->exit_state != EXIT_ZOMBIE);
 		leader->exit_state = EXIT_DEAD;
+
+		/*
+		 * We are going to release_task()->ptrace_unlink() silently,
+		 * the tracer can sleep in do_wait(). EXIT_DEAD guarantees
+		 * the tracer wont't block again waiting for this thread.
+		 */
+		if (unlikely(leader->ptrace))
+			__wake_up_parent(leader, leader->parent);
 		write_unlock_irq(&tasklist_lock);
 
 		release_task(leader);