1
0
Fork 0
remarkable-linux/kernel/signal.c

3142 lines
80 KiB
C
Raw Normal View History

/*
* linux/kernel/signal.c
*
* Copyright (C) 1991, 1992 Linus Torvalds
*
* 1997-11-02 Modified for POSIX.1b signals by Richard Henderson
*
* 2003-06-02 Jim Houston - Concurrent Computer Corp.
* Changes to use preallocated sigqueue structures
* to allow signals to be sent reliably.
*/
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/tty.h>
#include <linux/binfmts.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/ptrace.h>
#include <linux/signal.h>
signal/timer/event: signalfd core This patch series implements the new signalfd() system call. I took part of the original Linus code (and you know how badly it can be broken :), and I added even more breakage ;) Signals are fetched from the same signal queue used by the process, so signalfd will compete with standard kernel delivery in dequeue_signal(). If you want to reliably fetch signals on the signalfd file, you need to block them with sigprocmask(SIG_BLOCK). This seems to be working fine on my Dual Opteron machine. I made a quick test program for it: http://www.xmailserver.org/signafd-test.c The signalfd() system call implements signal delivery into a file descriptor receiver. The signalfd file descriptor if created with the following API: int signalfd(int ufd, const sigset_t *mask, size_t masksize); The "ufd" parameter allows to change an existing signalfd sigmask, w/out going to close/create cycle (Linus idea). Use "ufd" == -1 if you want a brand new signalfd file. The "mask" allows to specify the signal mask of signals that we are interested in. The "masksize" parameter is the size of "mask". The signalfd fd supports the poll(2) and read(2) system calls. The poll(2) will return POLLIN when signals are available to be dequeued. As a direct consequence of supporting the Linux poll subsystem, the signalfd fd can use used together with epoll(2) too. The read(2) system call will return a "struct signalfd_siginfo" structure in the userspace supplied buffer. The return value is the number of bytes copied in the supplied buffer, or -1 in case of error. The read(2) call can also return 0, in case the sighand structure to which the signalfd was attached, has been orphaned. The O_NONBLOCK flag is also supported, and read(2) will return -EAGAIN in case no signal is available. If the size of the buffer passed to read(2) is lower than sizeof(struct signalfd_siginfo), -EINVAL is returned. A read from the signalfd can also return -ERESTARTSYS in case a signal hits the process. The format of the struct signalfd_siginfo is, and the valid fields depends of the (->code & __SI_MASK) value, in the same way a struct siginfo would: struct signalfd_siginfo { __u32 signo; /* si_signo */ __s32 err; /* si_errno */ __s32 code; /* si_code */ __u32 pid; /* si_pid */ __u32 uid; /* si_uid */ __s32 fd; /* si_fd */ __u32 tid; /* si_fd */ __u32 band; /* si_band */ __u32 overrun; /* si_overrun */ __u32 trapno; /* si_trapno */ __s32 status; /* si_status */ __s32 svint; /* si_int */ __u64 svptr; /* si_ptr */ __u64 utime; /* si_utime */ __u64 stime; /* si_stime */ __u64 addr; /* si_addr */ }; [akpm@linux-foundation.org: fix signalfd_copyinfo() on i386] Signed-off-by: Davide Libenzi <davidel@xmailserver.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-05-10 23:23:13 -06:00
#include <linux/signalfd.h>
#include <linux/ratelimit.h>
#include <linux/tracehook.h>
#include <linux/capability.h>
#include <linux/freezer.h>
#include <linux/pid_namespace.h>
#include <linux/nsproxy.h>
#define CREATE_TRACE_POINTS
#include <trace/events/signal.h>
#include <asm/param.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
#include <asm/siginfo.h>
#include "audit.h" /* audit_signal_info() */
/*
* SLAB caches for signal bits.
*/
static struct kmem_cache *sigqueue_cachep;
int print_fatal_signals __read_mostly;
static void __user *sig_handler(struct task_struct *t, int sig)
{
return t->sighand->action[sig - 1].sa.sa_handler;
}
static int sig_handler_ignored(void __user *handler, int sig)
{
/* Is it explicitly or implicitly ignored? */
return handler == SIG_IGN ||
(handler == SIG_DFL && sig_kernel_ignore(sig));
}
static int sig_task_ignored(struct task_struct *t, int sig,
int from_ancestor_ns)
{
void __user *handler;
handler = sig_handler(t, sig);
if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) &&
handler == SIG_DFL && !from_ancestor_ns)
return 1;
return sig_handler_ignored(handler, sig);
}
static int sig_ignored(struct task_struct *t, int sig, int from_ancestor_ns)
{
/*
* Blocked signals are never ignored, since the
* signal handler may change by the time it is
* unblocked.
*/
if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig))
return 0;
if (!sig_task_ignored(t, sig, from_ancestor_ns))
return 0;
/*
* Tracers may want to know about even ignored signals.
*/
signals: remove 'handler' parameter to tracehook functions Container-init must behave like global-init to processes within the container and hence it must be immune to unhandled fatal signals from within the container (i.e SIG_DFL signals that terminate the process). But the same container-init must behave like a normal process to processes in ancestor namespaces and so if it receives the same fatal signal from a process in ancestor namespace, the signal must be processed. Implementing these semantics requires that send_signal() determine pid namespace of the sender but since signals can originate from workqueues/ interrupt-handlers, determining pid namespace of sender may not always be possible or safe. This patchset implements the design/simplified semantics suggested by Oleg Nesterov. The simplified semantics for container-init are: - container-init must never be terminated by a signal from a descendant process. - container-init must never be immune to SIGKILL from an ancestor namespace (so a process in parent namespace must always be able to terminate a descendant container). - container-init may be immune to unhandled fatal signals (like SIGUSR1) even if they are from ancestor namespace. SIGKILL/SIGSTOP are the only reliable signals to a container-init from ancestor namespace. This patch: Based on an earlier patch submitted by Oleg Nesterov and comments from Roland McGrath (http://lkml.org/lkml/2008/11/19/258). The handler parameter is currently unused in the tracehook functions. Besides, the tracehook functions are called with siglock held, so the functions can check the handler if they later need to. Removing the parameter simiplifies changes to sig_ignored() in a follow-on patch. Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com> Acked-by: Roland McGrath <roland@redhat.com> Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru> Cc: "Eric W. Biederman" <ebiederm@xmission.com> Cc: Daniel Lezcano <daniel.lezcano@free.fr> Cc: Ingo Molnar <mingo@elte.hu> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: "H. Peter Anvin" <hpa@zytor.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-04-02 17:58:00 -06:00
return !tracehook_consider_ignored_signal(t, sig);
}
/*
* Re-calculate pending state from the set of locally pending
* signals, globally pending signals, and blocked signals.
*/
static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked)
{
unsigned long ready;
long i;
switch (_NSIG_WORDS) {
default:
for (i = _NSIG_WORDS, ready = 0; --i >= 0 ;)
ready |= signal->sig[i] &~ blocked->sig[i];
break;
case 4: ready = signal->sig[3] &~ blocked->sig[3];
ready |= signal->sig[2] &~ blocked->sig[2];
ready |= signal->sig[1] &~ blocked->sig[1];
ready |= signal->sig[0] &~ blocked->sig[0];
break;
case 2: ready = signal->sig[1] &~ blocked->sig[1];
ready |= signal->sig[0] &~ blocked->sig[0];
break;
case 1: ready = signal->sig[0] &~ blocked->sig[0];
}
return ready != 0;
}
#define PENDING(p,b) has_pending_signals(&(p)->signal, (b))
static int recalc_sigpending_tsk(struct task_struct *t)
{
if ((t->jobctl & JOBCTL_PENDING_MASK) ||
PENDING(&t->pending, &t->blocked) ||
PENDING(&t->signal->shared_pending, &t->blocked)) {
set_tsk_thread_flag(t, TIF_SIGPENDING);
return 1;
}
/*
* We must never clear the flag in another thread, or in current
* when it's possible the current syscall is returning -ERESTART*.
* So we don't clear it here, and only callers who know they should do.
*/
return 0;
}
/*
* After recalculating TIF_SIGPENDING, we need to make sure the task wakes up.
* This is superfluous when called on current, the wakeup is a harmless no-op.
*/
void recalc_sigpending_and_wake(struct task_struct *t)
{
if (recalc_sigpending_tsk(t))
signal_wake_up(t, 0);
}
void recalc_sigpending(void)
{
if (!recalc_sigpending_tsk(current) && !freezing(current))
clear_thread_flag(TIF_SIGPENDING);
}
/* Given the mask, find the first available signal that should be serviced. */
#define SYNCHRONOUS_MASK \
(sigmask(SIGSEGV) | sigmask(SIGBUS) | sigmask(SIGILL) | \
sigmask(SIGTRAP) | sigmask(SIGFPE))
signal/timer/event: signalfd core This patch series implements the new signalfd() system call. I took part of the original Linus code (and you know how badly it can be broken :), and I added even more breakage ;) Signals are fetched from the same signal queue used by the process, so signalfd will compete with standard kernel delivery in dequeue_signal(). If you want to reliably fetch signals on the signalfd file, you need to block them with sigprocmask(SIG_BLOCK). This seems to be working fine on my Dual Opteron machine. I made a quick test program for it: http://www.xmailserver.org/signafd-test.c The signalfd() system call implements signal delivery into a file descriptor receiver. The signalfd file descriptor if created with the following API: int signalfd(int ufd, const sigset_t *mask, size_t masksize); The "ufd" parameter allows to change an existing signalfd sigmask, w/out going to close/create cycle (Linus idea). Use "ufd" == -1 if you want a brand new signalfd file. The "mask" allows to specify the signal mask of signals that we are interested in. The "masksize" parameter is the size of "mask". The signalfd fd supports the poll(2) and read(2) system calls. The poll(2) will return POLLIN when signals are available to be dequeued. As a direct consequence of supporting the Linux poll subsystem, the signalfd fd can use used together with epoll(2) too. The read(2) system call will return a "struct signalfd_siginfo" structure in the userspace supplied buffer. The return value is the number of bytes copied in the supplied buffer, or -1 in case of error. The read(2) call can also return 0, in case the sighand structure to which the signalfd was attached, has been orphaned. The O_NONBLOCK flag is also supported, and read(2) will return -EAGAIN in case no signal is available. If the size of the buffer passed to read(2) is lower than sizeof(struct signalfd_siginfo), -EINVAL is returned. A read from the signalfd can also return -ERESTARTSYS in case a signal hits the process. The format of the struct signalfd_siginfo is, and the valid fields depends of the (->code & __SI_MASK) value, in the same way a struct siginfo would: struct signalfd_siginfo { __u32 signo; /* si_signo */ __s32 err; /* si_errno */ __s32 code; /* si_code */ __u32 pid; /* si_pid */ __u32 uid; /* si_uid */ __s32 fd; /* si_fd */ __u32 tid; /* si_fd */ __u32 band; /* si_band */ __u32 overrun; /* si_overrun */ __u32 trapno; /* si_trapno */ __s32 status; /* si_status */ __s32 svint; /* si_int */ __u64 svptr; /* si_ptr */ __u64 utime; /* si_utime */ __u64 stime; /* si_stime */ __u64 addr; /* si_addr */ }; [akpm@linux-foundation.org: fix signalfd_copyinfo() on i386] Signed-off-by: Davide Libenzi <davidel@xmailserver.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-05-10 23:23:13 -06:00
int next_signal(struct sigpending *pending, sigset_t *mask)
{
unsigned long i, *s, *m, x;
int sig = 0;
s = pending->signal.sig;
m = mask->sig;
/*
* Handle the first word specially: it contains the
* synchronous signals that need to be dequeued first.
*/
x = *s &~ *m;
if (x) {
if (x & SYNCHRONOUS_MASK)
x &= SYNCHRONOUS_MASK;
sig = ffz(~x) + 1;
return sig;
}
switch (_NSIG_WORDS) {
default:
for (i = 1; i < _NSIG_WORDS; ++i) {
x = *++s &~ *++m;
if (!x)
continue;
sig = ffz(~x) + i*_NSIG_BPW + 1;
break;
}
break;
case 2:
x = s[1] &~ m[1];
if (!x)
break;
sig = ffz(~x) + _NSIG_BPW + 1;
break;
case 1:
/* Nothing to do */
break;
}
return sig;
}
static inline void print_dropped_signal(int sig)
{
static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
if (!print_fatal_signals)
return;
if (!__ratelimit(&ratelimit_state))
return;
printk(KERN_INFO "%s/%d: reached RLIMIT_SIGPENDING, dropped signal %d\n",
current->comm, current->pid, sig);
}
/**
* task_set_jobctl_pending - set jobctl pending bits
* @task: target task
* @mask: pending bits to set
*
* Clear @mask from @task->jobctl. @mask must be subset of
* %JOBCTL_PENDING_MASK | %JOBCTL_STOP_CONSUME | %JOBCTL_STOP_SIGMASK |
* %JOBCTL_TRAPPING. If stop signo is being set, the existing signo is
* cleared. If @task is already being killed or exiting, this function
* becomes noop.
*
* CONTEXT:
* Must be called with @task->sighand->siglock held.
*
* RETURNS:
* %true if @mask is set, %false if made noop because @task was dying.
*/
bool task_set_jobctl_pending(struct task_struct *task, unsigned int mask)
{
BUG_ON(mask & ~(JOBCTL_PENDING_MASK | JOBCTL_STOP_CONSUME |
JOBCTL_STOP_SIGMASK | JOBCTL_TRAPPING));
BUG_ON((mask & JOBCTL_TRAPPING) && !(mask & JOBCTL_PENDING_MASK));
if (unlikely(fatal_signal_pending(task) || (task->flags & PF_EXITING)))
return false;
if (mask & JOBCTL_STOP_SIGMASK)
task->jobctl &= ~JOBCTL_STOP_SIGMASK;
task->jobctl |= mask;
return true;
}
ptrace: Clean transitions between TASK_STOPPED and TRACED Currently, if the task is STOPPED on ptrace attach, it's left alone and the state is silently changed to TRACED on the next ptrace call. The behavior breaks the assumption that arch_ptrace_stop() is called before any task is poked by ptrace and is ugly in that a task manipulates the state of another task directly. With GROUP_STOP_PENDING, the transitions between TASK_STOPPED and TRACED can be made clean. The tracer can use the flag to tell the tracee to retry stop on attach and detach. On retry, the tracee will enter the desired state in the correct way. The lower 16bits of task->group_stop is used to remember the signal number which caused the last group stop. This is used while retrying for ptrace attach as the original group_exit_code could have been consumed with wait(2) by then. As the real parent may wait(2) and consume the group_exit_code anytime, the group_exit_code needs to be saved separately so that it can be used when switching from regular sleep to ptrace_stop(). This is recorded in the lower 16bits of task->group_stop. If a task is already stopped and there's no intervening SIGCONT, a ptrace request immediately following a successful PTRACE_ATTACH should always succeed even if the tracer doesn't wait(2) for attach completion; however, with this change, the tracee might still be TASK_RUNNING trying to enter TASK_TRACED which would cause the following request to fail with -ESRCH. This intermediate state is hidden from the ptracer by setting GROUP_STOP_TRAPPING on attach and making ptrace_check_attach() wait for it to clear on its signal->wait_chldexit. Completing the transition or getting killed clears TRAPPING and wakes up the tracer. Note that the STOPPED -> RUNNING -> TRACED transition is still visible to other threads which are in the same group as the ptracer and the reverse transition is visible to all. Please read the comments for details. Oleg: * Spotted a race condition where a task may retry group stop without proper bookkeeping. Fixed by redoing bookkeeping on retry. * Spotted that the transition is visible to userland in several different ways. Most are fixed with GROUP_STOP_TRAPPING. Unhandled corner case is documented. * Pointed out not setting GROUP_STOP_SIGMASK on an already stopped task would result in more consistent behavior. * Pointed out that calling ptrace_stop() from do_signal_stop() in TASK_STOPPED can race with group stop start logic and then confuse the TRAPPING wait in ptrace_check_attach(). ptrace_stop() is now called with TASK_RUNNING. * Suggested using signal->wait_chldexit instead of bit wait. * Spotted a race condition between TRACED transition and clearing of TRAPPING. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Oleg Nesterov <oleg@redhat.com> Cc: Roland McGrath <roland@redhat.com> Cc: Jan Kratochvil <jan.kratochvil@redhat.com>
2011-03-23 03:37:00 -06:00
/**
* task_clear_jobctl_trapping - clear jobctl trapping bit
ptrace: Clean transitions between TASK_STOPPED and TRACED Currently, if the task is STOPPED on ptrace attach, it's left alone and the state is silently changed to TRACED on the next ptrace call. The behavior breaks the assumption that arch_ptrace_stop() is called before any task is poked by ptrace and is ugly in that a task manipulates the state of another task directly. With GROUP_STOP_PENDING, the transitions between TASK_STOPPED and TRACED can be made clean. The tracer can use the flag to tell the tracee to retry stop on attach and detach. On retry, the tracee will enter the desired state in the correct way. The lower 16bits of task->group_stop is used to remember the signal number which caused the last group stop. This is used while retrying for ptrace attach as the original group_exit_code could have been consumed with wait(2) by then. As the real parent may wait(2) and consume the group_exit_code anytime, the group_exit_code needs to be saved separately so that it can be used when switching from regular sleep to ptrace_stop(). This is recorded in the lower 16bits of task->group_stop. If a task is already stopped and there's no intervening SIGCONT, a ptrace request immediately following a successful PTRACE_ATTACH should always succeed even if the tracer doesn't wait(2) for attach completion; however, with this change, the tracee might still be TASK_RUNNING trying to enter TASK_TRACED which would cause the following request to fail with -ESRCH. This intermediate state is hidden from the ptracer by setting GROUP_STOP_TRAPPING on attach and making ptrace_check_attach() wait for it to clear on its signal->wait_chldexit. Completing the transition or getting killed clears TRAPPING and wakes up the tracer. Note that the STOPPED -> RUNNING -> TRACED transition is still visible to other threads which are in the same group as the ptracer and the reverse transition is visible to all. Please read the comments for details. Oleg: * Spotted a race condition where a task may retry group stop without proper bookkeeping. Fixed by redoing bookkeeping on retry. * Spotted that the transition is visible to userland in several different ways. Most are fixed with GROUP_STOP_TRAPPING. Unhandled corner case is documented. * Pointed out not setting GROUP_STOP_SIGMASK on an already stopped task would result in more consistent behavior. * Pointed out that calling ptrace_stop() from do_signal_stop() in TASK_STOPPED can race with group stop start logic and then confuse the TRAPPING wait in ptrace_check_attach(). ptrace_stop() is now called with TASK_RUNNING. * Suggested using signal->wait_chldexit instead of bit wait. * Spotted a race condition between TRACED transition and clearing of TRAPPING. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Oleg Nesterov <oleg@redhat.com> Cc: Roland McGrath <roland@redhat.com> Cc: Jan Kratochvil <jan.kratochvil@redhat.com>
2011-03-23 03:37:00 -06:00
* @task: target task
*
* If JOBCTL_TRAPPING is set, a ptracer is waiting for us to enter TRACED.
* Clear it and wake up the ptracer. Note that we don't need any further
* locking. @task->siglock guarantees that @task->parent points to the
* ptracer.
ptrace: Clean transitions between TASK_STOPPED and TRACED Currently, if the task is STOPPED on ptrace attach, it's left alone and the state is silently changed to TRACED on the next ptrace call. The behavior breaks the assumption that arch_ptrace_stop() is called before any task is poked by ptrace and is ugly in that a task manipulates the state of another task directly. With GROUP_STOP_PENDING, the transitions between TASK_STOPPED and TRACED can be made clean. The tracer can use the flag to tell the tracee to retry stop on attach and detach. On retry, the tracee will enter the desired state in the correct way. The lower 16bits of task->group_stop is used to remember the signal number which caused the last group stop. This is used while retrying for ptrace attach as the original group_exit_code could have been consumed with wait(2) by then. As the real parent may wait(2) and consume the group_exit_code anytime, the group_exit_code needs to be saved separately so that it can be used when switching from regular sleep to ptrace_stop(). This is recorded in the lower 16bits of task->group_stop. If a task is already stopped and there's no intervening SIGCONT, a ptrace request immediately following a successful PTRACE_ATTACH should always succeed even if the tracer doesn't wait(2) for attach completion; however, with this change, the tracee might still be TASK_RUNNING trying to enter TASK_TRACED which would cause the following request to fail with -ESRCH. This intermediate state is hidden from the ptracer by setting GROUP_STOP_TRAPPING on attach and making ptrace_check_attach() wait for it to clear on its signal->wait_chldexit. Completing the transition or getting killed clears TRAPPING and wakes up the tracer. Note that the STOPPED -> RUNNING -> TRACED transition is still visible to other threads which are in the same group as the ptracer and the reverse transition is visible to all. Please read the comments for details. Oleg: * Spotted a race condition where a task may retry group stop without proper bookkeeping. Fixed by redoing bookkeeping on retry. * Spotted that the transition is visible to userland in several different ways. Most are fixed with GROUP_STOP_TRAPPING. Unhandled corner case is documented. * Pointed out not setting GROUP_STOP_SIGMASK on an already stopped task would result in more consistent behavior. * Pointed out that calling ptrace_stop() from do_signal_stop() in TASK_STOPPED can race with group stop start logic and then confuse the TRAPPING wait in ptrace_check_attach(). ptrace_stop() is now called with TASK_RUNNING. * Suggested using signal->wait_chldexit instead of bit wait. * Spotted a race condition between TRACED transition and clearing of TRAPPING. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Oleg Nesterov <oleg@redhat.com> Cc: Roland McGrath <roland@redhat.com> Cc: Jan Kratochvil <jan.kratochvil@redhat.com>
2011-03-23 03:37:00 -06:00
*
* CONTEXT:
* Must be called with @task->sighand->siglock held.
*/
static void task_clear_jobctl_trapping(struct task_struct *task)
ptrace: Clean transitions between TASK_STOPPED and TRACED Currently, if the task is STOPPED on ptrace attach, it's left alone and the state is silently changed to TRACED on the next ptrace call. The behavior breaks the assumption that arch_ptrace_stop() is called before any task is poked by ptrace and is ugly in that a task manipulates the state of another task directly. With GROUP_STOP_PENDING, the transitions between TASK_STOPPED and TRACED can be made clean. The tracer can use the flag to tell the tracee to retry stop on attach and detach. On retry, the tracee will enter the desired state in the correct way. The lower 16bits of task->group_stop is used to remember the signal number which caused the last group stop. This is used while retrying for ptrace attach as the original group_exit_code could have been consumed with wait(2) by then. As the real parent may wait(2) and consume the group_exit_code anytime, the group_exit_code needs to be saved separately so that it can be used when switching from regular sleep to ptrace_stop(). This is recorded in the lower 16bits of task->group_stop. If a task is already stopped and there's no intervening SIGCONT, a ptrace request immediately following a successful PTRACE_ATTACH should always succeed even if the tracer doesn't wait(2) for attach completion; however, with this change, the tracee might still be TASK_RUNNING trying to enter TASK_TRACED which would cause the following request to fail with -ESRCH. This intermediate state is hidden from the ptracer by setting GROUP_STOP_TRAPPING on attach and making ptrace_check_attach() wait for it to clear on its signal->wait_chldexit. Completing the transition or getting killed clears TRAPPING and wakes up the tracer. Note that the STOPPED -> RUNNING -> TRACED transition is still visible to other threads which are in the same group as the ptracer and the reverse transition is visible to all. Please read the comments for details. Oleg: * Spotted a race condition where a task may retry group stop without proper bookkeeping. Fixed by redoing bookkeeping on retry. * Spotted that the transition is visible to userland in several different ways. Most are fixed with GROUP_STOP_TRAPPING. Unhandled corner case is documented. * Pointed out not setting GROUP_STOP_SIGMASK on an already stopped task would result in more consistent behavior. * Pointed out that calling ptrace_stop() from do_signal_stop() in TASK_STOPPED can race with group stop start logic and then confuse the TRAPPING wait in ptrace_check_attach(). ptrace_stop() is now called with TASK_RUNNING. * Suggested using signal->wait_chldexit instead of bit wait. * Spotted a race condition between TRACED transition and clearing of TRAPPING. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Oleg Nesterov <oleg@redhat.com> Cc: Roland McGrath <roland@redhat.com> Cc: Jan Kratochvil <jan.kratochvil@redhat.com>
2011-03-23 03:37:00 -06:00
{
if (unlikely(task->jobctl & JOBCTL_TRAPPING)) {
task->jobctl &= ~JOBCTL_TRAPPING;
wake_up_bit(&task->jobctl, JOBCTL_TRAPPING_BIT);
ptrace: Clean transitions between TASK_STOPPED and TRACED Currently, if the task is STOPPED on ptrace attach, it's left alone and the state is silently changed to TRACED on the next ptrace call. The behavior breaks the assumption that arch_ptrace_stop() is called before any task is poked by ptrace and is ugly in that a task manipulates the state of another task directly. With GROUP_STOP_PENDING, the transitions between TASK_STOPPED and TRACED can be made clean. The tracer can use the flag to tell the tracee to retry stop on attach and detach. On retry, the tracee will enter the desired state in the correct way. The lower 16bits of task->group_stop is used to remember the signal number which caused the last group stop. This is used while retrying for ptrace attach as the original group_exit_code could have been consumed with wait(2) by then. As the real parent may wait(2) and consume the group_exit_code anytime, the group_exit_code needs to be saved separately so that it can be used when switching from regular sleep to ptrace_stop(). This is recorded in the lower 16bits of task->group_stop. If a task is already stopped and there's no intervening SIGCONT, a ptrace request immediately following a successful PTRACE_ATTACH should always succeed even if the tracer doesn't wait(2) for attach completion; however, with this change, the tracee might still be TASK_RUNNING trying to enter TASK_TRACED which would cause the following request to fail with -ESRCH. This intermediate state is hidden from the ptracer by setting GROUP_STOP_TRAPPING on attach and making ptrace_check_attach() wait for it to clear on its signal->wait_chldexit. Completing the transition or getting killed clears TRAPPING and wakes up the tracer. Note that the STOPPED -> RUNNING -> TRACED transition is still visible to other threads which are in the same group as the ptracer and the reverse transition is visible to all. Please read the comments for details. Oleg: * Spotted a race condition where a task may retry group stop without proper bookkeeping. Fixed by redoing bookkeeping on retry. * Spotted that the transition is visible to userland in several different ways. Most are fixed with GROUP_STOP_TRAPPING. Unhandled corner case is documented. * Pointed out not setting GROUP_STOP_SIGMASK on an already stopped task would result in more consistent behavior. * Pointed out that calling ptrace_stop() from do_signal_stop() in TASK_STOPPED can race with group stop start logic and then confuse the TRAPPING wait in ptrace_check_attach(). ptrace_stop() is now called with TASK_RUNNING. * Suggested using signal->wait_chldexit instead of bit wait. * Spotted a race condition between TRACED transition and clearing of TRAPPING. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Oleg Nesterov <oleg@redhat.com> Cc: Roland McGrath <roland@redhat.com> Cc: Jan Kratochvil <jan.kratochvil@redhat.com>
2011-03-23 03:37:00 -06:00
}
}
signal: Fix premature completion of group stop when interfered by ptrace task->signal->group_stop_count is used to track the progress of group stop. It's initialized to the number of tasks which need to stop for group stop to finish and each stopping or trapping task decrements. However, each task doesn't keep track of whether it decremented the counter or not and if woken up before the group stop is complete and stops again, it can decrement the counter multiple times. Please consider the following example code. static void *worker(void *arg) { while (1) ; return NULL; } int main(void) { pthread_t thread; pid_t pid; int i; pid = fork(); if (!pid) { for (i = 0; i < 5; i++) pthread_create(&thread, NULL, worker, NULL); while (1) ; return 0; } ptrace(PTRACE_ATTACH, pid, NULL, NULL); while (1) { waitid(P_PID, pid, NULL, WSTOPPED); ptrace(PTRACE_SINGLESTEP, pid, NULL, (void *)(long)SIGSTOP); } return 0; } The child creates five threads and the parent continuously traps the first thread and whenever the child gets a signal, SIGSTOP is delivered. If an external process sends SIGSTOP to the child, all other threads in the process should reliably stop. However, due to the above bug, the first thread will often end up consuming group_stop_count multiple times and SIGSTOP often ends up stopping none or part of the other four threads. This patch adds a new field task->group_stop which is protected by siglock and uses GROUP_STOP_CONSUME flag to track which task is still to consume group_stop_count to fix this bug. task_clear_group_stop_pending() and task_participate_group_stop() are added to help manipulating group stop states. As ptrace_stop() now also uses task_participate_group_stop(), it will set SIGNAL_STOP_STOPPED if it completes a group stop. There still are many issues regarding the interaction between group stop and ptrace. Patches to address them will follow. - Oleg spotted duplicate GROUP_STOP_CONSUME. Dropped. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Oleg Nesterov <oleg@redhat.com> Cc: Roland McGrath <roland@redhat.com>
2011-03-23 03:37:00 -06:00
/**
* task_clear_jobctl_pending - clear jobctl pending bits
signal: Fix premature completion of group stop when interfered by ptrace task->signal->group_stop_count is used to track the progress of group stop. It's initialized to the number of tasks which need to stop for group stop to finish and each stopping or trapping task decrements. However, each task doesn't keep track of whether it decremented the counter or not and if woken up before the group stop is complete and stops again, it can decrement the counter multiple times. Please consider the following example code. static void *worker(void *arg) { while (1) ; return NULL; } int main(void) { pthread_t thread; pid_t pid; int i; pid = fork(); if (!pid) { for (i = 0; i < 5; i++) pthread_create(&thread, NULL, worker, NULL); while (1) ; return 0; } ptrace(PTRACE_ATTACH, pid, NULL, NULL); while (1) { waitid(P_PID, pid, NULL, WSTOPPED); ptrace(PTRACE_SINGLESTEP, pid, NULL, (void *)(long)SIGSTOP); } return 0; } The child creates five threads and the parent continuously traps the first thread and whenever the child gets a signal, SIGSTOP is delivered. If an external process sends SIGSTOP to the child, all other threads in the process should reliably stop. However, due to the above bug, the first thread will often end up consuming group_stop_count multiple times and SIGSTOP often ends up stopping none or part of the other four threads. This patch adds a new field task->group_stop which is protected by siglock and uses GROUP_STOP_CONSUME flag to track which task is still to consume group_stop_count to fix this bug. task_clear_group_stop_pending() and task_participate_group_stop() are added to help manipulating group stop states. As ptrace_stop() now also uses task_participate_group_stop(), it will set SIGNAL_STOP_STOPPED if it completes a group stop. There still are many issues regarding the interaction between group stop and ptrace. Patches to address them will follow. - Oleg spotted duplicate GROUP_STOP_CONSUME. Dropped. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Oleg Nesterov <oleg@redhat.com> Cc: Roland McGrath <roland@redhat.com>
2011-03-23 03:37:00 -06:00
* @task: target task
* @mask: pending bits to clear
signal: Fix premature completion of group stop when interfered by ptrace task->signal->group_stop_count is used to track the progress of group stop. It's initialized to the number of tasks which need to stop for group stop to finish and each stopping or trapping task decrements. However, each task doesn't keep track of whether it decremented the counter or not and if woken up before the group stop is complete and stops again, it can decrement the counter multiple times. Please consider the following example code. static void *worker(void *arg) { while (1) ; return NULL; } int main(void) { pthread_t thread; pid_t pid; int i; pid = fork(); if (!pid) { for (i = 0; i < 5; i++) pthread_create(&thread, NULL, worker, NULL); while (1) ; return 0; } ptrace(PTRACE_ATTACH, pid, NULL, NULL); while (1) { waitid(P_PID, pid, NULL, WSTOPPED); ptrace(PTRACE_SINGLESTEP, pid, NULL, (void *)(long)SIGSTOP); } return 0; } The child creates five threads and the parent continuously traps the first thread and whenever the child gets a signal, SIGSTOP is delivered. If an external process sends SIGSTOP to the child, all other threads in the process should reliably stop. However, due to the above bug, the first thread will often end up consuming group_stop_count multiple times and SIGSTOP often ends up stopping none or part of the other four threads. This patch adds a new field task->group_stop which is protected by siglock and uses GROUP_STOP_CONSUME flag to track which task is still to consume group_stop_count to fix this bug. task_clear_group_stop_pending() and task_participate_group_stop() are added to help manipulating group stop states. As ptrace_stop() now also uses task_participate_group_stop(), it will set SIGNAL_STOP_STOPPED if it completes a group stop. There still are many issues regarding the interaction between group stop and ptrace. Patches to address them will follow. - Oleg spotted duplicate GROUP_STOP_CONSUME. Dropped. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Oleg Nesterov <oleg@redhat.com> Cc: Roland McGrath <roland@redhat.com>
2011-03-23 03:37:00 -06:00
*
* Clear @mask from @task->jobctl. @mask must be subset of
* %JOBCTL_PENDING_MASK. If %JOBCTL_STOP_PENDING is being cleared, other
* STOP bits are cleared together.
signal: Fix premature completion of group stop when interfered by ptrace task->signal->group_stop_count is used to track the progress of group stop. It's initialized to the number of tasks which need to stop for group stop to finish and each stopping or trapping task decrements. However, each task doesn't keep track of whether it decremented the counter or not and if woken up before the group stop is complete and stops again, it can decrement the counter multiple times. Please consider the following example code. static void *worker(void *arg) { while (1) ; return NULL; } int main(void) { pthread_t thread; pid_t pid; int i; pid = fork(); if (!pid) { for (i = 0; i < 5; i++) pthread_create(&thread, NULL, worker, NULL); while (1) ; return 0; } ptrace(PTRACE_ATTACH, pid, NULL, NULL); while (1) { waitid(P_PID, pid, NULL, WSTOPPED); ptrace(PTRACE_SINGLESTEP, pid, NULL, (void *)(long)SIGSTOP); } return 0; } The child creates five threads and the parent continuously traps the first thread and whenever the child gets a signal, SIGSTOP is delivered. If an external process sends SIGSTOP to the child, all other threads in the process should reliably stop. However, due to the above bug, the first thread will often end up consuming group_stop_count multiple times and SIGSTOP often ends up stopping none or part of the other four threads. This patch adds a new field task->group_stop which is protected by siglock and uses GROUP_STOP_CONSUME flag to track which task is still to consume group_stop_count to fix this bug. task_clear_group_stop_pending() and task_participate_group_stop() are added to help manipulating group stop states. As ptrace_stop() now also uses task_participate_group_stop(), it will set SIGNAL_STOP_STOPPED if it completes a group stop. There still are many issues regarding the interaction between group stop and ptrace. Patches to address them will follow. - Oleg spotted duplicate GROUP_STOP_CONSUME. Dropped. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Oleg Nesterov <oleg@redhat.com> Cc: Roland McGrath <roland@redhat.com>
2011-03-23 03:37:00 -06:00
*
* If clearing of @mask leaves no stop or trap pending, this function calls
* task_clear_jobctl_trapping().
*
signal: Fix premature completion of group stop when interfered by ptrace task->signal->group_stop_count is used to track the progress of group stop. It's initialized to the number of tasks which need to stop for group stop to finish and each stopping or trapping task decrements. However, each task doesn't keep track of whether it decremented the counter or not and if woken up before the group stop is complete and stops again, it can decrement the counter multiple times. Please consider the following example code. static void *worker(void *arg) { while (1) ; return NULL; } int main(void) { pthread_t thread; pid_t pid; int i; pid = fork(); if (!pid) { for (i = 0; i < 5; i++) pthread_create(&thread, NULL, worker, NULL); while (1) ; return 0; } ptrace(PTRACE_ATTACH, pid, NULL, NULL); while (1) { waitid(P_PID, pid, NULL, WSTOPPED); ptrace(PTRACE_SINGLESTEP, pid, NULL, (void *)(long)SIGSTOP); } return 0; } The child creates five threads and the parent continuously traps the first thread and whenever the child gets a signal, SIGSTOP is delivered. If an external process sends SIGSTOP to the child, all other threads in the process should reliably stop. However, due to the above bug, the first thread will often end up consuming group_stop_count multiple times and SIGSTOP often ends up stopping none or part of the other four threads. This patch adds a new field task->group_stop which is protected by siglock and uses GROUP_STOP_CONSUME flag to track which task is still to consume group_stop_count to fix this bug. task_clear_group_stop_pending() and task_participate_group_stop() are added to help manipulating group stop states. As ptrace_stop() now also uses task_participate_group_stop(), it will set SIGNAL_STOP_STOPPED if it completes a group stop. There still are many issues regarding the interaction between group stop and ptrace. Patches to address them will follow. - Oleg spotted duplicate GROUP_STOP_CONSUME. Dropped. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Oleg Nesterov <oleg@redhat.com> Cc: Roland McGrath <roland@redhat.com>
2011-03-23 03:37:00 -06:00
* CONTEXT:
* Must be called with @task->sighand->siglock held.
*/
void task_clear_jobctl_pending(struct task_struct *task, unsigned int mask)
signal: Fix premature completion of group stop when interfered by ptrace task->signal->group_stop_count is used to track the progress of group stop. It's initialized to the number of tasks which need to stop for group stop to finish and each stopping or trapping task decrements. However, each task doesn't keep track of whether it decremented the counter or not and if woken up before the group stop is complete and stops again, it can decrement the counter multiple times. Please consider the following example code. static void *worker(void *arg) { while (1) ; return NULL; } int main(void) { pthread_t thread; pid_t pid; int i; pid = fork(); if (!pid) { for (i = 0; i < 5; i++) pthread_create(&thread, NULL, worker, NULL); while (1) ; return 0; } ptrace(PTRACE_ATTACH, pid, NULL, NULL); while (1) { waitid(P_PID, pid, NULL, WSTOPPED); ptrace(PTRACE_SINGLESTEP, pid, NULL, (void *)(long)SIGSTOP); } return 0; } The child creates five threads and the parent continuously traps the first thread and whenever the child gets a signal, SIGSTOP is delivered. If an external process sends SIGSTOP to the child, all other threads in the process should reliably stop. However, due to the above bug, the first thread will often end up consuming group_stop_count multiple times and SIGSTOP often ends up stopping none or part of the other four threads. This patch adds a new field task->group_stop which is protected by siglock and uses GROUP_STOP_CONSUME flag to track which task is still to consume group_stop_count to fix this bug. task_clear_group_stop_pending() and task_participate_group_stop() are added to help manipulating group stop states. As ptrace_stop() now also uses task_participate_group_stop(), it will set SIGNAL_STOP_STOPPED if it completes a group stop. There still are many issues regarding the interaction between group stop and ptrace. Patches to address them will follow. - Oleg spotted duplicate GROUP_STOP_CONSUME. Dropped. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Oleg Nesterov <oleg@redhat.com> Cc: Roland McGrath <roland@redhat.com>
2011-03-23 03:37:00 -06:00
{
BUG_ON(mask & ~JOBCTL_PENDING_MASK);
if (mask & JOBCTL_STOP_PENDING)
mask |= JOBCTL_STOP_CONSUME | JOBCTL_STOP_DEQUEUED;
task->jobctl &= ~mask;
if (!(task->jobctl & JOBCTL_PENDING_MASK))
task_clear_jobctl_trapping(task);
signal: Fix premature completion of group stop when interfered by ptrace task->signal->group_stop_count is used to track the progress of group stop. It's initialized to the number of tasks which need to stop for group stop to finish and each stopping or trapping task decrements. However, each task doesn't keep track of whether it decremented the counter or not and if woken up before the group stop is complete and stops again, it can decrement the counter multiple times. Please consider the following example code. static void *worker(void *arg) { while (1) ; return NULL; } int main(void) { pthread_t thread; pid_t pid; int i; pid = fork(); if (!pid) { for (i = 0; i < 5; i++) pthread_create(&thread, NULL, worker, NULL); while (1) ; return 0; } ptrace(PTRACE_ATTACH, pid, NULL, NULL); while (1) { waitid(P_PID, pid, NULL, WSTOPPED); ptrace(PTRACE_SINGLESTEP, pid, NULL, (void *)(long)SIGSTOP); } return 0; } The child creates five threads and the parent continuously traps the first thread and whenever the child gets a signal, SIGSTOP is delivered. If an external process sends SIGSTOP to the child, all other threads in the process should reliably stop. However, due to the above bug, the first thread will often end up consuming group_stop_count multiple times and SIGSTOP often ends up stopping none or part of the other four threads. This patch adds a new field task->group_stop which is protected by siglock and uses GROUP_STOP_CONSUME flag to track which task is still to consume group_stop_count to fix this bug. task_clear_group_stop_pending() and task_participate_group_stop() are added to help manipulating group stop states. As ptrace_stop() now also uses task_participate_group_stop(), it will set SIGNAL_STOP_STOPPED if it completes a group stop. There still are many issues regarding the interaction between group stop and ptrace. Patches to address them will follow. - Oleg spotted duplicate GROUP_STOP_CONSUME. Dropped. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Oleg Nesterov <oleg@redhat.com> Cc: Roland McGrath <roland@redhat.com>
2011-03-23 03:37:00 -06:00
}
/**
* task_participate_group_stop - participate in a group stop
* @task: task participating in a group stop
*
* @task has %JOBCTL_STOP_PENDING set and is participating in a group stop.
signal: Use GROUP_STOP_PENDING to stop once for a single group stop Currently task->signal->group_stop_count is used to decide whether to stop for group stop. However, if there is a task in the group which is taking a long time to stop, other tasks which are continued by ptrace would repeatedly stop for the same group stop until the group stop is complete. Conversely, if a ptraced task is in TASK_TRACED state, the debugger won't get notified of group stops which is inconsistent compared to the ptraced task in any other state. This patch introduces GROUP_STOP_PENDING which tracks whether a task is yet to stop for the group stop in progress. The flag is set when a group stop starts and cleared when the task stops the first time for the group stop, and consulted whenever whether the task should participate in a group stop needs to be determined. Note that now tasks in TASK_TRACED also participate in group stop. This results in the following behavior changes. * For a single group stop, a ptracer would see at most one stop reported. * A ptracee in TASK_TRACED now also participates in group stop and the tracer would get the notification. However, as a ptraced task could be in TASK_STOPPED state or any ptrace trap could consume group stop, the notification may still be missing. These will be addressed with further patches. * A ptracee may start a group stop while one is still in progress if the tracer let it continue with stop signal delivery. Group stop code handles this correctly. Oleg: * Spotted that a task might skip signal check even when its GROUP_STOP_PENDING is set. Fixed by updating recalc_sigpending_tsk() to check GROUP_STOP_PENDING instead of group_stop_count. * Pointed out that task->group_stop should be cleared whenever task->signal->group_stop_count is cleared. Fixed accordingly. * Pointed out the behavior inconsistency between TASK_TRACED and RUNNING and the last behavior change. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Oleg Nesterov <oleg@redhat.com> Cc: Roland McGrath <roland@redhat.com>
2011-03-23 03:37:00 -06:00
* Group stop states are cleared and the group stop count is consumed if
* %JOBCTL_STOP_CONSUME was set. If the consumption completes the group
signal: Use GROUP_STOP_PENDING to stop once for a single group stop Currently task->signal->group_stop_count is used to decide whether to stop for group stop. However, if there is a task in the group which is taking a long time to stop, other tasks which are continued by ptrace would repeatedly stop for the same group stop until the group stop is complete. Conversely, if a ptraced task is in TASK_TRACED state, the debugger won't get notified of group stops which is inconsistent compared to the ptraced task in any other state. This patch introduces GROUP_STOP_PENDING which tracks whether a task is yet to stop for the group stop in progress. The flag is set when a group stop starts and cleared when the task stops the first time for the group stop, and consulted whenever whether the task should participate in a group stop needs to be determined. Note that now tasks in TASK_TRACED also participate in group stop. This results in the following behavior changes. * For a single group stop, a ptracer would see at most one stop reported. * A ptracee in TASK_TRACED now also participates in group stop and the tracer would get the notification. However, as a ptraced task could be in TASK_STOPPED state or any ptrace trap could consume group stop, the notification may still be missing. These will be addressed with further patches. * A ptracee may start a group stop while one is still in progress if the tracer let it continue with stop signal delivery. Group stop code handles this correctly. Oleg: * Spotted that a task might skip signal check even when its GROUP_STOP_PENDING is set. Fixed by updating recalc_sigpending_tsk() to check GROUP_STOP_PENDING instead of group_stop_count. * Pointed out that task->group_stop should be cleared whenever task->signal->group_stop_count is cleared. Fixed accordingly. * Pointed out the behavior inconsistency between TASK_TRACED and RUNNING and the last behavior change. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Oleg Nesterov <oleg@redhat.com> Cc: Roland McGrath <roland@redhat.com>
2011-03-23 03:37:00 -06:00
* stop, the appropriate %SIGNAL_* flags are set.
signal: Fix premature completion of group stop when interfered by ptrace task->signal->group_stop_count is used to track the progress of group stop. It's initialized to the number of tasks which need to stop for group stop to finish and each stopping or trapping task decrements. However, each task doesn't keep track of whether it decremented the counter or not and if woken up before the group stop is complete and stops again, it can decrement the counter multiple times. Please consider the following example code. static void *worker(void *arg) { while (1) ; return NULL; } int main(void) { pthread_t thread; pid_t pid; int i; pid = fork(); if (!pid) { for (i = 0; i < 5; i++) pthread_create(&thread, NULL, worker, NULL); while (1) ; return 0; } ptrace(PTRACE_ATTACH, pid, NULL, NULL); while (1) { waitid(P_PID, pid, NULL, WSTOPPED); ptrace(PTRACE_SINGLESTEP, pid, NULL, (void *)(long)SIGSTOP); } return 0; } The child creates five threads and the parent continuously traps the first thread and whenever the child gets a signal, SIGSTOP is delivered. If an external process sends SIGSTOP to the child, all other threads in the process should reliably stop. However, due to the above bug, the first thread will often end up consuming group_stop_count multiple times and SIGSTOP often ends up stopping none or part of the other four threads. This patch adds a new field task->group_stop which is protected by siglock and uses GROUP_STOP_CONSUME flag to track which task is still to consume group_stop_count to fix this bug. task_clear_group_stop_pending() and task_participate_group_stop() are added to help manipulating group stop states. As ptrace_stop() now also uses task_participate_group_stop(), it will set SIGNAL_STOP_STOPPED if it completes a group stop. There still are many issues regarding the interaction between group stop and ptrace. Patches to address them will follow. - Oleg spotted duplicate GROUP_STOP_CONSUME. Dropped. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Oleg Nesterov <oleg@redhat.com> Cc: Roland McGrath <roland@redhat.com>
2011-03-23 03:37:00 -06:00
*
* CONTEXT:
* Must be called with @task->sighand->siglock held.
job control: Don't send duplicate job control stop notification while ptraced Just as group_exit_code shouldn't be generated when a PTRACE_CONT'd task re-enters job control stop, notifiction for the event should be suppressed too. The logic is the same as the group_exit_code generation suppression in do_signal_stop(), if SIGNAL_STOP_STOPPED is already set, the task is re-entering job control stop without intervening SIGCONT and the notifications should be suppressed. Test case follows. #include <stdio.h> #include <unistd.h> #include <signal.h> #include <time.h> #include <sys/ptrace.h> #include <sys/wait.h> static const struct timespec ts100ms = { .tv_nsec = 100000000 }; static pid_t tracee, tracer; static const char *pid_who(pid_t pid) { return pid == tracee ? "tracee" : (pid == tracer ? "tracer" : "mommy "); } static void sigchld_sigaction(int signo, siginfo_t *si, void *ucxt) { printf("%s: SIG status=%02d code=%02d (%s)\n", pid_who(getpid()), si->si_status, si->si_code, pid_who(si->si_pid)); } int main(void) { const struct sigaction chld_sa = { .sa_sigaction = sigchld_sigaction, .sa_flags = SA_SIGINFO|SA_RESTART }; siginfo_t si; sigaction(SIGCHLD, &chld_sa, NULL); tracee = fork(); if (!tracee) { tracee = getpid(); while (1) pause(); } kill(tracee, SIGSTOP); waitid(P_PID, tracee, &si, WSTOPPED); tracer = fork(); if (!tracer) { tracer = getpid(); ptrace(PTRACE_ATTACH, tracee, NULL, NULL); waitid(P_PID, tracee, &si, WSTOPPED); ptrace(PTRACE_CONT, tracee, NULL, (void *)(long)si.si_status); waitid(P_PID, tracee, &si, WSTOPPED); ptrace(PTRACE_CONT, tracee, NULL, (void *)(long)si.si_status); waitid(P_PID, tracee, &si, WSTOPPED); printf("tracer: detaching\n"); ptrace(PTRACE_DETACH, tracee, NULL, NULL); return 0; } while (1) pause(); return 0; } Before the patch, the parent gets the second notification for the tracee after the tracer detaches. si_status is zero because group_exit_code is not set by the group stop completion which triggered this notification. mommy : SIG status=19 code=05 (tracee) tracer: SIG status=00 code=05 (tracee) tracer: SIG status=19 code=04 (tracee) tracer: SIG status=00 code=05 (tracee) tracer: detaching mommy : SIG status=00 code=05 (tracee) mommy : SIG status=00 code=01 (tracer) ^C After the patch, the duplicate notification is gone. mommy : SIG status=19 code=05 (tracee) tracer: SIG status=00 code=05 (tracee) tracer: SIG status=19 code=04 (tracee) tracer: SIG status=00 code=05 (tracee) tracer: detaching mommy : SIG status=00 code=01 (tracer) ^C Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Oleg Nesterov <oleg@redhat.com>
2011-03-23 03:37:01 -06:00
*
* RETURNS:
* %true if group stop completion should be notified to the parent, %false
* otherwise.
signal: Fix premature completion of group stop when interfered by ptrace task->signal->group_stop_count is used to track the progress of group stop. It's initialized to the number of tasks which need to stop for group stop to finish and each stopping or trapping task decrements. However, each task doesn't keep track of whether it decremented the counter or not and if woken up before the group stop is complete and stops again, it can decrement the counter multiple times. Please consider the following example code. static void *worker(void *arg) { while (1) ; return NULL; } int main(void) { pthread_t thread; pid_t pid; int i; pid = fork(); if (!pid) { for (i = 0; i < 5; i++) pthread_create(&thread, NULL, worker, NULL); while (1) ; return 0; } ptrace(PTRACE_ATTACH, pid, NULL, NULL); while (1) { waitid(P_PID, pid, NULL, WSTOPPED); ptrace(PTRACE_SINGLESTEP, pid, NULL, (void *)(long)SIGSTOP); } return 0; } The child creates five threads and the parent continuously traps the first thread and whenever the child gets a signal, SIGSTOP is delivered. If an external process sends SIGSTOP to the child, all other threads in the process should reliably stop. However, due to the above bug, the first thread will often end up consuming group_stop_count multiple times and SIGSTOP often ends up stopping none or part of the other four threads. This patch adds a new field task->group_stop which is protected by siglock and uses GROUP_STOP_CONSUME flag to track which task is still to consume group_stop_count to fix this bug. task_clear_group_stop_pending() and task_participate_group_stop() are added to help manipulating group stop states. As ptrace_stop() now also uses task_participate_group_stop(), it will set SIGNAL_STOP_STOPPED if it completes a group stop. There still are many issues regarding the interaction between group stop and ptrace. Patches to address them will follow. - Oleg spotted duplicate GROUP_STOP_CONSUME. Dropped. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Oleg Nesterov <oleg@redhat.com> Cc: Roland McGrath <roland@redhat.com>
2011-03-23 03:37:00 -06:00
*/
static bool task_participate_group_stop(struct task_struct *task)
{
struct signal_struct *sig = task->signal;
bool consume = task->jobctl & JOBCTL_STOP_CONSUME;
signal: Fix premature completion of group stop when interfered by ptrace task->signal->group_stop_count is used to track the progress of group stop. It's initialized to the number of tasks which need to stop for group stop to finish and each stopping or trapping task decrements. However, each task doesn't keep track of whether it decremented the counter or not and if woken up before the group stop is complete and stops again, it can decrement the counter multiple times. Please consider the following example code. static void *worker(void *arg) { while (1) ; return NULL; } int main(void) { pthread_t thread; pid_t pid; int i; pid = fork(); if (!pid) { for (i = 0; i < 5; i++) pthread_create(&thread, NULL, worker, NULL); while (1) ; return 0; } ptrace(PTRACE_ATTACH, pid, NULL, NULL); while (1) { waitid(P_PID, pid, NULL, WSTOPPED); ptrace(PTRACE_SINGLESTEP, pid, NULL, (void *)(long)SIGSTOP); } return 0; } The child creates five threads and the parent continuously traps the first thread and whenever the child gets a signal, SIGSTOP is delivered. If an external process sends SIGSTOP to the child, all other threads in the process should reliably stop. However, due to the above bug, the first thread will often end up consuming group_stop_count multiple times and SIGSTOP often ends up stopping none or part of the other four threads. This patch adds a new field task->group_stop which is protected by siglock and uses GROUP_STOP_CONSUME flag to track which task is still to consume group_stop_count to fix this bug. task_clear_group_stop_pending() and task_participate_group_stop() are added to help manipulating group stop states. As ptrace_stop() now also uses task_participate_group_stop(), it will set SIGNAL_STOP_STOPPED if it completes a group stop. There still are many issues regarding the interaction between group stop and ptrace. Patches to address them will follow. - Oleg spotted duplicate GROUP_STOP_CONSUME. Dropped. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Oleg Nesterov <oleg@redhat.com> Cc: Roland McGrath <roland@redhat.com>
2011-03-23 03:37:00 -06:00
WARN_ON_ONCE(!(task->jobctl & JOBCTL_STOP_PENDING));
signal: Use GROUP_STOP_PENDING to stop once for a single group stop Currently task->signal->group_stop_count is used to decide whether to stop for group stop. However, if there is a task in the group which is taking a long time to stop, other tasks which are continued by ptrace would repeatedly stop for the same group stop until the group stop is complete. Conversely, if a ptraced task is in TASK_TRACED state, the debugger won't get notified of group stops which is inconsistent compared to the ptraced task in any other state. This patch introduces GROUP_STOP_PENDING which tracks whether a task is yet to stop for the group stop in progress. The flag is set when a group stop starts and cleared when the task stops the first time for the group stop, and consulted whenever whether the task should participate in a group stop needs to be determined. Note that now tasks in TASK_TRACED also participate in group stop. This results in the following behavior changes. * For a single group stop, a ptracer would see at most one stop reported. * A ptracee in TASK_TRACED now also participates in group stop and the tracer would get the notification. However, as a ptraced task could be in TASK_STOPPED state or any ptrace trap could consume group stop, the notification may still be missing. These will be addressed with further patches. * A ptracee may start a group stop while one is still in progress if the tracer let it continue with stop signal delivery. Group stop code handles this correctly. Oleg: * Spotted that a task might skip signal check even when its GROUP_STOP_PENDING is set. Fixed by updating recalc_sigpending_tsk() to check GROUP_STOP_PENDING instead of group_stop_count. * Pointed out that task->group_stop should be cleared whenever task->signal->group_stop_count is cleared. Fixed accordingly. * Pointed out the behavior inconsistency between TASK_TRACED and RUNNING and the last behavior change. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Oleg Nesterov <oleg@redhat.com> Cc: Roland McGrath <roland@redhat.com>
2011-03-23 03:37:00 -06:00
task_clear_jobctl_pending(task, JOBCTL_STOP_PENDING);
signal: Fix premature completion of group stop when interfered by ptrace task->signal->group_stop_count is used to track the progress of group stop. It's initialized to the number of tasks which need to stop for group stop to finish and each stopping or trapping task decrements. However, each task doesn't keep track of whether it decremented the counter or not and if woken up before the group stop is complete and stops again, it can decrement the counter multiple times. Please consider the following example code. static void *worker(void *arg) { while (1) ; return NULL; } int main(void) { pthread_t thread; pid_t pid; int i; pid = fork(); if (!pid) { for (i = 0; i < 5; i++) pthread_create(&thread, NULL, worker, NULL); while (1) ; return 0; } ptrace(PTRACE_ATTACH, pid, NULL, NULL); while (1) { waitid(P_PID, pid, NULL, WSTOPPED); ptrace(PTRACE_SINGLESTEP, pid, NULL, (void *)(long)SIGSTOP); } return 0; } The child creates five threads and the parent continuously traps the first thread and whenever the child gets a signal, SIGSTOP is delivered. If an external process sends SIGSTOP to the child, all other threads in the process should reliably stop. However, due to the above bug, the first thread will often end up consuming group_stop_count multiple times and SIGSTOP often ends up stopping none or part of the other four threads. This patch adds a new field task->group_stop which is protected by siglock and uses GROUP_STOP_CONSUME flag to track which task is still to consume group_stop_count to fix this bug. task_clear_group_stop_pending() and task_participate_group_stop() are added to help manipulating group stop states. As ptrace_stop() now also uses task_participate_group_stop(), it will set SIGNAL_STOP_STOPPED if it completes a group stop. There still are many issues regarding the interaction between group stop and ptrace. Patches to address them will follow. - Oleg spotted duplicate GROUP_STOP_CONSUME. Dropped. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Oleg Nesterov <oleg@redhat.com> Cc: Roland McGrath <roland@redhat.com>
2011-03-23 03:37:00 -06:00
if (!consume)
return false;
if (!WARN_ON_ONCE(sig->group_stop_count == 0))
sig->group_stop_count--;
job control: Don't send duplicate job control stop notification while ptraced Just as group_exit_code shouldn't be generated when a PTRACE_CONT'd task re-enters job control stop, notifiction for the event should be suppressed too. The logic is the same as the group_exit_code generation suppression in do_signal_stop(), if SIGNAL_STOP_STOPPED is already set, the task is re-entering job control stop without intervening SIGCONT and the notifications should be suppressed. Test case follows. #include <stdio.h> #include <unistd.h> #include <signal.h> #include <time.h> #include <sys/ptrace.h> #include <sys/wait.h> static const struct timespec ts100ms = { .tv_nsec = 100000000 }; static pid_t tracee, tracer; static const char *pid_who(pid_t pid) { return pid == tracee ? "tracee" : (pid == tracer ? "tracer" : "mommy "); } static void sigchld_sigaction(int signo, siginfo_t *si, void *ucxt) { printf("%s: SIG status=%02d code=%02d (%s)\n", pid_who(getpid()), si->si_status, si->si_code, pid_who(si->si_pid)); } int main(void) { const struct sigaction chld_sa = { .sa_sigaction = sigchld_sigaction, .sa_flags = SA_SIGINFO|SA_RESTART }; siginfo_t si; sigaction(SIGCHLD, &chld_sa, NULL); tracee = fork(); if (!tracee) { tracee = getpid(); while (1) pause(); } kill(tracee, SIGSTOP); waitid(P_PID, tracee, &si, WSTOPPED); tracer = fork(); if (!tracer) { tracer = getpid(); ptrace(PTRACE_ATTACH, tracee, NULL, NULL); waitid(P_PID, tracee, &si, WSTOPPED); ptrace(PTRACE_CONT, tracee, NULL, (void *)(long)si.si_status); waitid(P_PID, tracee, &si, WSTOPPED); ptrace(PTRACE_CONT, tracee, NULL, (void *)(long)si.si_status); waitid(P_PID, tracee, &si, WSTOPPED); printf("tracer: detaching\n"); ptrace(PTRACE_DETACH, tracee, NULL, NULL); return 0; } while (1) pause(); return 0; } Before the patch, the parent gets the second notification for the tracee after the tracer detaches. si_status is zero because group_exit_code is not set by the group stop completion which triggered this notification. mommy : SIG status=19 code=05 (tracee) tracer: SIG status=00 code=05 (tracee) tracer: SIG status=19 code=04 (tracee) tracer: SIG status=00 code=05 (tracee) tracer: detaching mommy : SIG status=00 code=05 (tracee) mommy : SIG status=00 code=01 (tracer) ^C After the patch, the duplicate notification is gone. mommy : SIG status=19 code=05 (tracee) tracer: SIG status=00 code=05 (tracee) tracer: SIG status=19 code=04 (tracee) tracer: SIG status=00 code=05 (tracee) tracer: detaching mommy : SIG status=00 code=01 (tracer) ^C Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Oleg Nesterov <oleg@redhat.com>
2011-03-23 03:37:01 -06:00
/*
* Tell the caller to notify completion iff we are entering into a
* fresh group stop. Read comment in do_signal_stop() for details.
*/
if (!sig->group_stop_count && !(sig->flags & SIGNAL_STOP_STOPPED)) {
signal: Fix premature completion of group stop when interfered by ptrace task->signal->group_stop_count is used to track the progress of group stop. It's initialized to the number of tasks which need to stop for group stop to finish and each stopping or trapping task decrements. However, each task doesn't keep track of whether it decremented the counter or not and if woken up before the group stop is complete and stops again, it can decrement the counter multiple times. Please consider the following example code. static void *worker(void *arg) { while (1) ; return NULL; } int main(void) { pthread_t thread; pid_t pid; int i; pid = fork(); if (!pid) { for (i = 0; i < 5; i++) pthread_create(&thread, NULL, worker, NULL); while (1) ; return 0; } ptrace(PTRACE_ATTACH, pid, NULL, NULL); while (1) { waitid(P_PID, pid, NULL, WSTOPPED); ptrace(PTRACE_SINGLESTEP, pid, NULL, (void *)(long)SIGSTOP); } return 0; } The child creates five threads and the parent continuously traps the first thread and whenever the child gets a signal, SIGSTOP is delivered. If an external process sends SIGSTOP to the child, all other threads in the process should reliably stop. However, due to the above bug, the first thread will often end up consuming group_stop_count multiple times and SIGSTOP often ends up stopping none or part of the other four threads. This patch adds a new field task->group_stop which is protected by siglock and uses GROUP_STOP_CONSUME flag to track which task is still to consume group_stop_count to fix this bug. task_clear_group_stop_pending() and task_participate_group_stop() are added to help manipulating group stop states. As ptrace_stop() now also uses task_participate_group_stop(), it will set SIGNAL_STOP_STOPPED if it completes a group stop. There still are many issues regarding the interaction between group stop and ptrace. Patches to address them will follow. - Oleg spotted duplicate GROUP_STOP_CONSUME. Dropped. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Oleg Nesterov <oleg@redhat.com> Cc: Roland McGrath <roland@redhat.com>
2011-03-23 03:37:00 -06:00
sig->flags = SIGNAL_STOP_STOPPED;
return true;
}
return false;
}
/*
* allocate a new signal queue record
* - this may be called without locks if and only if t == current, otherwise an
* appropriate lock must be held to stop the target task from exiting
*/
static struct sigqueue *
__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit)
{
struct sigqueue *q = NULL;
struct user_struct *user;
/*
* Protect access to @t credentials. This can go away when all
* callers hold rcu read lock.
*/
rcu_read_lock();
CRED: Inaugurate COW credentials Inaugurate copy-on-write credentials management. This uses RCU to manage the credentials pointer in the task_struct with respect to accesses by other tasks. A process may only modify its own credentials, and so does not need locking to access or modify its own credentials. A mutex (cred_replace_mutex) is added to the task_struct to control the effect of PTRACE_ATTACHED on credential calculations, particularly with respect to execve(). With this patch, the contents of an active credentials struct may not be changed directly; rather a new set of credentials must be prepared, modified and committed using something like the following sequence of events: struct cred *new = prepare_creds(); int ret = blah(new); if (ret < 0) { abort_creds(new); return ret; } return commit_creds(new); There are some exceptions to this rule: the keyrings pointed to by the active credentials may be instantiated - keyrings violate the COW rule as managing COW keyrings is tricky, given that it is possible for a task to directly alter the keys in a keyring in use by another task. To help enforce this, various pointers to sets of credentials, such as those in the task_struct, are declared const. The purpose of this is compile-time discouragement of altering credentials through those pointers. Once a set of credentials has been made public through one of these pointers, it may not be modified, except under special circumstances: (1) Its reference count may incremented and decremented. (2) The keyrings to which it points may be modified, but not replaced. The only safe way to modify anything else is to create a replacement and commit using the functions described in Documentation/credentials.txt (which will be added by a later patch). This patch and the preceding patches have been tested with the LTP SELinux testsuite. This patch makes several logical sets of alteration: (1) execve(). This now prepares and commits credentials in various places in the security code rather than altering the current creds directly. (2) Temporary credential overrides. do_coredump() and sys_faccessat() now prepare their own credentials and temporarily override the ones currently on the acting thread, whilst preventing interference from other threads by holding cred_replace_mutex on the thread being dumped. This will be replaced in a future patch by something that hands down the credentials directly to the functions being called, rather than altering the task's objective credentials. (3) LSM interface. A number of functions have been changed, added or removed: (*) security_capset_check(), ->capset_check() (*) security_capset_set(), ->capset_set() Removed in favour of security_capset(). (*) security_capset(), ->capset() New. This is passed a pointer to the new creds, a pointer to the old creds and the proposed capability sets. It should fill in the new creds or return an error. All pointers, barring the pointer to the new creds, are now const. (*) security_bprm_apply_creds(), ->bprm_apply_creds() Changed; now returns a value, which will cause the process to be killed if it's an error. (*) security_task_alloc(), ->task_alloc_security() Removed in favour of security_prepare_creds(). (*) security_cred_free(), ->cred_free() New. Free security data attached to cred->security. (*) security_prepare_creds(), ->cred_prepare() New. Duplicate any security data attached to cred->security. (*) security_commit_creds(), ->cred_commit() New. Apply any security effects for the upcoming installation of new security by commit_creds(). (*) security_task_post_setuid(), ->task_post_setuid() Removed in favour of security_task_fix_setuid(). (*) security_task_fix_setuid(), ->task_fix_setuid() Fix up the proposed new credentials for setuid(). This is used by cap_set_fix_setuid() to implicitly adjust capabilities in line with setuid() changes. Changes are made to the new credentials, rather than the task itself as in security_task_post_setuid(). (*) security_task_reparent_to_init(), ->task_reparent_to_init() Removed. Instead the task being reparented to init is referred directly to init's credentials. NOTE! This results in the loss of some state: SELinux's osid no longer records the sid of the thread that forked it. (*) security_key_alloc(), ->key_alloc() (*) security_key_permission(), ->key_permission() Changed. These now take cred pointers rather than task pointers to refer to the security context. (4) sys_capset(). This has been simplified and uses less locking. The LSM functions it calls have been merged. (5) reparent_to_kthreadd(). This gives the current thread the same credentials as init by simply using commit_thread() to point that way. (6) __sigqueue_alloc() and switch_uid() __sigqueue_alloc() can't stop the target task from changing its creds beneath it, so this function gets a reference to the currently applicable user_struct which it then passes into the sigqueue struct it returns if successful. switch_uid() is now called from commit_creds(), and possibly should be folded into that. commit_creds() should take care of protecting __sigqueue_alloc(). (7) [sg]et[ug]id() and co and [sg]et_current_groups. The set functions now all use prepare_creds(), commit_creds() and abort_creds() to build and check a new set of credentials before applying it. security_task_set[ug]id() is called inside the prepared section. This guarantees that nothing else will affect the creds until we've finished. The calling of set_dumpable() has been moved into commit_creds(). Much of the functionality of set_user() has been moved into commit_creds(). The get functions all simply access the data directly. (8) security_task_prctl() and cap_task_prctl(). security_task_prctl() has been modified to return -ENOSYS if it doesn't want to handle a function, or otherwise return the return value directly rather than through an argument. Additionally, cap_task_prctl() now prepares a new set of credentials, even if it doesn't end up using it. (9) Keyrings. A number of changes have been made to the keyrings code: (a) switch_uid_keyring(), copy_keys(), exit_keys() and suid_keys() have all been dropped and built in to the credentials functions directly. They may want separating out again later. (b) key_alloc() and search_process_keyrings() now take a cred pointer rather than a task pointer to specify the security context. (c) copy_creds() gives a new thread within the same thread group a new thread keyring if its parent had one, otherwise it discards the thread keyring. (d) The authorisation key now points directly to the credentials to extend the search into rather pointing to the task that carries them. (e) Installing thread, process or session keyrings causes a new set of credentials to be created, even though it's not strictly necessary for process or session keyrings (they're shared). (10) Usermode helper. The usermode helper code now carries a cred struct pointer in its subprocess_info struct instead of a new session keyring pointer. This set of credentials is derived from init_cred and installed on the new process after it has been cloned. call_usermodehelper_setup() allocates the new credentials and call_usermodehelper_freeinfo() discards them if they haven't been used. A special cred function (prepare_usermodeinfo_creds()) is provided specifically for call_usermodehelper_setup() to call. call_usermodehelper_setkeys() adjusts the credentials to sport the supplied keyring as the new session keyring. (11) SELinux. SELinux has a number of changes, in addition to those to support the LSM interface changes mentioned above: (a) selinux_setprocattr() no longer does its check for whether the current ptracer can access processes with the new SID inside the lock that covers getting the ptracer's SID. Whilst this lock ensures that the check is done with the ptracer pinned, the result is only valid until the lock is released, so there's no point doing it inside the lock. (12) is_single_threaded(). This function has been extracted from selinux_setprocattr() and put into a file of its own in the lib/ directory as join_session_keyring() now wants to use it too. The code in SELinux just checked to see whether a task shared mm_structs with other tasks (CLONE_VM), but that isn't good enough. We really want to know if they're part of the same thread group (CLONE_THREAD). (13) nfsd. The NFS server daemon now has to use the COW credentials to set the credentials it is going to use. It really needs to pass the credentials down to the functions it calls, but it can't do that until other patches in this series have been applied. Signed-off-by: David Howells <dhowells@redhat.com> Acked-by: James Morris <jmorris@namei.org> Signed-off-by: James Morris <jmorris@namei.org>
2008-11-13 16:39:23 -07:00
user = get_uid(__task_cred(t)->user);
atomic_inc(&user->sigpending);
rcu_read_unlock();
if (override_rlimit ||
atomic_read(&user->sigpending) <=
task_rlimit(t, RLIMIT_SIGPENDING)) {
q = kmem_cache_alloc(sigqueue_cachep, flags);
} else {
print_dropped_signal(sig);
}
if (unlikely(q == NULL)) {
atomic_dec(&user->sigpending);
CRED: Inaugurate COW credentials Inaugurate copy-on-write credentials management. This uses RCU to manage the credentials pointer in the task_struct with respect to accesses by other tasks. A process may only modify its own credentials, and so does not need locking to access or modify its own credentials. A mutex (cred_replace_mutex) is added to the task_struct to control the effect of PTRACE_ATTACHED on credential calculations, particularly with respect to execve(). With this patch, the contents of an active credentials struct may not be changed directly; rather a new set of credentials must be prepared, modified and committed using something like the following sequence of events: struct cred *new = prepare_creds(); int ret = blah(new); if (ret < 0) { abort_creds(new); return ret; } return commit_creds(new); There are some exceptions to this rule: the keyrings pointed to by the active credentials may be instantiated - keyrings violate the COW rule as managing COW keyrings is tricky, given that it is possible for a task to directly alter the keys in a keyring in use by another task. To help enforce this, various pointers to sets of credentials, such as those in the task_struct, are declared const. The purpose of this is compile-time discouragement of altering credentials through those pointers. Once a set of credentials has been made public through one of these pointers, it may not be modified, except under special circumstances: (1) Its reference count may incremented and decremented. (2) The keyrings to which it points may be modified, but not replaced. The only safe way to modify anything else is to create a replacement and commit using the functions described in Documentation/credentials.txt (which will be added by a later patch). This patch and the preceding patches have been tested with the LTP SELinux testsuite. This patch makes several logical sets of alteration: (1) execve(). This now prepares and commits credentials in various places in the security code rather than altering the current creds directly. (2) Temporary credential overrides. do_coredump() and sys_faccessat() now prepare their own credentials and temporarily override the ones currently on the acting thread, whilst preventing interference from other threads by holding cred_replace_mutex on the thread being dumped. This will be replaced in a future patch by something that hands down the credentials directly to the functions being called, rather than altering the task's objective credentials. (3) LSM interface. A number of functions have been changed, added or removed: (*) security_capset_check(), ->capset_check() (*) security_capset_set(), ->capset_set() Removed in favour of security_capset(). (*) security_capset(), ->capset() New. This is passed a pointer to the new creds, a pointer to the old creds and the proposed capability sets. It should fill in the new creds or return an error. All pointers, barring the pointer to the new creds, are now const. (*) security_bprm_apply_creds(), ->bprm_apply_creds() Changed; now returns a value, which will cause the process to be killed if it's an error. (*) security_task_alloc(), ->task_alloc_security() Removed in favour of security_prepare_creds(). (*) security_cred_free(), ->cred_free() New. Free security data attached to cred->security. (*) security_prepare_creds(), ->cred_prepare() New. Duplicate any security data attached to cred->security. (*) security_commit_creds(), ->cred_commit() New. Apply any security effects for the upcoming installation of new security by commit_creds(). (*) security_task_post_setuid(), ->task_post_setuid() Removed in favour of security_task_fix_setuid(). (*) security_task_fix_setuid(), ->task_fix_setuid() Fix up the proposed new credentials for setuid(). This is used by cap_set_fix_setuid() to implicitly adjust capabilities in line with setuid() changes. Changes are made to the new credentials, rather than the task itself as in security_task_post_setuid(). (*) security_task_reparent_to_init(), ->task_reparent_to_init() Removed. Instead the task being reparented to init is referred directly to init's credentials. NOTE! This results in the loss of some state: SELinux's osid no longer records the sid of the thread that forked it. (*) security_key_alloc(), ->key_alloc() (*) security_key_permission(), ->key_permission() Changed. These now take cred pointers rather than task pointers to refer to the security context. (4) sys_capset(). This has been simplified and uses less locking. The LSM functions it calls have been merged. (5) reparent_to_kthreadd(). This gives the current thread the same credentials as init by simply using commit_thread() to point that way. (6) __sigqueue_alloc() and switch_uid() __sigqueue_alloc() can't stop the target task from changing its creds beneath it, so this function gets a reference to the currently applicable user_struct which it then passes into the sigqueue struct it returns if successful. switch_uid() is now called from commit_creds(), and possibly should be folded into that. commit_creds() should take care of protecting __sigqueue_alloc(). (7) [sg]et[ug]id() and co and [sg]et_current_groups. The set functions now all use prepare_creds(), commit_creds() and abort_creds() to build and check a new set of credentials before applying it. security_task_set[ug]id() is called inside the prepared section. This guarantees that nothing else will affect the creds until we've finished. The calling of set_dumpable() has been moved into commit_creds(). Much of the functionality of set_user() has been moved into commit_creds(). The get functions all simply access the data directly. (8) security_task_prctl() and cap_task_prctl(). security_task_prctl() has been modified to return -ENOSYS if it doesn't want to handle a function, or otherwise return the return value directly rather than through an argument. Additionally, cap_task_prctl() now prepares a new set of credentials, even if it doesn't end up using it. (9) Keyrings. A number of changes have been made to the keyrings code: (a) switch_uid_keyring(), copy_keys(), exit_keys() and suid_keys() have all been dropped and built in to the credentials functions directly. They may want separating out again later. (b) key_alloc() and search_process_keyrings() now take a cred pointer rather than a task pointer to specify the security context. (c) copy_creds() gives a new thread within the same thread group a new thread keyring if its parent had one, otherwise it discards the thread keyring. (d) The authorisation key now points directly to the credentials to extend the search into rather pointing to the task that carries them. (e) Installing thread, process or session keyrings causes a new set of credentials to be created, even though it's not strictly necessary for process or session keyrings (they're shared). (10) Usermode helper. The usermode helper code now carries a cred struct pointer in its subprocess_info struct instead of a new session keyring pointer. This set of credentials is derived from init_cred and installed on the new process after it has been cloned. call_usermodehelper_setup() allocates the new credentials and call_usermodehelper_freeinfo() discards them if they haven't been used. A special cred function (prepare_usermodeinfo_creds()) is provided specifically for call_usermodehelper_setup() to call. call_usermodehelper_setkeys() adjusts the credentials to sport the supplied keyring as the new session keyring. (11) SELinux. SELinux has a number of changes, in addition to those to support the LSM interface changes mentioned above: (a) selinux_setprocattr() no longer does its check for whether the current ptracer can access processes with the new SID inside the lock that covers getting the ptracer's SID. Whilst this lock ensures that the check is done with the ptracer pinned, the result is only valid until the lock is released, so there's no point doing it inside the lock. (12) is_single_threaded(). This function has been extracted from selinux_setprocattr() and put into a file of its own in the lib/ directory as join_session_keyring() now wants to use it too. The code in SELinux just checked to see whether a task shared mm_structs with other tasks (CLONE_VM), but that isn't good enough. We really want to know if they're part of the same thread group (CLONE_THREAD). (13) nfsd. The NFS server daemon now has to use the COW credentials to set the credentials it is going to use. It really needs to pass the credentials down to the functions it calls, but it can't do that until other patches in this series have been applied. Signed-off-by: David Howells <dhowells@redhat.com> Acked-by: James Morris <jmorris@namei.org> Signed-off-by: James Morris <jmorris@namei.org>
2008-11-13 16:39:23 -07:00
free_uid(user);
} else {
INIT_LIST_HEAD(&q->list);
q->flags = 0;
CRED: Inaugurate COW credentials Inaugurate copy-on-write credentials management. This uses RCU to manage the credentials pointer in the task_struct with respect to accesses by other tasks. A process may only modify its own credentials, and so does not need locking to access or modify its own credentials. A mutex (cred_replace_mutex) is added to the task_struct to control the effect of PTRACE_ATTACHED on credential calculations, particularly with respect to execve(). With this patch, the contents of an active credentials struct may not be changed directly; rather a new set of credentials must be prepared, modified and committed using something like the following sequence of events: struct cred *new = prepare_creds(); int ret = blah(new); if (ret < 0) { abort_creds(new); return ret; } return commit_creds(new); There are some exceptions to this rule: the keyrings pointed to by the active credentials may be instantiated - keyrings violate the COW rule as managing COW keyrings is tricky, given that it is possible for a task to directly alter the keys in a keyring in use by another task. To help enforce this, various pointers to sets of credentials, such as those in the task_struct, are declared const. The purpose of this is compile-time discouragement of altering credentials through those pointers. Once a set of credentials has been made public through one of these pointers, it may not be modified, except under special circumstances: (1) Its reference count may incremented and decremented. (2) The keyrings to which it points may be modified, but not replaced. The only safe way to modify anything else is to create a replacement and commit using the functions described in Documentation/credentials.txt (which will be added by a later patch). This patch and the preceding patches have been tested with the LTP SELinux testsuite. This patch makes several logical sets of alteration: (1) execve(). This now prepares and commits credentials in various places in the security code rather than altering the current creds directly. (2) Temporary credential overrides. do_coredump() and sys_faccessat() now prepare their own credentials and temporarily override the ones currently on the acting thread, whilst preventing interference from other threads by holding cred_replace_mutex on the thread being dumped. This will be replaced in a future patch by something that hands down the credentials directly to the functions being called, rather than altering the task's objective credentials. (3) LSM interface. A number of functions have been changed, added or removed: (*) security_capset_check(), ->capset_check() (*) security_capset_set(), ->capset_set() Removed in favour of security_capset(). (*) security_capset(), ->capset() New. This is passed a pointer to the new creds, a pointer to the old creds and the proposed capability sets. It should fill in the new creds or return an error. All pointers, barring the pointer to the new creds, are now const. (*) security_bprm_apply_creds(), ->bprm_apply_creds() Changed; now returns a value, which will cause the process to be killed if it's an error. (*) security_task_alloc(), ->task_alloc_security() Removed in favour of security_prepare_creds(). (*) security_cred_free(), ->cred_free() New. Free security data attached to cred->security. (*) security_prepare_creds(), ->cred_prepare() New. Duplicate any security data attached to cred->security. (*) security_commit_creds(), ->cred_commit() New. Apply any security effects for the upcoming installation of new security by commit_creds(). (*) security_task_post_setuid(), ->task_post_setuid() Removed in favour of security_task_fix_setuid(). (*) security_task_fix_setuid(), ->task_fix_setuid() Fix up the proposed new credentials for setuid(). This is used by cap_set_fix_setuid() to implicitly adjust capabilities in line with setuid() changes. Changes are made to the new credentials, rather than the task itself as in security_task_post_setuid(). (*) security_task_reparent_to_init(), ->task_reparent_to_init() Removed. Instead the task being reparented to init is referred directly to init's credentials. NOTE! This results in the loss of some state: SELinux's osid no longer records the sid of the thread that forked it. (*) security_key_alloc(), ->key_alloc() (*) security_key_permission(), ->key_permission() Changed. These now take cred pointers rather than task pointers to refer to the security context. (4) sys_capset(). This has been simplified and uses less locking. The LSM functions it calls have been merged. (5) reparent_to_kthreadd(). This gives the current thread the same credentials as init by simply using commit_thread() to point that way. (6) __sigqueue_alloc() and switch_uid() __sigqueue_alloc() can't stop the target task from changing its creds beneath it, so this function gets a reference to the currently applicable user_struct which it then passes into the sigqueue struct it returns if successful. switch_uid() is now called from commit_creds(), and possibly should be folded into that. commit_creds() should take care of protecting __sigqueue_alloc(). (7) [sg]et[ug]id() and co and [sg]et_current_groups. The set functions now all use prepare_creds(), commit_creds() and abort_creds() to build and check a new set of credentials before applying it. security_task_set[ug]id() is called inside the prepared section. This guarantees that nothing else will affect the creds until we've finished. The calling of set_dumpable() has been moved into commit_creds(). Much of the functionality of set_user() has been moved into commit_creds(). The get functions all simply access the data directly. (8) security_task_prctl() and cap_task_prctl(). security_task_prctl() has been modified to return -ENOSYS if it doesn't want to handle a function, or otherwise return the return value directly rather than through an argument. Additionally, cap_task_prctl() now prepares a new set of credentials, even if it doesn't end up using it. (9) Keyrings. A number of changes have been made to the keyrings code: (a) switch_uid_keyring(), copy_keys(), exit_keys() and suid_keys() have all been dropped and built in to the credentials functions directly. They may want separating out again later. (b) key_alloc() and search_process_keyrings() now take a cred pointer rather than a task pointer to specify the security context. (c) copy_creds() gives a new thread within the same thread group a new thread keyring if its parent had one, otherwise it discards the thread keyring. (d) The authorisation key now points directly to the credentials to extend the search into rather pointing to the task that carries them. (e) Installing thread, process or session keyrings causes a new set of credentials to be created, even though it's not strictly necessary for process or session keyrings (they're shared). (10) Usermode helper. The usermode helper code now carries a cred struct pointer in its subprocess_info struct instead of a new session keyring pointer. This set of credentials is derived from init_cred and installed on the new process after it has been cloned. call_usermodehelper_setup() allocates the new credentials and call_usermodehelper_freeinfo() discards them if they haven't been used. A special cred function (prepare_usermodeinfo_creds()) is provided specifically for call_usermodehelper_setup() to call. call_usermodehelper_setkeys() adjusts the credentials to sport the supplied keyring as the new session keyring. (11) SELinux. SELinux has a number of changes, in addition to those to support the LSM interface changes mentioned above: (a) selinux_setprocattr() no longer does its check for whether the current ptracer can access processes with the new SID inside the lock that covers getting the ptracer's SID. Whilst this lock ensures that the check is done with the ptracer pinned, the result is only valid until the lock is released, so there's no point doing it inside the lock. (12) is_single_threaded(). This function has been extracted from selinux_setprocattr() and put into a file of its own in the lib/ directory as join_session_keyring() now wants to use it too. The code in SELinux just checked to see whether a task shared mm_structs with other tasks (CLONE_VM), but that isn't good enough. We really want to know if they're part of the same thread group (CLONE_THREAD). (13) nfsd. The NFS server daemon now has to use the COW credentials to set the credentials it is going to use. It really needs to pass the credentials down to the functions it calls, but it can't do that until other patches in this series have been applied. Signed-off-by: David Howells <dhowells@redhat.com> Acked-by: James Morris <jmorris@namei.org> Signed-off-by: James Morris <jmorris@namei.org>
2008-11-13 16:39:23 -07:00
q->user = user;
}
CRED: Inaugurate COW credentials Inaugurate copy-on-write credentials management. This uses RCU to manage the credentials pointer in the task_struct with respect to accesses by other tasks. A process may only modify its own credentials, and so does not need locking to access or modify its own credentials. A mutex (cred_replace_mutex) is added to the task_struct to control the effect of PTRACE_ATTACHED on credential calculations, particularly with respect to execve(). With this patch, the contents of an active credentials struct may not be changed directly; rather a new set of credentials must be prepared, modified and committed using something like the following sequence of events: struct cred *new = prepare_creds(); int ret = blah(new); if (ret < 0) { abort_creds(new); return ret; } return commit_creds(new); There are some exceptions to this rule: the keyrings pointed to by the active credentials may be instantiated - keyrings violate the COW rule as managing COW keyrings is tricky, given that it is possible for a task to directly alter the keys in a keyring in use by another task. To help enforce this, various pointers to sets of credentials, such as those in the task_struct, are declared const. The purpose of this is compile-time discouragement of altering credentials through those pointers. Once a set of credentials has been made public through one of these pointers, it may not be modified, except under special circumstances: (1) Its reference count may incremented and decremented. (2) The keyrings to which it points may be modified, but not replaced. The only safe way to modify anything else is to create a replacement and commit using the functions described in Documentation/credentials.txt (which will be added by a later patch). This patch and the preceding patches have been tested with the LTP SELinux testsuite. This patch makes several logical sets of alteration: (1) execve(). This now prepares and commits credentials in various places in the security code rather than altering the current creds directly. (2) Temporary credential overrides. do_coredump() and sys_faccessat() now prepare their own credentials and temporarily override the ones currently on the acting thread, whilst preventing interference from other threads by holding cred_replace_mutex on the thread being dumped. This will be replaced in a future patch by something that hands down the credentials directly to the functions being called, rather than altering the task's objective credentials. (3) LSM interface. A number of functions have been changed, added or removed: (*) security_capset_check(), ->capset_check() (*) security_capset_set(), ->capset_set() Removed in favour of security_capset(). (*) security_capset(), ->capset() New. This is passed a pointer to the new creds, a pointer to the old creds and the proposed capability sets. It should fill in the new creds or return an error. All pointers, barring the pointer to the new creds, are now const. (*) security_bprm_apply_creds(), ->bprm_apply_creds() Changed; now returns a value, which will cause the process to be killed if it's an error. (*) security_task_alloc(), ->task_alloc_security() Removed in favour of security_prepare_creds(). (*) security_cred_free(), ->cred_free() New. Free security data attached to cred->security. (*) security_prepare_creds(), ->cred_prepare() New. Duplicate any security data attached to cred->security. (*) security_commit_creds(), ->cred_commit() New. Apply any security effects for the upcoming installation of new security by commit_creds(). (*) security_task_post_setuid(), ->task_post_setuid() Removed in favour of security_task_fix_setuid(). (*) security_task_fix_setuid(), ->task_fix_setuid() Fix up the proposed new credentials for setuid(). This is used by cap_set_fix_setuid() to implicitly adjust capabilities in line with setuid() changes. Changes are made to the new credentials, rather than the task itself as in security_task_post_setuid(). (*) security_task_reparent_to_init(), ->task_reparent_to_init() Removed. Instead the task being reparented to init is referred directly to init's credentials. NOTE! This results in the loss of some state: SELinux's osid no longer records the sid of the thread that forked it. (*) security_key_alloc(), ->key_alloc() (*) security_key_permission(), ->key_permission() Changed. These now take cred pointers rather than task pointers to refer to the security context. (4) sys_capset(). This has been simplified and uses less locking. The LSM functions it calls have been merged. (5) reparent_to_kthreadd(). This gives the current thread the same credentials as init by simply using commit_thread() to point that way. (6) __sigqueue_alloc() and switch_uid() __sigqueue_alloc() can't stop the target task from changing its creds beneath it, so this function gets a reference to the currently applicable user_struct which it then passes into the sigqueue struct it returns if successful. switch_uid() is now called from commit_creds(), and possibly should be folded into that. commit_creds() should take care of protecting __sigqueue_alloc(). (7) [sg]et[ug]id() and co and [sg]et_current_groups. The set functions now all use prepare_creds(), commit_creds() and abort_creds() to build and check a new set of credentials before applying it. security_task_set[ug]id() is called inside the prepared section. This guarantees that nothing else will affect the creds until we've finished. The calling of set_dumpable() has been moved into commit_creds(). Much of the functionality of set_user() has been moved into commit_creds(). The get functions all simply access the data directly. (8) security_task_prctl() and cap_task_prctl(). security_task_prctl() has been modified to return -ENOSYS if it doesn't want to handle a function, or otherwise return the return value directly rather than through an argument. Additionally, cap_task_prctl() now prepares a new set of credentials, even if it doesn't end up using it. (9) Keyrings. A number of changes have been made to the keyrings code: (a) switch_uid_keyring(), copy_keys(), exit_keys() and suid_keys() have all been dropped and built in to the credentials functions directly. They may want separating out again later. (b) key_alloc() and search_process_keyrings() now take a cred pointer rather than a task pointer to specify the security context. (c) copy_creds() gives a new thread within the same thread group a new thread keyring if its parent had one, otherwise it discards the thread keyring. (d) The authorisation key now points directly to the credentials to extend the search into rather pointing to the task that carries them. (e) Installing thread, process or session keyrings causes a new set of credentials to be created, even though it's not strictly necessary for process or session keyrings (they're shared). (10) Usermode helper. The usermode helper code now carries a cred struct pointer in its subprocess_info struct instead of a new session keyring pointer. This set of credentials is derived from init_cred and installed on the new process after it has been cloned. call_usermodehelper_setup() allocates the new credentials and call_usermodehelper_freeinfo() discards them if they haven't been used. A special cred function (prepare_usermodeinfo_creds()) is provided specifically for call_usermodehelper_setup() to call. call_usermodehelper_setkeys() adjusts the credentials to sport the supplied keyring as the new session keyring. (11) SELinux. SELinux has a number of changes, in addition to those to support the LSM interface changes mentioned above: (a) selinux_setprocattr() no longer does its check for whether the current ptracer can access processes with the new SID inside the lock that covers getting the ptracer's SID. Whilst this lock ensures that the check is done with the ptracer pinned, the result is only valid until the lock is released, so there's no point doing it inside the lock. (12) is_single_threaded(). This function has been extracted from selinux_setprocattr() and put into a file of its own in the lib/ directory as join_session_keyring() now wants to use it too. The code in SELinux just checked to see whether a task shared mm_structs with other tasks (CLONE_VM), but that isn't good enough. We really want to know if they're part of the same thread group (CLONE_THREAD). (13) nfsd. The NFS server daemon now has to use the COW credentials to set the credentials it is going to use. It really needs to pass the credentials down to the functions it calls, but it can't do that until other patches in this series have been applied. Signed-off-by: David Howells <dhowells@redhat.com> Acked-by: James Morris <jmorris@namei.org> Signed-off-by: James Morris <jmorris@namei.org>
2008-11-13 16:39:23 -07:00
return q;
}
static void __sigqueue_free(struct sigqueue *q)
{
if (q->flags & SIGQUEUE_PREALLOC)
return;
atomic_dec(&q->user->sigpending);
free_uid(q->user);
kmem_cache_free(sigqueue_cachep, q);
}
void flush_sigqueue(struct sigpending *queue)
{
struct sigqueue *q;
sigemptyset(&queue->signal);
while (!list_empty(&queue->list)) {
q = list_entry(queue->list.next, struct sigqueue , list);
list_del_init(&q->list);
__sigqueue_free(q);
}
}
/*
* Flush all pending signals for a task.
*/
void __flush_signals(struct task_struct *t)
{
clear_tsk_thread_flag(t, TIF_SIGPENDING);
flush_sigqueue(&t->pending);
flush_sigqueue(&t->signal->shared_pending);
}
void flush_signals(struct task_struct *t)
{
unsigned long flags;
spin_lock_irqsave(&t->sighand->siglock, flags);
__flush_signals(t);
spin_unlock_irqrestore(&t->sighand->siglock, flags);
}
static void __flush_itimer_signals(struct sigpending *pending)
{
sigset_t signal, retain;
struct sigqueue *q, *n;
signal = pending->signal;
sigemptyset(&retain);
list_for_each_entry_safe(q, n, &pending->list, list) {
int sig = q->info.si_signo;
if (likely(q->info.si_code != SI_TIMER)) {
sigaddset(&retain, sig);
} else {
sigdelset(&signal, sig);
list_del_init(&q->list);
__sigqueue_free(q);
}
}
sigorsets(&pending->signal, &signal, &retain);
}
void flush_itimer_signals(void)
{
struct task_struct *tsk = current;
unsigned long flags;
spin_lock_irqsave(&tsk->sighand->siglock, flags);
__flush_itimer_signals(&tsk->pending);
__flush_itimer_signals(&tsk->signal->shared_pending);
spin_unlock_irqrestore(&tsk->sighand->siglock, flags);
}
void ignore_signals(struct task_struct *t)
{
int i;
for (i = 0; i < _NSIG; ++i)
t->sighand->action[i].sa.sa_handler = SIG_IGN;
flush_signals(t);
}
/*
* Flush all handlers for a task.
*/
void
flush_signal_handlers(struct task_struct *t, int force_default)
{
int i;
struct k_sigaction *ka = &t->sighand->action[0];
for (i = _NSIG ; i != 0 ; i--) {
if (force_default || ka->sa.sa_handler != SIG_IGN)
ka->sa.sa_handler = SIG_DFL;
ka->sa.sa_flags = 0;
sigemptyset(&ka->sa.sa_mask);
ka++;
}
}
int unhandled_signal(struct task_struct *tsk, int sig)
{
void __user *handler = tsk->sighand->action[sig-1].sa.sa_handler;
pid namespaces: define is_global_init() and is_container_init() is_init() is an ambiguous name for the pid==1 check. Split it into is_global_init() and is_container_init(). A cgroup init has it's tsk->pid == 1. A global init also has it's tsk->pid == 1 and it's active pid namespace is the init_pid_ns. But rather than check the active pid namespace, compare the task structure with 'init_pid_ns.child_reaper', which is initialized during boot to the /sbin/init process and never changes. Changelog: 2.6.22-rc4-mm2-pidns1: - Use 'init_pid_ns.child_reaper' to determine if a given task is the global init (/sbin/init) process. This would improve performance and remove dependence on the task_pid(). 2.6.21-mm2-pidns2: - [Sukadev Bhattiprolu] Changed is_container_init() calls in {powerpc, ppc,avr32}/traps.c for the _exception() call to is_global_init(). This way, we kill only the cgroup if the cgroup's init has a bug rather than force a kernel panic. [akpm@linux-foundation.org: fix comment] [sukadev@us.ibm.com: Use is_global_init() in arch/m32r/mm/fault.c] [bunk@stusta.de: kernel/pid.c: remove unused exports] [sukadev@us.ibm.com: Fix capability.c to work with threaded init] Signed-off-by: Serge E. Hallyn <serue@us.ibm.com> Signed-off-by: Sukadev Bhattiprolu <sukadev@us.ibm.com> Acked-by: Pavel Emelianov <xemul@openvz.org> Cc: Eric W. Biederman <ebiederm@xmission.com> Cc: Cedric Le Goater <clg@fr.ibm.com> Cc: Dave Hansen <haveblue@us.ibm.com> Cc: Herbert Poetzel <herbert@13thfloor.at> Cc: Kirill Korotaev <dev@sw.ru> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-19 00:39:52 -06:00
if (is_global_init(tsk))
return 1;
if (handler != SIG_IGN && handler != SIG_DFL)
return 0;
signals: remove 'handler' parameter to tracehook functions Container-init must behave like global-init to processes within the container and hence it must be immune to unhandled fatal signals from within the container (i.e SIG_DFL signals that terminate the process). But the same container-init must behave like a normal process to processes in ancestor namespaces and so if it receives the same fatal signal from a process in ancestor namespace, the signal must be processed. Implementing these semantics requires that send_signal() determine pid namespace of the sender but since signals can originate from workqueues/ interrupt-handlers, determining pid namespace of sender may not always be possible or safe. This patchset implements the design/simplified semantics suggested by Oleg Nesterov. The simplified semantics for container-init are: - container-init must never be terminated by a signal from a descendant process. - container-init must never be immune to SIGKILL from an ancestor namespace (so a process in parent namespace must always be able to terminate a descendant container). - container-init may be immune to unhandled fatal signals (like SIGUSR1) even if they are from ancestor namespace. SIGKILL/SIGSTOP are the only reliable signals to a container-init from ancestor namespace. This patch: Based on an earlier patch submitted by Oleg Nesterov and comments from Roland McGrath (http://lkml.org/lkml/2008/11/19/258). The handler parameter is currently unused in the tracehook functions. Besides, the tracehook functions are called with siglock held, so the functions can check the handler if they later need to. Removing the parameter simiplifies changes to sig_ignored() in a follow-on patch. Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com> Acked-by: Roland McGrath <roland@redhat.com> Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru> Cc: "Eric W. Biederman" <ebiederm@xmission.com> Cc: Daniel Lezcano <daniel.lezcano@free.fr> Cc: Ingo Molnar <mingo@elte.hu> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: "H. Peter Anvin" <hpa@zytor.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-04-02 17:58:00 -06:00
return !tracehook_consider_fatal_signal(tsk, sig);
}
/*
* Notify the system that a driver wants to block all signals for this
* process, and wants to be notified if any signals at all were to be
* sent/acted upon. If the notifier routine returns non-zero, then the
* signal will be acted upon after all. If the notifier routine returns 0,
* then then signal will be blocked. Only one block per process is
* allowed. priv is a pointer to private data that the notifier routine
* can use to determine if the signal should be blocked or not.
*/
void
block_all_signals(int (*notifier)(void *priv), void *priv, sigset_t *mask)
{
unsigned long flags;
spin_lock_irqsave(&current->sighand->siglock, flags);
current->notifier_mask = mask;
current->notifier_data = priv;
current->notifier = notifier;
spin_unlock_irqrestore(&current->sighand->siglock, flags);
}
/* Notify the system that blocking has ended. */
void
unblock_all_signals(void)
{
unsigned long flags;
spin_lock_irqsave(&current->sighand->siglock, flags);
current->notifier = NULL;
current->notifier_data = NULL;
recalc_sigpending();
spin_unlock_irqrestore(&current->sighand->siglock, flags);
}
static void collect_signal(int sig, struct sigpending *list, siginfo_t *info)
{
struct sigqueue *q, *first = NULL;
/*
* Collect the siginfo appropriate to this signal. Check if
* there is another siginfo for the same signal.
*/
list_for_each_entry(q, &list->list, list) {
if (q->info.si_signo == sig) {
if (first)
goto still_pending;
first = q;
}
}
sigdelset(&list->signal, sig);
if (first) {
still_pending:
list_del_init(&first->list);
copy_siginfo(info, &first->info);
__sigqueue_free(first);
} else {
/*
* Ok, it wasn't in the queue. This must be
* a fast-pathed signal or we must have been
* out of queue space. So zero out the info.
*/
info->si_signo = sig;
info->si_errno = 0;
info->si_code = SI_USER;
info->si_pid = 0;
info->si_uid = 0;
}
}
static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
siginfo_t *info)
{
int sig = next_signal(pending, mask);
if (sig) {
if (current->notifier) {
if (sigismember(current->notifier_mask, sig)) {
if (!(current->notifier)(current->notifier_data)) {
clear_thread_flag(TIF_SIGPENDING);
return 0;
}
}
}
collect_signal(sig, pending, info);
}
return sig;
}
/*
* Dequeue a signal and return the element to the caller, which is
* expected to free it.
*
* All callers have to hold the siglock.
*/
int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
{
int signr;
/* We only dequeue private signals from ourselves, we don't let
* signalfd steal them
*/
signr = __dequeue_signal(&tsk->pending, mask, info);
if (!signr) {
signr = __dequeue_signal(&tsk->signal->shared_pending,
mask, info);
/*
* itimer signal ?
*
* itimers are process shared and we restart periodic
* itimers in the signal delivery path to prevent DoS
* attacks in the high resolution timer case. This is
* compliant with the old way of self-restarting
* itimers, as the SIGALRM is a legacy signal and only
* queued once. Changing the restart behaviour to
* restart the timer in the signal dequeue path is
* reducing the timer noise on heavy loaded !highres
* systems too.
*/
if (unlikely(signr == SIGALRM)) {
struct hrtimer *tmr = &tsk->signal->real_timer;
if (!hrtimer_is_queued(tmr) &&
tsk->signal->it_real_incr.tv64 != 0) {
hrtimer_forward(tmr, tmr->base->get_time(),
tsk->signal->it_real_incr);
hrtimer_restart(tmr);
}
}
}
recalc_sigpending();
if (!signr)
return 0;
if (unlikely(sig_kernel_stop(signr))) {
/*
* Set a marker that we have dequeued a stop signal. Our
* caller might release the siglock and then the pending
* stop signal it is about to process is no longer in the
* pending bitmasks, but must still be cleared by a SIGCONT
* (and overruled by a SIGKILL). So those cases clear this
* shared flag after we've set it. Note that this flag may
* remain set after the signal we return is ignored or
* handled. That doesn't matter because its only purpose
* is to alert stop-signal processing code when another
* processor has come along and cleared the flag.
*/
current->jobctl |= JOBCTL_STOP_DEQUEUED;
}
if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) {
/*
* Release the siglock to ensure proper locking order
* of timer locks outside of siglocks. Note, we leave
* irqs disabled here, since the posix-timers code is
* about to disable them again anyway.
*/
spin_unlock(&tsk->sighand->siglock);
do_schedule_next_timer(info);
spin_lock(&tsk->sighand->siglock);
}
return signr;
}
/*
* Tell a process that it has a new active signal..
*
* NOTE! we rely on the previous spin_lock to
* lock interrupts for us! We can only be called with
* "siglock" held, and the local interrupt must
* have been disabled when that got acquired!
*
* No need to set need_resched since signal event passing
* goes through ->blocked
*/
void signal_wake_up(struct task_struct *t, int resume)
{
unsigned int mask;
set_tsk_thread_flag(t, TIF_SIGPENDING);
/*
* For SIGKILL, we want to wake it up in the stopped/traced/killable
* case. We don't check t->state here because there is a race with it
* executing another processor and just now entering stopped state.
* By using wake_up_state, we ensure the process will wake up and
* handle its death signal.
*/
mask = TASK_INTERRUPTIBLE;
if (resume)
mask |= TASK_WAKEKILL;
if (!wake_up_state(t, mask))
kick_process(t);
}
/*
* Remove signals in mask from the pending set and queue.
* Returns 1 if any signals were found.
*
* All callers must be holding the siglock.
*
* This version takes a sigset mask and looks at all signals,
* not just those in the first mask word.
*/
static int rm_from_queue_full(sigset_t *mask, struct sigpending *s)
{
struct sigqueue *q, *n;
sigset_t m;
sigandsets(&m, mask, &s->signal);
if (sigisemptyset(&m))
return 0;
sigandnsets(&s->signal, &s->signal, mask);
list_for_each_entry_safe(q, n, &s->list, list) {
if (sigismember(mask, q->info.si_signo)) {
list_del_init(&q->list);
__sigqueue_free(q);
}
}
return 1;
}
/*
* Remove signals in mask from the pending set and queue.
* Returns 1 if any signals were found.
*
* All callers must be holding the siglock.
*/
static int rm_from_queue(unsigned long mask, struct sigpending *s)
{
struct sigqueue *q, *n;
if (!sigtestsetmask(&s->signal, mask))
return 0;
sigdelsetmask(&s->signal, mask);
list_for_each_entry_safe(q, n, &s->list, list) {
if (q->info.si_signo < SIGRTMIN &&
(mask & sigmask(q->info.si_signo))) {
list_del_init(&q->list);
__sigqueue_free(q);
}
}
return 1;
}
static inline int is_si_special(const struct siginfo *info)
{
return info <= SEND_SIG_FORCED;
}
static inline bool si_fromuser(const struct siginfo *info)
{
return info == SEND_SIG_NOINFO ||
(!is_si_special(info) && SI_FROMUSER(info));
}
/*
* called with RCU read lock from check_kill_permission()
*/
static int kill_ok_by_cred(struct task_struct *t)
{
const struct cred *cred = current_cred();
const struct cred *tcred = __task_cred(t);
if (cred->user->user_ns == tcred->user->user_ns &&
(cred->euid == tcred->suid ||
cred->euid == tcred->uid ||
cred->uid == tcred->suid ||
cred->uid == tcred->uid))
return 1;
if (ns_capable(tcred->user->user_ns, CAP_KILL))
return 1;
return 0;
}
/*
* Bad permissions for sending the signal
* - the caller must hold the RCU read lock
*/
static int check_kill_permission(int sig, struct siginfo *info,
struct task_struct *t)
{
struct pid *sid;
int error;
if (!valid_signal(sig))
return -EINVAL;
if (!si_fromuser(info))
return 0;
error = audit_signal_info(sig, t); /* Let audit system see the signal */
if (error)
return error;
signals: check_kill_permission(): don't check creds if same_thread_group() Andrew Tridgell reports that aio_read(SIGEV_SIGNAL) can fail if the notification from the helper thread races with setresuid(), see http://samba.org/~tridge/junkcode/aio_uid.c This happens because check_kill_permission() doesn't permit sending a signal to the task with the different cred->xids. But there is not any security reason to check ->cred's when the task sends a signal (private or group-wide) to its sub-thread. Whatever we do, any thread can bypass all security checks and send SIGKILL to all threads, or it can block a signal SIG and do kill(gettid(), SIG) to deliver this signal to another sub-thread. Not to mention that CLONE_THREAD implies CLONE_VM. Change check_kill_permission() to avoid the credentials check when the sender and the target are from the same thread group. Also, move "cred = current_cred()" down to avoid calling get_current() twice. Note: David Howells pointed out we could relax this even more, the CLONE_SIGHAND (without CLONE_THREAD) case probably does not need these checks too. Roland said: : The glibc (libpthread) that does set*id across threads has : been in use for a while (2.3.4?), probably in distro's using kernels as old : or older than any active -stable streams. In the race in question, this : kernel bug is breaking valid POSIX application expectations. Reported-by: Andrew Tridgell <tridge@samba.org> Signed-off-by: Oleg Nesterov <oleg@redhat.com> Acked-by: Roland McGrath <roland@redhat.com> Acked-by: David Howells <dhowells@redhat.com> Cc: Eric Paris <eparis@parisplace.org> Cc: Jakub Jelinek <jakub@redhat.com> Cc: James Morris <jmorris@namei.org> Cc: Roland McGrath <roland@redhat.com> Cc: Stephen Smalley <sds@tycho.nsa.gov> Cc: <stable@kernel.org> [all kernel versions] Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2010-05-26 15:42:54 -06:00
if (!same_thread_group(current, t) &&
!kill_ok_by_cred(t)) {
switch (sig) {
case SIGCONT:
sid = task_session(t);
/*
* We don't return the error if sid == NULL. The
* task was unhashed, the caller must notice this.
*/
if (!sid || sid == task_session(current))
break;
default:
return -EPERM;
}
}
return security_task_kill(t, info, sig, 0);
}
/*
* Handle magic process-wide effects of stop/continue signals. Unlike
* the signal actions, these happen immediately at signal-generation
* time regardless of blocking, ignoring, or handling. This does the
* actual continuing for SIGCONT, but not the actual stopping for stop
* signals. The process stop is done as a signal action for SIG_DFL.
*
* Returns true if the signal should be actually delivered, otherwise
* it should be dropped.
*/
static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns)
{
struct signal_struct *signal = p->signal;
struct task_struct *t;
if (unlikely(signal->flags & SIGNAL_GROUP_EXIT)) {
/*
* The process is in the middle of dying, nothing to do.
*/
} else if (sig_kernel_stop(sig)) {
/*
* This is a stop signal. Remove SIGCONT from all queues.
*/
rm_from_queue(sigmask(SIGCONT), &signal->shared_pending);
t = p;
do {
rm_from_queue(sigmask(SIGCONT), &t->pending);
} while_each_thread(p, t);
} else if (sig == SIGCONT) {
unsigned int why;
/*
* Remove all stop signals from all queues, wake all threads.
*/
rm_from_queue(SIG_KERNEL_STOP_MASK, &signal->shared_pending);
t = p;
do {
task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING);
rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending);
wake_up_state(t, __TASK_STOPPED);
} while_each_thread(p, t);
/*
* Notify the parent with CLD_CONTINUED if we were stopped.
*
* If we were in the middle of a group stop, we pretend it
* was already finished, and then continued. Since SIGCHLD
* doesn't queue we report only CLD_STOPPED, as if the next
* CLD_CONTINUED was dropped.
*/
why = 0;
if (signal->flags & SIGNAL_STOP_STOPPED)
why |= SIGNAL_CLD_CONTINUED;
else if (signal->group_stop_count)
why |= SIGNAL_CLD_STOPPED;
if (why) {
/*
* The first thread which returns from do_signal_stop()
* will take ->siglock, notice SIGNAL_CLD_MASK, and
* notify its parent. See get_signal_to_deliver().
*/
signal->flags = why | SIGNAL_STOP_CONTINUED;
signal->group_stop_count = 0;
signal->group_exit_code = 0;
}
}
return !sig_ignored(p, sig, from_ancestor_ns);
}
/*
* Test if P wants to take SIG. After we've checked all threads with this,
* it's equivalent to finding no threads not blocking SIG. Any threads not
* blocking SIG were ruled out because they are not running and already
* have pending signals. Such threads will dequeue from the shared queue
* as soon as they're available, so putting the signal on the shared queue
* will be equivalent to sending it to one such thread.
*/
static inline int wants_signal(int sig, struct task_struct *p)
{
if (sigismember(&p->blocked, sig))
return 0;
if (p->flags & PF_EXITING)
return 0;
if (sig == SIGKILL)
return 1;
if (task_is_stopped_or_traced(p))
return 0;
return task_curr(p) || !signal_pending(p);
}
static void complete_signal(int sig, struct task_struct *p, int group)
{
struct signal_struct *signal = p->signal;
struct task_struct *t;
/*
* Now find a thread we can wake up to take the signal off the queue.
*
* If the main thread wants the signal, it gets first crack.
* Probably the least surprising to the average bear.
*/
if (wants_signal(sig, p))
t = p;
else if (!group || thread_group_empty(p))
/*
* There is just one thread and it does not need to be woken.
* It will dequeue unblocked signals before it runs again.
*/
return;
else {
/*
* Otherwise try to find a suitable thread.
*/
t = signal->curr_target;
while (!wants_signal(sig, t)) {
t = next_thread(t);
if (t == signal->curr_target)
/*
* No thread needs to be woken.
* Any eligible threads will see
* the signal in the queue soon.
*/
return;
}
signal->curr_target = t;
}
/*
* Found a killable thread. If the signal will be fatal,
* then start taking the whole group down immediately.
*/
if (sig_fatal(p, sig) &&
!(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) &&
!sigismember(&t->real_blocked, sig) &&
(sig == SIGKILL ||
signals: remove 'handler' parameter to tracehook functions Container-init must behave like global-init to processes within the container and hence it must be immune to unhandled fatal signals from within the container (i.e SIG_DFL signals that terminate the process). But the same container-init must behave like a normal process to processes in ancestor namespaces and so if it receives the same fatal signal from a process in ancestor namespace, the signal must be processed. Implementing these semantics requires that send_signal() determine pid namespace of the sender but since signals can originate from workqueues/ interrupt-handlers, determining pid namespace of sender may not always be possible or safe. This patchset implements the design/simplified semantics suggested by Oleg Nesterov. The simplified semantics for container-init are: - container-init must never be terminated by a signal from a descendant process. - container-init must never be immune to SIGKILL from an ancestor namespace (so a process in parent namespace must always be able to terminate a descendant container). - container-init may be immune to unhandled fatal signals (like SIGUSR1) even if they are from ancestor namespace. SIGKILL/SIGSTOP are the only reliable signals to a container-init from ancestor namespace. This patch: Based on an earlier patch submitted by Oleg Nesterov and comments from Roland McGrath (http://lkml.org/lkml/2008/11/19/258). The handler parameter is currently unused in the tracehook functions. Besides, the tracehook functions are called with siglock held, so the functions can check the handler if they later need to. Removing the parameter simiplifies changes to sig_ignored() in a follow-on patch. Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com> Acked-by: Roland McGrath <roland@redhat.com> Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru> Cc: "Eric W. Biederman" <ebiederm@xmission.com> Cc: Daniel Lezcano <daniel.lezcano@free.fr> Cc: Ingo Molnar <mingo@elte.hu> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: "H. Peter Anvin" <hpa@zytor.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-04-02 17:58:00 -06:00
!tracehook_consider_fatal_signal(t, sig))) {
/*
* This signal will be fatal to the whole group.
*/
if (!sig_kernel_coredump(sig)) {
/*
* Start a group exit and wake everybody up.
* This way we don't have other threads
* running and doing things after a slower
* thread has the fatal signal pending.
*/
signal->flags = SIGNAL_GROUP_EXIT;
signal->group_exit_code = sig;
signal->group_stop_count = 0;
t = p;
do {
task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
sigaddset(&t->pending.signal, SIGKILL);
signal_wake_up(t, 1);
} while_each_thread(p, t);
return;
}
}
/*
* The signal is already in the shared-pending queue.
* Tell the chosen thread to wake up and dequeue it.
*/
signal_wake_up(t, sig == SIGKILL);
return;
}
static inline int legacy_queue(struct sigpending *signals, int sig)
{
return (sig < SIGRTMIN) && sigismember(&signals->signal, sig);
}
static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
int group, int from_ancestor_ns)
{
struct sigpending *pending;
struct sigqueue *q;
int override_rlimit;
trace_signal_generate(sig, info, t);
assert_spin_locked(&t->sighand->siglock);
if (!prepare_signal(sig, t, from_ancestor_ns))
return 0;
pending = group ? &t->signal->shared_pending : &t->pending;
/*
* Short-circuit ignored signals and support queuing
* exactly one non-rt signal, so that we can get more
* detailed information about the cause of the signal.
*/
if (legacy_queue(pending, sig))
return 0;
/*
* fast-pathed signals for kernel-internal things like SIGSTOP
* or SIGKILL.
*/
if (info == SEND_SIG_FORCED)
goto out_set;
/*
* Real-time signals must be queued if sent by sigqueue, or
* some other real-time mechanism. It is implementation
* defined whether kill() does so. We attempt to do so, on
* the principle of least surprise, but since kill is not
* allowed to fail with EAGAIN when low on memory we just
* make sure at least one signal gets delivered and don't
* pass on the info struct.
*/
if (sig < SIGRTMIN)
override_rlimit = (is_si_special(info) || info->si_code >= 0);
else
override_rlimit = 0;
q = __sigqueue_alloc(sig, t, GFP_ATOMIC | __GFP_NOTRACK_FALSE_POSITIVE,
override_rlimit);
if (q) {
list_add_tail(&q->list, &pending->list);
switch ((unsigned long) info) {
case (unsigned long) SEND_SIG_NOINFO:
q->info.si_signo = sig;
q->info.si_errno = 0;
q->info.si_code = SI_USER;
q->info.si_pid = task_tgid_nr_ns(current,
task_active_pid_ns(t));
q->info.si_uid = current_uid();
break;
case (unsigned long) SEND_SIG_PRIV:
q->info.si_signo = sig;
q->info.si_errno = 0;
q->info.si_code = SI_KERNEL;
q->info.si_pid = 0;
q->info.si_uid = 0;
break;
default:
copy_siginfo(&q->info, info);
if (from_ancestor_ns)
q->info.si_pid = 0;
break;
}
} else if (!is_si_special(info)) {
if (sig >= SIGRTMIN && info->si_code != SI_USER) {
/*
* Queue overflow, abort. We may abort if the
* signal was rt and sent by user using something
* other than kill().
*/
trace_signal_overflow_fail(sig, group, info);
return -EAGAIN;
} else {
/*
* This is a silent loss of information. We still
* send the signal, but the *info bits are lost.
*/
trace_signal_lose_info(sig, group, info);
}
}
out_set:
signalfd_notify(t, sig);
sigaddset(&pending->signal, sig);
complete_signal(sig, t, group);
return 0;
}
static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
int group)
{
int from_ancestor_ns = 0;
#ifdef CONFIG_PID_NS
from_ancestor_ns = si_fromuser(info) &&
!task_pid_nr_ns(current, task_active_pid_ns(t));
#endif
return __send_signal(sig, info, t, group, from_ancestor_ns);
}
static void print_fatal_signal(struct pt_regs *regs, int signr)
{
printk("%s/%d: potentially unexpected fatal signal %d.\n",
current->comm, task_pid_nr(current), signr);
#if defined(__i386__) && !defined(__arch_um__)
printk("code at %08lx: ", regs->ip);
{
int i;
for (i = 0; i < 16; i++) {
unsigned char insn;
if (get_user(insn, (unsigned char *)(regs->ip + i)))
break;
printk("%02x ", insn);
}
}
#endif
printk("\n");
preempt_disable();
show_regs(regs);
preempt_enable();
}
static int __init setup_print_fatal_signals(char *str)
{
get_option (&str, &print_fatal_signals);
return 1;
}
__setup("print-fatal-signals=", setup_print_fatal_signals);
int
__group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
{
return send_signal(sig, info, p, 1);
}
static int
specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t)
{
return send_signal(sig, info, t, 0);
}
int do_send_sig_info(int sig, struct siginfo *info, struct task_struct *p,
bool group)
{
unsigned long flags;
int ret = -ESRCH;
if (lock_task_sighand(p, &flags)) {
ret = send_signal(sig, info, p, group);
unlock_task_sighand(p, &flags);
}
return ret;
}
/*
* Force a signal that the process can't ignore: if necessary
* we unblock the signal and change any SIG_IGN to SIG_DFL.
*
* Note: If we unblock the signal, we always reset it to SIG_DFL,
* since we do not want to have a signal handler that was blocked
* be invoked when user space had explicitly blocked it.
*
* We don't want to have recursive SIGSEGV's etc, for example,
* that is why we also clear SIGNAL_UNKILLABLE.
*/
int
force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
{
unsigned long int flags;
int ret, blocked, ignored;
struct k_sigaction *action;
spin_lock_irqsave(&t->sighand->siglock, flags);
action = &t->sighand->action[sig-1];
ignored = action->sa.sa_handler == SIG_IGN;
blocked = sigismember(&t->blocked, sig);
if (blocked || ignored) {
action->sa.sa_handler = SIG_DFL;
if (blocked) {
sigdelset(&t->blocked, sig);
recalc_sigpending_and_wake(t);
}
}
if (action->sa.sa_handler == SIG_DFL)
t->signal->flags &= ~SIGNAL_UNKILLABLE;
ret = specific_send_sig_info(sig, info, t);
spin_unlock_irqrestore(&t->sighand->siglock, flags);
return ret;
}
/*
* Nuke all other threads in the group.
*/
int zap_other_threads(struct task_struct *p)
{
struct task_struct *t = p;
int count = 0;
p->signal->group_stop_count = 0;
while_each_thread(p, t) {
task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
count++;
/* Don't bother with already dead threads */
if (t->exit_state)
continue;
sigaddset(&t->pending.signal, SIGKILL);
signal_wake_up(t, 1);
}
return count;
}
struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
unsigned long *flags)
{
struct sighand_struct *sighand;
rcu_read_lock();
for (;;) {
sighand = rcu_dereference(tsk->sighand);
if (unlikely(sighand == NULL))
break;
spin_lock_irqsave(&sighand->siglock, *flags);
if (likely(sighand == tsk->sighand))
break;
spin_unlock_irqrestore(&sighand->siglock, *flags);
}
rcu_read_unlock();
return sighand;
}
/*
* send signal info to all the members of a group
*/
int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
{
int ret;
rcu_read_lock();
ret = check_kill_permission(sig, info, p);
rcu_read_unlock();
if (!ret && sig)
ret = do_send_sig_info(sig, info, p, true);
return ret;
}
/*
* __kill_pgrp_info() sends a signal to a process group: this is what the tty
* control characters do (^C, ^Z etc)
* - the caller must hold at least a readlock on tasklist_lock
*/
int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp)
{
struct task_struct *p = NULL;
int retval, success;
success = 0;
retval = -ESRCH;
do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
int err = group_send_sig_info(sig, info, p);
success |= !err;
retval = err;
} while_each_pid_task(pgrp, PIDTYPE_PGID, p);
return success ? 0 : retval;
}
int kill_pid_info(int sig, struct siginfo *info, struct pid *pid)
{
int error = -ESRCH;
struct task_struct *p;
rcu_read_lock();
retry:
p = pid_task(pid, PIDTYPE_PID);
if (p) {
error = group_send_sig_info(sig, info, p);
if (unlikely(error == -ESRCH))
/*
* The task was unhashed in between, try again.
* If it is dead, pid_task() will return NULL,
* if we race with de_thread() it will find the
* new leader.
*/
goto retry;
}
rcu_read_unlock();
return error;
}
int kill_proc_info(int sig, struct siginfo *info, pid_t pid)
{
int error;
rcu_read_lock();
error = kill_pid_info(sig, info, find_vpid(pid));
rcu_read_unlock();
return error;
}
/* like kill_pid_info(), but doesn't use uid/euid of "current" */
int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid,
uid_t uid, uid_t euid, u32 secid)
{
int ret = -EINVAL;
struct task_struct *p;
const struct cred *pcred;
unsigned long flags;
if (!valid_signal(sig))
return ret;
rcu_read_lock();
p = pid_task(pid, PIDTYPE_PID);
if (!p) {
ret = -ESRCH;
goto out_unlock;
}
pcred = __task_cred(p);
if (si_fromuser(info) &&
euid != pcred->suid && euid != pcred->uid &&
uid != pcred->suid && uid != pcred->uid) {
ret = -EPERM;
goto out_unlock;
}
ret = security_task_kill(p, info, sig, secid);
if (ret)
goto out_unlock;
if (sig) {
if (lock_task_sighand(p, &flags)) {
ret = __send_signal(sig, info, p, 1, 0);
unlock_task_sighand(p, &flags);
} else
ret = -ESRCH;
}
out_unlock:
rcu_read_unlock();
return ret;
}
EXPORT_SYMBOL_GPL(kill_pid_info_as_uid);
/*
* kill_something_info() interprets pid in interesting ways just like kill(2).
*
* POSIX specifies that kill(-1,sig) is unspecified, but what we have
* is probably wrong. Should make it like BSD or SYSV.
*/
static int kill_something_info(int sig, struct siginfo *info, pid_t pid)
{
int ret;
if (pid > 0) {
rcu_read_lock();
ret = kill_pid_info(sig, info, find_vpid(pid));
rcu_read_unlock();
return ret;
}
read_lock(&tasklist_lock);
if (pid != -1) {
ret = __kill_pgrp_info(sig, info,
pid ? find_vpid(-pid) : task_pgrp(current));
} else {
int retval = 0, count = 0;
struct task_struct * p;
for_each_process(p) {
if (task_pid_vnr(p) > 1 &&
!same_thread_group(p, current)) {
int err = group_send_sig_info(sig, info, p);
++count;
if (err != -EPERM)
retval = err;
}
}
ret = count ? retval : -ESRCH;
}
read_unlock(&tasklist_lock);
return ret;
}
/*
* These are for backward compatibility with the rest of the kernel source.
*/
int send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
{
/*
* Make sure legacy kernel users don't send in bad values
* (normal paths check this in check_kill_permission).
*/
if (!valid_signal(sig))
return -EINVAL;
return do_send_sig_info(sig, info, p, false);
}
#define __si_special(priv) \
((priv) ? SEND_SIG_PRIV : SEND_SIG_NOINFO)
int
send_sig(int sig, struct task_struct *p, int priv)
{
return send_sig_info(sig, __si_special(priv), p);
}
void
force_sig(int sig, struct task_struct *p)
{
force_sig_info(sig, SEND_SIG_PRIV, p);
}
/*
* When things go south during signal handling, we
* will force a SIGSEGV. And if the signal that caused
* the problem was already a SIGSEGV, we'll want to
* make sure we don't even try to deliver the signal..
*/
int
force_sigsegv(int sig, struct task_struct *p)
{
if (sig == SIGSEGV) {
unsigned long flags;
spin_lock_irqsave(&p->sighand->siglock, flags);
p->sighand->action[sig - 1].sa.sa_handler = SIG_DFL;
spin_unlock_irqrestore(&p->sighand->siglock, flags);
}
force_sig(SIGSEGV, p);
return 0;
}
int kill_pgrp(struct pid *pid, int sig, int priv)
{
int ret;
read_lock(&tasklist_lock);
ret = __kill_pgrp_info(sig, __si_special(priv), pid);
read_unlock(&tasklist_lock);
return ret;
}
EXPORT_SYMBOL(kill_pgrp);
int kill_pid(struct pid *pid, int sig, int priv)
{
return kill_pid_info(sig, __si_special(priv), pid);
}
EXPORT_SYMBOL(kill_pid);
/*
* These functions support sending signals using preallocated sigqueue
* structures. This is needed "because realtime applications cannot
* afford to lose notifications of asynchronous events, like timer
* expirations or I/O completions". In the case of POSIX Timers
* we allocate the sigqueue structure from the timer_create. If this
* allocation fails we are able to report the failure to the application
* with an EAGAIN error.
*/
struct sigqueue *sigqueue_alloc(void)
{
struct sigqueue *q = __sigqueue_alloc(-1, current, GFP_KERNEL, 0);
if (q)
q->flags |= SIGQUEUE_PREALLOC;
return q;
}
void sigqueue_free(struct sigqueue *q)
{
unsigned long flags;
2007-08-31 00:56:35 -06:00
spinlock_t *lock = &current->sighand->siglock;
BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
/*
* We must hold ->siglock while testing q->list
* to serialize with collect_signal() or with
* __exit_signal()->flush_sigqueue().
*/
2007-08-31 00:56:35 -06:00
spin_lock_irqsave(lock, flags);
q->flags &= ~SIGQUEUE_PREALLOC;
/*
* If it is queued it will be freed when dequeued,
* like the "regular" sigqueue.
*/
2007-08-31 00:56:35 -06:00
if (!list_empty(&q->list))
q = NULL;
2007-08-31 00:56:35 -06:00
spin_unlock_irqrestore(lock, flags);
if (q)
__sigqueue_free(q);
}
int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
2008-04-30 01:52:41 -06:00
{
int sig = q->info.si_signo;
struct sigpending *pending;
unsigned long flags;
int ret;
BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
ret = -1;
if (!likely(lock_task_sighand(t, &flags)))
goto ret;
ret = 1; /* the signal is ignored */
if (!prepare_signal(sig, t, 0))
goto out;
ret = 0;
2008-04-30 01:52:41 -06:00
if (unlikely(!list_empty(&q->list))) {
/*
* If an SI_TIMER entry is already queue just increment
* the overrun count.
*/
BUG_ON(q->info.si_code != SI_TIMER);
q->info.si_overrun++;
goto out;
2008-04-30 01:52:41 -06:00
}
posix-timers: fix posix_timer_event() vs dequeue_signal() race The bug was reported and analysed by Mark McLoughlin <markmc@redhat.com>, the patch is based on his and Roland's suggestions. posix_timer_event() always rewrites the pre-allocated siginfo before sending the signal. Most of the written info is the same all the time, but memset(0) is very wrong. If ->sigq is queued we can race with collect_signal() which can fail to find this siginfo looking at .si_signo, or copy_siginfo() can copy the wrong .si_code/si_tid/etc. In short, sys_timer_settime() can in fact stop the active timer, or the user can receive the siginfo with the wrong .si_xxx values. Move "memset(->info, 0)" from posix_timer_event() to alloc_posix_timer(), change send_sigqueue() to set .si_overrun = 0 when ->sigq is not queued. It would be nice to move the whole sigq->info initialization from send to create path, but this is not easy to do without uglifying timer_create() further. As Roland rightly pointed out, we need more cleanups/fixes here, see the "FIXME" comment in the patch. Hopefully this patch makes sense anyway, and it can mask the most bad implications. Reported-by: Mark McLoughlin <markmc@redhat.com> Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru> Cc: Mark McLoughlin <markmc@redhat.com> Cc: Oliver Pinter <oliver.pntr@gmail.com> Cc: Roland McGrath <roland@redhat.com> Cc: stable@kernel.org Cc: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> kernel/posix-timers.c | 17 +++++++++++++---- kernel/signal.c | 1 + 2 files changed, 14 insertions(+), 4 deletions(-)
2008-07-23 10:52:05 -06:00
q->info.si_overrun = 0;
2008-04-30 01:52:41 -06:00
signalfd_notify(t, sig);
pending = group ? &t->signal->shared_pending : &t->pending;
2008-04-30 01:52:41 -06:00
list_add_tail(&q->list, &pending->list);
sigaddset(&pending->signal, sig);
complete_signal(sig, t, group);
out:
unlock_task_sighand(t, &flags);
ret:
return ret;
2008-04-30 01:52:41 -06:00
}
/*
* Let a parent know about the death of a child.
* For a stopped/continued status change, use do_notify_parent_cldstop instead.
*
* Returns -1 if our parent ignored us and so we've switched to
* self-reaping, or else @sig.
*/
int do_notify_parent(struct task_struct *tsk, int sig)
{
struct siginfo info;
unsigned long flags;
struct sighand_struct *psig;
int ret = sig;
BUG_ON(sig == -1);
/* do_notify_parent_cldstop should have been called instead. */
BUG_ON(task_is_stopped_or_traced(tsk));
BUG_ON(!task_ptrace(tsk) &&
(tsk->group_leader != tsk || !thread_group_empty(tsk)));
info.si_signo = sig;
info.si_errno = 0;
/*
* we are under tasklist_lock here so our parent is tied to
* us and cannot exit and release its namespace.
*
* the only it can is to switch its nsproxy with sys_unshare,
* bu uncharing pid namespaces is not allowed, so we'll always
* see relevant namespace
*
* write_lock() currently calls preempt_disable() which is the
* same as rcu_read_lock(), but according to Oleg, this is not
* correct to rely on this
*/
rcu_read_lock();
info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns);
info.si_uid = __task_cred(tsk)->uid;
rcu_read_unlock();
info.si_utime = cputime_to_clock_t(cputime_add(tsk->utime,
tsk->signal->utime));
info.si_stime = cputime_to_clock_t(cputime_add(tsk->stime,
tsk->signal->stime));
info.si_status = tsk->exit_code & 0x7f;
if (tsk->exit_code & 0x80)
info.si_code = CLD_DUMPED;
else if (tsk->exit_code & 0x7f)
info.si_code = CLD_KILLED;
else {
info.si_code = CLD_EXITED;
info.si_status = tsk->exit_code >> 8;
}
psig = tsk->parent->sighand;
spin_lock_irqsave(&psig->siglock, flags);
if (!task_ptrace(tsk) && sig == SIGCHLD &&
(psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN ||
(psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) {
/*
* We are exiting and our parent doesn't care. POSIX.1
* defines special semantics for setting SIGCHLD to SIG_IGN
* or setting the SA_NOCLDWAIT flag: we should be reaped
* automatically and not left for our parent's wait4 call.
* Rather than having the parent do it as a magic kind of
* signal handler, we just set this to tell do_exit that we
* can be cleaned up without becoming a zombie. Note that
* we still call __wake_up_parent in this case, because a
* blocked sys_wait4 might now return -ECHILD.
*
* Whether we send SIGCHLD or not for SA_NOCLDWAIT
* is implementation-defined: we do (if you don't want
* it, just use SIG_IGN instead).
*/
ret = tsk->exit_signal = -1;
if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN)
sig = -1;
}
if (valid_signal(sig) && sig > 0)
__group_send_sig_info(sig, &info, tsk->parent);
__wake_up_parent(tsk, tsk->parent);
spin_unlock_irqrestore(&psig->siglock, flags);
return ret;
}
/**
* do_notify_parent_cldstop - notify parent of stopped/continued state change
* @tsk: task reporting the state change
* @for_ptracer: the notification is for ptracer
* @why: CLD_{CONTINUED|STOPPED|TRAPPED} to report
*
* Notify @tsk's parent that the stopped/continued state has changed. If
* @for_ptracer is %false, @tsk's group leader notifies to its real parent.
* If %true, @tsk reports to @tsk->parent which should be the ptracer.
*
* CONTEXT:
* Must be called with tasklist_lock at least read locked.
*/
static void do_notify_parent_cldstop(struct task_struct *tsk,
bool for_ptracer, int why)
{
struct siginfo info;
unsigned long flags;
struct task_struct *parent;
struct sighand_struct *sighand;
if (for_ptracer) {
parent = tsk->parent;
} else {
tsk = tsk->group_leader;
parent = tsk->real_parent;
}
info.si_signo = SIGCHLD;
info.si_errno = 0;
/*
* see comment in do_notify_parent() about the following 4 lines
*/
rcu_read_lock();
info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns);
info.si_uid = __task_cred(tsk)->uid;
rcu_read_unlock();
info.si_utime = cputime_to_clock_t(tsk->utime);
info.si_stime = cputime_to_clock_t(tsk->stime);
info.si_code = why;
switch (why) {
case CLD_CONTINUED:
info.si_status = SIGCONT;
break;
case CLD_STOPPED:
info.si_status = tsk->signal->group_exit_code & 0x7f;
break;
case CLD_TRAPPED:
info.si_status = tsk->exit_code & 0x7f;
break;
default:
BUG();
}
sighand = parent->sighand;
spin_lock_irqsave(&sighand->siglock, flags);
if (sighand->action[SIGCHLD-1].sa.sa_handler != SIG_IGN &&
!(sighand->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDSTOP))
__group_send_sig_info(SIGCHLD, &info, parent);
/*
* Even if SIGCHLD is not generated, we must wake up wait4 calls.
*/
__wake_up_parent(tsk, parent);
spin_unlock_irqrestore(&sighand->siglock, flags);
}
static inline int may_ptrace_stop(void)
{
if (!likely(task_ptrace(current)))
return 0;
/*
* Are we in the middle of do_coredump?
* If so and our tracer is also part of the coredump stopping
* is a deadlock situation, and pointless because our tracer
* is dead so don't allow us to stop.
* If SIGKILL was already sent before the caller unlocked
* ->siglock we must see ->core_state != NULL. Otherwise it
* is safe to enter schedule().
*/
if (unlikely(current->mm->core_state) &&
unlikely(current->mm == current->parent->mm))
return 0;
return 1;
}
/*
* Return non-zero if there is a SIGKILL that should be waking us up.
* Called with the siglock held.
*/
static int sigkill_pending(struct task_struct *tsk)
{
return sigismember(&tsk->pending.signal, SIGKILL) ||
sigismember(&tsk->signal->shared_pending.signal, SIGKILL);
}
job control: Notify the real parent of job control events regardless of ptrace With recent changes, job control and ptrace stopped states are properly separated and accessible to the real parent and the ptracer respectively; however, notifications of job control stopped/continued events to the real parent while ptraced are still missing. A ptracee participates in group stop in ptrace_stop() but the completion isn't notified. If participation results in completion of group stop, notify the real parent of the event. The ptrace and group stops are separate and can be handled as such. However, when the real parent and the ptracer are in the same thread group, only the ptrace stop event is visible through wait(2) and the duplicate notifications are different from the current behavior and are confusing. Suppress group stop notification in such cases. The continued state is shared between the real parent and the ptracer but is only meaningful to the real parent. Always notify the real parent and notify the ptracer too for backward compatibility. Similar to stop notification, if the real parent is the ptracer, suppress a duplicate notification. Test case follows. #include <stdio.h> #include <unistd.h> #include <time.h> #include <errno.h> #include <sys/types.h> #include <sys/ptrace.h> #include <sys/wait.h> int main(void) { const struct timespec ts100ms = { .tv_nsec = 100000000 }; pid_t tracee, tracer; siginfo_t si; int i; tracee = fork(); if (tracee == 0) { while (1) { printf("tracee: SIGSTOP\n"); raise(SIGSTOP); nanosleep(&ts100ms, NULL); printf("tracee: SIGCONT\n"); raise(SIGCONT); nanosleep(&ts100ms, NULL); } } waitid(P_PID, tracee, &si, WSTOPPED | WNOHANG | WNOWAIT); tracer = fork(); if (tracer == 0) { nanosleep(&ts100ms, NULL); ptrace(PTRACE_ATTACH, tracee, NULL, NULL); for (i = 0; i < 11; i++) { si.si_pid = 0; waitid(P_PID, tracee, &si, WSTOPPED); if (si.si_pid && si.si_code == CLD_TRAPPED) ptrace(PTRACE_CONT, tracee, NULL, (void *)(long)si.si_status); } printf("tracer: EXITING\n"); return 0; } while (1) { si.si_pid = 0; waitid(P_PID, tracee, &si, WSTOPPED | WCONTINUED | WEXITED); if (si.si_pid) printf("mommy : WAIT status=%02d code=%02d\n", si.si_status, si.si_code); } return 0; } Before this patch, while ptraced, the real parent doesn't get notifications for job control events, so although it can access those events, the later waitid(2) call never wakes up. tracee: SIGSTOP mommy : WAIT status=19 code=05 tracee: SIGCONT tracee: SIGSTOP tracee: SIGCONT tracee: SIGSTOP tracee: SIGCONT tracee: SIGSTOP tracer: EXITING mommy : WAIT status=19 code=05 ^C After this patch, it works as expected. tracee: SIGSTOP mommy : WAIT status=19 code=05 tracee: SIGCONT mommy : WAIT status=18 code=06 tracee: SIGSTOP mommy : WAIT status=19 code=05 tracee: SIGCONT mommy : WAIT status=18 code=06 tracee: SIGSTOP mommy : WAIT status=19 code=05 tracee: SIGCONT mommy : WAIT status=18 code=06 tracee: SIGSTOP tracer: EXITING mommy : WAIT status=19 code=05 ^C -v2: Oleg pointed out that * Group stop notification to the real parent should also happen when ptracer detach races with ptrace_stop(). * real_parent_is_ptracer() should be testing thread group equality not the task itself as wait(2) and stop/cont notifications are normally thread-group wide. Both issues are fixed accordingly. -v3: real_parent_is_ptracer() updated to test child->real_parent instead of child->group_leader->real_parent per Oleg's suggestion. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Oleg Nesterov <oleg@redhat.com>
2011-03-23 03:37:01 -06:00
/*
* Test whether the target task of the usual cldstop notification - the
* real_parent of @child - is in the same group as the ptracer.
*/
static bool real_parent_is_ptracer(struct task_struct *child)
{
return same_thread_group(child->parent, child->real_parent);
}
/*
* This must be called with current->sighand->siglock held.
*
* This should be the path for all ptrace stops.
* We always set current->last_siginfo while stopped here.
* That makes it a way to test a stopped process for
* being ptrace-stopped vs being job-control-stopped.
*
* If we actually decide not to stop at all because the tracer
* is gone, we keep current->exit_code unless clear_code.
*/
static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
__releases(&current->sighand->siglock)
__acquires(&current->sighand->siglock)
{
job control: Notify the real parent of job control events regardless of ptrace With recent changes, job control and ptrace stopped states are properly separated and accessible to the real parent and the ptracer respectively; however, notifications of job control stopped/continued events to the real parent while ptraced are still missing. A ptracee participates in group stop in ptrace_stop() but the completion isn't notified. If participation results in completion of group stop, notify the real parent of the event. The ptrace and group stops are separate and can be handled as such. However, when the real parent and the ptracer are in the same thread group, only the ptrace stop event is visible through wait(2) and the duplicate notifications are different from the current behavior and are confusing. Suppress group stop notification in such cases. The continued state is shared between the real parent and the ptracer but is only meaningful to the real parent. Always notify the real parent and notify the ptracer too for backward compatibility. Similar to stop notification, if the real parent is the ptracer, suppress a duplicate notification. Test case follows. #include <stdio.h> #include <unistd.h> #include <time.h> #include <errno.h> #include <sys/types.h> #include <sys/ptrace.h> #include <sys/wait.h> int main(void) { const struct timespec ts100ms = { .tv_nsec = 100000000 }; pid_t tracee, tracer; siginfo_t si; int i; tracee = fork(); if (tracee == 0) { while (1) { printf("tracee: SIGSTOP\n"); raise(SIGSTOP); nanosleep(&ts100ms, NULL); printf("tracee: SIGCONT\n"); raise(SIGCONT); nanosleep(&ts100ms, NULL); } } waitid(P_PID, tracee, &si, WSTOPPED | WNOHANG | WNOWAIT); tracer = fork(); if (tracer == 0) { nanosleep(&ts100ms, NULL); ptrace(PTRACE_ATTACH, tracee, NULL, NULL); for (i = 0; i < 11; i++) { si.si_pid = 0; waitid(P_PID, tracee, &si, WSTOPPED); if (si.si_pid && si.si_code == CLD_TRAPPED) ptrace(PTRACE_CONT, tracee, NULL, (void *)(long)si.si_status); } printf("tracer: EXITING\n"); return 0; } while (1) { si.si_pid = 0; waitid(P_PID, tracee, &si, WSTOPPED | WCONTINUED | WEXITED); if (si.si_pid) printf("mommy : WAIT status=%02d code=%02d\n", si.si_status, si.si_code); } return 0; } Before this patch, while ptraced, the real parent doesn't get notifications for job control events, so although it can access those events, the later waitid(2) call never wakes up. tracee: SIGSTOP mommy : WAIT status=19 code=05 tracee: SIGCONT tracee: SIGSTOP tracee: SIGCONT tracee: SIGSTOP tracee: SIGCONT tracee: SIGSTOP tracer: EXITING mommy : WAIT status=19 code=05 ^C After this patch, it works as expected. tracee: SIGSTOP mommy : WAIT status=19 code=05 tracee: SIGCONT mommy : WAIT status=18 code=06 tracee: SIGSTOP mommy : WAIT status=19 code=05 tracee: SIGCONT mommy : WAIT status=18 code=06 tracee: SIGSTOP mommy : WAIT status=19 code=05 tracee: SIGCONT mommy : WAIT status=18 code=06 tracee: SIGSTOP tracer: EXITING mommy : WAIT status=19 code=05 ^C -v2: Oleg pointed out that * Group stop notification to the real parent should also happen when ptracer detach races with ptrace_stop(). * real_parent_is_ptracer() should be testing thread group equality not the task itself as wait(2) and stop/cont notifications are normally thread-group wide. Both issues are fixed accordingly. -v3: real_parent_is_ptracer() updated to test child->real_parent instead of child->group_leader->real_parent per Oleg's suggestion. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Oleg Nesterov <oleg@redhat.com>
2011-03-23 03:37:01 -06:00
bool gstop_done = false;
if (arch_ptrace_stop_needed(exit_code, info)) {
/*
* The arch code has something special to do before a
* ptrace stop. This is allowed to block, e.g. for faults
* on user stack pages. We can't keep the siglock while
* calling arch_ptrace_stop, so we must release it now.
* To preserve proper semantics, we must do this before
* any signal bookkeeping like checking group_stop_count.
* Meanwhile, a SIGKILL could come in before we retake the
* siglock. That must prevent us from sleeping in TASK_TRACED.
* So after regaining the lock, we must check for SIGKILL.
*/
spin_unlock_irq(&current->sighand->siglock);
arch_ptrace_stop(exit_code, info);
spin_lock_irq(&current->sighand->siglock);
if (sigkill_pending(current))
return;
}
/*
* We're committing to trapping. TRACED should be visible before
* TRAPPING is cleared; otherwise, the tracer might fail do_wait().
* Also, transition to TRACED and updates to ->jobctl should be
* atomic with respect to siglock and should be done after the arch
* hook as siglock is released and regrabbed across it.
*/
set_current_state(TASK_TRACED);
current->last_siginfo = info;
current->exit_code = exit_code;
/*
* If @why is CLD_STOPPED, we're trapping to participate in a group
* stop. Do the bookkeeping. Note that if SIGCONT was delievered
* while siglock was released for the arch hook, PENDING could be
* clear now. We act as if SIGCONT is received after TASK_TRACED
* is entered - ignore it.
*/
if (why == CLD_STOPPED && (current->jobctl & JOBCTL_STOP_PENDING))
job control: Notify the real parent of job control events regardless of ptrace With recent changes, job control and ptrace stopped states are properly separated and accessible to the real parent and the ptracer respectively; however, notifications of job control stopped/continued events to the real parent while ptraced are still missing. A ptracee participates in group stop in ptrace_stop() but the completion isn't notified. If participation results in completion of group stop, notify the real parent of the event. The ptrace and group stops are separate and can be handled as such. However, when the real parent and the ptracer are in the same thread group, only the ptrace stop event is visible through wait(2) and the duplicate notifications are different from the current behavior and are confusing. Suppress group stop notification in such cases. The continued state is shared between the real parent and the ptracer but is only meaningful to the real parent. Always notify the real parent and notify the ptracer too for backward compatibility. Similar to stop notification, if the real parent is the ptracer, suppress a duplicate notification. Test case follows. #include <stdio.h> #include <unistd.h> #include <time.h> #include <errno.h> #include <sys/types.h> #include <sys/ptrace.h> #include <sys/wait.h> int main(void) { const struct timespec ts100ms = { .tv_nsec = 100000000 }; pid_t tracee, tracer; siginfo_t si; int i; tracee = fork(); if (tracee == 0) { while (1) { printf("tracee: SIGSTOP\n"); raise(SIGSTOP); nanosleep(&ts100ms, NULL); printf("tracee: SIGCONT\n"); raise(SIGCONT); nanosleep(&ts100ms, NULL); } } waitid(P_PID, tracee, &si, WSTOPPED | WNOHANG | WNOWAIT); tracer = fork(); if (tracer == 0) { nanosleep(&ts100ms, NULL); ptrace(PTRACE_ATTACH, tracee, NULL, NULL); for (i = 0; i < 11; i++) { si.si_pid = 0; waitid(P_PID, tracee, &si, WSTOPPED); if (si.si_pid && si.si_code == CLD_TRAPPED) ptrace(PTRACE_CONT, tracee, NULL, (void *)(long)si.si_status); } printf("tracer: EXITING\n"); return 0; } while (1) { si.si_pid = 0; waitid(P_PID, tracee, &si, WSTOPPED | WCONTINUED | WEXITED); if (si.si_pid) printf("mommy : WAIT status=%02d code=%02d\n", si.si_status, si.si_code); } return 0; } Before this patch, while ptraced, the real parent doesn't get notifications for job control events, so although it can access those events, the later waitid(2) call never wakes up. tracee: SIGSTOP mommy : WAIT status=19 code=05 tracee: SIGCONT tracee: SIGSTOP tracee: SIGCONT tracee: SIGSTOP tracee: SIGCONT tracee: SIGSTOP tracer: EXITING mommy : WAIT status=19 code=05 ^C After this patch, it works as expected. tracee: SIGSTOP mommy : WAIT status=19 code=05 tracee: SIGCONT mommy : WAIT status=18 code=06 tracee: SIGSTOP mommy : WAIT status=19 code=05 tracee: SIGCONT mommy : WAIT status=18 code=06 tracee: SIGSTOP mommy : WAIT status=19 code=05 tracee: SIGCONT mommy : WAIT status=18 code=06 tracee: SIGSTOP tracer: EXITING mommy : WAIT status=19 code=05 ^C -v2: Oleg pointed out that * Group stop notification to the real parent should also happen when ptracer detach races with ptrace_stop(). * real_parent_is_ptracer() should be testing thread group equality not the task itself as wait(2) and stop/cont notifications are normally thread-group wide. Both issues are fixed accordingly. -v3: real_parent_is_ptracer() updated to test child->real_parent instead of child->group_leader->real_parent per Oleg's suggestion. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Oleg Nesterov <oleg@redhat.com>
2011-03-23 03:37:01 -06:00
gstop_done = task_participate_group_stop(current);
/* entering a trap, clear TRAPPING */
task_clear_jobctl_trapping(current);
ptrace: Clean transitions between TASK_STOPPED and TRACED Currently, if the task is STOPPED on ptrace attach, it's left alone and the state is silently changed to TRACED on the next ptrace call. The behavior breaks the assumption that arch_ptrace_stop() is called before any task is poked by ptrace and is ugly in that a task manipulates the state of another task directly. With GROUP_STOP_PENDING, the transitions between TASK_STOPPED and TRACED can be made clean. The tracer can use the flag to tell the tracee to retry stop on attach and detach. On retry, the tracee will enter the desired state in the correct way. The lower 16bits of task->group_stop is used to remember the signal number which caused the last group stop. This is used while retrying for ptrace attach as the original group_exit_code could have been consumed with wait(2) by then. As the real parent may wait(2) and consume the group_exit_code anytime, the group_exit_code needs to be saved separately so that it can be used when switching from regular sleep to ptrace_stop(). This is recorded in the lower 16bits of task->group_stop. If a task is already stopped and there's no intervening SIGCONT, a ptrace request immediately following a successful PTRACE_ATTACH should always succeed even if the tracer doesn't wait(2) for attach completion; however, with this change, the tracee might still be TASK_RUNNING trying to enter TASK_TRACED which would cause the following request to fail with -ESRCH. This intermediate state is hidden from the ptracer by setting GROUP_STOP_TRAPPING on attach and making ptrace_check_attach() wait for it to clear on its signal->wait_chldexit. Completing the transition or getting killed clears TRAPPING and wakes up the tracer. Note that the STOPPED -> RUNNING -> TRACED transition is still visible to other threads which are in the same group as the ptracer and the reverse transition is visible to all. Please read the comments for details. Oleg: * Spotted a race condition where a task may retry group stop without proper bookkeeping. Fixed by redoing bookkeeping on retry. * Spotted that the transition is visible to userland in several different ways. Most are fixed with GROUP_STOP_TRAPPING. Unhandled corner case is documented. * Pointed out not setting GROUP_STOP_SIGMASK on an already stopped task would result in more consistent behavior. * Pointed out that calling ptrace_stop() from do_signal_stop() in TASK_STOPPED can race with group stop start logic and then confuse the TRAPPING wait in ptrace_check_attach(). ptrace_stop() is now called with TASK_RUNNING. * Suggested using signal->wait_chldexit instead of bit wait. * Spotted a race condition between TRACED transition and clearing of TRAPPING. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Oleg Nesterov <oleg@redhat.com> Cc: Roland McGrath <roland@redhat.com> Cc: Jan Kratochvil <jan.kratochvil@redhat.com>
2011-03-23 03:37:00 -06:00
spin_unlock_irq(&current->sighand->siglock);
read_lock(&tasklist_lock);
if (may_ptrace_stop()) {
job control: Notify the real parent of job control events regardless of ptrace With recent changes, job control and ptrace stopped states are properly separated and accessible to the real parent and the ptracer respectively; however, notifications of job control stopped/continued events to the real parent while ptraced are still missing. A ptracee participates in group stop in ptrace_stop() but the completion isn't notified. If participation results in completion of group stop, notify the real parent of the event. The ptrace and group stops are separate and can be handled as such. However, when the real parent and the ptracer are in the same thread group, only the ptrace stop event is visible through wait(2) and the duplicate notifications are different from the current behavior and are confusing. Suppress group stop notification in such cases. The continued state is shared between the real parent and the ptracer but is only meaningful to the real parent. Always notify the real parent and notify the ptracer too for backward compatibility. Similar to stop notification, if the real parent is the ptracer, suppress a duplicate notification. Test case follows. #include <stdio.h> #include <unistd.h> #include <time.h> #include <errno.h> #include <sys/types.h> #include <sys/ptrace.h> #include <sys/wait.h> int main(void) { const struct timespec ts100ms = { .tv_nsec = 100000000 }; pid_t tracee, tracer; siginfo_t si; int i; tracee = fork(); if (tracee == 0) { while (1) { printf("tracee: SIGSTOP\n"); raise(SIGSTOP); nanosleep(&ts100ms, NULL); printf("tracee: SIGCONT\n"); raise(SIGCONT); nanosleep(&ts100ms, NULL); } } waitid(P_PID, tracee, &si, WSTOPPED | WNOHANG | WNOWAIT); tracer = fork(); if (tracer == 0) { nanosleep(&ts100ms, NULL); ptrace(PTRACE_ATTACH, tracee, NULL, NULL); for (i = 0; i < 11; i++) { si.si_pid = 0; waitid(P_PID, tracee, &si, WSTOPPED); if (si.si_pid && si.si_code == CLD_TRAPPED) ptrace(PTRACE_CONT, tracee, NULL, (void *)(long)si.si_status); } printf("tracer: EXITING\n"); return 0; } while (1) { si.si_pid = 0; waitid(P_PID, tracee, &si, WSTOPPED | WCONTINUED | WEXITED); if (si.si_pid) printf("mommy : WAIT status=%02d code=%02d\n", si.si_status, si.si_code); } return 0; } Before this patch, while ptraced, the real parent doesn't get notifications for job control events, so although it can access those events, the later waitid(2) call never wakes up. tracee: SIGSTOP mommy : WAIT status=19 code=05 tracee: SIGCONT tracee: SIGSTOP tracee: SIGCONT tracee: SIGSTOP tracee: SIGCONT tracee: SIGSTOP tracer: EXITING mommy : WAIT status=19 code=05 ^C After this patch, it works as expected. tracee: SIGSTOP mommy : WAIT status=19 code=05 tracee: SIGCONT mommy : WAIT status=18 code=06 tracee: SIGSTOP mommy : WAIT status=19 code=05 tracee: SIGCONT mommy : WAIT status=18 code=06 tracee: SIGSTOP mommy : WAIT status=19 code=05 tracee: SIGCONT mommy : WAIT status=18 code=06 tracee: SIGSTOP tracer: EXITING mommy : WAIT status=19 code=05 ^C -v2: Oleg pointed out that * Group stop notification to the real parent should also happen when ptracer detach races with ptrace_stop(). * real_parent_is_ptracer() should be testing thread group equality not the task itself as wait(2) and stop/cont notifications are normally thread-group wide. Both issues are fixed accordingly. -v3: real_parent_is_ptracer() updated to test child->real_parent instead of child->group_leader->real_parent per Oleg's suggestion. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Oleg Nesterov <oleg@redhat.com>
2011-03-23 03:37:01 -06:00
/*
* Notify parents of the stop.
*
* While ptraced, there are two parents - the ptracer and
* the real_parent of the group_leader. The ptracer should
* know about every stop while the real parent is only
* interested in the completion of group stop. The states
* for the two don't interact with each other. Notify
* separately unless they're gonna be duplicates.
*/
do_notify_parent_cldstop(current, true, why);
if (gstop_done && !real_parent_is_ptracer(current))
do_notify_parent_cldstop(current, false, why);
/*
* Don't want to allow preemption here, because
* sys_ptrace() needs this task to be inactive.
*
* XXX: implement read_unlock_no_resched().
*/
preempt_disable();
read_unlock(&tasklist_lock);
preempt_enable_no_resched();
schedule();
} else {
/*
* By the time we got the lock, our tracer went away.
* Don't drop the lock yet, another tracer may come.
job control: Notify the real parent of job control events regardless of ptrace With recent changes, job control and ptrace stopped states are properly separated and accessible to the real parent and the ptracer respectively; however, notifications of job control stopped/continued events to the real parent while ptraced are still missing. A ptracee participates in group stop in ptrace_stop() but the completion isn't notified. If participation results in completion of group stop, notify the real parent of the event. The ptrace and group stops are separate and can be handled as such. However, when the real parent and the ptracer are in the same thread group, only the ptrace stop event is visible through wait(2) and the duplicate notifications are different from the current behavior and are confusing. Suppress group stop notification in such cases. The continued state is shared between the real parent and the ptracer but is only meaningful to the real parent. Always notify the real parent and notify the ptracer too for backward compatibility. Similar to stop notification, if the real parent is the ptracer, suppress a duplicate notification. Test case follows. #include <stdio.h> #include <unistd.h> #include <time.h> #include <errno.h> #include <sys/types.h> #include <sys/ptrace.h> #include <sys/wait.h> int main(void) { const struct timespec ts100ms = { .tv_nsec = 100000000 }; pid_t tracee, tracer; siginfo_t si; int i; tracee = fork(); if (tracee == 0) { while (1) { printf("tracee: SIGSTOP\n"); raise(SIGSTOP); nanosleep(&ts100ms, NULL); printf("tracee: SIGCONT\n"); raise(SIGCONT); nanosleep(&ts100ms, NULL); } } waitid(P_PID, tracee, &si, WSTOPPED | WNOHANG | WNOWAIT); tracer = fork(); if (tracer == 0) { nanosleep(&ts100ms, NULL); ptrace(PTRACE_ATTACH, tracee, NULL, NULL); for (i = 0; i < 11; i++) { si.si_pid = 0; waitid(P_PID, tracee, &si, WSTOPPED); if (si.si_pid && si.si_code == CLD_TRAPPED) ptrace(PTRACE_CONT, tracee, NULL, (void *)(long)si.si_status); } printf("tracer: EXITING\n"); return 0; } while (1) { si.si_pid = 0; waitid(P_PID, tracee, &si, WSTOPPED | WCONTINUED | WEXITED); if (si.si_pid) printf("mommy : WAIT status=%02d code=%02d\n", si.si_status, si.si_code); } return 0; } Before this patch, while ptraced, the real parent doesn't get notifications for job control events, so although it can access those events, the later waitid(2) call never wakes up. tracee: SIGSTOP mommy : WAIT status=19 code=05 tracee: SIGCONT tracee: SIGSTOP tracee: SIGCONT tracee: SIGSTOP tracee: SIGCONT tracee: SIGSTOP tracer: EXITING mommy : WAIT status=19 code=05 ^C After this patch, it works as expected. tracee: SIGSTOP mommy : WAIT status=19 code=05 tracee: SIGCONT mommy : WAIT status=18 code=06 tracee: SIGSTOP mommy : WAIT status=19 code=05 tracee: SIGCONT mommy : WAIT status=18 code=06 tracee: SIGSTOP mommy : WAIT status=19 code=05 tracee: SIGCONT mommy : WAIT status=18 code=06 tracee: SIGSTOP tracer: EXITING mommy : WAIT status=19 code=05 ^C -v2: Oleg pointed out that * Group stop notification to the real parent should also happen when ptracer detach races with ptrace_stop(). * real_parent_is_ptracer() should be testing thread group equality not the task itself as wait(2) and stop/cont notifications are normally thread-group wide. Both issues are fixed accordingly. -v3: real_parent_is_ptracer() updated to test child->real_parent instead of child->group_leader->real_parent per Oleg's suggestion. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Oleg Nesterov <oleg@redhat.com>
2011-03-23 03:37:01 -06:00
*
* If @gstop_done, the ptracer went away between group stop
* completion and here. During detach, it would have set
* JOBCTL_STOP_PENDING on us and we'll re-enter
* TASK_STOPPED in do_signal_stop() on return, so notifying
* the real parent of the group stop completion is enough.
*/
job control: Notify the real parent of job control events regardless of ptrace With recent changes, job control and ptrace stopped states are properly separated and accessible to the real parent and the ptracer respectively; however, notifications of job control stopped/continued events to the real parent while ptraced are still missing. A ptracee participates in group stop in ptrace_stop() but the completion isn't notified. If participation results in completion of group stop, notify the real parent of the event. The ptrace and group stops are separate and can be handled as such. However, when the real parent and the ptracer are in the same thread group, only the ptrace stop event is visible through wait(2) and the duplicate notifications are different from the current behavior and are confusing. Suppress group stop notification in such cases. The continued state is shared between the real parent and the ptracer but is only meaningful to the real parent. Always notify the real parent and notify the ptracer too for backward compatibility. Similar to stop notification, if the real parent is the ptracer, suppress a duplicate notification. Test case follows. #include <stdio.h> #include <unistd.h> #include <time.h> #include <errno.h> #include <sys/types.h> #include <sys/ptrace.h> #include <sys/wait.h> int main(void) { const struct timespec ts100ms = { .tv_nsec = 100000000 }; pid_t tracee, tracer; siginfo_t si; int i; tracee = fork(); if (tracee == 0) { while (1) { printf("tracee: SIGSTOP\n"); raise(SIGSTOP); nanosleep(&ts100ms, NULL); printf("tracee: SIGCONT\n"); raise(SIGCONT); nanosleep(&ts100ms, NULL); } } waitid(P_PID, tracee, &si, WSTOPPED | WNOHANG | WNOWAIT); tracer = fork(); if (tracer == 0) { nanosleep(&ts100ms, NULL); ptrace(PTRACE_ATTACH, tracee, NULL, NULL); for (i = 0; i < 11; i++) { si.si_pid = 0; waitid(P_PID, tracee, &si, WSTOPPED); if (si.si_pid && si.si_code == CLD_TRAPPED) ptrace(PTRACE_CONT, tracee, NULL, (void *)(long)si.si_status); } printf("tracer: EXITING\n"); return 0; } while (1) { si.si_pid = 0; waitid(P_PID, tracee, &si, WSTOPPED | WCONTINUED | WEXITED); if (si.si_pid) printf("mommy : WAIT status=%02d code=%02d\n", si.si_status, si.si_code); } return 0; } Before this patch, while ptraced, the real parent doesn't get notifications for job control events, so although it can access those events, the later waitid(2) call never wakes up. tracee: SIGSTOP mommy : WAIT status=19 code=05 tracee: SIGCONT tracee: SIGSTOP tracee: SIGCONT tracee: SIGSTOP tracee: SIGCONT tracee: SIGSTOP tracer: EXITING mommy : WAIT status=19 code=05 ^C After this patch, it works as expected. tracee: SIGSTOP mommy : WAIT status=19 code=05 tracee: SIGCONT mommy : WAIT status=18 code=06 tracee: SIGSTOP mommy : WAIT status=19 code=05 tracee: SIGCONT mommy : WAIT status=18 code=06 tracee: SIGSTOP mommy : WAIT status=19 code=05 tracee: SIGCONT mommy : WAIT status=18 code=06 tracee: SIGSTOP tracer: EXITING mommy : WAIT status=19 code=05 ^C -v2: Oleg pointed out that * Group stop notification to the real parent should also happen when ptracer detach races with ptrace_stop(). * real_parent_is_ptracer() should be testing thread group equality not the task itself as wait(2) and stop/cont notifications are normally thread-group wide. Both issues are fixed accordingly. -v3: real_parent_is_ptracer() updated to test child->real_parent instead of child->group_leader->real_parent per Oleg's suggestion. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Oleg Nesterov <oleg@redhat.com>
2011-03-23 03:37:01 -06:00
if (gstop_done)
do_notify_parent_cldstop(current, false, why);
__set_current_state(TASK_RUNNING);
if (clear_code)
current->exit_code = 0;
read_unlock(&tasklist_lock);
}
/*
* While in TASK_TRACED, we were considered "frozen enough".
* Now that we woke up, it's crucial if we're supposed to be
* frozen that we freeze now before running anything substantial.
*/
try_to_freeze();
/*
* We are back. Now reacquire the siglock before touching
* last_siginfo, so that we are sure to have synchronized with
* any signal-sending on another CPU that wants to examine it.
*/
spin_lock_irq(&current->sighand->siglock);
current->last_siginfo = NULL;
/*
* Queued signals ignored us while we were stopped for tracing.
* So check for any that we should take before resuming user mode.
* This sets TIF_SIGPENDING, but never clears it.
*/
recalc_sigpending_tsk(current);
}
void ptrace_notify(int exit_code)
{
siginfo_t info;
BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP);
memset(&info, 0, sizeof info);
info.si_signo = SIGTRAP;
info.si_code = exit_code;
info.si_pid = task_pid_vnr(current);
info.si_uid = current_uid();
/* Let the debugger run. */
spin_lock_irq(&current->sighand->siglock);
ptrace_stop(exit_code, CLD_TRAPPED, 1, &info);
spin_unlock_irq(&current->sighand->siglock);
}
/*
* This performs the stopping for SIGSTOP and other stop signals.
* We have to stop all threads in the thread group.
* Returns non-zero if we've actually stopped and released the siglock.
* Returns zero if we didn't stop and still hold the siglock.
*/
static int do_signal_stop(int signr)
{
struct signal_struct *sig = current->signal;
if (!(current->jobctl & JOBCTL_STOP_PENDING)) {
unsigned int gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME;
struct task_struct *t;
/* signr will be recorded in task->jobctl for retries */
WARN_ON_ONCE(signr & ~JOBCTL_STOP_SIGMASK);
ptrace: Clean transitions between TASK_STOPPED and TRACED Currently, if the task is STOPPED on ptrace attach, it's left alone and the state is silently changed to TRACED on the next ptrace call. The behavior breaks the assumption that arch_ptrace_stop() is called before any task is poked by ptrace and is ugly in that a task manipulates the state of another task directly. With GROUP_STOP_PENDING, the transitions between TASK_STOPPED and TRACED can be made clean. The tracer can use the flag to tell the tracee to retry stop on attach and detach. On retry, the tracee will enter the desired state in the correct way. The lower 16bits of task->group_stop is used to remember the signal number which caused the last group stop. This is used while retrying for ptrace attach as the original group_exit_code could have been consumed with wait(2) by then. As the real parent may wait(2) and consume the group_exit_code anytime, the group_exit_code needs to be saved separately so that it can be used when switching from regular sleep to ptrace_stop(). This is recorded in the lower 16bits of task->group_stop. If a task is already stopped and there's no intervening SIGCONT, a ptrace request immediately following a successful PTRACE_ATTACH should always succeed even if the tracer doesn't wait(2) for attach completion; however, with this change, the tracee might still be TASK_RUNNING trying to enter TASK_TRACED which would cause the following request to fail with -ESRCH. This intermediate state is hidden from the ptracer by setting GROUP_STOP_TRAPPING on attach and making ptrace_check_attach() wait for it to clear on its signal->wait_chldexit. Completing the transition or getting killed clears TRAPPING and wakes up the tracer. Note that the STOPPED -> RUNNING -> TRACED transition is still visible to other threads which are in the same group as the ptracer and the reverse transition is visible to all. Please read the comments for details. Oleg: * Spotted a race condition where a task may retry group stop without proper bookkeeping. Fixed by redoing bookkeeping on retry. * Spotted that the transition is visible to userland in several different ways. Most are fixed with GROUP_STOP_TRAPPING. Unhandled corner case is documented. * Pointed out not setting GROUP_STOP_SIGMASK on an already stopped task would result in more consistent behavior. * Pointed out that calling ptrace_stop() from do_signal_stop() in TASK_STOPPED can race with group stop start logic and then confuse the TRAPPING wait in ptrace_check_attach(). ptrace_stop() is now called with TASK_RUNNING. * Suggested using signal->wait_chldexit instead of bit wait. * Spotted a race condition between TRACED transition and clearing of TRAPPING. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Oleg Nesterov <oleg@redhat.com> Cc: Roland McGrath <roland@redhat.com> Cc: Jan Kratochvil <jan.kratochvil@redhat.com>
2011-03-23 03:37:00 -06:00
if (!likely(current->jobctl & JOBCTL_STOP_DEQUEUED) ||
unlikely(signal_group_exit(sig)))
return 0;
/*
job control: Don't set group_stop exit_code if re-entering job control stop While ptraced, a task may be resumed while the containing process is still job control stopped. If the task receives another stop signal in this state, it will still initiate group stop, which generates group_exit_code, which the real parent would be able to see once the ptracer detaches. In this scenario, the real parent may see two consecutive CLD_STOPPED events from two stop signals without intervening SIGCONT, which normally is impossible. Test case follows. #include <stdio.h> #include <unistd.h> #include <sys/ptrace.h> #include <sys/wait.h> int main(void) { pid_t tracee; siginfo_t si; tracee = fork(); if (!tracee) while (1) pause(); kill(tracee, SIGSTOP); waitid(P_PID, tracee, &si, WSTOPPED); if (!fork()) { ptrace(PTRACE_ATTACH, tracee, NULL, NULL); waitid(P_PID, tracee, &si, WSTOPPED); ptrace(PTRACE_CONT, tracee, NULL, (void *)(long)si.si_status); waitid(P_PID, tracee, &si, WSTOPPED); ptrace(PTRACE_CONT, tracee, NULL, (void *)(long)si.si_status); waitid(P_PID, tracee, &si, WSTOPPED); ptrace(PTRACE_DETACH, tracee, NULL, NULL); return 0; } while (1) { si.si_pid = 0; waitid(P_PID, tracee, &si, WSTOPPED | WNOHANG); if (si.si_pid) printf("st=%02d c=%02d\n", si.si_status, si.si_code); } return 0; } Before the patch, the latter waitid() in polling mode reports the second stopped event generated by the implied SIGSTOP of PTRACE_ATTACH. st=19 c=05 ^C After the patch, the second event is not reported. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Oleg Nesterov <oleg@redhat.com>
2011-03-23 03:37:01 -06:00
* There is no group stop already in progress. We must
* initiate one now.
*
* While ptraced, a task may be resumed while group stop is
* still in effect and then receive a stop signal and
* initiate another group stop. This deviates from the
* usual behavior as two consecutive stop signals can't
* cause two group stops when !ptraced. That is why we
* also check !task_is_stopped(t) below.
job control: Don't set group_stop exit_code if re-entering job control stop While ptraced, a task may be resumed while the containing process is still job control stopped. If the task receives another stop signal in this state, it will still initiate group stop, which generates group_exit_code, which the real parent would be able to see once the ptracer detaches. In this scenario, the real parent may see two consecutive CLD_STOPPED events from two stop signals without intervening SIGCONT, which normally is impossible. Test case follows. #include <stdio.h> #include <unistd.h> #include <sys/ptrace.h> #include <sys/wait.h> int main(void) { pid_t tracee; siginfo_t si; tracee = fork(); if (!tracee) while (1) pause(); kill(tracee, SIGSTOP); waitid(P_PID, tracee, &si, WSTOPPED); if (!fork()) { ptrace(PTRACE_ATTACH, tracee, NULL, NULL); waitid(P_PID, tracee, &si, WSTOPPED); ptrace(PTRACE_CONT, tracee, NULL, (void *)(long)si.si_status); waitid(P_PID, tracee, &si, WSTOPPED); ptrace(PTRACE_CONT, tracee, NULL, (void *)(long)si.si_status); waitid(P_PID, tracee, &si, WSTOPPED); ptrace(PTRACE_DETACH, tracee, NULL, NULL); return 0; } while (1) { si.si_pid = 0; waitid(P_PID, tracee, &si, WSTOPPED | WNOHANG); if (si.si_pid) printf("st=%02d c=%02d\n", si.si_status, si.si_code); } return 0; } Before the patch, the latter waitid() in polling mode reports the second stopped event generated by the implied SIGSTOP of PTRACE_ATTACH. st=19 c=05 ^C After the patch, the second event is not reported. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Oleg Nesterov <oleg@redhat.com>
2011-03-23 03:37:01 -06:00
*
* The condition can be distinguished by testing whether
* SIGNAL_STOP_STOPPED is already set. Don't generate
* group_exit_code in such case.
*
* This is not necessary for SIGNAL_STOP_CONTINUED because
* an intervening stop signal is required to cause two
* continued events regardless of ptrace.
*/
job control: Don't set group_stop exit_code if re-entering job control stop While ptraced, a task may be resumed while the containing process is still job control stopped. If the task receives another stop signal in this state, it will still initiate group stop, which generates group_exit_code, which the real parent would be able to see once the ptracer detaches. In this scenario, the real parent may see two consecutive CLD_STOPPED events from two stop signals without intervening SIGCONT, which normally is impossible. Test case follows. #include <stdio.h> #include <unistd.h> #include <sys/ptrace.h> #include <sys/wait.h> int main(void) { pid_t tracee; siginfo_t si; tracee = fork(); if (!tracee) while (1) pause(); kill(tracee, SIGSTOP); waitid(P_PID, tracee, &si, WSTOPPED); if (!fork()) { ptrace(PTRACE_ATTACH, tracee, NULL, NULL); waitid(P_PID, tracee, &si, WSTOPPED); ptrace(PTRACE_CONT, tracee, NULL, (void *)(long)si.si_status); waitid(P_PID, tracee, &si, WSTOPPED); ptrace(PTRACE_CONT, tracee, NULL, (void *)(long)si.si_status); waitid(P_PID, tracee, &si, WSTOPPED); ptrace(PTRACE_DETACH, tracee, NULL, NULL); return 0; } while (1) { si.si_pid = 0; waitid(P_PID, tracee, &si, WSTOPPED | WNOHANG); if (si.si_pid) printf("st=%02d c=%02d\n", si.si_status, si.si_code); } return 0; } Before the patch, the latter waitid() in polling mode reports the second stopped event generated by the implied SIGSTOP of PTRACE_ATTACH. st=19 c=05 ^C After the patch, the second event is not reported. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Oleg Nesterov <oleg@redhat.com>
2011-03-23 03:37:01 -06:00
if (!(sig->flags & SIGNAL_STOP_STOPPED))
sig->group_exit_code = signr;
else
WARN_ON_ONCE(!task_ptrace(current));
sig->group_stop_count = 0;
if (task_set_jobctl_pending(current, signr | gstop))
sig->group_stop_count++;
ptrace: Clean transitions between TASK_STOPPED and TRACED Currently, if the task is STOPPED on ptrace attach, it's left alone and the state is silently changed to TRACED on the next ptrace call. The behavior breaks the assumption that arch_ptrace_stop() is called before any task is poked by ptrace and is ugly in that a task manipulates the state of another task directly. With GROUP_STOP_PENDING, the transitions between TASK_STOPPED and TRACED can be made clean. The tracer can use the flag to tell the tracee to retry stop on attach and detach. On retry, the tracee will enter the desired state in the correct way. The lower 16bits of task->group_stop is used to remember the signal number which caused the last group stop. This is used while retrying for ptrace attach as the original group_exit_code could have been consumed with wait(2) by then. As the real parent may wait(2) and consume the group_exit_code anytime, the group_exit_code needs to be saved separately so that it can be used when switching from regular sleep to ptrace_stop(). This is recorded in the lower 16bits of task->group_stop. If a task is already stopped and there's no intervening SIGCONT, a ptrace request immediately following a successful PTRACE_ATTACH should always succeed even if the tracer doesn't wait(2) for attach completion; however, with this change, the tracee might still be TASK_RUNNING trying to enter TASK_TRACED which would cause the following request to fail with -ESRCH. This intermediate state is hidden from the ptracer by setting GROUP_STOP_TRAPPING on attach and making ptrace_check_attach() wait for it to clear on its signal->wait_chldexit. Completing the transition or getting killed clears TRAPPING and wakes up the tracer. Note that the STOPPED -> RUNNING -> TRACED transition is still visible to other threads which are in the same group as the ptracer and the reverse transition is visible to all. Please read the comments for details. Oleg: * Spotted a race condition where a task may retry group stop without proper bookkeeping. Fixed by redoing bookkeeping on retry. * Spotted that the transition is visible to userland in several different ways. Most are fixed with GROUP_STOP_TRAPPING. Unhandled corner case is documented. * Pointed out not setting GROUP_STOP_SIGMASK on an already stopped task would result in more consistent behavior. * Pointed out that calling ptrace_stop() from do_signal_stop() in TASK_STOPPED can race with group stop start logic and then confuse the TRAPPING wait in ptrace_check_attach(). ptrace_stop() is now called with TASK_RUNNING. * Suggested using signal->wait_chldexit instead of bit wait. * Spotted a race condition between TRACED transition and clearing of TRAPPING. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Oleg Nesterov <oleg@redhat.com> Cc: Roland McGrath <roland@redhat.com> Cc: Jan Kratochvil <jan.kratochvil@redhat.com>
2011-03-23 03:37:00 -06:00
for (t = next_thread(current); t != current;
t = next_thread(t)) {
/*
* Setting state to TASK_STOPPED for a group
* stop is always done with the siglock held,
* so this check has no races.
*/
if (!task_is_stopped(t) &&
task_set_jobctl_pending(t, signr | gstop)) {
sig->group_stop_count++;
signal_wake_up(t, 0);
}
ptrace: Clean transitions between TASK_STOPPED and TRACED Currently, if the task is STOPPED on ptrace attach, it's left alone and the state is silently changed to TRACED on the next ptrace call. The behavior breaks the assumption that arch_ptrace_stop() is called before any task is poked by ptrace and is ugly in that a task manipulates the state of another task directly. With GROUP_STOP_PENDING, the transitions between TASK_STOPPED and TRACED can be made clean. The tracer can use the flag to tell the tracee to retry stop on attach and detach. On retry, the tracee will enter the desired state in the correct way. The lower 16bits of task->group_stop is used to remember the signal number which caused the last group stop. This is used while retrying for ptrace attach as the original group_exit_code could have been consumed with wait(2) by then. As the real parent may wait(2) and consume the group_exit_code anytime, the group_exit_code needs to be saved separately so that it can be used when switching from regular sleep to ptrace_stop(). This is recorded in the lower 16bits of task->group_stop. If a task is already stopped and there's no intervening SIGCONT, a ptrace request immediately following a successful PTRACE_ATTACH should always succeed even if the tracer doesn't wait(2) for attach completion; however, with this change, the tracee might still be TASK_RUNNING trying to enter TASK_TRACED which would cause the following request to fail with -ESRCH. This intermediate state is hidden from the ptracer by setting GROUP_STOP_TRAPPING on attach and making ptrace_check_attach() wait for it to clear on its signal->wait_chldexit. Completing the transition or getting killed clears TRAPPING and wakes up the tracer. Note that the STOPPED -> RUNNING -> TRACED transition is still visible to other threads which are in the same group as the ptracer and the reverse transition is visible to all. Please read the comments for details. Oleg: * Spotted a race condition where a task may retry group stop without proper bookkeeping. Fixed by redoing bookkeeping on retry. * Spotted that the transition is visible to userland in several different ways. Most are fixed with GROUP_STOP_TRAPPING. Unhandled corner case is documented. * Pointed out not setting GROUP_STOP_SIGMASK on an already stopped task would result in more consistent behavior. * Pointed out that calling ptrace_stop() from do_signal_stop() in TASK_STOPPED can race with group stop start logic and then confuse the TRAPPING wait in ptrace_check_attach(). ptrace_stop() is now called with TASK_RUNNING. * Suggested using signal->wait_chldexit instead of bit wait. * Spotted a race condition between TRACED transition and clearing of TRAPPING. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Oleg Nesterov <oleg@redhat.com> Cc: Roland McGrath <roland@redhat.com> Cc: Jan Kratochvil <jan.kratochvil@redhat.com>
2011-03-23 03:37:00 -06:00
}
}
ptrace: Clean transitions between TASK_STOPPED and TRACED Currently, if the task is STOPPED on ptrace attach, it's left alone and the state is silently changed to TRACED on the next ptrace call. The behavior breaks the assumption that arch_ptrace_stop() is called before any task is poked by ptrace and is ugly in that a task manipulates the state of another task directly. With GROUP_STOP_PENDING, the transitions between TASK_STOPPED and TRACED can be made clean. The tracer can use the flag to tell the tracee to retry stop on attach and detach. On retry, the tracee will enter the desired state in the correct way. The lower 16bits of task->group_stop is used to remember the signal number which caused the last group stop. This is used while retrying for ptrace attach as the original group_exit_code could have been consumed with wait(2) by then. As the real parent may wait(2) and consume the group_exit_code anytime, the group_exit_code needs to be saved separately so that it can be used when switching from regular sleep to ptrace_stop(). This is recorded in the lower 16bits of task->group_stop. If a task is already stopped and there's no intervening SIGCONT, a ptrace request immediately following a successful PTRACE_ATTACH should always succeed even if the tracer doesn't wait(2) for attach completion; however, with this change, the tracee might still be TASK_RUNNING trying to enter TASK_TRACED which would cause the following request to fail with -ESRCH. This intermediate state is hidden from the ptracer by setting GROUP_STOP_TRAPPING on attach and making ptrace_check_attach() wait for it to clear on its signal->wait_chldexit. Completing the transition or getting killed clears TRAPPING and wakes up the tracer. Note that the STOPPED -> RUNNING -> TRACED transition is still visible to other threads which are in the same group as the ptracer and the reverse transition is visible to all. Please read the comments for details. Oleg: * Spotted a race condition where a task may retry group stop without proper bookkeeping. Fixed by redoing bookkeeping on retry. * Spotted that the transition is visible to userland in several different ways. Most are fixed with GROUP_STOP_TRAPPING. Unhandled corner case is documented. * Pointed out not setting GROUP_STOP_SIGMASK on an already stopped task would result in more consistent behavior. * Pointed out that calling ptrace_stop() from do_signal_stop() in TASK_STOPPED can race with group stop start logic and then confuse the TRAPPING wait in ptrace_check_attach(). ptrace_stop() is now called with TASK_RUNNING. * Suggested using signal->wait_chldexit instead of bit wait. * Spotted a race condition between TRACED transition and clearing of TRAPPING. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Oleg Nesterov <oleg@redhat.com> Cc: Roland McGrath <roland@redhat.com> Cc: Jan Kratochvil <jan.kratochvil@redhat.com>
2011-03-23 03:37:00 -06:00
retry:
ptrace: Make do_signal_stop() use ptrace_stop() if the task is being ptraced A ptraced task would still stop at do_signal_stop() when it's stopping for stop signals and do_signal_stop() behaves the same whether the task is ptraced or not. However, in addition to stopping, ptrace_stop() also does ptrace specific stuff like calling architecture specific callbacks, so this behavior makes the code more fragile and difficult to understand. This patch makes do_signal_stop() test whether the task is ptraced and use ptrace_stop() if so. This renders tracehook_notify_jctl() rather pointless as the ptrace notification is now handled by ptrace_stop() regardless of the return value from the tracehook. It probably is a good idea to update it. This doesn't solve the whole problem as tasks already in stopped state would stay in the regular stop when ptrace attached. That part will be handled by the next patch. Oleg pointed out that this makes a userland-visible change. Before, SIGCONT would be able to wake up a task in group stop even if the task is ptraced if the tracer hasn't issued another ptrace command afterwards (as the next ptrace commands transitions the state into TASK_TRACED which ignores SIGCONT wakeups). With this and the next patch, SIGCONT may race with the transition into TASK_TRACED and is ignored if the tracee already entered TASK_TRACED. Another userland visible change of this and the next patch is that the ptracee's state would now be TASK_TRACED where it used to be TASK_STOPPED, which is visible via fs/proc. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Oleg Nesterov <oleg@redhat.com> Cc: Roland McGrath <roland@redhat.com> Cc: Jan Kratochvil <jan.kratochvil@redhat.com>
2011-03-23 03:37:00 -06:00
if (likely(!task_ptrace(current))) {
int notify = 0;
ptrace: Make do_signal_stop() use ptrace_stop() if the task is being ptraced A ptraced task would still stop at do_signal_stop() when it's stopping for stop signals and do_signal_stop() behaves the same whether the task is ptraced or not. However, in addition to stopping, ptrace_stop() also does ptrace specific stuff like calling architecture specific callbacks, so this behavior makes the code more fragile and difficult to understand. This patch makes do_signal_stop() test whether the task is ptraced and use ptrace_stop() if so. This renders tracehook_notify_jctl() rather pointless as the ptrace notification is now handled by ptrace_stop() regardless of the return value from the tracehook. It probably is a good idea to update it. This doesn't solve the whole problem as tasks already in stopped state would stay in the regular stop when ptrace attached. That part will be handled by the next patch. Oleg pointed out that this makes a userland-visible change. Before, SIGCONT would be able to wake up a task in group stop even if the task is ptraced if the tracer hasn't issued another ptrace command afterwards (as the next ptrace commands transitions the state into TASK_TRACED which ignores SIGCONT wakeups). With this and the next patch, SIGCONT may race with the transition into TASK_TRACED and is ignored if the tracee already entered TASK_TRACED. Another userland visible change of this and the next patch is that the ptracee's state would now be TASK_TRACED where it used to be TASK_STOPPED, which is visible via fs/proc. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Oleg Nesterov <oleg@redhat.com> Cc: Roland McGrath <roland@redhat.com> Cc: Jan Kratochvil <jan.kratochvil@redhat.com>
2011-03-23 03:37:00 -06:00
/*
* If there are no other threads in the group, or if there
* is a group stop in progress and we are the last to stop,
* report to the parent.
*/
if (task_participate_group_stop(current))
notify = CLD_STOPPED;
__set_current_state(TASK_STOPPED);
ptrace: Make do_signal_stop() use ptrace_stop() if the task is being ptraced A ptraced task would still stop at do_signal_stop() when it's stopping for stop signals and do_signal_stop() behaves the same whether the task is ptraced or not. However, in addition to stopping, ptrace_stop() also does ptrace specific stuff like calling architecture specific callbacks, so this behavior makes the code more fragile and difficult to understand. This patch makes do_signal_stop() test whether the task is ptraced and use ptrace_stop() if so. This renders tracehook_notify_jctl() rather pointless as the ptrace notification is now handled by ptrace_stop() regardless of the return value from the tracehook. It probably is a good idea to update it. This doesn't solve the whole problem as tasks already in stopped state would stay in the regular stop when ptrace attached. That part will be handled by the next patch. Oleg pointed out that this makes a userland-visible change. Before, SIGCONT would be able to wake up a task in group stop even if the task is ptraced if the tracer hasn't issued another ptrace command afterwards (as the next ptrace commands transitions the state into TASK_TRACED which ignores SIGCONT wakeups). With this and the next patch, SIGCONT may race with the transition into TASK_TRACED and is ignored if the tracee already entered TASK_TRACED. Another userland visible change of this and the next patch is that the ptracee's state would now be TASK_TRACED where it used to be TASK_STOPPED, which is visible via fs/proc. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Oleg Nesterov <oleg@redhat.com> Cc: Roland McGrath <roland@redhat.com> Cc: Jan Kratochvil <jan.kratochvil@redhat.com>
2011-03-23 03:37:00 -06:00
spin_unlock_irq(&current->sighand->siglock);
/*
* Notify the parent of the group stop completion. Because
* we're not holding either the siglock or tasklist_lock
* here, ptracer may attach inbetween; however, this is for
* group stop and should always be delivered to the real
* parent of the group leader. The new ptracer will get
* its notification when this task transitions into
* TASK_TRACED.
*/
ptrace: Make do_signal_stop() use ptrace_stop() if the task is being ptraced A ptraced task would still stop at do_signal_stop() when it's stopping for stop signals and do_signal_stop() behaves the same whether the task is ptraced or not. However, in addition to stopping, ptrace_stop() also does ptrace specific stuff like calling architecture specific callbacks, so this behavior makes the code more fragile and difficult to understand. This patch makes do_signal_stop() test whether the task is ptraced and use ptrace_stop() if so. This renders tracehook_notify_jctl() rather pointless as the ptrace notification is now handled by ptrace_stop() regardless of the return value from the tracehook. It probably is a good idea to update it. This doesn't solve the whole problem as tasks already in stopped state would stay in the regular stop when ptrace attached. That part will be handled by the next patch. Oleg pointed out that this makes a userland-visible change. Before, SIGCONT would be able to wake up a task in group stop even if the task is ptraced if the tracer hasn't issued another ptrace command afterwards (as the next ptrace commands transitions the state into TASK_TRACED which ignores SIGCONT wakeups). With this and the next patch, SIGCONT may race with the transition into TASK_TRACED and is ignored if the tracee already entered TASK_TRACED. Another userland visible change of this and the next patch is that the ptracee's state would now be TASK_TRACED where it used to be TASK_STOPPED, which is visible via fs/proc. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Oleg Nesterov <oleg@redhat.com> Cc: Roland McGrath <roland@redhat.com> Cc: Jan Kratochvil <jan.kratochvil@redhat.com>
2011-03-23 03:37:00 -06:00
if (notify) {
read_lock(&tasklist_lock);
do_notify_parent_cldstop(current, false, notify);
ptrace: Make do_signal_stop() use ptrace_stop() if the task is being ptraced A ptraced task would still stop at do_signal_stop() when it's stopping for stop signals and do_signal_stop() behaves the same whether the task is ptraced or not. However, in addition to stopping, ptrace_stop() also does ptrace specific stuff like calling architecture specific callbacks, so this behavior makes the code more fragile and difficult to understand. This patch makes do_signal_stop() test whether the task is ptraced and use ptrace_stop() if so. This renders tracehook_notify_jctl() rather pointless as the ptrace notification is now handled by ptrace_stop() regardless of the return value from the tracehook. It probably is a good idea to update it. This doesn't solve the whole problem as tasks already in stopped state would stay in the regular stop when ptrace attached. That part will be handled by the next patch. Oleg pointed out that this makes a userland-visible change. Before, SIGCONT would be able to wake up a task in group stop even if the task is ptraced if the tracer hasn't issued another ptrace command afterwards (as the next ptrace commands transitions the state into TASK_TRACED which ignores SIGCONT wakeups). With this and the next patch, SIGCONT may race with the transition into TASK_TRACED and is ignored if the tracee already entered TASK_TRACED. Another userland visible change of this and the next patch is that the ptracee's state would now be TASK_TRACED where it used to be TASK_STOPPED, which is visible via fs/proc. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Oleg Nesterov <oleg@redhat.com> Cc: Roland McGrath <roland@redhat.com> Cc: Jan Kratochvil <jan.kratochvil@redhat.com>
2011-03-23 03:37:00 -06:00
read_unlock(&tasklist_lock);
}
/* Now we don't run again until woken by SIGCONT or SIGKILL */
schedule();
spin_lock_irq(&current->sighand->siglock);
ptrace: Clean transitions between TASK_STOPPED and TRACED Currently, if the task is STOPPED on ptrace attach, it's left alone and the state is silently changed to TRACED on the next ptrace call. The behavior breaks the assumption that arch_ptrace_stop() is called before any task is poked by ptrace and is ugly in that a task manipulates the state of another task directly. With GROUP_STOP_PENDING, the transitions between TASK_STOPPED and TRACED can be made clean. The tracer can use the flag to tell the tracee to retry stop on attach and detach. On retry, the tracee will enter the desired state in the correct way. The lower 16bits of task->group_stop is used to remember the signal number which caused the last group stop. This is used while retrying for ptrace attach as the original group_exit_code could have been consumed with wait(2) by then. As the real parent may wait(2) and consume the group_exit_code anytime, the group_exit_code needs to be saved separately so that it can be used when switching from regular sleep to ptrace_stop(). This is recorded in the lower 16bits of task->group_stop. If a task is already stopped and there's no intervening SIGCONT, a ptrace request immediately following a successful PTRACE_ATTACH should always succeed even if the tracer doesn't wait(2) for attach completion; however, with this change, the tracee might still be TASK_RUNNING trying to enter TASK_TRACED which would cause the following request to fail with -ESRCH. This intermediate state is hidden from the ptracer by setting GROUP_STOP_TRAPPING on attach and making ptrace_check_attach() wait for it to clear on its signal->wait_chldexit. Completing the transition or getting killed clears TRAPPING and wakes up the tracer. Note that the STOPPED -> RUNNING -> TRACED transition is still visible to other threads which are in the same group as the ptracer and the reverse transition is visible to all. Please read the comments for details. Oleg: * Spotted a race condition where a task may retry group stop without proper bookkeeping. Fixed by redoing bookkeeping on retry. * Spotted that the transition is visible to userland in several different ways. Most are fixed with GROUP_STOP_TRAPPING. Unhandled corner case is documented. * Pointed out not setting GROUP_STOP_SIGMASK on an already stopped task would result in more consistent behavior. * Pointed out that calling ptrace_stop() from do_signal_stop() in TASK_STOPPED can race with group stop start logic and then confuse the TRAPPING wait in ptrace_check_attach(). ptrace_stop() is now called with TASK_RUNNING. * Suggested using signal->wait_chldexit instead of bit wait. * Spotted a race condition between TRACED transition and clearing of TRAPPING. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Oleg Nesterov <oleg@redhat.com> Cc: Roland McGrath <roland@redhat.com> Cc: Jan Kratochvil <jan.kratochvil@redhat.com>
2011-03-23 03:37:00 -06:00
} else {
ptrace_stop(current->jobctl & JOBCTL_STOP_SIGMASK,
ptrace: Clean transitions between TASK_STOPPED and TRACED Currently, if the task is STOPPED on ptrace attach, it's left alone and the state is silently changed to TRACED on the next ptrace call. The behavior breaks the assumption that arch_ptrace_stop() is called before any task is poked by ptrace and is ugly in that a task manipulates the state of another task directly. With GROUP_STOP_PENDING, the transitions between TASK_STOPPED and TRACED can be made clean. The tracer can use the flag to tell the tracee to retry stop on attach and detach. On retry, the tracee will enter the desired state in the correct way. The lower 16bits of task->group_stop is used to remember the signal number which caused the last group stop. This is used while retrying for ptrace attach as the original group_exit_code could have been consumed with wait(2) by then. As the real parent may wait(2) and consume the group_exit_code anytime, the group_exit_code needs to be saved separately so that it can be used when switching from regular sleep to ptrace_stop(). This is recorded in the lower 16bits of task->group_stop. If a task is already stopped and there's no intervening SIGCONT, a ptrace request immediately following a successful PTRACE_ATTACH should always succeed even if the tracer doesn't wait(2) for attach completion; however, with this change, the tracee might still be TASK_RUNNING trying to enter TASK_TRACED which would cause the following request to fail with -ESRCH. This intermediate state is hidden from the ptracer by setting GROUP_STOP_TRAPPING on attach and making ptrace_check_attach() wait for it to clear on its signal->wait_chldexit. Completing the transition or getting killed clears TRAPPING and wakes up the tracer. Note that the STOPPED -> RUNNING -> TRACED transition is still visible to other threads which are in the same group as the ptracer and the reverse transition is visible to all. Please read the comments for details. Oleg: * Spotted a race condition where a task may retry group stop without proper bookkeeping. Fixed by redoing bookkeeping on retry. * Spotted that the transition is visible to userland in several different ways. Most are fixed with GROUP_STOP_TRAPPING. Unhandled corner case is documented. * Pointed out not setting GROUP_STOP_SIGMASK on an already stopped task would result in more consistent behavior. * Pointed out that calling ptrace_stop() from do_signal_stop() in TASK_STOPPED can race with group stop start logic and then confuse the TRAPPING wait in ptrace_check_attach(). ptrace_stop() is now called with TASK_RUNNING. * Suggested using signal->wait_chldexit instead of bit wait. * Spotted a race condition between TRACED transition and clearing of TRAPPING. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Oleg Nesterov <oleg@redhat.com> Cc: Roland McGrath <roland@redhat.com> Cc: Jan Kratochvil <jan.kratochvil@redhat.com>
2011-03-23 03:37:00 -06:00
CLD_STOPPED, 0, NULL);
current->exit_code = 0;
}
ptrace: Clean transitions between TASK_STOPPED and TRACED Currently, if the task is STOPPED on ptrace attach, it's left alone and the state is silently changed to TRACED on the next ptrace call. The behavior breaks the assumption that arch_ptrace_stop() is called before any task is poked by ptrace and is ugly in that a task manipulates the state of another task directly. With GROUP_STOP_PENDING, the transitions between TASK_STOPPED and TRACED can be made clean. The tracer can use the flag to tell the tracee to retry stop on attach and detach. On retry, the tracee will enter the desired state in the correct way. The lower 16bits of task->group_stop is used to remember the signal number which caused the last group stop. This is used while retrying for ptrace attach as the original group_exit_code could have been consumed with wait(2) by then. As the real parent may wait(2) and consume the group_exit_code anytime, the group_exit_code needs to be saved separately so that it can be used when switching from regular sleep to ptrace_stop(). This is recorded in the lower 16bits of task->group_stop. If a task is already stopped and there's no intervening SIGCONT, a ptrace request immediately following a successful PTRACE_ATTACH should always succeed even if the tracer doesn't wait(2) for attach completion; however, with this change, the tracee might still be TASK_RUNNING trying to enter TASK_TRACED which would cause the following request to fail with -ESRCH. This intermediate state is hidden from the ptracer by setting GROUP_STOP_TRAPPING on attach and making ptrace_check_attach() wait for it to clear on its signal->wait_chldexit. Completing the transition or getting killed clears TRAPPING and wakes up the tracer. Note that the STOPPED -> RUNNING -> TRACED transition is still visible to other threads which are in the same group as the ptracer and the reverse transition is visible to all. Please read the comments for details. Oleg: * Spotted a race condition where a task may retry group stop without proper bookkeeping. Fixed by redoing bookkeeping on retry. * Spotted that the transition is visible to userland in several different ways. Most are fixed with GROUP_STOP_TRAPPING. Unhandled corner case is documented. * Pointed out not setting GROUP_STOP_SIGMASK on an already stopped task would result in more consistent behavior. * Pointed out that calling ptrace_stop() from do_signal_stop() in TASK_STOPPED can race with group stop start logic and then confuse the TRAPPING wait in ptrace_check_attach(). ptrace_stop() is now called with TASK_RUNNING. * Suggested using signal->wait_chldexit instead of bit wait. * Spotted a race condition between TRACED transition and clearing of TRAPPING. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Oleg Nesterov <oleg@redhat.com> Cc: Roland McGrath <roland@redhat.com> Cc: Jan Kratochvil <jan.kratochvil@redhat.com>
2011-03-23 03:37:00 -06:00
/*
* JOBCTL_STOP_PENDING could be set if another group stop has
ptrace: Clean transitions between TASK_STOPPED and TRACED Currently, if the task is STOPPED on ptrace attach, it's left alone and the state is silently changed to TRACED on the next ptrace call. The behavior breaks the assumption that arch_ptrace_stop() is called before any task is poked by ptrace and is ugly in that a task manipulates the state of another task directly. With GROUP_STOP_PENDING, the transitions between TASK_STOPPED and TRACED can be made clean. The tracer can use the flag to tell the tracee to retry stop on attach and detach. On retry, the tracee will enter the desired state in the correct way. The lower 16bits of task->group_stop is used to remember the signal number which caused the last group stop. This is used while retrying for ptrace attach as the original group_exit_code could have been consumed with wait(2) by then. As the real parent may wait(2) and consume the group_exit_code anytime, the group_exit_code needs to be saved separately so that it can be used when switching from regular sleep to ptrace_stop(). This is recorded in the lower 16bits of task->group_stop. If a task is already stopped and there's no intervening SIGCONT, a ptrace request immediately following a successful PTRACE_ATTACH should always succeed even if the tracer doesn't wait(2) for attach completion; however, with this change, the tracee might still be TASK_RUNNING trying to enter TASK_TRACED which would cause the following request to fail with -ESRCH. This intermediate state is hidden from the ptracer by setting GROUP_STOP_TRAPPING on attach and making ptrace_check_attach() wait for it to clear on its signal->wait_chldexit. Completing the transition or getting killed clears TRAPPING and wakes up the tracer. Note that the STOPPED -> RUNNING -> TRACED transition is still visible to other threads which are in the same group as the ptracer and the reverse transition is visible to all. Please read the comments for details. Oleg: * Spotted a race condition where a task may retry group stop without proper bookkeeping. Fixed by redoing bookkeeping on retry. * Spotted that the transition is visible to userland in several different ways. Most are fixed with GROUP_STOP_TRAPPING. Unhandled corner case is documented. * Pointed out not setting GROUP_STOP_SIGMASK on an already stopped task would result in more consistent behavior. * Pointed out that calling ptrace_stop() from do_signal_stop() in TASK_STOPPED can race with group stop start logic and then confuse the TRAPPING wait in ptrace_check_attach(). ptrace_stop() is now called with TASK_RUNNING. * Suggested using signal->wait_chldexit instead of bit wait. * Spotted a race condition between TRACED transition and clearing of TRAPPING. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Oleg Nesterov <oleg@redhat.com> Cc: Roland McGrath <roland@redhat.com> Cc: Jan Kratochvil <jan.kratochvil@redhat.com>
2011-03-23 03:37:00 -06:00
* started since being woken up or ptrace wants us to transit
* between TASK_STOPPED and TRACED. Retry group stop.
*/
if (current->jobctl & JOBCTL_STOP_PENDING) {
WARN_ON_ONCE(!(current->jobctl & JOBCTL_STOP_SIGMASK));
ptrace: Clean transitions between TASK_STOPPED and TRACED Currently, if the task is STOPPED on ptrace attach, it's left alone and the state is silently changed to TRACED on the next ptrace call. The behavior breaks the assumption that arch_ptrace_stop() is called before any task is poked by ptrace and is ugly in that a task manipulates the state of another task directly. With GROUP_STOP_PENDING, the transitions between TASK_STOPPED and TRACED can be made clean. The tracer can use the flag to tell the tracee to retry stop on attach and detach. On retry, the tracee will enter the desired state in the correct way. The lower 16bits of task->group_stop is used to remember the signal number which caused the last group stop. This is used while retrying for ptrace attach as the original group_exit_code could have been consumed with wait(2) by then. As the real parent may wait(2) and consume the group_exit_code anytime, the group_exit_code needs to be saved separately so that it can be used when switching from regular sleep to ptrace_stop(). This is recorded in the lower 16bits of task->group_stop. If a task is already stopped and there's no intervening SIGCONT, a ptrace request immediately following a successful PTRACE_ATTACH should always succeed even if the tracer doesn't wait(2) for attach completion; however, with this change, the tracee might still be TASK_RUNNING trying to enter TASK_TRACED which would cause the following request to fail with -ESRCH. This intermediate state is hidden from the ptracer by setting GROUP_STOP_TRAPPING on attach and making ptrace_check_attach() wait for it to clear on its signal->wait_chldexit. Completing the transition or getting killed clears TRAPPING and wakes up the tracer. Note that the STOPPED -> RUNNING -> TRACED transition is still visible to other threads which are in the same group as the ptracer and the reverse transition is visible to all. Please read the comments for details. Oleg: * Spotted a race condition where a task may retry group stop without proper bookkeeping. Fixed by redoing bookkeeping on retry. * Spotted that the transition is visible to userland in several different ways. Most are fixed with GROUP_STOP_TRAPPING. Unhandled corner case is documented. * Pointed out not setting GROUP_STOP_SIGMASK on an already stopped task would result in more consistent behavior. * Pointed out that calling ptrace_stop() from do_signal_stop() in TASK_STOPPED can race with group stop start logic and then confuse the TRAPPING wait in ptrace_check_attach(). ptrace_stop() is now called with TASK_RUNNING. * Suggested using signal->wait_chldexit instead of bit wait. * Spotted a race condition between TRACED transition and clearing of TRAPPING. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Oleg Nesterov <oleg@redhat.com> Cc: Roland McGrath <roland@redhat.com> Cc: Jan Kratochvil <jan.kratochvil@redhat.com>
2011-03-23 03:37:00 -06:00
goto retry;
}
ptrace: Make do_signal_stop() use ptrace_stop() if the task is being ptraced A ptraced task would still stop at do_signal_stop() when it's stopping for stop signals and do_signal_stop() behaves the same whether the task is ptraced or not. However, in addition to stopping, ptrace_stop() also does ptrace specific stuff like calling architecture specific callbacks, so this behavior makes the code more fragile and difficult to understand. This patch makes do_signal_stop() test whether the task is ptraced and use ptrace_stop() if so. This renders tracehook_notify_jctl() rather pointless as the ptrace notification is now handled by ptrace_stop() regardless of the return value from the tracehook. It probably is a good idea to update it. This doesn't solve the whole problem as tasks already in stopped state would stay in the regular stop when ptrace attached. That part will be handled by the next patch. Oleg pointed out that this makes a userland-visible change. Before, SIGCONT would be able to wake up a task in group stop even if the task is ptraced if the tracer hasn't issued another ptrace command afterwards (as the next ptrace commands transitions the state into TASK_TRACED which ignores SIGCONT wakeups). With this and the next patch, SIGCONT may race with the transition into TASK_TRACED and is ignored if the tracee already entered TASK_TRACED. Another userland visible change of this and the next patch is that the ptracee's state would now be TASK_TRACED where it used to be TASK_STOPPED, which is visible via fs/proc. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Oleg Nesterov <oleg@redhat.com> Cc: Roland McGrath <roland@redhat.com> Cc: Jan Kratochvil <jan.kratochvil@redhat.com>
2011-03-23 03:37:00 -06:00
spin_unlock_irq(&current->sighand->siglock);
return 1;
}
static int ptrace_signal(int signr, siginfo_t *info,
struct pt_regs *regs, void *cookie)
{
if (!task_ptrace(current))
return signr;
ptrace_signal_deliver(regs, cookie);
/* Let the debugger run. */
ptrace_stop(signr, CLD_TRAPPED, 0, info);
/* We're back. Did the debugger cancel the sig? */
signr = current->exit_code;
if (signr == 0)
return signr;
current->exit_code = 0;
/*
* Update the siginfo structure if the signal has
* changed. If the debugger wanted something
* specific in the siginfo structure then it should
* have updated *info via PTRACE_SETSIGINFO.
*/
if (signr != info->si_signo) {
info->si_signo = signr;
info->si_errno = 0;
info->si_code = SI_USER;
info->si_pid = task_pid_vnr(current->parent);
info->si_uid = task_uid(current->parent);
}
/* If the (new) signal is now blocked, requeue it. */
if (sigismember(&current->blocked, signr)) {
specific_send_sig_info(signr, info, current);
signr = 0;
}
return signr;
}
int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka,
struct pt_regs *regs, void *cookie)
{
struct sighand_struct *sighand = current->sighand;
struct signal_struct *signal = current->signal;
int signr;
relock:
/*
* We'll jump back here after any time we were stopped in TASK_STOPPED.
* While in TASK_STOPPED, we were considered "frozen enough".
* Now that we woke up, it's crucial if we're supposed to be
* frozen that we freeze now before running anything substantial.
*/
try_to_freeze();
spin_lock_irq(&sighand->siglock);
/*
* Every stopped thread goes here after wakeup. Check to see if
* we should notify the parent, prepare_signal(SIGCONT) encodes
* the CLD_ si_code into SIGNAL_CLD_MASK bits.
*/
if (unlikely(signal->flags & SIGNAL_CLD_MASK)) {
struct task_struct *leader;
int why;
if (signal->flags & SIGNAL_CLD_CONTINUED)
why = CLD_CONTINUED;
else
why = CLD_STOPPED;
signal->flags &= ~SIGNAL_CLD_MASK;
signals: re-assign CLD_CONTINUED notification from the sender to reciever Based on discussion with Jiri and Roland. In short: currently handle_stop_signal(SIGCONT, p) sends the notification to p->parent, with this patch p itself notifies its parent when it becomes running. handle_stop_signal(SIGCONT) has to drop ->siglock temporary in order to notify the parent with do_notify_parent_cldstop(). This leads to multiple problems: - as Jiri Kosina pointed out, the stopped task can resume without actually seeing SIGCONT which may have a handler. - we race with another sig_kernel_stop() signal which may come in that window. - we race with sig_fatal() signals which may set SIGNAL_GROUP_EXIT in that window. - we can't avoid taking tasklist_lock() while sending SIGCONT. With this patch handle_stop_signal() just sets the new SIGNAL_CLD_CONTINUED flag in p->signal->flags and returns. The notification is sent by the first task which returns from finish_stop() (there should be at least one) or any other signalled thread from get_signal_to_deliver(). This is a user-visible change. Say, currently kill(SIGCONT, stopped_child) can't return without seeing SIGCHLD, with this patch SIGCHLD can be delayed unpredictably. Another difference is that if the child is ptraced by another process, CLD_CONTINUED may be delivered to ->real_parent after ptrace_detach() while currently it always goes to the tracer which doesn't actually need this notification. Hopefully not a problem. The patch asks for the futher obvious cleanups, I'll send them separately. Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru> Cc: Roland McGrath <roland@redhat.com> Cc: Jiri Kosina <jkosina@suse.cz> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 01:52:44 -06:00
spin_unlock_irq(&sighand->siglock);
job control: Notify the real parent of job control events regardless of ptrace With recent changes, job control and ptrace stopped states are properly separated and accessible to the real parent and the ptracer respectively; however, notifications of job control stopped/continued events to the real parent while ptraced are still missing. A ptracee participates in group stop in ptrace_stop() but the completion isn't notified. If participation results in completion of group stop, notify the real parent of the event. The ptrace and group stops are separate and can be handled as such. However, when the real parent and the ptracer are in the same thread group, only the ptrace stop event is visible through wait(2) and the duplicate notifications are different from the current behavior and are confusing. Suppress group stop notification in such cases. The continued state is shared between the real parent and the ptracer but is only meaningful to the real parent. Always notify the real parent and notify the ptracer too for backward compatibility. Similar to stop notification, if the real parent is the ptracer, suppress a duplicate notification. Test case follows. #include <stdio.h> #include <unistd.h> #include <time.h> #include <errno.h> #include <sys/types.h> #include <sys/ptrace.h> #include <sys/wait.h> int main(void) { const struct timespec ts100ms = { .tv_nsec = 100000000 }; pid_t tracee, tracer; siginfo_t si; int i; tracee = fork(); if (tracee == 0) { while (1) { printf("tracee: SIGSTOP\n"); raise(SIGSTOP); nanosleep(&ts100ms, NULL); printf("tracee: SIGCONT\n"); raise(SIGCONT); nanosleep(&ts100ms, NULL); } } waitid(P_PID, tracee, &si, WSTOPPED | WNOHANG | WNOWAIT); tracer = fork(); if (tracer == 0) { nanosleep(&ts100ms, NULL); ptrace(PTRACE_ATTACH, tracee, NULL, NULL); for (i = 0; i < 11; i++) { si.si_pid = 0; waitid(P_PID, tracee, &si, WSTOPPED); if (si.si_pid && si.si_code == CLD_TRAPPED) ptrace(PTRACE_CONT, tracee, NULL, (void *)(long)si.si_status); } printf("tracer: EXITING\n"); return 0; } while (1) { si.si_pid = 0; waitid(P_PID, tracee, &si, WSTOPPED | WCONTINUED | WEXITED); if (si.si_pid) printf("mommy : WAIT status=%02d code=%02d\n", si.si_status, si.si_code); } return 0; } Before this patch, while ptraced, the real parent doesn't get notifications for job control events, so although it can access those events, the later waitid(2) call never wakes up. tracee: SIGSTOP mommy : WAIT status=19 code=05 tracee: SIGCONT tracee: SIGSTOP tracee: SIGCONT tracee: SIGSTOP tracee: SIGCONT tracee: SIGSTOP tracer: EXITING mommy : WAIT status=19 code=05 ^C After this patch, it works as expected. tracee: SIGSTOP mommy : WAIT status=19 code=05 tracee: SIGCONT mommy : WAIT status=18 code=06 tracee: SIGSTOP mommy : WAIT status=19 code=05 tracee: SIGCONT mommy : WAIT status=18 code=06 tracee: SIGSTOP mommy : WAIT status=19 code=05 tracee: SIGCONT mommy : WAIT status=18 code=06 tracee: SIGSTOP tracer: EXITING mommy : WAIT status=19 code=05 ^C -v2: Oleg pointed out that * Group stop notification to the real parent should also happen when ptracer detach races with ptrace_stop(). * real_parent_is_ptracer() should be testing thread group equality not the task itself as wait(2) and stop/cont notifications are normally thread-group wide. Both issues are fixed accordingly. -v3: real_parent_is_ptracer() updated to test child->real_parent instead of child->group_leader->real_parent per Oleg's suggestion. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Oleg Nesterov <oleg@redhat.com>
2011-03-23 03:37:01 -06:00
/*
* Notify the parent that we're continuing. This event is
* always per-process and doesn't make whole lot of sense
* for ptracers, who shouldn't consume the state via
* wait(2) either, but, for backward compatibility, notify
* the ptracer of the group leader too unless it's gonna be
* a duplicate.
*/
read_lock(&tasklist_lock);
job control: Notify the real parent of job control events regardless of ptrace With recent changes, job control and ptrace stopped states are properly separated and accessible to the real parent and the ptracer respectively; however, notifications of job control stopped/continued events to the real parent while ptraced are still missing. A ptracee participates in group stop in ptrace_stop() but the completion isn't notified. If participation results in completion of group stop, notify the real parent of the event. The ptrace and group stops are separate and can be handled as such. However, when the real parent and the ptracer are in the same thread group, only the ptrace stop event is visible through wait(2) and the duplicate notifications are different from the current behavior and are confusing. Suppress group stop notification in such cases. The continued state is shared between the real parent and the ptracer but is only meaningful to the real parent. Always notify the real parent and notify the ptracer too for backward compatibility. Similar to stop notification, if the real parent is the ptracer, suppress a duplicate notification. Test case follows. #include <stdio.h> #include <unistd.h> #include <time.h> #include <errno.h> #include <sys/types.h> #include <sys/ptrace.h> #include <sys/wait.h> int main(void) { const struct timespec ts100ms = { .tv_nsec = 100000000 }; pid_t tracee, tracer; siginfo_t si; int i; tracee = fork(); if (tracee == 0) { while (1) { printf("tracee: SIGSTOP\n"); raise(SIGSTOP); nanosleep(&ts100ms, NULL); printf("tracee: SIGCONT\n"); raise(SIGCONT); nanosleep(&ts100ms, NULL); } } waitid(P_PID, tracee, &si, WSTOPPED | WNOHANG | WNOWAIT); tracer = fork(); if (tracer == 0) { nanosleep(&ts100ms, NULL); ptrace(PTRACE_ATTACH, tracee, NULL, NULL); for (i = 0; i < 11; i++) { si.si_pid = 0; waitid(P_PID, tracee, &si, WSTOPPED); if (si.si_pid && si.si_code == CLD_TRAPPED) ptrace(PTRACE_CONT, tracee, NULL, (void *)(long)si.si_status); } printf("tracer: EXITING\n"); return 0; } while (1) { si.si_pid = 0; waitid(P_PID, tracee, &si, WSTOPPED | WCONTINUED | WEXITED); if (si.si_pid) printf("mommy : WAIT status=%02d code=%02d\n", si.si_status, si.si_code); } return 0; } Before this patch, while ptraced, the real parent doesn't get notifications for job control events, so although it can access those events, the later waitid(2) call never wakes up. tracee: SIGSTOP mommy : WAIT status=19 code=05 tracee: SIGCONT tracee: SIGSTOP tracee: SIGCONT tracee: SIGSTOP tracee: SIGCONT tracee: SIGSTOP tracer: EXITING mommy : WAIT status=19 code=05 ^C After this patch, it works as expected. tracee: SIGSTOP mommy : WAIT status=19 code=05 tracee: SIGCONT mommy : WAIT status=18 code=06 tracee: SIGSTOP mommy : WAIT status=19 code=05 tracee: SIGCONT mommy : WAIT status=18 code=06 tracee: SIGSTOP mommy : WAIT status=19 code=05 tracee: SIGCONT mommy : WAIT status=18 code=06 tracee: SIGSTOP tracer: EXITING mommy : WAIT status=19 code=05 ^C -v2: Oleg pointed out that * Group stop notification to the real parent should also happen when ptracer detach races with ptrace_stop(). * real_parent_is_ptracer() should be testing thread group equality not the task itself as wait(2) and stop/cont notifications are normally thread-group wide. Both issues are fixed accordingly. -v3: real_parent_is_ptracer() updated to test child->real_parent instead of child->group_leader->real_parent per Oleg's suggestion. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Oleg Nesterov <oleg@redhat.com>
2011-03-23 03:37:01 -06:00
do_notify_parent_cldstop(current, false, why);
leader = current->group_leader;
job control: Notify the real parent of job control events regardless of ptrace With recent changes, job control and ptrace stopped states are properly separated and accessible to the real parent and the ptracer respectively; however, notifications of job control stopped/continued events to the real parent while ptraced are still missing. A ptracee participates in group stop in ptrace_stop() but the completion isn't notified. If participation results in completion of group stop, notify the real parent of the event. The ptrace and group stops are separate and can be handled as such. However, when the real parent and the ptracer are in the same thread group, only the ptrace stop event is visible through wait(2) and the duplicate notifications are different from the current behavior and are confusing. Suppress group stop notification in such cases. The continued state is shared between the real parent and the ptracer but is only meaningful to the real parent. Always notify the real parent and notify the ptracer too for backward compatibility. Similar to stop notification, if the real parent is the ptracer, suppress a duplicate notification. Test case follows. #include <stdio.h> #include <unistd.h> #include <time.h> #include <errno.h> #include <sys/types.h> #include <sys/ptrace.h> #include <sys/wait.h> int main(void) { const struct timespec ts100ms = { .tv_nsec = 100000000 }; pid_t tracee, tracer; siginfo_t si; int i; tracee = fork(); if (tracee == 0) { while (1) { printf("tracee: SIGSTOP\n"); raise(SIGSTOP); nanosleep(&ts100ms, NULL); printf("tracee: SIGCONT\n"); raise(SIGCONT); nanosleep(&ts100ms, NULL); } } waitid(P_PID, tracee, &si, WSTOPPED | WNOHANG | WNOWAIT); tracer = fork(); if (tracer == 0) { nanosleep(&ts100ms, NULL); ptrace(PTRACE_ATTACH, tracee, NULL, NULL); for (i = 0; i < 11; i++) { si.si_pid = 0; waitid(P_PID, tracee, &si, WSTOPPED); if (si.si_pid && si.si_code == CLD_TRAPPED) ptrace(PTRACE_CONT, tracee, NULL, (void *)(long)si.si_status); } printf("tracer: EXITING\n"); return 0; } while (1) { si.si_pid = 0; waitid(P_PID, tracee, &si, WSTOPPED | WCONTINUED | WEXITED); if (si.si_pid) printf("mommy : WAIT status=%02d code=%02d\n", si.si_status, si.si_code); } return 0; } Before this patch, while ptraced, the real parent doesn't get notifications for job control events, so although it can access those events, the later waitid(2) call never wakes up. tracee: SIGSTOP mommy : WAIT status=19 code=05 tracee: SIGCONT tracee: SIGSTOP tracee: SIGCONT tracee: SIGSTOP tracee: SIGCONT tracee: SIGSTOP tracer: EXITING mommy : WAIT status=19 code=05 ^C After this patch, it works as expected. tracee: SIGSTOP mommy : WAIT status=19 code=05 tracee: SIGCONT mommy : WAIT status=18 code=06 tracee: SIGSTOP mommy : WAIT status=19 code=05 tracee: SIGCONT mommy : WAIT status=18 code=06 tracee: SIGSTOP mommy : WAIT status=19 code=05 tracee: SIGCONT mommy : WAIT status=18 code=06 tracee: SIGSTOP tracer: EXITING mommy : WAIT status=19 code=05 ^C -v2: Oleg pointed out that * Group stop notification to the real parent should also happen when ptracer detach races with ptrace_stop(). * real_parent_is_ptracer() should be testing thread group equality not the task itself as wait(2) and stop/cont notifications are normally thread-group wide. Both issues are fixed accordingly. -v3: real_parent_is_ptracer() updated to test child->real_parent instead of child->group_leader->real_parent per Oleg's suggestion. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Oleg Nesterov <oleg@redhat.com>
2011-03-23 03:37:01 -06:00
if (task_ptrace(leader) && !real_parent_is_ptracer(leader))
do_notify_parent_cldstop(leader, true, why);
read_unlock(&tasklist_lock);
job control: Notify the real parent of job control events regardless of ptrace With recent changes, job control and ptrace stopped states are properly separated and accessible to the real parent and the ptracer respectively; however, notifications of job control stopped/continued events to the real parent while ptraced are still missing. A ptracee participates in group stop in ptrace_stop() but the completion isn't notified. If participation results in completion of group stop, notify the real parent of the event. The ptrace and group stops are separate and can be handled as such. However, when the real parent and the ptracer are in the same thread group, only the ptrace stop event is visible through wait(2) and the duplicate notifications are different from the current behavior and are confusing. Suppress group stop notification in such cases. The continued state is shared between the real parent and the ptracer but is only meaningful to the real parent. Always notify the real parent and notify the ptracer too for backward compatibility. Similar to stop notification, if the real parent is the ptracer, suppress a duplicate notification. Test case follows. #include <stdio.h> #include <unistd.h> #include <time.h> #include <errno.h> #include <sys/types.h> #include <sys/ptrace.h> #include <sys/wait.h> int main(void) { const struct timespec ts100ms = { .tv_nsec = 100000000 }; pid_t tracee, tracer; siginfo_t si; int i; tracee = fork(); if (tracee == 0) { while (1) { printf("tracee: SIGSTOP\n"); raise(SIGSTOP); nanosleep(&ts100ms, NULL); printf("tracee: SIGCONT\n"); raise(SIGCONT); nanosleep(&ts100ms, NULL); } } waitid(P_PID, tracee, &si, WSTOPPED | WNOHANG | WNOWAIT); tracer = fork(); if (tracer == 0) { nanosleep(&ts100ms, NULL); ptrace(PTRACE_ATTACH, tracee, NULL, NULL); for (i = 0; i < 11; i++) { si.si_pid = 0; waitid(P_PID, tracee, &si, WSTOPPED); if (si.si_pid && si.si_code == CLD_TRAPPED) ptrace(PTRACE_CONT, tracee, NULL, (void *)(long)si.si_status); } printf("tracer: EXITING\n"); return 0; } while (1) { si.si_pid = 0; waitid(P_PID, tracee, &si, WSTOPPED | WCONTINUED | WEXITED); if (si.si_pid) printf("mommy : WAIT status=%02d code=%02d\n", si.si_status, si.si_code); } return 0; } Before this patch, while ptraced, the real parent doesn't get notifications for job control events, so although it can access those events, the later waitid(2) call never wakes up. tracee: SIGSTOP mommy : WAIT status=19 code=05 tracee: SIGCONT tracee: SIGSTOP tracee: SIGCONT tracee: SIGSTOP tracee: SIGCONT tracee: SIGSTOP tracer: EXITING mommy : WAIT status=19 code=05 ^C After this patch, it works as expected. tracee: SIGSTOP mommy : WAIT status=19 code=05 tracee: SIGCONT mommy : WAIT status=18 code=06 tracee: SIGSTOP mommy : WAIT status=19 code=05 tracee: SIGCONT mommy : WAIT status=18 code=06 tracee: SIGSTOP mommy : WAIT status=19 code=05 tracee: SIGCONT mommy : WAIT status=18 code=06 tracee: SIGSTOP tracer: EXITING mommy : WAIT status=19 code=05 ^C -v2: Oleg pointed out that * Group stop notification to the real parent should also happen when ptracer detach races with ptrace_stop(). * real_parent_is_ptracer() should be testing thread group equality not the task itself as wait(2) and stop/cont notifications are normally thread-group wide. Both issues are fixed accordingly. -v3: real_parent_is_ptracer() updated to test child->real_parent instead of child->group_leader->real_parent per Oleg's suggestion. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Oleg Nesterov <oleg@redhat.com>
2011-03-23 03:37:01 -06:00
signals: re-assign CLD_CONTINUED notification from the sender to reciever Based on discussion with Jiri and Roland. In short: currently handle_stop_signal(SIGCONT, p) sends the notification to p->parent, with this patch p itself notifies its parent when it becomes running. handle_stop_signal(SIGCONT) has to drop ->siglock temporary in order to notify the parent with do_notify_parent_cldstop(). This leads to multiple problems: - as Jiri Kosina pointed out, the stopped task can resume without actually seeing SIGCONT which may have a handler. - we race with another sig_kernel_stop() signal which may come in that window. - we race with sig_fatal() signals which may set SIGNAL_GROUP_EXIT in that window. - we can't avoid taking tasklist_lock() while sending SIGCONT. With this patch handle_stop_signal() just sets the new SIGNAL_CLD_CONTINUED flag in p->signal->flags and returns. The notification is sent by the first task which returns from finish_stop() (there should be at least one) or any other signalled thread from get_signal_to_deliver(). This is a user-visible change. Say, currently kill(SIGCONT, stopped_child) can't return without seeing SIGCHLD, with this patch SIGCHLD can be delayed unpredictably. Another difference is that if the child is ptraced by another process, CLD_CONTINUED may be delivered to ->real_parent after ptrace_detach() while currently it always goes to the tracer which doesn't actually need this notification. Hopefully not a problem. The patch asks for the futher obvious cleanups, I'll send them separately. Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru> Cc: Roland McGrath <roland@redhat.com> Cc: Jiri Kosina <jkosina@suse.cz> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 01:52:44 -06:00
goto relock;
}
for (;;) {
struct k_sigaction *ka;
if (unlikely(current->jobctl & JOBCTL_STOP_PENDING) &&
do_signal_stop(0))
goto relock;
signr = dequeue_signal(current, &current->blocked, info);
if (!signr)
break; /* will return 0 */
if (signr != SIGKILL) {
signr = ptrace_signal(signr, info,
regs, cookie);
if (!signr)
continue;
}
ka = &sighand->action[signr-1];
/* Trace actually delivered signals. */
trace_signal_deliver(signr, info, ka);
if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */
continue;
if (ka->sa.sa_handler != SIG_DFL) {
/* Run the handler. */
*return_ka = *ka;
if (ka->sa.sa_flags & SA_ONESHOT)
ka->sa.sa_handler = SIG_DFL;
break; /* will return non-zero "signr" value */
}
/*
* Now we are doing the default action for this signal.
*/
if (sig_kernel_ignore(signr)) /* Default is nothing. */
continue;
/*
* Global init gets no signals it doesn't want.
signals: protect cinit from blocked fatal signals Normally SIG_DFL signals to global and container-init are dropped early. But if a signal is blocked when it is posted, we cannot drop the signal since the receiver may install a handler before unblocking the signal. Once this signal is queued however, the receiver container-init has no way of knowing if the signal was sent from an ancestor or descendant namespace. This patch ensures that contianer-init drops all SIG_DFL signals in get_signal_to_deliver() except SIGKILL/SIGSTOP. If SIGSTOP/SIGKILL originate from a descendant of container-init they are never queued (i.e dropped in sig_ignored() in an earler patch). If SIGSTOP/SIGKILL originate from parent namespace, the signal is queued and container-init processes the signal. IOW, if get_signal_to_deliver() sees a sig_kernel_only() signal for global or container-init, the signal must have been generated internally or must have come from an ancestor ns and we process the signal. Further, the signal_group_exit() check was needed to cover the case of a multi-threaded init sending SIGKILL to other threads when doing an exit() or exec(). But since the new sig_kernel_only() check covers the SIGKILL, the signal_group_exit() check is no longer needed and can be removed. Finally, now that we have all pieces in place, set SIGNAL_UNKILLABLE for container-inits. Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com> Cc: Oleg Nesterov <oleg@tv-sign.ru> Cc: Roland McGrath <roland@redhat.com> Cc: "Eric W. Biederman" <ebiederm@xmission.com> Cc: Daniel Lezcano <daniel.lezcano@free.fr> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-04-02 17:58:08 -06:00
* Container-init gets no signals it doesn't want from same
* container.
*
* Note that if global/container-init sees a sig_kernel_only()
* signal here, the signal must have been generated internally
* or must have come from an ancestor namespace. In either
* case, the signal cannot be dropped.
*/
if (unlikely(signal->flags & SIGNAL_UNKILLABLE) &&
signals: protect cinit from blocked fatal signals Normally SIG_DFL signals to global and container-init are dropped early. But if a signal is blocked when it is posted, we cannot drop the signal since the receiver may install a handler before unblocking the signal. Once this signal is queued however, the receiver container-init has no way of knowing if the signal was sent from an ancestor or descendant namespace. This patch ensures that contianer-init drops all SIG_DFL signals in get_signal_to_deliver() except SIGKILL/SIGSTOP. If SIGSTOP/SIGKILL originate from a descendant of container-init they are never queued (i.e dropped in sig_ignored() in an earler patch). If SIGSTOP/SIGKILL originate from parent namespace, the signal is queued and container-init processes the signal. IOW, if get_signal_to_deliver() sees a sig_kernel_only() signal for global or container-init, the signal must have been generated internally or must have come from an ancestor ns and we process the signal. Further, the signal_group_exit() check was needed to cover the case of a multi-threaded init sending SIGKILL to other threads when doing an exit() or exec(). But since the new sig_kernel_only() check covers the SIGKILL, the signal_group_exit() check is no longer needed and can be removed. Finally, now that we have all pieces in place, set SIGNAL_UNKILLABLE for container-inits. Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com> Cc: Oleg Nesterov <oleg@tv-sign.ru> Cc: Roland McGrath <roland@redhat.com> Cc: "Eric W. Biederman" <ebiederm@xmission.com> Cc: Daniel Lezcano <daniel.lezcano@free.fr> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-04-02 17:58:08 -06:00
!sig_kernel_only(signr))
continue;
if (sig_kernel_stop(signr)) {
/*
* The default action is to stop all threads in
* the thread group. The job control signals
* do nothing in an orphaned pgrp, but SIGSTOP
* always works. Note that siglock needs to be
* dropped during the call to is_orphaned_pgrp()
* because of lock ordering with tasklist_lock.
* This allows an intervening SIGCONT to be posted.
* We need to check for that and bail out if necessary.
*/
if (signr != SIGSTOP) {
spin_unlock_irq(&sighand->siglock);
/* signals can be posted during this window */
if (is_current_pgrp_orphaned())
goto relock;
spin_lock_irq(&sighand->siglock);
}
if (likely(do_signal_stop(info->si_signo))) {
/* It released the siglock. */
goto relock;
}
/*
* We didn't actually stop, due to a race
* with SIGCONT or something like that.
*/
continue;
}
spin_unlock_irq(&sighand->siglock);
/*
* Anything else is fatal, maybe with a core dump.
*/
current->flags |= PF_SIGNALED;
if (sig_kernel_coredump(signr)) {
if (print_fatal_signals)
print_fatal_signal(regs, info->si_signo);
/*
* If it was able to dump core, this kills all
* other threads in the group and synchronizes with
* their demise. If we lost the race with another
* thread getting here, it set group_exit_code
* first and our do_group_exit call below will use
* that value and ignore the one we pass it.
*/
do_coredump(info->si_signo, info->si_signo, regs);
}
/*
* Death signals, no core dump.
*/
do_group_exit(info->si_signo);
/* NOTREACHED */
}
spin_unlock_irq(&sighand->siglock);
return signr;
}
/*
* It could be that complete_signal() picked us to notify about the
* group-wide signal. Other threads should be notified now to take
* the shared signals in @which since we will not.
*/
static void retarget_shared_pending(struct task_struct *tsk, sigset_t *which)
{
sigset_t retarget;
struct task_struct *t;
sigandsets(&retarget, &tsk->signal->shared_pending.signal, which);
if (sigisemptyset(&retarget))
return;
t = tsk;
while_each_thread(tsk, t) {
if (t->flags & PF_EXITING)
continue;
if (!has_pending_signals(&retarget, &t->blocked))
continue;
/* Remove the signals this thread can handle. */
sigandsets(&retarget, &retarget, &t->blocked);
if (!signal_pending(t))
signal_wake_up(t, 0);
if (sigisemptyset(&retarget))
break;
}
}
void exit_signals(struct task_struct *tsk)
{
int group_stop = 0;
sigset_t unblocked;
if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) {
tsk->flags |= PF_EXITING;
return;
}
spin_lock_irq(&tsk->sighand->siglock);
/*
* From now this task is not visible for group-wide signals,
* see wants_signal(), do_signal_stop().
*/
tsk->flags |= PF_EXITING;
if (!signal_pending(tsk))
goto out;
unblocked = tsk->blocked;
signotset(&unblocked);
retarget_shared_pending(tsk, &unblocked);
if (unlikely(tsk->jobctl & JOBCTL_STOP_PENDING) &&
signal: Fix premature completion of group stop when interfered by ptrace task->signal->group_stop_count is used to track the progress of group stop. It's initialized to the number of tasks which need to stop for group stop to finish and each stopping or trapping task decrements. However, each task doesn't keep track of whether it decremented the counter or not and if woken up before the group stop is complete and stops again, it can decrement the counter multiple times. Please consider the following example code. static void *worker(void *arg) { while (1) ; return NULL; } int main(void) { pthread_t thread; pid_t pid; int i; pid = fork(); if (!pid) { for (i = 0; i < 5; i++) pthread_create(&thread, NULL, worker, NULL); while (1) ; return 0; } ptrace(PTRACE_ATTACH, pid, NULL, NULL); while (1) { waitid(P_PID, pid, NULL, WSTOPPED); ptrace(PTRACE_SINGLESTEP, pid, NULL, (void *)(long)SIGSTOP); } return 0; } The child creates five threads and the parent continuously traps the first thread and whenever the child gets a signal, SIGSTOP is delivered. If an external process sends SIGSTOP to the child, all other threads in the process should reliably stop. However, due to the above bug, the first thread will often end up consuming group_stop_count multiple times and SIGSTOP often ends up stopping none or part of the other four threads. This patch adds a new field task->group_stop which is protected by siglock and uses GROUP_STOP_CONSUME flag to track which task is still to consume group_stop_count to fix this bug. task_clear_group_stop_pending() and task_participate_group_stop() are added to help manipulating group stop states. As ptrace_stop() now also uses task_participate_group_stop(), it will set SIGNAL_STOP_STOPPED if it completes a group stop. There still are many issues regarding the interaction between group stop and ptrace. Patches to address them will follow. - Oleg spotted duplicate GROUP_STOP_CONSUME. Dropped. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Oleg Nesterov <oleg@redhat.com> Cc: Roland McGrath <roland@redhat.com>
2011-03-23 03:37:00 -06:00
task_participate_group_stop(tsk))
group_stop = CLD_STOPPED;
out:
spin_unlock_irq(&tsk->sighand->siglock);
/*
* If group stop has completed, deliver the notification. This
* should always go to the real parent of the group leader.
*/
if (unlikely(group_stop)) {
read_lock(&tasklist_lock);
do_notify_parent_cldstop(tsk, false, group_stop);
read_unlock(&tasklist_lock);
}
}
EXPORT_SYMBOL(recalc_sigpending);
EXPORT_SYMBOL_GPL(dequeue_signal);
EXPORT_SYMBOL(flush_signals);
EXPORT_SYMBOL(force_sig);
EXPORT_SYMBOL(send_sig);
EXPORT_SYMBOL(send_sig_info);
EXPORT_SYMBOL(sigprocmask);
EXPORT_SYMBOL(block_all_signals);
EXPORT_SYMBOL(unblock_all_signals);
/*
* System call entry points.
*/
/**
* sys_restart_syscall - restart a system call
*/
SYSCALL_DEFINE0(restart_syscall)
{
struct restart_block *restart = &current_thread_info()->restart_block;
return restart->fn(restart);
}
long do_no_restart_syscall(struct restart_block *param)
{
return -EINTR;
}
static void __set_task_blocked(struct task_struct *tsk, const sigset_t *newset)
{
if (signal_pending(tsk) && !thread_group_empty(tsk)) {
sigset_t newblocked;
/* A set of now blocked but previously unblocked signals. */
sigandnsets(&newblocked, newset, &current->blocked);
retarget_shared_pending(tsk, &newblocked);
}
tsk->blocked = *newset;
recalc_sigpending();
}
/**
* set_current_blocked - change current->blocked mask
* @newset: new mask
*
* It is wrong to change ->blocked directly, this helper should be used
* to ensure the process can't miss a shared signal we are going to block.
*/
void set_current_blocked(const sigset_t *newset)
{
struct task_struct *tsk = current;
spin_lock_irq(&tsk->sighand->siglock);
__set_task_blocked(tsk, newset);
spin_unlock_irq(&tsk->sighand->siglock);
}
/*
* This is also useful for kernel threads that want to temporarily
* (or permanently) block certain signals.
*
* NOTE! Unlike the user-mode sys_sigprocmask(), the kernel
* interface happily blocks "unblockable" signals like SIGKILL
* and friends.
*/
int sigprocmask(int how, sigset_t *set, sigset_t *oldset)
{
struct task_struct *tsk = current;
sigset_t newset;
/* Lockless, only current can change ->blocked, never from irq */
if (oldset)
*oldset = tsk->blocked;
switch (how) {
case SIG_BLOCK:
sigorsets(&newset, &tsk->blocked, set);
break;
case SIG_UNBLOCK:
sigandnsets(&newset, &tsk->blocked, set);
break;
case SIG_SETMASK:
newset = *set;
break;
default:
return -EINVAL;
}
set_current_blocked(&newset);
return 0;
}
/**
* sys_rt_sigprocmask - change the list of currently blocked signals
* @how: whether to add, remove, or set signals
* @set: stores pending signals
* @oset: previous value of signal mask if non-null
* @sigsetsize: size of sigset_t type
*/
SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, nset,
sigset_t __user *, oset, size_t, sigsetsize)
{
sigset_t old_set, new_set;
int error;
/* XXX: Don't preclude handling different sized sigset_t's. */
if (sigsetsize != sizeof(sigset_t))
return -EINVAL;
old_set = current->blocked;
if (nset) {
if (copy_from_user(&new_set, nset, sizeof(sigset_t)))
return -EFAULT;
sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP));
error = sigprocmask(how, &new_set, NULL);
if (error)
return error;
}
if (oset) {
if (copy_to_user(oset, &old_set, sizeof(sigset_t)))
return -EFAULT;
}
return 0;
}
long do_sigpending(void __user *set, unsigned long sigsetsize)
{
long error = -EINVAL;
sigset_t pending;
if (sigsetsize > sizeof(sigset_t))
goto out;
spin_lock_irq(&current->sighand->siglock);
sigorsets(&pending, &current->pending.signal,
&current->signal->shared_pending.signal);
spin_unlock_irq(&current->sighand->siglock);
/* Outside the lock because only this thread touches it. */
sigandsets(&pending, &current->blocked, &pending);
error = -EFAULT;
if (!copy_to_user(set, &pending, sigsetsize))
error = 0;
out:
return error;
}
/**
* sys_rt_sigpending - examine a pending signal that has been raised
* while blocked
* @set: stores pending signals
* @sigsetsize: size of sigset_t type or larger
*/
SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, set, size_t, sigsetsize)
{
return do_sigpending(set, sigsetsize);
}
#ifndef HAVE_ARCH_COPY_SIGINFO_TO_USER
int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from)
{
int err;
if (!access_ok (VERIFY_WRITE, to, sizeof(siginfo_t)))
return -EFAULT;
if (from->si_code < 0)
return __copy_to_user(to, from, sizeof(siginfo_t))
? -EFAULT : 0;
/*
* If you change siginfo_t structure, please be sure
* this code is fixed accordingly.
signal/timer/event: signalfd core This patch series implements the new signalfd() system call. I took part of the original Linus code (and you know how badly it can be broken :), and I added even more breakage ;) Signals are fetched from the same signal queue used by the process, so signalfd will compete with standard kernel delivery in dequeue_signal(). If you want to reliably fetch signals on the signalfd file, you need to block them with sigprocmask(SIG_BLOCK). This seems to be working fine on my Dual Opteron machine. I made a quick test program for it: http://www.xmailserver.org/signafd-test.c The signalfd() system call implements signal delivery into a file descriptor receiver. The signalfd file descriptor if created with the following API: int signalfd(int ufd, const sigset_t *mask, size_t masksize); The "ufd" parameter allows to change an existing signalfd sigmask, w/out going to close/create cycle (Linus idea). Use "ufd" == -1 if you want a brand new signalfd file. The "mask" allows to specify the signal mask of signals that we are interested in. The "masksize" parameter is the size of "mask". The signalfd fd supports the poll(2) and read(2) system calls. The poll(2) will return POLLIN when signals are available to be dequeued. As a direct consequence of supporting the Linux poll subsystem, the signalfd fd can use used together with epoll(2) too. The read(2) system call will return a "struct signalfd_siginfo" structure in the userspace supplied buffer. The return value is the number of bytes copied in the supplied buffer, or -1 in case of error. The read(2) call can also return 0, in case the sighand structure to which the signalfd was attached, has been orphaned. The O_NONBLOCK flag is also supported, and read(2) will return -EAGAIN in case no signal is available. If the size of the buffer passed to read(2) is lower than sizeof(struct signalfd_siginfo), -EINVAL is returned. A read from the signalfd can also return -ERESTARTSYS in case a signal hits the process. The format of the struct signalfd_siginfo is, and the valid fields depends of the (->code & __SI_MASK) value, in the same way a struct siginfo would: struct signalfd_siginfo { __u32 signo; /* si_signo */ __s32 err; /* si_errno */ __s32 code; /* si_code */ __u32 pid; /* si_pid */ __u32 uid; /* si_uid */ __s32 fd; /* si_fd */ __u32 tid; /* si_fd */ __u32 band; /* si_band */ __u32 overrun; /* si_overrun */ __u32 trapno; /* si_trapno */ __s32 status; /* si_status */ __s32 svint; /* si_int */ __u64 svptr; /* si_ptr */ __u64 utime; /* si_utime */ __u64 stime; /* si_stime */ __u64 addr; /* si_addr */ }; [akpm@linux-foundation.org: fix signalfd_copyinfo() on i386] Signed-off-by: Davide Libenzi <davidel@xmailserver.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-05-10 23:23:13 -06:00
* Please remember to update the signalfd_copyinfo() function
* inside fs/signalfd.c too, in case siginfo_t changes.
* It should never copy any pad contained in the structure
* to avoid security leaks, but must copy the generic
* 3 ints plus the relevant union member.
*/
err = __put_user(from->si_signo, &to->si_signo);
err |= __put_user(from->si_errno, &to->si_errno);
err |= __put_user((short)from->si_code, &to->si_code);
switch (from->si_code & __SI_MASK) {
case __SI_KILL:
err |= __put_user(from->si_pid, &to->si_pid);
err |= __put_user(from->si_uid, &to->si_uid);
break;
case __SI_TIMER:
err |= __put_user(from->si_tid, &to->si_tid);
err |= __put_user(from->si_overrun, &to->si_overrun);
err |= __put_user(from->si_ptr, &to->si_ptr);
break;
case __SI_POLL:
err |= __put_user(from->si_band, &to->si_band);
err |= __put_user(from->si_fd, &to->si_fd);
break;
case __SI_FAULT:
err |= __put_user(from->si_addr, &to->si_addr);
#ifdef __ARCH_SI_TRAPNO
err |= __put_user(from->si_trapno, &to->si_trapno);
#endif
#ifdef BUS_MCEERR_AO
/*
* Other callers might not initialize the si_lsb field,
* so check explicitly for the right codes here.
*/
if (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO)
err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb);
#endif
break;
case __SI_CHLD:
err |= __put_user(from->si_pid, &to->si_pid);
err |= __put_user(from->si_uid, &to->si_uid);
err |= __put_user(from->si_status, &to->si_status);
err |= __put_user(from->si_utime, &to->si_utime);
err |= __put_user(from->si_stime, &to->si_stime);
break;
case __SI_RT: /* This is not generated by the kernel as of now. */
case __SI_MESGQ: /* But this is */
err |= __put_user(from->si_pid, &to->si_pid);
err |= __put_user(from->si_uid, &to->si_uid);
err |= __put_user(from->si_ptr, &to->si_ptr);
break;
default: /* this is just in case for now ... */
err |= __put_user(from->si_pid, &to->si_pid);
err |= __put_user(from->si_uid, &to->si_uid);
break;
}
return err;
}
#endif
/**
* do_sigtimedwait - wait for queued signals specified in @which
* @which: queued signals to wait for
* @info: if non-null, the signal's siginfo is returned here
* @ts: upper bound on process time suspension
*/
int do_sigtimedwait(const sigset_t *which, siginfo_t *info,
const struct timespec *ts)
{
struct task_struct *tsk = current;
long timeout = MAX_SCHEDULE_TIMEOUT;
sigset_t mask = *which;
int sig;
if (ts) {
if (!timespec_valid(ts))
return -EINVAL;
timeout = timespec_to_jiffies(ts);
/*
* We can be close to the next tick, add another one
* to ensure we will wait at least the time asked for.
*/
if (ts->tv_sec || ts->tv_nsec)
timeout++;
}
/*
* Invert the set of allowed signals to get those we want to block.
*/
sigdelsetmask(&mask, sigmask(SIGKILL) | sigmask(SIGSTOP));
signotset(&mask);
spin_lock_irq(&tsk->sighand->siglock);
sig = dequeue_signal(tsk, &mask, info);
if (!sig && timeout) {
/*
* None ready, temporarily unblock those we're interested
* while we are sleeping in so that we'll be awakened when
* they arrive. Unblocking is always fine, we can avoid
* set_current_blocked().
*/
tsk->real_blocked = tsk->blocked;
sigandsets(&tsk->blocked, &tsk->blocked, &mask);
recalc_sigpending();
spin_unlock_irq(&tsk->sighand->siglock);
timeout = schedule_timeout_interruptible(timeout);
spin_lock_irq(&tsk->sighand->siglock);
__set_task_blocked(tsk, &tsk->real_blocked);
siginitset(&tsk->real_blocked, 0);
sig = dequeue_signal(tsk, &mask, info);
}
spin_unlock_irq(&tsk->sighand->siglock);
if (sig)
return sig;
return timeout ? -EINTR : -EAGAIN;
}
/**
* sys_rt_sigtimedwait - synchronously wait for queued signals specified
* in @uthese
* @uthese: queued signals to wait for
* @uinfo: if non-null, the signal's siginfo is returned here
* @uts: upper bound on process time suspension
* @sigsetsize: size of sigset_t type
*/
SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese,
siginfo_t __user *, uinfo, const struct timespec __user *, uts,
size_t, sigsetsize)
{
sigset_t these;
struct timespec ts;
siginfo_t info;
int ret;
/* XXX: Don't preclude handling different sized sigset_t's. */
if (sigsetsize != sizeof(sigset_t))
return -EINVAL;
if (copy_from_user(&these, uthese, sizeof(these)))
return -EFAULT;
if (uts) {
if (copy_from_user(&ts, uts, sizeof(ts)))
return -EFAULT;
}
ret = do_sigtimedwait(&these, &info, uts ? &ts : NULL);
if (ret > 0 && uinfo) {
if (copy_siginfo_to_user(uinfo, &info))
ret = -EFAULT;
}
return ret;
}
/**
* sys_kill - send a signal to a process
* @pid: the PID of the process
* @sig: signal to be sent
*/
SYSCALL_DEFINE2(kill, pid_t, pid, int, sig)
{
struct siginfo info;
info.si_signo = sig;
info.si_errno = 0;
info.si_code = SI_USER;
info.si_pid = task_tgid_vnr(current);
info.si_uid = current_uid();
return kill_something_info(sig, &info, pid);
}
static int
do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info)
{
struct task_struct *p;
int error = -ESRCH;
rcu_read_lock();
p = find_task_by_vpid(pid);
if (p && (tgid <= 0 || task_tgid_vnr(p) == tgid)) {
error = check_kill_permission(sig, info, p);
/*
* The null signal is a permissions and process existence
* probe. No signal is actually delivered.
*/
if (!error && sig) {
error = do_send_sig_info(sig, info, p, false);
/*
* If lock_task_sighand() failed we pretend the task
* dies after receiving the signal. The window is tiny,
* and the signal is private anyway.
*/
if (unlikely(error == -ESRCH))
error = 0;
}
}
rcu_read_unlock();
return error;
}
static int do_tkill(pid_t tgid, pid_t pid, int sig)
{
struct siginfo info;
info.si_signo = sig;
info.si_errno = 0;
info.si_code = SI_TKILL;
info.si_pid = task_tgid_vnr(current);
info.si_uid = current_uid();
return do_send_specific(tgid, pid, sig, &info);
}
/**
* sys_tgkill - send signal to one specific thread
* @tgid: the thread group ID of the thread
* @pid: the PID of the thread
* @sig: signal to be sent
*
* This syscall also checks the @tgid and returns -ESRCH even if the PID
* exists but it's not belonging to the target process anymore. This
* method solves the problem of threads exiting and PIDs getting reused.
*/
SYSCALL_DEFINE3(tgkill, pid_t, tgid, pid_t, pid, int, sig)
{
/* This is only valid for single tasks */
if (pid <= 0 || tgid <= 0)
return -EINVAL;
return do_tkill(tgid, pid, sig);
}
/**
* sys_tkill - send signal to one specific task
* @pid: the PID of the task
* @sig: signal to be sent
*
* Send a signal to only one task, even if it's a CLONE_THREAD task.
*/
SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig)
{
/* This is only valid for single tasks */
if (pid <= 0)
return -EINVAL;
return do_tkill(0, pid, sig);
}
/**
* sys_rt_sigqueueinfo - send signal information to a signal
* @pid: the PID of the thread
* @sig: signal to be sent
* @uinfo: signal info to be sent
*/
SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig,
siginfo_t __user *, uinfo)
{
siginfo_t info;
if (copy_from_user(&info, uinfo, sizeof(siginfo_t)))
return -EFAULT;
/* Not even root can pretend to send signals from the kernel.
* Nor can they impersonate a kill()/tgkill(), which adds source info.
*/
if (info.si_code >= 0 || info.si_code == SI_TKILL) {
/* We used to allow any < 0 si_code */
WARN_ON_ONCE(info.si_code < 0);
return -EPERM;
}
info.si_signo = sig;
/* POSIX.1b doesn't mention process groups. */
return kill_proc_info(sig, &info, pid);
}
long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info)
{
/* This is only valid for single tasks */
if (pid <= 0 || tgid <= 0)
return -EINVAL;
/* Not even root can pretend to send signals from the kernel.
* Nor can they impersonate a kill()/tgkill(), which adds source info.
*/
if (info->si_code >= 0 || info->si_code == SI_TKILL) {
/* We used to allow any < 0 si_code */
WARN_ON_ONCE(info->si_code < 0);
return -EPERM;
}
info->si_signo = sig;
return do_send_specific(tgid, pid, sig, info);
}
SYSCALL_DEFINE4(rt_tgsigqueueinfo, pid_t, tgid, pid_t, pid, int, sig,
siginfo_t __user *, uinfo)
{
siginfo_t info;
if (copy_from_user(&info, uinfo, sizeof(siginfo_t)))
return -EFAULT;
return do_rt_tgsigqueueinfo(tgid, pid, sig, &info);
}
int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
{
struct task_struct *t = current;
struct k_sigaction *k;
sigset_t mask;
if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig)))
return -EINVAL;
k = &t->sighand->action[sig-1];
spin_lock_irq(&current->sighand->siglock);
if (oact)
*oact = *k;
if (act) {
sigdelsetmask(&act->sa.sa_mask,
sigmask(SIGKILL) | sigmask(SIGSTOP));
*k = *act;
/*
* POSIX 3.3.1.3:
* "Setting a signal action to SIG_IGN for a signal that is
* pending shall cause the pending signal to be discarded,
* whether or not it is blocked."
*
* "Setting a signal action to SIG_DFL for a signal that is
* pending and whose default action is to ignore the signal
* (for example, SIGCHLD), shall cause the pending signal to
* be discarded, whether or not it is blocked"
*/
if (sig_handler_ignored(sig_handler(t, sig), sig)) {
sigemptyset(&mask);
sigaddset(&mask, sig);
rm_from_queue_full(&mask, &t->signal->shared_pending);
do {
rm_from_queue_full(&mask, &t->pending);
t = next_thread(t);
} while (t != current);
}
}
spin_unlock_irq(&current->sighand->siglock);
return 0;
}
int
do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long sp)
{
stack_t oss;
int error;
oss.ss_sp = (void __user *) current->sas_ss_sp;
oss.ss_size = current->sas_ss_size;
oss.ss_flags = sas_ss_flags(sp);
if (uss) {
void __user *ss_sp;
size_t ss_size;
int ss_flags;
error = -EFAULT;
if (!access_ok(VERIFY_READ, uss, sizeof(*uss)))
goto out;
error = __get_user(ss_sp, &uss->ss_sp) |
__get_user(ss_flags, &uss->ss_flags) |
__get_user(ss_size, &uss->ss_size);
if (error)
goto out;
error = -EPERM;
if (on_sig_stack(sp))
goto out;
error = -EINVAL;
/*
* Note - this code used to test ss_flags incorrectly:
* old code may have been written using ss_flags==0
* to mean ss_flags==SS_ONSTACK (as this was the only
* way that worked) - this fix preserves that older
* mechanism.
*/
if (ss_flags != SS_DISABLE && ss_flags != SS_ONSTACK && ss_flags != 0)
goto out;
if (ss_flags == SS_DISABLE) {
ss_size = 0;
ss_sp = NULL;
} else {
error = -ENOMEM;
if (ss_size < MINSIGSTKSZ)
goto out;
}
current->sas_ss_sp = (unsigned long) ss_sp;
current->sas_ss_size = ss_size;
}
error = 0;
if (uoss) {
error = -EFAULT;
if (!access_ok(VERIFY_WRITE, uoss, sizeof(*uoss)))
goto out;
error = __put_user(oss.ss_sp, &uoss->ss_sp) |
__put_user(oss.ss_size, &uoss->ss_size) |
__put_user(oss.ss_flags, &uoss->ss_flags);
}
out:
return error;
}
#ifdef __ARCH_WANT_SYS_SIGPENDING
/**
* sys_sigpending - examine pending signals
* @set: where mask of pending signal is returned
*/
SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set)
{
return do_sigpending(set, sizeof(*set));
}
#endif
#ifdef __ARCH_WANT_SYS_SIGPROCMASK
/**
* sys_sigprocmask - examine and change blocked signals
* @how: whether to add, remove, or set signals
* @nset: signals to add or remove (if non-null)
* @oset: previous value of signal mask if non-null
*
* Some platforms have their own version with special arguments;
* others support only sys_rt_sigprocmask.
*/
SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset,
old_sigset_t __user *, oset)
{
old_sigset_t old_set, new_set;
sigset_t new_blocked;
old_set = current->blocked.sig[0];
if (nset) {
if (copy_from_user(&new_set, nset, sizeof(*nset)))
return -EFAULT;
new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP));
new_blocked = current->blocked;
switch (how) {
case SIG_BLOCK:
sigaddsetmask(&new_blocked, new_set);
break;
case SIG_UNBLOCK:
sigdelsetmask(&new_blocked, new_set);
break;
case SIG_SETMASK:
new_blocked.sig[0] = new_set;
break;
default:
return -EINVAL;
}
set_current_blocked(&new_blocked);
}
if (oset) {
if (copy_to_user(oset, &old_set, sizeof(*oset)))
return -EFAULT;
}
return 0;
}
#endif /* __ARCH_WANT_SYS_SIGPROCMASK */
#ifdef __ARCH_WANT_SYS_RT_SIGACTION
/**
* sys_rt_sigaction - alter an action taken by a process
* @sig: signal to be sent
* @act: new sigaction
* @oact: used to save the previous sigaction
* @sigsetsize: size of sigset_t type
*/
SYSCALL_DEFINE4(rt_sigaction, int, sig,
const struct sigaction __user *, act,
struct sigaction __user *, oact,
size_t, sigsetsize)
{
struct k_sigaction new_sa, old_sa;
int ret = -EINVAL;
/* XXX: Don't preclude handling different sized sigset_t's. */
if (sigsetsize != sizeof(sigset_t))
goto out;
if (act) {
if (copy_from_user(&new_sa.sa, act, sizeof(new_sa.sa)))
return -EFAULT;
}
ret = do_sigaction(sig, act ? &new_sa : NULL, oact ? &old_sa : NULL);
if (!ret && oact) {
if (copy_to_user(oact, &old_sa.sa, sizeof(old_sa.sa)))
return -EFAULT;
}
out:
return ret;
}
#endif /* __ARCH_WANT_SYS_RT_SIGACTION */
#ifdef __ARCH_WANT_SYS_SGETMASK
/*
* For backwards compatibility. Functionality superseded by sigprocmask.
*/
SYSCALL_DEFINE0(sgetmask)
{
/* SMP safe */
return current->blocked.sig[0];
}
SYSCALL_DEFINE1(ssetmask, int, newmask)
{
int old;
spin_lock_irq(&current->sighand->siglock);
old = current->blocked.sig[0];
siginitset(&current->blocked, newmask & ~(sigmask(SIGKILL)|
sigmask(SIGSTOP)));
recalc_sigpending();
spin_unlock_irq(&current->sighand->siglock);
return old;
}
#endif /* __ARCH_WANT_SGETMASK */
#ifdef __ARCH_WANT_SYS_SIGNAL
/*
* For backwards compatibility. Functionality superseded by sigaction.
*/
SYSCALL_DEFINE2(signal, int, sig, __sighandler_t, handler)
{
struct k_sigaction new_sa, old_sa;
int ret;
new_sa.sa.sa_handler = handler;
new_sa.sa.sa_flags = SA_ONESHOT | SA_NOMASK;
sigemptyset(&new_sa.sa.sa_mask);
ret = do_sigaction(sig, &new_sa, &old_sa);
return ret ? ret : (unsigned long)old_sa.sa.sa_handler;
}
#endif /* __ARCH_WANT_SYS_SIGNAL */
#ifdef __ARCH_WANT_SYS_PAUSE
SYSCALL_DEFINE0(pause)
{
while (!signal_pending(current)) {
current->state = TASK_INTERRUPTIBLE;
schedule();
}
return -ERESTARTNOHAND;
}
#endif
#ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND
/**
* sys_rt_sigsuspend - replace the signal mask for a value with the
* @unewset value until a signal is received
* @unewset: new signal mask value
* @sigsetsize: size of sigset_t type
*/
SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize)
{
sigset_t newset;
/* XXX: Don't preclude handling different sized sigset_t's. */
if (sigsetsize != sizeof(sigset_t))
return -EINVAL;
if (copy_from_user(&newset, unewset, sizeof(newset)))
return -EFAULT;
sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP));
spin_lock_irq(&current->sighand->siglock);
current->saved_sigmask = current->blocked;
current->blocked = newset;
recalc_sigpending();
spin_unlock_irq(&current->sighand->siglock);
current->state = TASK_INTERRUPTIBLE;
schedule();
set_restore_sigmask();
return -ERESTARTNOHAND;
}
#endif /* __ARCH_WANT_SYS_RT_SIGSUSPEND */
__attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma)
{
return NULL;
}
void __init signals_init(void)
{
sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC);
}
#ifdef CONFIG_KGDB_KDB
#include <linux/kdb.h>
/*
* kdb_send_sig_info - Allows kdb to send signals without exposing
* signal internals. This function checks if the required locks are
* available before calling the main signal code, to avoid kdb
* deadlocks.
*/
void
kdb_send_sig_info(struct task_struct *t, struct siginfo *info)
{
static struct task_struct *kdb_prev_t;
int sig, new_t;
if (!spin_trylock(&t->sighand->siglock)) {
kdb_printf("Can't do kill command now.\n"
"The sigmask lock is held somewhere else in "
"kernel, try again later\n");
return;
}
spin_unlock(&t->sighand->siglock);
new_t = kdb_prev_t != t;
kdb_prev_t = t;
if (t->state != TASK_RUNNING && new_t) {
kdb_printf("Process is not RUNNING, sending a signal from "
"kdb risks deadlock\n"
"on the run queue locks. "
"The signal has _not_ been sent.\n"
"Reissue the kill command if you want to risk "
"the deadlock.\n");
return;
}
sig = info->si_signo;
if (send_sig_info(sig, info, t))
kdb_printf("Fail to deliver Signal %d to process %d.\n",
sig, t->pid);
else
kdb_printf("Signal %d is sent to process %d.\n", sig, t->pid);
}
#endif /* CONFIG_KGDB_KDB */