diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 8da78595d69d..1f9607ed087c 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -429,6 +429,7 @@ 421 i386 rt_sigtimedwait_time64 sys_rt_sigtimedwait __ia32_compat_sys_rt_sigtimedwait_time64 422 i386 futex_time64 sys_futex __ia32_sys_futex 423 i386 sched_rr_get_interval_time64 sys_sched_rr_get_interval __ia32_sys_sched_rr_get_interval +424 i386 pidfd_send_signal sys_pidfd_send_signal __ia32_sys_pidfd_send_signal 425 i386 io_uring_setup sys_io_uring_setup __ia32_sys_io_uring_setup 426 i386 io_uring_enter sys_io_uring_enter __ia32_sys_io_uring_enter 427 i386 io_uring_register sys_io_uring_register __ia32_sys_io_uring_register diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index c768447f97ec..92ee0b4378d4 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -345,6 +345,7 @@ 334 common rseq __x64_sys_rseq # don't use numbers 387 through 423, add new calls after the last # 'common' entry +424 common pidfd_send_signal __x64_sys_pidfd_send_signal 425 common io_uring_setup __x64_sys_io_uring_setup 426 common io_uring_enter __x64_sys_io_uring_enter 427 common io_uring_register __x64_sys_io_uring_register diff --git a/fs/proc/base.c b/fs/proc/base.c index f5ebdd87afb2..ddef482f1334 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3074,6 +3074,15 @@ static const struct file_operations proc_tgid_base_operations = { .llseek = generic_file_llseek, }; +struct pid *tgid_pidfd_to_pid(const struct file *file) +{ + if (!d_is_dir(file->f_path.dentry) || + (file->f_op != &proc_tgid_base_operations)) + return ERR_PTR(-EBADF); + + return proc_pid(file_inode(file)); +} + static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { return proc_pident_lookup(dir, dentry, diff --git a/include/linux/proc_fs.h 
b/include/linux/proc_fs.h index d0e1f1522a78..52a283ba0465 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -73,6 +73,7 @@ struct proc_dir_entry *proc_create_net_single_write(const char *name, umode_t mo int (*show)(struct seq_file *, void *), proc_write_t write, void *data); +extern struct pid *tgid_pidfd_to_pid(const struct file *file); #else /* CONFIG_PROC_FS */ @@ -114,6 +115,11 @@ static inline int remove_proc_subtree(const char *name, struct proc_dir_entry *p #define proc_create_net(name, mode, parent, state_size, ops) ({NULL;}) #define proc_create_net_single(name, mode, parent, show, data) ({NULL;}) +static inline struct pid *tgid_pidfd_to_pid(const struct file *file) +{ + return ERR_PTR(-EBADF); +} + #endif /* CONFIG_PROC_FS */ struct net; diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index c2962953bf11..e446806a561f 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -985,6 +985,9 @@ asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags, unsigned mask, struct statx __user *buffer); asmlinkage long sys_rseq(struct rseq __user *rseq, uint32_t rseq_len, int flags, uint32_t sig); +asmlinkage long sys_pidfd_send_signal(int pidfd, int sig, + siginfo_t __user *info, + unsigned int flags); /* * Architecture-specific system calls diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index bf4624efe5e6..dee7292e1df6 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -824,6 +824,8 @@ __SYSCALL(__NR_futex_time64, sys_futex) __SYSCALL(__NR_sched_rr_get_interval_time64, sys_sched_rr_get_interval) #endif +#define __NR_pidfd_send_signal 424 +__SYSCALL(__NR_pidfd_send_signal, sys_pidfd_send_signal) #define __NR_io_uring_setup 425 __SYSCALL(__NR_io_uring_setup, sys_io_uring_setup) #define __NR_io_uring_enter 426 diff --git a/kernel/signal.c b/kernel/signal.c index 5d53183e2705..b7953934aa99 100644 --- a/kernel/signal.c +++ 
b/kernel/signal.c @@ -19,7 +19,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -3487,6 +3489,16 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait_time32, compat_sigset_t __user *, uthese, #endif #endif +static inline void prepare_kill_siginfo(int sig, struct kernel_siginfo *info) +{ + clear_siginfo(info); + info->si_signo = sig; + info->si_errno = 0; + info->si_code = SI_USER; + info->si_pid = task_tgid_vnr(current); + info->si_uid = from_kuid_munged(current_user_ns(), current_uid()); +} + /** * sys_kill - send a signal to a process * @pid: the PID of the process @@ -3496,16 +3508,125 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig) { struct kernel_siginfo info; - clear_siginfo(&info); - info.si_signo = sig; - info.si_errno = 0; - info.si_code = SI_USER; - info.si_pid = task_tgid_vnr(current); - info.si_uid = from_kuid_munged(current_user_ns(), current_uid()); + prepare_kill_siginfo(sig, &info); return kill_something_info(sig, &info, pid); } +#ifdef CONFIG_PROC_FS +/* + * Verify that the signaler and signalee either are in the same pid namespace + * or that the signaler's pid namespace is an ancestor of the signalee's pid + * namespace. + */ +static bool access_pidfd_pidns(struct pid *pid) +{ + struct pid_namespace *active = task_active_pid_ns(current); + struct pid_namespace *p = ns_of_pid(pid); + + for (;;) { + if (!p) + return false; + if (p == active) + break; + p = p->parent; + } + + return true; +} + +static int copy_siginfo_from_user_any(kernel_siginfo_t *kinfo, siginfo_t *info) +{ +#ifdef CONFIG_COMPAT + /* + * Avoid hooking up compat syscalls and instead handle necessary + * conversions here. Note, this is a stop-gap measure and should not be + * considered a generic solution. 
+ */ + if (in_compat_syscall()) + return copy_siginfo_from_user32( + kinfo, (struct compat_siginfo __user *)info); +#endif + return copy_siginfo_from_user(kinfo, info); +} + +/** + * sys_pidfd_send_signal - send a signal to a process through a task file + * descriptor + * @pidfd: the file descriptor of the process + * @sig: signal to be sent + * @info: the signal info + * @flags: future flags to be passed + * + * The syscall currently only signals via PIDTYPE_PID which covers + * kill(<positive-pid>, <signal>). It does not signal threads or process + * groups. + * In order to extend the syscall to threads and process groups the @flags + * argument should be used. In essence, the @flags argument will determine + * what is signaled and not the file descriptor itself. Put in other words, + * grouping is a property of the flags argument not a property of the file + * descriptor. + * + * Return: 0 on success, negative errno on failure + */ +SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, + siginfo_t __user *, info, unsigned int, flags) +{ + int ret; + struct fd f; + struct pid *pid; + kernel_siginfo_t kinfo; + + /* Enforce flags be set to 0 until we add an extension. */ + if (flags) + return -EINVAL; + + f = fdget_raw(pidfd); + if (!f.file) + return -EBADF; + + /* Is this a pidfd? */ + pid = tgid_pidfd_to_pid(f.file); + if (IS_ERR(pid)) { + ret = PTR_ERR(pid); + goto err; + } + + ret = -EINVAL; + if (!access_pidfd_pidns(pid)) + goto err; + + if (info) { + ret = copy_siginfo_from_user_any(&kinfo, info); + if (unlikely(ret)) + goto err; + + ret = -EINVAL; + if (unlikely(sig != kinfo.si_signo)) + goto err; + + if ((task_pid(current) != pid) && + (kinfo.si_code >= 0 || kinfo.si_code == SI_TKILL)) { + /* Only allow sending arbitrary signals to yourself. */ + ret = -EPERM; + if (kinfo.si_code != SI_USER) + goto err; + + /* Turn this into a regular kill signal. 
*/ + prepare_kill_siginfo(sig, &kinfo); + } + } else { + prepare_kill_siginfo(sig, &kinfo); + } + + ret = kill_pid_info(sig, &kinfo, pid); + +err: + fdput(f); + return ret; +} +#endif /* CONFIG_PROC_FS */ + static int do_send_specific(pid_t tgid, pid_t pid, int sig, struct kernel_siginfo *info) { diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 51d7c6794bf1..d21f4befaea4 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -168,6 +168,7 @@ COND_SYSCALL(syslog); /* kernel/sched/core.c */ /* kernel/signal.c */ +COND_SYSCALL(pidfd_send_signal); /* kernel/sys.c */ COND_SYSCALL(setregid); diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index fb5758ac469e..971fc8428117 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -32,6 +32,7 @@ TARGETS += net TARGETS += netfilter TARGETS += networking/timestamping TARGETS += nsfs +TARGETS += pidfd TARGETS += powerpc TARGETS += proc TARGETS += pstore diff --git a/tools/testing/selftests/pidfd/Makefile b/tools/testing/selftests/pidfd/Makefile new file mode 100644 index 000000000000..deaf8073bc06 --- /dev/null +++ b/tools/testing/selftests/pidfd/Makefile @@ -0,0 +1,6 @@ +CFLAGS += -g -I../../../../usr/include/ + +TEST_GEN_PROGS := pidfd_test + +include ../lib.mk + diff --git a/tools/testing/selftests/pidfd/pidfd_test.c b/tools/testing/selftests/pidfd/pidfd_test.c new file mode 100644 index 000000000000..d59378a93782 --- /dev/null +++ b/tools/testing/selftests/pidfd/pidfd_test.c @@ -0,0 +1,381 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../kselftest.h" + +static inline int sys_pidfd_send_signal(int pidfd, int sig, siginfo_t *info, + unsigned int flags) +{ + return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags); +} + +static int signal_received; + +static void set_signal_received_on_sigusr1(int sig) 
+{ + if (sig == SIGUSR1) + signal_received = 1; +} + +/* + * Straightforward test to see whether pidfd_send_signal() works is to send + * a signal to ourself. + */ +static int test_pidfd_send_signal_simple_success(void) +{ + int pidfd, ret; + const char *test_name = "pidfd_send_signal send SIGUSR1"; + + pidfd = open("/proc/self", O_DIRECTORY | O_CLOEXEC); + if (pidfd < 0) + ksft_exit_fail_msg( + "%s test: Failed to open process file descriptor\n", + test_name); + + signal(SIGUSR1, set_signal_received_on_sigusr1); + + ret = sys_pidfd_send_signal(pidfd, SIGUSR1, NULL, 0); + close(pidfd); + if (ret < 0) + ksft_exit_fail_msg("%s test: Failed to send signal\n", + test_name); + + if (signal_received != 1) + ksft_exit_fail_msg("%s test: Failed to receive signal\n", + test_name); + + signal_received = 0; + ksft_test_result_pass("%s test: Sent signal\n", test_name); + return 0; +} + +static int wait_for_pid(pid_t pid) +{ + int status, ret; + +again: + ret = waitpid(pid, &status, 0); + if (ret == -1) { + if (errno == EINTR) + goto again; + + return -1; + } + + if (ret != pid) + goto again; + + if (!WIFEXITED(status)) + return -1; + + return WEXITSTATUS(status); +} + +static int test_pidfd_send_signal_exited_fail(void) +{ + int pidfd, ret, saved_errno; + char buf[256]; + pid_t pid; + const char *test_name = "pidfd_send_signal signal exited process"; + + pid = fork(); + if (pid < 0) + ksft_exit_fail_msg("%s test: Failed to create new process\n", + test_name); + + if (pid == 0) + _exit(EXIT_SUCCESS); + + snprintf(buf, sizeof(buf), "/proc/%d", pid); + + pidfd = open(buf, O_DIRECTORY | O_CLOEXEC); + + (void)wait_for_pid(pid); + + if (pidfd < 0) + ksft_exit_fail_msg( + "%s test: Failed to open process file descriptor\n", + test_name); + + ret = sys_pidfd_send_signal(pidfd, 0, NULL, 0); + saved_errno = errno; + close(pidfd); + if (ret == 0) + ksft_exit_fail_msg( + "%s test: Managed to send signal to process even though it should have failed\n", + test_name); + + if (saved_errno != 
ESRCH) + ksft_exit_fail_msg( + "%s test: Expected to receive ESRCH as errno value but received %d instead\n", + test_name, saved_errno); + + ksft_test_result_pass("%s test: Failed to send signal as expected\n", + test_name); + return 0; +} + +/* + * The kernel reserves 300 pids via RESERVED_PIDS in kernel/pid.c. + * That means, when it wraps around any pid < 300 will be skipped. + * So we need to use a pid > 300 in order to test recycling. + */ +#define PID_RECYCLE 1000 + +/* + * Maximum number of cycles we allow. This is equivalent to PID_MAX_DEFAULT. + * If users set a higher limit or we have cycled PIDFD_MAX_DEFAULT number of + * times then we skip the test to not go into an infinite loop or block for a + * long time. + */ +#define PIDFD_MAX_DEFAULT 0x8000 + +/* + * Define a few custom error codes for the child process to clearly indicate + * what is happening. This way we can tell the difference between a system + * error, a test error, etc. + */ +#define PIDFD_PASS 0 +#define PIDFD_FAIL 1 +#define PIDFD_ERROR 2 +#define PIDFD_SKIP 3 +#define PIDFD_XFAIL 4 + +static int test_pidfd_send_signal_recycled_pid_fail(void) +{ + int i, ret; + pid_t pid1; + const char *test_name = "pidfd_send_signal signal recycled pid"; + + ret = unshare(CLONE_NEWPID); + if (ret < 0) + ksft_exit_fail_msg("%s test: Failed to unshare pid namespace\n", + test_name); + + ret = unshare(CLONE_NEWNS); + if (ret < 0) + ksft_exit_fail_msg( + "%s test: Failed to unshare mount namespace\n", + test_name); + + ret = mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0); + if (ret < 0) + ksft_exit_fail_msg("%s test: Failed to remount / private\n", + test_name); + + /* pid 1 in new pid namespace */ + pid1 = fork(); + if (pid1 < 0) + ksft_exit_fail_msg("%s test: Failed to create new process\n", + test_name); + + if (pid1 == 0) { + char buf[256]; + pid_t pid2; + int pidfd = -1; + + (void)umount2("/proc", MNT_DETACH); + ret = mount("proc", "/proc", "proc", 0, NULL); + if (ret < 0) + _exit(PIDFD_ERROR); + + /* 
grab pid PID_RECYCLE */ + for (i = 0; i <= PIDFD_MAX_DEFAULT; i++) { + pid2 = fork(); + if (pid2 < 0) + _exit(PIDFD_ERROR); + + if (pid2 == 0) + _exit(PIDFD_PASS); + + if (pid2 == PID_RECYCLE) { + snprintf(buf, sizeof(buf), "/proc/%d", pid2); + ksft_print_msg("pid to recycle is %d\n", pid2); + pidfd = open(buf, O_DIRECTORY | O_CLOEXEC); + } + + if (wait_for_pid(pid2)) + _exit(PIDFD_ERROR); + + if (pid2 >= PID_RECYCLE) + break; + } + + /* + * We want to be as predictable as we can so if we haven't been + * able to grab pid PID_RECYCLE skip the test. + */ + if (pid2 != PID_RECYCLE) { + /* skip test */ + close(pidfd); + _exit(PIDFD_SKIP); + } + + if (pidfd < 0) + _exit(PIDFD_ERROR); + + for (i = 0; i <= PIDFD_MAX_DEFAULT; i++) { + char c; + int pipe_fds[2]; + pid_t recycled_pid; + int child_ret = PIDFD_PASS; + + ret = pipe2(pipe_fds, O_CLOEXEC); + if (ret < 0) + _exit(PIDFD_ERROR); + + recycled_pid = fork(); + if (recycled_pid < 0) + _exit(PIDFD_ERROR); + + if (recycled_pid == 0) { + close(pipe_fds[1]); + (void)read(pipe_fds[0], &c, 1); + close(pipe_fds[0]); + + _exit(PIDFD_PASS); + } + + /* + * Stop the child so we can inspect whether we have + * recycled pid PID_RECYCLE. + */ + close(pipe_fds[0]); + ret = kill(recycled_pid, SIGSTOP); + close(pipe_fds[1]); + if (ret) { + (void)wait_for_pid(recycled_pid); + _exit(PIDFD_ERROR); + } + + /* + * We have recycled the pid. Try to signal it. This + * needs to fail since this is a different process than + * the one the pidfd refers to. 
+ */ + if (recycled_pid == PID_RECYCLE) { + ret = sys_pidfd_send_signal(pidfd, SIGCONT, + NULL, 0); + if (ret && errno == ESRCH) + child_ret = PIDFD_XFAIL; + else + child_ret = PIDFD_FAIL; + } + + /* let the process move on */ + ret = kill(recycled_pid, SIGCONT); + if (ret) + (void)kill(recycled_pid, SIGKILL); + + if (wait_for_pid(recycled_pid)) + _exit(PIDFD_ERROR); + + switch (child_ret) { + case PIDFD_FAIL: + /* fallthrough */ + case PIDFD_XFAIL: + _exit(child_ret); + case PIDFD_PASS: + break; + default: + /* not reached */ + _exit(PIDFD_ERROR); + } + + /* + * If the user set a custom pid_max limit we could be + * in the millions. + * Skip the test in this case. + */ + if (recycled_pid > PIDFD_MAX_DEFAULT) + _exit(PIDFD_SKIP); + } + + /* failed to recycle pid */ + _exit(PIDFD_SKIP); + } + + ret = wait_for_pid(pid1); + switch (ret) { + case PIDFD_FAIL: + ksft_exit_fail_msg( + "%s test: Managed to signal recycled pid %d\n", + test_name, PID_RECYCLE); + case PIDFD_PASS: + ksft_exit_fail_msg("%s test: Failed to recycle pid %d\n", + test_name, PID_RECYCLE); + case PIDFD_SKIP: + ksft_print_msg("%s test: Skipping test\n", test_name); + ret = 0; + break; + case PIDFD_XFAIL: + ksft_test_result_pass( + "%s test: Failed to signal recycled pid as expected\n", + test_name); + ret = 0; + break; + default /* PIDFD_ERROR */: + ksft_exit_fail_msg("%s test: Error while running tests\n", + test_name); + } + + return ret; +} + +static int test_pidfd_send_signal_syscall_support(void) +{ + int pidfd, ret; + const char *test_name = "pidfd_send_signal check for support"; + + pidfd = open("/proc/self", O_DIRECTORY | O_CLOEXEC); + if (pidfd < 0) + ksft_exit_fail_msg( + "%s test: Failed to open process file descriptor\n", + test_name); + + ret = sys_pidfd_send_signal(pidfd, 0, NULL, 0); + if (ret < 0) { + /* + * pidfd_send_signal() will currently return ENOSYS when + * CONFIG_PROC_FS is not set. 
+ */ + if (errno == ENOSYS) + ksft_exit_skip( + "%s test: pidfd_send_signal() syscall not supported (Ensure that CONFIG_PROC_FS=y is set)\n", + test_name); + + ksft_exit_fail_msg("%s test: Failed to send signal\n", + test_name); + } + + close(pidfd); + ksft_test_result_pass( + "%s test: pidfd_send_signal() syscall is supported. Tests can be executed\n", + test_name); + return 0; +} + +int main(int argc, char **argv) +{ + ksft_print_header(); + + test_pidfd_send_signal_syscall_support(); + test_pidfd_send_signal_simple_success(); + test_pidfd_send_signal_exited_fail(); + test_pidfd_send_signal_recycled_pid_fail(); + + return ksft_exit_pass(); +}