Subject: ptrace: fix ptrace vs tasklist_lock race
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Thu Aug 29 18:21:04 2013 +0200

From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>

As explained by Alexander Fyodorov <halcy@yandex.ru>:

|read_lock(&tasklist_lock) in ptrace_stop() is converted to mutex on RT kernel,
|and it can remove __TASK_TRACED from task->state (by moving it to
|task->saved_state). If parent does wait() on child followed by a sys_ptrace
|call, the following race can happen:
|
|- child sets __TASK_TRACED in ptrace_stop()
|- parent does wait() which eventually calls wait_task_stopped() and returns
|  child's pid
|- child blocks on read_lock(&tasklist_lock) in ptrace_stop() and moves
|  __TASK_TRACED flag to saved_state
|- parent calls sys_ptrace, which calls ptrace_check_attach() and wait_task_inactive()

The patch is based on his initial patch, adding a check for the case where
__TASK_TRACED has been moved to ->saved_state. The pi_lock is taken so that
the state cannot move between ->state and ->saved_state while the caller is
looking at them.

[ Fix for ptrace_unfreeze_traced() by Oleg Nesterov ]

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/sched.h |   79 +++++++++++++++++++++++++++++++++++++++++++++++---
 kernel/ptrace.c       |   38 +++++++++++++++++++-----
 kernel/sched/core.c   |    4 +-
 3 files changed, 108 insertions(+), 13 deletions(-)

--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@ include/linux/sched.h:121 @ struct task_group;
 
 #define task_is_running(task)		(READ_ONCE((task)->__state) == TASK_RUNNING)
 
-#define task_is_traced(task)		((READ_ONCE(task->__state) & __TASK_TRACED) != 0)
-
 #define task_is_stopped(task)		((READ_ONCE(task->__state) & __TASK_STOPPED) != 0)
 
-#define task_is_stopped_or_traced(task)	((READ_ONCE(task->__state) & (__TASK_STOPPED | __TASK_TRACED)) != 0)
-
 /*
  * Special states are those that do not use the normal wait-loop pattern. See
  * the comment with set_special_state().
@ include/linux/sched.h:2014 @ static inline int test_tsk_need_resched(
 	return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED));
 }
 
+#ifdef CONFIG_PREEMPT_RT
+static inline bool task_match_saved_state(struct task_struct *p, long match_state)
+{
+	return p->saved_state == match_state;
+}
+
+static inline bool task_is_traced(struct task_struct *task)
+{
+	bool traced = false;
+
+	/* in case the task is sleeping on tasklist_lock */
+	raw_spin_lock_irq(&task->pi_lock);
+	if (READ_ONCE(task->__state) & __TASK_TRACED)
+		traced = true;
+	else if (task->saved_state & __TASK_TRACED)
+		traced = true;
+	raw_spin_unlock_irq(&task->pi_lock);
+	return traced;
+}
+
+static inline bool task_is_stopped_or_traced(struct task_struct *task)
+{
+	bool traced_stopped = false;
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&task->pi_lock, flags);
+
+	if (READ_ONCE(task->__state) & (__TASK_STOPPED | __TASK_TRACED))
+		traced_stopped = true;
+	else if (task->saved_state & (__TASK_STOPPED | __TASK_TRACED))
+		traced_stopped = true;
+
+	raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+	return traced_stopped;
+}
+
+#else
+
+static inline bool task_match_saved_state(struct task_struct *p, long match_state)
+{
+	return false;
+}
+
+static inline bool task_is_traced(struct task_struct *task)
+{
+	return READ_ONCE(task->__state) & __TASK_TRACED;
+}
+
+static inline bool task_is_stopped_or_traced(struct task_struct *task)
+{
+	return READ_ONCE(task->__state) & (__TASK_STOPPED | __TASK_TRACED);
+}
+#endif
+
+static inline bool task_match_state_or_saved(struct task_struct *p,
+					     long match_state)
+{
+	if (READ_ONCE(p->__state) == match_state)
+		return true;
+
+	return task_match_saved_state(p, match_state);
+}
+
+static inline bool task_match_state_lock(struct task_struct *p,
+					 long match_state)
+{
+	bool match;
+
+	raw_spin_lock_irq(&p->pi_lock);
+	match = task_match_state_or_saved(p, match_state);
+	raw_spin_unlock_irq(&p->pi_lock);
+
+	return match;
+}
+
 /*
  * cond_resched() and cond_resched_lock(): latency reduction via
  * explicit rescheduling in places that are safe. The return
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@ kernel/ptrace.c:200 @ static bool ptrace_freeze_traced(struct
 	spin_lock_irq(&task->sighand->siglock);
 	if (task_is_traced(task) && !looks_like_a_spurious_pid(task) &&
 	    !__fatal_signal_pending(task)) {
+#ifdef CONFIG_PREEMPT_RT
+		unsigned long flags;
+
+		raw_spin_lock_irqsave(&task->pi_lock, flags);
+		if (READ_ONCE(task->__state) & __TASK_TRACED)
+			WRITE_ONCE(task->__state, __TASK_TRACED);
+		else
+			task->saved_state = __TASK_TRACED;
+		raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+#else
 		WRITE_ONCE(task->__state, __TASK_TRACED);
+#endif
 		ret = true;
 	}
 	spin_unlock_irq(&task->sighand->siglock);
@ kernel/ptrace.c:221 @ static bool ptrace_freeze_traced(struct
 
 static void ptrace_unfreeze_traced(struct task_struct *task)
 {
-	if (READ_ONCE(task->__state) != __TASK_TRACED)
+	unsigned long flags;
+	bool frozen = true;
+
+	if (!IS_ENABLED(CONFIG_PREEMPT_RT) &&
+	    READ_ONCE(task->__state) != __TASK_TRACED)
 		return;
 
 	WARN_ON(!task->ptrace || task->parent != current);
@ kernel/ptrace.c:235 @ static void ptrace_unfreeze_traced(struc
 	 * Recheck state under the lock to close this race.
 	 */
 	spin_lock_irq(&task->sighand->siglock);
-	if (READ_ONCE(task->__state) == __TASK_TRACED) {
-		if (__fatal_signal_pending(task))
-			wake_up_state(task, __TASK_TRACED);
-		else
-			WRITE_ONCE(task->__state, TASK_TRACED);
-	}
+	raw_spin_lock_irqsave(&task->pi_lock, flags);
+	if (READ_ONCE(task->__state) == __TASK_TRACED)
+		WRITE_ONCE(task->__state, TASK_TRACED);
+
+#ifdef CONFIG_PREEMPT_RT
+	else if (task->saved_state == __TASK_TRACED)
+		task->saved_state = TASK_TRACED;
+#endif
+	else
+		frozen = false;
+	raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+
+	if (frozen && __fatal_signal_pending(task))
+		wake_up_state(task, __TASK_TRACED);
+
 	spin_unlock_irq(&task->sighand->siglock);
 }
 
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@ kernel/sched/core.c:3210 @ unsigned long wait_task_inactive(struct
 		 * is actually now running somewhere else!
 		 */
 		while (task_running(rq, p)) {
-			if (match_state && unlikely(READ_ONCE(p->__state) != match_state))
+			if (match_state && !task_match_state_lock(p, match_state))
 				return 0;
 			cpu_relax();
 		}
@ kernel/sched/core.c:3225 @ unsigned long wait_task_inactive(struct
 		running = task_running(rq, p);
 		queued = task_on_rq_queued(p);
 		ncsw = 0;
-		if (!match_state || READ_ONCE(p->__state) == match_state)
+		if (!match_state || task_match_state_or_saved(p, match_state))
 			ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
 		task_rq_unlock(rq, p, &rf);
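
Illustration (not part of the patch): the core idea above is that on
PREEMPT_RT a task blocked on a converted sleeping lock parks its real state
in ->saved_state, so anyone checking for __TASK_TRACED has to look at both
->__state and ->saved_state under ->pi_lock. The stand-alone user-space C
sketch below models that pattern with made-up names and values
(struct task_model, MODEL_TASK_TRACED, a pthread mutex standing in for the
raw pi_lock); it is a simplified model for the changelog race, not kernel
code.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define MODEL_TASK_RUNNING	0x0
#define MODEL_TASK_TRACED	0x8	/* stands in for __TASK_TRACED */

struct task_model {
	pthread_mutex_t pi_lock;	/* stands in for task->pi_lock */
	unsigned int state;		/* stands in for task->__state */
	unsigned int saved_state;	/* only meaningful on "RT" */
};

/* What ptrace_stop() does: mark the child as traced. */
static void model_set_traced(struct task_model *t)
{
	pthread_mutex_lock(&t->pi_lock);
	t->state = MODEL_TASK_TRACED;
	pthread_mutex_unlock(&t->pi_lock);
}

/*
 * What blocking on the RT-converted tasklist_lock does: the real state is
 * parked in ->saved_state while the task sleeps on the lock.
 */
static void model_block_on_rt_lock(struct task_model *t)
{
	pthread_mutex_lock(&t->pi_lock);
	t->saved_state = t->state;
	t->state = MODEL_TASK_RUNNING;	/* really an uninterruptible sleep on the lock */
	pthread_mutex_unlock(&t->pi_lock);
}

/* Naive check, like the old task_is_traced() macro: never sees saved_state. */
static bool naive_is_traced(struct task_model *t)
{
	return t->state & MODEL_TASK_TRACED;
}

/* RT-aware check, mirroring the helper added by the patch. */
static bool rt_is_traced(struct task_model *t)
{
	bool traced;

	pthread_mutex_lock(&t->pi_lock);
	traced = (t->state & MODEL_TASK_TRACED) ||
		 (t->saved_state & MODEL_TASK_TRACED);
	pthread_mutex_unlock(&t->pi_lock);
	return traced;
}

int main(void)
{
	struct task_model child = {
		.pi_lock = PTHREAD_MUTEX_INITIALIZER,
		.state = MODEL_TASK_RUNNING,
		.saved_state = MODEL_TASK_RUNNING,
	};

	model_set_traced(&child);	/* child enters ptrace_stop() */
	model_block_on_rt_lock(&child);	/* child blocks on tasklist_lock */

	/* The parent's view after wait() has returned the child's pid: */
	printf("naive check sees traced:    %d\n", naive_is_traced(&child));
	printf("RT-aware check sees traced: %d\n", rt_is_traced(&child));
	return 0;
}

Compiled with "gcc -pthread", the naive check reports 0 while the
pi_lock-protected check still reports 1; that is the window the patch closes
for ptrace_check_attach()/wait_task_inactive().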