Skip to content

Commit

Permalink
oom_kill.c: futex: delay the OOM reaper to allow time for proper fute…
Browse files Browse the repository at this point in the history
…x cleanup

commit e4a3840 upstream.

The pthread struct is allocated on PRIVATE|ANONYMOUS memory [1] which
can be targeted by the oom reaper.  This mapping is used to store the
futex robust list head; the kernel does not keep a copy of the robust
list and instead references a userspace address to maintain the
robustness during a process death.

A race can occur between exit_mm and the oom reaper that allows the oom
reaper to free the memory of the futex robust list before the exit path
has handled the futex death:

    CPU1                               CPU2
    --------------------------------------------------------------------
    page_fault
    do_exit "signal"
    wake_oom_reaper
                                        oom_reaper
                                        oom_reap_task_mm (invalidates mm)
    exit_mm
    exit_mm_release
    futex_exit_release
    futex_cleanup
    exit_robust_list
    get_user (EFAULT- can't access memory)

If the get_user EFAULT's, the kernel will be unable to recover the
waiters on the robust_list, leaving userspace mutexes hung indefinitely.

Delay the OOM reaper, allowing more time for the exit path to perform
the futex cleanup.

Reproducer: https://gitlab.com/jsavitz/oom_futex_reproducer

Based on a patch by Michal Hocko.

Link: https://elixir.bootlin.com/glibc/glibc-2.35/source/nptl/allocatestack.c#L370 [1]
Link: https://lkml.kernel.org/r/[email protected]
Fixes: 2129258 ("mm: oom: let oom_reap_task and exit_mmap run concurrently")
Signed-off-by: Joel Savitz <[email protected]>
Signed-off-by: Nico Pache <[email protected]>
Co-developed-by: Joel Savitz <[email protected]>
Suggested-by: Thomas Gleixner <[email protected]>
Acked-by: Thomas Gleixner <[email protected]>
Acked-by: Michal Hocko <[email protected]>
Cc: Rafael Aquini <[email protected]>
Cc: Waiman Long <[email protected]>
Cc: Herton R. Krzesinski <[email protected]>
Cc: Juri Lelli <[email protected]>
Cc: Vincent Guittot <[email protected]>
Cc: Dietmar Eggemann <[email protected]>
Cc: Steven Rostedt <[email protected]>
Cc: Ben Segall <[email protected]>
Cc: Mel Gorman <[email protected]>
Cc: Daniel Bristot de Oliveira <[email protected]>
Cc: David Rientjes <[email protected]>
Cc: Andrea Arcangeli <[email protected]>
Cc: Davidlohr Bueso <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Joel Savitz <[email protected]>
Cc: Darren Hart <[email protected]>
Cc: <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
Signed-off-by: Greg Kroah-Hartman <[email protected]>
  • Loading branch information
Nico Pache authored and Sasha Levin committed Apr 25, 2022
1 parent 0d30029 commit b6cb2ae
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 14 deletions.
1 change: 1 addition & 0 deletions include/linux/sched.h
Original file line number Diff line number Diff line change
Expand Up @@ -1325,6 +1325,7 @@ struct task_struct {
int pagefault_disabled;
#ifdef CONFIG_MMU
struct task_struct *oom_reaper_list;
struct timer_list oom_reaper_timer;
#endif
#ifdef CONFIG_VMAP_STACK
struct vm_struct *stack_vm_area;
Expand Down
54 changes: 40 additions & 14 deletions mm/oom_kill.c
Original file line number Diff line number Diff line change
Expand Up @@ -633,7 +633,7 @@ static void oom_reap_task(struct task_struct *tsk)
*/
set_bit(MMF_OOM_SKIP, &mm->flags);

/* Drop a reference taken by wake_oom_reaper */
/* Drop a reference taken by queue_oom_reaper */
put_task_struct(tsk);
}

Expand All @@ -643,12 +643,12 @@ static int oom_reaper(void *unused)
struct task_struct *tsk = NULL;

wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL);
spin_lock(&oom_reaper_lock);
spin_lock_irq(&oom_reaper_lock);
if (oom_reaper_list != NULL) {
tsk = oom_reaper_list;
oom_reaper_list = tsk->oom_reaper_list;
}
spin_unlock(&oom_reaper_lock);
spin_unlock_irq(&oom_reaper_lock);

if (tsk)
oom_reap_task(tsk);
Expand All @@ -657,30 +657,56 @@ static int oom_reaper(void *unused)
return 0;
}

static void wake_oom_reaper(struct task_struct *tsk)
static void wake_oom_reaper(struct timer_list *timer)
{
/* mm is already queued? */
if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags))
return;
struct task_struct *tsk = container_of(timer, struct task_struct,
oom_reaper_timer);
struct mm_struct *mm = tsk->signal->oom_mm;
unsigned long flags;

get_task_struct(tsk);
/* The victim managed to terminate on its own - see exit_mmap */
if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
put_task_struct(tsk);
return;
}

spin_lock(&oom_reaper_lock);
spin_lock_irqsave(&oom_reaper_lock, flags);
tsk->oom_reaper_list = oom_reaper_list;
oom_reaper_list = tsk;
spin_unlock(&oom_reaper_lock);
spin_unlock_irqrestore(&oom_reaper_lock, flags);
trace_wake_reaper(tsk->pid);
wake_up(&oom_reaper_wait);
}

/*
* Give the OOM victim time to exit naturally before invoking the oom_reaping.
* The timers timeout is arbitrary... the longer it is, the longer the worst
* case scenario for the OOM can take. If it is too small, the oom_reaper can
* get in the way and release resources needed by the process exit path.
* e.g. The futex robust list can sit in Anon|Private memory that gets reaped
* before the exit path is able to wake the futex waiters.
*/
#define OOM_REAPER_DELAY (2*HZ)
static void queue_oom_reaper(struct task_struct *tsk)
{
/* mm is already queued? */
if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags))
return;

get_task_struct(tsk);
timer_setup(&tsk->oom_reaper_timer, wake_oom_reaper, 0);
tsk->oom_reaper_timer.expires = jiffies + OOM_REAPER_DELAY;
add_timer(&tsk->oom_reaper_timer);
}

static int __init oom_init(void)
{
oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
return 0;
}
subsys_initcall(oom_init)
#else
static inline void wake_oom_reaper(struct task_struct *tsk)
static inline void queue_oom_reaper(struct task_struct *tsk)
{
}
#endif /* CONFIG_MMU */
Expand Down Expand Up @@ -931,7 +957,7 @@ static void __oom_kill_process(struct task_struct *victim, const char *message)
rcu_read_unlock();

if (can_oom_reap)
wake_oom_reaper(victim);
queue_oom_reaper(victim);

mmdrop(mm);
put_task_struct(victim);
Expand Down Expand Up @@ -967,7 +993,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
task_lock(victim);
if (task_will_free_mem(victim)) {
mark_oom_victim(victim);
wake_oom_reaper(victim);
queue_oom_reaper(victim);
task_unlock(victim);
put_task_struct(victim);
return;
Expand Down Expand Up @@ -1065,7 +1091,7 @@ bool out_of_memory(struct oom_control *oc)
*/
if (task_will_free_mem(current)) {
mark_oom_victim(current);
wake_oom_reaper(current);
queue_oom_reaper(current);
return true;
}

Expand Down

0 comments on commit b6cb2ae

Please sign in to comment.