Merge "mm: Fix sleeping while atomic during speculative page fault"
qctecmdr authored and Gerrit - the friendly Code Review server committed Jun 26, 2020
2 parents 1f2a0ad + 8fbac43 commit 6e625d3
Showing 31 changed files with 1,182 additions and 210 deletions.
1 change: 1 addition & 0 deletions arch/arm64/Kconfig
@@ -189,6 +189,7 @@ config ARM64
 	select SWIOTLB
 	select SYSCTL_EXCEPTION_TRACE
 	select THREAD_INFO_IN_TASK
+	select ARCH_SUPPORTS_SPECULATIVE_PAGE_FAULT
 	help
 	  ARM 64-bit (AArch64) Linux support.
 
26 changes: 23 additions & 3 deletions arch/arm64/mm/fault.c
@@ -406,10 +406,9 @@ static void do_bad_area(unsigned long addr, unsigned int esr, struct pt_regs *re
 #define VM_FAULT_BADMAP		0x010000
 #define VM_FAULT_BADACCESS	0x020000
 
-static vm_fault_t __do_page_fault(struct mm_struct *mm, unsigned long addr,
+static int __do_page_fault(struct vm_area_struct *vma, unsigned long addr,
			   unsigned int mm_flags, unsigned long vm_flags)
 {
-	struct vm_area_struct *vma = find_vma(mm, addr);
 
 	if (unlikely(!vma))
 		return VM_FAULT_BADMAP;
@@ -456,6 +455,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
 	vm_fault_t fault, major = 0;
 	unsigned long vm_flags = VM_READ | VM_WRITE | VM_EXEC;
 	unsigned int mm_flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
+	struct vm_area_struct *vma = NULL;
 
 	if (kprobe_page_fault(regs, esr))
 		return 0;
@@ -495,6 +495,14 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
 
 	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr);
 
+	/*
+	 * let's try a speculative page fault without grabbing the
+	 * mmap_sem.
+	 */
+	fault = handle_speculative_fault(mm, addr, mm_flags, &vma);
+	if (fault != VM_FAULT_RETRY)
+		goto done;
+
 	/*
 	 * As per x86, we may deadlock here. However, since the kernel only
 	 * validly references user space from well defined areas of the code,
@@ -519,7 +527,10 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
 #endif
 	}
 
-	fault = __do_page_fault(mm, addr, mm_flags, vm_flags);
+	if (!vma || !can_reuse_spf_vma(vma, addr))
+		vma = find_vma(mm, addr);
+
+	fault = __do_page_fault(vma, addr, mm_flags, vm_flags);
 	major |= fault & VM_FAULT_MAJOR;
 
 	if (fault & VM_FAULT_RETRY) {
@@ -542,11 +553,20 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
 		if (mm_flags & FAULT_FLAG_ALLOW_RETRY) {
 			mm_flags &= ~FAULT_FLAG_ALLOW_RETRY;
 			mm_flags |= FAULT_FLAG_TRIED;
+
+			/*
+			 * Do not try to reuse this vma and fetch it
+			 * again since we will release the mmap_sem.
+			 */
+			vma = NULL;
+
 			goto retry;
 		}
 	}
 	up_read(&mm->mmap_sem);
 
+done:
+
 	/*
 	 * Handle the "normal" (no error) case first.
 	 */
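Taken together, the fault.c hunks make the arm64 fault handler attempt a lock-free fault first and fall back to the classical mmap_sem path only when the speculative attempt returns VM_FAULT_RETRY. Below is a condensed, illustrative sketch of the resulting do_page_fault() flow; the trylock/exception-table handling, accounting and signal checks of the real function are omitted:

	struct vm_area_struct *vma = NULL;
	vm_fault_t fault;

	/* First attempt: find and handle the fault without taking mmap_sem. */
	fault = handle_speculative_fault(mm, addr, mm_flags, &vma);
	if (fault != VM_FAULT_RETRY)
		goto done;			/* handled speculatively */

retry:
	down_read(&mm->mmap_sem);

	/* Reuse the speculatively found VMA only if it is still valid. */
	if (!vma || !can_reuse_spf_vma(vma, addr))
		vma = find_vma(mm, addr);

	fault = __do_page_fault(vma, addr, mm_flags, vm_flags);

	if ((fault & VM_FAULT_RETRY) && (mm_flags & FAULT_FLAG_ALLOW_RETRY)) {
		mm_flags &= ~FAULT_FLAG_ALLOW_RETRY;
		mm_flags |= FAULT_FLAG_TRIED;
		vma = NULL;	/* mmap_sem was dropped, so refetch the VMA */
		goto retry;
	}
	up_read(&mm->mmap_sem);

done:
	/* ... normal completion and error handling, unchanged ... */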
5 changes: 4 additions & 1 deletion fs/proc/task_mmu.c
@@ -1277,8 +1277,11 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
 				goto out_mm;
 			}
 			for (vma = mm->mmap; vma; vma = vma->vm_next) {
-				vma->vm_flags &= ~VM_SOFTDIRTY;
+				vm_write_begin(vma);
+				WRITE_ONCE(vma->vm_flags,
+					   vma->vm_flags & ~VM_SOFTDIRTY);
 				vma_set_page_prot(vma);
+				vm_write_end(vma);
 			}
 			downgrade_write(&mm->mmap_sem);
 			break;
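The clear_refs change above is an instance of the convention this series applies wherever a field that speculative page faults read is modified: the update is bracketed by vm_write_begin()/vm_write_end() and the store itself goes through WRITE_ONCE(). The fs/userfaultfd.c hunks below follow the same pattern. A minimal sketch of the before/after shape of such an update (new_flags is just a placeholder for whatever value is being written):

	/* Before: a plain store; readers are assumed to hold mmap_sem. */
	vma->vm_flags = new_flags;

	/*
	 * After: bump vma->vm_sequence around the store so a concurrent
	 * speculative fault either observes a consistent VMA or notices
	 * the seqcount change and falls back to the locked slow path.
	 */
	vm_write_begin(vma);
	WRITE_ONCE(vma->vm_flags, new_flags);
	vm_write_end(vma);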
17 changes: 13 additions & 4 deletions fs/userfaultfd.c
@@ -675,8 +675,11 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
 
 	octx = vma->vm_userfaultfd_ctx.ctx;
 	if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) {
+		vm_write_begin(vma);
 		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
-		vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING);
+		WRITE_ONCE(vma->vm_flags,
+			   vma->vm_flags & ~(VM_UFFD_WP | VM_UFFD_MISSING));
+		vm_write_end(vma);
 		return 0;
 	}
 
@@ -919,8 +922,10 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
 		else
 			prev = vma;
 		}
-		vma->vm_flags = new_flags;
+		vm_write_begin(vma);
+		WRITE_ONCE(vma->vm_flags, new_flags);
 		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+		vm_write_end(vma);
 	}
 	up_write(&mm->mmap_sem);
 	mmput(mm);
@@ -1487,8 +1492,10 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 		 * the next vma was merged into the current one and
 		 * the current one has not been updated yet.
 		 */
-		vma->vm_flags = new_flags;
+		vm_write_begin(vma);
+		WRITE_ONCE(vma->vm_flags, new_flags);
 		vma->vm_userfaultfd_ctx.ctx = ctx;
+		vm_write_end(vma);
 
 	skip:
 		prev = vma;
@@ -1650,8 +1657,10 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
 		 * the next vma was merged into the current one and
 		 * the current one has not been updated yet.
 		 */
-		vma->vm_flags = new_flags;
+		vm_write_begin(vma);
+		WRITE_ONCE(vma->vm_flags, new_flags);
 		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+		vm_write_end(vma);
 
 	skip:
 		prev = vma;
2 changes: 1 addition & 1 deletion include/linux/hugetlb_inline.h
@@ -8,7 +8,7 @@
 
 static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma)
 {
-	return !!(vma->vm_flags & VM_HUGETLB);
+	return !!(READ_ONCE(vma->vm_flags) & VM_HUGETLB);
 }
 
 #else
4 changes: 2 additions & 2 deletions include/linux/migrate.h
@@ -124,14 +124,14 @@ static inline void __ClearPageMovable(struct page *page)
 #ifdef CONFIG_NUMA_BALANCING
 extern bool pmd_trans_migrating(pmd_t pmd);
 extern int migrate_misplaced_page(struct page *page,
-				  struct vm_area_struct *vma, int node);
+				  struct vm_fault *vmf, int node);
 #else
 static inline bool pmd_trans_migrating(pmd_t pmd)
 {
 	return false;
 }
 static inline int migrate_misplaced_page(struct page *page,
-					 struct vm_area_struct *vma, int node)
+					 struct vm_fault *vmf, int node)
 {
 	return -EAGAIN; /* can't migrate now */
 }
132 changes: 122 additions & 10 deletions include/linux/mm.h
@@ -393,6 +393,8 @@ extern pgprot_t protection_map[16];
 #define FAULT_FLAG_REMOTE	0x80	/* faulting for non current tsk/mm */
 #define FAULT_FLAG_INSTRUCTION	0x100	/* The fault was during an instruction fetch */
 #define FAULT_FLAG_PREFAULT_OLD	0x400	/* Make faultaround ptes old */
+/* Speculative fault, not holding mmap_sem */
+#define FAULT_FLAG_SPECULATIVE	0x200
 
 #define FAULT_FLAG_TRACE \
 	{ FAULT_FLAG_WRITE,		"WRITE" }, \
@@ -421,6 +423,10 @@ struct vm_fault {
 	gfp_t gfp_mask;			/* gfp mask to be used for allocations */
 	pgoff_t pgoff;			/* Logical page offset based on vma */
 	unsigned long address;		/* Faulting virtual address */
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+	unsigned int sequence;
+	pmd_t orig_pmd;			/* value of PMD at the time of fault */
+#endif
 	pmd_t *pmd;			/* Pointer to pmd entry matching
 					 * the 'address' */
 	pud_t *pud;			/* Pointer to pud entry matching
@@ -529,14 +535,23 @@ struct vm_operations_struct {
 					unsigned long addr);
 };
 
+static inline void INIT_VMA(struct vm_area_struct *vma)
+{
+	INIT_LIST_HEAD(&vma->anon_vma_chain);
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+	seqcount_init(&vma->vm_sequence);
+	atomic_set(&vma->vm_ref_count, 1);
+#endif
+}
+
 static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
 {
 	static const struct vm_operations_struct dummy_vm_ops = {};
 
 	memset(vma, 0, sizeof(*vma));
 	vma->vm_mm = mm;
 	vma->vm_ops = &dummy_vm_ops;
-	INIT_LIST_HEAD(&vma->anon_vma_chain);
+	INIT_VMA(vma);
 }
 
 static inline void vma_set_anonymous(struct vm_area_struct *vma)
@@ -830,9 +845,9 @@ void free_compound_page(struct page *page);
  * pte_mkwrite. But get_user_pages can cause write faults for mappings
  * that do not have writing enabled, when used by access_process_vm.
  */
-static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
+static inline pte_t maybe_mkwrite(pte_t pte, unsigned long vma_flags)
 {
-	if (likely(vma->vm_flags & VM_WRITE))
+	if (likely(vma_flags & VM_WRITE))
 		pte = pte_mkwrite(pte);
 	return pte;
 }
@@ -1439,8 +1454,14 @@ struct zap_details {
 	pgoff_t last_index;			/* Highest page->index to unmap */
 };
 
-struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
-			     pte_t pte);
+struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
+			      pte_t pte, unsigned long vma_flags);
+static inline struct page *vm_normal_page(struct vm_area_struct *vma,
+					  unsigned long addr, pte_t pte)
+{
+	return _vm_normal_page(vma, addr, pte, vma->vm_flags);
+}
+
 struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
 				pmd_t pmd);
 
@@ -1467,6 +1488,47 @@ int follow_phys(struct vm_area_struct *vma, unsigned long address,
 int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
 			void *buf, int len, int write);
 
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+static inline void vm_write_begin(struct vm_area_struct *vma)
+{
+	write_seqcount_begin(&vma->vm_sequence);
+}
+static inline void vm_write_begin_nested(struct vm_area_struct *vma,
+					 int subclass)
+{
+	write_seqcount_begin_nested(&vma->vm_sequence, subclass);
+}
+static inline void vm_write_end(struct vm_area_struct *vma)
+{
+	write_seqcount_end(&vma->vm_sequence);
+}
+static inline void vm_raw_write_begin(struct vm_area_struct *vma)
+{
+	raw_write_seqcount_begin(&vma->vm_sequence);
+}
+static inline void vm_raw_write_end(struct vm_area_struct *vma)
+{
+	raw_write_seqcount_end(&vma->vm_sequence);
+}
+#else
+static inline void vm_write_begin(struct vm_area_struct *vma)
+{
+}
+static inline void vm_write_begin_nested(struct vm_area_struct *vma,
+					 int subclass)
+{
+}
+static inline void vm_write_end(struct vm_area_struct *vma)
+{
+}
+static inline void vm_raw_write_begin(struct vm_area_struct *vma)
+{
+}
+static inline void vm_raw_write_end(struct vm_area_struct *vma)
+{
+}
+#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */
+
 extern void truncate_pagecache(struct inode *inode, loff_t new);
 extern void truncate_setsize(struct inode *inode, loff_t newsize);
 void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to);
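These write-side helpers pair with the usual seqcount read protocol on the speculative path, which is presumably implemented in the mm/ core changes not expanded in this view. The fragment below only illustrates how vm_sequence and the vmf->sequence snapshot introduced above are meant to interact; the function name and exact structure are hypothetical:

/*
 * Illustration only: the shape of a speculative lookup that snapshots
 * the VMA seqcount and re-validates it before committing to the page
 * tables.
 */
static vm_fault_t spf_example(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	unsigned int seq = raw_read_seqcount(&vma->vm_sequence);

	if (seq & 1)				/* a writer is mid-update */
		return VM_FAULT_RETRY;
	vmf->sequence = seq;

	/* ... copy vm_flags, vm_start/vm_end and friends into vmf ... */

	/* Re-check before touching the page tables. */
	if (read_seqcount_retry(&vma->vm_sequence, vmf->sequence))
		return VM_FAULT_RETRY;		/* fall back to the mmap_sem path */

	return 0;
}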
@@ -1478,6 +1540,43 @@ int invalidate_inode_page(struct page *page);
 #ifdef CONFIG_MMU
 extern vm_fault_t handle_mm_fault(struct vm_area_struct *vma,
 		unsigned long address, unsigned int flags);
+
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+extern int __handle_speculative_fault(struct mm_struct *mm,
+				      unsigned long address,
+				      unsigned int flags,
+				      struct vm_area_struct **vma);
+static inline int handle_speculative_fault(struct mm_struct *mm,
+					   unsigned long address,
+					   unsigned int flags,
+					   struct vm_area_struct **vma)
+{
+	/*
+	 * Try speculative page fault for multithreaded user space task only.
+	 */
+	if (!(flags & FAULT_FLAG_USER) || atomic_read(&mm->mm_users) == 1) {
+		*vma = NULL;
+		return VM_FAULT_RETRY;
+	}
+	return __handle_speculative_fault(mm, address, flags, vma);
+}
+extern bool can_reuse_spf_vma(struct vm_area_struct *vma,
+			      unsigned long address);
+#else
+static inline int handle_speculative_fault(struct mm_struct *mm,
+					   unsigned long address,
+					   unsigned int flags,
+					   struct vm_area_struct **vma)
+{
+	return VM_FAULT_RETRY;
+}
+static inline bool can_reuse_spf_vma(struct vm_area_struct *vma,
+				     unsigned long address)
+{
+	return false;
+}
+#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */
+
 extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
 		unsigned long address, unsigned int fault_flags,
 		bool *unlocked);
@@ -2276,16 +2375,29 @@ void anon_vma_interval_tree_verify(struct anon_vma_chain *node);
 extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin);
 extern int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
 	unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert,
-	struct vm_area_struct *expand);
+	struct vm_area_struct *expand, bool keep_locked);
 static inline int vma_adjust(struct vm_area_struct *vma, unsigned long start,
 	unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert)
 {
-	return __vma_adjust(vma, start, end, pgoff, insert, NULL);
+	return __vma_adjust(vma, start, end, pgoff, insert, NULL, false);
 }
-extern struct vm_area_struct *vma_merge(struct mm_struct *,
+
+extern struct vm_area_struct *__vma_merge(struct mm_struct *mm,
 	struct vm_area_struct *prev, unsigned long addr, unsigned long end,
-	unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t,
-	struct mempolicy *, struct vm_userfaultfd_ctx, const char __user *);
+	unsigned long vm_flags, struct anon_vma *anon, struct file *file,
+	pgoff_t pgoff, struct mempolicy *mpol, struct vm_userfaultfd_ctx uff,
+	const char __user *user, bool keep_locked);
+
+static inline struct vm_area_struct *vma_merge(struct mm_struct *mm,
+	struct vm_area_struct *prev, unsigned long addr, unsigned long end,
+	unsigned long vm_flags, struct anon_vma *anon, struct file *file,
+	pgoff_t off, struct mempolicy *pol, struct vm_userfaultfd_ctx uff,
+	const char __user *user)
+{
+	return __vma_merge(mm, prev, addr, end, vm_flags, anon, file, off,
+			   pol, uff, user, false);
+}
+
 extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *);
 extern int __split_vma(struct mm_struct *, struct vm_area_struct *,
 	unsigned long addr, int new_below);
9 changes: 8 additions & 1 deletion include/linux/mm_types.h
@@ -361,7 +361,10 @@ struct vm_area_struct {
 	struct mempolicy *vm_policy;	/* NUMA policy for the VMA */
 #endif
 	struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
-
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+	seqcount_t vm_sequence;
+	atomic_t vm_ref_count;		/* see vma_get(), vma_put() */
+#endif
 	ANDROID_KABI_RESERVE(1);
 	ANDROID_KABI_RESERVE(2);
 	ANDROID_KABI_RESERVE(3);
@@ -385,6 +388,9 @@ struct mm_struct {
 		struct vm_area_struct *mmap;		/* list of VMAs */
 		struct rb_root mm_rb;
 		u64 vmacache_seqnum;                   /* per-thread vmacache */
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+		rwlock_t mm_rb_lock;
+#endif
 #ifdef CONFIG_MMU
 		unsigned long (*get_unmapped_area) (struct file *filp,
 				unsigned long addr, unsigned long len,
@@ -701,6 +707,7 @@ enum vm_fault_reason {
 	VM_FAULT_FALLBACK       = (__force vm_fault_t)0x000800,
 	VM_FAULT_DONE_COW       = (__force vm_fault_t)0x001000,
 	VM_FAULT_NEEDDSYNC      = (__force vm_fault_t)0x002000,
+	VM_FAULT_PTNOTSAME      = (__force vm_fault_t)0x004000,
 	VM_FAULT_HINDEX_MASK    = (__force vm_fault_t)0x0f0000,
 };
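VM_FAULT_PTNOTSAME is a new internal result code: it lets a fault handler report that, by the time the page-table lock was taken, the PTE no longer matched the value observed when the fault started, which becomes an ordinary occurrence once faults may race with other VMA updates. A hedged sketch of the kind of check that would produce it (the helper name and placement are illustrative; the real users are in the mm/ files not expanded above):

static vm_fault_t check_pte_unchanged(struct vm_fault *vmf, pte_t orig_pte)
{
	vm_fault_t ret = 0;

	spin_lock(vmf->ptl);
	if (!pte_same(*vmf->pte, orig_pte))
		ret = VM_FAULT_PTNOTSAME;	/* lost a race with another update */
	spin_unlock(vmf->ptl);

	return ret;
}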
