Re: [PATCH 1/5] userfaultfd: support minor fault handling for shmem
From: Axel Rasmussen
Date: Thu Feb 25 2021 - 14:22:13 EST
On Wed, Feb 24, 2021 at 6:14 PM Axel Rasmussen <axelrasmussen@xxxxxxxxxx> wrote:
>
>
Modify the userfaultfd register API to allow registering shmem VMAs in
>
minor mode. Modify the shmem mcopy implementation to support
>
UFFDIO_CONTINUE in order to resolve such faults.
>
>
Combine the shmem mcopy handler functions into a single
>
shmem_mcopy_atomic_pte, which takes a mode parameter. This matches how
>
the hugetlbfs implementation is structured, and lets us remove a good
>
chunk of boilerplate.
>
>
Signed-off-by: Axel Rasmussen <axelrasmussen@xxxxxxxxxx>
>
---
>
fs/userfaultfd.c | 6 +--
>
include/linux/shmem_fs.h | 26 ++++------
>
include/uapi/linux/userfaultfd.h | 4 +-
>
mm/memory.c | 8 +--
>
mm/shmem.c | 88 +++++++++++++++-----------------
>
mm/userfaultfd.c | 27 +++++-----
>
6 files changed, 77 insertions(+), 82 deletions(-)
>
>
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
>
index 0311e9b8a8fc..aa6d584ae8c7 100644
>
--- a/fs/userfaultfd.c
>
+++ b/fs/userfaultfd.c
>
@@ -1267,8 +1267,7 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma,
>
}
>
>
if (vm_flags & VM_UFFD_MINOR) {
>
- /* FIXME: Add minor fault interception for shmem. */
>
- if (!is_vm_hugetlb_page(vma))
>
+ if (!(is_vm_hugetlb_page(vma) || vma_is_shmem(vma)))
>
return false;
>
}
>
>
@@ -1941,7 +1940,8 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
>
/* report all available features and ioctls to userland */
>
uffdio_api.features = UFFD_API_FEATURES;
>
#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
>
- uffdio_api.features &= ~UFFD_FEATURE_MINOR_HUGETLBFS;
>
+ uffdio_api.features &=
>
+ ~(UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM);
>
#endif
>
uffdio_api.ioctls = UFFD_API_IOCTLS;
>
ret = -EFAULT;
>
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
>
index d82b6f396588..f0919c3722e7 100644
>
--- a/include/linux/shmem_fs.h
>
+++ b/include/linux/shmem_fs.h
>
@@ -9,6 +9,7 @@
>
#include <linux/percpu_counter.h>
>
#include <linux/xattr.h>
>
#include <linux/fs_parser.h>
>
+#include <linux/userfaultfd_k.h>
>
>
/* inode in-kernel data */
>
>
@@ -122,21 +123,16 @@ static inline bool shmem_file(struct file *file)
>
extern bool shmem_charge(struct inode *inode, long pages);
>
extern void shmem_uncharge(struct inode *inode, long pages);
>
>
+#ifdef CONFIG_USERFAULTFD
>
#ifdef CONFIG_SHMEM
>
-extern int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
>
- struct vm_area_struct *dst_vma,
>
- unsigned long dst_addr,
>
- unsigned long src_addr,
>
- struct page **pagep);
>
-extern int shmem_mfill_zeropage_pte(struct mm_struct *dst_mm,
>
- pmd_t *dst_pmd,
>
- struct vm_area_struct *dst_vma,
>
- unsigned long dst_addr);
>
-#else
>
-#define shmem_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma, dst_addr, \
>
- src_addr, pagep) ({ BUG(); 0; })
>
-#define shmem_mfill_zeropage_pte(dst_mm, dst_pmd, dst_vma, \
>
- dst_addr) ({ BUG(); 0; })
>
-#endif
>
+int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
>
+ struct vm_area_struct *dst_vma,
>
+ unsigned long dst_addr, unsigned long src_addr,
>
+ enum mcopy_atomic_mode mode, struct page **pagep);
>
+#else /* !CONFIG_SHMEM */
>
+#define shmem_mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, \
>
+ src_addr, mode, pagep) ({ BUG(); 0; })
>
+#endif /* CONFIG_SHMEM */
>
+#endif /* CONFIG_USERFAULTFD */
>
>
#endif
>
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
>
index bafbeb1a2624..47d9790d863d 100644
>
--- a/include/uapi/linux/userfaultfd.h
>
+++ b/include/uapi/linux/userfaultfd.h
>
@@ -31,7 +31,8 @@
>
UFFD_FEATURE_MISSING_SHMEM | \
>
UFFD_FEATURE_SIGBUS | \
>
UFFD_FEATURE_THREAD_ID | \
>
- UFFD_FEATURE_MINOR_HUGETLBFS)
>
+ UFFD_FEATURE_MINOR_HUGETLBFS | \
>
+ UFFD_FEATURE_MINOR_SHMEM)
>
#define UFFD_API_IOCTLS \
>
((__u64)1 << _UFFDIO_REGISTER | \
>
(__u64)1 << _UFFDIO_UNREGISTER | \
>
@@ -196,6 +197,7 @@ struct uffdio_api {
>
#define UFFD_FEATURE_SIGBUS (1<<7)
>
#define UFFD_FEATURE_THREAD_ID (1<<8)
>
#define UFFD_FEATURE_MINOR_HUGETLBFS (1<<9)
>
+#define UFFD_FEATURE_MINOR_SHMEM (1<<10)
>
__u64 features;
>
>
__u64 ioctls;
>
diff --git a/mm/memory.c b/mm/memory.c
>
index c8e357627318..a1e5ff55027e 100644
>
--- a/mm/memory.c
>
+++ b/mm/memory.c
>
@@ -3929,9 +3929,11 @@ static vm_fault_t do_read_fault(struct vm_fault *vmf)
>
* something).
>
*/
>
if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
>
- ret = do_fault_around(vmf);
>
- if (ret)
>
- return ret;
>
+ if (likely(!userfaultfd_minor(vmf->vma))) {
>
+ ret = do_fault_around(vmf);
>
+ if (ret)
>
+ return ret;
>
+ }
>
}
>
>
ret = __do_fault(vmf);
>
diff --git a/mm/shmem.c b/mm/shmem.c
>
index 06c771d23127..d7847f6f696b 100644
>
--- a/mm/shmem.c
>
+++ b/mm/shmem.c
>
@@ -77,7 +77,6 @@ static struct vfsmount *shm_mnt;
>
#include <linux/syscalls.h>
>
#include <linux/fcntl.h>
>
#include <uapi/linux/memfd.h>
>
-#include <linux/userfaultfd_k.h>
>
#include <linux/rmap.h>
>
#include <linux/uuid.h>
>
>
@@ -1781,8 +1780,8 @@ static int shmem_swapin_page(struct inode *inode, pgoff_t index,
>
* vm. If we swap it in we mark it dirty since we also free the swap
>
* entry since a page cannot live in both the swap and page cache.
>
*
>
- * vmf and fault_type are only supplied by shmem_fault:
>
- * otherwise they are NULL.
>
+ * vma, vmf, and fault_type are only supplied by shmem_fault: otherwise they
>
+ * are NULL.
>
*/
>
static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
>
struct page **pagep, enum sgp_type sgp, gfp_t gfp,
>
@@ -1826,6 +1825,12 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
>
return error;
>
}
>
>
+ if (page && vma && userfaultfd_minor(vma)) {
>
+ unlock_page(page);
>
+ *fault_type = handle_userfault(vmf, VM_UFFD_MINOR);
>
+ return 0;
>
+ }
>
+
>
if (page)
>
hindex = page->index;
>
if (page && sgp == SGP_WRITE)
>
@@ -2350,14 +2355,12 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
>
return inode;
>
}
>
>
-static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
>
- pmd_t *dst_pmd,
>
- struct vm_area_struct *dst_vma,
>
- unsigned long dst_addr,
>
- unsigned long src_addr,
>
- bool zeropage,
>
- struct page **pagep)
>
+int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
>
+ struct vm_area_struct *dst_vma,
>
+ unsigned long dst_addr, unsigned long src_addr,
>
+ enum mcopy_atomic_mode mode, struct page **pagep)
>
{
>
+ bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE);
>
struct inode *inode = file_inode(dst_vma->vm_file);
>
struct shmem_inode_info *info = SHMEM_I(inode);
>
struct address_space *mapping = inode->i_mapping;
>
@@ -2374,12 +2377,17 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
>
if (!shmem_inode_acct_block(inode, 1))
>
goto out;
>
>
- if (!*pagep) {
>
+ if (is_continue) {
>
+ ret = -EFAULT;
>
+ page = find_get_page(mapping, pgoff);
>
+ if (!page)
>
+ goto out_unacct_blocks;
>
+ } else if (!*pagep) {
>
page = shmem_alloc_page(gfp, info, pgoff);
>
if (!page)
>
goto out_unacct_blocks;
>
>
- if (!zeropage) { /* mcopy_atomic */
>
+ if (mode == MCOPY_ATOMIC_NORMAL) { /* mcopy_atomic */
>
page_kaddr = kmap_atomic(page);
>
ret = copy_from_user(page_kaddr,
>
(const void __user *)src_addr,
>
@@ -2393,7 +2401,7 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
>
/* don't free the page */
>
return -ENOENT;
>
}
>
- } else { /* mfill_zeropage_atomic */
>
+ } else { /* zeropage */
>
clear_highpage(page);
>
}
>
} else {
>
@@ -2401,9 +2409,12 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
>
*pagep = NULL;
>
}
>
>
- VM_BUG_ON(PageLocked(page) || PageSwapBacked(page));
>
+ if (!is_continue)
>
+ VM_BUG_ON(PageSwapBacked(page));
>
+ VM_BUG_ON(PageLocked(page));
>
__SetPageLocked(page);
>
- __SetPageSwapBacked(page);
>
+ if (!is_continue || !PageSwapBacked(page))
>
+ __SetPageSwapBacked(page);
>
__SetPageUptodate(page);
Apologies, I was testing more scenarios today and discovered this
doesn't work when the shmem is backed by a tmpfs file with the
huge=always mount option. I think it's more correct to lookup the page
with find_lock_page, and then wrap *all* of the page flag fiddling
here in an "if(!is_continue) {" block. I'll send a v2 with this fix
next week.
>
>
ret = -EFAULT;
>
@@ -2412,10 +2423,13 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
>
if (unlikely(offset >= max_off))
>
goto out_release;
>
>
- ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL,
>
- gfp & GFP_RECLAIM_MASK, dst_mm);
>
- if (ret)
>
- goto out_release;
>
+ /* If page wasn't already in the page cache, add it. */
>
+ if (!is_continue) {
>
+ ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL,
>
+ gfp & GFP_RECLAIM_MASK, dst_mm);
>
+ if (ret)
>
+ goto out_release;
>
+ }
>
>
_dst_pte = mk_pte(page, dst_vma->vm_page_prot);
>
if (dst_vma->vm_flags & VM_WRITE)
>
@@ -2442,13 +2456,15 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
>
if (!pte_none(*dst_pte))
>
goto out_release_unlock;
>
>
- lru_cache_add(page);
>
+ if (!is_continue) {
>
+ lru_cache_add(page);
>
>
- spin_lock_irq(&info->lock);
>
- info->alloced++;
>
- inode->i_blocks += BLOCKS_PER_PAGE;
>
- shmem_recalc_inode(inode);
>
- spin_unlock_irq(&info->lock);
>
+ spin_lock_irq(&info->lock);
>
+ info->alloced++;
>
+ inode->i_blocks += BLOCKS_PER_PAGE;
>
+ shmem_recalc_inode(inode);
>
+ spin_unlock_irq(&info->lock);
>
+ }
>
>
inc_mm_counter(dst_mm, mm_counter_file(page));
>
page_add_file_rmap(page, false);
>
@@ -2473,28 +2489,6 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
>
goto out;
>
}
>
>
-int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm,
>
- pmd_t *dst_pmd,
>
- struct vm_area_struct *dst_vma,
>
- unsigned long dst_addr,
>
- unsigned long src_addr,
>
- struct page **pagep)
>
-{
>
- return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
>
- dst_addr, src_addr, false, pagep);
>
-}
>
-
>
-int shmem_mfill_zeropage_pte(struct mm_struct *dst_mm,
>
- pmd_t *dst_pmd,
>
- struct vm_area_struct *dst_vma,
>
- unsigned long dst_addr)
>
-{
>
- struct page *page = NULL;
>
-
>
- return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
>
- dst_addr, 0, true, &page);
>
-}
>
-
>
#ifdef CONFIG_TMPFS
>
static const struct inode_operations shmem_symlink_inode_operations;
>
static const struct inode_operations shmem_short_symlink_operations;
>
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
>
index ce6cb4760d2c..6cd7ab531aec 100644
>
--- a/mm/userfaultfd.c
>
+++ b/mm/userfaultfd.c
>
@@ -415,7 +415,7 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
>
unsigned long dst_addr,
>
unsigned long src_addr,
>
struct page **page,
>
- bool zeropage,
>
+ enum mcopy_atomic_mode mode,
>
bool wp_copy)
>
{
>
ssize_t err;
>
@@ -431,22 +431,24 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
>
* and not in the radix tree.
>
*/
>
if (!(dst_vma->vm_flags & VM_SHARED)) {
>
- if (!zeropage)
>
+ switch (mode) {
>
+ case MCOPY_ATOMIC_NORMAL:
>
err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
>
dst_addr, src_addr, page,
>
wp_copy);
>
- else
>
+ break;
>
+ case MCOPY_ATOMIC_ZEROPAGE:
>
err = mfill_zeropage_pte(dst_mm, dst_pmd,
>
dst_vma, dst_addr);
>
+ break;
>
+ case MCOPY_ATOMIC_CONTINUE:
>
+ err = -EINVAL;
>
+ break;
>
+ }
>
} else {
>
VM_WARN_ON_ONCE(wp_copy);
>
- if (!zeropage)
>
- err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd,
>
- dst_vma, dst_addr,
>
- src_addr, page);
>
- else
>
- err = shmem_mfill_zeropage_pte(dst_mm, dst_pmd,
>
- dst_vma, dst_addr);
>
+ err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
>
+ src_addr, mode, page);
>
}
>
>
return err;
>
@@ -467,7 +469,6 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
>
long copied;
>
struct page *page;
>
bool wp_copy;
>
- bool zeropage = (mcopy_mode == MCOPY_ATOMIC_ZEROPAGE);
>
>
/*
>
* Sanitize the command parameters:
>
@@ -530,7 +531,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
>
>
if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
>
goto out_unlock;
>
- if (mcopy_mode == MCOPY_ATOMIC_CONTINUE)
>
+ if (!vma_is_shmem(dst_vma) && mcopy_mode == MCOPY_ATOMIC_CONTINUE)
>
goto out_unlock;
>
>
/*
>
@@ -578,7 +579,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
>
BUG_ON(pmd_trans_huge(*dst_pmd));
>
>
err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
>
- src_addr, &page, zeropage, wp_copy);
>
+ src_addr, &page, mcopy_mode, wp_copy);
>
cond_resched();
>
>
if (unlikely(err == -ENOENT)) {
>
--
>
2.30.0.617.g56c4b15f3c-goog
>