summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorStephen Rothwell <sfr@canb.auug.org.au>2016-11-29 15:36:58 +1100
committerStephen Rothwell <sfr@canb.auug.org.au>2016-11-29 15:36:58 +1100
commitb142f94c3c5039018ab8fee8d0b314f997398d2e (patch)
treeba28453a17351423da55b92f1d3c9addf6c1ac89
parent1bc1a1827270a43bd5a1973cc7eab5fd11c882d0 (diff)
parent232aaedcd7d85639e749bdf17d61999cbc55390c (diff)
downloadlinux-next-b142f94c3c5039018ab8fee8d0b314f997398d2e.tar.gz
Merge branch 'akpm/master'
-rw-r--r--Documentation/filesystems/Locking2
-rw-r--r--arch/powerpc/platforms/cell/spufs/file.c8
-rw-r--r--arch/sparc/kernel/nmi.c44
-rw-r--r--arch/x86/entry/vdso/vma.c4
-rw-r--r--drivers/char/agp/alpha-agp.c3
-rw-r--r--drivers/char/mspec.c2
-rw-r--r--drivers/dax/dax.c3
-rw-r--r--drivers/gpu/drm/armada/armada_gem.c5
-rw-r--r--drivers/gpu/drm/drm_vm.c10
-rw-r--r--drivers/gpu/drm/etnaviv/etnaviv_gem.c9
-rw-r--r--drivers/gpu/drm/exynos/exynos_drm_gem.c6
-rw-r--r--drivers/gpu/drm/gma500/framebuffer.c2
-rw-r--r--drivers/gpu/drm/gma500/gem.c5
-rw-r--r--drivers/gpu/drm/i915/i915_gem.c3
-rw-r--r--drivers/gpu/drm/i915/i915_gem_userptr.c2
-rw-r--r--drivers/gpu/drm/msm/msm_gem.c8
-rw-r--r--drivers/gpu/drm/omapdrm/omap_gem.c20
-rw-r--r--drivers/gpu/drm/tegra/gem.c4
-rw-r--r--drivers/gpu/drm/ttm/ttm_bo_vm.c2
-rw-r--r--drivers/gpu/drm/udl/udl_gem.c5
-rw-r--r--drivers/gpu/drm/vgem/vgem_drv.c2
-rw-r--r--drivers/infiniband/core/umem_odp.c2
-rw-r--r--drivers/media/v4l2-core/videobuf-dma-sg.c5
-rw-r--r--drivers/misc/cxl/context.c5
-rw-r--r--drivers/misc/sgi-gru/grumain.c2
-rw-r--r--drivers/net/wireless/intel/iwlwifi/dvm/calib.c3
-rw-r--r--drivers/staging/android/ion/ion.c2
-rw-r--r--drivers/staging/lustre/lustre/llite/vvp_io.c6
-rw-r--r--drivers/vfio/vfio_iommu_type1.c2
-rw-r--r--drivers/xen/privcmd.c2
-rw-r--r--fs/dax.c208
-rw-r--r--fs/exec.c2
-rw-r--r--fs/userfaultfd.c22
-rw-r--r--include/linux/dax.h7
-rw-r--r--include/linux/huge_mm.h10
-rw-r--r--include/linux/mm.h46
-rw-r--r--include/linux/nmi.h24
-rw-r--r--include/linux/userfaultfd_k.h4
-rw-r--r--ipc/sem.c512
-rw-r--r--kernel/Makefile1
-rw-r--r--kernel/events/uprobes.c4
-rw-r--r--kernel/time/posix-stubs.c5
-rw-r--r--kernel/watchdog.c269
-rw-r--r--kernel/watchdog_hld.c227
-rw-r--r--mm/filemap.c14
-rw-r--r--mm/gup.c20
-rw-r--r--mm/huge_memory.c173
-rw-r--r--mm/internal.h2
-rw-r--r--mm/khugepaged.c24
-rw-r--r--mm/memory.c859
-rw-r--r--mm/nommu.c10
-rw-r--r--mm/process_vm_access.c12
-rw-r--r--security/tomoyo/domain.c2
-rwxr-xr-xtools/testing/ktest/ktest.pl8
-rw-r--r--virt/kvm/async_pf.c10
-rw-r--r--virt/kvm/kvm_main.c5
56 files changed, 1375 insertions, 1283 deletions
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 960108560ab2..08c9e56c66ec 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -556,7 +556,7 @@ till "end_pgoff". ->map_pages() is called with page table locked and must
not block. If it's not possible to reach a page without blocking,
filesystem should skip it. Filesystem should use do_set_pte() to setup
page table entry. Pointer to entry associated with the page is passed in
-"pte" field in fault_env structure. Pointers to entries for other offsets
+"pte" field in vm_fault structure. Pointers to entries for other offsets
should be calculated relative to "pte".
->page_mkwrite() is called when a previously read-only pte is
diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c
index 06254467e4dd..3a147122bc98 100644
--- a/arch/powerpc/platforms/cell/spufs/file.c
+++ b/arch/powerpc/platforms/cell/spufs/file.c
@@ -236,7 +236,6 @@ static int
spufs_mem_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
struct spu_context *ctx = vma->vm_file->private_data;
- unsigned long address = (unsigned long)vmf->virtual_address;
unsigned long pfn, offset;
offset = vmf->pgoff << PAGE_SHIFT;
@@ -244,7 +243,7 @@ spufs_mem_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
return VM_FAULT_SIGBUS;
pr_debug("spufs_mem_mmap_fault address=0x%lx, offset=0x%lx\n",
- address, offset);
+ vmf->address, offset);
if (spu_acquire(ctx))
return VM_FAULT_NOPAGE;
@@ -256,7 +255,7 @@ spufs_mem_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
vma->vm_page_prot = pgprot_noncached_wc(vma->vm_page_prot);
pfn = (ctx->spu->local_store_phys + offset) >> PAGE_SHIFT;
}
- vm_insert_pfn(vma, address, pfn);
+ vm_insert_pfn(vma, vmf->address, pfn);
spu_release(ctx);
@@ -355,8 +354,7 @@ static int spufs_ps_fault(struct vm_area_struct *vma,
down_read(&current->mm->mmap_sem);
} else {
area = ctx->spu->problem_phys + ps_offs;
- vm_insert_pfn(vma, (unsigned long)vmf->virtual_address,
- (area + offset) >> PAGE_SHIFT);
+ vm_insert_pfn(vma, vmf->address, (area + offset) >> PAGE_SHIFT);
spu_context_trace(spufs_ps_fault__insert, ctx, ctx->spu);
}
diff --git a/arch/sparc/kernel/nmi.c b/arch/sparc/kernel/nmi.c
index a9973bb4a1b2..95e73c63c99d 100644
--- a/arch/sparc/kernel/nmi.c
+++ b/arch/sparc/kernel/nmi.c
@@ -42,7 +42,7 @@ static int panic_on_timeout;
*/
atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */
EXPORT_SYMBOL(nmi_active);
-
+static int nmi_init_done;
static unsigned int nmi_hz = HZ;
static DEFINE_PER_CPU(short, wd_enabled);
static int endflag __initdata;
@@ -153,6 +153,8 @@ static void report_broken_nmi(int cpu, int *prev_nmi_count)
void stop_nmi_watchdog(void *unused)
{
+ if (!__this_cpu_read(wd_enabled))
+ return;
pcr_ops->write_pcr(0, pcr_ops->pcr_nmi_disable);
__this_cpu_write(wd_enabled, 0);
atomic_dec(&nmi_active);
@@ -207,6 +209,9 @@ error:
void start_nmi_watchdog(void *unused)
{
+ if (__this_cpu_read(wd_enabled))
+ return;
+
__this_cpu_write(wd_enabled, 1);
atomic_inc(&nmi_active);
@@ -259,6 +264,8 @@ int __init nmi_init(void)
}
}
+ nmi_init_done = 1;
+
return err;
}
@@ -270,3 +277,38 @@ static int __init setup_nmi_watchdog(char *str)
return 0;
}
__setup("nmi_watchdog=", setup_nmi_watchdog);
+
+/*
+ * sparc specific NMI watchdog enable function.
+ * Enables watchdog if it is not enabled already.
+ */
+int watchdog_nmi_enable(unsigned int cpu)
+{
+ if (atomic_read(&nmi_active) == -1) {
+ pr_warn("NMI watchdog cannot be enabled or disabled\n");
+ return -1;
+ }
+
+ /*
+ * watchdog thread could start even before nmi_init is called.
+ * Just Return in that case. Let nmi_init finish the init
+ * process first.
+ */
+ if (!nmi_init_done)
+ return 0;
+
+ smp_call_function_single(cpu, start_nmi_watchdog, NULL, 1);
+
+ return 0;
+}
+/*
+ * sparc specific NMI watchdog disable function.
+ * Disables watchdog if it is not disabled already.
+ */
+void watchdog_nmi_disable(unsigned int cpu)
+{
+ if (atomic_read(&nmi_active) == -1)
+ pr_warn_once("NMI watchdog cannot be enabled or disabled\n");
+ else
+ smp_call_function_single(cpu, stop_nmi_watchdog, NULL, 1);
+}
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index e739002427ed..40121d14d34d 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -109,7 +109,7 @@ static int vvar_fault(const struct vm_special_mapping *sm,
return VM_FAULT_SIGBUS;
if (sym_offset == image->sym_vvar_page) {
- ret = vm_insert_pfn(vma, (unsigned long)vmf->virtual_address,
+ ret = vm_insert_pfn(vma, vmf->address,
__pa_symbol(&__vvar_page) >> PAGE_SHIFT);
} else if (sym_offset == image->sym_pvclock_page) {
struct pvclock_vsyscall_time_info *pvti =
@@ -117,7 +117,7 @@ static int vvar_fault(const struct vm_special_mapping *sm,
if (pvti && vclock_was_used(VCLOCK_PVCLOCK)) {
ret = vm_insert_pfn(
vma,
- (unsigned long)vmf->virtual_address,
+ vmf->address,
__pa(pvti) >> PAGE_SHIFT);
}
}
diff --git a/drivers/char/agp/alpha-agp.c b/drivers/char/agp/alpha-agp.c
index 199b8e99f7d7..737187865269 100644
--- a/drivers/char/agp/alpha-agp.c
+++ b/drivers/char/agp/alpha-agp.c
@@ -19,8 +19,7 @@ static int alpha_core_agp_vm_fault(struct vm_area_struct *vma,
unsigned long pa;
struct page *page;
- dma_addr = (unsigned long)vmf->virtual_address - vma->vm_start
- + agp->aperture.bus_base;
+ dma_addr = vmf->address - vma->vm_start + agp->aperture.bus_base;
pa = agp->ops->translate(agp, dma_addr);
if (pa == (unsigned long)-EINVAL)
diff --git a/drivers/char/mspec.c b/drivers/char/mspec.c
index f3f92d5fcda0..a697ca0cab1e 100644
--- a/drivers/char/mspec.c
+++ b/drivers/char/mspec.c
@@ -227,7 +227,7 @@ mspec_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
* be because another thread has installed the pte first, so it
* is no problem.
*/
- vm_insert_pfn(vma, (unsigned long)vmf->virtual_address, pfn);
+ vm_insert_pfn(vma, vmf->address, pfn);
return VM_FAULT_NOPAGE;
}
diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c
index 3d94ff20fdca..028b56ab9e31 100644
--- a/drivers/dax/dax.c
+++ b/drivers/dax/dax.c
@@ -328,7 +328,6 @@ static phys_addr_t pgoff_to_phys(struct dax_dev *dax_dev, pgoff_t pgoff,
static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_area_struct *vma,
struct vm_fault *vmf)
{
- unsigned long vaddr = (unsigned long) vmf->virtual_address;
struct device *dev = &dax_dev->dev;
struct dax_region *dax_region;
int rc = VM_FAULT_SIGBUS;
@@ -353,7 +352,7 @@ static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_area_struct *vma,
pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
- rc = vm_insert_mixed(vma, vaddr, pfn);
+ rc = vm_insert_mixed(vma, vmf->address, pfn);
if (rc == -ENOMEM)
return VM_FAULT_OOM;
diff --git a/drivers/gpu/drm/armada/armada_gem.c b/drivers/gpu/drm/armada/armada_gem.c
index 768087ddb046..a293c8be232c 100644
--- a/drivers/gpu/drm/armada/armada_gem.c
+++ b/drivers/gpu/drm/armada/armada_gem.c
@@ -17,12 +17,11 @@
static int armada_gem_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
struct armada_gem_object *obj = drm_to_armada_gem(vma->vm_private_data);
- unsigned long addr = (unsigned long)vmf->virtual_address;
unsigned long pfn = obj->phys_addr >> PAGE_SHIFT;
int ret;
- pfn += (addr - vma->vm_start) >> PAGE_SHIFT;
- ret = vm_insert_pfn(vma, addr, pfn);
+ pfn += (vmf->address - vma->vm_start) >> PAGE_SHIFT;
+ ret = vm_insert_pfn(vma, vmf->address, pfn);
switch (ret) {
case 0:
diff --git a/drivers/gpu/drm/drm_vm.c b/drivers/gpu/drm/drm_vm.c
index caa4e4ca616d..bd311c77c254 100644
--- a/drivers/gpu/drm/drm_vm.c
+++ b/drivers/gpu/drm/drm_vm.c
@@ -124,8 +124,7 @@ static int drm_do_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
* Using vm_pgoff as a selector forces us to use this unusual
* addressing scheme.
*/
- resource_size_t offset = (unsigned long)vmf->virtual_address -
- vma->vm_start;
+ resource_size_t offset = vmf->address - vma->vm_start;
resource_size_t baddr = map->offset + offset;
struct drm_agp_mem *agpmem;
struct page *page;
@@ -195,7 +194,7 @@ static int drm_do_vm_shm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
if (!map)
return VM_FAULT_SIGBUS; /* Nothing allocated */
- offset = (unsigned long)vmf->virtual_address - vma->vm_start;
+ offset = vmf->address - vma->vm_start;
i = (unsigned long)map->handle + offset;
page = vmalloc_to_page((void *)i);
if (!page)
@@ -301,7 +300,8 @@ static int drm_do_vm_dma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
if (!dma->pagelist)
return VM_FAULT_SIGBUS; /* Nothing allocated */
- offset = (unsigned long)vmf->virtual_address - vma->vm_start; /* vm_[pg]off[set] should be 0 */
+ offset = vmf->address - vma->vm_start;
+ /* vm_[pg]off[set] should be 0 */
page_nr = offset >> PAGE_SHIFT; /* page_nr could just be vmf->pgoff */
page = virt_to_page((void *)dma->pagelist[page_nr]);
@@ -337,7 +337,7 @@ static int drm_do_vm_sg_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
if (!entry->pagelist)
return VM_FAULT_SIGBUS; /* Nothing allocated */
- offset = (unsigned long)vmf->virtual_address - vma->vm_start;
+ offset = vmf->address - vma->vm_start;
map_offset = map->offset - (unsigned long)dev->sg->virtual;
page_offset = (offset >> PAGE_SHIFT) + (map_offset >> PAGE_SHIFT);
page = entry->pagelist[page_offset];
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem.c b/drivers/gpu/drm/etnaviv/etnaviv_gem.c
index 7d066a91d778..114dddbd297b 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_gem.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_gem.c
@@ -202,15 +202,14 @@ int etnaviv_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
}
/* We don't use vmf->pgoff since that has the fake offset: */
- pgoff = ((unsigned long)vmf->virtual_address -
- vma->vm_start) >> PAGE_SHIFT;
+ pgoff = (vmf->address - vma->vm_start) >> PAGE_SHIFT;
page = pages[pgoff];
- VERB("Inserting %p pfn %lx, pa %lx", vmf->virtual_address,
+ VERB("Inserting %p pfn %lx, pa %lx", (void *)vmf->address,
page_to_pfn(page), page_to_pfn(page) << PAGE_SHIFT);
- ret = vm_insert_page(vma, (unsigned long)vmf->virtual_address, page);
+ ret = vm_insert_page(vma, vmf->address, page);
out:
switch (ret) {
@@ -759,7 +758,7 @@ static struct page **etnaviv_gem_userptr_do_get_pages(
down_read(&mm->mmap_sem);
while (pinned < npages) {
ret = get_user_pages_remote(task, mm, ptr, npages - pinned,
- flags, pvec + pinned, NULL);
+ flags, pvec + pinned, NULL, NULL);
if (ret < 0)
break;
diff --git a/drivers/gpu/drm/exynos/exynos_drm_gem.c b/drivers/gpu/drm/exynos/exynos_drm_gem.c
index f2ae72ba7d5a..5cfa5121e805 100644
--- a/drivers/gpu/drm/exynos/exynos_drm_gem.c
+++ b/drivers/gpu/drm/exynos/exynos_drm_gem.c
@@ -455,8 +455,7 @@ int exynos_drm_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
pgoff_t page_offset;
int ret;
- page_offset = ((unsigned long)vmf->virtual_address -
- vma->vm_start) >> PAGE_SHIFT;
+ page_offset = (vmf->address - vma->vm_start) >> PAGE_SHIFT;
if (page_offset >= (exynos_gem->size >> PAGE_SHIFT)) {
DRM_ERROR("invalid page offset\n");
@@ -465,8 +464,7 @@ int exynos_drm_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
}
pfn = page_to_pfn(exynos_gem->pages[page_offset]);
- ret = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address,
- __pfn_to_pfn_t(pfn, PFN_DEV));
+ ret = vm_insert_mixed(vma, vmf->address, __pfn_to_pfn_t(pfn, PFN_DEV));
out:
switch (ret) {
diff --git a/drivers/gpu/drm/gma500/framebuffer.c b/drivers/gpu/drm/gma500/framebuffer.c
index 4071b2d1e8cf..8b44fa542562 100644
--- a/drivers/gpu/drm/gma500/framebuffer.c
+++ b/drivers/gpu/drm/gma500/framebuffer.c
@@ -125,7 +125,7 @@ static int psbfb_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
psbfb->gtt->offset;
page_num = vma_pages(vma);
- address = (unsigned long)vmf->virtual_address - (vmf->pgoff << PAGE_SHIFT);
+ address = vmf->address - (vmf->pgoff << PAGE_SHIFT);
vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
diff --git a/drivers/gpu/drm/gma500/gem.c b/drivers/gpu/drm/gma500/gem.c
index 6d1cb6b370b1..527c62917660 100644
--- a/drivers/gpu/drm/gma500/gem.c
+++ b/drivers/gpu/drm/gma500/gem.c
@@ -197,15 +197,14 @@ int psb_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
/* Page relative to the VMA start - we must calculate this ourselves
because vmf->pgoff is the fake GEM offset */
- page_offset = ((unsigned long) vmf->virtual_address - vma->vm_start)
- >> PAGE_SHIFT;
+ page_offset = (vmf->address - vma->vm_start) >> PAGE_SHIFT;
/* CPU view of the page, don't go via the GART for CPU writes */
if (r->stolen)
pfn = (dev_priv->stolen_base + r->offset) >> PAGE_SHIFT;
else
pfn = page_to_pfn(r->pages[page_offset]);
- ret = vm_insert_pfn(vma, (unsigned long)vmf->virtual_address, pfn);
+ ret = vm_insert_pfn(vma, vmf->address, pfn);
fail:
mutex_unlock(&dev_priv->mmap_mutex);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 902fa427c196..8f2d0eb66b81 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -1796,8 +1796,7 @@ int i915_gem_fault(struct vm_area_struct *area, struct vm_fault *vmf)
int ret;
/* We don't use vmf->pgoff since that has the fake offset */
- page_offset = ((unsigned long)vmf->virtual_address - area->vm_start) >>
- PAGE_SHIFT;
+ page_offset = (vmf->address - area->vm_start) >> PAGE_SHIFT;
trace_i915_gem_object_fault(obj, page_offset, true, write);
diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c
index 107ddf51065e..d068af2ec3a3 100644
--- a/drivers/gpu/drm/i915/i915_gem_userptr.c
+++ b/drivers/gpu/drm/i915/i915_gem_userptr.c
@@ -515,7 +515,7 @@ __i915_gem_userptr_get_pages_worker(struct work_struct *_work)
obj->userptr.ptr + pinned * PAGE_SIZE,
npages - pinned,
flags,
- pvec + pinned, NULL);
+ pvec + pinned, NULL, NULL);
if (ret < 0)
break;
diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c
index cd06cfd94687..d8bc59c7e261 100644
--- a/drivers/gpu/drm/msm/msm_gem.c
+++ b/drivers/gpu/drm/msm/msm_gem.c
@@ -225,16 +225,14 @@ int msm_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
}
/* We don't use vmf->pgoff since that has the fake offset: */
- pgoff = ((unsigned long)vmf->virtual_address -
- vma->vm_start) >> PAGE_SHIFT;
+ pgoff = (vmf->address - vma->vm_start) >> PAGE_SHIFT;
pfn = page_to_pfn(pages[pgoff]);
- VERB("Inserting %p pfn %lx, pa %lx", vmf->virtual_address,
+ VERB("Inserting %p pfn %lx, pa %lx", (void *)vmf->address,
pfn, pfn << PAGE_SHIFT);
- ret = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address,
- __pfn_to_pfn_t(pfn, PFN_DEV));
+ ret = vm_insert_mixed(vma, vmf->address, __pfn_to_pfn_t(pfn, PFN_DEV));
out_unlock:
mutex_unlock(&dev->struct_mutex);
diff --git a/drivers/gpu/drm/omapdrm/omap_gem.c b/drivers/gpu/drm/omapdrm/omap_gem.c
index d4e1e11466f8..4a90c690f09e 100644
--- a/drivers/gpu/drm/omapdrm/omap_gem.c
+++ b/drivers/gpu/drm/omapdrm/omap_gem.c
@@ -398,8 +398,7 @@ static int fault_1d(struct drm_gem_object *obj,
pgoff_t pgoff;
/* We don't use vmf->pgoff since that has the fake offset: */
- pgoff = ((unsigned long)vmf->virtual_address -
- vma->vm_start) >> PAGE_SHIFT;
+ pgoff = (vmf->address - vma->vm_start) >> PAGE_SHIFT;
if (omap_obj->pages) {
omap_gem_cpu_sync(obj, pgoff);
@@ -409,11 +408,10 @@ static int fault_1d(struct drm_gem_object *obj,
pfn = (omap_obj->paddr >> PAGE_SHIFT) + pgoff;
}
- VERB("Inserting %p pfn %lx, pa %lx", vmf->virtual_address,
+ VERB("Inserting %p pfn %lx, pa %lx", (void *)vmf->address,
pfn, pfn << PAGE_SHIFT);
- return vm_insert_mixed(vma, (unsigned long)vmf->virtual_address,
- __pfn_to_pfn_t(pfn, PFN_DEV));
+ return vm_insert_mixed(vma, vmf->address, __pfn_to_pfn_t(pfn, PFN_DEV));
}
/* Special handling for the case of faulting in 2d tiled buffers */
@@ -427,7 +425,7 @@ static int fault_2d(struct drm_gem_object *obj,
struct page *pages[64]; /* XXX is this too much to have on stack? */
unsigned long pfn;
pgoff_t pgoff, base_pgoff;
- void __user *vaddr;
+ unsigned long vaddr;
int i, ret, slots;
/*
@@ -447,8 +445,7 @@ static int fault_2d(struct drm_gem_object *obj,
const int m = 1 + ((omap_obj->width << fmt) / PAGE_SIZE);
/* We don't use vmf->pgoff since that has the fake offset: */
- pgoff = ((unsigned long)vmf->virtual_address -
- vma->vm_start) >> PAGE_SHIFT;
+ pgoff = (vmf->address - vma->vm_start) >> PAGE_SHIFT;
/*
* Actual address we start mapping at is rounded down to previous slot
@@ -459,7 +456,7 @@ static int fault_2d(struct drm_gem_object *obj,
/* figure out buffer width in slots */
slots = omap_obj->width >> priv->usergart[fmt].slot_shift;
- vaddr = vmf->virtual_address - ((pgoff - base_pgoff) << PAGE_SHIFT);
+ vaddr = vmf->address - ((pgoff - base_pgoff) << PAGE_SHIFT);
entry = &priv->usergart[fmt].entry[priv->usergart[fmt].last];
@@ -503,12 +500,11 @@ static int fault_2d(struct drm_gem_object *obj,
pfn = entry->paddr >> PAGE_SHIFT;
- VERB("Inserting %p pfn %lx, pa %lx", vmf->virtual_address,
+ VERB("Inserting %p pfn %lx, pa %lx", (void *)vmf->address,
pfn, pfn << PAGE_SHIFT);
for (i = n; i > 0; i--) {
- vm_insert_mixed(vma, (unsigned long)vaddr,
- __pfn_to_pfn_t(pfn, PFN_DEV));
+ vm_insert_mixed(vma, vaddr, __pfn_to_pfn_t(pfn, PFN_DEV));
pfn += priv->usergart[fmt].stride_pfn;
vaddr += PAGE_SIZE * m;
}
diff --git a/drivers/gpu/drm/tegra/gem.c b/drivers/gpu/drm/tegra/gem.c
index c08e5279eeac..7d853e6b5ff0 100644
--- a/drivers/gpu/drm/tegra/gem.c
+++ b/drivers/gpu/drm/tegra/gem.c
@@ -452,10 +452,10 @@ static int tegra_bo_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
if (!bo->pages)
return VM_FAULT_SIGBUS;
- offset = ((unsigned long)vmf->virtual_address - vma->vm_start) >> PAGE_SHIFT;
+ offset = (vmf->address - vma->vm_start) >> PAGE_SHIFT;
page = bo->pages[offset];
- err = vm_insert_page(vma, (unsigned long)vmf->virtual_address, page);
+ err = vm_insert_page(vma, vmf->address, page);
switch (err) {
case -EAGAIN:
case 0:
diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c
index 4748aedc933a..68ef993ab431 100644
--- a/drivers/gpu/drm/ttm/ttm_bo_vm.c
+++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c
@@ -101,7 +101,7 @@ static int ttm_bo_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
struct page *page;
int ret;
int i;
- unsigned long address = (unsigned long)vmf->virtual_address;
+ unsigned long address = vmf->address;
int retval = VM_FAULT_NOPAGE;
struct ttm_mem_type_manager *man =
&bdev->man[bo->mem.mem_type];
diff --git a/drivers/gpu/drm/udl/udl_gem.c b/drivers/gpu/drm/udl/udl_gem.c
index 818e70712b18..3c0c4bd3f750 100644
--- a/drivers/gpu/drm/udl/udl_gem.c
+++ b/drivers/gpu/drm/udl/udl_gem.c
@@ -107,14 +107,13 @@ int udl_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
unsigned int page_offset;
int ret = 0;
- page_offset = ((unsigned long)vmf->virtual_address - vma->vm_start) >>
- PAGE_SHIFT;
+ page_offset = (vmf->address - vma->vm_start) >> PAGE_SHIFT;
if (!obj->pages)
return VM_FAULT_SIGBUS;
page = obj->pages[page_offset];
- ret = vm_insert_page(vma, (unsigned long)vmf->virtual_address, page);
+ ret = vm_insert_page(vma, vmf->address, page);
switch (ret) {
case -EAGAIN:
case 0:
diff --git a/drivers/gpu/drm/vgem/vgem_drv.c b/drivers/gpu/drm/vgem/vgem_drv.c
index f36c14729b55..477e07f0ecb6 100644
--- a/drivers/gpu/drm/vgem/vgem_drv.c
+++ b/drivers/gpu/drm/vgem/vgem_drv.c
@@ -54,7 +54,7 @@ static int vgem_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
struct drm_vgem_gem_object *obj = vma->vm_private_data;
/* We don't use vmf->pgoff since that has the fake offset */
- unsigned long vaddr = (unsigned long)vmf->virtual_address;
+ unsigned long vaddr = vmf->address;
struct page *page;
page = shmem_read_mapping_page(file_inode(obj->base.filp)->i_mapping,
diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
index 1f0fe3217f23..6b079a31dced 100644
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -578,7 +578,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt,
*/
npages = get_user_pages_remote(owning_process, owning_mm,
user_virt, gup_num_pages,
- flags, local_page_list, NULL);
+ flags, local_page_list, NULL, NULL);
up_read(&owning_mm->mmap_sem);
if (npages < 0)
diff --git a/drivers/media/v4l2-core/videobuf-dma-sg.c b/drivers/media/v4l2-core/videobuf-dma-sg.c
index 1db0af6c7f94..ba63ca57ed7e 100644
--- a/drivers/media/v4l2-core/videobuf-dma-sg.c
+++ b/drivers/media/v4l2-core/videobuf-dma-sg.c
@@ -439,13 +439,12 @@ static int videobuf_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
struct page *page;
dprintk(3, "fault: fault @ %08lx [vma %08lx-%08lx]\n",
- (unsigned long)vmf->virtual_address,
- vma->vm_start, vma->vm_end);
+ vmf->address, vma->vm_start, vma->vm_end);
page = alloc_page(GFP_USER | __GFP_DMA32);
if (!page)
return VM_FAULT_OOM;
- clear_user_highpage(page, (unsigned long)vmf->virtual_address);
+ clear_user_highpage(page, vmf->address);
vmf->page = page;
return 0;
diff --git a/drivers/misc/cxl/context.c b/drivers/misc/cxl/context.c
index ff5e7e8cb1d1..3907387b6d15 100644
--- a/drivers/misc/cxl/context.c
+++ b/drivers/misc/cxl/context.c
@@ -124,13 +124,12 @@ void cxl_context_set_mapping(struct cxl_context *ctx,
static int cxl_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
struct cxl_context *ctx = vma->vm_file->private_data;
- unsigned long address = (unsigned long)vmf->virtual_address;
u64 area, offset;
offset = vmf->pgoff << PAGE_SHIFT;
pr_devel("%s: pe: %i address: 0x%lx offset: 0x%llx\n",
- __func__, ctx->pe, address, offset);
+ __func__, ctx->pe, vmf->address, offset);
if (ctx->afu->current_mode == CXL_MODE_DEDICATED) {
area = ctx->afu->psn_phys;
@@ -162,7 +161,7 @@ static int cxl_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
return VM_FAULT_SIGBUS;
}
- vm_insert_pfn(vma, address, (area + offset) >> PAGE_SHIFT);
+ vm_insert_pfn(vma, vmf->address, (area + offset) >> PAGE_SHIFT);
mutex_unlock(&ctx->status_mutex);
diff --git a/drivers/misc/sgi-gru/grumain.c b/drivers/misc/sgi-gru/grumain.c
index 33741ad4a74a..af2e077da4b8 100644
--- a/drivers/misc/sgi-gru/grumain.c
+++ b/drivers/misc/sgi-gru/grumain.c
@@ -932,7 +932,7 @@ int gru_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
unsigned long paddr, vaddr;
unsigned long expires;
- vaddr = (unsigned long)vmf->virtual_address;
+ vaddr = vmf->address;
gru_dbg(grudev, "vma %p, vaddr 0x%lx (0x%lx)\n",
vma, vaddr, GSEG_BASE(vaddr));
STAT(nopfn);
diff --git a/drivers/net/wireless/intel/iwlwifi/dvm/calib.c b/drivers/net/wireless/intel/iwlwifi/dvm/calib.c
index e9cef9de9ed8..c96f9b1d948a 100644
--- a/drivers/net/wireless/intel/iwlwifi/dvm/calib.c
+++ b/drivers/net/wireless/intel/iwlwifi/dvm/calib.c
@@ -900,8 +900,7 @@ static void iwlagn_gain_computation(struct iwl_priv *priv,
/* bound gain by 2 bits value max, 3rd bit is sign */
data->delta_gain_code[i] =
- min(abs(delta_g),
- (s32) CHAIN_NOISE_MAX_DELTA_GAIN_CODE);
+ min(abs(delta_g), CHAIN_NOISE_MAX_DELTA_GAIN_CODE);
if (delta_g < 0)
/*
diff --git a/drivers/staging/android/ion/ion.c b/drivers/staging/android/ion/ion.c
index d5cc3070e83f..b653451843c8 100644
--- a/drivers/staging/android/ion/ion.c
+++ b/drivers/staging/android/ion/ion.c
@@ -882,7 +882,7 @@ static int ion_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
BUG_ON(!buffer->pages || !buffer->pages[vmf->pgoff]);
pfn = page_to_pfn(ion_buffer_page(buffer->pages[vmf->pgoff]));
- ret = vm_insert_pfn(vma, (unsigned long)vmf->virtual_address, pfn);
+ ret = vm_insert_pfn(vma, vmf->address, pfn);
mutex_unlock(&buffer->lock);
if (ret)
return VM_FAULT_ERROR;
diff --git a/drivers/staging/lustre/lustre/llite/vvp_io.c b/drivers/staging/lustre/lustre/llite/vvp_io.c
index 33b38bda8319..d102ba0b629c 100644
--- a/drivers/staging/lustre/lustre/llite/vvp_io.c
+++ b/drivers/staging/lustre/lustre/llite/vvp_io.c
@@ -1018,7 +1018,7 @@ static int vvp_io_kernel_fault(struct vvp_fault_io *cfio)
"page %p map %p index %lu flags %lx count %u priv %0lx: got addr %p type NOPAGE\n",
vmf->page, vmf->page->mapping, vmf->page->index,
(long)vmf->page->flags, page_count(vmf->page),
- page_private(vmf->page), vmf->virtual_address);
+ page_private(vmf->page), (void *)vmf->address);
if (unlikely(!(cfio->ft_flags & VM_FAULT_LOCKED))) {
lock_page(vmf->page);
cfio->ft_flags |= VM_FAULT_LOCKED;
@@ -1029,12 +1029,12 @@ static int vvp_io_kernel_fault(struct vvp_fault_io *cfio)
}
if (cfio->ft_flags & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV)) {
- CDEBUG(D_PAGE, "got addr %p - SIGBUS\n", vmf->virtual_address);
+ CDEBUG(D_PAGE, "got addr %p - SIGBUS\n", (void *)vmf->address);
return -EFAULT;
}
if (cfio->ft_flags & VM_FAULT_OOM) {
- CDEBUG(D_PAGE, "got addr %p - OOM\n", vmf->virtual_address);
+ CDEBUG(D_PAGE, "got addr %p - OOM\n", (void *)vmf->address);
return -ENOMEM;
}
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 51810a95416e..cae2e9121e48 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -362,7 +362,7 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
down_read(&mm->mmap_sem);
ret = get_user_pages_remote(NULL, mm, vaddr, 1, flags, page,
- NULL);
+ NULL, NULL);
up_read(&mm->mmap_sem);
}
diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c
index 702040fe2001..6e3306f4a525 100644
--- a/drivers/xen/privcmd.c
+++ b/drivers/xen/privcmd.c
@@ -602,7 +602,7 @@ static int privcmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
printk(KERN_DEBUG "privcmd_fault: vma=%p %lx-%lx, pgoff=%lx, uv=%p\n",
vma, vma->vm_start, vma->vm_end,
- vmf->pgoff, vmf->virtual_address);
+ vmf->pgoff, (void *)vmf->address);
return VM_FAULT_SIGBUS;
}
diff --git a/fs/dax.c b/fs/dax.c
index 92bfe98662f9..b1fe228cd609 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -31,6 +31,7 @@
#include <linux/vmstat.h>
#include <linux/pfn_t.h>
#include <linux/sizes.h>
+#include <linux/mmu_notifier.h>
#include <linux/iomap.h>
#include "internal.h"
@@ -240,6 +241,23 @@ static void *get_unlocked_mapping_entry(struct address_space *mapping,
}
}
+static void dax_unlock_mapping_entry(struct address_space *mapping,
+ pgoff_t index)
+{
+ void *entry, **slot;
+
+ spin_lock_irq(&mapping->tree_lock);
+ entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot);
+ if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) ||
+ !slot_locked(mapping, slot))) {
+ spin_unlock_irq(&mapping->tree_lock);
+ return;
+ }
+ unlock_slot(mapping, slot);
+ spin_unlock_irq(&mapping->tree_lock);
+ dax_wake_mapping_entry_waiter(mapping, index, entry, false);
+}
+
static void put_locked_mapping_entry(struct address_space *mapping,
pgoff_t index, void *entry)
{
@@ -434,22 +452,6 @@ void dax_wake_mapping_entry_waiter(struct address_space *mapping,
__wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
}
-void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index)
-{
- void *entry, **slot;
-
- spin_lock_irq(&mapping->tree_lock);
- entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot);
- if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) ||
- !slot_locked(mapping, slot))) {
- spin_unlock_irq(&mapping->tree_lock);
- return;
- }
- unlock_slot(mapping, slot);
- spin_unlock_irq(&mapping->tree_lock);
- dax_wake_mapping_entry_waiter(mapping, index, entry, false);
-}
-
/*
* Delete exceptional DAX entry at @index from @mapping. Wait for radix tree
* entry to get unlocked before deleting it.
@@ -501,10 +503,8 @@ static int dax_load_hole(struct address_space *mapping, void *entry,
/* This will replace locked radix tree entry with a hole page */
page = find_or_create_page(mapping, vmf->pgoff,
vmf->gfp_mask | __GFP_ZERO);
- if (!page) {
- put_locked_mapping_entry(mapping, vmf->pgoff, entry);
+ if (!page)
return VM_FAULT_OOM;
- }
vmf->page = page;
return VM_FAULT_LOCKED;
}
@@ -616,36 +616,107 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
return new_entry;
}
+static inline unsigned long
+pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma)
+{
+ unsigned long address;
+
+ address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+ VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
+ return address;
+}
+
+/* Walk all mappings of a given index of a file and writeprotect them */
+static void dax_mapping_entry_mkclean(struct address_space *mapping,
+ pgoff_t index, unsigned long pfn)
+{
+ struct vm_area_struct *vma;
+ pte_t *ptep;
+ pte_t pte;
+ spinlock_t *ptl;
+ bool changed;
+
+ i_mmap_lock_read(mapping);
+ vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) {
+ unsigned long address;
+
+ cond_resched();
+
+ if (!(vma->vm_flags & VM_SHARED))
+ continue;
+
+ address = pgoff_address(index, vma);
+ changed = false;
+ if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
+ continue;
+ if (pfn != pte_pfn(*ptep))
+ goto unlock;
+ if (!pte_dirty(*ptep) && !pte_write(*ptep))
+ goto unlock;
+
+ flush_cache_page(vma, address, pfn);
+ pte = ptep_clear_flush(vma, address, ptep);
+ pte = pte_wrprotect(pte);
+ pte = pte_mkclean(pte);
+ set_pte_at(vma->vm_mm, address, ptep, pte);
+ changed = true;
+unlock:
+ pte_unmap_unlock(ptep, ptl);
+
+ if (changed)
+ mmu_notifier_invalidate_page(vma->vm_mm, address);
+ }
+ i_mmap_unlock_read(mapping);
+}
+
static int dax_writeback_one(struct block_device *bdev,
struct address_space *mapping, pgoff_t index, void *entry)
{
struct radix_tree_root *page_tree = &mapping->page_tree;
- struct radix_tree_node *node;
struct blk_dax_ctl dax;
- void **slot;
+ void *entry2, **slot;
int ret = 0;
- spin_lock_irq(&mapping->tree_lock);
/*
- * Regular page slots are stabilized by the page lock even
- * without the tree itself locked. These unlocked entries
- * need verification under the tree lock.
+ * A page got tagged dirty in DAX mapping? Something is seriously
+ * wrong.
*/
- if (!__radix_tree_lookup(page_tree, index, &node, &slot))
- goto unlock;
- if (*slot != entry)
- goto unlock;
-
- /* another fsync thread may have already written back this entry */
- if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
- goto unlock;
+ if (WARN_ON(!radix_tree_exceptional_entry(entry)))
+ return -EIO;
+ spin_lock_irq(&mapping->tree_lock);
+ entry2 = get_unlocked_mapping_entry(mapping, index, &slot);
+ /* Entry got punched out / reallocated? */
+ if (!entry2 || !radix_tree_exceptional_entry(entry2))
+ goto put_unlocked;
+ /*
+ * Entry got reallocated elsewhere? No need to writeback. We have to
+ * compare sectors as we must not bail out due to difference in lockbit
+ * or entry type.
+ */
+ if (dax_radix_sector(entry2) != dax_radix_sector(entry))
+ goto put_unlocked;
if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
dax_is_zero_entry(entry))) {
ret = -EIO;
- goto unlock;
+ goto put_unlocked;
}
+ /* Another fsync thread may have already written back this entry */
+ if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
+ goto put_unlocked;
+ /* Lock the entry to serialize with page faults */
+ entry = lock_slot(mapping, slot);
+ /*
+ * We can clear the tag now but we have to be careful so that concurrent
+ * dax_writeback_one() calls for the same index cannot finish before we
+ * actually flush the caches. This is achieved as the calls will look
+ * at the entry only under tree_lock and once they do that they will
+ * see the entry locked and wait for it to unlock.
+ */
+ radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE);
+ spin_unlock_irq(&mapping->tree_lock);
+
/*
* Even if dax_writeback_mapping_range() was given a wbc->range_start
* in the middle of a PMD, the 'index' we are given will be aligned to
@@ -655,31 +726,40 @@ static int dax_writeback_one(struct block_device *bdev,
*/
dax.sector = dax_radix_sector(entry);
dax.size = PAGE_SIZE << dax_radix_order(entry);
- spin_unlock_irq(&mapping->tree_lock);
/*
* We cannot hold tree_lock while calling dax_map_atomic() because it
* eventually calls cond_resched().
*/
ret = dax_map_atomic(bdev, &dax);
- if (ret < 0)
+ if (ret < 0) {
+ put_locked_mapping_entry(mapping, index, entry);
return ret;
+ }
if (WARN_ON_ONCE(ret < dax.size)) {
ret = -EIO;
goto unmap;
}
+ dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(dax.pfn));
wb_cache_pmem(dax.addr, dax.size);
-
+ /*
+ * After we have flushed the cache, we can clear the dirty tag. There
+ * cannot be new dirty data in the pfn after the flush has completed as
+ * the pfn mappings are writeprotected and fault waits for mapping
+ * entry lock.
+ */
spin_lock_irq(&mapping->tree_lock);
- radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE);
+ radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_DIRTY);
spin_unlock_irq(&mapping->tree_lock);
unmap:
dax_unmap_atomic(bdev, &dax);
+ put_locked_mapping_entry(mapping, index, entry);
return ret;
- unlock:
+ put_unlocked:
+ put_unlocked_mapping_entry(mapping, index, entry2);
spin_unlock_irq(&mapping->tree_lock);
return ret;
}
@@ -739,7 +819,7 @@ static int dax_insert_mapping(struct address_space *mapping,
struct block_device *bdev, sector_t sector, size_t size,
void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf)
{
- unsigned long vaddr = (unsigned long)vmf->virtual_address;
+ unsigned long vaddr = vmf->address;
struct blk_dax_ctl dax = {
.sector = sector,
.size = size,
@@ -768,17 +848,27 @@ int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
struct file *file = vma->vm_file;
struct address_space *mapping = file->f_mapping;
- void *entry;
+ void *entry, **slot;
pgoff_t index = vmf->pgoff;
spin_lock_irq(&mapping->tree_lock);
- entry = get_unlocked_mapping_entry(mapping, index, NULL);
- if (!entry || !radix_tree_exceptional_entry(entry))
- goto out;
+ entry = get_unlocked_mapping_entry(mapping, index, &slot);
+ if (!entry || !radix_tree_exceptional_entry(entry)) {
+ if (entry)
+ put_unlocked_mapping_entry(mapping, index, entry);
+ spin_unlock_irq(&mapping->tree_lock);
+ return VM_FAULT_NOPAGE;
+ }
radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
- put_unlocked_mapping_entry(mapping, index, entry);
-out:
+ entry = lock_slot(mapping, slot);
spin_unlock_irq(&mapping->tree_lock);
+ /*
+ * If we race with somebody updating the PTE and finish_mkwrite_fault()
+ * fails, we don't care. We need to return VM_FAULT_NOPAGE and retry
+ * the fault in either case.
+ */
+ finish_mkwrite_fault(vmf);
+ put_locked_mapping_entry(mapping, index, entry);
return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
@@ -949,13 +1039,13 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
{
struct address_space *mapping = vma->vm_file->f_mapping;
struct inode *inode = mapping->host;
- unsigned long vaddr = (unsigned long)vmf->virtual_address;
+ unsigned long vaddr = vmf->address;
loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
sector_t sector;
struct iomap iomap = { 0 };
unsigned flags = IOMAP_FAULT;
int error, major = 0;
- int locked_status = 0;
+ int vmf_ret = 0;
void *entry;
/*
@@ -1008,13 +1098,11 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
if (error)
goto finish_iomap;
- if (!radix_tree_exceptional_entry(entry)) {
- vmf->page = entry;
- locked_status = VM_FAULT_LOCKED;
- } else {
- vmf->entry = entry;
- locked_status = VM_FAULT_DAX_LOCKED;
- }
+
+ __SetPageUptodate(vmf->cow_page);
+ vmf_ret = finish_fault(vmf);
+ if (!vmf_ret)
+ vmf_ret = VM_FAULT_DONE_COW;
goto finish_iomap;
}
@@ -1031,7 +1119,7 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
case IOMAP_UNWRITTEN:
case IOMAP_HOLE:
if (!(vmf->flags & FAULT_FLAG_WRITE)) {
- locked_status = dax_load_hole(mapping, entry, vmf);
+ vmf_ret = dax_load_hole(mapping, entry, vmf);
break;
}
/*FALLTHRU*/
@@ -1043,7 +1131,7 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
finish_iomap:
if (ops->iomap_end) {
- if (error) {
+ if (error || (vmf_ret & VM_FAULT_ERROR)) {
/* keep previous error */
ops->iomap_end(inode, pos, PAGE_SIZE, 0, flags,
&iomap);
@@ -1053,7 +1141,7 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
}
}
unlock_entry:
- if (!locked_status || error)
+ if (vmf_ret != VM_FAULT_LOCKED || error)
put_locked_mapping_entry(mapping, vmf->pgoff, entry);
out:
if (error == -ENOMEM)
@@ -1061,9 +1149,9 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
/* -EBUSY is fine, somebody else faulted on the same PTE */
if (error < 0 && error != -EBUSY)
return VM_FAULT_SIGBUS | major;
- if (locked_status) {
+ if (vmf_ret) {
WARN_ON_ONCE(error); /* -EBUSY from ops->iomap_end? */
- return locked_status;
+ return vmf_ret;
}
return VM_FAULT_NOPAGE | major;
}
diff --git a/fs/exec.c b/fs/exec.c
index 88b5e1efdbd6..8112eacf10f3 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -209,7 +209,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
* doing the exec and bprm->mm is the new process's mm.
*/
ret = get_user_pages_remote(current, bprm->mm, pos, 1, gup_flags,
- &page, NULL);
+ &page, NULL, NULL);
if (ret <= 0)
return NULL;
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 85959d8324df..d96e2f30084b 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -257,9 +257,9 @@ out:
* fatal_signal_pending()s, and the mmap_sem must be released before
* returning it.
*/
-int handle_userfault(struct fault_env *fe, unsigned long reason)
+int handle_userfault(struct vm_fault *vmf, unsigned long reason)
{
- struct mm_struct *mm = fe->vma->vm_mm;
+ struct mm_struct *mm = vmf->vma->vm_mm;
struct userfaultfd_ctx *ctx;
struct userfaultfd_wait_queue uwq;
int ret;
@@ -268,7 +268,7 @@ int handle_userfault(struct fault_env *fe, unsigned long reason)
BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
ret = VM_FAULT_SIGBUS;
- ctx = fe->vma->vm_userfaultfd_ctx.ctx;
+ ctx = vmf->vma->vm_userfaultfd_ctx.ctx;
if (!ctx)
goto out;
@@ -301,17 +301,18 @@ int handle_userfault(struct fault_env *fe, unsigned long reason)
* without first stopping userland access to the memory. For
* VM_UFFD_MISSING userfaults this is enough for now.
*/
- if (unlikely(!(fe->flags & FAULT_FLAG_ALLOW_RETRY))) {
+ if (unlikely(!(vmf->flags & FAULT_FLAG_ALLOW_RETRY))) {
/*
* Validate the invariant that nowait must allow retry
* to be sure not to return SIGBUS erroneously on
* nowait invocations.
*/
- BUG_ON(fe->flags & FAULT_FLAG_RETRY_NOWAIT);
+ BUG_ON(vmf->flags & FAULT_FLAG_RETRY_NOWAIT);
#ifdef CONFIG_DEBUG_VM
if (printk_ratelimit()) {
printk(KERN_WARNING
- "FAULT_FLAG_ALLOW_RETRY missing %x\n", fe->flags);
+ "FAULT_FLAG_ALLOW_RETRY missing %x\n",
+ vmf->flags);
dump_stack();
}
#endif
@@ -323,7 +324,7 @@ int handle_userfault(struct fault_env *fe, unsigned long reason)
* and wait.
*/
ret = VM_FAULT_RETRY;
- if (fe->flags & FAULT_FLAG_RETRY_NOWAIT)
+ if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
goto out;
/* take the reference before dropping the mmap_sem */
@@ -331,11 +332,11 @@ int handle_userfault(struct fault_env *fe, unsigned long reason)
init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
uwq.wq.private = current;
- uwq.msg = userfault_msg(fe->address, fe->flags, reason);
+ uwq.msg = userfault_msg(vmf->address, vmf->flags, reason);
uwq.ctx = ctx;
return_to_userland =
- (fe->flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) ==
+ (vmf->flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) ==
(FAULT_FLAG_USER|FAULT_FLAG_KILLABLE);
spin_lock(&ctx->fault_pending_wqh.lock);
@@ -353,7 +354,8 @@ int handle_userfault(struct fault_env *fe, unsigned long reason)
TASK_KILLABLE);
spin_unlock(&ctx->fault_pending_wqh.lock);
- must_wait = userfaultfd_must_wait(ctx, fe->address, fe->flags, reason);
+ must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags,
+ reason);
up_read(&mm->mmap_sem);
if (likely(must_wait && !ACCESS_ONCE(ctx->released) &&
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 0afade8bd3d7..f97bcfe79472 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -46,7 +46,6 @@ void dax_wake_mapping_entry_waiter(struct address_space *mapping,
#ifdef CONFIG_FS_DAX
struct page *read_dax_sector(struct block_device *bdev, sector_t n);
-void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index);
int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
unsigned int offset, unsigned int length);
#else
@@ -55,12 +54,6 @@ static inline struct page *read_dax_sector(struct block_device *bdev,
{
return ERR_PTR(-ENXIO);
}
-/* Shouldn't ever be called when dax is disabled. */
-static inline void dax_unlock_mapping_entry(struct address_space *mapping,
- pgoff_t index)
-{
- BUG();
-}
static inline int __dax_zero_page_range(struct block_device *bdev,
sector_t sector, unsigned int offset, unsigned int length)
{
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 1f782aa1d8e6..97e478d6b690 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -1,12 +1,12 @@
#ifndef _LINUX_HUGE_MM_H
#define _LINUX_HUGE_MM_H
-extern int do_huge_pmd_anonymous_page(struct fault_env *fe);
+extern int do_huge_pmd_anonymous_page(struct vm_fault *vmf);
extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
struct vm_area_struct *vma);
-extern void huge_pmd_set_accessed(struct fault_env *fe, pmd_t orig_pmd);
-extern int do_huge_pmd_wp_page(struct fault_env *fe, pmd_t orig_pmd);
+extern void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd);
+extern int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd);
extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
unsigned long addr,
pmd_t *pmd,
@@ -142,7 +142,7 @@ static inline int hpage_nr_pages(struct page *page)
return 1;
}
-extern int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t orig_pmd);
+extern int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd);
extern struct page *huge_zero_page;
@@ -212,7 +212,7 @@ static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd,
return NULL;
}
-static inline int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t orig_pmd)
+static inline int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd)
{
return 0;
}
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0b5b2e4df14e..4424784ac374 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -292,36 +292,23 @@ extern pgprot_t protection_map[16];
* pgoff should be used in favour of virtual_address, if possible.
*/
struct vm_fault {
+ struct vm_area_struct *vma; /* Target VMA */
unsigned int flags; /* FAULT_FLAG_xxx flags */
gfp_t gfp_mask; /* gfp mask to be used for allocations */
pgoff_t pgoff; /* Logical page offset based on vma */
- void __user *virtual_address; /* Faulting virtual address */
+ unsigned long address; /* Faulting virtual address */
+ pmd_t *pmd; /* Pointer to pmd entry matching
+ * the 'address' */
+ pte_t orig_pte; /* Value of PTE at the time of fault */
- struct page *cow_page; /* Handler may choose to COW */
+ struct page *cow_page; /* Page handler may use for COW fault */
+ struct mem_cgroup *memcg; /* Cgroup cow_page belongs to */
struct page *page; /* ->fault handlers should return a
* page here, unless VM_FAULT_NOPAGE
* is set (which is also implied by
* VM_FAULT_ERROR).
*/
- void *entry; /* ->fault handler can alternatively
- * return locked DAX entry. In that
- * case handler should return
- * VM_FAULT_DAX_LOCKED and fill in
- * entry here.
- */
-};
-
-/*
- * Page fault context: passes though page fault handler instead of endless list
- * of function arguments.
- */
-struct fault_env {
- struct vm_area_struct *vma; /* Target VMA */
- unsigned long address; /* Faulting virtual address */
- unsigned int flags; /* FAULT_FLAG_xxx flags */
- pmd_t *pmd; /* Pointer to pmd entry matching
- * the 'address'
- */
+ /* These three entries are valid only while holding ptl lock */
pte_t *pte; /* Pointer to pte entry matching
* the 'address'. NULL if the page
* table hasn't been allocated.
@@ -351,7 +338,7 @@ struct vm_operations_struct {
int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf);
int (*pmd_fault)(struct vm_area_struct *, unsigned long address,
pmd_t *, unsigned int flags);
- void (*map_pages)(struct fault_env *fe,
+ void (*map_pages)(struct vm_fault *vmf,
pgoff_t start_pgoff, pgoff_t end_pgoff);
/* notification that a previously read-only page is about to become
@@ -625,8 +612,10 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
return pte;
}
-int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg,
+int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
struct page *page);
+int finish_fault(struct vm_fault *vmf);
+int finish_mkwrite_fault(struct vm_fault *vmf);
#endif
/*
@@ -1110,7 +1099,7 @@ static inline void clear_page_pfmemalloc(struct page *page)
#define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */
#define VM_FAULT_RETRY 0x0400 /* ->fault blocked, must retry */
#define VM_FAULT_FALLBACK 0x0800 /* huge page fault failed, fall back to small */
-#define VM_FAULT_DAX_LOCKED 0x1000 /* ->fault has locked DAX entry */
+#define VM_FAULT_DONE_COW 0x1000 /* ->fault has fully handled COW */
#define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */
@@ -1221,6 +1210,8 @@ int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
struct vm_area_struct *vma);
void unmap_mapping_range(struct address_space *mapping,
loff_t const holebegin, loff_t const holelen, int even_cows);
+int follow_pte(struct mm_struct *mm, unsigned long address, pte_t **ptepp,
+ spinlock_t **ptlp);
int follow_pfn(struct vm_area_struct *vma, unsigned long address,
unsigned long *pfn);
int follow_phys(struct vm_area_struct *vma, unsigned long address,
@@ -1276,15 +1267,12 @@ extern int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
- struct vm_area_struct **vmas);
+ struct vm_area_struct **vmas, int *locked);
long get_user_pages(unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas);
long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages, int *locked);
-long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
- unsigned long start, unsigned long nr_pages,
- struct page **pages, unsigned int gup_flags);
long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
struct page **pages, unsigned int gup_flags);
int get_user_pages_fast(unsigned long start, int nr_pages, int write,
@@ -2099,7 +2087,7 @@ extern void truncate_inode_pages_final(struct address_space *);
/* generic vm_area_ops exported for stackable file systems */
extern int filemap_fault(struct vm_area_struct *, struct vm_fault *);
-extern void filemap_map_pages(struct fault_env *fe,
+extern void filemap_map_pages(struct vm_fault *vmf,
pgoff_t start_pgoff, pgoff_t end_pgoff);
extern int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index a78c35cff1ae..aacca824a6ae 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -7,6 +7,23 @@
#include <linux/sched.h>
#include <asm/irq.h>
+/*
+ * The run state of the lockup detectors is controlled by the content of the
+ * 'watchdog_enabled' variable. Each lockup detector has its dedicated bit -
+ * bit 0 for the hard lockup detector and bit 1 for the soft lockup detector.
+ *
+ * 'watchdog_user_enabled', 'nmi_watchdog_enabled' and 'soft_watchdog_enabled'
+ * are variables that are only used as an 'interface' between the parameters
+ * in /proc/sys/kernel and the internal state bits in 'watchdog_enabled'. The
+ * 'watchdog_thresh' variable is handled differently because its value is not
+ * boolean, and the lockup detectors are 'suspended' while 'watchdog_thresh'
+ * is equal zero.
+ */
+#define NMI_WATCHDOG_ENABLED_BIT 0
+#define SOFT_WATCHDOG_ENABLED_BIT 1
+#define NMI_WATCHDOG_ENABLED (1 << NMI_WATCHDOG_ENABLED_BIT)
+#define SOFT_WATCHDOG_ENABLED (1 << SOFT_WATCHDOG_ENABLED_BIT)
+
/**
* touch_nmi_watchdog - restart NMI watchdog timeout.
*
@@ -91,9 +108,16 @@ extern int nmi_watchdog_enabled;
extern int soft_watchdog_enabled;
extern int watchdog_user_enabled;
extern int watchdog_thresh;
+extern unsigned long watchdog_enabled;
extern unsigned long *watchdog_cpumask_bits;
+#ifdef CONFIG_SMP
extern int sysctl_softlockup_all_cpu_backtrace;
extern int sysctl_hardlockup_all_cpu_backtrace;
+#else
+#define sysctl_softlockup_all_cpu_backtrace 0
+#define sysctl_hardlockup_all_cpu_backtrace 0
+#endif
+extern bool is_hardlockup(void);
struct ctl_table;
extern int proc_watchdog(struct ctl_table *, int ,
void __user *, size_t *, loff_t *);
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index dd66a952e8cd..11b92b047a1e 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -27,7 +27,7 @@
#define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK)
#define UFFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS)
-extern int handle_userfault(struct fault_env *fe, unsigned long reason);
+extern int handle_userfault(struct vm_fault *vmf, unsigned long reason);
extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
unsigned long src_start, unsigned long len);
@@ -55,7 +55,7 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma)
#else /* CONFIG_USERFAULTFD */
/* mm helpers */
-static inline int handle_userfault(struct fault_env *fe, unsigned long reason)
+static inline int handle_userfault(struct vm_fault *vmf, unsigned long reason)
{
return VM_FAULT_SIGBUS;
}
diff --git a/ipc/sem.c b/ipc/sem.c
index e84703d5024e..31eaa87ecba4 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -11,6 +11,7 @@
* (c) 2001 Red Hat Inc
* Lockless wakeup
* (c) 2003 Manfred Spraul <manfred@colorfullife.com>
+ * (c) 2016 Davidlohr Bueso <dave@stgolabs.net>
* Further wakeup optimizations, documentation
* (c) 2010 Manfred Spraul <manfred@colorfullife.com>
*
@@ -53,15 +54,11 @@
* Semaphores are actively given to waiting tasks (necessary for FIFO).
* (see update_queue())
* - To improve the scalability, the actual wake-up calls are performed after
- * dropping all locks. (see wake_up_sem_queue_prepare(),
- * wake_up_sem_queue_do())
+ * dropping all locks. (see wake_up_sem_queue_prepare())
* - All work is done by the waker, the woken up task does not have to do
* anything - not even acquiring a lock or dropping a refcount.
* - A woken up task may not even touch the semaphore array anymore, it may
* have been destroyed already by a semctl(RMID).
- * - The synchronizations between wake-ups due to a timeout/signal and a
- * wake-up due to a completed semaphore operation is achieved by using an
- * intermediate state (IN_WAKEUP).
* - UNDO values are stored in an array (one per process and per
* semaphore array, lazily allocated). For backwards compatibility, multiple
* modes for the UNDO variables are supported (per process, per thread)
@@ -118,7 +115,8 @@ struct sem_queue {
struct sembuf *sops; /* array of pending operations */
struct sembuf *blocking; /* the operation that blocked */
int nsops; /* number of operations */
- int alter; /* does *sops alter the array? */
+ bool alter; /* does *sops alter the array? */
+ bool dupsop; /* sops on more than one sem_num */
};
/* Each task has a list of undo requests. They are executed automatically
@@ -433,29 +431,6 @@ static inline void sem_unlock(struct sem_array *sma, int locknum)
*
* The caller holds the RCU read lock.
*/
-static inline struct sem_array *sem_obtain_lock(struct ipc_namespace *ns,
- int id, struct sembuf *sops, int nsops, int *locknum)
-{
- struct kern_ipc_perm *ipcp;
- struct sem_array *sma;
-
- ipcp = ipc_obtain_object_idr(&sem_ids(ns), id);
- if (IS_ERR(ipcp))
- return ERR_CAST(ipcp);
-
- sma = container_of(ipcp, struct sem_array, sem_perm);
- *locknum = sem_lock(sma, sops, nsops);
-
- /* ipc_rmid() may have already freed the ID while sem_lock
- * was spinning: verify that the structure is still valid
- */
- if (ipc_valid_object(ipcp))
- return container_of(ipcp, struct sem_array, sem_perm);
-
- sem_unlock(sma, *locknum);
- return ERR_PTR(-EINVAL);
-}
-
static inline struct sem_array *sem_obtain_object(struct ipc_namespace *ns, int id)
{
struct kern_ipc_perm *ipcp = ipc_obtain_object_idr(&sem_ids(ns), id);
@@ -488,40 +463,6 @@ static inline void sem_rmid(struct ipc_namespace *ns, struct sem_array *s)
ipc_rmid(&sem_ids(ns), &s->sem_perm);
}
-/*
- * Lockless wakeup algorithm:
- * Without the check/retry algorithm a lockless wakeup is possible:
- * - queue.status is initialized to -EINTR before blocking.
- * - wakeup is performed by
- * * unlinking the queue entry from the pending list
- * * setting queue.status to IN_WAKEUP
- * This is the notification for the blocked thread that a
- * result value is imminent.
- * * call wake_up_process
- * * set queue.status to the final value.
- * - the previously blocked thread checks queue.status:
- * * if it's IN_WAKEUP, then it must wait until the value changes
- * * if it's not -EINTR, then the operation was completed by
- * update_queue. semtimedop can return queue.status without
- * performing any operation on the sem array.
- * * otherwise it must acquire the spinlock and check what's up.
- *
- * The two-stage algorithm is necessary to protect against the following
- * races:
- * - if queue.status is set after wake_up_process, then the woken up idle
- * thread could race forward and try (and fail) to acquire sma->lock
- * before update_queue had a chance to set queue.status
- * - if queue.status is written before wake_up_process and if the
- * blocked process is woken up by a signal between writing
- * queue.status and the wake_up_process, then the woken up
- * process could return from semtimedop and die by calling
- * sys_exit before wake_up_process is called. Then wake_up_process
- * will oops, because the task structure is already invalid.
- * (yes, this happened on s390 with sysv msg).
- *
- */
-#define IN_WAKEUP 1
-
/**
* newary - Create a new semaphore set
* @ns: namespace
@@ -641,15 +582,23 @@ SYSCALL_DEFINE3(semget, key_t, key, int, nsems, int, semflg)
}
/**
- * perform_atomic_semop - Perform (if possible) a semaphore operation
+ * perform_atomic_semop[_slow] - Attempt to perform semaphore
+ * operations on a given array.
* @sma: semaphore array
* @q: struct sem_queue that describes the operation
*
+ * Caller blocking are as follows, based the value
+ * indicated by the semaphore operation (sem_op):
+ *
+ * (1) >0 never blocks.
+ * (2) 0 (wait-for-zero operation): semval is non-zero.
+ * (3) <0 attempting to decrement semval to a value smaller than zero.
+ *
* Returns 0 if the operation was possible.
* Returns 1 if the operation is impossible, the caller must sleep.
- * Negative values are error codes.
+ * Returns <0 for error codes.
*/
-static int perform_atomic_semop(struct sem_array *sma, struct sem_queue *q)
+static int perform_atomic_semop_slow(struct sem_array *sma, struct sem_queue *q)
{
int result, sem_op, nsops, pid;
struct sembuf *sop;
@@ -720,51 +669,84 @@ undo:
return result;
}
-/** wake_up_sem_queue_prepare(q, error): Prepare wake-up
- * @q: queue entry that must be signaled
- * @error: Error value for the signal
- *
- * Prepare the wake-up of the queue entry q.
- */
-static void wake_up_sem_queue_prepare(struct list_head *pt,
- struct sem_queue *q, int error)
+static int perform_atomic_semop(struct sem_array *sma, struct sem_queue *q)
{
- if (list_empty(pt)) {
- /*
- * Hold preempt off so that we don't get preempted and have the
- * wakee busy-wait until we're scheduled back on.
- */
- preempt_disable();
+ int result, sem_op, nsops;
+ struct sembuf *sop;
+ struct sem *curr;
+ struct sembuf *sops;
+ struct sem_undo *un;
+
+ sops = q->sops;
+ nsops = q->nsops;
+ un = q->undo;
+
+ if (unlikely(q->dupsop))
+ return perform_atomic_semop_slow(sma, q);
+
+ /*
+ * We scan the semaphore set twice, first to ensure that the entire
+ * operation can succeed, therefore avoiding any pointless writes
+ * to shared memory and having to undo such changes in order to block
+ * until the operations can go through.
+ */
+ for (sop = sops; sop < sops + nsops; sop++) {
+ curr = sma->sem_base + sop->sem_num;
+ sem_op = sop->sem_op;
+ result = curr->semval;
+
+ if (!sem_op && result)
+ goto would_block; /* wait-for-zero */
+
+ result += sem_op;
+ if (result < 0)
+ goto would_block;
+
+ if (result > SEMVMX)
+ return -ERANGE;
+
+ if (sop->sem_flg & SEM_UNDO) {
+ int undo = un->semadj[sop->sem_num] - sem_op;
+
+ /* Exceeding the undo range is an error. */
+ if (undo < (-SEMAEM - 1) || undo > SEMAEM)
+ return -ERANGE;
+ }
+ }
+
+ for (sop = sops; sop < sops + nsops; sop++) {
+ curr = sma->sem_base + sop->sem_num;
+ sem_op = sop->sem_op;
+ result = curr->semval;
+
+ if (sop->sem_flg & SEM_UNDO) {
+ int undo = un->semadj[sop->sem_num] - sem_op;
+
+ un->semadj[sop->sem_num] = undo;
+ }
+ curr->semval += sem_op;
+ curr->sempid = q->pid;
}
- q->status = IN_WAKEUP;
- q->pid = error;
- list_add_tail(&q->list, pt);
+ return 0;
+
+would_block:
+ q->blocking = sop;
+ return sop->sem_flg & IPC_NOWAIT ? -EAGAIN : 1;
}
-/**
- * wake_up_sem_queue_do - do the actual wake-up
- * @pt: list of tasks to be woken up
- *
- * Do the actual wake-up.
- * The function is called without any locks held, thus the semaphore array
- * could be destroyed already and the tasks can disappear as soon as the
- * status is set to the actual return code.
- */
-static void wake_up_sem_queue_do(struct list_head *pt)
+static inline void wake_up_sem_queue_prepare(struct sem_queue *q, int error,
+ struct wake_q_head *wake_q)
{
- struct sem_queue *q, *t;
- int did_something;
-
- did_something = !list_empty(pt);
- list_for_each_entry_safe(q, t, pt, list) {
- wake_up_process(q->sleeper);
- /* q can disappear immediately after writing q->status. */
- smp_wmb();
- q->status = q->pid;
- }
- if (did_something)
- preempt_enable();
+ wake_q_add(wake_q, q->sleeper);
+ /*
+ * Rely on the above implicit barrier, such that we can
+ * ensure that we hold reference to the task before setting
+ * q->status. Otherwise we could race with do_exit if the
+ * task is awoken by an external event before calling
+ * wake_up_process().
+ */
+ WRITE_ONCE(q->status, error);
}
static void unlink_queue(struct sem_array *sma, struct sem_queue *q)
@@ -784,7 +766,7 @@ static void unlink_queue(struct sem_array *sma, struct sem_queue *q)
* modified the array.
* Note that wait-for-zero operations are handled without restart.
*/
-static int check_restart(struct sem_array *sma, struct sem_queue *q)
+static inline int check_restart(struct sem_array *sma, struct sem_queue *q)
{
/* pending complex alter operations are too difficult to analyse */
if (!list_empty(&sma->pending_alter))
@@ -812,21 +794,20 @@ static int check_restart(struct sem_array *sma, struct sem_queue *q)
* wake_const_ops - wake up non-alter tasks
* @sma: semaphore array.
* @semnum: semaphore that was modified.
- * @pt: list head for the tasks that must be woken up.
+ * @wake_q: lockless wake-queue head.
*
* wake_const_ops must be called after a semaphore in a semaphore array
* was set to 0. If complex const operations are pending, wake_const_ops must
* be called with semnum = -1, as well as with the number of each modified
* semaphore.
- * The tasks that must be woken up are added to @pt. The return code
+ * The tasks that must be woken up are added to @wake_q. The return code
* is stored in q->pid.
* The function returns 1 if at least one operation was completed successfully.
*/
static int wake_const_ops(struct sem_array *sma, int semnum,
- struct list_head *pt)
+ struct wake_q_head *wake_q)
{
- struct sem_queue *q;
- struct list_head *walk;
+ struct sem_queue *q, *tmp;
struct list_head *pending_list;
int semop_completed = 0;
@@ -835,25 +816,19 @@ static int wake_const_ops(struct sem_array *sma, int semnum,
else
pending_list = &sma->sem_base[semnum].pending_const;
- walk = pending_list->next;
- while (walk != pending_list) {
- int error;
-
- q = container_of(walk, struct sem_queue, list);
- walk = walk->next;
-
- error = perform_atomic_semop(sma, q);
-
- if (error <= 0) {
- /* operation completed, remove from queue & wakeup */
+ list_for_each_entry_safe(q, tmp, pending_list, list) {
+ int error = perform_atomic_semop(sma, q);
- unlink_queue(sma, q);
+ if (error > 0)
+ continue;
+ /* operation completed, remove from queue & wakeup */
+ unlink_queue(sma, q);
- wake_up_sem_queue_prepare(pt, q, error);
- if (error == 0)
- semop_completed = 1;
- }
+ wake_up_sem_queue_prepare(q, error, wake_q);
+ if (error == 0)
+ semop_completed = 1;
}
+
return semop_completed;
}
@@ -862,14 +837,14 @@ static int wake_const_ops(struct sem_array *sma, int semnum,
* @sma: semaphore array
* @sops: operations that were performed
* @nsops: number of operations
- * @pt: list head of the tasks that must be woken up.
+ * @wake_q: lockless wake-queue head
*
* Checks all required queue for wait-for-zero operations, based
* on the actual changes that were performed on the semaphore array.
* The function returns 1 if at least one operation was completed successfully.
*/
static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops,
- int nsops, struct list_head *pt)
+ int nsops, struct wake_q_head *wake_q)
{
int i;
int semop_completed = 0;
@@ -882,7 +857,7 @@ static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops,
if (sma->sem_base[num].semval == 0) {
got_zero = 1;
- semop_completed |= wake_const_ops(sma, num, pt);
+ semop_completed |= wake_const_ops(sma, num, wake_q);
}
}
} else {
@@ -893,7 +868,7 @@ static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops,
for (i = 0; i < sma->sem_nsems; i++) {
if (sma->sem_base[i].semval == 0) {
got_zero = 1;
- semop_completed |= wake_const_ops(sma, i, pt);
+ semop_completed |= wake_const_ops(sma, i, wake_q);
}
}
}
@@ -902,7 +877,7 @@ static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops,
* then check the global queue, too.
*/
if (got_zero)
- semop_completed |= wake_const_ops(sma, -1, pt);
+ semop_completed |= wake_const_ops(sma, -1, wake_q);
return semop_completed;
}
@@ -912,22 +887,21 @@ static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops,
* update_queue - look for tasks that can be completed.
* @sma: semaphore array.
* @semnum: semaphore that was modified.
- * @pt: list head for the tasks that must be woken up.
+ * @wake_q: lockless wake-queue head.
*
* update_queue must be called after a semaphore in a semaphore array
* was modified. If multiple semaphores were modified, update_queue must
* be called with semnum = -1, as well as with the number of each modified
* semaphore.
- * The tasks that must be woken up are added to @pt. The return code
+ * The tasks that must be woken up are added to @wake_q. The return code
* is stored in q->pid.
* The function internally checks if const operations can now succeed.
*
* The function return 1 if at least one semop was completed successfully.
*/
-static int update_queue(struct sem_array *sma, int semnum, struct list_head *pt)
+static int update_queue(struct sem_array *sma, int semnum, struct wake_q_head *wake_q)
{
- struct sem_queue *q;
- struct list_head *walk;
+ struct sem_queue *q, *tmp;
struct list_head *pending_list;
int semop_completed = 0;
@@ -937,13 +911,9 @@ static int update_queue(struct sem_array *sma, int semnum, struct list_head *pt)
pending_list = &sma->sem_base[semnum].pending_alter;
again:
- walk = pending_list->next;
- while (walk != pending_list) {
+ list_for_each_entry_safe(q, tmp, pending_list, list) {
int error, restart;
- q = container_of(walk, struct sem_queue, list);
- walk = walk->next;
-
/* If we are scanning the single sop, per-semaphore list of
* one semaphore and that semaphore is 0, then it is not
* necessary to scan further: simple increments
@@ -966,11 +936,11 @@ again:
restart = 0;
} else {
semop_completed = 1;
- do_smart_wakeup_zero(sma, q->sops, q->nsops, pt);
+ do_smart_wakeup_zero(sma, q->sops, q->nsops, wake_q);
restart = check_restart(sma, q);
}
- wake_up_sem_queue_prepare(pt, q, error);
+ wake_up_sem_queue_prepare(q, error, wake_q);
if (restart)
goto again;
}
@@ -1001,24 +971,24 @@ static void set_semotime(struct sem_array *sma, struct sembuf *sops)
* @sops: operations that were performed
* @nsops: number of operations
* @otime: force setting otime
- * @pt: list head of the tasks that must be woken up.
+ * @wake_q: lockless wake-queue head
*
* do_smart_update() does the required calls to update_queue and wakeup_zero,
* based on the actual changes that were performed on the semaphore array.
* Note that the function does not do the actual wake-up: the caller is
- * responsible for calling wake_up_sem_queue_do(@pt).
+ * responsible for calling wake_up_q().
* It is safe to perform this call after dropping all locks.
*/
static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsops,
- int otime, struct list_head *pt)
+ int otime, struct wake_q_head *wake_q)
{
int i;
- otime |= do_smart_wakeup_zero(sma, sops, nsops, pt);
+ otime |= do_smart_wakeup_zero(sma, sops, nsops, wake_q);
if (!list_empty(&sma->pending_alter)) {
/* semaphore array uses the global queue - just process it. */
- otime |= update_queue(sma, -1, pt);
+ otime |= update_queue(sma, -1, wake_q);
} else {
if (!sops) {
/*
@@ -1026,7 +996,7 @@ static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsop
* known. Check all.
*/
for (i = 0; i < sma->sem_nsems; i++)
- otime |= update_queue(sma, i, pt);
+ otime |= update_queue(sma, i, wake_q);
} else {
/*
* Check the semaphores that were increased:
@@ -1040,7 +1010,7 @@ static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsop
for (i = 0; i < nsops; i++) {
if (sops[i].sem_op > 0) {
otime |= update_queue(sma,
- sops[i].sem_num, pt);
+ sops[i].sem_num, wake_q);
}
}
}
@@ -1128,8 +1098,8 @@ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
struct sem_undo *un, *tu;
struct sem_queue *q, *tq;
struct sem_array *sma = container_of(ipcp, struct sem_array, sem_perm);
- struct list_head tasks;
int i;
+ DEFINE_WAKE_Q(wake_q);
/* Free the existing undo structures for this semaphore set. */
ipc_assert_locked_object(&sma->sem_perm);
@@ -1143,25 +1113,24 @@ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
}
/* Wake up all pending processes and let them fail with EIDRM. */
- INIT_LIST_HEAD(&tasks);
list_for_each_entry_safe(q, tq, &sma->pending_const, list) {
unlink_queue(sma, q);
- wake_up_sem_queue_prepare(&tasks, q, -EIDRM);
+ wake_up_sem_queue_prepare(q, -EIDRM, &wake_q);
}
list_for_each_entry_safe(q, tq, &sma->pending_alter, list) {
unlink_queue(sma, q);
- wake_up_sem_queue_prepare(&tasks, q, -EIDRM);
+ wake_up_sem_queue_prepare(q, -EIDRM, &wake_q);
}
for (i = 0; i < sma->sem_nsems; i++) {
struct sem *sem = sma->sem_base + i;
list_for_each_entry_safe(q, tq, &sem->pending_const, list) {
unlink_queue(sma, q);
- wake_up_sem_queue_prepare(&tasks, q, -EIDRM);
+ wake_up_sem_queue_prepare(q, -EIDRM, &wake_q);
}
list_for_each_entry_safe(q, tq, &sem->pending_alter, list) {
unlink_queue(sma, q);
- wake_up_sem_queue_prepare(&tasks, q, -EIDRM);
+ wake_up_sem_queue_prepare(q, -EIDRM, &wake_q);
}
}
@@ -1170,7 +1139,7 @@ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
sem_unlock(sma, -1);
rcu_read_unlock();
- wake_up_sem_queue_do(&tasks);
+ wake_up_q(&wake_q);
ns->used_sems -= sma->sem_nsems;
ipc_rcu_putref(sma, sem_rcu_free);
}
@@ -1309,9 +1278,9 @@ static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum,
struct sem_undo *un;
struct sem_array *sma;
struct sem *curr;
- int err;
- struct list_head tasks;
- int val;
+ int err, val;
+ DEFINE_WAKE_Q(wake_q);
+
#if defined(CONFIG_64BIT) && defined(__BIG_ENDIAN)
/* big-endian 64bit */
val = arg >> 32;
@@ -1323,8 +1292,6 @@ static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum,
if (val > SEMVMX || val < 0)
return -ERANGE;
- INIT_LIST_HEAD(&tasks);
-
rcu_read_lock();
sma = sem_obtain_object_check(ns, semid);
if (IS_ERR(sma)) {
@@ -1367,10 +1334,10 @@ static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum,
curr->sempid = task_tgid_vnr(current);
sma->sem_ctime = get_seconds();
/* maybe some queued-up processes were waiting for this */
- do_smart_update(sma, NULL, 0, 0, &tasks);
+ do_smart_update(sma, NULL, 0, 0, &wake_q);
sem_unlock(sma, -1);
rcu_read_unlock();
- wake_up_sem_queue_do(&tasks);
+ wake_up_q(&wake_q);
return 0;
}
@@ -1382,9 +1349,7 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,
int err, nsems;
ushort fast_sem_io[SEMMSL_FAST];
ushort *sem_io = fast_sem_io;
- struct list_head tasks;
-
- INIT_LIST_HEAD(&tasks);
+ DEFINE_WAKE_Q(wake_q);
rcu_read_lock();
sma = sem_obtain_object_check(ns, semid);
@@ -1495,7 +1460,7 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,
}
sma->sem_ctime = get_seconds();
/* maybe some queued-up processes were waiting for this */
- do_smart_update(sma, NULL, 0, 0, &tasks);
+ do_smart_update(sma, NULL, 0, 0, &wake_q);
err = 0;
goto out_unlock;
}
@@ -1531,7 +1496,7 @@ out_unlock:
sem_unlock(sma, -1);
out_rcu_wakeup:
rcu_read_unlock();
- wake_up_sem_queue_do(&tasks);
+ wake_up_q(&wake_q);
out_free:
if (sem_io != fast_sem_io)
ipc_free(sem_io);
@@ -1804,32 +1769,6 @@ out:
return un;
}
-
-/**
- * get_queue_result - retrieve the result code from sem_queue
- * @q: Pointer to queue structure
- *
- * Retrieve the return code from the pending queue. If IN_WAKEUP is found in
- * q->status, then we must loop until the value is replaced with the final
- * value: This may happen if a task is woken up by an unrelated event (e.g.
- * signal) and in parallel the task is woken up by another task because it got
- * the requested semaphores.
- *
- * The function can be called with or without holding the semaphore spinlock.
- */
-static int get_queue_result(struct sem_queue *q)
-{
- int error;
-
- error = q->status;
- while (unlikely(error == IN_WAKEUP)) {
- cpu_relax();
- error = q->status;
- }
-
- return error;
-}
-
SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
unsigned, nsops, const struct timespec __user *, timeout)
{
@@ -1838,11 +1777,11 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
struct sembuf fast_sops[SEMOPM_FAST];
struct sembuf *sops = fast_sops, *sop;
struct sem_undo *un;
- int undos = 0, alter = 0, max, locknum;
+ int max, locknum;
+ bool undos = false, alter = false, dupsop = false;
struct sem_queue queue;
- unsigned long jiffies_left = 0;
+ unsigned long dup = 0, jiffies_left = 0;
struct ipc_namespace *ns;
- struct list_head tasks;
ns = current->nsproxy->ipc_ns;
@@ -1855,10 +1794,12 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
if (sops == NULL)
return -ENOMEM;
}
+
if (copy_from_user(sops, tsops, nsops * sizeof(*tsops))) {
error = -EFAULT;
goto out_free;
}
+
if (timeout) {
struct timespec _timeout;
if (copy_from_user(&_timeout, timeout, sizeof(*timeout))) {
@@ -1872,18 +1813,30 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
}
jiffies_left = timespec_to_jiffies(&_timeout);
}
+
max = 0;
for (sop = sops; sop < sops + nsops; sop++) {
+ unsigned long mask = 1ULL << ((sop->sem_num) % BITS_PER_LONG);
+
if (sop->sem_num >= max)
max = sop->sem_num;
if (sop->sem_flg & SEM_UNDO)
- undos = 1;
- if (sop->sem_op != 0)
- alter = 1;
+ undos = true;
+ if (dup & mask) {
+ /*
+ * There was a previous alter access that appears
+ * to have accessed the same semaphore, thus use
+ * the dupsop logic. "appears", because the detection
+ * can only check % BITS_PER_LONG.
+ */
+ dupsop = true;
+ }
+ if (sop->sem_op != 0) {
+ alter = true;
+ dup |= mask;
+ }
}
- INIT_LIST_HEAD(&tasks);
-
if (undos) {
/* On success, find_alloc_undo takes the rcu_read_lock */
un = find_alloc_undo(ns, semid);
@@ -1904,16 +1857,22 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
}
error = -EFBIG;
- if (max >= sma->sem_nsems)
- goto out_rcu_wakeup;
+ if (max >= sma->sem_nsems) {
+ rcu_read_unlock();
+ goto out_free;
+ }
error = -EACCES;
- if (ipcperms(ns, &sma->sem_perm, alter ? S_IWUGO : S_IRUGO))
- goto out_rcu_wakeup;
+ if (ipcperms(ns, &sma->sem_perm, alter ? S_IWUGO : S_IRUGO)) {
+ rcu_read_unlock();
+ goto out_free;
+ }
error = security_sem_semop(sma, sops, nsops, alter);
- if (error)
- goto out_rcu_wakeup;
+ if (error) {
+ rcu_read_unlock();
+ goto out_free;
+ }
error = -EIDRM;
locknum = sem_lock(sma, sops, nsops);
@@ -1942,24 +1901,34 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
queue.undo = un;
queue.pid = task_tgid_vnr(current);
queue.alter = alter;
+ queue.dupsop = dupsop;
error = perform_atomic_semop(sma, &queue);
- if (error == 0) {
- /* If the operation was successful, then do
+ if (error == 0) { /* non-blocking succesfull path */
+ DEFINE_WAKE_Q(wake_q);
+
+ /*
+ * If the operation was successful, then do
* the required updates.
*/
if (alter)
- do_smart_update(sma, sops, nsops, 1, &tasks);
+ do_smart_update(sma, sops, nsops, 1, &wake_q);
else
set_semotime(sma, sops);
+
+ sem_unlock(sma, locknum);
+ rcu_read_unlock();
+ wake_up_q(&wake_q);
+
+ goto out_free;
}
- if (error <= 0)
+ if (error < 0) /* non-blocking error path */
goto out_unlock_free;
- /* We need to sleep on this operation, so we put the current
+ /*
+ * We need to sleep on this operation, so we put the current
* task into the pending queue and go to sleep.
*/
-
if (nsops == 1) {
struct sem *curr;
curr = &sma->sem_base[sops->sem_num];
@@ -1988,77 +1957,69 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
sma->complex_count++;
}
- queue.status = -EINTR;
- queue.sleeper = current;
+ do {
+ queue.status = -EINTR;
+ queue.sleeper = current;
-sleep_again:
- __set_current_state(TASK_INTERRUPTIBLE);
- sem_unlock(sma, locknum);
- rcu_read_unlock();
-
- if (timeout)
- jiffies_left = schedule_timeout(jiffies_left);
- else
- schedule();
+ __set_current_state(TASK_INTERRUPTIBLE);
+ sem_unlock(sma, locknum);
+ rcu_read_unlock();
- error = get_queue_result(&queue);
+ if (timeout)
+ jiffies_left = schedule_timeout(jiffies_left);
+ else
+ schedule();
- if (error != -EINTR) {
- /* fast path: update_queue already obtained all requested
- * resources.
- * Perform a smp_mb(): User space could assume that semop()
- * is a memory barrier: Without the mb(), the cpu could
- * speculatively read in user space stale data that was
- * overwritten by the previous owner of the semaphore.
+ /*
+ * fastpath: the semop has completed, either successfully or
+ * not, from the syscall pov, is quite irrelevant to us at this
+ * point; we're done.
+ *
+ * We _do_ care, nonetheless, about being awoken by a signal or
+ * spuriously. The queue.status is checked again in the
+ * slowpath (aka after taking sem_lock), such that we can detect
+ * scenarios where we were awakened externally, during the
+ * window between wake_q_add() and wake_up_q().
*/
- smp_mb();
-
- goto out_free;
- }
-
- rcu_read_lock();
- sma = sem_obtain_lock(ns, semid, sops, nsops, &locknum);
-
- /*
- * Wait until it's guaranteed that no wakeup_sem_queue_do() is ongoing.
- */
- error = get_queue_result(&queue);
+ error = READ_ONCE(queue.status);
+ if (error != -EINTR) {
+ /*
+ * User space could assume that semop() is a memory
+ * barrier: Without the mb(), the cpu could
+ * speculatively read in userspace stale data that was
+ * overwritten by the previous owner of the semaphore.
+ */
+ smp_mb();
+ goto out_free;
+ }
- /*
- * Array removed? If yes, leave without sem_unlock().
- */
- if (IS_ERR(sma)) {
- rcu_read_unlock();
- goto out_free;
- }
+ rcu_read_lock();
+ sem_lock(sma, sops, nsops);
+ if (!ipc_valid_object(&sma->sem_perm))
+ goto out_unlock_free;
- /*
- * If queue.status != -EINTR we are woken up by another process.
- * Leave without unlink_queue(), but with sem_unlock().
- */
- if (error != -EINTR)
- goto out_unlock_free;
+ error = READ_ONCE(queue.status);
- /*
- * If an interrupt occurred we have to clean up the queue
- */
- if (timeout && jiffies_left == 0)
- error = -EAGAIN;
+ /*
+ * If queue.status != -EINTR we are woken up by another process.
+ * Leave without unlink_queue(), but with sem_unlock().
+ */
+ if (error != -EINTR)
+ goto out_unlock_free;
- /*
- * If the wakeup was spurious, just retry
- */
- if (error == -EINTR && !signal_pending(current))
- goto sleep_again;
+ /*
+ * If an interrupt occurred we have to clean up the queue.
+ */
+ if (timeout && jiffies_left == 0)
+ error = -EAGAIN;
+ } while (error == -EINTR && !signal_pending(current)); /* spurious */
unlink_queue(sma, &queue);
out_unlock_free:
sem_unlock(sma, locknum);
-out_rcu_wakeup:
rcu_read_unlock();
- wake_up_sem_queue_do(&tasks);
out_free:
if (sops != fast_sops)
kfree(sops);
@@ -2119,8 +2080,8 @@ void exit_sem(struct task_struct *tsk)
for (;;) {
struct sem_array *sma;
struct sem_undo *un;
- struct list_head tasks;
int semid, i;
+ DEFINE_WAKE_Q(wake_q);
cond_resched();
@@ -2208,11 +2169,10 @@ void exit_sem(struct task_struct *tsk)
}
}
/* maybe some queued-up processes were waiting for this */
- INIT_LIST_HEAD(&tasks);
- do_smart_update(sma, NULL, 0, 1, &tasks);
+ do_smart_update(sma, NULL, 0, 1, &wake_q);
sem_unlock(sma, -1);
rcu_read_unlock();
- wake_up_sem_queue_do(&tasks);
+ wake_up_q(&wake_q);
kfree_rcu(un, rcu);
}
diff --git a/kernel/Makefile b/kernel/Makefile
index eb26e12c6c2a..314e7d62f5f0 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -84,6 +84,7 @@ obj-$(CONFIG_KPROBES) += kprobes.o
obj-$(CONFIG_KGDB) += debug/
obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
+obj-$(CONFIG_HARDLOCKUP_DETECTOR) += watchdog_hld.o
obj-$(CONFIG_SECCOMP) += seccomp.o
obj-$(CONFIG_RELAY) += relay.o
obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index f9ec9add2164..215871bda3a2 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -301,7 +301,7 @@ int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr,
retry:
/* Read the page with vaddr into memory */
ret = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &old_page,
- &vma);
+ &vma, NULL);
if (ret <= 0)
return ret;
@@ -1712,7 +1712,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
* essentially a kernel access to the memory.
*/
result = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &page,
- NULL);
+ NULL, NULL);
if (result < 0)
return result;
diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c
index cd6716e115e8..3c8bb6571128 100644
--- a/kernel/time/posix-stubs.c
+++ b/kernel/time/posix-stubs.c
@@ -121,3 +121,8 @@ long clock_nanosleep_restart(struct restart_block *restart_block)
return hrtimer_nanosleep_restart(restart_block);
}
#endif
+
+int posix_timer_event(struct k_itimer *timr, int si_private)
+{
+ return 0;
+}
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 6d1020c03d41..d4b0fa01cae3 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -24,32 +24,14 @@
#include <asm/irq_regs.h>
#include <linux/kvm_para.h>
-#include <linux/perf_event.h>
#include <linux/kthread.h>
-/*
- * The run state of the lockup detectors is controlled by the content of the
- * 'watchdog_enabled' variable. Each lockup detector has its dedicated bit -
- * bit 0 for the hard lockup detector and bit 1 for the soft lockup detector.
- *
- * 'watchdog_user_enabled', 'nmi_watchdog_enabled' and 'soft_watchdog_enabled'
- * are variables that are only used as an 'interface' between the parameters
- * in /proc/sys/kernel and the internal state bits in 'watchdog_enabled'. The
- * 'watchdog_thresh' variable is handled differently because its value is not
- * boolean, and the lockup detectors are 'suspended' while 'watchdog_thresh'
- * is equal zero.
- */
-#define NMI_WATCHDOG_ENABLED_BIT 0
-#define SOFT_WATCHDOG_ENABLED_BIT 1
-#define NMI_WATCHDOG_ENABLED (1 << NMI_WATCHDOG_ENABLED_BIT)
-#define SOFT_WATCHDOG_ENABLED (1 << SOFT_WATCHDOG_ENABLED_BIT)
-
static DEFINE_MUTEX(watchdog_proc_mutex);
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
-static unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED|NMI_WATCHDOG_ENABLED;
+#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR)
+unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED|NMI_WATCHDOG_ENABLED;
#else
-static unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED;
+unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED;
#endif
int __read_mostly nmi_watchdog_enabled;
int __read_mostly soft_watchdog_enabled;
@@ -59,9 +41,6 @@ int __read_mostly watchdog_thresh = 10;
#ifdef CONFIG_SMP
int __read_mostly sysctl_softlockup_all_cpu_backtrace;
int __read_mostly sysctl_hardlockup_all_cpu_backtrace;
-#else
-#define sysctl_softlockup_all_cpu_backtrace 0
-#define sysctl_hardlockup_all_cpu_backtrace 0
#endif
static struct cpumask watchdog_cpumask __read_mostly;
unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
@@ -100,50 +79,9 @@ static DEFINE_PER_CPU(bool, soft_watchdog_warn);
static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt);
static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved);
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
-static DEFINE_PER_CPU(bool, hard_watchdog_warn);
-static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
-static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
-#endif
static unsigned long soft_lockup_nmi_warn;
-/* boot commands */
-/*
- * Should we panic when a soft-lockup or hard-lockup occurs:
- */
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
-unsigned int __read_mostly hardlockup_panic =
- CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
-static unsigned long hardlockup_allcpu_dumped;
-/*
- * We may not want to enable hard lockup detection by default in all cases,
- * for example when running the kernel as a guest on a hypervisor. In these
- * cases this function can be called to disable hard lockup detection. This
- * function should only be executed once by the boot processor before the
- * kernel command line parameters are parsed, because otherwise it is not
- * possible to override this in hardlockup_panic_setup().
- */
-void hardlockup_detector_disable(void)
-{
- watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
-}
-
-static int __init hardlockup_panic_setup(char *str)
-{
- if (!strncmp(str, "panic", 5))
- hardlockup_panic = 1;
- else if (!strncmp(str, "nopanic", 7))
- hardlockup_panic = 0;
- else if (!strncmp(str, "0", 1))
- watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
- else if (!strncmp(str, "1", 1))
- watchdog_enabled |= NMI_WATCHDOG_ENABLED;
- return 1;
-}
-__setup("nmi_watchdog=", hardlockup_panic_setup);
-#endif
-
unsigned int __read_mostly softlockup_panic =
CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
@@ -264,32 +202,14 @@ void touch_all_softlockup_watchdogs(void)
wq_watchdog_touch(-1);
}
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
-void touch_nmi_watchdog(void)
-{
- /*
- * Using __raw here because some code paths have
- * preemption enabled. If preemption is enabled
- * then interrupts should be enabled too, in which
- * case we shouldn't have to worry about the watchdog
- * going off.
- */
- raw_cpu_write(watchdog_nmi_touch, true);
- touch_softlockup_watchdog();
-}
-EXPORT_SYMBOL(touch_nmi_watchdog);
-
-#endif
-
void touch_softlockup_watchdog_sync(void)
{
__this_cpu_write(softlockup_touch_sync, true);
__this_cpu_write(watchdog_touch_ts, 0);
}
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
/* watchdog detector functions */
-static bool is_hardlockup(void)
+bool is_hardlockup(void)
{
unsigned long hrint = __this_cpu_read(hrtimer_interrupts);
@@ -299,7 +219,6 @@ static bool is_hardlockup(void)
__this_cpu_write(hrtimer_interrupts_saved, hrint);
return false;
}
-#endif
static int is_softlockup(unsigned long touch_ts)
{
@@ -313,77 +232,22 @@ static int is_softlockup(unsigned long touch_ts)
return 0;
}
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
-
-static struct perf_event_attr wd_hw_attr = {
- .type = PERF_TYPE_HARDWARE,
- .config = PERF_COUNT_HW_CPU_CYCLES,
- .size = sizeof(struct perf_event_attr),
- .pinned = 1,
- .disabled = 1,
-};
-
-/* Callback function for perf event subsystem */
-static void watchdog_overflow_callback(struct perf_event *event,
- struct perf_sample_data *data,
- struct pt_regs *regs)
-{
- /* Ensure the watchdog never gets throttled */
- event->hw.interrupts = 0;
-
- if (__this_cpu_read(watchdog_nmi_touch) == true) {
- __this_cpu_write(watchdog_nmi_touch, false);
- return;
- }
-
- /* check for a hardlockup
- * This is done by making sure our timer interrupt
- * is incrementing. The timer interrupt should have
- * fired multiple times before we overflow'd. If it hasn't
- * then this is a good indication the cpu is stuck
- */
- if (is_hardlockup()) {
- int this_cpu = smp_processor_id();
-
- /* only print hardlockups once */
- if (__this_cpu_read(hard_watchdog_warn) == true)
- return;
-
- pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
- print_modules();
- print_irqtrace_events(current);
- if (regs)
- show_regs(regs);
- else
- dump_stack();
-
- /*
- * Perform all-CPU dump only once to avoid multiple hardlockups
- * generating interleaving traces
- */
- if (sysctl_hardlockup_all_cpu_backtrace &&
- !test_and_set_bit(0, &hardlockup_allcpu_dumped))
- trigger_allbutself_cpu_backtrace();
-
- if (hardlockup_panic)
- nmi_panic(regs, "Hard LOCKUP");
-
- __this_cpu_write(hard_watchdog_warn, true);
- return;
- }
-
- __this_cpu_write(hard_watchdog_warn, false);
- return;
-}
-#endif /* CONFIG_HARDLOCKUP_DETECTOR */
-
static void watchdog_interrupt_count(void)
{
__this_cpu_inc(hrtimer_interrupts);
}
-static int watchdog_nmi_enable(unsigned int cpu);
-static void watchdog_nmi_disable(unsigned int cpu);
+/*
+ * These two functions are mostly architecture specific
+ * defining them as weak here.
+ */
+int __weak watchdog_nmi_enable(unsigned int cpu)
+{
+ return 0;
+}
+void __weak watchdog_nmi_disable(unsigned int cpu)
+{
+}
static int watchdog_enable_all_cpus(void);
static void watchdog_disable_all_cpus(void);
@@ -576,109 +440,6 @@ static void watchdog(unsigned int cpu)
watchdog_nmi_disable(cpu);
}
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
-/*
- * People like the simple clean cpu node info on boot.
- * Reduce the watchdog noise by only printing messages
- * that are different from what cpu0 displayed.
- */
-static unsigned long cpu0_err;
-
-static int watchdog_nmi_enable(unsigned int cpu)
-{
- struct perf_event_attr *wd_attr;
- struct perf_event *event = per_cpu(watchdog_ev, cpu);
-
- /* nothing to do if the hard lockup detector is disabled */
- if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
- goto out;
-
- /* is it already setup and enabled? */
- if (event && event->state > PERF_EVENT_STATE_OFF)
- goto out;
-
- /* it is setup but not enabled */
- if (event != NULL)
- goto out_enable;
-
- wd_attr = &wd_hw_attr;
- wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
-
- /* Try to register using hardware perf events */
- event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
-
- /* save cpu0 error for future comparision */
- if (cpu == 0 && IS_ERR(event))
- cpu0_err = PTR_ERR(event);
-
- if (!IS_ERR(event)) {
- /* only print for cpu0 or different than cpu0 */
- if (cpu == 0 || cpu0_err)
- pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n");
- goto out_save;
- }
-
- /*
- * Disable the hard lockup detector if _any_ CPU fails to set up
- * set up the hardware perf event. The watchdog() function checks
- * the NMI_WATCHDOG_ENABLED bit periodically.
- *
- * The barriers are for syncing up watchdog_enabled across all the
- * cpus, as clear_bit() does not use barriers.
- */
- smp_mb__before_atomic();
- clear_bit(NMI_WATCHDOG_ENABLED_BIT, &watchdog_enabled);
- smp_mb__after_atomic();
-
- /* skip displaying the same error again */
- if (cpu > 0 && (PTR_ERR(event) == cpu0_err))
- return PTR_ERR(event);
-
- /* vary the KERN level based on the returned errno */
- if (PTR_ERR(event) == -EOPNOTSUPP)
- pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu);
- else if (PTR_ERR(event) == -ENOENT)
- pr_warn("disabled (cpu%i): hardware events not enabled\n",
- cpu);
- else
- pr_err("disabled (cpu%i): unable to create perf event: %ld\n",
- cpu, PTR_ERR(event));
-
- pr_info("Shutting down hard lockup detector on all cpus\n");
-
- return PTR_ERR(event);
-
- /* success path */
-out_save:
- per_cpu(watchdog_ev, cpu) = event;
-out_enable:
- perf_event_enable(per_cpu(watchdog_ev, cpu));
-out:
- return 0;
-}
-
-static void watchdog_nmi_disable(unsigned int cpu)
-{
- struct perf_event *event = per_cpu(watchdog_ev, cpu);
-
- if (event) {
- perf_event_disable(event);
- per_cpu(watchdog_ev, cpu) = NULL;
-
- /* should be in cleanup, but blocks oprofile */
- perf_event_release_kernel(event);
- }
- if (cpu == 0) {
- /* watchdog_nmi_enable() expects this to be zero initially. */
- cpu0_err = 0;
- }
-}
-
-#else
-static int watchdog_nmi_enable(unsigned int cpu) { return 0; }
-static void watchdog_nmi_disable(unsigned int cpu) { return; }
-#endif /* CONFIG_HARDLOCKUP_DETECTOR */
-
static struct smp_hotplug_thread watchdog_threads = {
.store = &softlockup_watchdog,
.thread_should_run = watchdog_should_run,
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
new file mode 100644
index 000000000000..84016c8aee6b
--- /dev/null
+++ b/kernel/watchdog_hld.c
@@ -0,0 +1,227 @@
+/*
+ * Detect hard lockups on a system
+ *
+ * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc.
+ *
+ * Note: Most of this code is borrowed heavily from the original softlockup
+ * detector, so thanks to Ingo for the initial implementation.
+ * Some chunks also taken from the old x86-specific nmi watchdog code, thanks
+ * to those contributors as well.
+ */
+
+#define pr_fmt(fmt) "NMI watchdog: " fmt
+
+#include <linux/nmi.h>
+#include <linux/module.h>
+#include <asm/irq_regs.h>
+#include <linux/perf_event.h>
+
+static DEFINE_PER_CPU(bool, hard_watchdog_warn);
+static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
+static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
+
+/* boot commands */
+/*
+ * Should we panic when a soft-lockup or hard-lockup occurs:
+ */
+unsigned int __read_mostly hardlockup_panic =
+ CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
+static unsigned long hardlockup_allcpu_dumped;
+/*
+ * We may not want to enable hard lockup detection by default in all cases,
+ * for example when running the kernel as a guest on a hypervisor. In these
+ * cases this function can be called to disable hard lockup detection. This
+ * function should only be executed once by the boot processor before the
+ * kernel command line parameters are parsed, because otherwise it is not
+ * possible to override this in hardlockup_panic_setup().
+ */
+void hardlockup_detector_disable(void)
+{
+ watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
+}
+
+static int __init hardlockup_panic_setup(char *str)
+{
+ if (!strncmp(str, "panic", 5))
+ hardlockup_panic = 1;
+ else if (!strncmp(str, "nopanic", 7))
+ hardlockup_panic = 0;
+ else if (!strncmp(str, "0", 1))
+ watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
+ else if (!strncmp(str, "1", 1))
+ watchdog_enabled |= NMI_WATCHDOG_ENABLED;
+ return 1;
+}
+__setup("nmi_watchdog=", hardlockup_panic_setup);
+
+void touch_nmi_watchdog(void)
+{
+ /*
+ * Using __raw here because some code paths have
+ * preemption enabled. If preemption is enabled
+ * then interrupts should be enabled too, in which
+ * case we shouldn't have to worry about the watchdog
+ * going off.
+ */
+ raw_cpu_write(watchdog_nmi_touch, true);
+ touch_softlockup_watchdog();
+}
+EXPORT_SYMBOL(touch_nmi_watchdog);
+
+static struct perf_event_attr wd_hw_attr = {
+ .type = PERF_TYPE_HARDWARE,
+ .config = PERF_COUNT_HW_CPU_CYCLES,
+ .size = sizeof(struct perf_event_attr),
+ .pinned = 1,
+ .disabled = 1,
+};
+
+/* Callback function for perf event subsystem */
+static void watchdog_overflow_callback(struct perf_event *event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ /* Ensure the watchdog never gets throttled */
+ event->hw.interrupts = 0;
+
+ if (__this_cpu_read(watchdog_nmi_touch) == true) {
+ __this_cpu_write(watchdog_nmi_touch, false);
+ return;
+ }
+
+ /* check for a hardlockup
+ * This is done by making sure our timer interrupt
+ * is incrementing. The timer interrupt should have
+ * fired multiple times before we overflow'd. If it hasn't
+ * then this is a good indication the cpu is stuck
+ */
+ if (is_hardlockup()) {
+ int this_cpu = smp_processor_id();
+
+ /* only print hardlockups once */
+ if (__this_cpu_read(hard_watchdog_warn) == true)
+ return;
+
+ pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
+ print_modules();
+ print_irqtrace_events(current);
+ if (regs)
+ show_regs(regs);
+ else
+ dump_stack();
+
+ /*
+ * Perform all-CPU dump only once to avoid multiple hardlockups
+ * generating interleaving traces
+ */
+ if (sysctl_hardlockup_all_cpu_backtrace &&
+ !test_and_set_bit(0, &hardlockup_allcpu_dumped))
+ trigger_allbutself_cpu_backtrace();
+
+ if (hardlockup_panic)
+ nmi_panic(regs, "Hard LOCKUP");
+
+ __this_cpu_write(hard_watchdog_warn, true);
+ return;
+ }
+
+ __this_cpu_write(hard_watchdog_warn, false);
+ return;
+}
+
+/*
+ * People like the simple clean cpu node info on boot.
+ * Reduce the watchdog noise by only printing messages
+ * that are different from what cpu0 displayed.
+ */
+static unsigned long cpu0_err;
+
+int watchdog_nmi_enable(unsigned int cpu)
+{
+ struct perf_event_attr *wd_attr;
+ struct perf_event *event = per_cpu(watchdog_ev, cpu);
+
+ /* nothing to do if the hard lockup detector is disabled */
+ if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
+ goto out;
+
+ /* is it already setup and enabled? */
+ if (event && event->state > PERF_EVENT_STATE_OFF)
+ goto out;
+
+ /* it is setup but not enabled */
+ if (event != NULL)
+ goto out_enable;
+
+ wd_attr = &wd_hw_attr;
+ wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
+
+ /* Try to register using hardware perf events */
+ event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
+
+ /* save cpu0 error for future comparision */
+ if (cpu == 0 && IS_ERR(event))
+ cpu0_err = PTR_ERR(event);
+
+ if (!IS_ERR(event)) {
+ /* only print for cpu0 or different than cpu0 */
+ if (cpu == 0 || cpu0_err)
+ pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n");
+ goto out_save;
+ }
+
+ /*
+ * Disable the hard lockup detector if _any_ CPU fails to set up
+ * set up the hardware perf event. The watchdog() function checks
+ * the NMI_WATCHDOG_ENABLED bit periodically.
+ *
+ * The barriers are for syncing up watchdog_enabled across all the
+ * cpus, as clear_bit() does not use barriers.
+ */
+ smp_mb__before_atomic();
+ clear_bit(NMI_WATCHDOG_ENABLED_BIT, &watchdog_enabled);
+ smp_mb__after_atomic();
+
+ /* skip displaying the same error again */
+ if (cpu > 0 && (PTR_ERR(event) == cpu0_err))
+ return PTR_ERR(event);
+
+ /* vary the KERN level based on the returned errno */
+ if (PTR_ERR(event) == -EOPNOTSUPP)
+ pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu);
+ else if (PTR_ERR(event) == -ENOENT)
+ pr_warn("disabled (cpu%i): hardware events not enabled\n",
+ cpu);
+ else
+ pr_err("disabled (cpu%i): unable to create perf event: %ld\n",
+ cpu, PTR_ERR(event));
+
+ pr_info("Shutting down hard lockup detector on all cpus\n");
+
+ return PTR_ERR(event);
+
+ /* success path */
+out_save:
+ per_cpu(watchdog_ev, cpu) = event;
+out_enable:
+ perf_event_enable(per_cpu(watchdog_ev, cpu));
+out:
+ return 0;
+}
+
+void watchdog_nmi_disable(unsigned int cpu)
+{
+ struct perf_event *event = per_cpu(watchdog_ev, cpu);
+
+ if (event) {
+ perf_event_disable(event);
+ per_cpu(watchdog_ev, cpu) = NULL;
+
+ /* should be in cleanup, but blocks oprofile */
+ perf_event_release_kernel(event);
+ }
+ if (cpu == 0) {
+ /* watchdog_nmi_enable() expects this to be zero initially. */
+ cpu0_err = 0;
+ }
+}
diff --git a/mm/filemap.c b/mm/filemap.c
index 69568388c699..235021e361eb 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2164,12 +2164,12 @@ page_not_uptodate:
}
EXPORT_SYMBOL(filemap_fault);
-void filemap_map_pages(struct fault_env *fe,
+void filemap_map_pages(struct vm_fault *vmf,
pgoff_t start_pgoff, pgoff_t end_pgoff)
{
struct radix_tree_iter iter;
void **slot;
- struct file *file = fe->vma->vm_file;
+ struct file *file = vmf->vma->vm_file;
struct address_space *mapping = file->f_mapping;
pgoff_t last_pgoff = start_pgoff;
loff_t size;
@@ -2225,11 +2225,11 @@ repeat:
if (file->f_ra.mmap_miss > 0)
file->f_ra.mmap_miss--;
- fe->address += (iter.index - last_pgoff) << PAGE_SHIFT;
- if (fe->pte)
- fe->pte += iter.index - last_pgoff;
+ vmf->address += (iter.index - last_pgoff) << PAGE_SHIFT;
+ if (vmf->pte)
+ vmf->pte += iter.index - last_pgoff;
last_pgoff = iter.index;
- if (alloc_set_pte(fe, NULL, page))
+ if (alloc_set_pte(vmf, NULL, page))
goto unlock;
unlock_page(page);
goto next;
@@ -2239,7 +2239,7 @@ skip:
put_page(page);
next:
/* Huge page is mapped? No need to proceed. */
- if (pmd_trans_huge(*fe->pmd))
+ if (pmd_trans_huge(*vmf->pmd))
break;
if (iter.index == end_pgoff)
break;
diff --git a/mm/gup.c b/mm/gup.c
index e50178c58b97..55315555489d 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -865,9 +865,10 @@ EXPORT_SYMBOL(get_user_pages_locked);
* caller if required (just like with __get_user_pages). "FOLL_GET"
* is set implicitly if "pages" is non-NULL.
*/
-__always_inline long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
- unsigned long start, unsigned long nr_pages,
- struct page **pages, unsigned int gup_flags)
+static __always_inline long __get_user_pages_unlocked(struct task_struct *tsk,
+ struct mm_struct *mm, unsigned long start,
+ unsigned long nr_pages, struct page **pages,
+ unsigned int gup_flags)
{
long ret;
int locked = 1;
@@ -879,7 +880,6 @@ __always_inline long __get_user_pages_unlocked(struct task_struct *tsk, struct m
up_read(&mm->mmap_sem);
return ret;
}
-EXPORT_SYMBOL(__get_user_pages_unlocked);
/*
* get_user_pages_unlocked() is suitable to replace the form:
@@ -917,6 +917,9 @@ EXPORT_SYMBOL(get_user_pages_unlocked);
* only intends to ensure the pages are faulted in.
* @vmas: array of pointers to vmas corresponding to each page.
* Or NULL if the caller does not require them.
+ * @locked: pointer to lock flag indicating whether lock is held and
+ * subsequently whether VM_FAULT_RETRY functionality can be
+ * utilised. Lock must initially be held.
*
* Returns number of pages pinned. This may be fewer than the number
* requested. If nr_pages is 0 or negative, returns 0. If no pages
@@ -960,10 +963,10 @@ EXPORT_SYMBOL(get_user_pages_unlocked);
long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
- struct vm_area_struct **vmas)
+ struct vm_area_struct **vmas, int *locked)
{
return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
- NULL, false,
+ locked, true,
gup_flags | FOLL_TOUCH | FOLL_REMOTE);
}
EXPORT_SYMBOL(get_user_pages_remote);
@@ -971,8 +974,9 @@ EXPORT_SYMBOL(get_user_pages_remote);
/*
* This is the same as get_user_pages_remote(), just with a
* less-flexible calling convention where we assume that the task
- * and mm being operated on are the current task's. We also
- * obviously don't pass FOLL_REMOTE in here.
+ * and mm being operated on are the current task's and don't allow
+ * passing of a locked parameter. We also obviously don't pass
+ * FOLL_REMOTE in here.
*/
long get_user_pages(unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e96c670fe8b7..0957e654b3c9 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -532,13 +532,13 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
}
EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
-static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page,
+static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
gfp_t gfp)
{
- struct vm_area_struct *vma = fe->vma;
+ struct vm_area_struct *vma = vmf->vma;
struct mem_cgroup *memcg;
pgtable_t pgtable;
- unsigned long haddr = fe->address & HPAGE_PMD_MASK;
+ unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
VM_BUG_ON_PAGE(!PageCompound(page), page);
@@ -563,9 +563,9 @@ static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page,
*/
__SetPageUptodate(page);
- fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
- if (unlikely(!pmd_none(*fe->pmd))) {
- spin_unlock(fe->ptl);
+ vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+ if (unlikely(!pmd_none(*vmf->pmd))) {
+ spin_unlock(vmf->ptl);
mem_cgroup_cancel_charge(page, memcg, true);
put_page(page);
pte_free(vma->vm_mm, pgtable);
@@ -576,11 +576,11 @@ static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page,
if (userfaultfd_missing(vma)) {
int ret;
- spin_unlock(fe->ptl);
+ spin_unlock(vmf->ptl);
mem_cgroup_cancel_charge(page, memcg, true);
put_page(page);
pte_free(vma->vm_mm, pgtable);
- ret = handle_userfault(fe, VM_UFFD_MISSING);
+ ret = handle_userfault(vmf, VM_UFFD_MISSING);
VM_BUG_ON(ret & VM_FAULT_FALLBACK);
return ret;
}
@@ -590,11 +590,11 @@ static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page,
page_add_new_anon_rmap(page, vma, haddr, true);
mem_cgroup_commit_charge(page, memcg, false, true);
lru_cache_add_active_or_unevictable(page, vma);
- pgtable_trans_huge_deposit(vma->vm_mm, fe->pmd, pgtable);
- set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry);
+ pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
+ set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
atomic_long_inc(&vma->vm_mm->nr_ptes);
- spin_unlock(fe->ptl);
+ spin_unlock(vmf->ptl);
count_vm_event(THP_FAULT_ALLOC);
}
@@ -641,12 +641,12 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
return true;
}
-int do_huge_pmd_anonymous_page(struct fault_env *fe)
+int do_huge_pmd_anonymous_page(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
+ struct vm_area_struct *vma = vmf->vma;
gfp_t gfp;
struct page *page;
- unsigned long haddr = fe->address & HPAGE_PMD_MASK;
+ unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
return VM_FAULT_FALLBACK;
@@ -654,7 +654,7 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe)
return VM_FAULT_OOM;
if (unlikely(khugepaged_enter(vma, vma->vm_flags)))
return VM_FAULT_OOM;
- if (!(fe->flags & FAULT_FLAG_WRITE) &&
+ if (!(vmf->flags & FAULT_FLAG_WRITE) &&
!mm_forbids_zeropage(vma->vm_mm) &&
transparent_hugepage_use_zero_page()) {
pgtable_t pgtable;
@@ -670,22 +670,22 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe)
count_vm_event(THP_FAULT_FALLBACK);
return VM_FAULT_FALLBACK;
}
- fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
+ vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
ret = 0;
set = false;
- if (pmd_none(*fe->pmd)) {
+ if (pmd_none(*vmf->pmd)) {
if (userfaultfd_missing(vma)) {
- spin_unlock(fe->ptl);
- ret = handle_userfault(fe, VM_UFFD_MISSING);
+ spin_unlock(vmf->ptl);
+ ret = handle_userfault(vmf, VM_UFFD_MISSING);
VM_BUG_ON(ret & VM_FAULT_FALLBACK);
} else {
set_huge_zero_page(pgtable, vma->vm_mm, vma,
- haddr, fe->pmd, zero_page);
- spin_unlock(fe->ptl);
+ haddr, vmf->pmd, zero_page);
+ spin_unlock(vmf->ptl);
set = true;
}
} else
- spin_unlock(fe->ptl);
+ spin_unlock(vmf->ptl);
if (!set)
pte_free(vma->vm_mm, pgtable);
return ret;
@@ -697,7 +697,7 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe)
return VM_FAULT_FALLBACK;
}
prep_transhuge_page(page);
- return __do_huge_pmd_anonymous_page(fe, page, gfp);
+ return __do_huge_pmd_anonymous_page(vmf, page, gfp);
}
static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
@@ -869,30 +869,30 @@ out:
return ret;
}
-void huge_pmd_set_accessed(struct fault_env *fe, pmd_t orig_pmd)
+void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd)
{
pmd_t entry;
unsigned long haddr;
- fe->ptl = pmd_lock(fe->vma->vm_mm, fe->pmd);
- if (unlikely(!pmd_same(*fe->pmd, orig_pmd)))
+ vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
+ if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
goto unlock;
entry = pmd_mkyoung(orig_pmd);
- haddr = fe->address & HPAGE_PMD_MASK;
- if (pmdp_set_access_flags(fe->vma, haddr, fe->pmd, entry,
- fe->flags & FAULT_FLAG_WRITE))
- update_mmu_cache_pmd(fe->vma, fe->address, fe->pmd);
+ haddr = vmf->address & HPAGE_PMD_MASK;
+ if (pmdp_set_access_flags(vmf->vma, haddr, vmf->pmd, entry,
+ vmf->flags & FAULT_FLAG_WRITE))
+ update_mmu_cache_pmd(vmf->vma, vmf->address, vmf->pmd);
unlock:
- spin_unlock(fe->ptl);
+ spin_unlock(vmf->ptl);
}
-static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd,
+static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd,
struct page *page)
{
- struct vm_area_struct *vma = fe->vma;
- unsigned long haddr = fe->address & HPAGE_PMD_MASK;
+ struct vm_area_struct *vma = vmf->vma;
+ unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
struct mem_cgroup *memcg;
pgtable_t pgtable;
pmd_t _pmd;
@@ -911,7 +911,7 @@ static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd,
for (i = 0; i < HPAGE_PMD_NR; i++) {
pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE |
__GFP_OTHER_NODE, vma,
- fe->address, page_to_nid(page));
+ vmf->address, page_to_nid(page));
if (unlikely(!pages[i] ||
mem_cgroup_try_charge(pages[i], vma->vm_mm,
GFP_KERNEL, &memcg, false))) {
@@ -942,15 +942,15 @@ static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd,
mmun_end = haddr + HPAGE_PMD_SIZE;
mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
- fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
- if (unlikely(!pmd_same(*fe->pmd, orig_pmd)))
+ vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+ if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
goto out_free_pages;
VM_BUG_ON_PAGE(!PageHead(page), page);
- pmdp_huge_clear_flush_notify(vma, haddr, fe->pmd);
+ pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
/* leave pmd empty until pte is filled */
- pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, fe->pmd);
+ pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, vmf->pmd);
pmd_populate(vma->vm_mm, &_pmd, pgtable);
for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
@@ -959,20 +959,20 @@ static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd,
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
memcg = (void *)page_private(pages[i]);
set_page_private(pages[i], 0);
- page_add_new_anon_rmap(pages[i], fe->vma, haddr, false);
+ page_add_new_anon_rmap(pages[i], vmf->vma, haddr, false);
mem_cgroup_commit_charge(pages[i], memcg, false, false);
lru_cache_add_active_or_unevictable(pages[i], vma);
- fe->pte = pte_offset_map(&_pmd, haddr);
- VM_BUG_ON(!pte_none(*fe->pte));
- set_pte_at(vma->vm_mm, haddr, fe->pte, entry);
- pte_unmap(fe->pte);
+ vmf->pte = pte_offset_map(&_pmd, haddr);
+ VM_BUG_ON(!pte_none(*vmf->pte));
+ set_pte_at(vma->vm_mm, haddr, vmf->pte, entry);
+ pte_unmap(vmf->pte);
}
kfree(pages);
smp_wmb(); /* make pte visible before pmd */
- pmd_populate(vma->vm_mm, fe->pmd, pgtable);
+ pmd_populate(vma->vm_mm, vmf->pmd, pgtable);
page_remove_rmap(page, true);
- spin_unlock(fe->ptl);
+ spin_unlock(vmf->ptl);
mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
@@ -983,7 +983,7 @@ out:
return ret;
out_free_pages:
- spin_unlock(fe->ptl);
+ spin_unlock(vmf->ptl);
mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
for (i = 0; i < HPAGE_PMD_NR; i++) {
memcg = (void *)page_private(pages[i]);
@@ -995,23 +995,23 @@ out_free_pages:
goto out;
}
-int do_huge_pmd_wp_page(struct fault_env *fe, pmd_t orig_pmd)
+int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
{
- struct vm_area_struct *vma = fe->vma;
+ struct vm_area_struct *vma = vmf->vma;
struct page *page = NULL, *new_page;
struct mem_cgroup *memcg;
- unsigned long haddr = fe->address & HPAGE_PMD_MASK;
+ unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */
gfp_t huge_gfp; /* for allocation and charge */
int ret = 0;
- fe->ptl = pmd_lockptr(vma->vm_mm, fe->pmd);
+ vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
VM_BUG_ON_VMA(!vma->anon_vma, vma);
if (is_huge_zero_pmd(orig_pmd))
goto alloc;
- spin_lock(fe->ptl);
- if (unlikely(!pmd_same(*fe->pmd, orig_pmd)))
+ spin_lock(vmf->ptl);
+ if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
goto out_unlock;
page = pmd_page(orig_pmd);
@@ -1024,13 +1024,13 @@ int do_huge_pmd_wp_page(struct fault_env *fe, pmd_t orig_pmd)
pmd_t entry;
entry = pmd_mkyoung(orig_pmd);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
- if (pmdp_set_access_flags(vma, haddr, fe->pmd, entry, 1))
- update_mmu_cache_pmd(vma, fe->address, fe->pmd);
+ if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
+ update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
ret |= VM_FAULT_WRITE;
goto out_unlock;
}
get_page(page);
- spin_unlock(fe->ptl);
+ spin_unlock(vmf->ptl);
alloc:
if (transparent_hugepage_enabled(vma) &&
!transparent_hugepage_debug_cow()) {
@@ -1043,12 +1043,12 @@ alloc:
prep_transhuge_page(new_page);
} else {
if (!page) {
- split_huge_pmd(vma, fe->pmd, fe->address);
+ split_huge_pmd(vma, vmf->pmd, vmf->address);
ret |= VM_FAULT_FALLBACK;
} else {
- ret = do_huge_pmd_wp_page_fallback(fe, orig_pmd, page);
+ ret = do_huge_pmd_wp_page_fallback(vmf, orig_pmd, page);
if (ret & VM_FAULT_OOM) {
- split_huge_pmd(vma, fe->pmd, fe->address);
+ split_huge_pmd(vma, vmf->pmd, vmf->address);
ret |= VM_FAULT_FALLBACK;
}
put_page(page);
@@ -1060,7 +1060,7 @@ alloc:
if (unlikely(mem_cgroup_try_charge(new_page, vma->vm_mm,
huge_gfp, &memcg, true))) {
put_page(new_page);
- split_huge_pmd(vma, fe->pmd, fe->address);
+ split_huge_pmd(vma, vmf->pmd, vmf->address);
if (page)
put_page(page);
ret |= VM_FAULT_FALLBACK;
@@ -1080,11 +1080,11 @@ alloc:
mmun_end = haddr + HPAGE_PMD_SIZE;
mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
- spin_lock(fe->ptl);
+ spin_lock(vmf->ptl);
if (page)
put_page(page);
- if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) {
- spin_unlock(fe->ptl);
+ if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
+ spin_unlock(vmf->ptl);
mem_cgroup_cancel_charge(new_page, memcg, true);
put_page(new_page);
goto out_mn;
@@ -1092,12 +1092,12 @@ alloc:
pmd_t entry;
entry = mk_huge_pmd(new_page, vma->vm_page_prot);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
- pmdp_huge_clear_flush_notify(vma, haddr, fe->pmd);
+ pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
page_add_new_anon_rmap(new_page, vma, haddr, true);
mem_cgroup_commit_charge(new_page, memcg, false, true);
lru_cache_add_active_or_unevictable(new_page, vma);
- set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry);
- update_mmu_cache_pmd(vma, fe->address, fe->pmd);
+ set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
+ update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
if (!page) {
add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
} else {
@@ -1107,13 +1107,13 @@ alloc:
}
ret |= VM_FAULT_WRITE;
}
- spin_unlock(fe->ptl);
+ spin_unlock(vmf->ptl);
out_mn:
mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
out:
return ret;
out_unlock:
- spin_unlock(fe->ptl);
+ spin_unlock(vmf->ptl);
return ret;
}
@@ -1186,12 +1186,12 @@ out:
}
/* NUMA hinting page fault entry point for trans huge pmds */
-int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd)
+int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
{
- struct vm_area_struct *vma = fe->vma;
+ struct vm_area_struct *vma = vmf->vma;
struct anon_vma *anon_vma = NULL;
struct page *page;
- unsigned long haddr = fe->address & HPAGE_PMD_MASK;
+ unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
int page_nid = -1, this_nid = numa_node_id();
int target_nid, last_cpupid = -1;
bool page_locked;
@@ -1199,8 +1199,8 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd)
bool was_writable;
int flags = 0;
- fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
- if (unlikely(!pmd_same(pmd, *fe->pmd)))
+ vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+ if (unlikely(!pmd_same(pmd, *vmf->pmd)))
goto out_unlock;
/*
@@ -1208,9 +1208,9 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd)
* without disrupting NUMA hinting information. Do not relock and
* check_same as the page may no longer be mapped.
*/
- if (unlikely(pmd_trans_migrating(*fe->pmd))) {
- page = pmd_page(*fe->pmd);
- spin_unlock(fe->ptl);
+ if (unlikely(pmd_trans_migrating(*vmf->pmd))) {
+ page = pmd_page(*vmf->pmd);
+ spin_unlock(vmf->ptl);
wait_on_page_locked(page);
goto out;
}
@@ -1243,7 +1243,7 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd)
/* Migration could have started since the pmd_trans_migrating check */
if (!page_locked) {
- spin_unlock(fe->ptl);
+ spin_unlock(vmf->ptl);
wait_on_page_locked(page);
page_nid = -1;
goto out;
@@ -1254,12 +1254,12 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd)
* to serialises splits
*/
get_page(page);
- spin_unlock(fe->ptl);
+ spin_unlock(vmf->ptl);
anon_vma = page_lock_anon_vma_read(page);
/* Confirm the PMD did not change while page_table_lock was released */
- spin_lock(fe->ptl);
- if (unlikely(!pmd_same(pmd, *fe->pmd))) {
+ spin_lock(vmf->ptl);
+ if (unlikely(!pmd_same(pmd, *vmf->pmd))) {
unlock_page(page);
put_page(page);
page_nid = -1;
@@ -1277,9 +1277,9 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd)
* Migrate the THP to the requested node, returns with page unlocked
* and access rights restored.
*/
- spin_unlock(fe->ptl);
+ spin_unlock(vmf->ptl);
migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma,
- fe->pmd, pmd, fe->address, page, target_nid);
+ vmf->pmd, pmd, vmf->address, page, target_nid);
if (migrated) {
flags |= TNF_MIGRATED;
page_nid = target_nid;
@@ -1294,18 +1294,19 @@ clear_pmdnuma:
pmd = pmd_mkyoung(pmd);
if (was_writable)
pmd = pmd_mkwrite(pmd);
- set_pmd_at(vma->vm_mm, haddr, fe->pmd, pmd);
- update_mmu_cache_pmd(vma, fe->address, fe->pmd);
+ set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd);
+ update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
unlock_page(page);
out_unlock:
- spin_unlock(fe->ptl);
+ spin_unlock(vmf->ptl);
out:
if (anon_vma)
page_unlock_anon_vma_read(anon_vma);
if (page_nid != -1)
- task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, fe->flags);
+ task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR,
+ vmf->flags);
return 0;
}
diff --git a/mm/internal.h b/mm/internal.h
index 537ac9951f5f..44d68895a9b9 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -36,7 +36,7 @@
/* Do not use these with a slab allocator */
#define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK)
-int do_swap_page(struct fault_env *fe, pte_t orig_pte);
+int do_swap_page(struct vm_fault *vmf);
void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
unsigned long floor, unsigned long ceiling);
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 09460955e818..7434a63cac94 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -875,13 +875,13 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
unsigned long address, pmd_t *pmd,
int referenced)
{
- pte_t pteval;
int swapped_in = 0, ret = 0;
- struct fault_env fe = {
+ struct vm_fault vmf = {
.vma = vma,
.address = address,
.flags = FAULT_FLAG_ALLOW_RETRY,
.pmd = pmd,
+ .pgoff = linear_page_index(vma, address),
};
/* we only decide to swapin, if there is enough young ptes */
@@ -889,19 +889,19 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
return false;
}
- fe.pte = pte_offset_map(pmd, address);
- for (; fe.address < address + HPAGE_PMD_NR*PAGE_SIZE;
- fe.pte++, fe.address += PAGE_SIZE) {
- pteval = *fe.pte;
- if (!is_swap_pte(pteval))
+ vmf.pte = pte_offset_map(pmd, address);
+ for (; vmf.address < address + HPAGE_PMD_NR*PAGE_SIZE;
+ vmf.pte++, vmf.address += PAGE_SIZE) {
+ vmf.orig_pte = *vmf.pte;
+ if (!is_swap_pte(vmf.orig_pte))
continue;
swapped_in++;
- ret = do_swap_page(&fe, pteval);
+ ret = do_swap_page(&vmf);
/* do_swap_page returns VM_FAULT_RETRY with released mmap_sem */
if (ret & VM_FAULT_RETRY) {
down_read(&mm->mmap_sem);
- if (hugepage_vma_revalidate(mm, address, &fe.vma)) {
+ if (hugepage_vma_revalidate(mm, address, &vmf.vma)) {
/* vma is no longer available, don't continue to swapin */
trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
return false;
@@ -915,10 +915,10 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
return false;
}
/* pte is unmapped now, we need to map it */
- fe.pte = pte_offset_map(pmd, fe.address);
+ vmf.pte = pte_offset_map(pmd, vmf.address);
}
- fe.pte--;
- pte_unmap(fe.pte);
+ vmf.pte--;
+ pte_unmap(vmf.pte);
trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 1);
return true;
}
diff --git a/mm/memory.c b/mm/memory.c
index 08d8da39de28..455c3e628d52 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2034,20 +2034,17 @@ static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
*
* We do this without the lock held, so that it can sleep if it needs to.
*/
-static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page,
- unsigned long address)
+static int do_page_mkwrite(struct vm_fault *vmf)
{
- struct vm_fault vmf;
int ret;
+ struct page *page = vmf->page;
+ unsigned int old_flags = vmf->flags;
- vmf.virtual_address = (void __user *)(address & PAGE_MASK);
- vmf.pgoff = page->index;
- vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
- vmf.gfp_mask = __get_fault_gfp_mask(vma);
- vmf.page = page;
- vmf.cow_page = NULL;
+ vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
- ret = vma->vm_ops->page_mkwrite(vma, &vmf);
+ ret = vmf->vma->vm_ops->page_mkwrite(vmf->vma, vmf);
+ /* Restore original flags so that caller is not surprised */
+ vmf->flags = old_flags;
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
return ret;
if (unlikely(!(ret & VM_FAULT_LOCKED))) {
@@ -2063,6 +2060,41 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page,
}
/*
+ * Handle dirtying of a page in shared file mapping on a write fault.
+ *
+ * The function expects the page to be locked and unlocks it.
+ */
+static void fault_dirty_shared_page(struct vm_area_struct *vma,
+ struct page *page)
+{
+ struct address_space *mapping;
+ bool dirtied;
+ bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite;
+
+ dirtied = set_page_dirty(page);
+ VM_BUG_ON_PAGE(PageAnon(page), page);
+ /*
+ * Take a local copy of the address_space - page.mapping may be zeroed
+ * by truncate after unlock_page(). The address_space itself remains
+ * pinned by vma->vm_file's reference. We rely on unlock_page()'s
+ * release semantics to prevent the compiler from undoing this copying.
+ */
+ mapping = page_rmapping(page);
+ unlock_page(page);
+
+ if ((dirtied || page_mkwrite) && mapping) {
+ /*
+ * Some device drivers do not set page.mapping
+ * but still dirty their pages
+ */
+ balance_dirty_pages_ratelimited(mapping);
+ }
+
+ if (!page_mkwrite)
+ file_update_time(vma->vm_file);
+}
+
+/*
* Handle write page faults for pages that can be reused in the current vma
*
* This can happen either due to the mapping being with the VM_SHARED flag,
@@ -2070,11 +2102,11 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page,
* case, all we need to do here is to mark the page as writable and update
* any related book-keeping.
*/
-static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte,
- struct page *page, int page_mkwrite, int dirty_shared)
- __releases(fe->ptl)
+static inline void wp_page_reuse(struct vm_fault *vmf)
+ __releases(vmf->ptl)
{
- struct vm_area_struct *vma = fe->vma;
+ struct vm_area_struct *vma = vmf->vma;
+ struct page *page = vmf->page;
pte_t entry;
/*
* Clear the pages cpupid information as the existing
@@ -2084,39 +2116,12 @@ static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte,
if (page)
page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1);
- flush_cache_page(vma, fe->address, pte_pfn(orig_pte));
- entry = pte_mkyoung(orig_pte);
+ flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
+ entry = pte_mkyoung(vmf->orig_pte);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
- if (ptep_set_access_flags(vma, fe->address, fe->pte, entry, 1))
- update_mmu_cache(vma, fe->address, fe->pte);
- pte_unmap_unlock(fe->pte, fe->ptl);
-
- if (dirty_shared) {
- struct address_space *mapping;
- int dirtied;
-
- if (!page_mkwrite)
- lock_page(page);
-
- dirtied = set_page_dirty(page);
- VM_BUG_ON_PAGE(PageAnon(page), page);
- mapping = page->mapping;
- unlock_page(page);
- put_page(page);
-
- if ((dirtied || page_mkwrite) && mapping) {
- /*
- * Some device drivers do not set page.mapping
- * but still dirty their pages
- */
- balance_dirty_pages_ratelimited(mapping);
- }
-
- if (!page_mkwrite)
- file_update_time(vma->vm_file);
- }
-
- return VM_FAULT_WRITE;
+ if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1))
+ update_mmu_cache(vma, vmf->address, vmf->pte);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
}
/*
@@ -2135,31 +2140,32 @@ static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte,
* held to the old page, as well as updating the rmap.
* - In any case, unlock the PTL and drop the reference we took to the old page.
*/
-static int wp_page_copy(struct fault_env *fe, pte_t orig_pte,
- struct page *old_page)
+static int wp_page_copy(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
+ struct vm_area_struct *vma = vmf->vma;
struct mm_struct *mm = vma->vm_mm;
+ struct page *old_page = vmf->page;
struct page *new_page = NULL;
pte_t entry;
int page_copied = 0;
- const unsigned long mmun_start = fe->address & PAGE_MASK;
+ const unsigned long mmun_start = vmf->address & PAGE_MASK;
const unsigned long mmun_end = mmun_start + PAGE_SIZE;
struct mem_cgroup *memcg;
if (unlikely(anon_vma_prepare(vma)))
goto oom;
- if (is_zero_pfn(pte_pfn(orig_pte))) {
- new_page = alloc_zeroed_user_highpage_movable(vma, fe->address);
+ if (is_zero_pfn(pte_pfn(vmf->orig_pte))) {
+ new_page = alloc_zeroed_user_highpage_movable(vma,
+ vmf->address);
if (!new_page)
goto oom;
} else {
new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
- fe->address);
+ vmf->address);
if (!new_page)
goto oom;
- cow_user_page(new_page, old_page, fe->address, vma);
+ cow_user_page(new_page, old_page, vmf->address, vma);
}
if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false))
@@ -2172,8 +2178,8 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte,
/*
* Re-check the pte - we dropped the lock
*/
- fe->pte = pte_offset_map_lock(mm, fe->pmd, fe->address, &fe->ptl);
- if (likely(pte_same(*fe->pte, orig_pte))) {
+ vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
+ if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
if (old_page) {
if (!PageAnon(old_page)) {
dec_mm_counter_fast(mm,
@@ -2183,7 +2189,7 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte,
} else {
inc_mm_counter_fast(mm, MM_ANONPAGES);
}
- flush_cache_page(vma, fe->address, pte_pfn(orig_pte));
+ flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
entry = mk_pte(new_page, vma->vm_page_prot);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
/*
@@ -2192,8 +2198,8 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte,
* seen in the presence of one thread doing SMC and another
* thread doing COW.
*/
- ptep_clear_flush_notify(vma, fe->address, fe->pte);
- page_add_new_anon_rmap(new_page, vma, fe->address, false);
+ ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
+ page_add_new_anon_rmap(new_page, vma, vmf->address, false);
mem_cgroup_commit_charge(new_page, memcg, false, false);
lru_cache_add_active_or_unevictable(new_page, vma);
/*
@@ -2201,8 +2207,8 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte,
* mmu page tables (such as kvm shadow page tables), we want the
* new page to be mapped directly into the secondary page table.
*/
- set_pte_at_notify(mm, fe->address, fe->pte, entry);
- update_mmu_cache(vma, fe->address, fe->pte);
+ set_pte_at_notify(mm, vmf->address, vmf->pte, entry);
+ update_mmu_cache(vma, vmf->address, vmf->pte);
if (old_page) {
/*
* Only after switching the pte to the new page may
@@ -2239,7 +2245,7 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte,
if (new_page)
put_page(new_page);
- pte_unmap_unlock(fe->pte, fe->ptl);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
if (old_page) {
/*
@@ -2263,79 +2269,91 @@ oom:
return VM_FAULT_OOM;
}
+/**
+ * finish_mkwrite_fault - finish page fault for a shared mapping, making PTE
+ * writeable once the page is prepared
+ *
+ * @vmf: structure describing the fault
+ *
+ * This function handles all that is needed to finish a write page fault in a
+ * shared mapping due to PTE being read-only once the mapped page is prepared.
+ * It handles locking of PTE and modifying it. The function returns
+ * VM_FAULT_WRITE on success, 0 when PTE got changed before we acquired PTE
+ * lock.
+ *
+ * The function expects the page to be locked or other protection against
+ * concurrent faults / writeback (such as DAX radix tree locks).
+ */
+int finish_mkwrite_fault(struct vm_fault *vmf)
+{
+ WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
+ vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
+ &vmf->ptl);
+ /*
+ * We might have raced with another page fault while we released the
+ * pte_offset_map_lock.
+ */
+ if (!pte_same(*vmf->pte, vmf->orig_pte)) {
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ return VM_FAULT_NOPAGE;
+ }
+ wp_page_reuse(vmf);
+ return 0;
+}
+
/*
* Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED
* mapping
*/
-static int wp_pfn_shared(struct fault_env *fe, pte_t orig_pte)
+static int wp_pfn_shared(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
+ struct vm_area_struct *vma = vmf->vma;
if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
- struct vm_fault vmf = {
- .page = NULL,
- .pgoff = linear_page_index(vma, fe->address),
- .virtual_address =
- (void __user *)(fe->address & PAGE_MASK),
- .flags = FAULT_FLAG_WRITE | FAULT_FLAG_MKWRITE,
- };
int ret;
- pte_unmap_unlock(fe->pte, fe->ptl);
- ret = vma->vm_ops->pfn_mkwrite(vma, &vmf);
- if (ret & VM_FAULT_ERROR)
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ vmf->flags |= FAULT_FLAG_MKWRITE;
+ ret = vma->vm_ops->pfn_mkwrite(vma, vmf);
+ if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
return ret;
- fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
- &fe->ptl);
- /*
- * We might have raced with another page fault while we
- * released the pte_offset_map_lock.
- */
- if (!pte_same(*fe->pte, orig_pte)) {
- pte_unmap_unlock(fe->pte, fe->ptl);
- return 0;
- }
+ return finish_mkwrite_fault(vmf);
}
- return wp_page_reuse(fe, orig_pte, NULL, 0, 0);
+ wp_page_reuse(vmf);
+ return VM_FAULT_WRITE;
}
-static int wp_page_shared(struct fault_env *fe, pte_t orig_pte,
- struct page *old_page)
- __releases(fe->ptl)
+static int wp_page_shared(struct vm_fault *vmf)
+ __releases(vmf->ptl)
{
- struct vm_area_struct *vma = fe->vma;
- int page_mkwrite = 0;
+ struct vm_area_struct *vma = vmf->vma;
- get_page(old_page);
+ get_page(vmf->page);
if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
int tmp;
- pte_unmap_unlock(fe->pte, fe->ptl);
- tmp = do_page_mkwrite(vma, old_page, fe->address);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ tmp = do_page_mkwrite(vmf);
if (unlikely(!tmp || (tmp &
(VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
- put_page(old_page);
+ put_page(vmf->page);
return tmp;
}
- /*
- * Since we dropped the lock we need to revalidate
- * the PTE as someone else may have changed it. If
- * they did, we just return, as we can count on the
- * MMU to tell us if they didn't also make it writable.
- */
- fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
- &fe->ptl);
- if (!pte_same(*fe->pte, orig_pte)) {
- unlock_page(old_page);
- pte_unmap_unlock(fe->pte, fe->ptl);
- put_page(old_page);
- return 0;
+ tmp = finish_mkwrite_fault(vmf);
+ if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
+ unlock_page(vmf->page);
+ put_page(vmf->page);
+ return tmp;
}
- page_mkwrite = 1;
+ } else {
+ wp_page_reuse(vmf);
+ lock_page(vmf->page);
}
+ fault_dirty_shared_page(vma, vmf->page);
+ put_page(vmf->page);
- return wp_page_reuse(fe, orig_pte, old_page, page_mkwrite, 1);
+ return VM_FAULT_WRITE;
}
/*
@@ -2356,14 +2374,13 @@ static int wp_page_shared(struct fault_env *fe, pte_t orig_pte,
* but allow concurrent faults), with pte both mapped and locked.
* We return with mmap_sem still held, but pte unmapped and unlocked.
*/
-static int do_wp_page(struct fault_env *fe, pte_t orig_pte)
- __releases(fe->ptl)
+static int do_wp_page(struct vm_fault *vmf)
+ __releases(vmf->ptl)
{
- struct vm_area_struct *vma = fe->vma;
- struct page *old_page;
+ struct vm_area_struct *vma = vmf->vma;
- old_page = vm_normal_page(vma, fe->address, orig_pte);
- if (!old_page) {
+ vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);
+ if (!vmf->page) {
/*
* VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
* VM_PFNMAP VMA.
@@ -2373,33 +2390,33 @@ static int do_wp_page(struct fault_env *fe, pte_t orig_pte)
*/
if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
(VM_WRITE|VM_SHARED))
- return wp_pfn_shared(fe, orig_pte);
+ return wp_pfn_shared(vmf);
- pte_unmap_unlock(fe->pte, fe->ptl);
- return wp_page_copy(fe, orig_pte, old_page);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ return wp_page_copy(vmf);
}
/*
* Take out anonymous pages first, anonymous shared vmas are
* not dirty accountable.
*/
- if (PageAnon(old_page) && !PageKsm(old_page)) {
+ if (PageAnon(vmf->page) && !PageKsm(vmf->page)) {
int total_mapcount;
- if (!trylock_page(old_page)) {
- get_page(old_page);
- pte_unmap_unlock(fe->pte, fe->ptl);
- lock_page(old_page);
- fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd,
- fe->address, &fe->ptl);
- if (!pte_same(*fe->pte, orig_pte)) {
- unlock_page(old_page);
- pte_unmap_unlock(fe->pte, fe->ptl);
- put_page(old_page);
+ if (!trylock_page(vmf->page)) {
+ get_page(vmf->page);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ lock_page(vmf->page);
+ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
+ vmf->address, &vmf->ptl);
+ if (!pte_same(*vmf->pte, vmf->orig_pte)) {
+ unlock_page(vmf->page);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ put_page(vmf->page);
return 0;
}
- put_page(old_page);
+ put_page(vmf->page);
}
- if (reuse_swap_page(old_page, &total_mapcount)) {
+ if (reuse_swap_page(vmf->page, &total_mapcount)) {
if (total_mapcount == 1) {
/*
* The page is all ours. Move it to
@@ -2408,24 +2425,25 @@ static int do_wp_page(struct fault_env *fe, pte_t orig_pte)
* Protected against the rmap code by
* the page lock.
*/
- page_move_anon_rmap(old_page, vma);
+ page_move_anon_rmap(vmf->page, vma);
}
- unlock_page(old_page);
- return wp_page_reuse(fe, orig_pte, old_page, 0, 0);
+ unlock_page(vmf->page);
+ wp_page_reuse(vmf);
+ return VM_FAULT_WRITE;
}
- unlock_page(old_page);
+ unlock_page(vmf->page);
} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
(VM_WRITE|VM_SHARED))) {
- return wp_page_shared(fe, orig_pte, old_page);
+ return wp_page_shared(vmf);
}
/*
* Ok, we need to copy. Oh, well..
*/
- get_page(old_page);
+ get_page(vmf->page);
- pte_unmap_unlock(fe->pte, fe->ptl);
- return wp_page_copy(fe, orig_pte, old_page);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ return wp_page_copy(vmf);
}
static void unmap_mapping_range_vma(struct vm_area_struct *vma,
@@ -2513,9 +2531,9 @@ EXPORT_SYMBOL(unmap_mapping_range);
* We return with the mmap_sem locked or unlocked in the same cases
* as does filemap_fault().
*/
-int do_swap_page(struct fault_env *fe, pte_t orig_pte)
+int do_swap_page(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
+ struct vm_area_struct *vma = vmf->vma;
struct page *page, *swapcache;
struct mem_cgroup *memcg;
swp_entry_t entry;
@@ -2524,17 +2542,18 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
int exclusive = 0;
int ret = 0;
- if (!pte_unmap_same(vma->vm_mm, fe->pmd, fe->pte, orig_pte))
+ if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte))
goto out;
- entry = pte_to_swp_entry(orig_pte);
+ entry = pte_to_swp_entry(vmf->orig_pte);
if (unlikely(non_swap_entry(entry))) {
if (is_migration_entry(entry)) {
- migration_entry_wait(vma->vm_mm, fe->pmd, fe->address);
+ migration_entry_wait(vma->vm_mm, vmf->pmd,
+ vmf->address);
} else if (is_hwpoison_entry(entry)) {
ret = VM_FAULT_HWPOISON;
} else {
- print_bad_pte(vma, fe->address, orig_pte, NULL);
+ print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL);
ret = VM_FAULT_SIGBUS;
}
goto out;
@@ -2542,16 +2561,16 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
delayacct_set_flag(DELAYACCT_PF_SWAPIN);
page = lookup_swap_cache(entry);
if (!page) {
- page = swapin_readahead(entry,
- GFP_HIGHUSER_MOVABLE, vma, fe->address);
+ page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vma,
+ vmf->address);
if (!page) {
/*
* Back out if somebody else faulted in this pte
* while we released the pte lock.
*/
- fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd,
- fe->address, &fe->ptl);
- if (likely(pte_same(*fe->pte, orig_pte)))
+ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
+ vmf->address, &vmf->ptl);
+ if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
ret = VM_FAULT_OOM;
delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
goto unlock;
@@ -2573,7 +2592,7 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
}
swapcache = page;
- locked = lock_page_or_retry(page, vma->vm_mm, fe->flags);
+ locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags);
delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
if (!locked) {
@@ -2590,7 +2609,7 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val))
goto out_page;
- page = ksm_might_need_to_copy(page, vma, fe->address);
+ page = ksm_might_need_to_copy(page, vma, vmf->address);
if (unlikely(!page)) {
ret = VM_FAULT_OOM;
page = swapcache;
@@ -2606,9 +2625,9 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
/*
* Back out if somebody else already faulted in this pte.
*/
- fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
- &fe->ptl);
- if (unlikely(!pte_same(*fe->pte, orig_pte)))
+ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
+ &vmf->ptl);
+ if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte)))
goto out_nomap;
if (unlikely(!PageUptodate(page))) {
@@ -2629,22 +2648,23 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
pte = mk_pte(page, vma->vm_page_prot);
- if ((fe->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
+ if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
- fe->flags &= ~FAULT_FLAG_WRITE;
+ vmf->flags &= ~FAULT_FLAG_WRITE;
ret |= VM_FAULT_WRITE;
exclusive = RMAP_EXCLUSIVE;
}
flush_icache_page(vma, page);
- if (pte_swp_soft_dirty(orig_pte))
+ if (pte_swp_soft_dirty(vmf->orig_pte))
pte = pte_mksoft_dirty(pte);
- set_pte_at(vma->vm_mm, fe->address, fe->pte, pte);
+ set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
+ vmf->orig_pte = pte;
if (page == swapcache) {
- do_page_add_anon_rmap(page, vma, fe->address, exclusive);
+ do_page_add_anon_rmap(page, vma, vmf->address, exclusive);
mem_cgroup_commit_charge(page, memcg, true, false);
activate_page(page);
} else { /* ksm created a completely new copy */
- page_add_new_anon_rmap(page, vma, fe->address, false);
+ page_add_new_anon_rmap(page, vma, vmf->address, false);
mem_cgroup_commit_charge(page, memcg, false, false);
lru_cache_add_active_or_unevictable(page, vma);
}
@@ -2667,22 +2687,22 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
put_page(swapcache);
}
- if (fe->flags & FAULT_FLAG_WRITE) {
- ret |= do_wp_page(fe, pte);
+ if (vmf->flags & FAULT_FLAG_WRITE) {
+ ret |= do_wp_page(vmf);
if (ret & VM_FAULT_ERROR)
ret &= VM_FAULT_ERROR;
goto out;
}
/* No need to invalidate - it was non-present before */
- update_mmu_cache(vma, fe->address, fe->pte);
+ update_mmu_cache(vma, vmf->address, vmf->pte);
unlock:
- pte_unmap_unlock(fe->pte, fe->ptl);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
return ret;
out_nomap:
mem_cgroup_cancel_charge(page, memcg, false);
- pte_unmap_unlock(fe->pte, fe->ptl);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
out_page:
unlock_page(page);
out_release:
@@ -2733,9 +2753,9 @@ static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned lo
* but allow concurrent faults), and pte mapped but not yet locked.
* We return with mmap_sem still held, but pte unmapped and unlocked.
*/
-static int do_anonymous_page(struct fault_env *fe)
+static int do_anonymous_page(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
+ struct vm_area_struct *vma = vmf->vma;
struct mem_cgroup *memcg;
struct page *page;
pte_t entry;
@@ -2745,7 +2765,7 @@ static int do_anonymous_page(struct fault_env *fe)
return VM_FAULT_SIGBUS;
/* Check if we need to add a guard page to the stack */
- if (check_stack_guard_page(vma, fe->address) < 0)
+ if (check_stack_guard_page(vma, vmf->address) < 0)
return VM_FAULT_SIGSEGV;
/*
@@ -2758,26 +2778,26 @@ static int do_anonymous_page(struct fault_env *fe)
*
* Here we only have down_read(mmap_sem).
*/
- if (pte_alloc(vma->vm_mm, fe->pmd, fe->address))
+ if (pte_alloc(vma->vm_mm, vmf->pmd, vmf->address))
return VM_FAULT_OOM;
/* See the comment in pte_alloc_one_map() */
- if (unlikely(pmd_trans_unstable(fe->pmd)))
+ if (unlikely(pmd_trans_unstable(vmf->pmd)))
return 0;
/* Use the zero-page for reads */
- if (!(fe->flags & FAULT_FLAG_WRITE) &&
+ if (!(vmf->flags & FAULT_FLAG_WRITE) &&
!mm_forbids_zeropage(vma->vm_mm)) {
- entry = pte_mkspecial(pfn_pte(my_zero_pfn(fe->address),
+ entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address),
vma->vm_page_prot));
- fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
- &fe->ptl);
- if (!pte_none(*fe->pte))
+ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
+ vmf->address, &vmf->ptl);
+ if (!pte_none(*vmf->pte))
goto unlock;
/* Deliver the page fault to userland, check inside PT lock */
if (userfaultfd_missing(vma)) {
- pte_unmap_unlock(fe->pte, fe->ptl);
- return handle_userfault(fe, VM_UFFD_MISSING);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ return handle_userfault(vmf, VM_UFFD_MISSING);
}
goto setpte;
}
@@ -2785,7 +2805,7 @@ static int do_anonymous_page(struct fault_env *fe)
/* Allocate our own private page. */
if (unlikely(anon_vma_prepare(vma)))
goto oom;
- page = alloc_zeroed_user_highpage_movable(vma, fe->address);
+ page = alloc_zeroed_user_highpage_movable(vma, vmf->address);
if (!page)
goto oom;
@@ -2803,30 +2823,30 @@ static int do_anonymous_page(struct fault_env *fe)
if (vma->vm_flags & VM_WRITE)
entry = pte_mkwrite(pte_mkdirty(entry));
- fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
- &fe->ptl);
- if (!pte_none(*fe->pte))
+ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
+ &vmf->ptl);
+ if (!pte_none(*vmf->pte))
goto release;
/* Deliver the page fault to userland, check inside PT lock */
if (userfaultfd_missing(vma)) {
- pte_unmap_unlock(fe->pte, fe->ptl);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
mem_cgroup_cancel_charge(page, memcg, false);
put_page(page);
- return handle_userfault(fe, VM_UFFD_MISSING);
+ return handle_userfault(vmf, VM_UFFD_MISSING);
}
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
- page_add_new_anon_rmap(page, vma, fe->address, false);
+ page_add_new_anon_rmap(page, vma, vmf->address, false);
mem_cgroup_commit_charge(page, memcg, false, false);
lru_cache_add_active_or_unevictable(page, vma);
setpte:
- set_pte_at(vma->vm_mm, fe->address, fe->pte, entry);
+ set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
/* No need to invalidate - it was non-present before */
- update_mmu_cache(vma, fe->address, fe->pte);
+ update_mmu_cache(vma, vmf->address, vmf->pte);
unlock:
- pte_unmap_unlock(fe->pte, fe->ptl);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
return 0;
release:
mem_cgroup_cancel_charge(page, memcg, false);
@@ -2843,62 +2863,50 @@ oom:
* released depending on flags and vma->vm_ops->fault() return value.
* See filemap_fault() and __lock_page_retry().
*/
-static int __do_fault(struct fault_env *fe, pgoff_t pgoff,
- struct page *cow_page, struct page **page, void **entry)
+static int __do_fault(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
- struct vm_fault vmf;
+ struct vm_area_struct *vma = vmf->vma;
int ret;
- vmf.virtual_address = (void __user *)(fe->address & PAGE_MASK);
- vmf.pgoff = pgoff;
- vmf.flags = fe->flags;
- vmf.page = NULL;
- vmf.gfp_mask = __get_fault_gfp_mask(vma);
- vmf.cow_page = cow_page;
-
- ret = vma->vm_ops->fault(vma, &vmf);
- if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
- return ret;
- if (ret & VM_FAULT_DAX_LOCKED) {
- *entry = vmf.entry;
+ ret = vma->vm_ops->fault(vma, vmf);
+ if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
+ VM_FAULT_DONE_COW)))
return ret;
- }
- if (unlikely(PageHWPoison(vmf.page))) {
+ if (unlikely(PageHWPoison(vmf->page))) {
if (ret & VM_FAULT_LOCKED)
- unlock_page(vmf.page);
- put_page(vmf.page);
+ unlock_page(vmf->page);
+ put_page(vmf->page);
+ vmf->page = NULL;
return VM_FAULT_HWPOISON;
}
if (unlikely(!(ret & VM_FAULT_LOCKED)))
- lock_page(vmf.page);
+ lock_page(vmf->page);
else
- VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page);
+ VM_BUG_ON_PAGE(!PageLocked(vmf->page), vmf->page);
- *page = vmf.page;
return ret;
}
-static int pte_alloc_one_map(struct fault_env *fe)
+static int pte_alloc_one_map(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
+ struct vm_area_struct *vma = vmf->vma;
- if (!pmd_none(*fe->pmd))
+ if (!pmd_none(*vmf->pmd))
goto map_pte;
- if (fe->prealloc_pte) {
- fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
- if (unlikely(!pmd_none(*fe->pmd))) {
- spin_unlock(fe->ptl);
+ if (vmf->prealloc_pte) {
+ vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+ if (unlikely(!pmd_none(*vmf->pmd))) {
+ spin_unlock(vmf->ptl);
goto map_pte;
}
atomic_long_inc(&vma->vm_mm->nr_ptes);
- pmd_populate(vma->vm_mm, fe->pmd, fe->prealloc_pte);
- spin_unlock(fe->ptl);
- fe->prealloc_pte = 0;
- } else if (unlikely(pte_alloc(vma->vm_mm, fe->pmd, fe->address))) {
+ pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
+ spin_unlock(vmf->ptl);
+ vmf->prealloc_pte = 0;
+ } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd, vmf->address))) {
return VM_FAULT_OOM;
}
map_pte:
@@ -2913,11 +2921,11 @@ map_pte:
* through an atomic read in C, which is what pmd_trans_unstable()
* provides.
*/
- if (pmd_trans_unstable(fe->pmd) || pmd_devmap(*fe->pmd))
+ if (pmd_trans_unstable(vmf->pmd) || pmd_devmap(*vmf->pmd))
return VM_FAULT_NOPAGE;
- fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
- &fe->ptl);
+ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
+ &vmf->ptl);
return 0;
}
@@ -2935,24 +2943,24 @@ static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
return true;
}
-static void deposit_prealloc_pte(struct fault_env *fe)
+static void deposit_prealloc_pte(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
+ struct vm_area_struct *vma = vmf->vma;
- pgtable_trans_huge_deposit(vma->vm_mm, fe->pmd, fe->prealloc_pte);
+ pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
/*
* We are going to consume the prealloc table,
* count that as nr_ptes.
*/
atomic_long_inc(&vma->vm_mm->nr_ptes);
- fe->prealloc_pte = 0;
+ vmf->prealloc_pte = 0;
}
-static int do_set_pmd(struct fault_env *fe, struct page *page)
+static int do_set_pmd(struct vm_fault *vmf, struct page *page)
{
- struct vm_area_struct *vma = fe->vma;
- bool write = fe->flags & FAULT_FLAG_WRITE;
- unsigned long haddr = fe->address & HPAGE_PMD_MASK;
+ struct vm_area_struct *vma = vmf->vma;
+ bool write = vmf->flags & FAULT_FLAG_WRITE;
+ unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
pmd_t entry;
int i, ret;
@@ -2966,15 +2974,15 @@ static int do_set_pmd(struct fault_env *fe, struct page *page)
* Archs like ppc64 need additonal space to store information
* related to pte entry. Use the preallocated table for that.
*/
- if (arch_needs_pgtable_deposit() && !fe->prealloc_pte) {
- fe->prealloc_pte = pte_alloc_one(vma->vm_mm, fe->address);
- if (!fe->prealloc_pte)
+ if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) {
+ vmf->prealloc_pte = pte_alloc_one(vma->vm_mm, vmf->address);
+ if (!vmf->prealloc_pte)
return VM_FAULT_OOM;
smp_wmb(); /* See comment in __pte_alloc() */
}
- fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
- if (unlikely(!pmd_none(*fe->pmd)))
+ vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+ if (unlikely(!pmd_none(*vmf->pmd)))
goto out;
for (i = 0; i < HPAGE_PMD_NR; i++)
@@ -2990,11 +2998,11 @@ static int do_set_pmd(struct fault_env *fe, struct page *page)
* deposit and withdraw with pmd lock held
*/
if (arch_needs_pgtable_deposit())
- deposit_prealloc_pte(fe);
+ deposit_prealloc_pte(vmf);
- set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry);
+ set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
- update_mmu_cache_pmd(vma, haddr, fe->pmd);
+ update_mmu_cache_pmd(vma, haddr, vmf->pmd);
/* fault is handled */
ret = 0;
@@ -3005,13 +3013,13 @@ out:
* withdraw with pmd lock held.
*/
if (arch_needs_pgtable_deposit() && ret == VM_FAULT_FALLBACK)
- fe->prealloc_pte = pgtable_trans_huge_withdraw(vma->vm_mm,
- fe->pmd);
- spin_unlock(fe->ptl);
+ vmf->prealloc_pte = pgtable_trans_huge_withdraw(vma->vm_mm,
+ vmf->pmd);
+ spin_unlock(vmf->ptl);
return ret;
}
#else
-static int do_set_pmd(struct fault_env *fe, struct page *page)
+static int do_set_pmd(struct vm_fault *vmf, struct page *page)
{
BUILD_BUG();
return 0;
@@ -3022,41 +3030,42 @@ static int do_set_pmd(struct fault_env *fe, struct page *page)
* alloc_set_pte - setup new PTE entry for given page and add reverse page
* mapping. If needed, the fucntion allocates page table or use pre-allocated.
*
- * @fe: fault environment
+ * @vmf: fault environment
* @memcg: memcg to charge page (only for private mappings)
* @page: page to map
*
- * Caller must take care of unlocking fe->ptl, if fe->pte is non-NULL on return.
+ * Caller must take care of unlocking vmf->ptl, if vmf->pte is non-NULL on
+ * return.
*
* Target users are page handler itself and implementations of
* vm_ops->map_pages.
*/
-int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg,
+int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
struct page *page)
{
- struct vm_area_struct *vma = fe->vma;
- bool write = fe->flags & FAULT_FLAG_WRITE;
+ struct vm_area_struct *vma = vmf->vma;
+ bool write = vmf->flags & FAULT_FLAG_WRITE;
pte_t entry;
int ret;
- if (pmd_none(*fe->pmd) && PageTransCompound(page) &&
+ if (pmd_none(*vmf->pmd) && PageTransCompound(page) &&
IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) {
/* THP on COW? */
VM_BUG_ON_PAGE(memcg, page);
- ret = do_set_pmd(fe, page);
+ ret = do_set_pmd(vmf, page);
if (ret != VM_FAULT_FALLBACK)
goto fault_handled;
}
- if (!fe->pte) {
- ret = pte_alloc_one_map(fe);
+ if (!vmf->pte) {
+ ret = pte_alloc_one_map(vmf);
if (ret)
goto fault_handled;
}
/* Re-check under ptl */
- if (unlikely(!pte_none(*fe->pte))) {
+ if (unlikely(!pte_none(*vmf->pte))) {
ret = VM_FAULT_NOPAGE;
goto fault_handled;
}
@@ -3068,28 +3077,60 @@ int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg,
/* copy-on-write page */
if (write && !(vma->vm_flags & VM_SHARED)) {
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
- page_add_new_anon_rmap(page, vma, fe->address, false);
+ page_add_new_anon_rmap(page, vma, vmf->address, false);
mem_cgroup_commit_charge(page, memcg, false, false);
lru_cache_add_active_or_unevictable(page, vma);
} else {
inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
page_add_file_rmap(page, false);
}
- set_pte_at(vma->vm_mm, fe->address, fe->pte, entry);
+ set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
/* no need to invalidate: a not-present page won't be cached */
- update_mmu_cache(vma, fe->address, fe->pte);
+ update_mmu_cache(vma, vmf->address, vmf->pte);
ret = 0;
fault_handled:
/* preallocated pagetable is unused: free it */
- if (fe->prealloc_pte) {
- pte_free(fe->vma->vm_mm, fe->prealloc_pte);
- fe->prealloc_pte = 0;
+ if (vmf->prealloc_pte) {
+ pte_free(vmf->vma->vm_mm, vmf->prealloc_pte);
+ vmf->prealloc_pte = 0;
}
return ret;
}
+
+/**
+ * finish_fault - finish page fault once we have prepared the page to fault
+ *
+ * @vmf: structure describing the fault
+ *
+ * This function handles all that is needed to finish a page fault once the
+ * page to fault in is prepared. It handles locking of PTEs, inserts PTE for
+ * given page, adds reverse page mapping, handles memcg charges and LRU
+ * addition. The function returns 0 on success, VM_FAULT_ code in case of
+ * error.
+ *
+ * The function expects the page to be locked and on success it consumes a
+ * reference of a page being mapped (for the PTE which maps it).
+ */
+int finish_fault(struct vm_fault *vmf)
+{
+ struct page *page;
+ int ret;
+
+ /* Did we COW the page? */
+ if ((vmf->flags & FAULT_FLAG_WRITE) &&
+ !(vmf->vma->vm_flags & VM_SHARED))
+ page = vmf->cow_page;
+ else
+ page = vmf->page;
+ ret = alloc_set_pte(vmf, vmf->memcg, page);
+ if (vmf->pte)
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ return ret;
+}
+
static unsigned long fault_around_bytes __read_mostly =
rounddown_pow_of_two(65536);
@@ -3154,17 +3195,18 @@ late_initcall(fault_around_debugfs);
* fault_around_pages() value (and therefore to page order). This way it's
* easier to guarantee that we don't cross page table boundaries.
*/
-static int do_fault_around(struct fault_env *fe, pgoff_t start_pgoff)
+static int do_fault_around(struct vm_fault *vmf)
{
- unsigned long address = fe->address, nr_pages, mask;
+ unsigned long address = vmf->address, nr_pages, mask;
+ pgoff_t start_pgoff = vmf->pgoff;
pgoff_t end_pgoff;
int off, ret = 0;
nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
- fe->address = max(address & mask, fe->vma->vm_start);
- off = ((address - fe->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
+ vmf->address = max(address & mask, vmf->vma->vm_start);
+ off = ((address - vmf->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
start_pgoff -= off;
/*
@@ -3172,45 +3214,45 @@ static int do_fault_around(struct fault_env *fe, pgoff_t start_pgoff)
* or fault_around_pages() from start_pgoff, depending what is nearest.
*/
end_pgoff = start_pgoff -
- ((fe->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
+ ((vmf->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
PTRS_PER_PTE - 1;
- end_pgoff = min3(end_pgoff, vma_pages(fe->vma) + fe->vma->vm_pgoff - 1,
+ end_pgoff = min3(end_pgoff, vma_pages(vmf->vma) + vmf->vma->vm_pgoff - 1,
start_pgoff + nr_pages - 1);
- if (pmd_none(*fe->pmd)) {
- fe->prealloc_pte = pte_alloc_one(fe->vma->vm_mm, fe->address);
- if (!fe->prealloc_pte)
+ if (pmd_none(*vmf->pmd)) {
+ vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm,
+ vmf->address);
+ if (!vmf->prealloc_pte)
goto out;
smp_wmb(); /* See comment in __pte_alloc() */
}
- fe->vma->vm_ops->map_pages(fe, start_pgoff, end_pgoff);
+ vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff);
/* Huge page is mapped? Page fault is solved */
- if (pmd_trans_huge(*fe->pmd)) {
+ if (pmd_trans_huge(*vmf->pmd)) {
ret = VM_FAULT_NOPAGE;
goto out;
}
/* ->map_pages() haven't done anything useful. Cold page cache? */
- if (!fe->pte)
+ if (!vmf->pte)
goto out;
/* check if the page fault is solved */
- fe->pte -= (fe->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT);
- if (!pte_none(*fe->pte))
+ vmf->pte -= (vmf->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT);
+ if (!pte_none(*vmf->pte))
ret = VM_FAULT_NOPAGE;
- pte_unmap_unlock(fe->pte, fe->ptl);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
- fe->address = address;
- fe->pte = NULL;
+ vmf->address = address;
+ vmf->pte = NULL;
return ret;
}
-static int do_read_fault(struct fault_env *fe, pgoff_t pgoff)
+static int do_read_fault(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
- struct page *fault_page;
+ struct vm_area_struct *vma = vmf->vma;
int ret = 0;
/*
@@ -3219,80 +3261,67 @@ static int do_read_fault(struct fault_env *fe, pgoff_t pgoff)
* something).
*/
if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
- ret = do_fault_around(fe, pgoff);
+ ret = do_fault_around(vmf);
if (ret)
return ret;
}
- ret = __do_fault(fe, pgoff, NULL, &fault_page, NULL);
+ ret = __do_fault(vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
return ret;
- ret |= alloc_set_pte(fe, NULL, fault_page);
- if (fe->pte)
- pte_unmap_unlock(fe->pte, fe->ptl);
- unlock_page(fault_page);
+ ret |= finish_fault(vmf);
+ unlock_page(vmf->page);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
- put_page(fault_page);
+ put_page(vmf->page);
return ret;
}
-static int do_cow_fault(struct fault_env *fe, pgoff_t pgoff)
+static int do_cow_fault(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
- struct page *fault_page, *new_page;
- void *fault_entry;
- struct mem_cgroup *memcg;
+ struct vm_area_struct *vma = vmf->vma;
int ret;
if (unlikely(anon_vma_prepare(vma)))
return VM_FAULT_OOM;
- new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, fe->address);
- if (!new_page)
+ vmf->cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address);
+ if (!vmf->cow_page)
return VM_FAULT_OOM;
- if (mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL,
- &memcg, false)) {
- put_page(new_page);
+ if (mem_cgroup_try_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL,
+ &vmf->memcg, false)) {
+ put_page(vmf->cow_page);
return VM_FAULT_OOM;
}
- ret = __do_fault(fe, pgoff, new_page, &fault_page, &fault_entry);
+ ret = __do_fault(vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
goto uncharge_out;
+ if (ret & VM_FAULT_DONE_COW)
+ return ret;
- if (!(ret & VM_FAULT_DAX_LOCKED))
- copy_user_highpage(new_page, fault_page, fe->address, vma);
- __SetPageUptodate(new_page);
+ copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma);
+ __SetPageUptodate(vmf->cow_page);
- ret |= alloc_set_pte(fe, memcg, new_page);
- if (fe->pte)
- pte_unmap_unlock(fe->pte, fe->ptl);
- if (!(ret & VM_FAULT_DAX_LOCKED)) {
- unlock_page(fault_page);
- put_page(fault_page);
- } else {
- dax_unlock_mapping_entry(vma->vm_file->f_mapping, pgoff);
- }
+ ret |= finish_fault(vmf);
+ unlock_page(vmf->page);
+ put_page(vmf->page);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
goto uncharge_out;
return ret;
uncharge_out:
- mem_cgroup_cancel_charge(new_page, memcg, false);
- put_page(new_page);
+ mem_cgroup_cancel_charge(vmf->cow_page, vmf->memcg, false);
+ put_page(vmf->cow_page);
return ret;
}
-static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff)
+static int do_shared_fault(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
- struct page *fault_page;
- struct address_space *mapping;
- int dirtied = 0;
+ struct vm_area_struct *vma = vmf->vma;
int ret, tmp;
- ret = __do_fault(fe, pgoff, NULL, &fault_page, NULL);
+ ret = __do_fault(vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
return ret;
@@ -3301,46 +3330,24 @@ static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff)
* about to become writable
*/
if (vma->vm_ops->page_mkwrite) {
- unlock_page(fault_page);
- tmp = do_page_mkwrite(vma, fault_page, fe->address);
+ unlock_page(vmf->page);
+ tmp = do_page_mkwrite(vmf);
if (unlikely(!tmp ||
(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
- put_page(fault_page);
+ put_page(vmf->page);
return tmp;
}
}
- ret |= alloc_set_pte(fe, NULL, fault_page);
- if (fe->pte)
- pte_unmap_unlock(fe->pte, fe->ptl);
+ ret |= finish_fault(vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
VM_FAULT_RETRY))) {
- unlock_page(fault_page);
- put_page(fault_page);
+ unlock_page(vmf->page);
+ put_page(vmf->page);
return ret;
}
- if (set_page_dirty(fault_page))
- dirtied = 1;
- /*
- * Take a local copy of the address_space - page.mapping may be zeroed
- * by truncate after unlock_page(). The address_space itself remains
- * pinned by vma->vm_file's reference. We rely on unlock_page()'s
- * release semantics to prevent the compiler from undoing this copying.
- */
- mapping = page_rmapping(fault_page);
- unlock_page(fault_page);
- if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) {
- /*
- * Some device drivers do not set page.mapping but still
- * dirty their pages
- */
- balance_dirty_pages_ratelimited(mapping);
- }
-
- if (!vma->vm_ops->page_mkwrite)
- file_update_time(vma->vm_file);
-
+ fault_dirty_shared_page(vma, vmf->page);
return ret;
}
@@ -3350,19 +3357,18 @@ static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff)
* The mmap_sem may have been released depending on flags and our
* return value. See filemap_fault() and __lock_page_or_retry().
*/
-static int do_fault(struct fault_env *fe)
+static int do_fault(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
- pgoff_t pgoff = linear_page_index(vma, fe->address);
+ struct vm_area_struct *vma = vmf->vma;
/* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */
if (!vma->vm_ops->fault)
return VM_FAULT_SIGBUS;
- if (!(fe->flags & FAULT_FLAG_WRITE))
- return do_read_fault(fe, pgoff);
+ if (!(vmf->flags & FAULT_FLAG_WRITE))
+ return do_read_fault(vmf);
if (!(vma->vm_flags & VM_SHARED))
- return do_cow_fault(fe, pgoff);
- return do_shared_fault(fe, pgoff);
+ return do_cow_fault(vmf);
+ return do_shared_fault(vmf);
}
static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
@@ -3380,14 +3386,15 @@ static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
return mpol_misplaced(page, vma, addr);
}
-static int do_numa_page(struct fault_env *fe, pte_t pte)
+static int do_numa_page(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
+ struct vm_area_struct *vma = vmf->vma;
struct page *page = NULL;
int page_nid = -1;
int last_cpupid;
int target_nid;
bool migrated = false;
+ pte_t pte = vmf->orig_pte;
bool was_writable = pte_write(pte);
int flags = 0;
@@ -3400,10 +3407,10 @@ static int do_numa_page(struct fault_env *fe, pte_t pte)
* page table entry is not accessible, so there would be no
* concurrent hardware modifications to the PTE.
*/
- fe->ptl = pte_lockptr(vma->vm_mm, fe->pmd);
- spin_lock(fe->ptl);
- if (unlikely(!pte_same(*fe->pte, pte))) {
- pte_unmap_unlock(fe->pte, fe->ptl);
+ vmf->ptl = pte_lockptr(vma->vm_mm, vmf->pmd);
+ spin_lock(vmf->ptl);
+ if (unlikely(!pte_same(*vmf->pte, pte))) {
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
goto out;
}
@@ -3412,18 +3419,18 @@ static int do_numa_page(struct fault_env *fe, pte_t pte)
pte = pte_mkyoung(pte);
if (was_writable)
pte = pte_mkwrite(pte);
- set_pte_at(vma->vm_mm, fe->address, fe->pte, pte);
- update_mmu_cache(vma, fe->address, fe->pte);
+ set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
+ update_mmu_cache(vma, vmf->address, vmf->pte);
- page = vm_normal_page(vma, fe->address, pte);
+ page = vm_normal_page(vma, vmf->address, pte);
if (!page) {
- pte_unmap_unlock(fe->pte, fe->ptl);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
return 0;
}
/* TODO: handle PTE-mapped THP */
if (PageCompound(page)) {
- pte_unmap_unlock(fe->pte, fe->ptl);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
return 0;
}
@@ -3447,9 +3454,9 @@ static int do_numa_page(struct fault_env *fe, pte_t pte)
last_cpupid = page_cpupid_last(page);
page_nid = page_to_nid(page);
- target_nid = numa_migrate_prep(page, vma, fe->address, page_nid,
+ target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid,
&flags);
- pte_unmap_unlock(fe->pte, fe->ptl);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
if (target_nid == -1) {
put_page(page);
goto out;
@@ -3469,28 +3476,28 @@ out:
return 0;
}
-static int create_huge_pmd(struct fault_env *fe)
+static int create_huge_pmd(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
+ struct vm_area_struct *vma = vmf->vma;
if (vma_is_anonymous(vma))
- return do_huge_pmd_anonymous_page(fe);
+ return do_huge_pmd_anonymous_page(vmf);
if (vma->vm_ops->pmd_fault)
- return vma->vm_ops->pmd_fault(vma, fe->address, fe->pmd,
- fe->flags);
+ return vma->vm_ops->pmd_fault(vma, vmf->address, vmf->pmd,
+ vmf->flags);
return VM_FAULT_FALLBACK;
}
-static int wp_huge_pmd(struct fault_env *fe, pmd_t orig_pmd)
+static int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
{
- if (vma_is_anonymous(fe->vma))
- return do_huge_pmd_wp_page(fe, orig_pmd);
- if (fe->vma->vm_ops->pmd_fault)
- return fe->vma->vm_ops->pmd_fault(fe->vma, fe->address, fe->pmd,
- fe->flags);
+ if (vma_is_anonymous(vmf->vma))
+ return do_huge_pmd_wp_page(vmf, orig_pmd);
+ if (vmf->vma->vm_ops->pmd_fault)
+ return vmf->vma->vm_ops->pmd_fault(vmf->vma, vmf->address,
+ vmf->pmd, vmf->flags);
/* COW handled on pte level: split pmd */
- VM_BUG_ON_VMA(fe->vma->vm_flags & VM_SHARED, fe->vma);
- __split_huge_pmd(fe->vma, fe->pmd, fe->address, false, NULL);
+ VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma);
+ __split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL);
return VM_FAULT_FALLBACK;
}
@@ -3515,21 +3522,21 @@ static inline bool vma_is_accessible(struct vm_area_struct *vma)
* The mmap_sem may have been released depending on flags and our return value.
* See filemap_fault() and __lock_page_or_retry().
*/
-static int handle_pte_fault(struct fault_env *fe)
+static int handle_pte_fault(struct vm_fault *vmf)
{
pte_t entry;
- if (unlikely(pmd_none(*fe->pmd))) {
+ if (unlikely(pmd_none(*vmf->pmd))) {
/*
* Leave __pte_alloc() until later: because vm_ops->fault may
* want to allocate huge page, and if we expose page table
* for an instant, it will be difficult to retract from
* concurrent faults and from rmap lookups.
*/
- fe->pte = NULL;
+ vmf->pte = NULL;
} else {
/* See comment in pte_alloc_one_map() */
- if (pmd_trans_unstable(fe->pmd) || pmd_devmap(*fe->pmd))
+ if (pmd_trans_unstable(vmf->pmd) || pmd_devmap(*vmf->pmd))
return 0;
/*
* A regular pmd is established and it can't morph into a huge
@@ -3537,9 +3544,8 @@ static int handle_pte_fault(struct fault_env *fe)
* mmap_sem read mode and khugepaged takes it in write mode.
* So now it's safe to run pte_offset_map().
*/
- fe->pte = pte_offset_map(fe->pmd, fe->address);
-
- entry = *fe->pte;
+ vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
+ vmf->orig_pte = *vmf->pte;
/*
* some architectures can have larger ptes than wordsize,
@@ -3550,38 +3556,39 @@ static int handle_pte_fault(struct fault_env *fe)
* ptl lock held. So here a barrier will do.
*/
barrier();
- if (pte_none(entry)) {
- pte_unmap(fe->pte);
- fe->pte = NULL;
+ if (pte_none(vmf->orig_pte)) {
+ pte_unmap(vmf->pte);
+ vmf->pte = NULL;
}
}
- if (!fe->pte) {
- if (vma_is_anonymous(fe->vma))
- return do_anonymous_page(fe);
+ if (!vmf->pte) {
+ if (vma_is_anonymous(vmf->vma))
+ return do_anonymous_page(vmf);
else
- return do_fault(fe);
+ return do_fault(vmf);
}
- if (!pte_present(entry))
- return do_swap_page(fe, entry);
+ if (!pte_present(vmf->orig_pte))
+ return do_swap_page(vmf);
- if (pte_protnone(entry) && vma_is_accessible(fe->vma))
- return do_numa_page(fe, entry);
+ if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
+ return do_numa_page(vmf);
- fe->ptl = pte_lockptr(fe->vma->vm_mm, fe->pmd);
- spin_lock(fe->ptl);
- if (unlikely(!pte_same(*fe->pte, entry)))
+ vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
+ spin_lock(vmf->ptl);
+ entry = vmf->orig_pte;
+ if (unlikely(!pte_same(*vmf->pte, entry)))
goto unlock;
- if (fe->flags & FAULT_FLAG_WRITE) {
+ if (vmf->flags & FAULT_FLAG_WRITE) {
if (!pte_write(entry))
- return do_wp_page(fe, entry);
+ return do_wp_page(vmf);
entry = pte_mkdirty(entry);
}
entry = pte_mkyoung(entry);
- if (ptep_set_access_flags(fe->vma, fe->address, fe->pte, entry,
- fe->flags & FAULT_FLAG_WRITE)) {
- update_mmu_cache(fe->vma, fe->address, fe->pte);
+ if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
+ vmf->flags & FAULT_FLAG_WRITE)) {
+ update_mmu_cache(vmf->vma, vmf->address, vmf->pte);
} else {
/*
* This is needed only for protection faults but the arch code
@@ -3589,11 +3596,11 @@ static int handle_pte_fault(struct fault_env *fe)
* This still avoids useless tlb flushes for .text page faults
* with threads.
*/
- if (fe->flags & FAULT_FLAG_WRITE)
- flush_tlb_fix_spurious_fault(fe->vma, fe->address);
+ if (vmf->flags & FAULT_FLAG_WRITE)
+ flush_tlb_fix_spurious_fault(vmf->vma, vmf->address);
}
unlock:
- pte_unmap_unlock(fe->pte, fe->ptl);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
return 0;
}
@@ -3606,10 +3613,12 @@ unlock:
static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
unsigned int flags)
{
- struct fault_env fe = {
+ struct vm_fault vmf = {
.vma = vma,
- .address = address,
+ .address = address & PAGE_MASK,
.flags = flags,
+ .pgoff = linear_page_index(vma, address),
+ .gfp_mask = __get_fault_gfp_mask(vma),
};
struct mm_struct *mm = vma->vm_mm;
pgd_t *pgd;
@@ -3619,35 +3628,35 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
pud = pud_alloc(mm, pgd, address);
if (!pud)
return VM_FAULT_OOM;
- fe.pmd = pmd_alloc(mm, pud, address);
- if (!fe.pmd)
+ vmf.pmd = pmd_alloc(mm, pud, address);
+ if (!vmf.pmd)
return VM_FAULT_OOM;
- if (pmd_none(*fe.pmd) && transparent_hugepage_enabled(vma)) {
- int ret = create_huge_pmd(&fe);
+ if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) {
+ int ret = create_huge_pmd(&vmf);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
} else {
- pmd_t orig_pmd = *fe.pmd;
+ pmd_t orig_pmd = *vmf.pmd;
int ret;
barrier();
if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
if (pmd_protnone(orig_pmd) && vma_is_accessible(vma))
- return do_huge_pmd_numa_page(&fe, orig_pmd);
+ return do_huge_pmd_numa_page(&vmf, orig_pmd);
- if ((fe.flags & FAULT_FLAG_WRITE) &&
+ if ((vmf.flags & FAULT_FLAG_WRITE) &&
!pmd_write(orig_pmd)) {
- ret = wp_huge_pmd(&fe, orig_pmd);
+ ret = wp_huge_pmd(&vmf, orig_pmd);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
} else {
- huge_pmd_set_accessed(&fe, orig_pmd);
+ huge_pmd_set_accessed(&vmf, orig_pmd);
return 0;
}
}
}
- return handle_pte_fault(&fe);
+ return handle_pte_fault(&vmf);
}
/*
@@ -3808,8 +3817,8 @@ out:
return -EINVAL;
}
-static inline int follow_pte(struct mm_struct *mm, unsigned long address,
- pte_t **ptepp, spinlock_t **ptlp)
+int follow_pte(struct mm_struct *mm, unsigned long address, pte_t **ptepp,
+ spinlock_t **ptlp)
{
int res;
@@ -3919,7 +3928,7 @@ int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
struct page *page = NULL;
ret = get_user_pages_remote(tsk, mm, addr, 1,
- gup_flags, &page, &vma);
+ gup_flags, &page, &vma, NULL);
if (ret <= 0) {
#ifndef CONFIG_HAVE_IOREMAP_PROT
break;
diff --git a/mm/nommu.c b/mm/nommu.c
index 27bc543128e5..210d7ec2843c 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -176,9 +176,10 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
}
EXPORT_SYMBOL(get_user_pages_locked);
-long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
- unsigned long start, unsigned long nr_pages,
- struct page **pages, unsigned int gup_flags)
+static long __get_user_pages_unlocked(struct task_struct *tsk,
+ struct mm_struct *mm, unsigned long start,
+ unsigned long nr_pages, struct page **pages,
+ unsigned int gup_flags)
{
long ret;
down_read(&mm->mmap_sem);
@@ -187,7 +188,6 @@ long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
up_read(&mm->mmap_sem);
return ret;
}
-EXPORT_SYMBOL(__get_user_pages_unlocked);
long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
struct page **pages, unsigned int gup_flags)
@@ -1801,7 +1801,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
}
EXPORT_SYMBOL(filemap_fault);
-void filemap_map_pages(struct fault_env *fe,
+void filemap_map_pages(struct vm_fault *vmf,
pgoff_t start_pgoff, pgoff_t end_pgoff)
{
BUG();
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index be8dc8d1edb9..84d0c7eada2b 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -88,7 +88,7 @@ static int process_vm_rw_single_vec(unsigned long addr,
ssize_t rc = 0;
unsigned long max_pages_per_loop = PVM_MAX_KMALLOC_PAGES
/ sizeof(struct pages *);
- unsigned int flags = FOLL_REMOTE;
+ unsigned int flags = 0;
/* Work out address and page range required */
if (len == 0)
@@ -100,15 +100,19 @@ static int process_vm_rw_single_vec(unsigned long addr,
while (!rc && nr_pages && iov_iter_count(iter)) {
int pages = min(nr_pages, max_pages_per_loop);
+ int locked = 1;
size_t bytes;
/*
* Get the pages we're interested in. We must
- * add FOLL_REMOTE because task/mm might not
+ * access remotely because task/mm might not
* current/current->mm
*/
- pages = __get_user_pages_unlocked(task, mm, pa, pages,
- process_pages, flags);
+ down_read(&mm->mmap_sem);
+ pages = get_user_pages_remote(task, mm, pa, pages, flags,
+ process_pages, NULL, &locked);
+ if (locked)
+ up_read(&mm->mmap_sem);
if (pages <= 0)
return -EFAULT;
diff --git a/security/tomoyo/domain.c b/security/tomoyo/domain.c
index 682b73af7766..838ffa78cfda 100644
--- a/security/tomoyo/domain.c
+++ b/security/tomoyo/domain.c
@@ -881,7 +881,7 @@ bool tomoyo_dump_page(struct linux_binprm *bprm, unsigned long pos,
* the execve().
*/
if (get_user_pages_remote(current, bprm->mm, pos, 1,
- FOLL_FORCE, &page, NULL) <= 0)
+ FOLL_FORCE, &page, NULL, NULL) <= 0)
return false;
#else
page = bprm->page[pos / PAGE_SIZE];
diff --git a/tools/testing/ktest/ktest.pl b/tools/testing/ktest/ktest.pl
index d08e214ec6e7..be93ab02b490 100755
--- a/tools/testing/ktest/ktest.pl
+++ b/tools/testing/ktest/ktest.pl
@@ -719,14 +719,14 @@ sub set_value {
if ($buildonly && $lvalue =~ /^TEST_TYPE(\[.*\])?$/ && $prvalue ne "build") {
# Note if a test is something other than build, then we
- # will need other manditory options.
+ # will need other mandatory options.
if ($prvalue ne "install") {
# for bisect, we need to check BISECT_TYPE
if ($prvalue ne "bisect") {
$buildonly = 0;
}
} else {
- # install still limits some manditory options.
+ # install still limits some mandatory options.
$buildonly = 2;
}
}
@@ -735,7 +735,7 @@ sub set_value {
if ($prvalue ne "install") {
$buildonly = 0;
} else {
- # install still limits some manditory options.
+ # install still limits some mandatory options.
$buildonly = 2;
}
}
@@ -3989,7 +3989,7 @@ sub make_min_config {
}
}
- # Save off all the current mandidory configs
+ # Save off all the current mandatory configs
open (OUT, ">$temp_config")
or die "Can't write to $temp_config";
foreach my $config (keys %keep_configs) {
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
index efeceb0a222d..3815e940fbea 100644
--- a/virt/kvm/async_pf.c
+++ b/virt/kvm/async_pf.c
@@ -76,16 +76,20 @@ static void async_pf_execute(struct work_struct *work)
struct kvm_vcpu *vcpu = apf->vcpu;
unsigned long addr = apf->addr;
gva_t gva = apf->gva;
+ int locked = 1;
might_sleep();
/*
* This work is run asynchromously to the task which owns
* mm and might be done in another context, so we must
- * use FOLL_REMOTE.
+ * access remotely.
*/
- __get_user_pages_unlocked(NULL, mm, addr, 1, NULL,
- FOLL_WRITE | FOLL_REMOTE);
+ down_read(&mm->mmap_sem);
+ get_user_pages_remote(NULL, mm, addr, 1, FOLL_WRITE, NULL, NULL,
+ &locked);
+ if (locked)
+ up_read(&mm->mmap_sem);
kvm_async_page_present_sync(vcpu, apf);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 1992b9ba6569..69a12471a060 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1418,13 +1418,12 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
npages = get_user_page_nowait(addr, write_fault, page);
up_read(&current->mm->mmap_sem);
} else {
- unsigned int flags = FOLL_TOUCH | FOLL_HWPOISON;
+ unsigned int flags = FOLL_HWPOISON;
if (write_fault)
flags |= FOLL_WRITE;
- npages = __get_user_pages_unlocked(current, current->mm, addr, 1,
- page, flags);
+ npages = get_user_pages_unlocked(addr, 1, page, flags);
}
if (npages != 1)
return npages;