From eb6434d9e79a72d35d68811efd68fe8bab8f5baf Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Wed, 21 Jan 2009 17:45:47 +0900
Subject: nommu: Stub in vm_map_ram()/vm_unmap_ram()/vm_unmap_aliases().

Presently we do not support these interfaces, so make them BUG() wrappers
as per the rest of the vmap interface on nommu. Fixes up the modular xfs
build.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 mm/nommu.c | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/nommu.c b/mm/nommu.c
index 8cee8c8ff0f2..0c3e7d2114f6 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -10,7 +10,7 @@
  *  Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com>
  *  Copyright (c) 2000-2001 D Jeff Dionne <jeff@uClinux.org>
  *  Copyright (c) 2002      Greg Ungerer <gerg@snapgear.com>
- *  Copyright (c) 2007-2008 Paul Mundt <lethal@linux-sh.org>
+ *  Copyright (c) 2007-2009 Paul Mundt <lethal@linux-sh.org>
  */
 
 #include <linux/module.h>
@@ -394,6 +394,24 @@ void vunmap(const void *addr)
 }
 EXPORT_SYMBOL(vunmap);
 
+void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot)
+{
+	BUG();
+	return NULL;
+}
+EXPORT_SYMBOL(vm_map_ram);
+
+void vm_unmap_ram(const void *mem, unsigned int count)
+{
+	BUG();
+}
+EXPORT_SYMBOL(vm_unmap_ram);
+
+void vm_unmap_aliases(void)
+{
+}
+EXPORT_SYMBOL_GPL(vm_unmap_aliases);
+
 /*
  * Implement a stub for vmalloc_sync_all() if the architecture chose not to
  * have one.
-- 
cgit v1.2.1


From 05ae6fa31874eda2484da13c5dc4ddee8a47a0a4 Mon Sep 17 00:00:00 2001
From: Greg Ungerer <gerg@uclinux.org>
Date: Tue, 13 Jan 2009 17:30:22 +1000
Subject: uclinux: add process name to allocation error message

This patch adds the name of the process to the bad allocation error
message on non-MMU systems.

Changed suggested by jsujjavanich@syntech-fuelmaster.com

Signed-off-by: Greg Ungerer <gerg@uclinux.org>
---
 mm/nommu.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/nommu.c b/mm/nommu.c
index 0c3e7d2114f6..2fcf47d449b4 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1161,8 +1161,8 @@ error_free:
 	return ret;
 
 enomem:
-	printk("Allocation of length %lu from process %d failed\n",
-	       len, current->pid);
+	printk("Allocation of length %lu from process %d (%s) failed\n",
+	       len, current->pid, current->comm);
 	show_free_areas();
 	return -ENOMEM;
 }
-- 
cgit v1.2.1


From 3718909448116bf4411445468c58acc946379f92 Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Tue, 27 Jan 2009 18:59:46 -0800
Subject: slub: fix per cpu kmem_cache_cpu array memory leak

The per cpu array of kmem_cache_cpu structures accomodates
NR_KMEM_CACHE_CPU such structs.

When this array overflows and a struct is allocated by kmalloc(), it may
have an address at the upper bound of this array.  If this happens, it
does not get freed and the per cpu kmem_cache_cpu_free pointer will be out
of bounds after kmem_cache_destroy() or cpu offlining.

Cc: Christoph Lameter <cl@linux-foundation.org>
Signed-off-by: David Rientjes <rientjes@google.com>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
---
 mm/slub.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/slub.c b/mm/slub.c
index 6392ae5cc6b1..bdc9abb08a23 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1996,7 +1996,7 @@ static struct kmem_cache_cpu *alloc_kmem_cache_cpu(struct kmem_cache *s,
 static void free_kmem_cache_cpu(struct kmem_cache_cpu *c, int cpu)
 {
 	if (c < per_cpu(kmem_cache_cpu, cpu) ||
-			c > per_cpu(kmem_cache_cpu, cpu) + NR_KMEM_CACHE_CPU) {
+			c >= per_cpu(kmem_cache_cpu, cpu) + NR_KMEM_CACHE_CPU) {
 		kfree(c);
 		return;
 	}
-- 
cgit v1.2.1


From de33c8db5910cda599899dd431cc30d7c1018cbf Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Thu, 29 Jan 2009 17:46:42 -0800
Subject: Fix OOPS in mmap_region() when merging adjacent VM_LOCKED file
 segments

As of commit ba470de43188cdbff795b5da43a1474523c6c2fb ("map: handle
mlocked pages during map, remap, unmap") we now use the 'vma' variable
at the end of mmap_region() to handle the page-in of newly mapped
mlocked pages.

However, if we merged adjacent vma's together, the vma we're using may
be stale.  We historically consciously avoided using it after the merge
operation, but that got overlooked when redoing the locked page
handling.

This commit simplifies mmap_region() by doing any vma merges early,
avoiding the issue entirely, and 'vma' will always be valid.  As pointed
out by Hugh Dickins, this depends on any drivers that change the page
offset of flags to have set one of the VM_SPECIAL bits (so that they
cannot trigger the early merge logic), but that's true in general.

Reported-and-tested-by: Maksim Yevmenkin <maksim.yevmenkin@gmail.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/mmap.c | 26 ++++++--------------------
 1 file changed, 6 insertions(+), 20 deletions(-)

(limited to 'mm')

diff --git a/mm/mmap.c b/mm/mmap.c
index 8d95902e9a38..d3fa10a726cf 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1134,16 +1134,11 @@ munmap_back:
 	}
 
 	/*
-	 * Can we just expand an old private anonymous mapping?
-	 * The VM_SHARED test is necessary because shmem_zero_setup
-	 * will create the file object for a shared anonymous map below.
+	 * Can we just expand an old mapping?
 	 */
-	if (!file && !(vm_flags & VM_SHARED)) {
-		vma = vma_merge(mm, prev, addr, addr + len, vm_flags,
-					NULL, NULL, pgoff, NULL);
-		if (vma)
-			goto out;
-	}
+	vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, NULL);
+	if (vma)
+		goto out;
 
 	/*
 	 * Determine the object being mapped and call the appropriate
@@ -1206,17 +1201,8 @@ munmap_back:
 	if (vma_wants_writenotify(vma))
 		vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED);
 
-	if (file && vma_merge(mm, prev, addr, vma->vm_end,
-			vma->vm_flags, NULL, file, pgoff, vma_policy(vma))) {
-		mpol_put(vma_policy(vma));
-		kmem_cache_free(vm_area_cachep, vma);
-		fput(file);
-		if (vm_flags & VM_EXECUTABLE)
-			removed_exe_file_vma(mm);
-	} else {
-		vma_link(mm, vma, prev, rb_link, rb_parent);
-		file = vma->vm_file;
-	}
+	vma_link(mm, vma, prev, rb_link, rb_parent);
+	file = vma->vm_file;
 
 	/* Once vma denies write, undo our temporary denial count */
 	if (correct_wcount)
-- 
cgit v1.2.1


From 7bcc1bb1232de6efc0b85e0c7fe38e90b2436318 Mon Sep 17 00:00:00 2001
From: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Date: Thu, 29 Jan 2009 14:25:11 -0800
Subject: memcg: get/put parents at create/free

The lifetime of struct cgroup and struct mem_cgroup is different and
mem_cgroup has its own reference count for handling references from
swap_cgroup.

This causes strange problem that the parent mem_cgroup dies while child
mem_cgroup alive, and this problem causes a bug in case of
use_hierarchy==1 because res_counter_uncharge climbs up the tree.

This patch is for avoiding it by getting the parent at create, and putting
it at freeing.

Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Reviewed-by; KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memcontrol.c | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 4d0ea3ceba6d..76feccd26dc3 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -202,6 +202,7 @@ pcg_default_flags[NR_CHARGE_TYPE] = {
 
 static void mem_cgroup_get(struct mem_cgroup *mem);
 static void mem_cgroup_put(struct mem_cgroup *mem);
+static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
 
 static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
 					 struct page_cgroup *pc,
@@ -2193,10 +2194,23 @@ static void mem_cgroup_get(struct mem_cgroup *mem)
 
 static void mem_cgroup_put(struct mem_cgroup *mem)
 {
-	if (atomic_dec_and_test(&mem->refcnt))
+	if (atomic_dec_and_test(&mem->refcnt)) {
+		struct mem_cgroup *parent = parent_mem_cgroup(mem);
 		__mem_cgroup_free(mem);
+		if (parent)
+			mem_cgroup_put(parent);
+	}
 }
 
+/*
+ * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled.
+ */
+static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem)
+{
+	if (!mem->res.parent)
+		return NULL;
+	return mem_cgroup_from_res_counter(mem->res.parent, res);
+}
 
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
 static void __init enable_swap_cgroup(void)
@@ -2235,6 +2249,13 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 	if (parent && parent->use_hierarchy) {
 		res_counter_init(&mem->res, &parent->res);
 		res_counter_init(&mem->memsw, &parent->memsw);
+		/*
+		 * We increment refcnt of the parent to ensure that we can
+		 * safely access it on res_counter_charge/uncharge.
+		 * This refcnt will be decremented when freeing this
+		 * mem_cgroup(see mem_cgroup_put).
+		 */
+		mem_cgroup_get(parent);
 	} else {
 		res_counter_init(&mem->res, NULL);
 		res_counter_init(&mem->memsw, NULL);
-- 
cgit v1.2.1


From 85d9fc89fb0f0703df6444f260187c088a8d59ff Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Thu, 29 Jan 2009 14:25:13 -0800
Subject: memcg: fix refcnt handling at swapoff

Now, at swapoff, even while try_charge() fails, commit is executed.  This
is a bug which turns the refcnt of cgroup_subsys_state negative.

Reported-by: Li Zefan <lizf@cn.fujitsu.com>
Tested-by: Li Zefan <lizf@cn.fujitsu.com>
Tested-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reviewed-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/swapfile.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/swapfile.c b/mm/swapfile.c
index f48b831e5e5c..7e6304dfafab 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -698,8 +698,10 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
 	pte_t *pte;
 	int ret = 1;
 
-	if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, GFP_KERNEL, &ptr))
+	if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, GFP_KERNEL, &ptr)) {
 		ret = -ENOMEM;
+		goto out_nolock;
+	}
 
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) {
@@ -723,6 +725,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
 	activate_page(page);
 out:
 	pte_unmap_unlock(pte, ptl);
+out_nolock:
 	return ret;
 }
 
-- 
cgit v1.2.1


From 299b4eaa302138426d5a9ecd954de1f565d76c94 Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Thu, 29 Jan 2009 14:25:17 -0800
Subject: memcg: NULL pointer dereference at rmdir on some NUMA systems

N_POSSIBLE doesn't means there is memory...and force_empty can
visit invalid node which have no pgdat.

To visit all valid nodes, N_HIGH_MEMORY should be used.

Reported-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Tested-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memcontrol.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 76feccd26dc3..8e4be9cb2a6a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1685,7 +1685,7 @@ move_account:
 		/* This is for making all *used* pages to be on LRU. */
 		lru_add_drain_all();
 		ret = 0;
-		for_each_node_state(node, N_POSSIBLE) {
+		for_each_node_state(node, N_HIGH_MEMORY) {
 			for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
 				enum lru_list l;
 				for_each_lru(l) {
-- 
cgit v1.2.1


From 33bfad54b58cf05cfe6678c3ec9235d4bc8db4c2 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 30 Jan 2009 11:37:22 -0800
Subject: Allow opportunistic merging of VM_CAN_NONLINEAR areas

Commit de33c8db5910cda599899dd431cc30d7c1018cbf ("Fix OOPS in
mmap_region() when merging adjacent VM_LOCKED file segments") unified
the vma merging of anonymous and file maps to just one place, which
simplified the code and fixed a use-after-free bug that could cause an
oops.

But by doing the merge opportunistically before even having called
->mmap() on the file method, it now compares two different 'vm_flags'
values: the pre-mmap() value of the new not-yet-formed vma, and previous
mappings of the same file around it.

And in doing so, it refused to merge the common file case, which adds a
marker to say "I can be made non-linear".

This fixes it by just adding a set of flags that don't have to match,
because we know they are ok to merge.  Currently it's only that single
VM_CAN_NONLINEAR flag, but at least conceptually there could be others
in the future.

Reported-and-acked-by: Hugh Dickins <hugh@veritas.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Greg KH <gregkh@suse.de>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/mmap.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/mmap.c b/mm/mmap.c
index d3fa10a726cf..c581df14d0de 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -658,6 +658,9 @@ again:			remove_next = 1 + (end > next->vm_end);
 	validate_mm(mm);
 }
 
+/* Flags that can be inherited from an existing mapping when merging */
+#define VM_MERGEABLE_FLAGS (VM_CAN_NONLINEAR)
+
 /*
  * If the vma has a ->close operation then the driver probably needs to release
  * per-vma resources, so we don't attempt to merge those.
@@ -665,7 +668,7 @@ again:			remove_next = 1 + (end > next->vm_end);
 static inline int is_mergeable_vma(struct vm_area_struct *vma,
 			struct file *file, unsigned long vm_flags)
 {
-	if (vma->vm_flags != vm_flags)
+	if ((vma->vm_flags ^ vm_flags) & ~VM_MERGEABLE_FLAGS)
 		return 0;
 	if (vma->vm_file != file)
 		return 0;
-- 
cgit v1.2.1


From fc8744adc870a8d4366908221508bb113d8b72ee Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sat, 31 Jan 2009 15:08:56 -0800
Subject: Stop playing silly games with the VM_ACCOUNT flag

The mmap_region() code would temporarily set the VM_ACCOUNT flag for
anonymous shared mappings just to inform shmem_zero_setup() that it
should enable accounting for the resulting shm object.  It would then
clear the flag after calling ->mmap (for the /dev/zero case) or doing
shmem_zero_setup() (for the MAP_ANON case).

This just resulted in vma merge issues, but also made for just
unnecessary confusion.  Use the already-existing VM_NORESERVE flag for
this instead, and let shmem_{zero|file}_setup() just figure it out from
that.

This also happens to make it obvious that the new DRI2 GEM layer uses a
non-reserving backing store for its object allocation - which is quite
possibly not intentional.  But since I didn't want to change semantics
in this patch, I left it alone, and just updated the caller to use the
new flag semantics.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/mmap.c  | 48 +++++++++++++++++++++++++-----------------------
 mm/shmem.c |  2 +-
 2 files changed, 26 insertions(+), 24 deletions(-)

(limited to 'mm')

diff --git a/mm/mmap.c b/mm/mmap.c
index c581df14d0de..214b6a258eeb 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1090,6 +1090,15 @@ int vma_wants_writenotify(struct vm_area_struct *vma)
 		mapping_cap_account_dirty(vma->vm_file->f_mapping);
 }
 
+/*
+ * We account for memory if it's a private writeable mapping,
+ * and VM_NORESERVE wasn't set.
+ */
+static inline int accountable_mapping(unsigned int vm_flags)
+{
+	return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE;
+}
+
 unsigned long mmap_region(struct file *file, unsigned long addr,
 			  unsigned long len, unsigned long flags,
 			  unsigned int vm_flags, unsigned long pgoff,
@@ -1117,23 +1126,24 @@ munmap_back:
 	if (!may_expand_vm(mm, len >> PAGE_SHIFT))
 		return -ENOMEM;
 
-	if (flags & MAP_NORESERVE)
+	/*
+	 * Set 'VM_NORESERVE' if we should not account for the
+	 * memory use of this mapping. We only honor MAP_NORESERVE
+	 * if we're allowed to overcommit memory.
+	 */
+	if ((flags & MAP_NORESERVE) && sysctl_overcommit_memory != OVERCOMMIT_NEVER)
+		vm_flags |= VM_NORESERVE;
+	if (!accountable)
 		vm_flags |= VM_NORESERVE;
 
-	if (accountable && (!(flags & MAP_NORESERVE) ||
-			    sysctl_overcommit_memory == OVERCOMMIT_NEVER)) {
-		if (vm_flags & VM_SHARED) {
-			/* Check memory availability in shmem_file_setup? */
-			vm_flags |= VM_ACCOUNT;
-		} else if (vm_flags & VM_WRITE) {
-			/*
-			 * Private writable mapping: check memory availability
-			 */
-			charged = len >> PAGE_SHIFT;
-			if (security_vm_enough_memory(charged))
-				return -ENOMEM;
-			vm_flags |= VM_ACCOUNT;
-		}
+	/*
+	 * Private writable mapping: check memory availability
+	 */
+	if (accountable_mapping(vm_flags)) {
+		charged = len >> PAGE_SHIFT;
+		if (security_vm_enough_memory(charged))
+			return -ENOMEM;
+		vm_flags |= VM_ACCOUNT;
 	}
 
 	/*
@@ -1184,14 +1194,6 @@ munmap_back:
 			goto free_vma;
 	}
 
-	/* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform
-	 * shmem_zero_setup (perhaps called through /dev/zero's ->mmap)
-	 * that memory reservation must be checked; but that reservation
-	 * belongs to shared memory object, not to vma: so now clear it.
-	 */
-	if ((vm_flags & (VM_SHARED|VM_ACCOUNT)) == (VM_SHARED|VM_ACCOUNT))
-		vma->vm_flags &= ~VM_ACCOUNT;
-
 	/* Can addr have changed??
 	 *
 	 * Answer: Yes, several device drivers can do it in their
diff --git a/mm/shmem.c b/mm/shmem.c
index 5d0de96c9789..19d566ccdeea 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2628,7 +2628,7 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
 		goto close_file;
 
 #ifdef CONFIG_SHMEM
-	SHMEM_I(inode)->flags = flags & VM_ACCOUNT;
+	SHMEM_I(inode)->flags = (flags & VM_NORESERVE) ? 0 : VM_ACCOUNT;
 #endif
 	d_instantiate(dentry, inode);
 	inode->i_size = size;
-- 
cgit v1.2.1


From 27421e211a39784694b597dbf35848b88363c248 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sun, 1 Feb 2009 11:00:16 -0800
Subject: Manually revert "mlock: downgrade mmap sem while populating mlocked
 regions"

This essentially reverts commit 8edb08caf68184fb170f4f69c7445929e199eaea.

It downgraded our mmap semaphore to a read-lock while mlocking pages, in
order to allow other threads (and external accesses like "ps" et al) to
walk the vma lists and take page faults etc.  Which is a nice idea, but
the implementation does not work.

Because we cannot upgrade the lock back to a write lock without
releasing the mmap semaphore, the code had to release the lock entirely
and then re-take it as a writelock.  However, that meant that the caller
possibly lost the vma chain that it was following, since now another
thread could come in and mmap/munmap the range.

The code tried to work around that by just looking up the vma again and
erroring out if that happened, but quite frankly, that was just a buggy
hack that doesn't actually protect against anything (the other thread
could just have replaced the vma with another one instead of totally
unmapping it).

The only way to downgrade to a read map _reliably_ is to do it at the
end, which is likely the right thing to do: do all the 'vma' operations
with the write-lock held, then downgrade to a read after completing them
all, and then do the "populate the newly mlocked regions" while holding
just the read lock.  And then just drop the read-lock and return to user
space.

The (perhaps somewhat simpler) alternative is to just make all the
callers of mlock_vma_pages_range() know that the mmap lock got dropped,
and just re-grab the mmap semaphore if it needs to mlock more than one
vma region.

So we can do this "downgrade mmap sem while populating mlocked regions"
thing right, but the way it was done here was absolutely not correct.
Thus the revert, in the expectation that we will do it all correctly
some day.

Cc: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: stable@kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/mlock.c | 47 ++---------------------------------------------
 1 file changed, 2 insertions(+), 45 deletions(-)

(limited to 'mm')

diff --git a/mm/mlock.c b/mm/mlock.c
index 2904a347e476..028ec482fdd4 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -294,14 +294,10 @@ static inline int __mlock_posix_error_return(long retval)
  *
  * return number of pages [> 0] to be removed from locked_vm on success
  * of "special" vmas.
- *
- * return negative error if vma spanning @start-@range disappears while
- * mmap semaphore is dropped.  Unlikely?
  */
 long mlock_vma_pages_range(struct vm_area_struct *vma,
 			unsigned long start, unsigned long end)
 {
-	struct mm_struct *mm = vma->vm_mm;
 	int nr_pages = (end - start) / PAGE_SIZE;
 	BUG_ON(!(vma->vm_flags & VM_LOCKED));
 
@@ -314,20 +310,8 @@ long mlock_vma_pages_range(struct vm_area_struct *vma,
 	if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) ||
 			is_vm_hugetlb_page(vma) ||
 			vma == get_gate_vma(current))) {
-		long error;
-		downgrade_write(&mm->mmap_sem);
-
-		error = __mlock_vma_pages_range(vma, start, end, 1);
 
-		up_read(&mm->mmap_sem);
-		/* vma can change or disappear */
-		down_write(&mm->mmap_sem);
-		vma = find_vma(mm, start);
-		/* non-NULL vma must contain @start, but need to check @end */
-		if (!vma ||  end > vma->vm_end)
-			return -ENOMEM;
-
-		return 0;	/* hide other errors from mmap(), et al */
+		return __mlock_vma_pages_range(vma, start, end, 1);
 	}
 
 	/*
@@ -438,41 +422,14 @@ success:
 	vma->vm_flags = newflags;
 
 	if (lock) {
-		/*
-		 * mmap_sem is currently held for write.  Downgrade the write
-		 * lock to a read lock so that other faults, mmap scans, ...
-		 * while we fault in all pages.
-		 */
-		downgrade_write(&mm->mmap_sem);
-
 		ret = __mlock_vma_pages_range(vma, start, end, 1);
 
-		/*
-		 * Need to reacquire mmap sem in write mode, as our callers
-		 * expect this.  We have no support for atomically upgrading
-		 * a sem to write, so we need to check for ranges while sem
-		 * is unlocked.
-		 */
-		up_read(&mm->mmap_sem);
-		/* vma can change or disappear */
-		down_write(&mm->mmap_sem);
-		*prev = find_vma(mm, start);
-		/* non-NULL *prev must contain @start, but need to check @end */
-		if (!(*prev) || end > (*prev)->vm_end)
-			ret = -ENOMEM;
-		else if (ret > 0) {
+		if (ret > 0) {
 			mm->locked_vm -= ret;
 			ret = 0;
 		} else
 			ret = __mlock_posix_error_return(ret); /* translate if needed */
 	} else {
-		/*
-		 * TODO:  for unlocking, pages will already be resident, so
-		 * we don't need to wait for allocations/reclaim/pagein, ...
-		 * However, unlocking a very large region can still take a
-		 * while.  Should we downgrade the semaphore for both lock
-		 * AND unlock ?
-		 */
 		__mlock_vma_pages_range(vma, start, end, 0);
 	}
 
-- 
cgit v1.2.1