summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
authorAndrea Arcangeli <aarcange@redhat.com>2013-11-21 14:32:02 -0800
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>2014-02-06 11:05:46 -0800
commit50d8f1b5c57bb29f02ab5834be334b4f7922b856 (patch)
tree3a9ead44eb4186570bf51ba5f20d3d6030ae271f /mm
parent18c8053c6b59f11bcd941850c1d13b3b502e1e74 (diff)
downloadlinux-rt-50d8f1b5c57bb29f02ab5834be334b4f7922b856.tar.gz
mm: hugetlbfs: fix hugetlbfs optimization
commit 27c73ae759774e63313c1fbfeb17ba076cea64c5 upstream. Commit 7cb2ef56e6a8 ("mm: fix aio performance regression for database caused by THP") can cause dereference of a dangling pointer if split_huge_page runs during PageHuge() if there are updates to the tail_page->private field. Also it is repeating compound_head twice for hugetlbfs and it is running compound_head+compound_trans_head for THP when a single one is needed in both cases. The new code within the PageSlab() check doesn't need to verify that the THP page size is never bigger than the smallest hugetlbfs page size, to avoid memory corruption. A longstanding theoretical race condition was found while fixing the above (see the change right after the skip_unlock label, that is relevant for the compound_lock path too). By re-establishing the _mapcount tail refcounting for all compound pages, this also fixes the below problem: echo 0 >/sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages BUG: Bad page state in process bash pfn:59a01 page:ffffea000139b038 count:0 mapcount:10 mapping: (null) index:0x0 page flags: 0x1c00000000008000(tail) Modules linked in: CPU: 6 PID: 2018 Comm: bash Not tainted 3.12.0+ #25 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 Call Trace: dump_stack+0x55/0x76 bad_page+0xd5/0x130 free_pages_prepare+0x213/0x280 __free_pages+0x36/0x80 update_and_free_page+0xc1/0xd0 free_pool_huge_page+0xc2/0xe0 set_max_huge_pages.part.58+0x14c/0x220 nr_hugepages_store_common.isra.60+0xd0/0xf0 nr_hugepages_store+0x13/0x20 kobj_attr_store+0xf/0x20 sysfs_write_file+0x189/0x1e0 vfs_write+0xc5/0x1f0 SyS_write+0x55/0xb0 system_call_fastpath+0x16/0x1b Signed-off-by: Khalid Aziz <khalid.aziz@oracle.com> Signed-off-by: Andrea Arcangeli <aarcange@redhat.com> Tested-by: Khalid Aziz <khalid.aziz@oracle.com> Cc: Pravin Shelar <pshelar@nicira.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Ben Hutchings <bhutchings@solarflare.com> Cc: Christoph Lameter <cl@linux.com> Cc: Johannes Weiner <jweiner@redhat.com> Cc: Mel Gorman <mgorman@suse.de> Cc: Rik van Riel <riel@redhat.com> Cc: Andi Kleen <andi@firstfloor.org> Cc: Minchan Kim <minchan@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> Cc: Guillaume Morin <guillaume@morinfr.org> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Diffstat (limited to 'mm')
-rw-r--r--mm/hugetlb.c17
-rw-r--r--mm/swap.c107
2 files changed, 95 insertions, 29 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index af20b77a624a..7111f2f5af68 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -679,6 +679,23 @@ int PageHuge(struct page *page)
}
EXPORT_SYMBOL_GPL(PageHuge);
+/*
+ * PageHeadHuge() only returns true for hugetlbfs head page, but not for
+ * normal or transparent huge pages.
+ */
+int PageHeadHuge(struct page *page_head)
+{
+ compound_page_dtor *dtor;
+
+ if (!PageHead(page_head))
+ return 0;
+
+ dtor = get_compound_page_dtor(page_head);
+
+ return dtor == free_huge_page;
+}
+EXPORT_SYMBOL_GPL(PageHeadHuge);
+
pgoff_t __basepage_index(struct page *page)
{
struct page *page_head = compound_head(page);
diff --git a/mm/swap.c b/mm/swap.c
index f689e9a03204..a8feea68a606 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -77,18 +77,6 @@ static void __put_compound_page(struct page *page)
static void put_compound_page(struct page *page)
{
- /*
- * hugetlbfs pages cannot be split from under us. If this is a
- * hugetlbfs page, check refcount on head page and release the page if
- * the refcount becomes zero.
- */
- if (PageHuge(page)) {
- page = compound_head(page);
- if (put_page_testzero(page))
- __put_compound_page(page);
- return;
- }
-
if (unlikely(PageTail(page))) {
/* __split_huge_page_refcount can run under us */
struct page *page_head = compound_trans_head(page);
@@ -96,6 +84,35 @@ static void put_compound_page(struct page *page)
if (likely(page != page_head &&
get_page_unless_zero(page_head))) {
unsigned long flags;
+
+ if (PageHeadHuge(page_head)) {
+ if (likely(PageTail(page))) {
+ /*
+ * __split_huge_page_refcount
+ * cannot race here.
+ */
+ VM_BUG_ON(!PageHead(page_head));
+ atomic_dec(&page->_mapcount);
+ if (put_page_testzero(page_head))
+ VM_BUG_ON(1);
+ if (put_page_testzero(page_head))
+ __put_compound_page(page_head);
+ return;
+ } else {
+ /*
+ * __split_huge_page_refcount
+ * run before us, "page" was a
+ * THP tail. The split
+ * page_head has been freed
+ * and reallocated as slab or
+ * hugetlbfs page of smaller
+ * order (only possible if
+ * reallocated as slab on
+ * x86).
+ */
+ goto skip_lock;
+ }
+ }
/*
* page_head wasn't a dangling pointer but it
* may not be a head page anymore by the time
@@ -107,9 +124,29 @@ static void put_compound_page(struct page *page)
/* __split_huge_page_refcount run before us */
compound_unlock_irqrestore(page_head, flags);
VM_BUG_ON(PageHead(page_head));
- if (put_page_testzero(page_head))
- __put_single_page(page_head);
- out_put_single:
+skip_lock:
+ if (put_page_testzero(page_head)) {
+ /*
+ * The head page may have been
+ * freed and reallocated as a
+ * compound page of smaller
+ * order and then freed again.
+ * All we know is that it
+ * cannot have become: a THP
+ * page, a compound page of
+ * higher order, a tail page.
+ * That is because we still
+ * hold the refcount of the
+ * split THP tail and
+ * page_head was the THP head
+ * before the split.
+ */
+ if (PageHead(page_head))
+ __put_compound_page(page_head);
+ else
+ __put_single_page(page_head);
+ }
+out_put_single:
if (put_page_testzero(page))
__put_single_page(page);
return;
@@ -173,21 +210,34 @@ bool __get_page_tail(struct page *page)
*/
unsigned long flags;
bool got = false;
- struct page *page_head;
+ struct page *page_head = compound_trans_head(page);
- /*
- * If this is a hugetlbfs page it cannot be split under us. Simply
- * increment refcount for the head page.
- */
- if (PageHuge(page)) {
- page_head = compound_head(page);
- atomic_inc(&page_head->_count);
- got = true;
- goto out;
- }
-
- page_head = compound_trans_head(page);
if (likely(page != page_head && get_page_unless_zero(page_head))) {
+ /* Ref to put_compound_page() comment. */
+ if (PageHeadHuge(page_head)) {
+ if (likely(PageTail(page))) {
+ /*
+ * This is a hugetlbfs
+ * page. __split_huge_page_refcount
+ * cannot race here.
+ */
+ VM_BUG_ON(!PageHead(page_head));
+ __get_page_tail_foll(page, false);
+ return true;
+ } else {
+ /*
+ * __split_huge_page_refcount run
+ * before us, "page" was a THP
+ * tail. The split page_head has been
+ * freed and reallocated as slab or
+ * hugetlbfs page of smaller order
+ * (only possible if reallocated as
+ * slab on x86).
+ */
+ put_page(page_head);
+ return false;
+ }
+ }
/*
* page_head wasn't a dangling pointer but it
* may not be a head page anymore by the time
@@ -204,7 +254,6 @@ bool __get_page_tail(struct page *page)
if (unlikely(!got))
put_page(page_head);
}
-out:
return got;
}
EXPORT_SYMBOL(__get_page_tail);