diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 4fc590a1257d..899f6a81e77a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1437,7 +1437,76 @@ void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) dissolve_free_huge_page(pfn_to_page(pfn)); } -static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) +/* + * There are 3 ways this can get called: + * 1. With vma+addr: we use the VMA's memory policy + * 2. With !vma, but nid=NUMA_NO_NODE: We try to allocate a huge + * page from any node, and let the buddy allocator itself figure + * it out. + * 3. With !vma, but nid!=NUMA_NO_NODE. We allocate a huge page + * strictly from 'nid' + */ +static struct page *__hugetlb_alloc_buddy_huge_page(struct hstate *h, + struct vm_area_struct *vma, unsigned long addr, int nid) +{ + int order = huge_page_order(h); + gfp_t gfp = htlb_alloc_mask(h)|__GFP_COMP|__GFP_REPEAT|__GFP_NOWARN; + unsigned int cpuset_mems_cookie; + + /* + * We need a VMA to get a memory policy. If we do not + * have one, we use the 'nid' argument + */ + if (!vma) { + /* + * If a specific node is requested, make sure to + * get memory from there, but only when a node + * is explicitly specified. + */ + if (nid != NUMA_NO_NODE) + gfp |= __GFP_THISNODE; + /* + * Make sure to call something that can handle + * nid=NUMA_NO_NODE + */ + return alloc_pages_node(nid, gfp, order); + } + + /* + * OK, so we have a VMA. Fetch the mempolicy and try to + * allocate a huge page with it. + */ + do { + struct page *page; + struct mempolicy *mpol; + struct zonelist *zl; + nodemask_t *nodemask; + + cpuset_mems_cookie = read_mems_allowed_begin(); + zl = huge_zonelist(vma, addr, gfp, &mpol, &nodemask); + mpol_cond_put(mpol); + page = __alloc_pages_nodemask(gfp, order, zl, nodemask); + if (page) + return page; + } while (read_mems_allowed_retry(cpuset_mems_cookie)); + + return NULL; +} + +/* + * There are two ways to allocate a huge page: + * 1. When you have a VMA and an address (like a fault) + * 2. When you have no VMA (like when setting /proc/.../nr_hugepages) + * + * 'vma' and 'addr' are only for (1). 'nid' is always NUMA_NO_NODE in + * this case which signifies that the allocation should be done with + * respect for the VMA's memory policy. + * + * For (2), we ignore 'vma' and 'addr' and use 'nid' exclusively. This + * implies that memory policies will not be taken in to account. + */ +static struct page *__alloc_buddy_huge_page(struct hstate *h, + struct vm_area_struct *vma, unsigned long addr, int nid) { struct page *page; unsigned int r_nid; @@ -1445,6 +1514,15 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) if (hstate_is_gigantic(h)) return NULL; + /* + * Make sure that anyone specifying 'nid' is not also specifying a VMA. + * This makes sure the caller is picking _one_ of the modes with which + * we can call this function, not both. + */ + if (vma || (addr != -1)) { + WARN_ON_ONCE(addr == -1); + WARN_ON_ONCE(nid != NUMA_NO_NODE); + } /* * Assume we will successfully allocate the surplus page to * prevent racing processes from causing the surplus to exceed @@ -1478,14 +1556,7 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) } spin_unlock(&hugetlb_lock); - if (nid == NUMA_NO_NODE) - page = alloc_pages(htlb_alloc_mask(h)|__GFP_COMP| - __GFP_REPEAT|__GFP_NOWARN, - huge_page_order(h)); - else - page = __alloc_pages_node(nid, - htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE| - __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); + page = __hugetlb_alloc_buddy_huge_page(h, vma, addr, nid); spin_lock(&hugetlb_lock); if (page) { @@ -1509,6 +1580,27 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) return page; } +/* + * Allocate a huge page from 'nid'. Note, 'nid' may be + * NUMA_NO_NODE, which means that it may be allocated + * anywhere. + */ +struct page *__alloc_buddy_huge_page_no_mpol(struct hstate *h, int nid) +{ + unsigned long addr = -1; + + return __alloc_buddy_huge_page(h, NULL, addr, nid); +} + +/* + * Use the VMA's mpolicy to allocate a huge page from the buddy. + */ +struct page *__alloc_buddy_huge_page_with_mpol(struct hstate *h, + struct vm_area_struct *vma, unsigned long addr) +{ + return __alloc_buddy_huge_page(h, vma, addr, NUMA_NO_NODE); +} + /* * This allocation function is useful in the context where vma is irrelevant. * E.g. soft-offlining uses this function because it only cares physical @@ -1524,7 +1616,7 @@ struct page *alloc_huge_page_node(struct hstate *h, int nid) spin_unlock(&hugetlb_lock); if (!page) - page = alloc_buddy_huge_page(h, nid); + page = __alloc_buddy_huge_page_no_mpol(h, nid); return page; } @@ -1554,7 +1646,7 @@ static int gather_surplus_pages(struct hstate *h, int delta) retry: spin_unlock(&hugetlb_lock); for (i = 0; i < needed; i++) { - page = alloc_buddy_huge_page(h, NUMA_NO_NODE); + page = __alloc_buddy_huge_page_no_mpol(h, NUMA_NO_NODE); if (!page) { alloc_ok = false; break; @@ -1787,7 +1879,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg); if (!page) { spin_unlock(&hugetlb_lock); - page = alloc_buddy_huge_page(h, NUMA_NO_NODE); + page = __alloc_buddy_huge_page_with_mpol(h, vma, addr); if (!page) goto out_uncharge_cgroup;